From b39d52fa97fc70ca9b41960443ec1a733bc1ef89 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 21 Oct 2024 18:30:32 -0500
Subject: [PATCH 01/15] add owsm-ctc recipe

---
 egs2/owsm_ctc_v3.1/s2t1/README.md             |  14 +++
 egs2/owsm_ctc_v3.1/s2t1/cmd.sh                | 110 ++++++++++++++++++
 egs2/owsm_ctc_v3.1/s2t1/conf/decode_s2t.yaml  |   7 ++
 egs2/owsm_ctc_v3.1/s2t1/conf/fbank.conf       |   2 +
 egs2/owsm_ctc_v3.1/s2t1/conf/pbs.conf         |  11 ++
 egs2/owsm_ctc_v3.1/s2t1/conf/pitch.conf       |   1 +
 egs2/owsm_ctc_v3.1/s2t1/conf/queue.conf       |  12 ++
 egs2/owsm_ctc_v3.1/s2t1/conf/slurm.conf       |  14 +++
 ..._multitask-ctc_ebf27_conv2d8_size1024.yaml | 109 +++++++++++++++++
 egs2/owsm_ctc_v3.1/s2t1/db.sh                 |   1 +
 .../s2t1/local/convert_owsm_data.py           |  61 ++++++++++
 egs2/owsm_ctc_v3.1/s2t1/local/path.sh         |   0
 egs2/owsm_ctc_v3.1/s2t1/path.sh               |   1 +
 egs2/owsm_ctc_v3.1/s2t1/pyscripts             |   1 +
 egs2/owsm_ctc_v3.1/s2t1/run.sh                |  36 ++++++
 egs2/owsm_ctc_v3.1/s2t1/s2t.sh                |   1 +
 egs2/owsm_ctc_v3.1/s2t1/scripts               |   1 +
 egs2/owsm_ctc_v3.1/s2t1/utils                 |   1 +
 18 files changed, 383 insertions(+)
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/README.md
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/cmd.sh
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/decode_s2t.yaml
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/fbank.conf
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/pbs.conf
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/pitch.conf
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/queue.conf
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/slurm.conf
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml
 create mode 120000 egs2/owsm_ctc_v3.1/s2t1/db.sh
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py
 create mode 100644 egs2/owsm_ctc_v3.1/s2t1/local/path.sh
 create mode 120000 egs2/owsm_ctc_v3.1/s2t1/path.sh
 create mode 120000 egs2/owsm_ctc_v3.1/s2t1/pyscripts
 create mode 100755 egs2/owsm_ctc_v3.1/s2t1/run.sh
 create mode 120000 egs2/owsm_ctc_v3.1/s2t1/s2t.sh
 create mode 120000 egs2/owsm_ctc_v3.1/s2t1/scripts
 create mode 120000 egs2/owsm_ctc_v3.1/s2t1/utils
diff --git a/egs2/owsm_ctc_v3.1/s2t1/README.md b/egs2/owsm_ctc_v3.1/s2t1/README.md
new file mode 100644
index 00000000000..7c6d7f980a4
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/README.md
@@ -0,0 +1,14 @@
+# OWSM-CTC v3.1
+
+[OWSM-CTC](https://aclanthology.org/2024.acl-long.549/) is an encoder-only speech foundation model based on hierarchical multi-task self-conditioned CTC.
+This version is trained on 180k hours of public audio data for multilingual speech recognition, any-to-any speech translation, and language identification, following the design of the [Open Whisper-style Speech Model (OWSM)](https://arxiv.org/abs/2401.16658) project.
+
+## Data Preparation
+
+The training data follows the same format as the encoder-decoder OWSM v3.1, except that timestamps are removed from the `text` file. Please first follow the `egs2/owsm_v3.1/s2t1` recipe to prepare the OWSM data, and then convert `text` into the new format by running `python local/convert_owsm_data.py`. Before running it, edit the hard-coded paths to the BPE model and token list in the script to match your setup; the script keeps each original file as `text.old`.
+
+## Pre-trained Model
+
+The pre-trained model is available at: https://huggingface.co/pyf98/owsm_ctc_v3.1_1B
+
+The model page also contains example usage.
diff --git a/egs2/owsm_ctc_v3.1/s2t1/cmd.sh b/egs2/owsm_ctc_v3.1/s2t1/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/cmd.sh
@@ -0,0 +1,110 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend.
+# These options are mapped to backend-specific options, and the mapping
+# is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs fail, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# ==========================================================
+
+
+# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # The other usage
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# Local machine logging to stdout and log file, without any Job scheduling system
+elif [ "${cmd_backend}" = stdout ]; then
+
+    # The other usage
+    export train_cmd="stdout.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="stdout.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="stdout.pl"
+
+
+# "qsub" (Sun Grid Engine, or derivation of it)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+
+# "qsub" (Torque/PBS.)
+elif [ "${cmd_backend}" = pbs ]; then
+    # The default setting is written in conf/pbs.conf.
+
+    export train_cmd="pbs.pl"
+    export cuda_cmd="pbs.pl"
+    export decode_cmd="pbs.pl"
+
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # This assumes you can log in to them without a password, i.e. you have to set up SSH keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/decode_s2t.yaml b/egs2/owsm_ctc_v3.1/s2t1/conf/decode_s2t.yaml
new file mode 100644
index 00000000000..6aa2877d464
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/decode_s2t.yaml
@@ -0,0 +1,7 @@
+beam_size: 1
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+lm_weight: 0.0
+lang_sym: <eng>
+task_sym: <asr>
diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/fbank.conf b/egs2/owsm_ctc_v3.1/s2t1/conf/fbank.conf
new file mode 100644
index 00000000000..75232358639
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000
+--num-mel-bins=80
diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/pbs.conf b/egs2/owsm_ctc_v3.1/s2t1/conf/pbs.conf
new file mode 100644
index 00000000000..119509938ce
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/pbs.conf
@@ -0,0 +1,11 @@
+# Default configuration
+command qsub -V -v PATH -S /bin/bash
+option name=* -N $0
+option mem=* -l mem=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -l ncpus=$0
+option num_threads=1  # Do not add anything to qsub_opts
+option num_nodes=* -l nodes=$0:ppn=1
+default gpu=0
+option gpu=0
+option gpu=* -l ngpus=$0
diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/pitch.conf b/egs2/owsm_ctc_v3.1/s2t1/conf/pitch.conf
new file mode 100644
index 00000000000..e959a19d5b8
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/queue.conf b/egs2/owsm_ctc_v3.1/s2t1/conf/queue.conf
new file mode 100644
index 00000000000..500582fab31
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/queue.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option name=* -N $0
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+option num_nodes=* -pe mpi $0  # You must set this PE as allocation_rule=1
+default gpu=0
+option gpu=0
+option gpu=* -l gpu=$0 -q g.q
diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/slurm.conf b/egs2/owsm_ctc_v3.1/s2t1/conf/slurm.conf
new file mode 100644
index 00000000000..3b229673638
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/slurm.conf
@@ -0,0 +1,14 @@
+# Default configuration
+command sbatch --export=PATH
+option name=* --job-name $0
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0
+option num_threads=* --cpus-per-task $0
+option num_threads=1 --cpus-per-task 1
+option num_nodes=* --nodes $0
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommend allocating at least as many CPUs as GPUs
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml b/egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml
new file mode 100644
index 00000000000..adc8b639399
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml
@@ -0,0 +1,109 @@
+preprocessor: s2t_ctc
+preprocessor_conf:
+    na_symbol: "<na>"
+    speech_length: 30
+    speech_init_silence: 30
+    text_prev_apply_prob: 0.5
+    lang_apply_prob: 0.5
+    nolang_symbol: "<nolang>"
+
+frontend_conf:
+    n_fft: 512
+    win_length: 400
+    hop_length: 160
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: false
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.
+    - 0.05
+    num_time_mask: 10
+
+encoder: e_branchformer_ctc
+encoder_conf:
+    output_size: 1024
+    attention_heads: 16
+    attention_layer_type: selfattn
+    pos_enc_layer_type: abs_pos
+    rel_pos_type: latest
+    cgmlp_linear_units: 4096
+    cgmlp_conv_kernel: 31
+    use_linear_after_conv: false
+    gate_activation: identity
+    num_blocks: 27
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d8
+    layer_drop_rate: 0.0
+    linear_units: 4096
+    positionwise_layer_type: linear
+    use_ffn: true
+    macaron_ffn: true
+    merge_conv_kernel: 31
+    interctc_layer_idx: [6, 12, 15, 21]
+    interctc_use_conditioning: true
+    use_cross_attention: [0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,]
+    use_flash_attn: true    # flash attn
+
+promptencoder: transformer
+promptencoder_conf:
+    output_size: 512
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 4
+    use_flash_attn: true
+
+model: espnet_ctc
+model_conf:
+    interctc_weight: 0.8
+    sym_na: "<na>"
+    ctc_asr_only: [true, true, true, false, false]
+
+optim: adamw
+optim_conf:
+    lr: 2.0e-04
+    betas:
+    - 0.9
+    - 0.98
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: piecewiselinearwarmuplr
+scheduler_conf:
+    warmup_steps_list: [0, 30000, 60000]
+    warmup_lr_list: [0., 5.0e-05, 2.0e-04]
+
+# 4 samples per GPU (batch_size 256 split over the 16 nodes x 4 GPUs set in run.sh)
+batch_type: unsorted
+batch_size: 256
+accum_grad: 1
+num_iters_per_epoch: 15000
+max_epoch: 55
+patience: none
+init: none
+best_model_criterion:
+-   - valid
+    - cer_ctc
+    - min
+-   - valid
+    - loss_ctc
+    - min
+-   - valid
+    - total_count
+    - max
+keep_nbest_models: 5
+nbest_averaging_interval: 5     # average ckpt every 5 epochs
+use_amp: true
+num_workers: 4
+unused_parameters: false
+seed: 2024
+num_att_plot: 0     # set to 0 due to flash_attn
diff --git a/egs2/owsm_ctc_v3.1/s2t1/db.sh b/egs2/owsm_ctc_v3.1/s2t1/db.sh
new file mode 120000
index 00000000000..7a4ec1f85fa
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/db.sh
@@ -0,0 +1 @@
+../../TEMPLATE/s2t1/db.sh
\ No newline at end of file
diff --git a/egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py b/egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py
new file mode 100644
index 00000000000..1488702c929
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py
@@ -0,0 +1,61 @@
+import numpy as np
+from pathlib import Path
+
+from espnet2.text.build_tokenizer import build_tokenizer
+from espnet2.text.token_id_converter import TokenIDConverter
+
+
+def parse_owsm(
+    text_in,
+    text_out,
+    tokenizer,
+    token_id_converter,
+):
+    first_time = token_id_converter.token2id["<0.00>"]
+    last_time = token_id_converter.token2id["<30.00>"]
+
+    with open(text_in, 'r') as fin, open(
+        text_out, 'w'
+    ) as fout:
+        for line in fin:
+            utt_id, text = line.strip().split(maxsplit=1)
+
+            tokens = tokenizer.text2tokens(text)
+            text_ints = np.array(token_id_converter.tokens2ids(tokens))
+
+            text_ints = text_ints[
+                np.logical_or(
+                    text_ints < first_time,
+                    text_ints > last_time,
+                )
+            ]
+
+            tokens = token_id_converter.ids2tokens(text_ints)
+            text = tokenizer.tokens2text(tokens)
+
+            fout.write(
+                f"{utt_id} {text}\n"
+            )
+
+
+if __name__ == "__main__":
+    root = "dump/raw"
+
+    tokenizer = build_tokenizer(
+        token_type="bpe",
+        bpemodel="owsm_v3.1/s2t1/data/token_list/bpe_unigram50000/bpe.model",
+    )
+
+    token_id_converter = TokenIDConverter(
+        token_list="owsm_v3.1/s2t1/data/token_list/bpe_unigram50000/tokens.txt",
+        unk_symbol="<unk>",
+    )
+
+    for name in ['train_v3', 'dev_v3']:
+        (Path(root) / name / 'text').rename(Path(root) / name / 'text.old')
+        parse_owsm(
+            Path(root) / name / 'text.old',
+            Path(root) / name / 'text',
+            tokenizer,
+            token_id_converter,
+        )
diff --git a/egs2/owsm_ctc_v3.1/s2t1/local/path.sh b/egs2/owsm_ctc_v3.1/s2t1/local/path.sh
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/egs2/owsm_ctc_v3.1/s2t1/path.sh b/egs2/owsm_ctc_v3.1/s2t1/path.sh
new file mode 120000
index 00000000000..b4e4590282d
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/path.sh
@@ -0,0 +1 @@
+../../TEMPLATE/s2t1/path.sh
\ No newline at end of file
diff --git a/egs2/owsm_ctc_v3.1/s2t1/pyscripts b/egs2/owsm_ctc_v3.1/s2t1/pyscripts
new file mode 120000
index 00000000000..bd0b573573e
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/pyscripts
@@ -0,0 +1 @@
+../../TEMPLATE/s2t1/pyscripts
\ No newline at end of file
diff --git a/egs2/owsm_ctc_v3.1/s2t1/run.sh b/egs2/owsm_ctc_v3.1/s2t1/run.sh
new file mode 100755
index 00000000000..bfb9f81957b
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/run.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train_v3
+valid_set=dev_v3
+test_sets=dev_v3
+
+nbpe=50000
+s2t_config=conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml
+inference_config=conf/decode_s2t.yaml
+
+./s2t.sh \
+    --use_lm false \
+    --num_nodes 16 \
+    --ngpu 4 \
+    --nj 64 \
+    --gpu_inference true \
+    --inference_nj 8 \
+    --num_splits_s2t 12 \
+    --feats_type raw \
+    --audio_format flac.ark \
+    --token_type bpe \
+    --nbpe ${nbpe} \
+    --bpe_input_sentence_size 15000000 \
+    --s2t_config "${s2t_config}" \
+    --inference_config "${inference_config}" \
+    --train_set "${train_set}" \
+    --valid_set "${valid_set}" \
+    --test_sets "${test_sets}" \
+    --bpe_train_text "dump/raw/${train_set}/text" \
+    --bpe_nlsyms data/nlsyms.txt \
+    --lm_train_text "dump/raw/${train_set}/text" "$@"
diff --git a/egs2/owsm_ctc_v3.1/s2t1/s2t.sh b/egs2/owsm_ctc_v3.1/s2t1/s2t.sh
new file mode 120000
index 00000000000..1c9ecaefd54
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/s2t.sh
@@ -0,0 +1 @@
+../../TEMPLATE/s2t1/s2t.sh
\ No newline at end of file
diff --git a/egs2/owsm_ctc_v3.1/s2t1/scripts b/egs2/owsm_ctc_v3.1/s2t1/scripts
new file mode 120000
index 00000000000..741c1e33ec9
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/scripts
@@ -0,0 +1 @@
+../../TEMPLATE/s2t1/scripts
\ No newline at end of file
diff --git a/egs2/owsm_ctc_v3.1/s2t1/utils b/egs2/owsm_ctc_v3.1/s2t1/utils
new file mode 120000
index 00000000000..f02255b553a
--- /dev/null
+++ b/egs2/owsm_ctc_v3.1/s2t1/utils
@@ -0,0 +1 @@
+../../TEMPLATE/s2t1/utils
\ No newline at end of file

From fc0164b2ac50462e57aaf551c66316274ab250f0 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 21 Oct 2024 20:25:00 -0500
Subject: [PATCH 02/15] add OWSM-CTC models

---
 .../transformer/subsampling.py                |   29 +-
 .../asr/encoder/e_branchformer_ctc_encoder.py |  526 ++++++++
 espnet2/bin/s2t_ctc_align.py                  |  888 +++++++++++++
 espnet2/bin/s2t_inference_ctc.py              | 1127 +++++++++++++++++
 espnet2/s2t/espnet_ctc_model.py               |  349 +++++
 espnet2/tasks/s2t_ctc.py                      |  481 +++++++
 espnet2/train/preprocessor.py                 |  169 +++
 7 files changed, 3561 insertions(+), 8 deletions(-)
 create mode 100644 espnet2/asr/encoder/e_branchformer_ctc_encoder.py
 create mode 100755 espnet2/bin/s2t_ctc_align.py
 create mode 100644 espnet2/bin/s2t_inference_ctc.py
 create mode 100644 espnet2/s2t/espnet_ctc_model.py
 create mode 100644 espnet2/tasks/s2t_ctc.py

diff --git a/espnet/nets/pytorch_backend/transformer/subsampling.py b/espnet/nets/pytorch_backend/transformer/subsampling.py
index 0566907a84a..ab4a9eb6b82 100644
--- a/espnet/nets/pytorch_backend/transformer/subsampling.py
+++ b/espnet/nets/pytorch_backend/transformer/subsampling.py
@@ -473,17 +473,16 @@ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
             torch.nn.Conv2d(odim, odim, 3, 2),
             torch.nn.ReLU(),
         )
-        self.out = torch.nn.Sequential(
-            torch.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim),
-            pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate),
-        )
+        self.out = torch.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
+        self.pos_enc = pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
 
-    def forward(self, x, x_mask):
+    def forward(self, x, x_mask, prefix_embeds=None):
         """Subsample x.
 
         Args:
             x (torch.Tensor): Input tensor (#batch, time, idim).
             x_mask (torch.Tensor): Input mask (#batch, 1, time).
+            prefix_embeds (torch.Tensor or None): Prefix token embeddings (#batch, prefix_len, odim).
 
         Returns:
             torch.Tensor: Subsampled tensor (#batch, time', odim),
@@ -496,6 +495,20 @@ def forward(self, x, x_mask):
         x = self.conv(x)
         b, c, t, f = x.size()
         x = self.out(x.transpose(1, 2).contiguous().view(b, t, c * f))
-        if x_mask is None:
-            return x, None
-        return x, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
+        if x_mask is not None:
+            x_mask = x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
+
+        if prefix_embeds is not None:
+            x = torch.cat([prefix_embeds, x], dim=1)
+            if x_mask is not None:
+                x_mask = torch.cat(
+                    [
+                        torch.ones(x_mask.shape[0], 1, prefix_embeds.size(1), dtype=x_mask.dtype, device=x_mask.device),
+                        x_mask
+                    ],
+                    dim=-1,
+                )
+
+        x = self.pos_enc(x)
+
+        return x, x_mask
diff --git a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
new file mode 100644
index 00000000000..971c4cf18a3
--- /dev/null
+++ b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
@@ -0,0 +1,526 @@
+"""E-Branchformer encoder used by OWSM-CTC.
+
+Compared to the original encoder, this variant supports additional
+cross-attention modules.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+import torch
+from typeguard import typechecked
+
+from espnet2.asr.ctc import CTC
+from espnet2.asr.encoder.abs_encoder import AbsEncoder
+from espnet2.asr.layers.cgmlp import ConvolutionalGatingMLP
+from espnet2.asr.layers.fastformer import FastSelfAttention
+from espnet.nets.pytorch_backend.nets_utils import get_activation, make_pad_mask
+from espnet.nets.pytorch_backend.transformer.attention import (  # noqa: H301
+    LegacyRelPositionMultiHeadedAttention,
+    MultiHeadedAttention,
+    RelPositionMultiHeadedAttention,
+)
+from espnet.nets.pytorch_backend.transformer.embedding import (  # noqa: H301
+    LegacyRelPositionalEncoding,
+    PositionalEncoding,
+    RelPositionalEncoding,
+    ScaledPositionalEncoding,
+)
+from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
+from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import (
+    PositionwiseFeedForward,
+)
+from espnet.nets.pytorch_backend.transformer.repeat import repeat
+from espnet.nets.pytorch_backend.transformer.subsampling import (
+    Conv1dSubsampling1,
+    Conv1dSubsampling2,
+    Conv1dSubsampling3,
+    Conv2dSubsampling,
+    Conv2dSubsampling1,
+    Conv2dSubsampling2,
+    Conv2dSubsampling6,
+    Conv2dSubsampling8,
+    TooShortUttError,
+    check_short_utt,
+)
+
+
+class EBranchformerEncoderLayer(torch.nn.Module):
+    """E-Branchformer encoder layer module.
+
+    Args:
+        size (int): model dimension
+        attn: standard self-attention or efficient attention
+        cgmlp: ConvolutionalGatingMLP
+        feed_forward: feed-forward module, optional
+        feed_forward_macaron: macaron-style feed-forward module, optional
+        cross_attn: cross attention module
+        dropout_rate (float): dropout probability
+        merge_conv_kernel (int): kernel size of the depth-wise conv in merge module
+    """
+
+    def __init__(
+        self,
+        size: int,
+        attn: torch.nn.Module,
+        cgmlp: torch.nn.Module,
+        feed_forward: Optional[torch.nn.Module],
+        feed_forward_macaron: Optional[torch.nn.Module],
+        cross_attn: Optional[torch.nn.Module],
+        dropout_rate: float,
+        merge_conv_kernel: int = 3,
+    ):
+        super().__init__()
+
+        self.size = size
+        self.attn = attn
+        self.cgmlp = cgmlp
+
+        self.feed_forward = feed_forward
+        self.feed_forward_macaron = feed_forward_macaron
+        self.ff_scale = 1.0
+        if self.feed_forward is not None:
+            self.norm_ff = LayerNorm(size)
+        if self.feed_forward_macaron is not None:
+            self.ff_scale = 0.5
+            self.norm_ff_macaron = LayerNorm(size)
+
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        self.norm_mlp = LayerNorm(size)  # for the MLP module
+        self.norm_final = LayerNorm(size)  # for the final output of the block
+
+        # for cross attention
+        self.cross_attn = cross_attn
+        if self.cross_attn is not None:
+            self.norm_cross_attn = LayerNorm(size)
+
+        self.dropout = torch.nn.Dropout(dropout_rate)
+
+        self.depthwise_conv_fusion = torch.nn.Conv1d(
+            size + size,
+            size + size,
+            kernel_size=merge_conv_kernel,
+            stride=1,
+            padding=(merge_conv_kernel - 1) // 2,
+            groups=size + size,
+            bias=True,
+        )
+        self.merge_proj = torch.nn.Linear(size + size, size)
+
+    def forward(self, x_input, mask, cache=None, memory=None, memory_mask=None,):
+        """Compute encoded features.
+
+        Args:
+            x_input (Union[Tuple, torch.Tensor]): Input tensor (#batch, time, size),
+                or a tuple of that tensor and its pos emb (1, time, size).
+            mask (torch.Tensor): Mask tensor for the input (#batch, 1, time).
+            cache (torch.Tensor): Must be None; a non-None cache raises NotImplementedError.
+            memory, memory_mask: Given to cross_attn as (x, memory, memory, memory_mask); unused if memory is None.
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, size).
+            torch.Tensor: Mask tensor, returned unchanged (#batch, 1, time).
+        """
+
+        if cache is not None:
+            raise NotImplementedError("cache is not None, which is not tested")
+
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+
+        if self.feed_forward_macaron is not None:
+            residual = x
+            x = self.norm_ff_macaron(x)
+            x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
+
+        # Two branches
+        x1 = x
+        x2 = x
+
+        # Branch 1: multi-headed attention module
+        x1 = self.norm_mha(x1)
+
+        if isinstance(self.attn, FastSelfAttention):
+            x_att = self.attn(x1, mask)
+        else:
+            if pos_emb is not None:
+                x_att = self.attn(x1, x1, x1, pos_emb, mask)
+            else:
+                x_att = self.attn(x1, x1, x1, mask)
+
+        x1 = self.dropout(x_att)
+
+        # Branch 2: convolutional gating mlp
+        x2 = self.norm_mlp(x2)
+
+        if pos_emb is not None:
+            x2 = (x2, pos_emb)
+        x2 = self.cgmlp(x2, mask)
+        if isinstance(x2, tuple):
+            x2 = x2[0]
+
+        x2 = self.dropout(x2)
+
+        # Merge two branches
+        x_concat = torch.cat([x1, x2], dim=-1)
+        x_tmp = x_concat.transpose(1, 2)
+        x_tmp = self.depthwise_conv_fusion(x_tmp)
+        x_tmp = x_tmp.transpose(1, 2)
+        x = x + self.dropout(self.merge_proj(x_concat + x_tmp))
+
+        if self.feed_forward is not None:
+            # feed forward module
+            residual = x
+            x = self.norm_ff(x)
+            x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
+
+        # Cross attention
+        if self.cross_attn is not None and memory is not None:
+            residual = x
+            x = self.norm_cross_attn(x)
+            x = residual + self.dropout(self.cross_attn(x, memory, memory, memory_mask))
+
+        x = self.norm_final(x)
+
+        if pos_emb is not None:
+            return (x, pos_emb), mask
+
+        return x, mask
+
+
+class EBranchformerCTCEncoder(AbsEncoder):
+    """E-Branchformer encoder module."""
+
+    @typechecked
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int = 256,
+        attention_heads: int = 4,
+        attention_layer_type: str = "rel_selfattn",
+        pos_enc_layer_type: str = "rel_pos",
+        rel_pos_type: str = "latest",
+        cgmlp_linear_units: int = 2048,
+        cgmlp_conv_kernel: int = 31,
+        use_linear_after_conv: bool = False,
+        gate_activation: str = "identity",
+        num_blocks: int = 12,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        input_layer: Optional[str] = "conv2d",
+        zero_triu: bool = False,
+        padding_idx: int = -1,
+        layer_drop_rate: float = 0.0,
+        max_pos_emb_len: int = 5000,
+        use_ffn: bool = False,
+        macaron_ffn: bool = False,
+        ffn_activation_type: str = "swish",
+        linear_units: int = 2048,
+        positionwise_layer_type: str = "linear",
+        merge_conv_kernel: int = 3,
+        interctc_layer_idx=None,
+        interctc_use_conditioning: bool = False,
+        use_cross_attention=True,   # bool or list of bool
+        use_flash_attn: bool = False,
+    ):
+        super().__init__()
+        self._output_size = output_size
+
+        if rel_pos_type == "legacy":
+            if pos_enc_layer_type == "rel_pos":
+                pos_enc_layer_type = "legacy_rel_pos"
+            if attention_layer_type == "rel_selfattn":
+                attention_layer_type = "legacy_rel_selfattn"
+        elif rel_pos_type == "latest":
+            assert attention_layer_type != "legacy_rel_selfattn"
+            assert pos_enc_layer_type != "legacy_rel_pos"
+        else:
+            raise ValueError("unknown rel_pos_type: " + rel_pos_type)
+
+        if pos_enc_layer_type == "abs_pos":
+            pos_enc_class = PositionalEncoding
+        elif pos_enc_layer_type == "scaled_abs_pos":
+            pos_enc_class = ScaledPositionalEncoding
+        elif pos_enc_layer_type == "rel_pos":
+            assert attention_layer_type == "rel_selfattn"
+            pos_enc_class = RelPositionalEncoding
+        elif pos_enc_layer_type == "legacy_rel_pos":
+            assert attention_layer_type == "legacy_rel_selfattn"
+            pos_enc_class = LegacyRelPositionalEncoding
+            logging.warning(
+                "Using legacy_rel_pos and it will be deprecated in the future."
+            )
+        else:
+            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
+
+        if input_layer == "linear":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Linear(input_size, output_size),
+                torch.nn.LayerNorm(output_size),
+                torch.nn.Dropout(dropout_rate),
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv1d1":
+            self.embed = Conv1dSubsampling1(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv1d2":
+            self.embed = Conv1dSubsampling2(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv1d3":
+            self.embed = Conv1dSubsampling3(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv2d":
+            self.embed = Conv2dSubsampling(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv2d1":
+            self.embed = Conv2dSubsampling1(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv2d2":
+            self.embed = Conv2dSubsampling2(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv2d6":
+            self.embed = Conv2dSubsampling6(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "conv2d8":
+            self.embed = Conv2dSubsampling8(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer == "embed":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif isinstance(input_layer, torch.nn.Module):
+            self.embed = torch.nn.Sequential(
+                input_layer,
+                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
+            )
+        elif input_layer is None:
+            if input_size == output_size:
+                self.embed = torch.nn.Sequential(
+                    pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len)
+                )
+            else:
+                self.embed = torch.nn.Linear(input_size, output_size)
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+
+        activation = get_activation(ffn_activation_type)
+        if positionwise_layer_type == "linear":
+            positionwise_layer = PositionwiseFeedForward
+            positionwise_layer_args = (
+                output_size,
+                linear_units,
+                dropout_rate,
+                activation,
+            )
+        elif positionwise_layer_type is None:
+            logging.warning("no macaron ffn")
+        else:
+            raise ValueError("Support only linear.")
+
+        if attention_layer_type == "selfattn":
+            encoder_selfattn_layer = MultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+                False,  # no qk_norm
+                use_flash_attn,
+            )
+        elif attention_layer_type == "legacy_rel_selfattn":
+            assert pos_enc_layer_type == "legacy_rel_pos"
+            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+            )
+            logging.warning(
+                "Using legacy_rel_selfattn and it will be deprecated in the future."
+            )
+        elif attention_layer_type == "rel_selfattn":
+            assert pos_enc_layer_type == "rel_pos"
+            encoder_selfattn_layer = RelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+                zero_triu,
+            )
+        elif attention_layer_type == "fast_selfattn":
+            assert pos_enc_layer_type in ["abs_pos", "scaled_abs_pos"]
+            encoder_selfattn_layer = FastSelfAttention
+            encoder_selfattn_layer_args = (
+                output_size,
+                attention_heads,
+                attention_dropout_rate,
+            )
+        else:
+            raise ValueError("unknown encoder_attn_layer: " + attention_layer_type)
+
+        cgmlp_layer = ConvolutionalGatingMLP
+        cgmlp_layer_args = (
+            output_size,
+            cgmlp_linear_units,
+            cgmlp_conv_kernel,
+            dropout_rate,
+            use_linear_after_conv,
+            gate_activation,
+        )
+
+        if isinstance(use_cross_attention, bool):
+            use_cross_attention = [use_cross_attention for _ in range(num_blocks)]
+        assert isinstance(use_cross_attention, list) and len(use_cross_attention) == num_blocks
+
+        self.encoders = repeat(
+            num_blocks,
+            lambda lnum: EBranchformerEncoderLayer(
+                output_size,
+                encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                cgmlp_layer(*cgmlp_layer_args),
+                positionwise_layer(*positionwise_layer_args) if use_ffn else None,
+                positionwise_layer(*positionwise_layer_args)
+                if use_ffn and macaron_ffn
+                else None,
+                MultiHeadedAttention(
+                    attention_heads,
+                    output_size,
+                    attention_dropout_rate,
+                    False,  # no qk_norm
+                    use_flash_attn,
+                    cross_attn=True,
+                ) if use_cross_attention[lnum] else None,
+                dropout_rate,
+                merge_conv_kernel,
+            ),
+            layer_drop_rate,
+        )
+        self.after_norm = LayerNorm(output_size)
+
+        if interctc_layer_idx is None:
+            interctc_layer_idx = []
+        self.interctc_layer_idx = interctc_layer_idx
+        if len(interctc_layer_idx) > 0:
+            assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
+        self.interctc_use_conditioning = interctc_use_conditioning
+        self.conditioning_layer = None
+
+    def output_size(self) -> int:
+        return self._output_size
+
+    def forward(
+        self,
+        xs_pad: torch.Tensor,
+        ilens: torch.Tensor,
+        prev_states: torch.Tensor = None,
+        ctc: CTC = None,
+        max_layer: int = None,
+        prefix_embeds: torch.tensor = None,  # (batch, 2, output_size)
+        memory=None,
+        memory_mask=None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """Calculate forward propagation.
+
+        Args:
+            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
+            ilens (torch.Tensor): Input length (#batch).
+            prev_states (torch.Tensor): Not to be used now.
+            ctc (CTC): Intermediate CTC module.
+            max_layer (int): Layer depth below which InterCTC is applied.
+        Returns:
+            torch.Tensor: Output tensor (#batch, L, output_size).
+            torch.Tensor: Output length (#batch).
+            torch.Tensor: Not to be used now.
+        """
+
+        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
+
+        if (
+            isinstance(self.embed, Conv2dSubsampling)
+            or isinstance(self.embed, Conv1dSubsampling1)
+            or isinstance(self.embed, Conv1dSubsampling2)
+            or isinstance(self.embed, Conv1dSubsampling3)
+            or isinstance(self.embed, Conv2dSubsampling1)
+            or isinstance(self.embed, Conv2dSubsampling2)
+            or isinstance(self.embed, Conv2dSubsampling6)
+            or isinstance(self.embed, Conv2dSubsampling8)
+        ):
+            short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
+            if short_status:
+                raise TooShortUttError(
+                    f"has {xs_pad.size(1)} frames and is too short for subsampling "
+                    + f"(it needs more than {limit_size} frames), return empty results",
+                    xs_pad.size(1),
+                    limit_size,
+                )
+            xs_pad, masks = self.embed(xs_pad, masks, prefix_embeds)
+        elif self.embed is not None:
+            xs_pad = self.embed(xs_pad)
+
+        intermediate_outs = []
+        for layer_idx, encoder_layer in enumerate(self.encoders):
+            xs_pad, masks = encoder_layer(xs_pad, masks, memory=memory, memory_mask=memory_mask)
+
+            if layer_idx + 1 in self.interctc_layer_idx:
+                encoder_out = xs_pad
+
+                if isinstance(encoder_out, tuple):
+                    encoder_out = encoder_out[0]
+
+                intermediate_outs.append((layer_idx + 1, encoder_out))
+
+                if self.interctc_use_conditioning:
+                    ctc_out = ctc.softmax(encoder_out)
+
+                    if isinstance(xs_pad, tuple):
+                        xs_pad = list(xs_pad)
+                        xs_pad[0] = xs_pad[0] + self.conditioning_layer(ctc_out)
+                        xs_pad = tuple(xs_pad)
+                    else:
+                        xs_pad = xs_pad + self.conditioning_layer(ctc_out)
+
+            if max_layer is not None and layer_idx >= max_layer:
+                break
+
+        if isinstance(xs_pad, tuple):
+            xs_pad = xs_pad[0]
+
+        xs_pad = self.after_norm(xs_pad)
+        olens = masks.squeeze(1).sum(1)
+        if len(intermediate_outs) > 0:
+            return (xs_pad, intermediate_outs), olens, None
+        return xs_pad, olens, None
diff --git a/espnet2/bin/s2t_ctc_align.py b/espnet2/bin/s2t_ctc_align.py
new file mode 100755
index 00000000000..cbfe969aa8c
--- /dev/null
+++ b/espnet2/bin/s2t_ctc_align.py
@@ -0,0 +1,888 @@
+#!/usr/bin/env python3
+# Copyright 2021, Ludwig Kürzinger; Kamo Naoyuki; Yifan Peng
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+"""Perform CTC segmentation to align utterances within audio files using OWSM-CTC."""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import List, Optional, TextIO, Union
+
+import numpy as np
+import soundfile
+import torch
+
+# imports for CTC segmentation
+from ctc_segmentation import (
+    CtcSegmentationParameters,
+    ctc_segmentation,
+    determine_utterance_segments,
+    prepare_text,
+    prepare_token_list,
+)
+from typeguard import typechecked
+
+from espnet2.tasks.s2t_ctc import S2TTask
+from espnet2.torch_utils.device_funcs import to_device
+from espnet2.utils import config_argparse
+from espnet2.utils.types import str2bool, str_or_none
+
+# imports for inference
+from espnet.utils.cli_utils import get_commandline_args
+
+
+class CTCSegmentationTask:
+    """Task object for CTC segmentation.
+
+    When formatted with str(·), this object returns
+    results in a kaldi-style segments file formatting.
+    The human-readable output can be configured with
+    the printing options.
+
+    Properties:
+        text: Utterance texts, separated by line, but without the utterance
+            name at the beginning of the line (as in kaldi-style text).
+        ground_truth_mat: Ground truth matrix (CTC segmentation).
+        utt_begin_indices: Utterance separator for the Ground truth matrix.
+        timings: Time marks of the corresponding chars.
+        state_list: Estimated alignment of chars/tokens.
+        segments: Calculated segments as: (start, end, confidence score).
+        config: CTC Segmentation configuration object.
+        name: Name of aligned audio file (Optional). If given, name is
+            considered when generating the text.
+        utt_ids: The list of utterance names (Optional). This list should
+            have the same length as the number of utterances.
+        lpz: CTC posterior log probabilities (Optional).
+
+    Properties for printing:
+        print_confidence_score: Includes the confidence score.
+        print_utterance_text: Includes utterance text.
+    """
+
+    text = None
+    ground_truth_mat = None
+    utt_begin_indices = None
+    timings = None
+    char_probs = None
+    state_list = None
+    segments = None
+    config = None
+    done = False
+    # Optional
+    name = "utt"
+    utt_ids = None
+    lpz = None
+    # Printing
+    print_confidence_score = True
+    print_utterance_text = True
+
+    def __init__(self, **kwargs):
+        """Initialize the module."""
+        self.set(**kwargs)
+
+    def set(self, **kwargs):
+        """Update properties.
+
+        Args:
+            **kwargs: Key-value dict that contains all properties
+                with their new values. Unknown properties are ignored.
+        """
+        for key in kwargs:
+            if (
+                not key.startswith("_")
+                and hasattr(self, key)
+                and kwargs[key] is not None
+            ):
+                setattr(self, key, kwargs[key])
+
+    def __str__(self):
+        """Return a kaldi-style ``segments`` file (string)."""
+        output = ""
+        num_utts = len(self.segments)
+        if self.utt_ids is None:
+            utt_names = [f"{self.name}_{i:04}" for i in range(num_utts)]
+        else:
+            # ensure correct mapping of segments to utterance ids
+            assert num_utts == len(self.utt_ids)
+            utt_names = self.utt_ids
+        for i, boundary in enumerate(self.segments):
+            # utterance name and file name
+            utt_entry = f"{utt_names[i]} {self.name} "
+            # segment start and end
+            utt_entry += f"{boundary[0]:.2f} {boundary[1]:.2f}"
+            # confidence score
+            if self.print_confidence_score:
+                utt_entry += f" {boundary[2]:3.4f}"
+            # utterance ground truth
+            if self.print_utterance_text:
+                utt_entry += f" {self.text[i]}"
+            output += utt_entry + "\n"
+        return output
+
+
+class CTCSegmentation:
+    """Align text to audio using CTC segmentation.
+
+    Usage:
+        Initialize with given ASR model and parameters.
+        If needed, parameters for CTC segmentation can be set with ``set_config(·)``.
+        Then call the instance as function to align text within an audio file.
+
+    Example:
+        >>> # example file included in the ESPnet repository
+        >>> import soundfile
+        >>> speech, fs = soundfile.read("test_utils/ctc_align_test.wav")
+        >>> # load an ASR model
+        >>> from espnet_model_zoo.downloader import ModelDownloader
+        >>> d = ModelDownloader()
+        >>> wsjmodel = d.download_and_unpack( "kamo-naoyuki/wsj" )
+        >>> # Apply CTC segmentation
+        >>> aligner = CTCSegmentation( **wsjmodel )
+        >>> text=["utt1 THE SALE OF THE HOTELS", "utt2 ON PROPERTY MANAGEMENT"]
+        >>> aligner.set_config( gratis_blank=True )
+        >>> segments = aligner( speech, text, fs=fs )
+        >>> print( segments )
+        utt1 utt 0.27 1.72 -0.1663 THE SALE OF THE HOTELS
+        utt2 utt 4.54 6.10 -4.9646 ON PROPERTY MANAGEMENT
+
+    On multiprocessing:
+        To parallelize the computation with multiprocessing, these three steps
+        can be separated:
+        (1) ``get_lpz``: obtain the lpz,
+        (2) ``prepare_segmentation_task``: prepare the task, and
+        (3) ``get_segments``: perform CTC segmentation.
+        Note that the function `get_segments` is a staticmethod and therefore
+        independent of an already initialized CTCSegmentation object.
+
+    References:
+        CTC-Segmentation of Large Corpora for German End-to-end Speech Recognition
+        2020, Kürzinger, Winkelbauer, Li, Watzel, Rigoll
+        https://arxiv.org/abs/2007.09127
+
+    More parameters are described in https://github.com/lumaku/ctc-segmentation
+
+    """
+
+    fs = 16000
+    samples_to_frames_ratio = None
+    time_stamps = "auto"
+    choices_time_stamps = ["auto", "fixed"]
+    text_converter = "tokenize"
+    choices_text_converter = ["tokenize", "classic"]
+    warned_about_misconfiguration = False
+    config = CtcSegmentationParameters()
+
+    @typechecked
+    def __init__(
+        self,
+        s2t_train_config: Union[Path, str] = None,
+        s2t_model_file: Union[Path, str] = None,
+        fs: int = 16000,
+        ngpu: int = 0,
+        batch_size: int = 1,
+        dtype: str = "float32",
+        kaldi_style_text: bool = True,
+        text_converter: str = "tokenize",
+        time_stamps: str = "auto",
+        lang_sym: str = "<eng>",
+        task_sym: str = "<asr>",
+        context_len_in_secs: float = 4,
+        frames_per_sec: float = 12.5,
+        **ctc_segmentation_args,
+    ):
+        """Initialize the CTCSegmentation module.
+
+        Args:
+            s2t_train_config: S2T model config file (yaml).
+            s2t_model_file: S2T model file (pth).
+            fs: Sample rate of audio file.
+            ngpu: Number of GPUs. Set 0 for processing on CPU, set to 1 for
+                processing on GPU. Multi-GPU aligning is currently not
+                implemented. Default: 0.
+            batch_size: Currently, only batch size == 1 is implemented.
+            dtype: Data type used for inference. Set dtype according to
+                the ASR model.
+            kaldi_style_text: A kaldi-style text file includes the name of the
+                utterance at the start of the line. If True, the utterance name
+                is expected as first word at each line. If False, utterance
+                names are automatically generated. Set this option according to
+                your input data. Default: True.
+            text_converter: How CTC segmentation handles text.
+                "tokenize": Use ESPnet 2 preprocessing to tokenize the text.
+                "classic": The text is preprocessed as in ESPnet 1 which takes
+                token length into account. If the ASR model has longer tokens,
+                this option may yield better results. Default: "tokenize".
+            time_stamps: Choose the method how the time stamps are
+                calculated. While both "fixed" and "auto" use the sample rate,
+                the ratio of samples to one frame is either automatically
+                determined for each inference or fixed at a certain ratio that
+                is initially determined by the module, but can be changed via
+                the parameter ``samples_to_frames_ratio``. Recommended for
+                longer audio files: "auto".
+            **ctc_segmentation_args: Parameters for CTC segmentation.
+        """
+
+        # Basic settings
+        device = "cpu"
+        if ngpu == 1:
+            device = "cuda"
+        elif ngpu > 1:
+            logging.error("Multi-GPU not yet implemented.")
+            raise NotImplementedError("Only single GPU decoding is supported")
+
+        # Prepare ASR model
+        s2t_model, s2t_train_args = S2TTask.build_model_from_file(
+            s2t_train_config, s2t_model_file, device
+        )
+        s2t_model.to(dtype=getattr(torch, dtype)).eval()
+        self.preprocess_fn = S2TTask.build_preprocess_fn(s2t_train_args, False)
+
+        # Warn for nets with high memory consumption on long audio files
+        if hasattr(s2t_model, "encoder"):
+            encoder_module = s2t_model.encoder.__class__.__module__
+        else:
+            encoder_module = "Unknown"
+        logging.info(f"Encoder module: {encoder_module}")
+        logging.info(f"CTC module:     {s2t_model.ctc.__class__.__module__}")
+
+        self.s2t_model = s2t_model
+        self.s2t_train_args = s2t_train_args
+        self.device = device
+        self.dtype = dtype
+        self.ctc = s2t_model.ctc
+
+        self.kaldi_style_text = kaldi_style_text
+        self.token_list = s2t_model.token_list
+        # Apply configuration
+        self.set_config(
+            fs=fs,
+            time_stamps=time_stamps,
+            kaldi_style_text=kaldi_style_text,
+            text_converter=text_converter,
+            **ctc_segmentation_args,
+        )
+        self.config.char_list = s2t_model.token_list
+
+        self.batch_size = batch_size
+        self.lang_sym = lang_sym
+        self.task_sym = task_sym
+        self.context_len_in_secs = context_len_in_secs
+        self.frames_per_sec = frames_per_sec
+
+    def set_config(self, **kwargs):
+        """Set CTC segmentation parameters.
+
+        Parameters for timing:
+            time_stamps: Select method how CTC index duration is estimated, and
+                thus how the time stamps are calculated.
+            fs: Sample rate.
+            samples_to_frames_ratio: If you want to directly determine the
+                ratio of samples to CTC frames, set this parameter, and
+                set ``time_stamps`` to "fixed".
+                Note: If you want to calculate the time stamps as in
+                ESPnet 1, set this parameter to:
+                ``subsampling_factor * frame_duration / 1000``.
+
+        Parameters for text preparation:
+            set_blank: Index of blank in token list. Default: 0.
+            replace_spaces_with_blanks: Inserts blanks between words, which is
+                useful for handling long pauses between words. Only used in
+                ``text_converter="classic"`` preprocessing mode. Default: False.
+            kaldi_style_text: Determines whether the utterance name is expected
+                as the first word of the utterance. Set at module initialization.
+            text_converter: How CTC segmentation handles text.
+                Set at module initialization.
+
+        Parameters for alignment:
+            min_window_size: Minimum number of frames considered for a single
+                utterance. The current default value of 8000 corresponds to
+                roughly 4 minutes (depending on ASR model) and should be OK in
+                most cases. If your utterances are further apart, increase
+                this value, or decrease it for smaller audio files.
+            max_window_size: Maximum window size. It should not be necessary
+                to change this value.
+            gratis_blank: If True, the transition cost of blank is set to zero.
+                Useful for long preambles or if there are large unrelated segments
+                between utterances. Default: False.
+
+        Parameters for calculation of confidence score:
+            scoring_length: Block length to calculate confidence score. The
+                default value of 30 should be OK in most cases.
+        """
+        # Parameters for timing
+        if "time_stamps" in kwargs:
+            if kwargs["time_stamps"] not in self.choices_time_stamps:
+                raise NotImplementedError(
+                    f"Parameter ´time_stamps´ has to be one of "
+                    f"{list(self.choices_time_stamps)}",
+                )
+            self.time_stamps = kwargs["time_stamps"]
+        if "fs" in kwargs:
+            self.fs = float(kwargs["fs"])
+        if "samples_to_frames_ratio" in kwargs:
+            self.samples_to_frames_ratio = float(kwargs["samples_to_frames_ratio"])
+        # Parameters for text preparation
+        if "set_blank" in kwargs:
+            assert isinstance(kwargs["set_blank"], int)
+            self.config.blank = kwargs["set_blank"]
+        if "replace_spaces_with_blanks" in kwargs:
+            self.config.replace_spaces_with_blanks = bool(
+                kwargs["replace_spaces_with_blanks"]
+            )
+        if "kaldi_style_text" in kwargs:
+            assert isinstance(kwargs["kaldi_style_text"], bool)
+            self.kaldi_style_text = kwargs["kaldi_style_text"]
+        if "text_converter" in kwargs:
+            if kwargs["text_converter"] not in self.choices_text_converter:
+                raise NotImplementedError(
+                    f"Parameter ´text_converter´ has to be one of "
+                    f"{list(self.choices_text_converter)}",
+                )
+            self.text_converter = kwargs["text_converter"]
+        # Parameters for alignment
+        if "min_window_size" in kwargs:
+            assert isinstance(kwargs["min_window_size"], int)
+            self.config.min_window_size = kwargs["min_window_size"]
+        if "max_window_size" in kwargs:
+            assert isinstance(kwargs["max_window_size"], int)
+            self.config.max_window_size = kwargs["max_window_size"]
+        if "gratis_blank" in kwargs:
+            self.config.blank_transition_cost_zero = bool(kwargs["gratis_blank"])
+        if (
+            self.config.blank_transition_cost_zero
+            and self.config.replace_spaces_with_blanks
+            and not self.warned_about_misconfiguration
+        ):
+            logging.error(
+                "Blanks are inserted between words, and also the transition cost of"
+                " blank is zero. This configuration may lead to misalignments!"
+            )
+            self.warned_about_misconfiguration = True
+        # Parameter for calculation of confidence score
+        if "scoring_length" in kwargs:
+            assert isinstance(kwargs["scoring_length"], int)
+            self.config.score_min_mean_over_L = kwargs["scoring_length"]
+
+    def get_timing_config(self, speech_len=None, lpz_len=None):
+        """Obtain parameters to determine time stamps."""
+        timing_cfg = {
+            "index_duration": self.config.index_duration,
+        }
+        # As the parameter ctc_index_duration vetoes the other
+        if self.time_stamps == "fixed":
+            # Initialize the value, if not yet available
+            if self.samples_to_frames_ratio is None:
+                ratio = self.estimate_samples_to_frames_ratio()
+                self.samples_to_frames_ratio = ratio
+            index_duration = self.samples_to_frames_ratio / self.fs
+        else:
+            assert self.time_stamps == "auto"
+            samples_to_frames_ratio = speech_len / lpz_len
+            index_duration = samples_to_frames_ratio / self.fs
+        timing_cfg["index_duration"] = index_duration
+        return timing_cfg
+
+    def estimate_samples_to_frames_ratio(self, speech_len=215040):
+        """Determine the ratio of sample points to encoded frames.
+
+        This method helps to determine the time a single encoded frame occupies.
+        As the sample rate already gave the number of samples, only the ratio
+        of samples per encoded CTC frame is needed. This function estimates it by
+        doing one inference, which is only needed once.
+
+        Args:
+            speech_len: Length of randomly generated speech vector for single
+                inference. Default: 215040.
+
+        Returns:
+            samples_to_frames_ratio: Estimated ratio.
+        """
+        random_input = torch.rand(speech_len)
+        lpz = self.get_lpz(random_input)
+        lpz_len = lpz.shape[0]
+        # Most frontends (DefaultFrontend, SlidingWindow) discard trailing data
+        lpz_len = lpz_len + 1
+        samples_to_frames_ratio = speech_len // lpz_len
+        return samples_to_frames_ratio
+
+    @torch.no_grad()
+    def get_lpz(self, speech: Union[torch.Tensor, np.ndarray]):
+        """Obtain CTC posterior log probabilities for given speech data.
+
+        Args:
+            speech: Speech audio input.
+
+        Returns:
+            lpz: Numpy array with CTC log posterior probabilities, shaped (time, V).
+        """
+
+        lang_id = self.token_list.index(self.lang_sym)
+        task_id = self.token_list.index(self.task_sym)
+        context_len_in_secs = self.context_len_in_secs
+        sample_rate = self.fs
+        frames_per_sec = self.frames_per_sec
+        batch_size = self.batch_size
+
+        buffer_len_in_secs = self.s2t_train_args.preprocessor_conf["speech_length"]
+        chunk_len_in_secs = buffer_len_in_secs - 2 * context_len_in_secs
+        buffer_len = int(sample_rate * buffer_len_in_secs)
+        chunk_len = int(sample_rate * chunk_len_in_secs)
+
+        speech = np.pad(speech, (int(sample_rate * context_len_in_secs), int(sample_rate * context_len_in_secs)))
+        buffer_list = []
+        for i in range(0, len(speech), chunk_len):
+            cur_buffer = speech[i:i+buffer_len]
+            if len(cur_buffer) < buffer_len:
+                buffer_list.append(
+                    np.pad(cur_buffer, (0, buffer_len - len(cur_buffer)))
+                )
+                break
+            else:
+                buffer_list.append(cur_buffer)
+
+        speech = torch.tensor(np.array(buffer_list)).to(getattr(torch, self.dtype))
+        buffer_frames = int(frames_per_sec * buffer_len_in_secs)
+        context_frames = int(frames_per_sec * context_len_in_secs)
+
+        unmerged = []
+        for idx in range(0, speech.size(0), batch_size):
+            cur_speech = speech[idx : idx + batch_size]
+            cur_speech_lengths = cur_speech.new_full(
+                [cur_speech.size(0)], dtype=torch.long, fill_value=cur_speech.size(1)
+            )
+
+            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            text_prev_lengths = text_prev.new_full(
+                [cur_speech.size(0)], dtype=torch.long, fill_value=text_prev.size(1)
+            )
+
+            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            prefix_lengths = prefix.new_full(
+                [cur_speech.size(0)], dtype=torch.long, fill_value=prefix.size(-1)
+            )
+
+            batch = {
+                "speech": cur_speech,
+                "speech_lengths": cur_speech_lengths,
+                "text_prev": text_prev,
+                "text_prev_lengths": text_prev_lengths,
+                "prefix": prefix,
+                "prefix_lengths": prefix_lengths,
+            }
+
+            # a. To device
+            batch = to_device(batch, device=self.device)
+
+            # b. Forward Encoder
+            enc, enc_olens = self.s2t_model.encode(**batch)
+
+            intermediate_outs = None
+            if isinstance(enc, tuple):
+                enc, intermediate_outs = enc
+
+            # enc: (B, T, D)
+            enc = enc[:, :buffer_frames]    # NOTE(yifan): IMPORTANT: it might be longer due to padding in conv
+            batched_log_p = self.ctc.log_softmax(enc).detach()      # (B, T, V)
+            valid_log_p = batched_log_p[:, context_frames : -context_frames].reshape(-1, batched_log_p.size(-1))    # (T', V)
+            unmerged.append(valid_log_p)
+
+        lpz = torch.cat(unmerged, dim=0).cpu().numpy()   # (time, V)
+        return lpz
+
+    def _split_text(self, text):
+        """Convert text to a list of utterances and extract utterance IDs.
+
+        Returns ``(utt_ids, text)``; ``utt_ids`` is None for non-kaldi text.
+        """
+        utt_ids = None
+        if isinstance(text, str):
+            text = text.splitlines()
+        # Remove empty lines
+        text = list(filter(len, text))
+        if self.kaldi_style_text:
+            # Split at any whitespace run: handles tabs; an ID without text
+            # yields a single field and is removed as an empty utterance.
+            pairs = [utt.split(maxsplit=1) for utt in text]
+            utt_ids = [pair[0] for pair in pairs if len(pair) == 2]
+            text = [pair[1] for pair in pairs if len(pair) == 2]
+        return utt_ids, text
+
+    def prepare_segmentation_task(self, text, lpz, name=None, speech_len=None):
+        """Bundle transcript, CTC posteriors and config into a task object.
+
+        The transcript is split into utterances, tokenized according to the
+        ``text_converter`` setting, and stored together with ``lpz`` and the
+        configuration in a CTCSegmentationTask. If ``speech_len`` is given,
+        the timing configuration is updated first. The returned object can
+        be serialized and handed to a multiprocessing computation.
+
+        Only minimal text processing happens here: ``text`` is split into a
+        list of utterances and ``text_cleaner`` is applied. Normalize the
+        text beforehand, e.g., spell out numbers, remove special characters,
+        and map UTF-8 characters to the symbols of your ASR model dictionary.
+
+        Tokenization depends on the ``text_converter`` setting:
+
+        "tokenize" is the more efficient method and the easiest for models
+        based on latin or cyrillic script that only contain the main chars,
+        ["a", "b", ...], or for Japanese or Chinese ASR models with ~3000
+        short Kanji / Hanzi tokens.
+
+        "classic" improves the accuracy of the alignments for models that
+        contain longer tokens, at a higher computational cost. It scans for
+        partial tokens, which may improve the time resolution. For example,
+        the word "▁really" is broken down into
+        ``['▁', '▁r', '▁re', '▁real', '▁really']``. The alignment is then
+        based on the most probable activation sequence given by the network.
+
+        Args:
+            text: List or multiline string with the utterance ground truths.
+            lpz: Log CTC posterior probabilities obtained from the CTC
+                network; numpy array shaped as ( <time steps>, <classes> ).
+            name: Audio file name. Choose a unique name, or the original
+                audio file name, to tell multiple audio files apart.
+                Default: None.
+            speech_len: Number of sample points. If given, the timing
+                configuration is derived from fs, the length of speech and
+                the length of lpz. If None, make sure that the timing
+                parameters are correct, see time_stamps for reference!
+                Default: None.
+
+        Returns:
+            task: CTCSegmentationTask object that can be passed to
+                ``get_segments()`` in order to obtain alignments.
+        """
+        cfg = self.config
+        if speech_len is not None:
+            # Derive the timing parameters from the audio and lpz lengths
+            timing_cfg = self.get_timing_config(speech_len, lpz.shape[0])
+            cfg.set(**timing_cfg)
+        # Split the transcript into a list of utterances (and IDs, if any)
+        utt_ids, text = self._split_text(text)
+        # Obtain utterance & label sequence from text
+        if self.text_converter != "tokenize":
+            assert self.text_converter == "classic"
+            # clean each utterance before tokenizing it
+            text = [self.preprocess_fn.text_cleaner(utt) for utt in text]
+            token_strings = [
+                "".join(self.preprocess_fn.tokenizer.text2tokens(utt)) for utt in text
+            ]
+            # drop any <unk> symbols from the joined token strings
+            token_strings = [utt.replace("<unk>", "") for utt in token_strings]
+            ground_truth_mat, utt_begin_indices = prepare_text(cfg, token_strings)
+        else:
+            # list of str --tokenize--> list of np.array
+            token_ids = [
+                self.preprocess_fn("<dummy>", {"text": utt})["text"] for utt in text
+            ]
+            # drop any instances of the <unk> token ID
+            unk_id = cfg.char_list.index("<unk>")
+            token_ids = [ids[ids != unk_id] for ids in token_ids]
+            ground_truth_mat, utt_begin_indices = prepare_token_list(cfg, token_ids)
+        # Collect everything that get_segments() needs
+        task = CTCSegmentationTask(
+            config=cfg,
+            name=name,
+            text=text,
+            ground_truth_mat=ground_truth_mat,
+            utt_begin_indices=utt_begin_indices,
+            utt_ids=utt_ids,
+            lpz=lpz,
+        )
+        return task
+
+    @staticmethod
+    def get_segments(task: CTCSegmentationTask):
+        """Run CTC segmentation on a prepared task.
+
+        The ground truth is aligned to the CTC log posteriors, and the
+        time interval and confidence score of each utterance are derived.
+
+        Args:
+            task: CTCSegmentationTask object holding the configuration, the
+                ground truth and the CTC log posteriors.
+
+        Returns:
+            result: Dictionary with the keys "name", "timings", "char_probs",
+                "state_list", "segments" and "done". Combine it with the
+                task object to obtain a human-readable segments
+                representation.
+        """
+        assert task.config is not None
+        cfg = task.config
+        # Align the ground truth to the CTC posteriors
+        timings, char_probs, state_list = ctc_segmentation(
+            cfg, task.lpz, task.ground_truth_mat
+        )
+        # Derive per-utterance time intervals and confidence scores
+        segments = determine_utterance_segments(
+            cfg, task.utt_begin_indices, char_probs, timings, task.text
+        )
+        # Store results
+        return {
+            "name": task.name,
+            "timings": timings,
+            "char_probs": char_probs,
+            "state_list": state_list,
+            "segments": segments,
+            "done": True,
+        }
+
+    @typechecked
+    def __call__(
+        self,
+        speech: Union[torch.Tensor, np.ndarray],
+        text: Union[List[str], str],
+        fs: Optional[int] = None,
+        name: Optional[str] = None,
+    ) -> CTCSegmentationTask:
+        """Align the utterances in ``text`` to the audio in ``speech``.
+
+        Args:
+            speech: Audio samples; the size of the first dimension is
+                taken as the number of sample points.
+            text: List or multiline string with the utterance ground truths.
+            fs: Sample rate in Hz. Optional, as it can also be given when
+                the module is initialized; if given, the configuration is
+                updated before aligning.
+            name: Name of the file. Utterance names are derived from it.
+
+        Returns:
+            CTCSegmentationTask object with segments.
+        """
+        if fs is not None:
+            self.set_config(fs=fs)
+        # Compute the log CTC posterior probabilities of the audio
+        log_posteriors = self.get_lpz(speech)
+        # Gather text, posteriors and config in a segmentation task object
+        task = self.prepare_segmentation_task(
+            text, log_posteriors, name=name, speech_len=speech.shape[0]
+        )
+        # Align, then store the alignment results in the task itself
+        task.set(**self.get_segments(task))
+        return task
+
+
+@typechecked
+def ctc_align(
+    log_level: Union[int, str],
+    s2t_train_config: str,
+    s2t_model_file: str,
+    audio: Path,
+    text: TextIO,
+    output: TextIO,
+    print_utt_text: bool = True,
+    print_utt_score: bool = True,
+    **kwargs,
+):
+    """Provide the scripting interface to align text to audio.
+
+    The segments of the aligned transcript are written to ``output``.
+    """
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+
+    # Configuration values that the parser left at None are not passed on.
+    options = {key: val for key, val in kwargs.items() if val is not None}
+
+    # Build the CTC segmentation module from the model files
+    aligner = CTCSegmentation(
+        s2t_train_config=s2t_train_config,
+        s2t_model_file=s2t_model_file,
+        **options,
+    )
+
+    # Load the audio and the transcript; the file stem names the recording
+    assert audio.name != ""
+    speech, fs = soundfile.read(str(audio))
+    transcripts = text.read()
+
+    # Perform inference and CTC segmentation
+    task = aligner(speech=speech, text=transcripts, fs=fs, name=audio.stem)
+
+    # Write to the "segments" file or stdout
+    task.print_utterance_text = print_utt_text
+    task.print_confidence_score = print_utt_score
+    output.write(str(task))
+
+
+def get_parser():
+    """Build the argument parser for the CTC alignment command-line interface."""
+    parser = config_argparse.ArgumentParser(
+        description="CTC alignment",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # Note(kamo): Use '_' instead of '-' as separator.
+    # '-' is confusing if written in yaml.
+    parser.add_argument(
+        "--log_level",
+        type=lambda x: x.upper(),
+        default="INFO",
+        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+        help="The verbose level of logging",
+    )
+
+    parser.add_argument(
+        "--ngpu",
+        type=int,
+        default=0,
+        help="The number of gpus. 0 indicates CPU mode",
+    )
+    parser.add_argument(
+        "--dtype",
+        default="float32",
+        choices=["float16", "float32", "float64"],
+        help="Data type",
+    )
+
+    group = parser.add_argument_group("Model configuration related")
+    group.add_argument("--s2t_train_config", type=str, required=True)
+    group.add_argument("--s2t_model_file", type=str, required=True)
+
+    group = parser.add_argument_group("Text converter related")
+    group.add_argument(
+        "--token_type",
+        type=str_or_none,
+        default=None,
+        choices=["char", "bpe", None],
+        help="The token type for ASR model. "
+        "If not given, refers from the training args",
+    )
+    group.add_argument(
+        "--bpemodel",
+        type=str_or_none,
+        default=None,
+        help="The model path of sentencepiece. "
+        "If not given, refers from the training args",
+    )
+
+    group = parser.add_argument_group("CTC segmentation related")
+    group.add_argument(
+        "--fs",
+        type=int,
+        default=16000,
+        help="Sampling Frequency."
+        " The sampling frequency (in Hz) is needed to correctly determine the"
+        " starting and ending time of aligned segments.",
+    )
+    group.add_argument(
+        "--min_window_size",
+        type=int,
+        default=None,
+        help="Minimum window size considered for utterance.",
+    )
+    group.add_argument(
+        "--max_window_size",
+        type=int,
+        default=None,
+        help="Maximum window size considered for utterance.",
+    )
+    group.add_argument(
+        "--set_blank",
+        type=int,
+        default=None,
+        help="Index of model dictionary for blank token.",
+    )
+    group.add_argument(
+        "--gratis_blank",
+        type=str2bool,
+        default=False,
+        help="Set the transition cost of the blank token to zero. Audio sections"
+        " labeled with blank tokens can then be skipped without penalty. Useful"
+        " if there are unrelated audio segments between utterances.",
+    )
+    group.add_argument(
+        "--replace_spaces_with_blanks",
+        type=str2bool,
+        default=False,
+        help="Fill blanks in between words to better model pauses between words."
+        " This option is only active for `--text_converter classic`."
+        " Segments can be misaligned if this option is combined with"
+        " --gratis_blank.",
+    )
+    group.add_argument(
+        "--scoring_length",
+        type=int,
+        default=None,
+        help="Changes partitioning length L for calculation of the confidence score.",
+    )
+    group.add_argument(
+        "--time_stamps",
+        type=str,
+        default=CTCSegmentation.time_stamps,
+        choices=CTCSegmentation.choices_time_stamps,
+        help="Select method how CTC index duration is estimated, and"
+        " thus how the time stamps are calculated.",
+    )
+    group.add_argument(
+        "--text_converter",
+        type=str,
+        default=CTCSegmentation.text_converter,
+        choices=CTCSegmentation.choices_text_converter,
+        help="How CTC segmentation handles text.",
+    )
+
+    group = parser.add_argument_group("Input/output arguments")
+    group.add_argument(
+        "--kaldi_style_text",
+        type=str2bool,
+        default=True,
+        help="Assume that the input text file is kaldi-style formatted, i.e., the"
+        " utterance name is at the beginning of each line.",
+    )
+    group.add_argument(
+        "--print_utt_text",
+        type=str2bool,
+        default=True,
+        help="Include the utterance text in the segments output.",
+    )
+    group.add_argument(
+        "--print_utt_score",
+        type=str2bool,
+        default=True,
+        help="Include the confidence score in the segments output.",
+    )
+    group.add_argument(
+        "-a",
+        "--audio",
+        type=Path,
+        required=True,
+        help="Input audio file.",
+    )
+    group.add_argument(
+        "-t",
+        "--text",
+        type=argparse.FileType("r"),
+        required=True,
+        help="Input text file."
+        " Each line contains the ground truth of a single utterance."
+        " Kaldi-style text files include the name of the utterance as"
+        " the first word in the line.",
+    )
+    group.add_argument(
+        "-o",
+        "--output",
+        type=argparse.FileType("w"),
+        default="-",
+        help="Output in the form of a `segments` file."
+        " If not given, output is written to stdout.",
+    )
+    return parser
+
+
+def main(cmd=None):
+    """Parse the command line (or ``cmd``) and run ctc_align() with the result."""
+    print(get_commandline_args(), file=sys.stderr)
+    args = get_parser().parse_args(cmd)
+    # Drop the "config" entry, if any, before forwarding the options
+    options = vars(args)
+    options.pop("config", None)
+    ctc_align(**options)
+
+
+if __name__ == "__main__":  # only when run as a script, not on import
+    main()
diff --git a/espnet2/bin/s2t_inference_ctc.py b/espnet2/bin/s2t_inference_ctc.py
new file mode 100644
index 00000000000..ea70ba3e81c
--- /dev/null
+++ b/espnet2/bin/s2t_inference_ctc.py
@@ -0,0 +1,1127 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import sys
+from itertools import groupby
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+import torch.quantization
+from typeguard import typechecked
+
+from espnet2.fileio.datadir_writer import DatadirWriter
+from espnet2.tasks.lm import LMTask
+from espnet2.tasks.s2t_ctc import S2TTask
+from espnet2.text.build_tokenizer import build_tokenizer
+from espnet2.text.token_id_converter import TokenIDConverter
+from espnet2.text.whisper_token_id_converter import OpenAIWhisperTokenIDConverter
+from espnet2.torch_utils.device_funcs import to_device
+from espnet2.torch_utils.set_all_random_seed import set_all_random_seed
+from espnet2.utils import config_argparse
+from espnet2.utils.types import str2bool, str2triple_str, str_or_none
+from espnet.nets.batch_beam_search import BatchBeamSearch
+from espnet.nets.batch_beam_search_online_sim import BatchBeamSearchOnlineSim
+from espnet.nets.beam_search import BeamSearch, Hypothesis
+from espnet.nets.pytorch_backend.transformer.subsampling import TooShortUttError
+from espnet.nets.scorer_interface import BatchScorerInterface
+from espnet.nets.scorers.ctc import CTCPrefixScorer
+from espnet.nets.scorers.length_bonus import LengthBonus
+from espnet.utils.cli_utils import get_commandline_args
+
+# Alias for typing: list of (text, token, token_int, text_nospecial, hyp)
+ListOfHypothesis = List[
+    Tuple[
+        Optional[str],
+        List[str],
+        List[int],
+        Optional[str],
+        Optional[Hypothesis],
+    ]
+]
+
+
+class Speech2Text:
+    """Speech2Text class
+
+    Examples:
+        >>> import soundfile
+        >>> speech2text = Speech2Text("s2t_config.yml", "s2t.pth")
+        >>> audio, rate = soundfile.read("speech.wav")
+        >>> speech2text(audio)
+        [(text, token, token_int, text_nospecial, hypothesis object), ...]
+
+    """
+
+    @typechecked
+    def __init__(
+        self,
+        s2t_train_config: Union[Path, str, None] = None,
+        s2t_model_file: Union[Path, str, None] = None,
+        lm_train_config: Union[Path, str, None] = None,
+        lm_file: Union[Path, str, None] = None,
+        ngram_scorer: str = "full",
+        ngram_file: Union[Path, str, None] = None,
+        token_type: Optional[str] = None,
+        bpemodel: Optional[str] = None,
+        device: str = "cpu",
+        maxlenratio: float = 0.0,
+        minlenratio: float = 0.0,
+        batch_size: int = 1,
+        dtype: str = "float32",
+        beam_size: int = 20,
+        lm_weight: float = 0.0,
+        ngram_weight: float = 0.0,
+        penalty: float = 0.0,
+        nbest: int = 1,
+        streaming: bool = False,
+        quantize_s2t_model: bool = False,
+        quantize_lm: bool = False,
+        quantize_modules: List[str] = ["Linear"],
+        quantize_dtype: str = "qint8",
+        lang_sym: str = "<nolang>",
+        task_sym: str = "<asr>",
+        use_flash_attn: bool = False,
+        generate_interctc_outputs: bool = False,
+    ):
+        """Load the S2T model and set up CTC beam search for decoding.
+
+        LM / n-gram scorers are added if ``lm_train_config`` / ``ngram_file``
+        is given. ``lang_sym`` and ``task_sym`` are defaults for ``__call__``.
+        """
+        quantize_modules = set([getattr(torch.nn, q) for q in quantize_modules])
+        quantize_dtype = getattr(torch, quantize_dtype)
+
+        # 1. Build S2T model
+        s2t_model, s2t_train_args = S2TTask.build_model_from_file(
+            s2t_train_config, s2t_model_file, device
+        )
+        s2t_model.to(dtype=getattr(torch, dtype)).eval()
+
+        # Set flash_attn
+        for m in s2t_model.modules():
+            if hasattr(m, "use_flash_attn"):
+                setattr(m, "use_flash_attn", use_flash_attn)
+
+        if quantize_s2t_model:
+            logging.info("Use quantized s2t model for decoding.")
+            s2t_model = torch.quantization.quantize_dynamic(
+                s2t_model, qconfig_spec=quantize_modules, dtype=quantize_dtype
+            )
+
+        ctc = CTCPrefixScorer(ctc=s2t_model.ctc, eos=s2t_model.eos)
+        token_list = s2t_model.token_list
+        scorers = dict(
+            decoder=None,
+            ctc=ctc,
+            length_bonus=LengthBonus(len(token_list)),
+        )
+
+        # 2. Build language model
+        if lm_train_config is not None:
+            lm, lm_train_args = LMTask.build_model_from_file(
+                lm_train_config, lm_file, device
+            )
+
+            if quantize_lm:
+                logging.info("Use quantized lm for decoding.")
+
+                lm = torch.quantization.quantize_dynamic(
+                    lm, qconfig_spec=quantize_modules, dtype=quantize_dtype
+                )
+
+            scorers["lm"] = lm.lm
+
+        # 3. Build ngram model
+        if ngram_file is not None:
+            if ngram_scorer == "full":
+                from espnet.nets.scorers.ngram import NgramFullScorer
+
+                ngram = NgramFullScorer(ngram_file, token_list)
+            else:
+                from espnet.nets.scorers.ngram import NgramPartScorer
+
+                ngram = NgramPartScorer(ngram_file, token_list)
+            scorers["ngram"] = ngram
+
+        # 4. Build BeamSearch object
+        weights = dict(
+            decoder=0.0,
+            ctc=1.0,
+            lm=lm_weight,
+            ngram=ngram_weight,
+            length_bonus=penalty,
+        )
+        beam_search = BeamSearch(
+            beam_size=beam_size,
+            weights=weights,
+            scorers=scorers,
+            sos=s2t_model.sos,
+            eos=s2t_model.eos,
+            vocab_size=len(token_list),
+            token_list=token_list,
+            pre_beam_score_key=None,
+        )
+
+        # TODO(karita): make all scorers batchfied
+        if batch_size == 1:
+            non_batch = [
+                k
+                for k, v in beam_search.full_scorers.items()
+                if not isinstance(v, BatchScorerInterface)
+            ]
+            if len(non_batch) == 0:
+                if streaming:
+                    beam_search.__class__ = BatchBeamSearchOnlineSim
+                    beam_search.set_streaming_config(s2t_train_config)
+                    logging.info("BatchBeamSearchOnlineSim implementation is selected.")
+                else:
+                    beam_search.__class__ = BatchBeamSearch
+                    logging.info("BatchBeamSearch implementation is selected.")
+            else:
+                logging.warning(
+                    f"As non-batch scorers {non_batch} are found, "
+                    f"fall back to non-batch implementation."
+                )
+
+        # For every batch_size: move search and scorers to device/dtype, eval mode
+        beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
+        for scorer in scorers.values():
+            if isinstance(scorer, torch.nn.Module):
+                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
+        logging.info(f"Beam_search: {beam_search}")
+        logging.info(f"Decoding device={device}, dtype={dtype}")
+
+        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+        if token_type is None:
+            token_type = s2t_train_args.token_type
+        if bpemodel is None:
+            bpemodel = s2t_train_args.bpemodel
+
+        if token_type is None:
+            tokenizer = None
+        elif token_type in ("bpe", "hugging_face") or "whisper" in token_type:
+            if bpemodel is not None:
+                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+            else:
+                tokenizer = None
+        else:
+            tokenizer = build_tokenizer(token_type=token_type)
+
+        if bpemodel not in ["whisper_en", "whisper_multilingual"]:
+            converter = TokenIDConverter(token_list=token_list)
+        else:
+            converter = OpenAIWhisperTokenIDConverter(model_type=bpemodel)
+            beam_search.set_hyp_primer(
+                list(converter.tokenizer.sot_sequence_including_notimestamps)
+            )
+        logging.info(f"Text tokenizer: {tokenizer}")
+
+        self.s2t_model = s2t_model
+        self.s2t_train_args = s2t_train_args
+        self.converter = converter
+        self.tokenizer = tokenizer
+        self.beam_search = beam_search
+        self.maxlenratio = maxlenratio
+        self.minlenratio = minlenratio
+        self.device = device
+        self.dtype = dtype
+        self.nbest = nbest
+        self.generate_interctc_outputs = generate_interctc_outputs
+
+        # default lang and task symbols
+        self.lang_sym = lang_sym
+        self.task_sym = task_sym
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        speech: Union[torch.Tensor, np.ndarray],
+        text_prev: Union[torch.Tensor, np.ndarray, str] = "<na>",
+        lang_sym: Optional[str] = None,
+        task_sym: Optional[str] = None,
+    ) -> Union[
+        ListOfHypothesis,
+        Tuple[
+            ListOfHypothesis,
+            Optional[Dict[int, List[str]]],
+        ],
+    ]:
+        """Decode a single short utterance with beam search.
+
+        Args:
+            speech: Input speech of a single utterance, shape (nsamples,)
+            text_prev: Previous text used as condition, given as a string
+                or as a tensor/array of token IDs
+            lang_sym: Language symbol; defaults to ``self.lang_sym``
+            task_sym: Task symbol; defaults to ``self.task_sym``
+
+        Returns:
+            List of (text, token, token_int, text_nospecial, hyp) tuples. If
+            the encoder yields intermediate outputs and
+            ``generate_interctc_outputs`` is set, a tuple of that list and
+            the intermediate CTC predictions per layer index is returned.
+        """
+        # Fall back to the default language and task symbols
+        if lang_sym is None:
+            lang_sym = self.lang_sym
+        if task_sym is None:
+            task_sym = self.task_sym
+        lang_id = self.converter.token2id[lang_sym]
+        task_id = self.converter.token2id[task_sym]
+        # Convert the previous text to a list of token IDs
+        if isinstance(text_prev, str):
+            prev_tokens = self.tokenizer.text2tokens(text_prev)
+            text_prev = self.converter.tokens2ids(prev_tokens)
+        else:
+            text_prev = text_prev.tolist()
+        # A context that contains the `na` ID is reduced to that single ID
+        if self.s2t_model.na in text_prev:
+            text_prev = [self.s2t_model.na]
+        text_prev = torch.tensor(text_prev, dtype=torch.long).unsqueeze(0)
+        text_prev_lengths = text_prev.new_full(
+            [1], dtype=torch.long, fill_value=text_prev.size(1)
+        )
+
+        # Prefix of language and task IDs: (1, 2)
+        prefix = torch.tensor([[lang_id, task_id]], dtype=torch.long)
+        prefix_lengths = prefix.new_full(
+            [1], dtype=torch.long, fill_value=prefix.size(-1)
+        )
+        # Batchify the speech: (nsamples,) -> (1, nsamples)
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+        speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+        speech_lengths = speech.new_full(
+            [1], dtype=torch.long, fill_value=speech.size(1)
+        )
+        logging.info("speech length: " + str(speech.size(1)))
+
+        batch = to_device(
+            {
+                "speech": speech,
+                "speech_lengths": speech_lengths,
+                "text_prev": text_prev,
+                "text_prev_lengths": text_prev_lengths,
+                "prefix": prefix,
+                "prefix_lengths": prefix_lengths,
+            },
+            device=self.device,
+        )
+        # Forward the encoder; it may also return intermediate outputs
+        enc, enc_olens = self.s2t_model.encode(**batch)
+        intermediate_outs = None
+        if isinstance(enc, tuple):
+            enc, intermediate_outs = enc
+        assert len(enc) == 1, len(enc)
+
+        # Pass the encoder output of the single sample to the beam search
+        results = self._decode_single_sample(enc[0])
+
+        # Encoder intermediate CTC predictions
+        if intermediate_outs is not None and self.generate_interctc_outputs:
+            return results, self._decode_interctc(intermediate_outs)
+        return results
+
+    def _decode_interctc(
+        self, intermediate_outs: List[Tuple[int, torch.Tensor]]
+    ) -> Dict[int, List[str]]:
+        """Greedily decode the intermediate CTC outputs of the encoder.
+
+        Returns a dict that maps each layer index to its token sequence, with
+        repeated predictions merged and blank/sos/eos removed.
+        """
+        skipped = [self.s2t_model.blank_id, self.s2t_model.sos, self.s2t_model.eos]
+        vocab = self.beam_search.token_list
+        decoded = {}
+        for layer_idx, encoder_out in intermediate_outs:
+            best_path = self.s2t_model.ctc.argmax(encoder_out)[0]  # batch_size = 1
+            merged = (key for key, _ in groupby(best_path))
+            decoded[layer_idx] = [vocab[key] for key in merged if key not in skipped]
+        return decoded
+
+    def _decode_single_sample(self, enc: torch.Tensor):
+        """Run beam search on one encoder output and format the n-best list.
+
+        Returns a list of (text, token, token_int, text_nospecial, hyp) tuples
+        for at most ``self.nbest`` hypotheses; both texts are None if there
+        is no tokenizer.
+        """
+        nbest_hyps = self.beam_search(
+            x=enc, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
+        )
+        results = []
+        for hyp in nbest_hyps[: self.nbest]:
+            assert isinstance(hyp, Hypothesis), type(hyp)
+            # drop the last position (eos), then everything up to sos
+            yseq = hyp.yseq if isinstance(hyp.yseq, list) else hyp.yseq.tolist()
+            token_int = yseq[:-1]
+            token_int = token_int[token_int.index(self.s2t_model.sos) + 1 :]
+            # remove blank symbol id
+            token_int = [x for x in token_int if x != self.s2t_model.blank_id]
+
+            # Change integer-ids to tokens
+            token = self.converter.ids2tokens(token_int)
+            # remove special tokens (task, timestamp, etc.); startswith /
+            # endswith keep an empty token from raising an IndexError
+            token_nospecial = [
+                x for x in token if not (x.startswith("<") and x.endswith(">"))
+            ]
+
+            if self.tokenizer is not None:
+                text = self.tokenizer.tokens2text(token)
+                text_nospecial = self.tokenizer.tokens2text(token_nospecial)
+            else:
+                text, text_nospecial = None, None
+            results.append((text, token, token_int, text_nospecial, hyp))
+
+        return results
+
+    @staticmethod
+    def from_pretrained(
+        model_tag: Optional[str] = None,
+        **kwargs: Optional[Any],
+    ):
+        """Create a Speech2Text instance, optionally from a pretrained model.
+
+        Args:
+            model_tag (Optional[str]): Model tag of the pretrained models.
+                Currently, the tags of espnet_model_zoo are supported. If
+                given, the model is downloaded and unpacked, and the result
+                is merged into ``kwargs``.
+            **kwargs: Arguments passed on to the Speech2Text constructor.
+
+        Returns:
+            Speech2Text: Speech2Text instance.
+        """
+        if model_tag is None:
+            return Speech2Text(**kwargs)
+        try:
+            from espnet_model_zoo.downloader import ModelDownloader
+        except ImportError:
+            logging.error(
+                "`espnet_model_zoo` is not installed. "
+                "Please install via `pip install -U espnet_model_zoo`."
+            )
+            raise
+        kwargs.update(**ModelDownloader().download_and_unpack(model_tag))
+        return Speech2Text(**kwargs)
+
+
+class Speech2TextGreedySearch:
+    """Speech2Text with greedy search for CTC."""
+
+    def __init__(
+        self,
+        s2t_train_config: Union[Path, str, None] = None,
+        s2t_model_file: Union[Path, str, None] = None,
+        token_type: Optional[str] = None,
+        bpemodel: Optional[str] = None,
+        device: str = "cpu",
+        batch_size: int = 1,
+        dtype: str = "float32",
+        quantize_s2t_model: bool = False,
+        quantize_modules: List[str] = ["Linear"],
+        quantize_dtype: str = "qint8",
+        lang_sym: str = "<nolang>",
+        task_sym: str = "<asr>",
+        use_flash_attn: bool = False,
+        generate_interctc_outputs: bool = False,
+        **kwargs,
+    ):
+        """Load the S2T model and text converters; batch_size and kwargs are unused."""
+        quantize_modules = set([getattr(torch.nn, q) for q in quantize_modules])
+        quantize_dtype = getattr(torch, quantize_dtype)
+
+        # 1. Build S2T model
+        s2t_model, s2t_train_args = S2TTask.build_model_from_file(
+            s2t_train_config, s2t_model_file, device
+        )
+        s2t_model.to(dtype=getattr(torch, dtype)).eval()
+
+        # Set use_flash_attn on every submodule that has this attribute
+        for m in s2t_model.modules():
+            if hasattr(m, "use_flash_attn"):
+                setattr(m, "use_flash_attn", use_flash_attn)
+
+        if quantize_s2t_model:
+            logging.info("Use quantized s2t model for decoding.")
+
+            s2t_model = torch.quantization.quantize_dynamic(
+                s2t_model, qconfig_spec=quantize_modules, dtype=quantize_dtype
+            )
+
+        logging.info(f"Decoding device={device}, dtype={dtype}")
+
+        # [Optional] Build Text converter: e.g. bpe-sym -> Text
+        if token_type is None:
+            token_type = s2t_train_args.token_type
+        if bpemodel is None:
+            bpemodel = s2t_train_args.bpemodel
+
+        if token_type is None:
+            tokenizer = None
+        elif (
+            token_type == "bpe"
+            or token_type == "hugging_face"
+            or "whisper" in token_type
+        ):
+            if bpemodel is not None:
+                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+            else:
+                tokenizer = None
+        else:
+            tokenizer = build_tokenizer(token_type=token_type)
+
+        if bpemodel not in ["whisper_en", "whisper_multilingual"]:
+            converter = TokenIDConverter(token_list=s2t_model.token_list)
+        else:
+            converter = OpenAIWhisperTokenIDConverter(model_type=bpemodel)
+        logging.info(f"Text tokenizer: {tokenizer}")
+
+        self.s2t_model = s2t_model
+        self.s2t_train_args = s2t_train_args
+        self.preprocessor_conf = s2t_train_args.preprocessor_conf
+        self.converter = converter
+        self.tokenizer = tokenizer
+        self.device = device
+        self.dtype = dtype
+        self.generate_interctc_outputs = generate_interctc_outputs
+
+        # default lang and task symbols, used when __call__ gets None for them
+        self.lang_sym = lang_sym
+        self.task_sym = task_sym
+
+    @torch.no_grad()
+    def __call__(
+        self,
+        speech: Union[torch.Tensor, np.ndarray],
+        text_prev: Union[torch.Tensor, np.ndarray, str] = "<na>",
+        lang_sym: Optional[str] = None,
+        task_sym: Optional[str] = None,
+    ) -> Union[
+        ListOfHypothesis,
+        Tuple[
+            ListOfHypothesis,
+            Optional[Dict[int, List[str]]],
+        ],
+    ]:
+        """Inference for a short utterance.
+
+        Args:
+            speech: Input speech of shape (nsamples,)
+            text_prev: Previous text used as condition (string or token ids)
+            lang_sym, task_sym: Override ``self.lang_sym`` / ``self.task_sym``
+        Returns:
+            A list with one (text, token, token_int, text_nospecial, None) tuple,
+            or a (list, per-layer intermediate CTC tokens) tuple if enabled.
+        """
+        # Obtain lang and task tokens
+        lang_sym = lang_sym if lang_sym is not None else self.lang_sym
+        task_sym = task_sym if task_sym is not None else self.task_sym
+        lang_id = self.converter.token2id[lang_sym]
+        task_id = self.converter.token2id[task_sym]
+
+        if isinstance(text_prev, str):
+            text_prev = self.converter.tokens2ids(
+                self.tokenizer.text2tokens(text_prev)
+            )
+        else:
+            text_prev = text_prev.tolist()
+
+        # Check if text_prev is valid
+        if self.s2t_model.na in text_prev:
+            text_prev = [self.s2t_model.na]
+
+        text_prev = torch.tensor(text_prev, dtype=torch.long).unsqueeze(0)  # (1, length)
+        text_prev_lengths = text_prev.new_full([1], dtype=torch.long, fill_value=text_prev.size(1))
+
+        # Prepare prefix
+        prefix = torch.tensor([[lang_id, task_id]], dtype=torch.long) # (1, 2)
+        prefix_lengths = prefix.new_full([1], dtype=torch.long, fill_value=prefix.size(-1))
+
+        # Prepare speech
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+
+        # Batchify input
+        # speech: (nsamples,) -> (1, nsamples)
+        speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+        # lengths: (1,)
+        speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+        logging.info("speech length: " + str(speech.size(1)))
+
+        batch = {
+            "speech": speech,
+            "speech_lengths": speech_lengths,
+            "text_prev": text_prev,
+            "text_prev_lengths": text_prev_lengths,
+            "prefix": prefix,
+            "prefix_lengths": prefix_lengths,
+        }
+
+        # a. To device
+        batch = to_device(batch, device=self.device)
+
+        # b. Forward Encoder
+        enc, enc_olens = self.s2t_model.encode(**batch)
+
+        intermediate_outs = None
+        if isinstance(enc, tuple):
+            enc, intermediate_outs = enc
+
+        assert len(enc) == 1, len(enc)
+
+        # c. Pass the encoder result to the beam search
+        results = self._decode_single_sample(enc)
+
+        # Encoder intermediate CTC predictions
+        if intermediate_outs is not None and self.generate_interctc_outputs:
+            encoder_interctc_res = self._decode_interctc(intermediate_outs)
+            results = (results, encoder_interctc_res)
+
+        return results
+
+    def _decode_interctc(
+        self, intermediate_outs: List[Tuple[int, torch.Tensor]]
+    ) -> Dict[int, List[str]]:
+        """Greedy-decode each intermediate CTC output into a list of tokens."""
+        exclude_ids = [self.s2t_model.blank_id, self.s2t_model.sos, self.s2t_model.eos]
+        token_list = self.s2t_model.token_list
+
+        res = {}
+        for layer_idx, encoder_out in intermediate_outs:
+            # batch_size = 1; plain ints avoid element-wise tensor comparisons
+            # in groupby, in the membership test and in the list indexing
+            ids = self.s2t_model.ctc.argmax(encoder_out)[0].tolist()
+            ids = [k for k, _ in groupby(ids) if k not in exclude_ids]
+            res[layer_idx] = [token_list[i] for i in ids]
+
+        return res
+
+    def _decode_single_sample(self, enc: torch.Tensor):
+        """Greedy CTC decoding of a single utterance; enc: (B=1, T, D)."""
+        token_int = self.s2t_model.ctc.argmax(enc)[0]  # batch size is 1; (T,)
+        # collapse repeated predictions first, then remove the blanks
+        token_int = torch.unique_consecutive(token_int).cpu().tolist()
+        token_int = [x for x in token_int if x != self.s2t_model.blank_id]
+        token = self.converter.ids2tokens(token_int)
+        # special tokens look like "<...>"; str methods also cope with empty tokens
+        token_nospecial = [
+            x for x in token if not (x.startswith("<") and x.endswith(">"))
+        ]
+        if self.tokenizer is not None:
+            text = self.tokenizer.tokens2text(token)
+            text_nospecial = self.tokenizer.tokens2text(token_nospecial)
+        else:
+            text, text_nospecial = None, None
+        logging.info(f"best hypo: {text}")
+        return [(text, token, token_int, text_nospecial, None)]
+
+    @torch.no_grad()
+    def decode_long_batched_buffered(
+        self,
+        speech: Union[torch.Tensor, np.ndarray],
+        batch_size: int = 1,
+        sample_rate: int = 16000,
+        context_len_in_secs: float = 2,
+        frames_per_sec: float = 12.5,
+        lang_sym: Optional[str] = None,
+        task_sym: Optional[str] = None,
+    ):
+        """Decode unsegmented long-form speech with overlapping fixed-size buffers.
+
+        Args:
+            speech: 1D long-form input speech
+            batch_size (int): decode this number of segments together in parallel
+            context_len_in_secs: context on each side of a buffer; the encoder
+                frames covering it are discarded before the buffers are merged
+        Returns:
+            str: text of the whole recording without special ("<...>") tokens
+        """
+        lang_sym = lang_sym if lang_sym is not None else self.lang_sym
+        task_sym = task_sym if task_sym is not None else self.task_sym
+        lang_id = self.converter.token2id[lang_sym]
+        task_id = self.converter.token2id[task_sym]
+
+        buffer_len_in_secs = self.preprocessor_conf["speech_length"]
+        chunk_len_in_secs = buffer_len_in_secs - 2 * context_len_in_secs
+
+        class AudioChunkIterator():
+            def __init__(self, samples, chunk_len_in_secs, sample_rate):
+                self._samples = samples
+                self._chunk_len = chunk_len_in_secs * sample_rate
+                self._start = 0
+                self.output = True
+
+            def __iter__(self):
+                return self
+
+            def __next__(self):
+                if not self.output:
+                    raise StopIteration
+                last = int(self._start + self._chunk_len)
+                if last <= len(self._samples):
+                    chunk = self._samples[self._start: last]
+                    self._start = last
+                else:
+                    chunk = np.zeros([int(self._chunk_len)], dtype='float32')
+                    samp_len = len(self._samples) - self._start
+                    chunk[0:samp_len] = self._samples[self._start:len(self._samples)]
+                    self.output = False
+                return chunk
+
+        buffer_len = int(sample_rate * buffer_len_in_secs)
+        chunk_len = int(sample_rate * chunk_len_in_secs)
+        sampbuffer = np.zeros([buffer_len], dtype=np.float32)
+
+        chunk_reader = AudioChunkIterator(speech, chunk_len_in_secs, sample_rate)
+        buffer_list = []
+        for chunk in chunk_reader:
+            sampbuffer[:-chunk_len] = sampbuffer[chunk_len:]
+            sampbuffer[-chunk_len:] = chunk
+            buffer_list.append(np.array(sampbuffer))
+
+        speech = torch.tensor(np.array(buffer_list)).to(getattr(torch, self.dtype))
+        context_frames = int(frames_per_sec * context_len_in_secs)
+
+        # NOTE(review): only the middle of each buffer is kept, so the last
+        # context_len_in_secs of the final chunk seem to be dropped - confirm.
+        unmerged = []
+        for idx in range(0, speech.size(0), batch_size):
+            cur_speech = speech[idx : idx + batch_size]
+            cur_speech_lengths = cur_speech.new_full(
+                [cur_speech.size(0)], dtype=torch.long, fill_value=cur_speech.size(1)
+            )
+
+            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            text_prev_lengths = text_prev.new_full(
+                [cur_speech.size(0)], dtype=torch.long, fill_value=text_prev.size(1)
+            )
+
+            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            prefix_lengths = prefix.new_full(
+                [cur_speech.size(0)], dtype=torch.long, fill_value=prefix.size(-1)
+            )
+
+            batch = {
+                "speech": cur_speech,
+                "speech_lengths": cur_speech_lengths,
+                "text_prev": text_prev,
+                "text_prev_lengths": text_prev_lengths,
+                "prefix": prefix,
+                "prefix_lengths": prefix_lengths,
+            }
+
+            # a. To device
+            batch = to_device(batch, device=self.device)
+
+            # b. Forward Encoder
+            enc, _ = self.s2t_model.encode(**batch)
+
+            # intermediate encoder outputs, if returned, are not needed here
+            if isinstance(enc, tuple):
+                enc, _ = enc
+
+            batched_token_int = self.s2t_model.ctc.argmax(enc)  # (B, T, D) -> (B, T)
+            # explicit end index: a "-context_frames" end selects nothing when it is 0
+            end = batched_token_int.size(1) - context_frames
+            unmerged.append(batched_token_int[:, context_frames:end].reshape(-1))
+
+        unmerged = torch.cat(unmerged)
+        merged = torch.unique_consecutive(unmerged).cpu().tolist()
+        token_int = list(filter(lambda x: x != self.s2t_model.blank_id, merged))
+        token = self.converter.ids2tokens(token_int)
+        token_nospecial = [x for x in token if not (x[0] == "<" and x[-1] == ">")]
+        text_nospecial = self.tokenizer.tokens2text(token_nospecial)
+
+        return text_nospecial
+
+    @staticmethod
+    def from_pretrained(
+        model_tag: Optional[str] = None,
+        **kwargs: Optional[Any],
+    ):
+        """Build Speech2TextGreedySearch instance from the pretrained model.
+
+        Args:
+            model_tag (Optional[str]): Model tag of the pretrained models.
+                Currently, the tags of espnet_model_zoo are supported.
+            **kwargs: Forwarded to the constructor; entries returned by the
+                downloader for ``model_tag`` overwrite the given ones.
+        Returns:
+            Speech2TextGreedySearch: Speech2TextGreedySearch instance.
+        """
+        if model_tag is not None:
+            try:
+                from espnet_model_zoo.downloader import ModelDownloader
+
+            except ImportError:
+                logging.error(
+                    "`espnet_model_zoo` is not installed. "
+                    "Please install via `pip install -U espnet_model_zoo`."
+                )
+                raise
+            d = ModelDownloader()
+            kwargs.update(**d.download_and_unpack(model_tag))
+
+        return Speech2TextGreedySearch(**kwargs)
+
+
+@typechecked
+def inference(
+    output_dir: str,
+    maxlenratio: float,
+    minlenratio: float,
+    batch_size: int,
+    dtype: str,
+    beam_size: int,
+    ngpu: int,
+    seed: int,
+    lm_weight: float,
+    ngram_weight: float,
+    penalty: float,
+    nbest: int,
+    num_workers: int,
+    log_level: Union[int, str],
+    data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
+    key_file: Optional[str],
+    s2t_train_config: Optional[str],
+    s2t_model_file: Optional[str],
+    lm_train_config: Optional[str],
+    lm_file: Optional[str],
+    word_lm_train_config: Optional[str],
+    word_lm_file: Optional[str],
+    ngram_file: Optional[str],
+    model_tag: Optional[str],
+    token_type: Optional[str],
+    bpemodel: Optional[str],
+    allow_variable_data_keys: bool,
+    streaming: bool,
+    quantize_s2t_model: bool,
+    quantize_lm: bool,
+    quantize_modules: List[str],
+    quantize_dtype: str,
+    lang_sym: str,
+    task_sym: str,
+    generate_interctc_outputs: bool,
+):
+    """Decode each utterance and write the n-best results under ``output_dir``."""
+    if batch_size > 1:
+        raise NotImplementedError("batch decoding is not implemented")
+    if word_lm_train_config is not None:
+        raise NotImplementedError("Word LM is not implemented")
+    if ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+
+    device = "cuda" if ngpu >= 1 else "cpu"
+
+    # NOTE(yifan): < and > cannot be passed in command line
+    lang_sym = f"<{lang_sym.lstrip('<').rstrip('>')}>"
+    task_sym = f"<{task_sym.lstrip('<').rstrip('>')}>"
+
+    # 1. Set random-seed
+    set_all_random_seed(seed)
+
+    # 2. Build speech2text
+    speech2text_kwargs = dict(
+        s2t_train_config=s2t_train_config,
+        s2t_model_file=s2t_model_file,
+        lm_train_config=lm_train_config,
+        lm_file=lm_file,
+        ngram_file=ngram_file,
+        token_type=token_type,
+        bpemodel=bpemodel,
+        device=device,
+        maxlenratio=maxlenratio,
+        minlenratio=minlenratio,
+        dtype=dtype,
+        beam_size=beam_size,
+        lm_weight=lm_weight,
+        ngram_weight=ngram_weight,
+        penalty=penalty,
+        nbest=nbest,
+        streaming=streaming,
+        quantize_s2t_model=quantize_s2t_model,
+        quantize_lm=quantize_lm,
+        quantize_modules=quantize_modules,
+        quantize_dtype=quantize_dtype,
+        lang_sym=lang_sym,
+        task_sym=task_sym,
+        use_flash_attn=False,
+        generate_interctc_outputs=generate_interctc_outputs,
+    )
+    # Speech2TextGreedySearch is used only when beam_size == 1
+    speech2text_class = Speech2TextGreedySearch if beam_size == 1 else Speech2Text
+    logging.info(f"Speech2Text Class: {speech2text_class}")
+    speech2text = speech2text_class.from_pretrained(
+        model_tag=model_tag,
+        **speech2text_kwargs,
+    )
+
+    # 3. Build data-iterator
+    loader = S2TTask.build_streaming_iterator(
+        data_path_and_name_and_type,
+        dtype=dtype,
+        batch_size=batch_size,
+        key_file=key_file,
+        num_workers=num_workers,
+        preprocess_fn=S2TTask.build_preprocess_fn(speech2text.s2t_train_args, False),
+        collate_fn=S2TTask.build_collate_fn(speech2text.s2t_train_args, False),
+        allow_variable_data_keys=allow_variable_data_keys,
+        inference=True,
+    )
+
+    # 4. Start for-loop
+    # FIXME(kamo): The output format should be discussed about
+    with DatadirWriter(output_dir) as writer:
+        for keys, batch in loader:
+            assert isinstance(batch, dict), type(batch)
+            assert all(isinstance(s, str) for s in keys), keys
+            _bs = len(next(iter(batch.values())))
+            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+            # drop the "*_lengths" entries and the batch dimension (batch size 1)
+            batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+
+            # N-best list of (text, token, token_int, text_nospecial, hyp_object)
+            try:
+                results = speech2text(**batch)
+            except TooShortUttError as e:
+                logging.warning(f"Utterance {keys} {e}")
+                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
+                results = [[" ", ["<space>"], [2], " ", hyp]] * nbest
+
+            # Only supporting batch_size==1
+            key = keys[0]
+            encoder_interctc_res = None
+            if isinstance(results, tuple):
+                results, encoder_interctc_res = results
+
+            for n, (text, token, token_int, text_nospecial, hyp) in zip(
+                range(1, nbest + 1), results
+            ):
+                # Create a directory: outdir/{n}best_recog
+                ibest_writer = writer[f"{n}best_recog"]
+
+                # Write the result to each file
+                ibest_writer["token"][key] = " ".join(token)
+                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+                if hyp is not None:
+                    ibest_writer["score"][key] = str(hyp.score)
+
+                if text is not None:
+                    ibest_writer["text"][key] = text
+                if text_nospecial is not None:
+                    ibest_writer["text_nospecial"][key] = text_nospecial
+
+            # Write intermediate predictions to
+            # encoder_interctc_layer<layer_idx>.txt
+            ibest_writer = writer["1best_recog"]
+            if encoder_interctc_res is not None:
+                for idx, text in encoder_interctc_res.items():
+                    ibest_writer[f"encoder_interctc_layer{idx}.txt"][key] = " ".join(
+                        text
+                    )
+
+
+def get_parser():
+    """Build the command-line argument parser of the S2T decoding script."""
+    parser = config_argparse.ArgumentParser(
+        description="S2T Decoding",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # Note(kamo): Use '_' instead of '-' as separator.
+    # '-' is confusing if written in yaml.
+    parser.add_argument(
+        "--log_level",
+        type=lambda x: x.upper(),
+        default="INFO",
+        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+        help="The verbose level of logging",
+    )
+
+    parser.add_argument("--output_dir", type=str, required=True)
+    parser.add_argument(
+        "--ngpu",
+        type=int,
+        default=0,
+        help="The number of gpus. 0 indicates CPU mode",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed")
+    parser.add_argument(
+        "--dtype",
+        default="float32",
+        choices=["float16", "float32", "float64"],
+        help="Data type",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=1,
+        help="The number of workers used for DataLoader",
+    )
+
+    group = parser.add_argument_group("Input data related")
+    group.add_argument(
+        "--data_path_and_name_and_type",
+        type=str2triple_str,
+        required=True,
+        action="append",
+    )
+    group.add_argument("--key_file", type=str_or_none)
+    group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+    group = parser.add_argument_group("The model configuration related")
+    group.add_argument(
+        "--s2t_train_config",
+        type=str,
+        help="S2T training configuration",
+    )
+    group.add_argument(
+        "--s2t_model_file",
+        type=str,
+        help="S2T model parameter file",
+    )
+    group.add_argument(
+        "--lm_train_config",
+        type=str,
+        help="LM training configuration",
+    )
+    group.add_argument(
+        "--lm_file",
+        type=str,
+        help="LM parameter file",
+    )
+    group.add_argument(
+        "--word_lm_train_config",
+        type=str,
+        help="Word LM training configuration",
+    )
+    group.add_argument(
+        "--word_lm_file",
+        type=str,
+        help="Word LM parameter file",
+    )
+    group.add_argument(
+        "--ngram_file",
+        type=str,
+        help="N-gram parameter file",
+    )
+    group.add_argument(
+        "--model_tag",
+        type=str,
+        help="Pretrained model tag. If specify this option, *_train_config and "
+        "*_file will be overwritten",
+    )
+
+    group = parser.add_argument_group("Quantization related")
+    group.add_argument(
+        "--quantize_s2t_model",
+        type=str2bool,
+        default=False,
+        help="Apply dynamic quantization to S2T model.",
+    )
+    group.add_argument(
+        "--quantize_lm",
+        type=str2bool,
+        default=False,
+        help="Apply dynamic quantization to LM.",
+    )
+    group.add_argument(
+        "--quantize_modules",
+        type=str,
+        nargs="*",
+        default=["Linear"],
+        help="""List of modules to be dynamically quantized.
+        E.g.: --quantize_modules=[Linear,LSTM,GRU].
+        Each specified module should be an attribute of 'torch.nn', e.g.:
+        torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""",
+    )
+    group.add_argument(
+        "--quantize_dtype",
+        type=str,
+        default="qint8",
+        choices=["float16", "qint8"],
+        help="Dtype for dynamic quantization.",
+    )
+
+    group = parser.add_argument_group("Beam-search related")
+    group.add_argument(
+        "--batch_size",
+        type=int,
+        default=1,
+        help="The batch size for inference",
+    )
+    group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    group.add_argument("--beam_size", type=int, default=20, help="Beam size")
+    group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
+    group.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain max output length. "
+        "If maxlenratio=0.0 (default), it uses an end-detect function to "
+        "automatically find maximum hypothesis lengths. If maxlenratio<0.0, "
+        "its absolute value is interpreted as a constant max output length",
+    )
+    group.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length",
+    )
+    group.add_argument("--lm_weight", type=float, default=0.0, help="RNNLM weight")
+    group.add_argument("--ngram_weight", type=float, default=0.0, help="ngram weight")
+    group.add_argument("--streaming", type=str2bool, default=False)
+
+    group.add_argument(
+        "--lang_sym", type=str, default="nolang", help="Language symbol."
+    )
+    group.add_argument("--task_sym", type=str, default="asr", help="Task symbol.")
+    # str2bool: with type=bool any non-empty string (even "False") parses as True
+    group.add_argument(
+        "--generate_interctc_outputs",
+        type=str2bool,
+        default=False,
+        help="Also write intermediate CTC outputs.",
+    )
+
+    group = parser.add_argument_group("Text converter related")
+    group.add_argument(
+        "--token_type",
+        type=str_or_none,
+        default=None,
+        choices=["char", "bpe", "word", None],
+        help="The token type for S2T model. "
+        "If not given, refers from the training args",
+    )
+    group.add_argument(
+        "--bpemodel",
+        type=str_or_none,
+        default=None,
+        help="The model path of sentencepiece. "
+        "If not given, refers from the training args",
+    )
+
+    return parser
+
+
+def main(cmd=None):
+    """Parse the command line (or ``cmd``) and run :func:`inference`."""
+    print(get_commandline_args(), file=sys.stderr)
+    kwargs = vars(get_parser().parse_args(cmd))
+    # inference() has no "config" parameter, so drop that entry if present
+    kwargs.pop("config", None)
+    inference(**kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/espnet2/s2t/espnet_ctc_model.py b/espnet2/s2t/espnet_ctc_model.py
new file mode 100644
index 00000000000..146f756c25f
--- /dev/null
+++ b/espnet2/s2t/espnet_ctc_model.py
@@ -0,0 +1,349 @@
+import logging
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from torch.cuda.amp import autocast
+from typeguard import typechecked
+
+from espnet2.asr.ctc import CTC
+from espnet2.asr.encoder.abs_encoder import AbsEncoder
+from espnet2.asr.frontend.abs_frontend import AbsFrontend
+from espnet2.asr.specaug.abs_specaug import AbsSpecAug
+from espnet2.layers.abs_normalize import AbsNormalize
+from espnet2.torch_utils.device_funcs import force_gatherable
+from espnet2.train.abs_espnet_model import AbsESPnetModel
+from espnet.nets.e2e_asr_common import ErrorCalculator
+from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
+from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
+
+
+class ESPnetS2TCTCModel(AbsESPnetModel):
+    """OWSM-CTC model"""
+
+    @typechecked
+    def __init__(
+        self,
+        vocab_size: int,
+        token_list: Union[Tuple[str, ...], List[str]],
+        frontend: Optional[AbsFrontend],
+        specaug: Optional[AbsSpecAug],
+        normalize: Optional[AbsNormalize],
+        encoder: AbsEncoder,
+        prompt_encoder: AbsEncoder,
+        ctc: CTC,
+        interctc_weight: float = 0.0,
+        ignore_id: int = -1,
+        report_cer: bool = True,
+        report_wer: bool = True,
+        sym_space: str = "<space>",
+        sym_blank: str = "<blank>",
+        sym_sos: str = "<sos>",
+        sym_eos: str = "<eos>",
+        sym_sop: str = "<sop>",  # start of prev
+        sym_na: str = "<na>",  # not available
+        extract_feats_in_collect_stats: bool = True,
+        ctc_asr_only: List[bool] = [False],
+    ):
+        """Store the sub-modules and build the token embedding and projections."""
+        assert 0.0 <= interctc_weight < 1.0, interctc_weight
+
+        super().__init__()
+
+        self.blank_id = token_list.index(sym_blank)
+        self.sos = token_list.index(sym_sos)
+        self.eos = token_list.index(sym_eos)
+        self.sop = token_list.index(sym_sop)
+        self.na = token_list.index(sym_na)
+        self.vocab_size = vocab_size
+        self.ignore_id = ignore_id
+        self.interctc_weight = interctc_weight
+        self.token_list = list(token_list)  # a tuple is allowed and has no .copy()
+        self.ctc_asr_only = list(ctc_asr_only)  # type of interctc (copied default)
+
+        self.frontend = frontend
+        self.specaug = specaug
+        self.normalize = normalize
+        self.encoder = encoder
+        self.prompt_encoder = prompt_encoder
+
+        self.embed = torch.nn.Embedding(
+            vocab_size, self.prompt_encoder.output_size()
+        )
+        self.pos_enc = PositionalEncoding(self.prompt_encoder.output_size(), 0.0)
+
+        if self.encoder.output_size() != self.prompt_encoder.output_size():
+            # used in encoder to inject task and lang tokens
+            self.embed_proj = torch.nn.Linear(self.prompt_encoder.output_size(), self.encoder.output_size())
+            # applied to the output of prompt encoder
+            self.prompt_proj = torch.nn.Linear(self.prompt_encoder.output_size(), self.encoder.output_size())
+        else:
+            self.embed_proj = torch.nn.Identity()
+            self.prompt_proj = torch.nn.Identity()
+
+        if not hasattr(self.encoder, "interctc_use_conditioning"):
+            self.encoder.interctc_use_conditioning = False
+        if self.encoder.interctc_use_conditioning:
+            self.encoder.conditioning_layer = torch.nn.Linear(
+                vocab_size, self.encoder.output_size()
+            )
+
+        self.error_calculator = None
+        if report_cer or report_wer:
+            self.error_calculator = ErrorCalculator(
+                token_list, sym_space, sym_blank, report_cer, report_wer
+            )
+
+        self.ctc = ctc
+        self.extract_feats_in_collect_stats = extract_feats_in_collect_stats
+        self.is_encoder_whisper = "Whisper" in type(self.encoder).__name__
+
+        if self.is_encoder_whisper:
+            assert (
+                self.frontend is None
+            ), "frontend should be None when using full Whisper model"
+
+    def encode(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text_prev: torch.Tensor,
+        text_prev_lengths: torch.Tensor,
+        prefix: torch.Tensor,
+        prefix_lengths: torch.Tensor,
+    ):
+        """Encode speech conditioned on text_prev (prompt) and the prefix tokens."""
+        # Forward prompt encoder; padding (-1) is embedded as <eos>. masked_fill
+        # returns a new tensor, so the caller's text_prev is left untouched.
+        text_prev = text_prev.masked_fill(text_prev == -1, self.eos)
+        memory, memory_lengths, _ = self.prompt_encoder(
+            self.pos_enc(self.embed(text_prev)),
+            text_prev_lengths
+        )
+        memory_mask = (~make_pad_mask(memory_lengths)[:, None, :]).to(memory.device)
+
+        # Extract speech features
+        with autocast(False):
+            # 1. Extract feats
+            feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+
+            # 2. Data augmentation
+            if self.specaug is not None and self.training:
+                feats, feats_lengths = self.specaug(feats, feats_lengths)
+
+            # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+            if self.normalize is not None:
+                feats, feats_lengths = self.normalize(feats, feats_lengths)
+
+        # Forward encoder
+        encoder_out, encoder_out_lens, _ = self.encoder(
+            feats,
+            feats_lengths,
+            ctc=self.ctc,
+            prefix_embeds=self.embed_proj(self.embed(prefix)),
+            memory=self.prompt_proj(memory),
+            memory_mask=memory_mask,
+        )
+        return encoder_out, encoder_out_lens
+
+    def forward(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+        text_prev: torch.Tensor,
+        text_prev_lengths: torch.Tensor,
+        text_ctc: torch.Tensor,
+        text_ctc_lengths: torch.Tensor,
+        prefix: torch.Tensor,
+        prefix_lengths: torch.Tensor,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+        """Frontend + Encoder + Calc loss
+
+        Args:
+            speech, speech_lengths: (Batch, Length, ...) and (Batch,)
+            text, text_lengths: (Batch, Length) and (Batch,); CTC target unless
+                the matching ``ctc_asr_only`` entry is set
+            text_prev, text_prev_lengths: (Batch, Length) and (Batch,)
+            text_ctc, text_ctc_lengths: (Batch, Length) and (Batch,); CTC target
+                where the matching ``ctc_asr_only`` entry is set
+            prefix, prefix_lengths: (Batch, Length=2), <lang> and <task>; (Batch,)
+            kwargs: "utt_id" is among the input.
+        Returns:
+            loss, stats (detached losses and CERs), weight (the batch size), all
+            passed through ``force_gatherable``
+        """
+        assert text_lengths.dim() == 1, text_lengths.shape
+        # Check that batch_size is unified
+        assert (
+            speech.shape[0]
+            == speech_lengths.shape[0]
+            == text.shape[0]
+            == text_lengths.shape[0]
+            == text_prev.shape[0]
+            == text_prev_lengths.shape[0]
+            == text_ctc.shape[0]
+            == text_ctc_lengths.shape[0]
+            == prefix.shape[0]
+            == prefix_lengths.shape[0]
+        ), (
+            speech.shape,
+            speech_lengths.shape,
+            text.shape,
+            text_lengths.shape,
+            text_prev.shape,
+            text_prev_lengths.shape,
+            text_ctc.shape,
+            text_ctc_lengths.shape,
+            prefix.shape,
+            prefix_lengths.shape
+        )
+        batch_size = speech.shape[0]
+
+        # -1 is used as padding index in collate fn (the input is not modified)
+        text = text.masked_fill(text == -1, self.ignore_id)
+
+        # for data-parallel
+        text = text[:, : text_lengths.max()]
+
+        encoder_out, encoder_out_lens = self.encode(
+            speech, speech_lengths, text_prev, text_prev_lengths, prefix, prefix_lengths
+        )
+
+        intermediate_outs = None
+        if isinstance(encoder_out, tuple):
+            encoder_out, intermediate_outs = encoder_out
+
+        assert encoder_out.size(0) == speech.size(0), (
+            encoder_out.size(),
+            speech.size(0),
+        )
+        if (
+            getattr(self.encoder, "selfattention_layer_type", None) != "lf_selfattn"
+            and not self.is_encoder_whisper
+        ):
+            assert encoder_out.size(-2) <= encoder_out_lens.max(), (
+                encoder_out.size(),
+                encoder_out_lens.max(),
+            )
+
+        loss_ctc, cer_ctc = None, None
+        stats = dict()
+
+        # 1. CTC branch
+        if self.ctc_asr_only[-1]:
+            loss_ctc, cer_ctc = self._calc_ctc_loss(
+                encoder_out, encoder_out_lens, text_ctc, text_ctc_lengths
+            )
+        else:
+            loss_ctc, cer_ctc = self._calc_ctc_loss(
+                encoder_out, encoder_out_lens, text, text_lengths
+            )
+
+        # Collect CTC branch stats
+        stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
+        stats["cer_ctc"] = cer_ctc
+
+        # Intermediate CTC (optional)
+        loss_interctc = 0.0
+        if self.interctc_weight != 0.0 and intermediate_outs is not None:
+            assert len(self.ctc_asr_only) == len(intermediate_outs) + 1
+            for (layer_idx, intermediate_out), asr_only in zip(intermediate_outs, self.ctc_asr_only):
+                if asr_only:
+                    loss_ic, cer_ic = self._calc_ctc_loss(
+                        intermediate_out, encoder_out_lens, text_ctc, text_ctc_lengths
+                    )
+                else:
+                    loss_ic, cer_ic = self._calc_ctc_loss(
+                        intermediate_out, encoder_out_lens, text, text_lengths
+                    )
+
+                loss_interctc = loss_interctc + loss_ic
+
+                # Collect Intermediate CTC stats
+                stats["loss_interctc_layer{}".format(layer_idx)] = (
+                    loss_ic.detach() if loss_ic is not None else None
+                )
+                stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
+
+            loss_interctc = loss_interctc / len(intermediate_outs)
+
+            # calculate whole encoder loss
+            loss_ctc = (
+                1 - self.interctc_weight
+            ) * loss_ctc + self.interctc_weight * loss_interctc
+
+        loss = loss_ctc
+
+        # Collect total loss stats
+        stats["loss"] = loss.detach()
+
+        # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+        return loss, stats, weight
+
+    def collect_feats(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+        text_prev: torch.Tensor,
+        text_prev_lengths: torch.Tensor,
+        text_ctc: torch.Tensor,
+        text_ctc_lengths: torch.Tensor,
+        **kwargs,
+    ) -> Dict[str, torch.Tensor]:
+        """Return the features of the speech; the text arguments are not used."""
+        feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+        return {"feats": feats, "feats_lengths": feats_lengths}
+
+    def _extract_feats(
+        self, speech: torch.Tensor, speech_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Apply the frontend, if any, to the speech trimmed to the longest length."""
+        assert speech_lengths.dim() == 1, speech_lengths.shape
+
+        # for data-parallel
+        speech = speech[:, : speech_lengths.max()]
+
+        if self.frontend is not None:
+            # Frontend
+            #  e.g. STFT and Feature extract
+            #       data_loader may send time-domain signal in this case
+            # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim)
+            feats, feats_lengths = self.frontend(speech, speech_lengths)
+        else:
+            # No frontend and no feature extract
+            feats, feats_lengths = speech, speech_lengths
+        return feats, feats_lengths
+
+    def _calc_ctc_loss(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        ys_pad: torch.Tensor,
+        ys_pad_lens: torch.Tensor,
+    ):
+        """Return (CTC loss, CER or None) over the samples whose text is available."""
+        # Filter out invalid samples where text is not available
+        is_valid = [self.na not in y for y in ys_pad]
+        if not any(is_valid):
+            # zero loss on the device of the encoder output, not on the CPU
+            return torch.zeros((), device=encoder_out.device), None
+
+        encoder_out = encoder_out[is_valid]
+        encoder_out_lens = encoder_out_lens[is_valid]
+        ys_pad = ys_pad[is_valid]
+        ys_pad_lens = ys_pad_lens[is_valid]
+
+        # Calc CTC loss
+        loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
+
+        # Calc CER using CTC
+        cer_ctc = None
+        if not self.training and self.error_calculator is not None:
+            ys_hat = self.ctc.argmax(encoder_out).data
+            cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
+        return loss_ctc, cer_ctc
diff --git a/espnet2/tasks/s2t_ctc.py b/espnet2/tasks/s2t_ctc.py
new file mode 100644
index 00000000000..a7d2d32b9cb
--- /dev/null
+++ b/espnet2/tasks/s2t_ctc.py
@@ -0,0 +1,481 @@
+import argparse
+import logging
+from typing import Callable, Collection, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+from typeguard import typechecked
+
+from espnet2.asr.ctc import CTC
+from espnet2.asr.encoder.abs_encoder import AbsEncoder
+from espnet2.asr.encoder.branchformer_encoder import BranchformerEncoder
+from espnet2.asr.encoder.conformer_encoder import ConformerEncoder
+from espnet2.asr.encoder.contextual_block_conformer_encoder import (
+    ContextualBlockConformerEncoder,
+)
+from espnet2.asr.encoder.contextual_block_transformer_encoder import (
+    ContextualBlockTransformerEncoder,
+)
+from espnet2.asr.encoder.e_branchformer_encoder import EBranchformerEncoder
+from espnet2.asr.encoder.hubert_encoder import (
+    FairseqHubertEncoder,
+    FairseqHubertPretrainEncoder,
+    TorchAudioHuBERTPretrainEncoder,
+)
+from espnet2.asr.encoder.longformer_encoder import LongformerEncoder
+from espnet2.asr.encoder.rnn_encoder import RNNEncoder
+from espnet2.asr.encoder.transformer_encoder import TransformerEncoder
+from espnet2.asr.encoder.transformer_encoder_multispkr import (
+    TransformerEncoder as TransformerEncoderMultiSpkr,
+)
+from espnet2.asr.encoder.vgg_rnn_encoder import VGGRNNEncoder
+from espnet2.asr.encoder.wav2vec2_encoder import FairSeqWav2Vec2Encoder
+from espnet2.asr.encoder.whisper_encoder import OpenAIWhisperEncoder
+from espnet2.asr.encoder.e_branchformer_ctc_encoder import EBranchformerCTCEncoder
+from espnet2.asr.frontend.abs_frontend import AbsFrontend
+from espnet2.asr.frontend.default import DefaultFrontend
+from espnet2.asr.frontend.fused import FusedFrontends
+from espnet2.asr.frontend.s3prl import S3prlFrontend
+from espnet2.asr.frontend.whisper import WhisperFrontend
+from espnet2.asr.frontend.windowing import SlidingWindow
+from espnet2.asr.specaug.abs_specaug import AbsSpecAug
+from espnet2.asr.specaug.specaug import SpecAug
+from espnet2.layers.abs_normalize import AbsNormalize
+from espnet2.layers.global_mvn import GlobalMVN
+from espnet2.layers.utterance_mvn import UtteranceMVN
+from espnet2.s2t.espnet_model import ESPnetS2TModel
+from espnet2.s2t.espnet_ctc_model import ESPnetS2TCTCModel
+from espnet2.tasks.abs_task import AbsTask
+from espnet2.text.phoneme_tokenizer import g2p_choices
+from espnet2.torch_utils.initialize import initialize
+from espnet2.train.abs_espnet_model import AbsESPnetModel
+from espnet2.train.class_choices import ClassChoices
+from espnet2.train.collate_fn import CommonCollateFn
+from espnet2.train.preprocessor import AbsPreprocessor, S2TPreprocessor, S2TCTCPreprocessor
+from espnet2.train.trainer import Trainer
+from espnet2.utils.get_default_kwargs import get_default_kwargs
+from espnet2.utils.nested_dict_action import NestedDictAction
+from espnet2.utils.types import float_or_none, int_or_none, str2bool, str_or_none
+
+frontend_choices = ClassChoices(
+    name="frontend",
+    classes=dict(
+        default=DefaultFrontend,
+        sliding_window=SlidingWindow,
+        s3prl=S3prlFrontend,
+        fused=FusedFrontends,
+        whisper=WhisperFrontend,
+    ),
+    type_check=AbsFrontend,
+    default="default",
+)
+specaug_choices = ClassChoices(
+    name="specaug",
+    classes=dict(
+        specaug=SpecAug,
+    ),
+    type_check=AbsSpecAug,
+    default=None,
+    optional=True,
+)
+normalize_choices = ClassChoices(
+    name="normalize",
+    classes=dict(
+        global_mvn=GlobalMVN,
+        utterance_mvn=UtteranceMVN,
+    ),
+    type_check=AbsNormalize,
+    default="utterance_mvn",
+    optional=True,
+)
+model_choices = ClassChoices(
+    name="model",
+    classes=dict(
+        espnet=ESPnetS2TModel,
+        espnet_ctc=ESPnetS2TCTCModel,
+    ),
+    type_check=AbsESPnetModel,
+    default="espnet_ctc",
+)
+encoder_choices = ClassChoices(
+    "encoder",
+    classes=dict(
+        conformer=ConformerEncoder,
+        transformer=TransformerEncoder,
+        transformer_multispkr=TransformerEncoderMultiSpkr,
+        contextual_block_transformer=ContextualBlockTransformerEncoder,
+        contextual_block_conformer=ContextualBlockConformerEncoder,
+        vgg_rnn=VGGRNNEncoder,
+        rnn=RNNEncoder,
+        wav2vec2=FairSeqWav2Vec2Encoder,
+        hubert=FairseqHubertEncoder,
+        hubert_pretrain=FairseqHubertPretrainEncoder,
+        torchaudiohubert=TorchAudioHuBERTPretrainEncoder,
+        longformer=LongformerEncoder,
+        branchformer=BranchformerEncoder,
+        whisper=OpenAIWhisperEncoder,
+        e_branchformer=EBranchformerEncoder,
+        e_branchformer_ctc=EBranchformerCTCEncoder,
+    ),
+    type_check=AbsEncoder,
+    default="e_branchformer_ctc",
+)
+promptencoder_choices = ClassChoices(
+    "promptencoder",
+    classes=dict(
+        conformer=ConformerEncoder,
+        transformer=TransformerEncoder,
+        branchformer=BranchformerEncoder,
+        e_branchformer=EBranchformerEncoder,
+    ),
+    type_check=AbsEncoder,
+    default="transformer",
+)
+preprocessor_choices = ClassChoices(
+    "preprocessor",
+    classes=dict(
+        s2t=S2TPreprocessor,
+        s2t_ctc=S2TCTCPreprocessor,
+    ),
+    type_check=AbsPreprocessor,
+    default="s2t_ctc",
+)
+
+
+class S2TTask(AbsTask):
+    # If you need more than one optimizers, change this value
+    num_optimizers: int = 1
+
+    # Add variable objects configurations
+    class_choices_list = [
+        # --frontend and --frontend_conf
+        frontend_choices,
+        # --specaug and --specaug_conf
+        specaug_choices,
+        # --normalize and --normalize_conf
+        normalize_choices,
+        # --model and --model_conf
+        model_choices,
+        # --promptencoder and --promptencoder_conf
+        promptencoder_choices,
+        # --encoder and --encoder_conf
+        encoder_choices,
+        # --preprocessor and --preprocessor_conf
+        preprocessor_choices,
+    ]
+
+    # If you need to modify train() or eval() procedures, change Trainer class here
+    trainer = Trainer
+
+    @classmethod
+    def add_task_arguments(cls, parser: argparse.ArgumentParser):
+        group = parser.add_argument_group(description="Task related")
+
+        # NOTE(kamo): add_arguments(..., required=True) can't be used
+        # to provide --print_config mode. Instead of it, do as
+        required = parser.get_default("required")
+        required += ["token_list"]
+
+        group.add_argument(
+            "--token_list",
+            type=str_or_none,
+            default=None,
+            help="A text mapping int-id to token",
+        )
+        group.add_argument(
+            "--init",
+            type=lambda x: str_or_none(x.lower()),
+            default=None,
+            help="The initialization method",
+            choices=[
+                "chainer",
+                "xavier_uniform",
+                "xavier_normal",
+                "kaiming_uniform",
+                "kaiming_normal",
+                None,
+            ],
+        )
+
+        group.add_argument(
+            "--input_size",
+            type=int_or_none,
+            default=None,
+            help="The number of input dimension of the feature",
+        )
+
+        group.add_argument(
+            "--ctc_conf",
+            action=NestedDictAction,
+            default=get_default_kwargs(CTC),
+            help="The keyword arguments for CTC class.",
+        )
+
+        group = parser.add_argument_group(description="Preprocess related")
+        group.add_argument(
+            "--use_preprocessor",
+            type=str2bool,
+            default=True,
+            help="Apply preprocessing to data or not",
+        )
+        group.add_argument(
+            "--token_type",
+            type=str,
+            default="bpe",
+            choices=[
+                "bpe",
+                "char",
+                "word",
+                "phn",
+                "hugging_face",
+                "whisper_en",
+                "whisper_multilingual",
+            ],
+            help="The text will be tokenized in the specified level token",
+        )
+        group.add_argument(
+            "--bpemodel",
+            type=str_or_none,
+            default=None,
+            help="The model file of sentencepiece",
+        )
+        parser.add_argument(
+            "--non_linguistic_symbols",
+            type=str_or_none,
+            help="non_linguistic_symbols file path",
+        )
+        group.add_argument(
+            "--cleaner",
+            type=str_or_none,
+            choices=[
+                None,
+                "tacotron",
+                "jaconv",
+                "vietnamese",
+                "whisper_en",
+                "whisper_basic",
+            ],
+            default=None,
+            help="Apply text cleaning",
+        )
+        group.add_argument(
+            "--g2p",
+            type=str_or_none,
+            choices=g2p_choices,
+            default=None,
+            help="Specify g2p method if --token_type=phn",
+        )
+        group.add_argument(
+            "--speech_volume_normalize",
+            type=float_or_none,
+            default=None,
+            help="Scale the maximum amplitude to the given value.",
+        )
+        group.add_argument(
+            "--rir_scp",
+            type=str_or_none,
+            default=None,
+            help="The file path of rir scp file.",
+        )
+        group.add_argument(
+            "--rir_apply_prob",
+            type=float,
+            default=1.0,
+            help="The probability for applying RIR convolution.",
+        )
+        group.add_argument(
+            "--noise_scp",
+            type=str_or_none,
+            default=None,
+            help="The file path of noise scp file.",
+        )
+        group.add_argument(
+            "--noise_apply_prob",
+            type=float,
+            default=1.0,
+            help="The probability applying Noise adding.",
+        )
+        group.add_argument(
+            "--noise_db_range",
+            type=str,
+            default="13_15",
+            help="The range of noise decibel level.",
+        )
+        group.add_argument(
+            "--short_noise_thres",
+            type=float,
+            default=0.5,
+            help="If len(noise) / len(speech) is smaller than this threshold during "
+            "dynamic mixing, a warning will be displayed.",
+        )
+
+        for class_choices in cls.class_choices_list:
+            # Append --<name> and --<name>_conf.
+            # e.g. --encoder and --encoder_conf
+            class_choices.add_arguments(group)
+
+    @classmethod
+    @typechecked
+    def build_collate_fn(
+        cls, args: argparse.Namespace, train: bool
+    ) -> Callable[
+        [Collection[Tuple[str, Dict[str, np.ndarray]]]],
+        Tuple[List[str], Dict[str, torch.Tensor]],
+    ]:
+        # NOTE(kamo): int value = 0 is reserved by CTC-blank symbol
+        return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
+
+    @classmethod
+    @typechecked
+    def build_preprocess_fn(
+        cls, args: argparse.Namespace, train: bool
+    ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
+        """Return the preprocessing callable, or None if --use_preprocessor is off."""
+        if args.use_preprocessor:
+            try:
+                _ = getattr(args, "preprocessor")
+            except AttributeError:
+                # args has no "preprocessor": fall back to the registered default
+                setattr(args, "preprocessor", "s2t_ctc")
+                setattr(args, "preprocessor_conf", dict())
+
+            preprocessor_class = preprocessor_choices.get_class(args.preprocessor)
+            retval = preprocessor_class(
+                train=train,
+                token_type=args.token_type,
+                token_list=args.token_list,
+                bpemodel=args.bpemodel,
+                text_cleaner=args.cleaner,
+                g2p_type=args.g2p,
+                non_linguistic_symbols=args.non_linguistic_symbols,
+                # NOTE(kamo): Check attribute existence for backward compatibility
+                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
+                rir_apply_prob=args.rir_apply_prob
+                if hasattr(args, "rir_apply_prob")
+                else 1.0,
+                noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
+                noise_apply_prob=args.noise_apply_prob
+                if hasattr(args, "noise_apply_prob")
+                else 1.0,
+                noise_db_range=args.noise_db_range
+                if hasattr(args, "noise_db_range")
+                else "13_15",
+                short_noise_thres=args.short_noise_thres
+                if hasattr(args, "short_noise_thres")
+                else 0.5,
+                speech_volume_normalize=args.speech_volume_normalize
+                if hasattr(args, "speech_volume_normalize")
+                else None,
+                **args.preprocessor_conf,
+            )
+        else:
+            retval = None
+        return retval
+
+    @classmethod
+    def required_data_names(
+        cls, train: bool = True, inference: bool = False
+    ) -> Tuple[str, ...]:
+        if not inference:
+            retval = ("speech", "text")
+        else:
+            # Recognition mode
+            retval = ("speech",)
+        return retval
+
+    @classmethod
+    @typechecked
+    def optional_data_names(
+        cls, train: bool = True, inference: bool = False
+    ) -> Tuple[str, ...]:
+        MAX_REFERENCE_NUM = 4
+
+        retval = ["text_prev", "text_ctc"] + [
+            "text_spk{}".format(n) for n in range(2, MAX_REFERENCE_NUM + 1)
+        ]
+        retval = tuple(retval)
+
+        logging.info(f"Optional Data Names: {retval}")
+        return retval
+
+    @classmethod
+    @typechecked
+    def build_model(cls, args: argparse.Namespace):
+        if isinstance(args.token_list, str):
+            with open(args.token_list, encoding="utf-8") as f:
+                token_list = [line.rstrip() for line in f]
+
+            # Overwriting token_list to keep it as "portable".
+            args.token_list = list(token_list)
+        elif isinstance(args.token_list, (tuple, list)):
+            token_list = list(args.token_list)
+        else:
+            raise RuntimeError("token_list must be str or list")
+
+        vocab_size = len(token_list)
+        logging.info(f"Vocabulary size: {vocab_size}")
+
+        # 1. frontend
+        if args.input_size is None:
+            # Extract features in the model
+            frontend_class = frontend_choices.get_class(args.frontend)
+            frontend = frontend_class(**args.frontend_conf)
+            input_size = frontend.output_size()
+        else:
+            # Give features from data-loader
+            args.frontend = None
+            args.frontend_conf = {}
+            frontend = None
+            input_size = args.input_size
+
+        # 2. Data augmentation for spectrogram
+        if args.specaug is not None:
+            specaug_class = specaug_choices.get_class(args.specaug)
+            specaug = specaug_class(**args.specaug_conf)
+        else:
+            specaug = None
+
+        # 3. Normalization layer
+        if args.normalize is not None:
+            normalize_class = normalize_choices.get_class(args.normalize)
+            normalize = normalize_class(**args.normalize_conf)
+        else:
+            normalize = None
+
+        # 4. Encoder
+        encoder_class = encoder_choices.get_class(args.encoder)
+        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+
+        # 5. Prompt Encoder
+        promptencoder_class = promptencoder_choices.get_class(args.promptencoder)
+        promptencoder = promptencoder_class(
+            input_size=args.promptencoder_conf['output_size'],
+            input_layer=None,
+            **args.promptencoder_conf
+        )
+
+        # 6. CTC
+        ctc = CTC(
+            odim=vocab_size, encoder_output_size=encoder.output_size(), **args.ctc_conf
+        )
+
+        # 7. Build model
+        model_class = model_choices.get_class(args.model)
+        model = model_class(
+            vocab_size=vocab_size,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            encoder=encoder,
+            prompt_encoder=promptencoder,
+            ctc=ctc,
+            token_list=token_list,
+            **args.model_conf,
+        )
+
+        # FIXME(kamo): Should be done in model?
+        # 8. Initialize
+        if args.init is not None:
+            initialize(model, args.init)
+
+        return model
diff --git a/espnet2/train/preprocessor.py b/espnet2/train/preprocessor.py
index a288821656c..271d4c69901 100644
--- a/espnet2/train/preprocessor.py
+++ b/espnet2/train/preprocessor.py
@@ -1,3 +1,4 @@
+import copy
 import json
 import logging
 import random
@@ -2358,6 +2359,174 @@ def __call__(
         return data
 
 
+class S2TCTCPreprocessor(CommonPreprocessor):
+    """Preprocessor for OWSM-CTC."""
+
+    def __init__(
+        self,
+        train: bool,
+        token_type: str = None,
+        token_list: Union[Path, str, Iterable[str]] = None,
+        bpemodel: Union[Path, str, Iterable[str]] = None,
+        text_cleaner: Collection[str] = None,
+        g2p_type: str = None,
+        unk_symbol: str = "<unk>",
+        space_symbol: str = "<space>",
+        non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
+        delimiter: str = None,
+        rir_scp: str = None,
+        rir_apply_prob: float = 1.0,
+        noise_scp: str = None,
+        noise_apply_prob: float = 1.0,
+        noise_db_range: str = "3_10",
+        short_noise_thres: float = 0.5,
+        speech_volume_normalize: float = None,
+        speech_name: str = "speech",
+        text_name: str = "text",
+        text_prev_name: str = "text_prev",
+        text_ctc_name: str = "text_ctc",
+        fs: int = 16000,
+        na_symbol: str = "<na>",  # text is not available e.g. for prev or ctc
+        speech_length: float = 30,  # pad or trim speech to this value in seconds
+        speech_init_silence: float = 1.0,  # max silence before speech for data aug
+        text_prev_apply_prob: float = 0.5,  # whether to condition on text_prev
+        lang_apply_prob: float = 0.5,  # whether to use groundtruth language or unknown
+        nolang_symbol: str = "<nolang>",
+    ):
+        super().__init__(
+            train=train,
+            token_type=token_type,
+            token_list=token_list,
+            bpemodel=bpemodel,
+            text_cleaner=text_cleaner,
+            g2p_type=g2p_type,
+            unk_symbol=unk_symbol,
+            space_symbol=space_symbol,
+            non_linguistic_symbols=non_linguistic_symbols,
+            delimiter=delimiter,
+            rir_scp=rir_scp,
+            rir_apply_prob=rir_apply_prob,
+            noise_scp=noise_scp,
+            noise_apply_prob=noise_apply_prob,
+            noise_db_range=noise_db_range,
+            short_noise_thres=short_noise_thres,
+            speech_volume_normalize=speech_volume_normalize,
+            speech_name=speech_name,
+            text_name=text_name,
+            fs=fs,
+        )
+        self.text_prev_name = text_prev_name
+        self.text_ctc_name = text_ctc_name
+        self.speech_length = int(speech_length * fs)
+        self.speech_init_silence = int(speech_init_silence * fs)
+        self.text_prev_apply_prob = text_prev_apply_prob
+        self.lang_apply_prob = lang_apply_prob
+
+        # Obtain the token id of special tokens
+        self.na_symbol = na_symbol
+        self.nolang = self.token_id_converter.token2id[nolang_symbol]
+
+    @typechecked
+    def _pad_or_trim_speech(
+        self, data: Dict[str, Union[str, np.ndarray]]
+    ) -> Tuple[Dict[str, Union[str, np.ndarray]], int]:
+
+        init_pad = 0
+        if self.speech_name in data:
+            speech = data[self.speech_name]
+
+            # speech: (Nmic, Time)
+            if speech.ndim == 1:
+                speech = speech[None, :]
+            else:
+                speech = speech.T
+
+            # Add silence to the left
+            if self.train and speech.shape[-1] < self.speech_length:
+                init_pad = np.random.randint(
+                    min(self.speech_length - speech.shape[-1], self.speech_init_silence)
+                    + 1
+                )
+                speech = np.pad(speech, ((0, 0), (init_pad, 0)))
+
+            # Pad or trim to max_samples
+            if speech.shape[-1] < self.speech_length:
+                speech = np.pad(
+                    speech, ((0, 0), (0, self.speech_length - speech.shape[-1]))
+                )
+            else:
+                speech = speech[:, : self.speech_length]
+
+            data[self.speech_name] = speech.T  # convert back to time first
+
+        return data, init_pad
+
+    @typechecked
+    def _text_process(
+        self, data: Dict[str, Union[str, np.ndarray]]
+    ) -> Dict[str, np.ndarray]:
+
+        # NOTE: the order is important
+        text_names = [self.text_name, self.text_prev_name, self.text_ctc_name]
+        if self.tokenizer is not None:
+            for name in text_names:
+                if name in data:
+                    text = data[name]
+
+                    # Remove prev text by setting it to <na>
+                    if (
+                        self.train
+                        and name == self.text_prev_name
+                        and np.random.uniform() > self.text_prev_apply_prob
+                    ):
+                        text = self.na_symbol
+
+                    text = self.text_cleaner(text)
+                    tokens = self.tokenizer.text2tokens(text)
+                    text_ints = self.token_id_converter.tokens2ids(tokens)
+                    text_ints = np.array(text_ints, dtype=np.int64)
+
+                    # Augment text
+                    if name == self.text_name:
+                        # NOTE(yifan): The first token is always space
+                        # which should be removed.
+                        # No space is allowed between special tokens.
+                        # This works for bpe, but maybe not for the other types.
+                        text_ints = text_ints[1:]
+
+                        # First two tokens are <lang> and <task>
+                        # NOTE: must copy the array
+                        data["prefix"] = copy.deepcopy(text_ints[:2])
+                        if np.random.uniform() > self.lang_apply_prob:
+                            data["prefix"][0] = self.nolang
+
+                    elif name == self.text_ctc_name:
+                        # Add <lang> and <task> to ASR Text as well
+                        text_ints = np.concatenate([data[self.text_name][:2], text_ints])
+
+                    elif name == self.text_prev_name:
+                        # Remove space before <na>
+                        if text == self.na_symbol:
+                            assert len(text_ints) == 2, text_ints
+                            text_ints = text_ints[1:]
+
+                    data[name] = text_ints
+
+        return data
+
+    @typechecked
+    def __call__(
+        self, uid: str, data: Dict[str, Union[str, np.ndarray]]
+    ) -> Dict[str, np.ndarray]:
+
+        data = self._speech_process(data)
+        data, _ = self._pad_or_trim_speech(data)
+
+        data = self._text_process(data)
+
+        return data
+
+
 class SpeechLMPreprocessor(AbsPreprocessor):
     """Preprocessor specifically for SpeechLM models"""
 

From 5067c1158c319fc62055e61fb549ee15831e910f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 22 Oct 2024 01:28:32 +0000
Subject: [PATCH 03/15] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../s2t1/local/convert_owsm_data.py           | 19 ++--
 .../transformer/subsampling.py                | 14 ++-
 .../asr/encoder/e_branchformer_ctc_encoder.py | 48 +++++++----
 espnet2/bin/s2t_ctc_align.py                  | 33 ++++---
 espnet2/bin/s2t_inference_ctc.py              | 86 +++++++++++--------
 espnet2/s2t/espnet_ctc_model.py               | 27 +++---
 espnet2/tasks/s2t_ctc.py                      | 50 ++++++-----
 espnet2/train/preprocessor.py                 |  4 +-
 8 files changed, 171 insertions(+), 110 deletions(-)

diff --git a/egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py b/egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py
index 1488702c929..379b703af09 100644
--- a/egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py
+++ b/egs2/owsm_ctc_v3.1/s2t1/local/convert_owsm_data.py
@@ -1,6 +1,7 @@
-import numpy as np
 from pathlib import Path
 
+import numpy as np
+
 from espnet2.text.build_tokenizer import build_tokenizer
 from espnet2.text.token_id_converter import TokenIDConverter
 
@@ -14,9 +15,7 @@ def parse_owsm(
     first_time = token_id_converter.token2id["<0.00>"]
     last_time = token_id_converter.token2id["<30.00>"]
 
-    with open(text_in, 'r') as fin, open(
-        text_out, 'w'
-    ) as fout:
+    with open(text_in, "r") as fin, open(text_out, "w") as fout:
         for line in fin:
             utt_id, text = line.strip().split(maxsplit=1)
 
@@ -33,9 +32,7 @@ def parse_owsm(
             tokens = token_id_converter.ids2tokens(text_ints)
             text = tokenizer.tokens2text(tokens)
 
-            fout.write(
-                f"{utt_id} {text}\n"
-            )
+            fout.write(f"{utt_id} {text}\n")
 
 
 if __name__ == "__main__":
@@ -51,11 +48,11 @@ def parse_owsm(
         unk_symbol="<unk>",
     )
 
-    for name in ['train_v3', 'dev_v3']:
-        (Path(root) / name / 'text').rename(Path(root) / name / 'text.old')
+    for name in ["train_v3", "dev_v3"]:
+        (Path(root) / name / "text").rename(Path(root) / name / "text.old")
         parse_owsm(
-            Path(root) / name / 'text.old',
-            Path(root) / name / 'text',
+            Path(root) / name / "text.old",
+            Path(root) / name / "text",
             tokenizer,
             token_id_converter,
         )
diff --git a/espnet/nets/pytorch_backend/transformer/subsampling.py b/espnet/nets/pytorch_backend/transformer/subsampling.py
index ab4a9eb6b82..d70e6c769e7 100644
--- a/espnet/nets/pytorch_backend/transformer/subsampling.py
+++ b/espnet/nets/pytorch_backend/transformer/subsampling.py
@@ -474,7 +474,9 @@ def __init__(self, idim, odim, dropout_rate, pos_enc=None):
             torch.nn.ReLU(),
         )
         self.out = torch.nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), odim)
-        self.pos_enc = pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
+        self.pos_enc = (
+            pos_enc if pos_enc is not None else PositionalEncoding(odim, dropout_rate)
+        )
 
     def forward(self, x, x_mask, prefix_embeds=None):
         """Subsample x.
@@ -503,8 +505,14 @@ def forward(self, x, x_mask, prefix_embeds=None):
             if x_mask is not None:
                 x_mask = torch.cat(
                     [
-                        torch.ones(x_mask.shape[0], 1, prefix_embeds.size(1), dtype=x_mask.dtype, device=x_mask.device),
-                        x_mask
+                        torch.ones(
+                            x_mask.shape[0],
+                            1,
+                            prefix_embeds.size(1),
+                            dtype=x_mask.dtype,
+                            device=x_mask.device,
+                        ),
+                        x_mask,
                     ],
                     dim=-1,
                 )
diff --git a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
index 971c4cf18a3..ca2f4493e57 100644
--- a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
+++ b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
@@ -107,7 +107,14 @@ def __init__(
         )
         self.merge_proj = torch.nn.Linear(size + size, size)
 
-    def forward(self, x_input, mask, cache=None, memory=None, memory_mask=None,):
+    def forward(
+        self,
+        x_input,
+        mask,
+        cache=None,
+        memory=None,
+        memory_mask=None,
+    ):
         """Compute encoded features.
 
         Args:
@@ -222,7 +229,7 @@ def __init__(
         merge_conv_kernel: int = 3,
         interctc_layer_idx=None,
         interctc_use_conditioning: bool = False,
-        use_cross_attention=True,   # bool or list of bool
+        use_cross_attention=True,  # bool or list of bool
         use_flash_attn: bool = False,
     ):
         super().__init__()
@@ -404,7 +411,10 @@ def __init__(
 
         if isinstance(use_cross_attention, bool):
             use_cross_attention = [use_cross_attention for _ in range(num_blocks)]
-        assert isinstance(use_cross_attention, list) and len(use_cross_attention) == num_blocks
+        assert (
+            isinstance(use_cross_attention, list)
+            and len(use_cross_attention) == num_blocks
+        )
 
         self.encoders = repeat(
             num_blocks,
@@ -413,17 +423,23 @@ def __init__(
                 encoder_selfattn_layer(*encoder_selfattn_layer_args),
                 cgmlp_layer(*cgmlp_layer_args),
                 positionwise_layer(*positionwise_layer_args) if use_ffn else None,
-                positionwise_layer(*positionwise_layer_args)
-                if use_ffn and macaron_ffn
-                else None,
-                MultiHeadedAttention(
-                    attention_heads,
-                    output_size,
-                    attention_dropout_rate,
-                    False,  # no qk_norm
-                    use_flash_attn,
-                    cross_attn=True,
-                ) if use_cross_attention[lnum] else None,
+                (
+                    positionwise_layer(*positionwise_layer_args)
+                    if use_ffn and macaron_ffn
+                    else None
+                ),
+                (
+                    MultiHeadedAttention(
+                        attention_heads,
+                        output_size,
+                        attention_dropout_rate,
+                        False,  # no qk_norm
+                        use_flash_attn,
+                        cross_attn=True,
+                    )
+                    if use_cross_attention[lnum]
+                    else None
+                ),
                 dropout_rate,
                 merge_conv_kernel,
             ),
@@ -493,7 +509,9 @@ def forward(
 
         intermediate_outs = []
         for layer_idx, encoder_layer in enumerate(self.encoders):
-            xs_pad, masks = encoder_layer(xs_pad, masks, memory=memory, memory_mask=memory_mask)
+            xs_pad, masks = encoder_layer(
+                xs_pad, masks, memory=memory, memory_mask=memory_mask
+            )
 
             if layer_idx + 1 in self.interctc_layer_idx:
                 encoder_out = xs_pad
diff --git a/espnet2/bin/s2t_ctc_align.py b/espnet2/bin/s2t_ctc_align.py
index cbfe969aa8c..4e474ed84fd 100755
--- a/espnet2/bin/s2t_ctc_align.py
+++ b/espnet2/bin/s2t_ctc_align.py
@@ -429,10 +429,16 @@ def get_lpz(self, speech: Union[torch.Tensor, np.ndarray]):
         buffer_len = int(sample_rate * buffer_len_in_secs)
         chunk_len = int(sample_rate * chunk_len_in_secs)
 
-        speech = np.pad(speech, (int(sample_rate * context_len_in_secs), int(sample_rate * context_len_in_secs)))
+        speech = np.pad(
+            speech,
+            (
+                int(sample_rate * context_len_in_secs),
+                int(sample_rate * context_len_in_secs),
+            ),
+        )
         buffer_list = []
         for i in range(0, len(speech), chunk_len):
-            cur_buffer = speech[i:i+buffer_len]
+            cur_buffer = speech[i : i + buffer_len]
             if len(cur_buffer) < buffer_len:
                 buffer_list.append(
                     np.pad(cur_buffer, (0, buffer_len - len(cur_buffer)))
@@ -452,12 +458,16 @@ def get_lpz(self, speech: Union[torch.Tensor, np.ndarray]):
                 [cur_speech.size(0)], dtype=torch.long, fill_value=cur_speech.size(1)
             )
 
-            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(
+                cur_speech.size(0), 1
+            )
             text_prev_lengths = text_prev.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=text_prev.size(1)
             )
 
-            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(
+                cur_speech.size(0), 1
+            )
             prefix_lengths = prefix.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=prefix.size(-1)
             )
@@ -482,12 +492,16 @@ def get_lpz(self, speech: Union[torch.Tensor, np.ndarray]):
                 enc, intermediate_outs = enc
 
             # enc: (B, T, D)
-            enc = enc[:, :buffer_frames]    # NOTE(yifan): IMPORTANT: it might be longer due to padding in conv
-            batched_log_p = self.ctc.log_softmax(enc).detach()      # (B, T, V)
-            valid_log_p = batched_log_p[:, context_frames : -context_frames].reshape(-1, batched_log_p.size(-1))    # (T', V)
+            enc = enc[
+                :, :buffer_frames
+            ]  # NOTE(yifan): IMPORTANT: it might be longer due to padding in conv
+            batched_log_p = self.ctc.log_softmax(enc).detach()  # (B, T, V)
+            valid_log_p = batched_log_p[:, context_frames:-context_frames].reshape(
+                -1, batched_log_p.size(-1)
+            )  # (T', V)
             unmerged.append(valid_log_p)
 
-        lpz = torch.cat(unmerged, dim=0).cpu().numpy()   # (time, V)
+        lpz = torch.cat(unmerged, dim=0).cpu().numpy()  # (time, V)
         return lpz
 
     def _split_text(self, text):
@@ -648,9 +662,8 @@ def __call__(
 
         Returns:
             CTCSegmentationTask object with segments.
-        
-        """
 
+        """
 
         if fs is not None:
             self.set_config(fs=fs)
diff --git a/espnet2/bin/s2t_inference_ctc.py b/espnet2/bin/s2t_inference_ctc.py
index ea70ba3e81c..b663d8c254a 100644
--- a/espnet2/bin/s2t_inference_ctc.py
+++ b/espnet2/bin/s2t_inference_ctc.py
@@ -266,9 +266,7 @@ def __call__(
         task_id = self.converter.token2id[task_sym]
 
         if isinstance(text_prev, str):
-            text_prev = self.converter.tokens2ids(
-                self.tokenizer.text2tokens(text_prev)
-            )
+            text_prev = self.converter.tokens2ids(self.tokenizer.text2tokens(text_prev))
         else:
             text_prev = text_prev.tolist()
 
@@ -276,12 +274,18 @@ def __call__(
         if self.s2t_model.na in text_prev:
             text_prev = [self.s2t_model.na]
 
-        text_prev = torch.tensor(text_prev, dtype=torch.long).unsqueeze(0)  # (1, length)
-        text_prev_lengths = text_prev.new_full([1], dtype=torch.long, fill_value=text_prev.size(1))
+        text_prev = torch.tensor(text_prev, dtype=torch.long).unsqueeze(
+            0
+        )  # (1, length)
+        text_prev_lengths = text_prev.new_full(
+            [1], dtype=torch.long, fill_value=text_prev.size(1)
+        )
 
         # Prepare prefix
-        prefix = torch.tensor([[lang_id, task_id]], dtype=torch.long) # (1, 2)
-        prefix_lengths = prefix.new_full([1], dtype=torch.long, fill_value=prefix.size(-1))
+        prefix = torch.tensor([[lang_id, task_id]], dtype=torch.long)  # (1, 2)
+        prefix_lengths = prefix.new_full(
+            [1], dtype=torch.long, fill_value=prefix.size(-1)
+        )
 
         # Preapre speech
         if isinstance(speech, np.ndarray):
@@ -291,7 +295,9 @@ def __call__(
         # speech: (nsamples,) -> (1, nsamples)
         speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
         # lengths: (1,)
-        speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+        speech_lengths = speech.new_full(
+            [1], dtype=torch.long, fill_value=speech.size(1)
+        )
         logging.info("speech length: " + str(speech.size(1)))
 
         batch = {
@@ -452,7 +458,7 @@ def __init__(
             s2t_model = torch.quantization.quantize_dynamic(
                 s2t_model, qconfig_spec=quantize_modules, dtype=quantize_dtype
             )
-    
+
         logging.info(f"Decoding device={device}, dtype={dtype}")
 
         # [Optional] Build Text converter: e.g. bpe-sym -> Text
@@ -525,9 +531,7 @@ def __call__(
         task_id = self.converter.token2id[task_sym]
 
         if isinstance(text_prev, str):
-            text_prev = self.converter.tokens2ids(
-                self.tokenizer.text2tokens(text_prev)
-            )
+            text_prev = self.converter.tokens2ids(self.tokenizer.text2tokens(text_prev))
         else:
             text_prev = text_prev.tolist()
 
@@ -535,12 +539,18 @@ def __call__(
         if self.s2t_model.na in text_prev:
             text_prev = [self.s2t_model.na]
 
-        text_prev = torch.tensor(text_prev, dtype=torch.long).unsqueeze(0)  # (1, length)
-        text_prev_lengths = text_prev.new_full([1], dtype=torch.long, fill_value=text_prev.size(1))
+        text_prev = torch.tensor(text_prev, dtype=torch.long).unsqueeze(
+            0
+        )  # (1, length)
+        text_prev_lengths = text_prev.new_full(
+            [1], dtype=torch.long, fill_value=text_prev.size(1)
+        )
 
         # Prepare prefix
-        prefix = torch.tensor([[lang_id, task_id]], dtype=torch.long) # (1, 2)
-        prefix_lengths = prefix.new_full([1], dtype=torch.long, fill_value=prefix.size(-1))
+        prefix = torch.tensor([[lang_id, task_id]], dtype=torch.long)  # (1, 2)
+        prefix_lengths = prefix.new_full(
+            [1], dtype=torch.long, fill_value=prefix.size(-1)
+        )
 
         # Preapre speech
         if isinstance(speech, np.ndarray):
@@ -550,7 +560,9 @@ def __call__(
         # speech: (nsamples,) -> (1, nsamples)
         speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
         # lengths: (1,)
-        speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+        speech_lengths = speech.new_full(
+            [1], dtype=torch.long, fill_value=speech.size(1)
+        )
         logging.info("speech length: " + str(speech.size(1)))
 
         batch = {
@@ -603,7 +615,7 @@ def _decode_interctc(
 
     def _decode_single_sample(self, enc: torch.Tensor):
         # enc: (B, T, D)
-        token_int = self.s2t_model.ctc.argmax(enc)[0]   # batch size is 1; (T,)
+        token_int = self.s2t_model.ctc.argmax(enc)[0]  # batch size is 1; (T,)
         token_int = torch.unique_consecutive(token_int).cpu().tolist()
         token_int = list(filter(lambda x: x != self.s2t_model.blank_id, token_int))
         token = self.converter.ids2tokens(token_int)
@@ -650,13 +662,13 @@ def decode_long_batched_buffered(
         buffer_len_in_secs = self.preprocessor_conf["speech_length"]
         chunk_len_in_secs = buffer_len_in_secs - 2 * context_len_in_secs
 
-        class AudioChunkIterator():
+        class AudioChunkIterator:
             def __init__(self, samples, chunk_len_in_secs, sample_rate):
                 self._samples = samples
                 self._chunk_len = chunk_len_in_secs * sample_rate
                 self._start = 0
                 self.output = True
-        
+
             def __iter__(self):
                 return self
 
@@ -665,14 +677,14 @@ def __next__(self):
                     raise StopIteration
                 last = int(self._start + self._chunk_len)
                 if last <= len(self._samples):
-                    chunk = self._samples[self._start: last]
+                    chunk = self._samples[self._start : last]
                     self._start = last
                 else:
-                    chunk = np.zeros([int(self._chunk_len)], dtype='float32')
+                    chunk = np.zeros([int(self._chunk_len)], dtype="float32")
                     samp_len = len(self._samples) - self._start
-                    chunk[0:samp_len] = self._samples[self._start:len(self._samples)]
+                    chunk[0:samp_len] = self._samples[self._start : len(self._samples)]
                     self.output = False
-        
+
                 return chunk
 
         buffer_len = int(sample_rate * buffer_len_in_secs)
@@ -696,12 +708,16 @@ def __next__(self):
                 [cur_speech.size(0)], dtype=torch.long, fill_value=cur_speech.size(1)
             )
 
-            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(
+                cur_speech.size(0), 1
+            )
             text_prev_lengths = text_prev.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=text_prev.size(1)
             )
 
-            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(
+                cur_speech.size(0), 1
+            )
             prefix_lengths = prefix.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=prefix.size(-1)
             )
@@ -726,10 +742,12 @@ def __next__(self):
                 enc, intermediate_outs = enc
 
             # enc: (B, T, D)
-            batched_token_int = self.s2t_model.ctc.argmax(enc)      # (B, T)
-            valid_token_int = batched_token_int[:, context_frames : -context_frames].reshape(-1)
+            batched_token_int = self.s2t_model.ctc.argmax(enc)  # (B, T)
+            valid_token_int = batched_token_int[
+                :, context_frames:-context_frames
+            ].reshape(-1)
             unmerged.append(valid_token_int)
-        
+
         unmerged = torch.cat(unmerged)
         merged = torch.unique_consecutive(unmerged).cpu().tolist()
         token_int = list(filter(lambda x: x != self.s2t_model.blank_id, merged))
@@ -1086,12 +1104,12 @@ def get_parser():
     group.add_argument(
         "--lang_sym", type=str, default="nolang", help="Language symbol."
     )
+    group.add_argument("--task_sym", type=str, default="asr", help="Task symbol.")
     group.add_argument(
-        "--task_sym", type=str, default="asr", help="Task symbol."
-    )
-    group.add_argument(
-        "--generate_interctc_outputs", type=bool, default=False,
-        help="Also write intermediate CTC outputs."
+        "--generate_interctc_outputs",
+        type=bool,
+        default=False,
+        help="Also write intermediate CTC outputs.",
     )
 
     group = parser.add_argument_group("Text converter related")
diff --git a/espnet2/s2t/espnet_ctc_model.py b/espnet2/s2t/espnet_ctc_model.py
index 146f756c25f..ebd353a4fb1 100644
--- a/espnet2/s2t/espnet_ctc_model.py
+++ b/espnet2/s2t/espnet_ctc_model.py
@@ -13,8 +13,8 @@
 from espnet2.torch_utils.device_funcs import force_gatherable
 from espnet2.train.abs_espnet_model import AbsESPnetModel
 from espnet.nets.e2e_asr_common import ErrorCalculator
-from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
 from espnet.nets.pytorch_backend.nets_utils import make_pad_mask
+from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
 
 
 class ESPnetS2TCTCModel(AbsESPnetModel):
@@ -65,16 +65,18 @@ def __init__(
         self.encoder = encoder
         self.prompt_encoder = prompt_encoder
 
-        self.embed = torch.nn.Embedding(
-            vocab_size, self.prompt_encoder.output_size()
-        )
+        self.embed = torch.nn.Embedding(vocab_size, self.prompt_encoder.output_size())
         self.pos_enc = PositionalEncoding(self.prompt_encoder.output_size(), 0.0)
 
         if self.encoder.output_size() != self.prompt_encoder.output_size():
             # used in encoder to inject task and lang tokens
-            self.embed_proj = torch.nn.Linear(self.prompt_encoder.output_size(), self.encoder.output_size())
+            self.embed_proj = torch.nn.Linear(
+                self.prompt_encoder.output_size(), self.encoder.output_size()
+            )
             # applied to the output of prompt encoder
-            self.prompt_proj = torch.nn.Linear(self.prompt_encoder.output_size(), self.encoder.output_size())
+            self.prompt_proj = torch.nn.Linear(
+                self.prompt_encoder.output_size(), self.encoder.output_size()
+            )
         else:
             self.embed_proj = torch.nn.Identity()
             self.prompt_proj = torch.nn.Identity()
@@ -118,8 +120,7 @@ def encode(
         # Forward prompt encoder
         text_prev[text_prev == -1] = self.eos
         memory, memory_lengths, _ = self.prompt_encoder(
-            self.pos_enc(self.embed(text_prev)),
-            text_prev_lengths
+            self.pos_enc(self.embed(text_prev)), text_prev_lengths
         )
         memory_mask = (~make_pad_mask(memory_lengths)[:, None, :]).to(memory.device)
 
@@ -199,7 +200,7 @@ def forward(
             text_ctc.shape,
             text_ctc_lengths.shape,
             prefix.shape,
-            prefix_lengths.shape
+            prefix_lengths.shape,
         )
         batch_size = speech.shape[0]
 
@@ -210,9 +211,7 @@ def forward(
         text = text[:, : text_lengths.max()]
 
         encoder_out, encoder_out_lens = self.encode(
-            speech, speech_lengths,
-            text_prev, text_prev_lengths,
-            prefix, prefix_lengths
+            speech, speech_lengths, text_prev, text_prev_lengths, prefix, prefix_lengths
         )
 
         intermediate_outs = None
@@ -253,7 +252,9 @@ def forward(
         loss_interctc = 0.0
         if self.interctc_weight != 0.0 and intermediate_outs is not None:
             assert len(self.ctc_asr_only) == len(intermediate_outs) + 1
-            for (layer_idx, intermediate_out), asr_only in zip(intermediate_outs, self.ctc_asr_only):
+            for (layer_idx, intermediate_out), asr_only in zip(
+                intermediate_outs, self.ctc_asr_only
+            ):
                 if asr_only:
                     loss_ic, cer_ic = self._calc_ctc_loss(
                         intermediate_out, encoder_out_lens, text_ctc, text_ctc_lengths
diff --git a/espnet2/tasks/s2t_ctc.py b/espnet2/tasks/s2t_ctc.py
index a7d2d32b9cb..2461275b2ef 100644
--- a/espnet2/tasks/s2t_ctc.py
+++ b/espnet2/tasks/s2t_ctc.py
@@ -16,6 +16,7 @@
 from espnet2.asr.encoder.contextual_block_transformer_encoder import (
     ContextualBlockTransformerEncoder,
 )
+from espnet2.asr.encoder.e_branchformer_ctc_encoder import EBranchformerCTCEncoder
 from espnet2.asr.encoder.e_branchformer_encoder import EBranchformerEncoder
 from espnet2.asr.encoder.hubert_encoder import (
     FairseqHubertEncoder,
@@ -31,7 +32,6 @@
 from espnet2.asr.encoder.vgg_rnn_encoder import VGGRNNEncoder
 from espnet2.asr.encoder.wav2vec2_encoder import FairSeqWav2Vec2Encoder
 from espnet2.asr.encoder.whisper_encoder import OpenAIWhisperEncoder
-from espnet2.asr.encoder.e_branchformer_ctc_encoder import EBranchformerCTCEncoder
 from espnet2.asr.frontend.abs_frontend import AbsFrontend
 from espnet2.asr.frontend.default import DefaultFrontend
 from espnet2.asr.frontend.fused import FusedFrontends
@@ -43,15 +43,19 @@
 from espnet2.layers.abs_normalize import AbsNormalize
 from espnet2.layers.global_mvn import GlobalMVN
 from espnet2.layers.utterance_mvn import UtteranceMVN
-from espnet2.s2t.espnet_model import ESPnetS2TModel
 from espnet2.s2t.espnet_ctc_model import ESPnetS2TCTCModel
+from espnet2.s2t.espnet_model import ESPnetS2TModel
 from espnet2.tasks.abs_task import AbsTask
 from espnet2.text.phoneme_tokenizer import g2p_choices
 from espnet2.torch_utils.initialize import initialize
 from espnet2.train.abs_espnet_model import AbsESPnetModel
 from espnet2.train.class_choices import ClassChoices
 from espnet2.train.collate_fn import CommonCollateFn
-from espnet2.train.preprocessor import AbsPreprocessor, S2TPreprocessor, S2TCTCPreprocessor
+from espnet2.train.preprocessor import (
+    AbsPreprocessor,
+    S2TCTCPreprocessor,
+    S2TPreprocessor,
+)
 from espnet2.train.trainer import Trainer
 from espnet2.utils.get_default_kwargs import get_default_kwargs
 from espnet2.utils.nested_dict_action import NestedDictAction
@@ -316,9 +320,7 @@ def add_task_arguments(cls, parser: argparse.ArgumentParser):
 
     @classmethod
     @typechecked
-    def build_collate_fn(
-        cls, args: argparse.Namespace, train: bool
-    ) -> Callable[
+    def build_collate_fn(cls, args: argparse.Namespace, train: bool) -> Callable[
         [Collection[Tuple[str, Dict[str, np.ndarray]]]],
         Tuple[List[str], Dict[str, torch.Tensor]],
     ]:
@@ -350,22 +352,24 @@ def build_preprocess_fn(
                 non_linguistic_symbols=args.non_linguistic_symbols,
                 # NOTE(kamo): Check attribute existence for backward compatibility
                 rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
-                rir_apply_prob=args.rir_apply_prob
-                if hasattr(args, "rir_apply_prob")
-                else 1.0,
+                rir_apply_prob=(
+                    args.rir_apply_prob if hasattr(args, "rir_apply_prob") else 1.0
+                ),
                 noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
-                noise_apply_prob=args.noise_apply_prob
-                if hasattr(args, "noise_apply_prob")
-                else 1.0,
-                noise_db_range=args.noise_db_range
-                if hasattr(args, "noise_db_range")
-                else "13_15",
-                short_noise_thres=args.short_noise_thres
-                if hasattr(args, "short_noise_thres")
-                else 0.5,
-                speech_volume_normalize=args.speech_volume_normalize
-                if hasattr(args, "rir_scp")
-                else None,
+                noise_apply_prob=(
+                    args.noise_apply_prob if hasattr(args, "noise_apply_prob") else 1.0
+                ),
+                noise_db_range=(
+                    args.noise_db_range if hasattr(args, "noise_db_range") else "13_15"
+                ),
+                short_noise_thres=(
+                    args.short_noise_thres
+                    if hasattr(args, "short_noise_thres")
+                    else 0.5
+                ),
+                speech_volume_normalize=(
+                    args.speech_volume_normalize if hasattr(args, "rir_scp") else None
+                ),
                 **args.preprocessor_conf,
             )
         else:
@@ -449,9 +453,9 @@ def build_model(cls, args: argparse.Namespace):
         # 5. Prompt Encoder
         promptencoder_class = promptencoder_choices.get_class(args.promptencoder)
         promptencoder = promptencoder_class(
-            input_size=args.promptencoder_conf['output_size'],
+            input_size=args.promptencoder_conf["output_size"],
             input_layer=None,
-            **args.promptencoder_conf
+            **args.promptencoder_conf,
         )
 
         # 6. CTC
diff --git a/espnet2/train/preprocessor.py b/espnet2/train/preprocessor.py
index 271d4c69901..e93095c7920 100644
--- a/espnet2/train/preprocessor.py
+++ b/espnet2/train/preprocessor.py
@@ -2502,7 +2502,9 @@ def _text_process(
 
                     elif name == self.text_ctc_name:
                         # Add <lang> and <task> to ASR Text as well
-                        text_ints = np.concatenate([data[self.text_name][:2], text_ints])
+                        text_ints = np.concatenate(
+                            [data[self.text_name][:2], text_ints]
+                        )
 
                     elif name == self.text_prev_name:
                         # Remove space before <na>

From 0bf54c19cfc6989cc0bb2a9303391be289fb8151 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Thu, 24 Oct 2024 19:15:36 -0500
Subject: [PATCH 04/15] fix ci

---
 espnet/nets/pytorch_backend/transformer/subsampling.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/espnet/nets/pytorch_backend/transformer/subsampling.py b/espnet/nets/pytorch_backend/transformer/subsampling.py
index d70e6c769e7..90701756c15 100644
--- a/espnet/nets/pytorch_backend/transformer/subsampling.py
+++ b/espnet/nets/pytorch_backend/transformer/subsampling.py
@@ -484,7 +484,8 @@ def forward(self, x, x_mask, prefix_embeds=None):
         Args:
             x (torch.Tensor): Input tensor (#batch, time, idim).
             x_mask (torch.Tensor): Input mask (#batch, 1, time).
-            prefix_embeds (torch.Tensor or None): Prefix token embeddings (#batch, prefix_len, odim).
+            prefix_embeds (torch.Tensor or None): Prefix token embeddings 
+                (#batch, prefix_len, odim).
 
         Returns:
             torch.Tensor: Subsampled tensor (#batch, time', odim),

From a7b6378e573c7cdd50eb1219b38d48da641aaaa4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 25 Oct 2024 00:16:45 +0000
Subject: [PATCH 05/15] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 espnet/nets/pytorch_backend/transformer/subsampling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espnet/nets/pytorch_backend/transformer/subsampling.py b/espnet/nets/pytorch_backend/transformer/subsampling.py
index 90701756c15..c825112db20 100644
--- a/espnet/nets/pytorch_backend/transformer/subsampling.py
+++ b/espnet/nets/pytorch_backend/transformer/subsampling.py
@@ -484,7 +484,7 @@ def forward(self, x, x_mask, prefix_embeds=None):
         Args:
             x (torch.Tensor): Input tensor (#batch, time, idim).
             x_mask (torch.Tensor): Input mask (#batch, 1, time).
-            prefix_embeds (torch.Tensor or None): Prefix token embeddings 
+            prefix_embeds (torch.Tensor or None): Prefix token embeddings
                 (#batch, prefix_len, odim).
 
         Returns:

From 8c3f3c24df323a5a69761f4027bfa9ef44b1d654 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 28 Oct 2024 15:44:50 -0500
Subject: [PATCH 06/15] add tests and fix formats

---
 .../asr/encoder/e_branchformer_ctc_encoder.py |   2 +-
 espnet2/bin/s2t_ctc_align.py                  |   1 -
 espnet2/bin/s2t_inference_ctc.py              |  94 +++------
 .../test_e_branchformer_ctc_encoder.py        | 137 +++++++++++++
 test/espnet2/bin/test_s2t_ctc_align.py        | 131 ++++++++++++
 test/espnet2/bin/test_s2t_inference_ctc.py    | 187 ++++++++++++++++++
 test/espnet2/s2t/test_espnet_ctc_model.py     |  58 ++++++
 test/espnet2/s2t/test_espnet_model.py         |   7 +-
 test/espnet2/tasks/test_s2t_ctc.py            |  36 ++++
 9 files changed, 583 insertions(+), 70 deletions(-)
 create mode 100644 test/espnet2/asr/encoder/test_e_branchformer_ctc_encoder.py
 create mode 100644 test/espnet2/bin/test_s2t_ctc_align.py
 create mode 100644 test/espnet2/bin/test_s2t_inference_ctc.py
 create mode 100644 test/espnet2/s2t/test_espnet_ctc_model.py
 create mode 100644 test/espnet2/tasks/test_s2t_ctc.py

diff --git a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
index ca2f4493e57..9ac6bab27b1 100644
--- a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
+++ b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
@@ -216,7 +216,7 @@ def __init__(
         dropout_rate: float = 0.1,
         positional_dropout_rate: float = 0.1,
         attention_dropout_rate: float = 0.0,
-        input_layer: Optional[str] = "conv2d",
+        input_layer: Optional[str] = "conv2d8",
         zero_triu: bool = False,
         padding_idx: int = -1,
         layer_drop_rate: float = 0.0,
diff --git a/espnet2/bin/s2t_ctc_align.py b/espnet2/bin/s2t_ctc_align.py
index 4e474ed84fd..dfcc2afacf9 100755
--- a/espnet2/bin/s2t_ctc_align.py
+++ b/espnet2/bin/s2t_ctc_align.py
@@ -173,7 +173,6 @@ class CTCSegmentation:
     warned_about_misconfiguration = False
     config = CtcSegmentationParameters()
 
-    @typechecked
     def __init__(
         self,
         s2t_train_config: Union[Path, str] = None,
diff --git a/espnet2/bin/s2t_inference_ctc.py b/espnet2/bin/s2t_inference_ctc.py
index b663d8c254a..08e750ddf28 100644
--- a/espnet2/bin/s2t_inference_ctc.py
+++ b/espnet2/bin/s2t_inference_ctc.py
@@ -44,28 +44,18 @@
 
 
 class Speech2Text:
-    """Speech2Text class
+    """Speech2Text class"""
 
-    Examples:
-        >>> import soundfile
-        >>> speech2text = Speech2Text("s2t_config.yml", "s2t.pth")
-        >>> audio, rate = soundfile.read("speech.wav")
-        >>> speech2text(audio)
-        [(text, token, token_int, hypothesis object), ...]
-
-    """
-
-    @typechecked
     def __init__(
         self,
-        s2t_train_config: Union[Path, str] = None,
-        s2t_model_file: Union[Path, str] = None,
-        lm_train_config: Union[Path, str] = None,
-        lm_file: Union[Path, str] = None,
+        s2t_train_config: Optional[Union[Path, str]] = None,
+        s2t_model_file: Optional[Union[Path, str]] = None,
+        lm_train_config: Optional[Union[Path, str]] = None,
+        lm_file: Optional[Union[Path, str]] = None,
         ngram_scorer: str = "full",
-        ngram_file: Union[Path, str] = None,
-        token_type: str = None,
-        bpemodel: str = None,
+        ngram_file: Optional[Union[Path, str]] = None,
+        token_type: Optional[str] = None,
+        bpemodel: Optional[str] = None,
         device: str = "cpu",
         maxlenratio: float = 0.0,
         minlenratio: float = 0.0,
@@ -421,10 +411,10 @@ class Speech2TextGreedySearch:
 
     def __init__(
         self,
-        s2t_train_config: Union[Path, str] = None,
-        s2t_model_file: Union[Path, str] = None,
-        token_type: str = None,
-        bpemodel: str = None,
+        s2t_train_config: Optional[Union[Path, str]] = None,
+        s2t_model_file: Optional[Union[Path, str]] = None,
+        token_type: Optional[str] = None,
+        bpemodel: Optional[str] = None,
         device: str = "cpu",
         batch_size: int = 1,
         dtype: str = "float32",
@@ -659,46 +649,25 @@ def decode_long_batched_buffered(
         lang_id = self.converter.token2id[lang_sym]
         task_id = self.converter.token2id[task_sym]
 
-        buffer_len_in_secs = self.preprocessor_conf["speech_length"]
+        buffer_len_in_secs = self.s2t_train_args.preprocessor_conf["speech_length"]
         chunk_len_in_secs = buffer_len_in_secs - 2 * context_len_in_secs
-
-        class AudioChunkIterator:
-            def __init__(self, samples, chunk_len_in_secs, sample_rate):
-                self._samples = samples
-                self._chunk_len = chunk_len_in_secs * sample_rate
-                self._start = 0
-                self.output = True
-
-            def __iter__(self):
-                return self
-
-            def __next__(self):
-                if not self.output:
-                    raise StopIteration
-                last = int(self._start + self._chunk_len)
-                if last <= len(self._samples):
-                    chunk = self._samples[self._start : last]
-                    self._start = last
-                else:
-                    chunk = np.zeros([int(self._chunk_len)], dtype="float32")
-                    samp_len = len(self._samples) - self._start
-                    chunk[0:samp_len] = self._samples[self._start : len(self._samples)]
-                    self.output = False
-
-                return chunk
-
         buffer_len = int(sample_rate * buffer_len_in_secs)
         chunk_len = int(sample_rate * chunk_len_in_secs)
-        sampbuffer = np.zeros([buffer_len], dtype=np.float32)
 
-        chunk_reader = AudioChunkIterator(speech, chunk_len_in_secs, sample_rate)
+        speech = np.pad(speech, (int(sample_rate * context_len_in_secs), int(sample_rate * context_len_in_secs)))
         buffer_list = []
-        for chunk in chunk_reader:
-            sampbuffer[:-chunk_len] = sampbuffer[chunk_len:]
-            sampbuffer[-chunk_len:] = chunk
-            buffer_list.append(np.array(sampbuffer))
+        for i in range(0, len(speech), chunk_len):
+            cur_buffer = speech[i:i+buffer_len]
+            if len(cur_buffer) < buffer_len:
+                buffer_list.append(
+                    np.pad(cur_buffer, (0, buffer_len - len(cur_buffer)))
+                )
+                break
+            else:
+                buffer_list.append(cur_buffer)
 
         speech = torch.tensor(np.array(buffer_list)).to(getattr(torch, self.dtype))
+        buffer_frames = int(frames_per_sec * buffer_len_in_secs)
         context_frames = int(frames_per_sec * context_len_in_secs)
 
         unmerged = []
@@ -708,16 +677,12 @@ def __next__(self):
                 [cur_speech.size(0)], dtype=torch.long, fill_value=cur_speech.size(1)
             )
 
-            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(
-                cur_speech.size(0), 1
-            )
+            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(cur_speech.size(0), 1)
             text_prev_lengths = text_prev.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=text_prev.size(1)
             )
 
-            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(
-                cur_speech.size(0), 1
-            )
+            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(cur_speech.size(0), 1)
             prefix_lengths = prefix.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=prefix.size(-1)
             )
@@ -742,10 +707,9 @@ def __next__(self):
                 enc, intermediate_outs = enc
 
             # enc: (B, T, D)
-            batched_token_int = self.s2t_model.ctc.argmax(enc)  # (B, T)
-            valid_token_int = batched_token_int[
-                :, context_frames:-context_frames
-            ].reshape(-1)
+            enc = enc[:, :buffer_frames]    # NOTE(yifan): IMPORTANT: it might be longer due to padding in conv
+            batched_token_int = self.s2t_model.ctc.argmax(enc)      # (B, T)
+            valid_token_int = batched_token_int[:, context_frames : -context_frames].reshape(-1)
             unmerged.append(valid_token_int)
 
         unmerged = torch.cat(unmerged)
diff --git a/test/espnet2/asr/encoder/test_e_branchformer_ctc_encoder.py b/test/espnet2/asr/encoder/test_e_branchformer_ctc_encoder.py
new file mode 100644
index 00000000000..ef23812fac2
--- /dev/null
+++ b/test/espnet2/asr/encoder/test_e_branchformer_ctc_encoder.py
@@ -0,0 +1,137 @@
+import pytest
+import torch
+
+from espnet2.asr.ctc import CTC
+from espnet2.asr.encoder.e_branchformer_ctc_encoder import EBranchformerCTCEncoder
+
+
+@pytest.mark.parametrize(
+    "input_layer",
+    [
+        "conv2d8",
+    ],
+)
+@pytest.mark.parametrize("use_linear_after_conv", [True, False])
+@pytest.mark.parametrize(
+    "rel_pos_type, pos_enc_layer_type, attention_layer_type",
+    [
+        ("legacy", "abs_pos", "selfattn"),
+        ("latest", "rel_pos", "rel_selfattn"),
+        ("legacy", "rel_pos", "rel_selfattn"),
+        ("legacy", "legacy_rel_pos", "legacy_rel_selfattn"),
+        ("legacy", "abs_pos", "fast_selfattn"),
+    ],
+)
+@pytest.mark.parametrize("max_pos_emb_len", [128, 5000])
+@pytest.mark.parametrize("use_ffn", [True, False])
+@pytest.mark.parametrize("macaron_ffn", [True, False])
+@pytest.mark.parametrize("linear_units", [1024, 2048])
+@pytest.mark.parametrize("merge_conv_kernel", [3, 31])
+@pytest.mark.parametrize("layer_drop_rate", [0.0, 0.1])
+@pytest.mark.parametrize(
+    "interctc_layer_idx, interctc_use_conditioning",
+    [
+        ([], False),
+        ([1], False),
+        ([1], True),
+    ],
+)
+def test_encoder_forward_backward(
+    input_layer,
+    use_linear_after_conv,
+    rel_pos_type,
+    pos_enc_layer_type,
+    attention_layer_type,
+    max_pos_emb_len,
+    use_ffn,
+    macaron_ffn,
+    linear_units,
+    merge_conv_kernel,
+    layer_drop_rate,
+    interctc_layer_idx,
+    interctc_use_conditioning,
+):
+    encoder = EBranchformerCTCEncoder(
+        20,
+        output_size=2,
+        attention_heads=2,
+        attention_layer_type=attention_layer_type,
+        pos_enc_layer_type=pos_enc_layer_type,
+        rel_pos_type=rel_pos_type,
+        cgmlp_linear_units=4,
+        cgmlp_conv_kernel=3,
+        use_linear_after_conv=use_linear_after_conv,
+        gate_activation="identity",
+        num_blocks=2,
+        input_layer=input_layer,
+        max_pos_emb_len=max_pos_emb_len,
+        use_ffn=use_ffn,
+        macaron_ffn=macaron_ffn,
+        linear_units=linear_units,
+        merge_conv_kernel=merge_conv_kernel,
+        layer_drop_rate=layer_drop_rate,
+        interctc_layer_idx=interctc_layer_idx,
+        interctc_use_conditioning=interctc_use_conditioning,
+    )
+    if input_layer == "embed":
+        x = torch.randint(0, 10, [2, 32])
+    else:
+        x = torch.randn(2, 32, 20, requires_grad=True)
+    x_lens = torch.LongTensor([32, 28])
+
+    if len(interctc_layer_idx) > 0:  # intermediate CTC
+        encoder.conditioning_layer = torch.nn.Linear(2, 2)
+        y, _, _ = encoder(x, x_lens, ctc=CTC(odim=2, encoder_output_size=2))
+        y, intermediate_outs = y
+    else:
+        y, _, _ = encoder(x, x_lens)
+
+    y.sum().backward()
+
+
+def test_encoder_invalid_layer_type():  # each unknown option must raise ValueError
+    with pytest.raises(ValueError):
+        EBranchformerCTCEncoder(20, input_layer="dummy")
+    with pytest.raises(ValueError):
+        EBranchformerCTCEncoder(20, rel_pos_type="dummy")
+    with pytest.raises(ValueError):
+        EBranchformerCTCEncoder(20, pos_enc_layer_type="dummy")
+    with pytest.raises(ValueError):  # valid abs_pos; only the attention type is bad
+        EBranchformerCTCEncoder(
+            20, pos_enc_layer_type="abs_pos", attention_layer_type="dummy"
+        )
+    with pytest.raises(ValueError):
+        EBranchformerCTCEncoder(20, positionwise_layer_type="dummy")
+
+
+def test_encoder_invalid_rel_pos_combination():
+    with pytest.raises(AssertionError):
+        EBranchformerCTCEncoder(
+            20,
+            rel_pos_type="latest",
+            pos_enc_layer_type="legacy_rel_pos",
+            attention_layer_type="legacy_rel_sselfattn",
+        )
+    with pytest.raises(AssertionError):
+        EBranchformerCTCEncoder(
+            20,
+            pos_enc_layer_type="rel_pos",
+            attention_layer_type="legacy_rel_sselfattn",
+        )
+    with pytest.raises(AssertionError):
+        EBranchformerCTCEncoder(
+            20,
+            pos_enc_layer_type="legacy_rel_pos",
+            attention_layer_type="rel_sselfattn",
+        )
+    with pytest.raises(AssertionError):
+        EBranchformerCTCEncoder(
+            20,
+            attention_layer_type="fast_selfattn",
+            pos_enc_layer_type="rel_pos",
+        )
+
+
+def test_encoder_output_size():
+    encoder = EBranchformerCTCEncoder(20, output_size=256)
+    assert encoder.output_size() == 256
diff --git a/test/espnet2/bin/test_s2t_ctc_align.py b/test/espnet2/bin/test_s2t_ctc_align.py
new file mode 100644
index 00000000000..eac36f5e450
--- /dev/null
+++ b/test/espnet2/bin/test_s2t_ctc_align.py
@@ -0,0 +1,131 @@
+import string
+from argparse import ArgumentParser
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+from espnet2.bin.s2t_ctc_align import CTCSegmentation, CTCSegmentationTask, get_parser, main
+from espnet2.tasks.s2t_ctc import S2TTask
+
+
+def test_get_parser():
+    """Check the parser."""
+    assert isinstance(get_parser(), ArgumentParser)
+
+
+def test_main():
+    """Run main(·) once."""
+    with pytest.raises(SystemExit):
+        main()
+
+
+@pytest.fixture()
+def token_list(tmp_path: Path):
+    with (tmp_path / "tokens.txt").open("w") as f:
+        tokens = [
+            "<blank>",
+            "<unk>",
+            "<na>",
+            "<nolang>",
+            "<eng>",
+            "<zho>",
+            "<asr>",
+            "<st_eng>",
+            *list(string.ascii_letters),
+            "<sos>",
+            "<eos>",
+            "<sop>",
+        ]
+        for tok in tokens:
+            f.write(f"{tok}\n")
+    return tmp_path / "tokens.txt"
+
+
+@pytest.fixture()
+def s2t_config_file(tmp_path: Path, token_list):
+    # Write default configuration file
+    S2TTask.main(
+        cmd=[
+            "--dry_run",
+            "true",
+            "--output_dir",
+            str(tmp_path / "s2t"),
+            "--token_list",
+            str(token_list),
+            "--token_type",
+            "char",
+            "--promptencoder_conf",
+            "output_size=4",
+            "--preprocessor_conf",
+            "fs=16000",
+            "--preprocessor_conf",
+            "speech_length=4",
+        ]
+    )
+    return tmp_path / "s2t" / "config.yaml"
+
+
+@pytest.mark.execution_timeout(5)
+def test_CTCSegmentation(s2t_config_file):
+    """Test CTC segmentation.
+
+    Note that due to the random vector that is given to the CTC segmentation function,
+    there is a small chance that this test might randomly fail. If this ever happens,
+    use the test file test_utils/ctc_align_test.wav instead, or a fixed test vector.
+    """
+
+    num_samples = 200000
+    fs = 16000
+    # text includes:
+    #   one blank line
+    #   kaldi-style utterance names
+    #   one char not included in char list
+    text = (
+        "\n"
+        "utt_a HOTELS\n"
+        "utt_b HOLIDAY'S STRATEGY\n"
+        "utt_c ASSETS\n"
+        "utt_d PROPERTY MANAGEMENT\n"
+    )
+    # speech either from the test audio file or random
+    speech = np.random.randn(num_samples)
+    aligner = CTCSegmentation(
+        s2t_train_config=s2t_config_file,
+        fs=fs,
+        context_len_in_secs=1,
+        kaldi_style_text=True,
+        min_window_size=10,
+    )
+    segments = aligner(speech, text, fs=fs)
+    # check segments
+    assert isinstance(segments, CTCSegmentationTask)
+    kaldi_text = str(segments)
+    first_line = kaldi_text.splitlines()[0]
+    assert "utt_a" == first_line.split(" ")[0]
+    start, end, score = segments.segments[0]
+    assert start > 0.0
+    assert start < (num_samples / fs)
+    assert end >= start
+    assert score < 0.0
+    # check options and align with "classic" text converter
+    option_dict = {
+        "fs": 16000,
+        "time_stamps": "fixed",
+        "samples_to_frames_ratio": 512,
+        "min_window_size": 100,
+        "max_window_size": 20000,
+        "set_blank": 0,
+        "scoring_length": 10,
+        "replace_spaces_with_blanks": True,
+        "gratis_blank": True,
+        "kaldi_style_text": False,
+        "text_converter": "classic",
+    }
+    aligner.set_config(**option_dict)
+    assert aligner.warned_about_misconfiguration
+    text = ["HOTELS", "HOLIDAY'S STRATEGY", "ASSETS", "PROPERTY MANAGEMENT"]
+    segments = aligner(speech, text, name="foo")
+    segments_str = str(segments)
+    first_line = segments_str.splitlines()[0]
+    assert "foo_0000" == first_line.split(" ")[0]
diff --git a/test/espnet2/bin/test_s2t_inference_ctc.py b/test/espnet2/bin/test_s2t_inference_ctc.py
new file mode 100644
index 00000000000..c07c9708020
--- /dev/null
+++ b/test/espnet2/bin/test_s2t_inference_ctc.py
@@ -0,0 +1,187 @@
+from argparse import ArgumentParser
+from pathlib import Path
+
+import numpy as np
+import pytest
+
+from espnet2.bin.s2t_inference_ctc import Speech2Text, Speech2TextGreedySearch, get_parser, main
+from espnet2.tasks.s2t_ctc import S2TTask
+from espnet.nets.beam_search import Hypothesis
+
+
+def test_get_parser():
+    assert isinstance(get_parser(), ArgumentParser)
+
+
+def test_main():
+    with pytest.raises(SystemExit):
+        main()
+
+
+@pytest.fixture()
+def token_list(tmp_path: Path):
+    with (tmp_path / "tokens.txt").open("w") as f:
+        tokens = [
+            "<blank>",
+            "<unk>",
+            "<na>",
+            "<nolang>",
+            "<eng>",
+            "<zho>",
+            "<asr>",
+            "<st_eng>",
+            "a",
+            "<sos>",
+            "<eos>",
+            "<sop>",
+        ]
+        for tok in tokens:
+            f.write(f"{tok}\n")
+    return tmp_path / "tokens.txt"
+
+
+@pytest.fixture()
+def s2t_config_file(tmp_path: Path, token_list):
+    # Write default configuration file
+    S2TTask.main(
+        cmd=[
+            "--dry_run",
+            "true",
+            "--output_dir",
+            str(tmp_path / "s2t"),
+            "--token_list",
+            str(token_list),
+            "--token_type",
+            "char",
+            "--promptencoder_conf",
+            "output_size=4",
+            "--preprocessor_conf",
+            "fs=2000",
+            "--preprocessor_conf",
+            "speech_length=3",
+        ]
+    )
+    return tmp_path / "s2t" / "config.yaml"
+
+
+@pytest.mark.execution_timeout(5)
+def test_Speech2Text(s2t_config_file):
+    speech2text = Speech2Text(
+        s2t_train_config=s2t_config_file,
+        beam_size=1,
+        maxlenratio=-5,
+    )
+    speech = np.random.randn(3000)
+    results = speech2text(speech)
+    for text, token, token_int, text_nospecial, hyp in results:
+        assert isinstance(text, str)
+        assert isinstance(token[0], str)
+        assert isinstance(token_int[0], int)
+        assert isinstance(text_nospecial, str)
+        assert isinstance(hyp, Hypothesis)
+
+
+@pytest.mark.execution_timeout(5)
+def test_Speech2Text_overwrite_args(s2t_config_file):
+    speech2text = Speech2Text(
+        s2t_train_config=s2t_config_file,
+        beam_size=1,
+        maxlenratio=-5,
+    )
+    speech = np.random.randn(3000)
+    results = speech2text(
+        speech,
+        text_prev="<na>",
+        lang_sym="<zho>",
+        task_sym="<st_eng>",
+    )
+    for text, token, token_int, text_nospecial, hyp in results:
+        assert isinstance(text, str)
+        assert isinstance(token[0], str)
+        assert isinstance(token_int[0], int)
+        assert isinstance(text_nospecial, str)
+        assert isinstance(hyp, Hypothesis)
+
+
+@pytest.mark.execution_timeout(5)
+def test_Speech2Text_quantized(s2t_config_file):
+    speech2text = Speech2Text(
+        s2t_train_config=s2t_config_file,
+        beam_size=1,
+        maxlenratio=-5,
+        quantize_s2t_model=True,
+    )
+    speech = np.random.randn(3000)
+    results = speech2text(speech)
+    for text, token, token_int, text_nospecial, hyp in results:
+        assert isinstance(text, str)
+        assert isinstance(token[0], str)
+        assert isinstance(token_int[0], int)
+        assert isinstance(text_nospecial, str)
+        assert isinstance(hyp, Hypothesis)
+
+
+@pytest.mark.execution_timeout(5)
+def test_Speech2TextGreedy(s2t_config_file):
+    speech2text = Speech2TextGreedySearch(
+        s2t_train_config=s2t_config_file,
+        maxlenratio=-5,
+    )
+    speech = np.random.randn(3000)
+    results = speech2text(speech)
+    for text, token, token_int, text_nospecial, _ in results:
+        assert isinstance(text, str)
+        assert isinstance(token[0], str)
+        assert isinstance(token_int[0], int)
+        assert isinstance(text_nospecial, str)
+
+
+@pytest.mark.execution_timeout(5)
+def test_Speech2TextGreedy_overwrite_args(s2t_config_file):
+    speech2text = Speech2TextGreedySearch(
+        s2t_train_config=s2t_config_file,
+        maxlenratio=-5,
+    )
+    speech = np.random.randn(3000)
+    results = speech2text(
+        speech,
+        text_prev="<na>",
+        lang_sym="<zho>",
+        task_sym="<st_eng>",
+    )
+    for text, token, token_int, text_nospecial, _ in results:
+        assert isinstance(text, str)
+        assert isinstance(token[0], str)
+        assert isinstance(token_int[0], int)
+        assert isinstance(text_nospecial, str)
+
+
+@pytest.mark.execution_timeout(5)
+def test_Speech2TextGreedy_quantized(s2t_config_file):
+    speech2text = Speech2TextGreedySearch(
+        s2t_train_config=s2t_config_file,
+        maxlenratio=-5,
+        quantize_s2t_model=True,
+    )
+    speech = np.random.randn(3000)
+    results = speech2text(speech)
+    for text, token, token_int, text_nospecial, _ in results:
+        assert isinstance(text, str)
+        assert isinstance(token[0], str)
+        assert isinstance(token_int[0], int)
+        assert isinstance(text_nospecial, str)
+
+
+@pytest.mark.execution_timeout(5)
+def test_Speech2TextGreedy_longform(s2t_config_file):
+    speech2text = Speech2TextGreedySearch(
+        s2t_train_config=s2t_config_file,
+        maxlenratio=-5,
+    )
+    speech = np.random.randn(3000)
+    result = speech2text.decode_long_batched_buffered(
+        speech,
+        sample_rate=2000,
+        context_len_in_secs=1,
+    )
+    assert isinstance(result, str)
diff --git a/test/espnet2/s2t/test_espnet_ctc_model.py b/test/espnet2/s2t/test_espnet_ctc_model.py
new file mode 100644
index 00000000000..80cf8e2401c
--- /dev/null
+++ b/test/espnet2/s2t/test_espnet_ctc_model.py
@@ -0,0 +1,58 @@
+import pytest
+import torch
+
+from espnet2.asr.ctc import CTC
+from espnet2.asr.encoder.transformer_encoder import TransformerEncoder
+from espnet2.asr.encoder.e_branchformer_ctc_encoder import EBranchformerCTCEncoder
+from espnet2.s2t.espnet_ctc_model import ESPnetS2TCTCModel
+
+
+@pytest.mark.execution_timeout(5)
+@pytest.mark.parametrize("encoder_arch", [EBranchformerCTCEncoder])
+@pytest.mark.parametrize("prompt_encoder_arch", [TransformerEncoder])
+def test_espnet_model(encoder_arch, prompt_encoder_arch):
+    token_list = [
+        "<blank>",
+        "<unk>",
+        "<na>",
+        "<nolang>",
+        "<eng>",
+        "<asr>",
+        "<st_eng>",
+        "a",
+        "<sos>",
+        "<eos>",
+        "<sop>",
+    ]
+    vocab_size = len(token_list)
+    enc_out = 1
+    encoder = encoder_arch(15, output_size=enc_out, attention_heads=1, attention_layer_type="selfattn", pos_enc_layer_type="abs_pos", linear_units=2, cgmlp_linear_units=2, num_blocks=2, cgmlp_conv_kernel=3, interctc_layer_idx=[1], interctc_use_conditioning=True, use_cross_attention=[False, True], use_flash_attn=False, dropout_rate=0, positional_dropout_rate=0, attention_dropout_rate=0,)
+    prompt_encoder = prompt_encoder_arch(enc_out, attention_heads=1, output_size=enc_out, linear_units=2, num_blocks=1, input_layer=None, use_flash_attn=False, dropout_rate=0, positional_dropout_rate=0, attention_dropout_rate=0,)
+    ctc = CTC(odim=vocab_size, encoder_output_size=enc_out)
+
+    model = ESPnetS2TCTCModel(
+        vocab_size,
+        token_list=token_list,
+        frontend=None,
+        specaug=None,
+        normalize=None,
+        encoder=encoder,
+        prompt_encoder=prompt_encoder,
+        ctc=ctc,
+        interctc_weight=0.5,
+        ctc_asr_only=[True, False],
+    )
+
+    inputs = dict(
+        speech=torch.randn(2, 16, 15, requires_grad=True),
+        speech_lengths=torch.tensor([16, 16], dtype=torch.long),
+        text=torch.randint(2, 4, [2, 4], dtype=torch.long),
+        text_lengths=torch.tensor([4, 3], dtype=torch.long),
+        text_prev=torch.tensor([[2], [7]], dtype=torch.long),
+        text_prev_lengths=torch.tensor([1, 1], dtype=torch.long),
+        text_ctc=torch.randint(2, 4, [2, 4], dtype=torch.long),
+        text_ctc_lengths=torch.tensor([4, 3], dtype=torch.long),
+        prefix=torch.tensor([[4, 5], [4, 6]], dtype=torch.long),
+        prefix_lengths=torch.tensor([2, 2], dtype=torch.long),
+    )
+    loss, *_ = model(**inputs)
diff --git a/test/espnet2/s2t/test_espnet_model.py b/test/espnet2/s2t/test_espnet_model.py
index 12d5e058717..8d1873aaa4c 100644
--- a/test/espnet2/s2t/test_espnet_model.py
+++ b/test/espnet2/s2t/test_espnet_model.py
@@ -17,7 +17,8 @@ def test_espnet_model(encoder_arch, decoder_arch):
         "<nospeech>",
         "<en>",
         "<asr>",
-        "<st_en>" "<notimestamps>",
+        "<st_en>",
+        "<notimestamps>",
         "<0.00>",
         "<30.00>",
         "a",
@@ -28,8 +29,8 @@ def test_espnet_model(encoder_arch, decoder_arch):
     ]
     vocab_size = len(token_list)
     enc_out = 4
-    encoder = encoder_arch(20, output_size=enc_out, linear_units=4, num_blocks=2)
-    decoder = decoder_arch(vocab_size, enc_out, linear_units=4, num_blocks=2)
+    encoder = encoder_arch(20, output_size=enc_out, linear_units=4, num_blocks=2, use_flash_attn=False)
+    decoder = decoder_arch(vocab_size, enc_out, linear_units=4, num_blocks=2, use_flash_attn=False)
     ctc = CTC(odim=vocab_size, encoder_output_size=enc_out)
 
     model = ESPnetS2TModel(
diff --git a/test/espnet2/tasks/test_s2t_ctc.py b/test/espnet2/tasks/test_s2t_ctc.py
new file mode 100644
index 00000000000..a9cce192252
--- /dev/null
+++ b/test/espnet2/tasks/test_s2t_ctc.py
@@ -0,0 +1,36 @@
+import pytest
+
+from espnet2.tasks.s2t_ctc import S2TTask
+
+
+def test_add_arguments():
+    S2TTask.get_parser()
+
+
+def test_add_arguments_help():
+    parser = S2TTask.get_parser()
+    with pytest.raises(SystemExit):
+        parser.parse_args(["--help"])
+
+
+def test_main_help():
+    with pytest.raises(SystemExit):
+        S2TTask.main(cmd=["--help"])
+
+
+def test_main_print_config():
+    with pytest.raises(SystemExit):
+        S2TTask.main(cmd=["--print_config"])
+
+
+def test_main_with_no_args():
+    with pytest.raises(SystemExit):
+        S2TTask.main(cmd=[])
+
+
+def test_print_config_and_load_it(tmp_path):
+    config_file = tmp_path / "config.yaml"
+    with config_file.open("w") as f:
+        S2TTask.print_config(f)
+    parser = S2TTask.get_parser()
+    parser.parse_args(["--config", str(config_file)])

From 75b3b02732dadc4d68ef21b155d78211e105531a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 28 Oct 2024 20:46:07 +0000
Subject: [PATCH 07/15] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 espnet2/bin/s2t_inference_ctc.py           | 28 +++++++++++++-----
 test/espnet2/bin/test_s2t_ctc_align.py     |  7 ++++-
 test/espnet2/bin/test_s2t_inference_ctc.py |  7 ++++-
 test/espnet2/s2t/test_espnet_ctc_model.py  | 34 ++++++++++++++++++++--
 test/espnet2/s2t/test_espnet_model.py      |  8 +++--
 5 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/espnet2/bin/s2t_inference_ctc.py b/espnet2/bin/s2t_inference_ctc.py
index 08e750ddf28..aeac35bf58f 100644
--- a/espnet2/bin/s2t_inference_ctc.py
+++ b/espnet2/bin/s2t_inference_ctc.py
@@ -654,10 +654,16 @@ def decode_long_batched_buffered(
         buffer_len = int(sample_rate * buffer_len_in_secs)
         chunk_len = int(sample_rate * chunk_len_in_secs)
 
-        speech = np.pad(speech, (int(sample_rate * context_len_in_secs), int(sample_rate * context_len_in_secs)))
+        speech = np.pad(
+            speech,
+            (
+                int(sample_rate * context_len_in_secs),
+                int(sample_rate * context_len_in_secs),
+            ),
+        )
         buffer_list = []
         for i in range(0, len(speech), chunk_len):
-            cur_buffer = speech[i:i+buffer_len]
+            cur_buffer = speech[i : i + buffer_len]
             if len(cur_buffer) < buffer_len:
                 buffer_list.append(
                     np.pad(cur_buffer, (0, buffer_len - len(cur_buffer)))
@@ -677,12 +683,16 @@ def decode_long_batched_buffered(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=cur_speech.size(1)
             )
 
-            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            text_prev = torch.tensor([self.s2t_model.na], dtype=torch.long).repeat(
+                cur_speech.size(0), 1
+            )
             text_prev_lengths = text_prev.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=text_prev.size(1)
             )
 
-            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(cur_speech.size(0), 1)
+            prefix = torch.tensor([lang_id, task_id], dtype=torch.long).repeat(
+                cur_speech.size(0), 1
+            )
             prefix_lengths = prefix.new_full(
                 [cur_speech.size(0)], dtype=torch.long, fill_value=prefix.size(-1)
             )
@@ -707,9 +717,13 @@ def decode_long_batched_buffered(
                 enc, intermediate_outs = enc
 
             # enc: (B, T, D)
-            enc = enc[:, :buffer_frames]    # NOTE(yifan): IMPORTANT: it might be longer due to padding in conv
-            batched_token_int = self.s2t_model.ctc.argmax(enc)      # (B, T)
-            valid_token_int = batched_token_int[:, context_frames : -context_frames].reshape(-1)
+            enc = enc[
+                :, :buffer_frames
+            ]  # NOTE(yifan): IMPORTANT: it might be longer due to padding in conv
+            batched_token_int = self.s2t_model.ctc.argmax(enc)  # (B, T)
+            valid_token_int = batched_token_int[
+                :, context_frames:-context_frames
+            ].reshape(-1)
             unmerged.append(valid_token_int)
 
         unmerged = torch.cat(unmerged)
diff --git a/test/espnet2/bin/test_s2t_ctc_align.py b/test/espnet2/bin/test_s2t_ctc_align.py
index eac36f5e450..242954acd69 100644
--- a/test/espnet2/bin/test_s2t_ctc_align.py
+++ b/test/espnet2/bin/test_s2t_ctc_align.py
@@ -5,7 +5,12 @@
 import numpy as np
 import pytest
 
-from espnet2.bin.s2t_ctc_align import CTCSegmentation, CTCSegmentationTask, get_parser, main
+from espnet2.bin.s2t_ctc_align import (
+    CTCSegmentation,
+    CTCSegmentationTask,
+    get_parser,
+    main,
+)
 from espnet2.tasks.s2t_ctc import S2TTask
 
 
diff --git a/test/espnet2/bin/test_s2t_inference_ctc.py b/test/espnet2/bin/test_s2t_inference_ctc.py
index c07c9708020..df2be5b06ba 100644
--- a/test/espnet2/bin/test_s2t_inference_ctc.py
+++ b/test/espnet2/bin/test_s2t_inference_ctc.py
@@ -4,7 +4,12 @@
 import numpy as np
 import pytest
 
-from espnet2.bin.s2t_inference_ctc import Speech2Text, Speech2TextGreedySearch, get_parser, main
+from espnet2.bin.s2t_inference_ctc import (
+    Speech2Text,
+    Speech2TextGreedySearch,
+    get_parser,
+    main,
+)
 from espnet2.tasks.s2t_ctc import S2TTask
 from espnet.nets.beam_search import Hypothesis
 
diff --git a/test/espnet2/s2t/test_espnet_ctc_model.py b/test/espnet2/s2t/test_espnet_ctc_model.py
index 80cf8e2401c..bb7e1ed3527 100644
--- a/test/espnet2/s2t/test_espnet_ctc_model.py
+++ b/test/espnet2/s2t/test_espnet_ctc_model.py
@@ -2,8 +2,8 @@
 import torch
 
 from espnet2.asr.ctc import CTC
-from espnet2.asr.encoder.transformer_encoder import TransformerEncoder
 from espnet2.asr.encoder.e_branchformer_ctc_encoder import EBranchformerCTCEncoder
+from espnet2.asr.encoder.transformer_encoder import TransformerEncoder
 from espnet2.s2t.espnet_ctc_model import ESPnetS2TCTCModel
 
 
@@ -26,8 +26,36 @@ def test_espnet_model(encoder_arch, prompt_encoder_arch):
     ]
     vocab_size = len(token_list)
     enc_out = 1
-    encoder = encoder_arch(15, output_size=enc_out, attention_heads=1, attention_layer_type="selfattn", pos_enc_layer_type="abs_pos", linear_units=2, cgmlp_linear_units=2, num_blocks=2, cgmlp_conv_kernel=3, interctc_layer_idx=[1], interctc_use_conditioning=True, use_cross_attention=[False, True], use_flash_attn=False, dropout_rate=0, positional_dropout_rate=0, attention_dropout_rate=0,)
-    prompt_encoder = prompt_encoder_arch(enc_out, attention_heads=1, output_size=enc_out, linear_units=2, num_blocks=1, input_layer=None, use_flash_attn=False, dropout_rate=0, positional_dropout_rate=0, attention_dropout_rate=0,)
+    encoder = encoder_arch(
+        15,
+        output_size=enc_out,
+        attention_heads=1,
+        attention_layer_type="selfattn",
+        pos_enc_layer_type="abs_pos",
+        linear_units=2,
+        cgmlp_linear_units=2,
+        num_blocks=2,
+        cgmlp_conv_kernel=3,
+        interctc_layer_idx=[1],
+        interctc_use_conditioning=True,
+        use_cross_attention=[False, True],
+        use_flash_attn=False,
+        dropout_rate=0,
+        positional_dropout_rate=0,
+        attention_dropout_rate=0,
+    )
+    prompt_encoder = prompt_encoder_arch(
+        enc_out,
+        attention_heads=1,
+        output_size=enc_out,
+        linear_units=2,
+        num_blocks=1,
+        input_layer=None,
+        use_flash_attn=False,
+        dropout_rate=0,
+        positional_dropout_rate=0,
+        attention_dropout_rate=0,
+    )
     ctc = CTC(odim=vocab_size, encoder_output_size=enc_out)
 
     model = ESPnetS2TCTCModel(
diff --git a/test/espnet2/s2t/test_espnet_model.py b/test/espnet2/s2t/test_espnet_model.py
index 8d1873aaa4c..091b1046d3c 100644
--- a/test/espnet2/s2t/test_espnet_model.py
+++ b/test/espnet2/s2t/test_espnet_model.py
@@ -29,8 +29,12 @@ def test_espnet_model(encoder_arch, decoder_arch):
     ]
     vocab_size = len(token_list)
     enc_out = 4
-    encoder = encoder_arch(20, output_size=enc_out, linear_units=4, num_blocks=2, use_flash_attn=False)
-    decoder = decoder_arch(vocab_size, enc_out, linear_units=4, num_blocks=2, use_flash_attn=False)
+    encoder = encoder_arch(
+        20, output_size=enc_out, linear_units=4, num_blocks=2, use_flash_attn=False
+    )
+    decoder = decoder_arch(
+        vocab_size, enc_out, linear_units=4, num_blocks=2, use_flash_attn=False
+    )
     ctc = CTC(odim=vocab_size, encoder_output_size=enc_out)
 
     model = ESPnetS2TModel(

From 17b384764da386a8b9f2a96004ea2915040c2c4f Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 28 Oct 2024 16:19:31 -0500
Subject: [PATCH 08/15] update config

---
 .../conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml  | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml b/egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml
index adc8b639399..dad3c0a421d 100644
--- a/egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml
+++ b/egs2/owsm_ctc_v3.1/s2t1/conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml
@@ -1,3 +1,5 @@
+# 64 NVIDIA A100 GPUs (40GB)
+# Total training time: 300 hours
 preprocessor: s2t_ctc
 preprocessor_conf:
     na_symbol: "<na>"
@@ -87,7 +89,7 @@ batch_type: unsorted
 batch_size: 256
 accum_grad: 1
 num_iters_per_epoch: 15000
-max_epoch: 55
+max_epoch: 45
 patience: none
 init: none
 best_model_criterion:

From d7aac19ba839659412296ddff6e0f6fd0d59c30f Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 28 Oct 2024 16:41:42 -0500
Subject: [PATCH 09/15] update data format

---
 egs2/owsm_ctc_v3.1/s2t1/README.md | 53 +++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/egs2/owsm_ctc_v3.1/s2t1/README.md b/egs2/owsm_ctc_v3.1/s2t1/README.md
index 7c6d7f980a4..d3dc080d11f 100644
--- a/egs2/owsm_ctc_v3.1/s2t1/README.md
+++ b/egs2/owsm_ctc_v3.1/s2t1/README.md
@@ -7,6 +7,59 @@ This version is trained on 180k hours of public audio data for multilingual spee
 
 The training data follows the same format as the encoder-decoder OWSM v3.1, except that timestamps are removed from the `text` file. Please first follow the `egs2/owsm_v3.1/s2t1` recipe to prepare OWSM data, and then convert `text` into the new format by running `python local/convert_owsm_data.py` (the path to the BPE tokenizer needs to be modified to your path).
 
+### OWSM-CTC Data Format
+
+The prepared data directory contains the following files:
+
+```
+dump/raw/train
+├── feats_type
+├── spk2utt
+├── text
+├── text.ctc
+├── text.prev
+├── utt2spk
+├── wav.scp
+```
+
+`feats_type` has a single line of text, which should be automatically generated in the data preparation stage:
+```
+raw
+```
+
+`spk2utt` and `utt2spk` have the same meaning as in standard Kaldi recipes (see the `asr1` recipes for an example). Typically, the speaker information is not utilized, so each utterance is assigned a unique speaker ID that is simply its utterance ID.
+
+`wav.scp` also follows the standard Kaldi format.
+
+`text` contains the multitask reference (ASR or ST) with language and task tokens but without timestamps:
+
+```
+AIDATATANG_200ZH_T0055G0013S0001_000000000_000003561_zho_asr <zho><asr> 今天什么日子
+...
+GigaST_YOU0000009624_002208970_002218840_en_st_zh <eng><st_zho> 大会结束后,我们要求有兴趣进一步参与我们项目或进一步参与气候教育的学生站出来,
+...
+MLS_en_sikhreligion6_22_macauliffe_64kb_003555300_003571720_en_asr <eng><asr> it farid considered that faqiri or holiness consisted in four things namely to be blind to the faults of muhammadans to be deaf to slander to be dumb when evil speaking is suggested and to be lame when there is a desire to visit evil places
+...
+```
+
+`text.ctc` contains the pure ASR reference:
+
+```
+AIDATATANG_200ZH_T0055G0013S0001_000000000_000003561_zho_asr 今天什么日子
+...
+CoVoST2_147d94ad8405722d5930a859295bfac7b925ccd40c587334d34f3ebd2668a70242240866e93907398f10b7f2265a4ddb82b5355eb21fe37993d04a69900df388-common_voice_en_19741894_000000000_000006270_en_st_ca He appointed military officers to most leading government positions.
+...
+```
+
+`text.prev` contains the previous sentence that will be used as an additional prompt. If a sample does not have a prompt, then `<na>` is used.
+
+```
+AIDATATANG_200ZH_T0055G0013S0001_000000000_000003561_zho_asr <na>
+...
+GigaST_YOU0000009624_002208970_002218840_en_st_zh 与员工和同事一起，这将有助于为事物创造空间，帮助为我们创造空间，一些掩护，尝试新事物，
+...
+```
+
 ## Pre-trained Model
 
 The pre-trained model is available at: https://huggingface.co/pyf98/owsm_ctc_v3.1_1B

From 12767db1d4d8831dc40e760ffc5c084a16fc6326 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 28 Oct 2024 16:53:13 -0500
Subject: [PATCH 10/15] update readme

---
 egs2/owsm_ctc_v3.1/s2t1/README.md | 99 ++++++++++++++++++++++++++++++-
 1 file changed, 98 insertions(+), 1 deletion(-)

diff --git a/egs2/owsm_ctc_v3.1/s2t1/README.md b/egs2/owsm_ctc_v3.1/s2t1/README.md
index d3dc080d11f..4911861052a 100644
--- a/egs2/owsm_ctc_v3.1/s2t1/README.md
+++ b/egs2/owsm_ctc_v3.1/s2t1/README.md
@@ -64,4 +64,101 @@ GigaST_YOU0000009624_002208970_002218840_en_st_zh 与员工和同事一起，这
 
 The pre-trained model is available at: https://huggingface.co/pyf98/owsm_ctc_v3.1_1B
 
-The model page also contains example usage.
+The model is trained with this config: [conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml](conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml)
+
+
+### Example script for short-form ASR/ST
+
+```python
+import soundfile as sf
+import numpy as np
+import librosa
+import kaldiio
+from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
+
+
+s2t = Speech2TextGreedySearch.from_pretrained(
+    "pyf98/owsm_ctc_v3.1_1B",
+    device="cuda",
+    generate_interctc_outputs=False,
+    lang_sym='<eng>',
+    task_sym='<asr>',
+)
+
+speech, rate = sf.read(
+    "xxx.wav"
+)
+speech = librosa.util.fix_length(speech, size=(16000 * 30))
+
+res = s2t(speech)[0]
+print(res)
+```
+
+### Example script for long-form ASR/ST
+
+```python
+import soundfile as sf
+import torch
+from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
+
+
+context_len_in_secs = 4   # left and right context when doing buffered inference
+batch_size = 32   # depends on the GPU memory
+s2t = Speech2TextGreedySearch.from_pretrained(
+    "pyf98/owsm_ctc_v3.1_1B",
+    device='cuda' if torch.cuda.is_available() else 'cpu',
+    generate_interctc_outputs=False,
+    lang_sym='<eng>',
+    task_sym='<asr>',
+)
+
+speech, rate = sf.read(
+    "xxx.wav"
+)
+
+text = s2t.decode_long_batched_buffered(
+    speech,
+    batch_size=batch_size,
+    context_len_in_secs=context_len_in_secs,
+    frames_per_sec=12.5,        # 80ms shift, model-dependent, don't change
+)
+print(text)
+```
+
+### Example for CTC forced alignment using `ctc-segmentation`
+
+It can be efficiently applied to audio of an arbitrary length.
+For model downloading, please refer to https://github.com/espnet/espnet?tab=readme-ov-file#ctc-segmentation-demo
+
+```python
+import soundfile as sf
+from espnet2.bin.s2t_ctc_align import CTCSegmentation
+
+
+## Please download model first
+aligner = CTCSegmentation(
+    s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
+    fs=16000,
+    ngpu=1,
+    batch_size=16,    # batched parallel decoding; reduce it if your GPU memory is smaller
+    kaldi_style_text=True,
+    time_stamps="fixed",
+    samples_to_frames_ratio=1280,   # 80ms time shift; don't change as it depends on the pre-trained model
+    lang_sym="<eng>",
+    task_sym="<asr>",
+    context_len_in_secs=2,  # left and right context in buffered decoding
+    frames_per_sec=12.5,    # 80ms time shift; don't change as it depends on the pre-trained model
+)
+
+speech, rate = sf.read(
+    "example.wav"
+)
+print(f"speech duration: {len(speech) / rate : .2f} seconds")
+text = '''
+utt1 hello there
+utt2 welcome to this repo
+'''
+
+segments = aligner(speech, text)
+print(segments)
+```

From aa7e9cbbcc4467ba42eac5ef41402b001cd5d783 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 11 Nov 2024 12:22:57 -0600
Subject: [PATCH 11/15] add comments to some modules

---
 egs2/owsm_ctc_v3.1/s2t1/README.md                | 16 +++++++---------
 egs2/owsm_ctc_v3.1/s2t1/run.sh                   |  1 +
 .../asr/encoder/e_branchformer_ctc_encoder.py    | 13 +++++++++++--
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/egs2/owsm_ctc_v3.1/s2t1/README.md b/egs2/owsm_ctc_v3.1/s2t1/README.md
index 4911861052a..b83714a4477 100644
--- a/egs2/owsm_ctc_v3.1/s2t1/README.md
+++ b/egs2/owsm_ctc_v3.1/s2t1/README.md
@@ -62,7 +62,9 @@ GigaST_YOU0000009624_002208970_002218840_en_st_zh 与员工和同事一起，这
 
 ## Pre-trained Model
 
-The pre-trained model is available at: https://huggingface.co/pyf98/owsm_ctc_v3.1_1B
+**IMPORTANT: Our model is trained on 16kHz audio with a fixed duration of 30s. When using the pre-trained model, please ensure the input speech is sampled at 16kHz and is padded or truncated to 30s.**
+
+The pre-trained model is available at: https://huggingface.co/espnet/owsm_ctc_v3.1_1B
 
 The model is trained with this config: [conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml](conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml)
 
@@ -70,24 +72,20 @@ The model is trained with this config: [conf/train_s2t_multitask-ctc_ebf27_conv2
 ### Example script for short-form ASR/ST
 
 ```python
-import soundfile as sf
-import numpy as np
 import librosa
-import kaldiio
 from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
 
 
 s2t = Speech2TextGreedySearch.from_pretrained(
-    "pyf98/owsm_ctc_v3.1_1B",
+    "espnet/owsm_ctc_v3.1_1B",
     device="cuda",
     generate_interctc_outputs=False,
     lang_sym='<eng>',
     task_sym='<asr>',
 )
 
-speech, rate = sf.read(
-    "xxx.wav"
-)
+# NOTE: OWSM-CTC is trained on 16kHz audio with a fixed 30s duration. `librosa.load(..., sr=16000)` resamples the input to 16kHz, and `fix_length` below pads or truncates it to 30s.
+speech, rate = librosa.load("xxx.wav", sr=16000)
 speech = librosa.util.fix_length(speech, size=(16000 * 30))
 
 res = s2t(speech)[0]
@@ -105,7 +103,7 @@ from espnet2.bin.s2t_inference_ctc import Speech2TextGreedySearch
 context_len_in_secs = 4   # left and right context when doing buffered inference
 batch_size = 32   # depends on the GPU memory
 s2t = Speech2TextGreedySearch.from_pretrained(
-    "pyf98/owsm_ctc_v3.1_1B",
+    "espnet/owsm_ctc_v3.1_1B",
     device='cuda' if torch.cuda.is_available() else 'cpu',
     generate_interctc_outputs=False,
     lang_sym='<eng>',
diff --git a/egs2/owsm_ctc_v3.1/s2t1/run.sh b/egs2/owsm_ctc_v3.1/s2t1/run.sh
index bfb9f81957b..f0e07f29740 100755
--- a/egs2/owsm_ctc_v3.1/s2t1/run.sh
+++ b/egs2/owsm_ctc_v3.1/s2t1/run.sh
@@ -5,6 +5,7 @@ set -e
 set -u
 set -o pipefail
 
+# NOTE: please check README.md for data preparation scripts
 train_set=train_v3
 valid_set=dev_v3
 test_sets=dev_v3
diff --git a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
index 9ac6bab27b1..e7b87e5a18f 100644
--- a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
+++ b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
@@ -1,7 +1,7 @@
 """E-Branchformer encoder used by OWSM-CTC.
 
 Compared to the original encoder, this variant supports additional
-cross-attention modules.
+cross-attention modules and extra language and task token inputs.
 """
 
 import logging
@@ -48,6 +48,9 @@
 class EBranchformerEncoderLayer(torch.nn.Module):
     """E-Branchformer encoder layer module.
 
+    Compared to the original encoder layer in e_branchformer_encoder.py,
+    this variant supports additional cross-attention modules.
+
     Args:
         size (int): model dimension
         attn: standard self-attention or efficient attention
@@ -197,7 +200,13 @@ def forward(
 
 
 class EBranchformerCTCEncoder(AbsEncoder):
-    """E-Branchformer encoder module."""
+    """E-Branchformer encoder module.
+    
+    Compared to the original encoder in e_branchformer_encoder.py,
+    this variant supports additional cross-attention modules.
+    Additionally, it supports extra prefix tokens for the input.
+    This is useful for language and task conditioning.
+    """
 
     @typechecked
     def __init__(

From fa2c445971e3dd1efedea911dd758a6d19c09cb4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 Nov 2024 18:24:08 +0000
Subject: [PATCH 12/15] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 espnet2/asr/encoder/e_branchformer_ctc_encoder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
index e7b87e5a18f..4b583d211af 100644
--- a/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
+++ b/espnet2/asr/encoder/e_branchformer_ctc_encoder.py
@@ -201,7 +201,7 @@ def forward(
 
 class EBranchformerCTCEncoder(AbsEncoder):
     """E-Branchformer encoder module.
-    
+
     Compared to the original encoder in e_branchformer_encoder.py,
     this variant supports additional cross-attention modules.
     Additionally, it supports extra prefix tokens for the input.

From 4a3520e6505eea1c318117cc18de04d98a622ac6 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 11 Nov 2024 13:12:53 -0600
Subject: [PATCH 13/15] auto compute frames per sec

---
 egs2/owsm_ctc_v3.1/s2t1/README.md | 24 ++++++++++++-----------
 espnet2/bin/s2t_ctc_align.py      | 13 +++++++++++--
 espnet2/bin/s2t_inference_ctc.py  | 32 +++++++++++++++++++++++--------
 3 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/egs2/owsm_ctc_v3.1/s2t1/README.md b/egs2/owsm_ctc_v3.1/s2t1/README.md
index b83714a4477..162040e189f 100644
--- a/egs2/owsm_ctc_v3.1/s2t1/README.md
+++ b/egs2/owsm_ctc_v3.1/s2t1/README.md
@@ -118,44 +118,46 @@ text = s2t.decode_long_batched_buffered(
     speech,
     batch_size=batch_size,
     context_len_in_secs=context_len_in_secs,
-    frames_per_sec=12.5,        # 80ms shift, model-dependent, don't change
 )
 print(text)
 ```
 
 ### Example for CTC forced alignment using `ctc-segmentation`
 
-It can be efficiently applied to audio of an arbitrary length.
-For model downloading, please refer to https://github.com/espnet/espnet?tab=readme-ov-file#ctc-segmentation-demo
+CTC segmentation can be efficiently applied to audio of an arbitrary length.
 
 ```python
 import soundfile as sf
 from espnet2.bin.s2t_ctc_align import CTCSegmentation
+from espnet_model_zoo.downloader import ModelDownloader
 
 
 ## Please download model first
+d = ModelDownloader()
+downloaded = d.download_and_unpack("espnet/owsm_ctc_v3.1_1B")
+
 aligner = CTCSegmentation(
-    s2t_model_file="exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/valid.total_count.ave_5best.till45epoch.pth",
+    **downloaded,
     fs=16000,
     ngpu=1,
     batch_size=16,    # batched parallel decoding; reduce it if your GPU memory is smaller
     kaldi_style_text=True,
     time_stamps="fixed",
-    samples_to_frames_ratio=1280,   # 80ms time shift; don't change as it depends on the pre-trained model
     lang_sym="<eng>",
     task_sym="<asr>",
     context_len_in_secs=2,  # left and right context in buffered decoding
-    frames_per_sec=12.5,    # 80ms time shift; don't change as it depends on the pre-trained model
 )
 
 speech, rate = sf.read(
-    "example.wav"
+    "./test_utils/ctc_align_test.wav"
 )
 print(f"speech duration: {len(speech) / rate : .2f} seconds")
-text = '''
-utt1 hello there
-utt2 welcome to this repo
-'''
+text = """
+utt1 THE SALE OF THE HOTELS
+utt2 IS PART OF HOLIDAY'S STRATEGY
+utt3 TO SELL OFF ASSETS
+utt4 AND CONCENTRATE ON PROPERTY MANAGEMENT
+"""
 
 segments = aligner(speech, text)
 print(segments)
diff --git a/espnet2/bin/s2t_ctc_align.py b/espnet2/bin/s2t_ctc_align.py
index dfcc2afacf9..d329c5ce57f 100755
--- a/espnet2/bin/s2t_ctc_align.py
+++ b/espnet2/bin/s2t_ctc_align.py
@@ -187,7 +187,6 @@ def __init__(
         lang_sym: str = "<eng>",
         task_sym: str = "<asr>",
         context_len_in_secs: float = 4,
-        frames_per_sec: float = 12.5,
         **ctc_segmentation_args,
     ):
         """Initialize the CTCSegmentation module.
@@ -267,7 +266,17 @@ def __init__(
         self.lang_sym = lang_sym
         self.task_sym = task_sym
         self.context_len_in_secs = context_len_in_secs
-        self.frames_per_sec = frames_per_sec
+
+        subsample_dict = {
+            "conv2d1": 1,
+            "conv2d2": 2,
+            "conv2d": 4,
+            "conv2d6": 6,
+            "conv2d8": 8,
+        }
+        subsample_factor = subsample_dict[s2t_train_args.encoder_conf['input_layer']]
+        self.samples_to_frames_ratio = s2t_train_args.frontend_conf["hop_length"] * subsample_factor
+        self.frames_per_sec = fs / self.samples_to_frames_ratio
 
     def set_config(self, **kwargs):
         """Set CTC segmentation parameters.
diff --git a/espnet2/bin/s2t_inference_ctc.py b/espnet2/bin/s2t_inference_ctc.py
index aeac35bf58f..597973b6367 100644
--- a/espnet2/bin/s2t_inference_ctc.py
+++ b/espnet2/bin/s2t_inference_ctc.py
@@ -5,10 +5,9 @@
 from itertools import groupby
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
-
+import humanfriendly
 import numpy as np
 import torch
-import torch.nn.functional as F
 import torch.quantization
 from typeguard import typechecked
 
@@ -490,6 +489,26 @@ def __init__(
         self.lang_sym = lang_sym
         self.task_sym = task_sym
 
+        # retrieve sample rate and compute frames per second
+        sample_rate = s2t_train_args.frontend_conf["fs"]
+        if isinstance(sample_rate, str):
+            sample_rate = humanfriendly.parse_size(sample_rate)
+        self.sample_rate = sample_rate
+        logging.info(f"Audio sampling rate: {sample_rate}")
+
+        subsample_dict = {
+            "conv2d1": 1,
+            "conv2d2": 2,
+            "conv2d": 4,
+            "conv2d6": 6,
+            "conv2d8": 8,
+        }
+        subsample_factor = subsample_dict[s2t_train_args.encoder_conf['input_layer']]
+        frames_per_sec = sample_rate / s2t_train_args.frontend_conf["hop_length"]
+        frames_per_sec /= subsample_factor
+        self.frames_per_sec = frames_per_sec
+        logging.info(f"Final encoder frames per second: {frames_per_sec}")
+
     @torch.no_grad()
     def __call__(
         self,
@@ -627,9 +646,7 @@ def decode_long_batched_buffered(
         self,
         speech: Union[torch.Tensor, np.ndarray],
         batch_size: int = 1,
-        sample_rate: int = 16000,
         context_len_in_secs: float = 2,
-        frames_per_sec: float = 12.5,
         lang_sym: Optional[str] = None,
         task_sym: Optional[str] = None,
     ):
@@ -638,12 +655,11 @@ def decode_long_batched_buffered(
         Args:
             speech: 1D long-form input speech
             batch_size (int): decode this number of segments together in parallel
-
-        Returns:
-            utterances: list of tuples of (start_time, end_time, text)
-
         """
 
+        sample_rate = self.sample_rate
+        frames_per_sec = self.frames_per_sec
+
         lang_sym = lang_sym if lang_sym is not None else self.lang_sym
         task_sym = task_sym if task_sym is not None else self.task_sym
         lang_id = self.converter.token2id[lang_sym]

From e3f4abb447129e7e4d2924458e3d1939e676d31f Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 11 Nov 2024 19:14:37 +0000
Subject: [PATCH 14/15] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 espnet2/bin/s2t_ctc_align.py     | 6 ++++--
 espnet2/bin/s2t_inference_ctc.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/espnet2/bin/s2t_ctc_align.py b/espnet2/bin/s2t_ctc_align.py
index d329c5ce57f..504b1f4ed53 100755
--- a/espnet2/bin/s2t_ctc_align.py
+++ b/espnet2/bin/s2t_ctc_align.py
@@ -274,8 +274,10 @@ def __init__(
             "conv2d6": 6,
             "conv2d8": 8,
         }
-        subsample_factor = subsample_dict[s2t_train_args.encoder_conf['input_layer']]
-        self.samples_to_frames_ratio = s2t_train_args.frontend_conf["hop_length"] * subsample_factor
+        subsample_factor = subsample_dict[s2t_train_args.encoder_conf["input_layer"]]
+        self.samples_to_frames_ratio = (
+            s2t_train_args.frontend_conf["hop_length"] * subsample_factor
+        )
         self.frames_per_sec = fs / self.samples_to_frames_ratio
 
     def set_config(self, **kwargs):
diff --git a/espnet2/bin/s2t_inference_ctc.py b/espnet2/bin/s2t_inference_ctc.py
index 597973b6367..6c855c3052f 100644
--- a/espnet2/bin/s2t_inference_ctc.py
+++ b/espnet2/bin/s2t_inference_ctc.py
@@ -5,6 +5,7 @@
 from itertools import groupby
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
+
 import humanfriendly
 import numpy as np
 import torch
@@ -503,7 +504,7 @@ def __init__(
             "conv2d6": 6,
             "conv2d8": 8,
         }
-        subsample_factor = subsample_dict[s2t_train_args.encoder_conf['input_layer']]
+        subsample_factor = subsample_dict[s2t_train_args.encoder_conf["input_layer"]]
         frames_per_sec = sample_rate / s2t_train_args.frontend_conf["hop_length"]
         frames_per_sec /= subsample_factor
         self.frames_per_sec = frames_per_sec

From ce075c482f8d21d7d9eef4816857dde8b2b93692 Mon Sep 17 00:00:00 2001
From: Yifan Peng <pengyf21@gmail.com>
Date: Mon, 11 Nov 2024 14:22:30 -0600
Subject: [PATCH 15/15] update tests

---
 test/espnet2/bin/test_s2t_ctc_align.py     | 6 ++++++
 test/espnet2/bin/test_s2t_inference_ctc.py | 7 ++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/test/espnet2/bin/test_s2t_ctc_align.py b/test/espnet2/bin/test_s2t_ctc_align.py
index 242954acd69..d8c6ac80c01 100644
--- a/test/espnet2/bin/test_s2t_ctc_align.py
+++ b/test/espnet2/bin/test_s2t_ctc_align.py
@@ -66,6 +66,12 @@ def s2t_config_file(tmp_path: Path, token_list):
             "fs=16000",
             "--preprocessor_conf",
             "speech_length=4",
+            "--frontend_conf",
+            "fs=16k",
+            "--frontend_conf",
+            "hop_length=160",
+            "--encoder_conf",
+            "input_layer=conv2d8",
         ]
     )
     return tmp_path / "s2t" / "config.yaml"
diff --git a/test/espnet2/bin/test_s2t_inference_ctc.py b/test/espnet2/bin/test_s2t_inference_ctc.py
index df2be5b06ba..946561bb65f 100644
--- a/test/espnet2/bin/test_s2t_inference_ctc.py
+++ b/test/espnet2/bin/test_s2t_inference_ctc.py
@@ -64,6 +64,12 @@ def s2t_config_file(tmp_path: Path, token_list):
             "fs=2000",
             "--preprocessor_conf",
             "speech_length=3",
+            "--frontend_conf",
+            "fs=16k",
+            "--frontend_conf",
+            "hop_length=160",
+            "--encoder_conf",
+            "input_layer=conv2d8",
         ]
     )
     return tmp_path / "s2t" / "config.yaml"
@@ -186,7 +192,6 @@ def test_Speech2TextGreedy_longform(s2t_config_file):
     speech = np.random.randn(3000)
     result = speech2text.decode_long_batched_buffered(
         speech,
-        sample_rate=2000,
         context_len_in_secs=1,
     )
     assert isinstance(result, str)