From 27d0633d6dee498027db9f5f84255b1eee5193d2 Mon Sep 17 00:00:00 2001
From: Yiwen Zhao
Date: Thu, 25 Jul 2024 20:47:09 -0400
Subject: [PATCH 01/13] add aishell3_tts2 recipe

---
 egs2/aishell3/tts2/README.md                   |  163 +++
 egs2/aishell3/tts2/cmd.sh                      |  110 ++
 .../tts2/conf/decode_fastspeech2.yaml          |   10 +
 egs2/aishell3/tts2/conf/decode_teacher.yaml    |   15 +
 egs2/aishell3/tts2/conf/mfcc.conf              |    7 +
 egs2/aishell3/tts2/conf/pbs.conf               |   11 +
 egs2/aishell3/tts2/conf/queue.conf             |   12 +
 egs2/aishell3/tts2/conf/slurm.conf             |   14 +
 .../aishell3/tts2/conf/train_fastspeech2.yaml  |  104 ++
 egs2/aishell3/tts2/conf/train_teacher.yaml     |   81 ++
 egs2/aishell3/tts2/conf/vad.conf               |    4 +
 egs2/aishell3/tts2/db.sh                       |    1 +
 egs2/aishell3/tts2/local/data.sh               |   97 ++
 egs2/aishell3/tts2/local/data_prep.py          |    1 +
 .../aishell3/tts2/local/download_and_untar.sh  |    1 +
 egs2/aishell3/tts2/local/path.sh               |    0
 egs2/aishell3/tts2/local/run_mfa.sh            |   23 +
 egs2/aishell3/tts2/path.sh                     |    1 +
 egs2/aishell3/tts2/pyscripts                   |    1 +
 egs2/aishell3/tts2/run.sh                      |   65 +
 egs2/aishell3/tts2/run_train_teacher.sh        |   61 +
 egs2/aishell3/tts2/scripts                     |    1 +
 egs2/aishell3/tts2/sid                         |    1 +
 egs2/aishell3/tts2/steps                       |    1 +
 egs2/aishell3/tts2/tts.sh                      | 1215 +++++++++++++++++
 egs2/aishell3/tts2/tts2.sh                     |    1 +
 egs2/aishell3/tts2/utils                       |    1 +
 27 files changed, 2002 insertions(+)
 create mode 100644 egs2/aishell3/tts2/README.md
 create mode 100644 egs2/aishell3/tts2/cmd.sh
 create mode 100644 egs2/aishell3/tts2/conf/decode_fastspeech2.yaml
 create mode 100644 egs2/aishell3/tts2/conf/decode_teacher.yaml
 create mode 100644 egs2/aishell3/tts2/conf/mfcc.conf
 create mode 100644 egs2/aishell3/tts2/conf/pbs.conf
 create mode 100644 egs2/aishell3/tts2/conf/queue.conf
 create mode 100644 egs2/aishell3/tts2/conf/slurm.conf
 create mode 100644 egs2/aishell3/tts2/conf/train_fastspeech2.yaml
 create mode 100644 egs2/aishell3/tts2/conf/train_teacher.yaml
 create mode 100644 egs2/aishell3/tts2/conf/vad.conf
 create mode 120000 egs2/aishell3/tts2/db.sh
 create mode 100644 egs2/aishell3/tts2/local/data.sh
 create mode 120000 egs2/aishell3/tts2/local/data_prep.py
 create mode 120000 egs2/aishell3/tts2/local/download_and_untar.sh
 create mode 100644 egs2/aishell3/tts2/local/path.sh
 create mode 100755 egs2/aishell3/tts2/local/run_mfa.sh
 create mode 120000 egs2/aishell3/tts2/path.sh
 create mode 120000 egs2/aishell3/tts2/pyscripts
 create mode 100755 egs2/aishell3/tts2/run.sh
 create mode 100755 egs2/aishell3/tts2/run_train_teacher.sh
 create mode 120000 egs2/aishell3/tts2/scripts
 create mode 120000 egs2/aishell3/tts2/sid
 create mode 120000 egs2/aishell3/tts2/steps
 create mode 100755 egs2/aishell3/tts2/tts.sh
 create mode 120000 egs2/aishell3/tts2/tts2.sh
 create mode 120000 egs2/aishell3/tts2/utils

diff --git a/egs2/aishell3/tts2/README.md b/egs2/aishell3/tts2/README.md
new file mode 100644
index 00000000000..bdb7d4fc8f8
--- /dev/null
+++ b/egs2/aishell3/tts2/README.md
@@ -0,0 +1,163 @@
# AISHELL3 RECIPE

This is the recipe of a Mandarin multi-speaker TTS2 model built on the [aishell3](https://www.openslr.org/93/) corpus.

See the following pages for running on clusters. They can help you set up the environment and get familiar with the ESPnet repository structure.
- [PSC usage tutorial](https://www.wavlab.org/activities/2022/psc-usage/)
- [ESPnet recipe tutorial](https://github.com/espnet/notebook/blob/master/ESPnet2/Course/CMU_SpeechRecognition_Fall2022/recipe_tutorial.ipynb)


## Brief on TTS2

- In terms of features

  ``tts2`` uses discrete acoustic features instead of the continuous features used in ``tts1``. The current TEMPLATE supports training a discrete FastSpeech2 model.
- In terms of data

  ``tts2`` additionally requires duration information, which can be obtained from **Speech-Text Alignment Tools** (a Tacotron teacher model or MFA). According to the FastSpeech2 paper, MFA gives higher-quality alignments.
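  For intuition, the extra duration information is just a per-phone frame count aligned with the acoustic frames. The path and numbers below are purely illustrative assumptions; the actual file name and location depend on the aligner and on the stage that dumps durations.

  ```bash
  # Illustrative only: inspect one duration entry after alignment has been run.
  # Both the path and the values are hypothetical, not this recipe's exact layout.
  head -n 1 dump/raw/train_phn/durations
  # SSB00050353 12 8 9 7 11 6 5 10 ...   <- frames per phone; the sum equals the utterance length in frames
  ```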
## Run the Recipe

🌟 Please note that most of the bash scripts are symbolically linked from the TEMPLATE. They may be updated by later commits for other corpora, so please double-check and customize the parameters before you run.

Here is the basic order for running the scripts, followed by more details (a consolidated command sketch follows section 4).

1. ``./local/run_mfa.sh``
2. ``./run_train_teacher.sh`` (up to stage 8; teacher forcing must be used in decoding)
3. ``./run_train_teacher.sh`` (stage 6 only, to extract energy and pitch)
4. ``./run.sh --stop_stage 8 --s3prl_upstream_name hf_hubert_custom``
5. Train a vocoder with [PWG](https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/egs) (we use a discrete HiFi-GAN here)
6. ``./run.sh --stage 9``
7. Evaluate the generated wav using the [scripts here](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#evaluation)


### 1. Data Preparation

* Download the aishell-3 dataset (train set & test set).
* Trim silence to improve efficiency and potentially improve the quality of the generated waveform by cutting off noise.
* Get the initial ``{dset}_phn`` dictionary.
* Split 250 samples from the train set to form the dev set.

```
//{dset}/text sample
SSB00050353 深交所副总经理周明指出
//{dset}_phn/text sample
SSB00050353 shen1 jiao1 suo3 fu4 zong3 jing1 li3 zhou1 ming2 zhi3 chu1
```

NOTE: Parameters such as ``fs`` and ``n_fft`` in ``trim_silence.sh`` do not have to match those in ``run.sh``; they only determine the precision of silence trimming, and the outcome with different parameter sets will be roughly the same (a corpus with minimal silence).

### 2. Train the teacher model
Following ``tts1``, we train a Tacotron2 model as the teacher model for FastSpeech2 in ``tts2``.

Setting ``audio_format=wav`` is recommended, as wav files can be processed directly if you want to use x-vectors. You can also use ``flac``, but then take ``egs2/librispeech/asr1/local/data.sh`` as a reference for the ``uttid path-to-utt`` format.

Remember to keep the frame shift the same for the teacher model and the student model; only then can the soft targets generated by the teacher Tacotron2 be aligned with the FastSpeech2 input.

### 3. Extract additional features

Calculate pitch and energy (still following ``tts1``) for FastSpeech2.

### 4. Train discrete FastSpeech2
The datasets include text, durations, speech, discrete speech, pitch, energy, and speaker embeddings (spembs). We use cn_hubert (pretrained on Mandarin) for discrete TTS feature extraction.
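For reference, the preparation and training steps in sections 1-4 map roughly onto the commands below. This is a hedged sketch: only ``--stage``, ``--stop_stage``, and ``--s3prl_upstream_name hf_hubert_custom`` are taken from this README; the exact stage numbers of ``run_train_teacher.sh`` are assumptions and should be checked against the scripts.

```bash
# Hedged consolidation of sections 1-4; verify stage numbers against run.sh /
# run_train_teacher.sh before use.
./local/run_mfa.sh                                               # 1. alignments + *_phn data
./run_train_teacher.sh --stop_stage 8                            # 2. teacher Tacotron2 (decode with teacher forcing)
./run_train_teacher.sh --stage 6 --stop_stage 6                  # 3. re-run stage 6 to extract pitch and energy
./run.sh --stop_stage 8 --s3prl_upstream_name hf_hubert_custom   # 4. discrete FastSpeech2
```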
### 5. Train a vocoder
A vocoder customized for aishell3 discrete features is necessary in order to generate ``wav`` from discrete HuBERT features.

The tts2 vocoder is not exactly a mel-to-wav model, so our goal here is not to train a rule-based vocoder as in ``tts1``, but a dedicated vocoder that maps discrete features to waveforms.

We use the [PWG repo](https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/egs); here are the detailed steps:

* ``git clone https://github.com/kan-bayashi/ParallelWaveGAN.git``, then ``cd ParallelWaveGAN/egs/aishell3/hubert_voc1``.

* Collect the HuBERT text into a single file, which can be done conveniently using ``vim``:

  ```shell
  vim path/to/train_hubert.txt
  :r path/to/dev_hubert.txt
  :r path/to/test_hubert.txt
  :w path/to/newfile_all.txt
  :q!
  ```
* Modify ``hubert_text`` in ``./run.sh``. Follow the instructions in stage 0 to symlink the data (the ``wav`` format is better supported in kaldiio than ``flac``). Note that aishell3 has unknown speakers, so we do not use sid.

* Modify ``num_embs`` (equal to the number of k-means clusters) and other custom parameters in the config file ``conf/hifigan_hubert_24k.v1.yaml``.

* Start feature extraction and training from stage 1.


### 6. Inference
Run the inference stage of the espnet2 recipe with your trained vocoder; this time the waveform is generated directly.

### 7. Evaluate model performance
Please follow the [scripts here](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#evaluation).


## Other references

**Speech-Text Alignment Tools**

Token durations are predicted using speech-text alignment tools, which can be either a forced aligner or an attention-based autoregressive model (e.g., Tacotron2). Please refer to [Alignment from Tacotron2](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#fastspeech-training) and [Montreal Forced Aligner (MFA)](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#new-mfa-aligments-generation) for details.


**MFA**

First, make sure ``mfa`` has been prepared in the environment:
```
cd ../../../tools
make mfa.done
cd -
```

Originally, ``Stage 1`` in ``run.sh`` calls ``local/data.sh``, but here we do not run ``Stage 1``. Instead, we use

```
./local/run_mfa.sh
```

which is an entry point that calls ``scripts/mfa.sh``, which in turn calls ``local/data.sh``. With ``--train false``, this script downloads pretrained G2P and acoustic models; with ``--train true``, it generates the alignments. The generated results are stored in the ``_phn`` lexicon.

For aishell-3, we train a new G2P model on the ``mandarin_china_mfa`` dictionary and generate the lexicon, then train the MFA speech-text alignment model.

If you want to use the durations extracted by MFA, you can continue training with the main script from ``Stage 2``:

```
./run.sh --stage 2 --stop_stage 2 --teacher_dumpdir "data"
```

### Multi-Speakers tts2

In the multi-speaker scenario, adding a speaker id or a speaker embedding helps the model tell speakers apart; this is specified with ``--use_spk_embed`` or ``--use_sid``. Since aishell-3 is not a fixed-speaker corpus, i.e. it contains speakers with unknown ids, we use speaker embeddings here.

**Speaker Embeddings**

ESPnet supports several types of speaker embeddings (Kaldi x-vector, SpeechBrain, espnet_spk). The recently proposed espnet_spk shows state-of-the-art performance on many tasks, so we use it here.
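As a rough illustration of the multi-speaker setting, the sketch below uses only the switches named above; how the espnet_spk backend itself is selected is not shown and is an assumption, so check ``tts2.sh`` in your checkout.

```bash
# Hedged sketch: train with speaker embeddings rather than speaker ids.
# --use_spk_embed / --use_sid are the switches mentioned in this README; the
# option that picks the espnet_spk extractor is assumed to be set inside tts2.sh.
./run.sh --stage 2 --use_spk_embed true --use_sid false
```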
### Discrete Speech Challenge Baseline

| Model | MCD ⬇️ | Log F0 RMSE ⬇️ | CER | UTMOS ⬆️ |
| --- | --- | --- | --- | --- |
| HuBERT-base-layer6 | 11.7626 ± 1.6673 | 0.4608 ± 0.1724 | - | 1.4078 ± 0.1414 |
* CER is currently left unfilled since it requires an additional ASR model.

diff --git a/egs2/aishell3/tts2/cmd.sh b/egs2/aishell3/tts2/cmd.sh
new file mode 100644
index 00000000000..2aae6919fef
--- /dev/null
+++ b/egs2/aishell3/tts2/cmd.sh
@@ -0,0 +1,110 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
#  run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
#   --time