diff --git a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py index 2577f4ef766..210c7331c99 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py +++ b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py @@ -75,6 +75,12 @@ def get_parser(): help="Specify the file format for the rspecifier. " '"mat" is the matrix format in kaldi', ) + parser.add_argument( + "--audio_sample_rate", + type=int, + default=16000, + help="input audio sampling rate (could be different from fs used in SSL)", + ) parser.add_argument( "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark" ) @@ -116,6 +122,7 @@ def __call__(self, x): def dump_label( rspecifier, in_filetype, + audio_sample_rate, wspecifier, out_filetype, km_path, @@ -152,6 +159,7 @@ def dump_label( ) if reader_conf.get("layer", None): reader_conf["layer"] = int(reader_conf["layer"]) + reader_conf["audio_sample_rate"] = audio_sample_rate reader = reader_class(**reader_conf) iterator = build_data_iterator( diff --git a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py index ca5f03a734a..4932d3b7c26 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py +++ b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py @@ -55,6 +55,12 @@ def get_parser(): default=None, help="Specify the utt2num_samples file.", ) + parser.add_argument( + "--audio_sample_rate", + type=int, + default=16000, + help="input audio sampling rate (could be different from fs used in SSL)", + ) parser.add_argument( "--write_num_frames", type=str, help="Specify wspecifer for utt2num_frames" ) @@ -83,6 +89,7 @@ def main(args): reader_conf["multilayer_feature"] = str2bool(reader_conf["multilayer_feature"]) if reader_conf.get("layer", None): reader_conf["layer"] = int(reader_conf["layer"]) + reader_conf["audio_sample_rate"] = args.audio_sample_rate reader = reader_class(use_gpu=args.use_gpu, **reader_conf) dump_feature( diff --git a/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py b/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py index 92db12c5242..606022716bb 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py +++ b/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py @@ -5,6 +5,7 @@ import sys from typing import List, Optional, Tuple, Union +import librosa import numpy as np import soundfile as sf import torch @@ -97,7 +98,15 @@ def __init__(self): def load_audio(self, path: str, ref_len: Optional[int] = None): wav, sr = sf.read(path) - assert sr == self.sample_rate, sr + # assert sr == self.sample_rate, sr + if sr != self.sample_rate: + logging.warning( + "sampling rate mismatch between " + "the requirement of the feature extractor ({}) " + "and the source wav ({}); " + "conducting resampling".format(self.sample_rate, sr) + ) + wav = librosa.resample(wav, orig_sr=sr, target_sr=self.sample_rate, scale=True) if wav.ndim == 2: wav = wav.mean(-1) if ref_len is not None and abs(ref_len - len(wav)) > 160: @@ -134,9 +143,18 @@ class MfccFeatureReader(BaseFeatureReader): def __init__( self, sample_rate: int = 16000, + audio_sample_rate: int = 16000, **kwargs, # placeholder for unused arguments ): self.sample_rate = sample_rate + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=sample_rate + ) + else: + self.resample = None
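# Illustration only (not part of this patch): the intended behaviour of the optional
# resampler set up above, using made-up rates. When the source audio rate differs from
# the rate the SSL extractor expects, the batch is resampled and the lengths rescaled.
import torch
import torchaudio

audio_sample_rate, sample_rate = 44100, 16000
resample = torchaudio.transforms.Resample(orig_freq=audio_sample_rate, new_freq=sample_rate)
x = torch.randn(1, 2 * audio_sample_rate)           # 2 seconds of dummy 44.1 kHz audio
x_lens = torch.tensor([x.shape[1]])                 # original lengths in samples
x = resample(x)                                     # -> shape (1, 32000) for this ratio
x_lens = x_lens * sample_rate // audio_sample_rate  # -> tensor([32000]), matching the resampled length
print(x.shape, x_lens)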
self.frame_length = 25 * sample_rate / 1000 self.frame_shift = 10 * sample_rate / 1000 @@ -149,6 +167,9 @@ def get_feats( feats, feats_lens = [], [] with torch.no_grad(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate batch_size = x.shape[0] for i in range(batch_size): mfcc = torchaudio.compliance.kaldi.mfcc( @@ -177,10 +198,19 @@ def __init__( hubert_dir_path, layer, sample_rate=16000, + audio_sample_rate=16000, max_chunk=1600000, use_gpu=True, ): - self.sample_rate = sample_rate + self.sample_rate = int(sample_rate) + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=self.sample_rate + ) + else: + self.resample = None self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu" from espnet2.asr.encoder.hubert_encoder import FairseqHubertEncoder @@ -200,6 +230,9 @@ def get_feats( ) -> Tuple[torch.Tensor, torch.Tensor]: with torch.no_grad(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate x = x.to(self.device) mask = x.zeros_like(x, dtype=torch.long) for i in range(x.shape[0]): @@ -229,10 +262,19 @@ def __init__( hubert_model_path, layer, sample_rate=16000, + audio_sample_rate=16000, max_chunk=1600000, use_gpu=True, ): - self.sample_rate = sample_rate + self.sample_rate = int(sample_rate) # str->int + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=self.sample_rate + ) + else: + self.resample = None self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu" from espnet2.tasks.hubert import HubertTask @@ -256,6 +298,9 @@ def get_feats( ) -> Tuple[torch.Tensor, torch.Tensor]: with torch.inference_mode(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate x = x.to(self.device) x_lens = x_lens.to(self.device) @@ -272,6 +317,7 @@ class S3PRLFeatureReader(BaseFeatureReader): def __init__( self, fs: Union[int, str] = 16000, + audio_sample_rate: int = 16000, s3prl_conf: Optional[dict] = None, download_dir: str = None, multilayer_feature: bool = False, @@ -285,6 +331,16 @@ def __init__( multilayer_feature=multilayer_feature, layer=layer, ) + self.sample_rate = fs + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=fs + ) + else: + self.resample = None + self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu" self.model = self.model.to(self.device) @@ -296,6 +352,9 @@ def get_feats( ) -> Tuple[torch.Tensor, torch.Tensor]: with torch.no_grad(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate x = x.to(self.device) feats, feats_lens = self.model(x, x_lens) diff --git 
a/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh b/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh index 9a5e560639f..daee0bfd49c 100755 --- a/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh +++ b/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh @@ -31,6 +31,7 @@ upsample= # Upsampling rate of pseudo-labels to measure the pseudo-lab use_gpu=false # Whether to use gpu in feature extraction suffix= # A suffix to distinguish the feature dump directory. Empty in usual cases. audio_format="wav" # The audio format of the source speech (flac, wav, *_ark, etc) +audio_sample_rate=16000 # the sample rate of input audio skip_train_kmeans=false # Whether to skip the kmeans model training nclusters=100 # Number of clusters of kmeans model @@ -152,6 +153,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ] && ! [[ " ${skip_stages} " =~ [ ${_cmd} JOB=1:${_nj} ${_logdir}/dump_features.JOB.log \ ${python} pyscripts/feats/dump_ssl_feature.py \ --feature_conf "'${feature_conf}'" \ + --audio_sample_rate "${audio_sample_rate}" \ --use_gpu ${use_gpu} \ --in_filetype "${_in_filetype}" \ --out_filetype "mat" \ @@ -267,6 +269,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && ! [[ " ${skip_stages} " =~ [ ${_cmd} JOB=1:${_nj} "${_dump_dir}"/logdir/inference_pseudo_labels_km${nclusters}.JOB.log \ ${python} pyscripts/feats/dump_km_label.py \ ${_opts} \ + --audio_sample_rate "${audio_sample_rate}" \ --km_path "${km_dir}/km_${nclusters}.mdl" \ --out_filetype "mat" \ --use_gpu ${use_gpu} \ diff --git a/egs2/TEMPLATE/tts2/tts2.sh b/egs2/TEMPLATE/tts2/tts2.sh index e6c10c4beb5..8cd1039d11c 100755 --- a/egs2/TEMPLATE/tts2/tts2.sh +++ b/egs2/TEMPLATE/tts2/tts2.sh @@ -592,11 +592,14 @@ if ! "${skip_data_prep}"; then if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then log "Stage 6: Discrete TTS discrete unit extraction" - + # (en hubert), the original arguments s3prl_conf="{upstream=${s3prl_upstream_name}}" kmeans_feature_type=s3prl kmeans_feature_conf="{type=${kmeans_feature_type},conf={s3prl_conf=${s3prl_conf},download_dir=ckpt,multilayer_feature=False,layer=${feature_layer}}}" - + # (zh hubert), the arguments we used on aishell3 + # s3prl_conf="{upstream=${s3prl_upstream_name},path_or_url=TencentGameMate/chinese-hubert-large}" + # kmeans_feature_type=s3prl + # kmeans_feature_conf={type=${kmeans_feature_type},conf={s3prl_conf=${s3prl_conf},download_dir=ckpt,multilayer_feature=False,layer=${feature_layer}}} scripts/feats/perform_kmeans.sh \ --stage ${discrete_stage} \ --stop_stage ${discrete_stop_stage} \ @@ -606,6 +609,7 @@ if ! 
"${skip_data_prep}"; then --datadir "${dumpdir}/raw" \ --featdir "${feature_dir}" \ --audio_format "${audio_format}" \ + --audio_sample_rate "${fs}" \ --feature_type ${kmeans_feature_type} \ --layer "${feature_layer}" \ --feature_conf "${kmeans_feature_conf}" \ diff --git a/egs2/aishell3/tts1/local/data.sh b/egs2/aishell3/tts1/local/data.sh index 32df0584444..e4be9332c9c 100755 --- a/egs2/aishell3/tts1/local/data.sh +++ b/egs2/aishell3/tts1/local/data.sh @@ -34,6 +34,7 @@ fi db_root=${AISHELL3} if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + mkdir -p ${db_root} log "stage -1: download data from openslr" local/download_and_untar.sh "${db_root}" "https://www.openslr.org/resources/93/data_aishell3.tgz" data_aishell3.tgz fi @@ -78,19 +79,19 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then utils/fix_data_dir.sh data/${x} done fi - +# use {dset},_phn here, to be consistent with mfa if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then log "stage 3: split for development set" utils/subset_data_dir.sh data/train 250 data/dev utils/subset_data_dir.sh data/train_phn 250 data/dev_phn utils/copy_data_dir.sh data/train data/train_no_dev - utils/copy_data_dir.sh data/train_phn data/train_phn_no_dev + utils/copy_data_dir.sh data/train_phn data/train_no_dev_phn utils/filter_scp.pl --exclude data/dev/wav.scp \ data/train/wav.scp > data/train_no_dev/wav.scp utils/filter_scp.pl --exclude data/dev_phn/wav.scp \ - data/train_phn/wav.scp > data/train_phn_no_dev/wav.scp + data/train_phn/wav.scp > data/train_no_dev_phn/wav.scp utils/fix_data_dir.sh data/train_no_dev - utils/fix_data_dir.sh data/train_phn_no_dev + utils/fix_data_dir.sh data/train_no_dev_phn fi log "Successfully finished. [elapsed=${SECONDS}s]" diff --git a/egs2/aishell3/tts2/README.md b/egs2/aishell3/tts2/README.md new file mode 100644 index 00000000000..9b7ba351187 --- /dev/null +++ b/egs2/aishell3/tts2/README.md @@ -0,0 +1,209 @@ +# AISHELL3 RECIPE + +This is the recipe of Mandrain multi-speaker TTS2 model with [aishell3](https://www.openslr.org/93/) corpus. + +See the following pages for running on clusters. They can help you to set the environment and get familiar with ESPNet's repo structure. +- [PSC usage tutorial](https://www.wavlab.org/activities/2022/psc-usage/) +- [Espnet recipe tutorial](https://github.com/espnet/notebook/blob/master/ESPnet2/Course/CMU_SpeechRecognition_Fall2022/recipe_tutorial.ipynb) + + +## Brief on TTS2 + +- In terms of features + + ``tts2`` uses discrete acoustic features instead of continuous features in ``tts1``. Current TEMPLATE supports the discrete FastSpeech2 model training. +- In terms of data + + ``tts2`` additionally requires duration information, which can be obtained from **Speech-Text Alignment Tools** (tacotron teacher model or mfa). According to the [FastSpeech2](https://arxiv.org/pdf/2006.04558) paper, mfa has a higher quality. + + +## Run the Recipe + +🌟 Please notice that most of the ``bash files`` are symbolic linked from the TEMPLATE. It might be updated by later commits using other corpus, so please double check and customize the parameters before your run. + +Here is the basic order for running scripts, followed by more details. + +1. ``./local/run_mfa.sh`` +2. ``./run_train_teacher.sh`` (to stage 8, must use teacher forcing in decoding) +3. ``./run_train_teacher.sh`` (stage 6 only, to extract energy and pitch) +4. ``./run.sh`` (to stage 8, use custom cn_hubert, layer17 for large) +5. 
Train a vocoder with [PWG](https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/egs) (we use a discrete HiFi-GAN here) +6. ``./run.sh`` (stage 9 only) +7. Evaluate the generated wavs using the [scripts here](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#evaluation) + + +### 1. Data Preparation + +* Download the aishell-3 dataset (train set & test set). +* Trim silence to improve efficiency and potentially improve the generated waveform quality by cutting off noise. +* Get the initial ``{dset}_phn`` dictionary. +* Split 250 samples from the train set to form the dev set. + +``` +//{dset}/text sample +SSB00050353 深交所副总经理周明指出 +//{dset}_phn/text sample +SSB00050353 shen1 jiao1 suo3 fu4 zong3 jing1 li3 zhou1 ming2 zhi3 chu1 +``` + + NOTE: Parameters like ``fs`` and ``n_fft`` in ``trim_silence.sh`` don't have to be the same as those in ``run.sh``, since they only determine the precision of silence trimming; different parameter settings give roughly the same outcome (a corpus with minimal remaining silence). + +### 2. Train the teacher model +Following ``tts1``, we train a Tacotron2 model to be the teacher model for FastSpeech2 in ``tts2``. + +Setting ``audio_format=wav`` is recommended, as wav can be processed directly if you want to use x-vectors. You can also use ``flac``, but then take ``egs2/librispeech/asr1/local/data.sh`` as a reference for the ``uttid path-to-utt`` entries. + +Remember to keep ``fs`` and ``hop_size`` (and hence the frame shift) the same for the teacher model and the student model; only then can the soft targets generated by the teacher Tacotron2 align with the FastSpeech2 input. + +More specifically, the script can be executed by: + +``` +# Train the teacher model. Total steps >= 100k is recommended. +./run_train_teacher.sh --stage 2 --stop_stage 7 +``` + +Note that ``test_set`` doesn't need all of the processing here, since only the pseudo labels from ``train_set`` and ``valid_set`` are required. Skipping some steps on ``test_set``, e.g. MFA and teacher-forcing decoding, is feasible. + +However, it is better to still specify ``--test_sets`` in stages 1-3: the test-set phonemes are obtained by grapheme-to-phoneme conversion with the new G2P model trained during MFA, the ``wav.scp`` from stage 2 can be reused in the vocoder part, and the ``spk_emb`` extracted in stage 3 can be used in the overall decoding test. + +Then generate the pseudo labels from ``train_set`` and ``valid_set``. + +``` +# use teacher forcing in decoding +./run_train_teacher.sh --stage 8 --stop_stage 8 \ + --tts_exp exp/tts_train_teacher_raw_phn_none \ + --test_sets "train_no_dev_phn dev_phn" \ + --inference_args "--use_teacher_forcing true" \ + --inference_model 50epoch.pth +``` + +### 3. Extract additional features + +Calculate pitch and energy for FastSpeech2 (still following ``tts1``). +``` +./run_train_teacher.sh --stage 6 --stop_stage 6 \ + --train_config conf/train_fastspeech2.yaml \ + --teacher_dumpdir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch \ + --tts_stats_dir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch/stats \ + --write_collected_feats true +``` + +### 4. Train discrete FastSpeech2 +The datasets include text, durations, speech, discrete speech, pitch, energy, and spkembs. We use cn_hubert (a HuBERT model pretrained on Mandarin) here for discrete TTS feature extraction.
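+
+For intuition, the *discrete speech* entries are frame-level k-means cluster indices computed over SSL features (here, cn_hubert layer 17). The sketch below only illustrates that idea with dummy features and scikit-learn; the recipe itself performs the real extraction through ``scripts/feats/perform_kmeans.sh`` in the commands that follow.
+
+```python
+# Toy illustration of discretizing frame-level SSL features with k-means.
+# Dummy data only; the shapes and n_clusters are assumptions, not the recipe's outputs.
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+feats = np.random.randn(1000, 1024).astype(np.float32)  # (frames, feat_dim), e.g. HuBERT-large layer 17
+km = MiniBatchKMeans(n_clusters=100, batch_size=256, random_state=0).fit(feats)
+units = km.predict(feats)  # one cluster index per frame -> the "discrete speech" stream
+print(units[:10])
+```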
+ +``` +# Process test_set for stage 6. The discrete unit will be used in the vocoder part. Modify the bash file to avoid reprocessing train_set + +./local/data.sh --stage 1 --stop_stage 1 +./run.sh --stage 2 --stop_stage 2 +``` + +``` +# It is recommended to modify tts2.sh, switching the English HuBERT to the Chinese HuBERT, for aishell3 customization. +./run.sh --stage 5 --stop_stage 6 --s3prl_upstream_name hf_hubert_custom --feature_layer 17 + +./run.sh --stage 8 --stop_stage 8 --s3prl_upstream_name hf_hubert_custom --feature_layer 17 \ + --teacher_dumpdir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch \ + --tts2_stats_dir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch/stats \ + --tts2_exp exp/tts_fastspeech2_raw_phn_none_cn_hubert + +``` + +### 5. Train a vocoder +A vocoder customized for aishell3 discrete features is necessary to generate ``wav`` from the discrete HuBERT features. + +The tts2 vocoder is not a mel-to-wav model, so the goal here is not to reuse a mel-spectrogram vocoder as in ``tts1``, but to train a dedicated vocoder that maps discrete features to waveforms. + +We use the [PWG repo](https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/egs), and here are the detailed steps: + + +* ``git clone https://github.com/kan-bayashi/ParallelWaveGAN.git``, then ``cd ParallelWaveGAN/egs/aishell3/hubert_voc1``. + +* Collect the hubert text files into a single file + + ```shell + cat path/to/train_hubert.txt path/to/dev_hubert.txt path/to/test_hubert.txt > path/to/newfile_all.txt + ``` +* Modify ``hubert_text`` in ./run.sh. Follow the instructions in stage 0 to symlink the (silence-trimmed) data. The ``wav`` format is better supported by kaldiio than ``flac``. Note that aishell3 contains speakers with unknown IDs, so we don't use sid. + +* Modify ``num_embs`` (equal to the number of k-means clusters), ``batch_max_steps`` (as the comment suggests), and other custom parameters in the config file ``conf/hifigan_hubert_24k.v1.yaml``. + +* Start feature extraction and training from stage 1. + + +### 6. Inference +Run the inference stage of the espnet2 recipe with your trained vocoder. Waveforms will be generated directly this time. + +``` +./run.sh --stage 9 --stop_stage 9 --tts2_exp exp/tts_fastspeech2_raw_phn_none_cn_hubert +``` + +### 7. Evaluate model performance +Please follow the [scripts here](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#evaluation). + + +## Other references + +**Speech-Text Alignment Tools** + +The token durations are predicted using speech-text alignment tools, which can be either a forced aligner or an attention-based autoregressive model (e.g., Tacotron2). Please refer to [Alignment from Tacotron2](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#fastspeech-training) and [Montreal Forced Aligner (MFA)](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#new-mfa-aligments-generation) for details. + + +**MFA** + +First, make sure ``mfa`` has been prepared in the environment. +``` +cd ../../../tools +make mfa.done +cd - +``` + +Originally, ``Stage 1`` in ``run.sh`` calls ``local/data.sh``, but here we won't run ``Stage 1``; instead, we use + +``` +./local/run_mfa.sh +``` + +which is an entry point that calls ``scripts/mfa.sh``, which in turn calls ``local/data.sh``. With ``--train false``, this script downloads pretrained G2P and acoustic models; with ``--train true``, it generates the alignments. The generated results are stored in the ``{dset}_phn`` data. + +For aishell-3, we train a new G2P model on the ``mandarin_china_mfa`` dictionary and generate the lexicon. Then we train the MFA speech-text alignment model.
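+
+As a hedged sketch (not the recipe's actual conversion code) of how alignments become the per-phone durations FastSpeech2 consumes: each phone interval in seconds is mapped to a number of acoustic frames using the sampling rate and hop size, which is why ``fs`` and ``hop_size`` must stay consistent across the pipeline. The phone labels, times, and parameter values below are made-up examples.
+
+```python
+# Hypothetical example: converting alignment intervals (seconds) to frame-level durations.
+fs = 16000      # assumed sampling rate
+hop_size = 320  # assumed hop size in samples (20 ms frames)
+
+alignment = [("sh", 0.00, 0.10), ("en1", 0.10, 0.26), ("j", 0.26, 0.40)]  # (phone, start_sec, end_sec)
+durations = [
+    round(end * fs / hop_size) - round(start * fs / hop_size)
+    for _, start, end in alignment
+]
+print(durations)  # [5, 8, 7] frames for this toy alignment
+```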
+ +If you want to use the durations extracted by MFA, you can then continue training with the main script from ``Stage 2``: + +``` +./run.sh --stage 2 --stop_stage 2 --teacher_dumpdir "data" +``` + +### Multi-Speaker tts2 + +In the multi-speaker scenario, adding a speaker ID or a speaker embedding (specified with ``--use_sid`` or ``--use_spk_embed``) helps the model tell speakers apart. But since aishell-3 is not a fixed-speaker corpus, i.e. it contains speakers with unknown IDs, we use speaker embeddings here. + +**Speaker Embeddings** + +ESPnet supports several types of speaker embeddings (kaldi x-vector, speechbrain, espnet_spk). The recently proposed espnet_spk shows state-of-the-art performance on many tasks, so we use it here. + + +### Discrete Speech Challenge Baseline +
+| Model | MCD ⬇️ | Log F0 RMSE ⬇️ | CER ⬇️ | UTMOS ⬆️ |
+| --- | --- | --- | --- | --- |
+| cn_hubert-large-layer17 | 8.5473 ± 0.9407 | 0.3032 ± 0.1354 | 42.3 | 1.7565 ± 0.3628 |
+ +* CER is calculated using openai-whisper-large, on Chinese characters. diff --git a/egs2/aishell3/tts2/cmd.sh b/egs2/aishell3/tts2/cmd.sh new file mode 100644 index 00000000000..2aae6919fef --- /dev/null +++ b/egs2/aishell3/tts2/cmd.sh @@ -0,0 +1,110 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time