diff --git a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py index 2577f4ef766..210c7331c99 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py +++ b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_km_label.py @@ -75,6 +75,12 @@ def get_parser(): help="Specify the file format for the rspecifier. " '"mat" is the matrix format in kaldi', ) + parser.add_argument( + "--audio_sample_rate", + type=int, + default=16000, + help="input audio sampling rate (could be different from fs used in SSL)", + ) parser.add_argument( "rspecifier", type=str, help="Read specifier for feats. e.g. ark:some.ark" ) @@ -116,6 +122,7 @@ def __call__(self, x): def dump_label( rspecifier, in_filetype, + audio_sample_rate, wspecifier, out_filetype, km_path, @@ -152,6 +159,7 @@ def dump_label( ) if reader_conf.get("layer", None): reader_conf["layer"] = int(reader_conf["layer"]) + reader_conf["audio_sample_rate"] = audio_sample_rate reader = reader_class(**reader_conf) iterator = build_data_iterator( diff --git a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py index ca5f03a734a..4932d3b7c26 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py +++ b/egs2/TEMPLATE/asr1/pyscripts/feats/dump_ssl_feature.py @@ -55,6 +55,12 @@ def get_parser(): default=None, help="Specify the utt2num_samples file.", ) + parser.add_argument( + "--audio_sample_rate", + type=int, + default=16000, + help="input audio sampling rate (could be different from fs used in SSL)", + ) parser.add_argument( "--write_num_frames", type=str, help="Specify wspecifer for utt2num_frames" ) @@ -83,6 +89,7 @@ def main(args): reader_conf["multilayer_feature"] = str2bool(reader_conf["multilayer_feature"]) if reader_conf.get("layer", None): reader_conf["layer"] = int(reader_conf["layer"]) + reader_conf["audio_sample_rate"] = args.audio_sample_rate reader = reader_class(use_gpu=args.use_gpu, **reader_conf) dump_feature( diff --git a/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py b/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py index 92db12c5242..606022716bb 100644 --- a/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py +++ b/egs2/TEMPLATE/asr1/pyscripts/feats/ssl_feature_utils.py @@ -5,6 +5,7 @@ import sys from typing import List, Optional, Tuple, Union +import librosa import numpy as np import soundfile as sf import torch @@ -97,7 +98,15 @@ def __init__(self): def load_audio(self, path: str, ref_len: Optional[int] = None): wav, sr = sf.read(path) - assert sr == self.sample_rate, sr + # assert sr == self.sample_rate, sr + if sr != self.sample_rate: + logging.warning( + "sampling rate mismatch between " + "the requirement of the feature extractor ({}) " + "and the source wav ({}); " + "conducting resampling".format(self.sample_rate, sr) + ) + wav = librosa.resample(wav, orig_sr=sr, target_sr=self.sample_rate, scale=True) if wav.ndim == 2: wav = wav.mean(-1) if ref_len is not None and abs(ref_len - len(wav)) > 160: @@ -134,9 +143,18 @@ class MfccFeatureReader(BaseFeatureReader): def __init__( self, sample_rate: int = 16000, + audio_sample_rate: int = 16000, **kwargs, # placeholder for unused arguments ): self.sample_rate = sample_rate + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=sample_rate + ) + else: + self.resample = None
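# Illustration only (not part of this patch): the intended behaviour of the optional
# resampler set up above, using made-up rates. When the source audio rate differs from
# the rate the SSL extractor expects, the batch is resampled and the lengths rescaled.
import torch
import torchaudio

audio_sample_rate, sample_rate = 44100, 16000
resample = torchaudio.transforms.Resample(orig_freq=audio_sample_rate, new_freq=sample_rate)
x = torch.randn(1, 2 * audio_sample_rate)           # 2 seconds of dummy 44.1 kHz audio
x_lens = torch.tensor([x.shape[1]])                 # original lengths in samples
x = resample(x)                                     # -> shape (1, 32000) for this ratio
x_lens = x_lens * sample_rate // audio_sample_rate  # -> tensor([32000]), matching the resampled length
print(x.shape, x_lens)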
self.frame_length = 25 * sample_rate / 1000 self.frame_shift = 10 * sample_rate / 1000 @@ -149,6 +167,9 @@ def get_feats( feats, feats_lens = [], [] with torch.no_grad(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate batch_size = x.shape[0] for i in range(batch_size): mfcc = torchaudio.compliance.kaldi.mfcc( @@ -177,10 +198,19 @@ def __init__( hubert_dir_path, layer, sample_rate=16000, + audio_sample_rate=16000, max_chunk=1600000, use_gpu=True, ): - self.sample_rate = sample_rate + self.sample_rate = int(sample_rate) + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=self.sample_rate + ) + else: + self.resample = None self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu" from espnet2.asr.encoder.hubert_encoder import FairseqHubertEncoder @@ -200,6 +230,9 @@ def get_feats( ) -> Tuple[torch.Tensor, torch.Tensor]: with torch.no_grad(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate x = x.to(self.device) mask = x.zeros_like(x, dtype=torch.long) for i in range(x.shape[0]): @@ -229,10 +262,19 @@ def __init__( hubert_model_path, layer, sample_rate=16000, + audio_sample_rate=16000, max_chunk=1600000, use_gpu=True, ): - self.sample_rate = sample_rate + self.sample_rate = int(sample_rate) # str->int + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=self.sample_rate + ) + else: + self.resample = None self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu" from espnet2.tasks.hubert import HubertTask @@ -256,6 +298,9 @@ def get_feats( ) -> Tuple[torch.Tensor, torch.Tensor]: with torch.inference_mode(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate x = x.to(self.device) x_lens = x_lens.to(self.device) @@ -272,6 +317,7 @@ class S3PRLFeatureReader(BaseFeatureReader): def __init__( self, fs: Union[int, str] = 16000, + audio_sample_rate: int = 16000, s3prl_conf: Optional[dict] = None, download_dir: str = None, multilayer_feature: bool = False, @@ -285,6 +331,16 @@ def __init__( multilayer_feature=multilayer_feature, layer=layer, ) + self.sample_rate = fs + self.audio_sample_rate = audio_sample_rate + if self.sample_rate != self.audio_sample_rate: + logging.warning("The audio sample rate is different from feat extractor") + self.resample = torchaudio.transforms.Resample( + orig_freq=audio_sample_rate, new_freq=fs + ) + else: + self.resample = None + self.device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu" self.model = self.model.to(self.device) @@ -296,6 +352,9 @@ def get_feats( ) -> Tuple[torch.Tensor, torch.Tensor]: with torch.no_grad(): x, x_lens = self.preprocess_data(data, data_lens) + if self.resample is not None: + x = self.resample(x) + x_lens = x_lens * self.sample_rate // self.audio_sample_rate x = x.to(self.device) feats, feats_lens = self.model(x, x_lens) diff --git 
a/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh b/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh index 9a5e560639f..daee0bfd49c 100755 --- a/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh +++ b/egs2/TEMPLATE/asr1/scripts/feats/perform_kmeans.sh @@ -31,6 +31,7 @@ upsample= # Upsampling rate of pseudo-labels to measure the pseudo-lab use_gpu=false # Whether to use gpu in feature extraction suffix= # A suffix to distinguish the feature dump directory. Empty in usual cases. audio_format="wav" # The audio format of the source speech (flac, wav, *_ark, etc) +audio_sample_rate=16000 # the sample rate of input audio skip_train_kmeans=false # Whether to skip the kmeans model training nclusters=100 # Number of clusters of kmeans model @@ -152,6 +153,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ] && ! [[ " ${skip_stages} " =~ [ ${_cmd} JOB=1:${_nj} ${_logdir}/dump_features.JOB.log \ ${python} pyscripts/feats/dump_ssl_feature.py \ --feature_conf "'${feature_conf}'" \ + --audio_sample_rate "${audio_sample_rate}" \ --use_gpu ${use_gpu} \ --in_filetype "${_in_filetype}" \ --out_filetype "mat" \ @@ -267,6 +269,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ] && ! [[ " ${skip_stages} " =~ [ ${_cmd} JOB=1:${_nj} "${_dump_dir}"/logdir/inference_pseudo_labels_km${nclusters}.JOB.log \ ${python} pyscripts/feats/dump_km_label.py \ ${_opts} \ + --audio_sample_rate "${audio_sample_rate}" \ --km_path "${km_dir}/km_${nclusters}.mdl" \ --out_filetype "mat" \ --use_gpu ${use_gpu} \ diff --git a/egs2/TEMPLATE/tts2/tts2.sh b/egs2/TEMPLATE/tts2/tts2.sh index e6c10c4beb5..8cd1039d11c 100755 --- a/egs2/TEMPLATE/tts2/tts2.sh +++ b/egs2/TEMPLATE/tts2/tts2.sh @@ -592,11 +592,14 @@ if ! "${skip_data_prep}"; then if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then log "Stage 6: Discrete TTS discrete unit extraction" - + # (en hubert), the original arguments s3prl_conf="{upstream=${s3prl_upstream_name}}" kmeans_feature_type=s3prl kmeans_feature_conf="{type=${kmeans_feature_type},conf={s3prl_conf=${s3prl_conf},download_dir=ckpt,multilayer_feature=False,layer=${feature_layer}}}" - + # (zh hubert), the arguments we used on aishell3 + # s3prl_conf="{upstream=${s3prl_upstream_name},path_or_url=TencentGameMate/chinese-hubert-large}" + # kmeans_feature_type=s3prl + # kmeans_feature_conf={type=${kmeans_feature_type},conf={s3prl_conf=${s3prl_conf},download_dir=ckpt,multilayer_feature=False,layer=${feature_layer}}} scripts/feats/perform_kmeans.sh \ --stage ${discrete_stage} \ --stop_stage ${discrete_stop_stage} \ @@ -606,6 +609,7 @@ if ! 
"${skip_data_prep}"; then --datadir "${dumpdir}/raw" \ --featdir "${feature_dir}" \ --audio_format "${audio_format}" \ + --audio_sample_rate "${fs}" \ --feature_type ${kmeans_feature_type} \ --layer "${feature_layer}" \ --feature_conf "${kmeans_feature_conf}" \ diff --git a/egs2/aishell3/tts1/local/data.sh b/egs2/aishell3/tts1/local/data.sh index 32df0584444..e4be9332c9c 100755 --- a/egs2/aishell3/tts1/local/data.sh +++ b/egs2/aishell3/tts1/local/data.sh @@ -34,6 +34,7 @@ fi db_root=${AISHELL3} if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + mkdir -p ${db_root} log "stage -1: download data from openslr" local/download_and_untar.sh "${db_root}" "https://www.openslr.org/resources/93/data_aishell3.tgz" data_aishell3.tgz fi @@ -78,19 +79,19 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then utils/fix_data_dir.sh data/${x} done fi - +# use {dset},_phn here, to be consistent with mfa if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then log "stage 3: split for development set" utils/subset_data_dir.sh data/train 250 data/dev utils/subset_data_dir.sh data/train_phn 250 data/dev_phn utils/copy_data_dir.sh data/train data/train_no_dev - utils/copy_data_dir.sh data/train_phn data/train_phn_no_dev + utils/copy_data_dir.sh data/train_phn data/train_no_dev_phn utils/filter_scp.pl --exclude data/dev/wav.scp \ data/train/wav.scp > data/train_no_dev/wav.scp utils/filter_scp.pl --exclude data/dev_phn/wav.scp \ - data/train_phn/wav.scp > data/train_phn_no_dev/wav.scp + data/train_phn/wav.scp > data/train_no_dev_phn/wav.scp utils/fix_data_dir.sh data/train_no_dev - utils/fix_data_dir.sh data/train_phn_no_dev + utils/fix_data_dir.sh data/train_no_dev_phn fi log "Successfully finished. [elapsed=${SECONDS}s]" diff --git a/egs2/aishell3/tts2/README.md b/egs2/aishell3/tts2/README.md new file mode 100644 index 00000000000..9b7ba351187 --- /dev/null +++ b/egs2/aishell3/tts2/README.md @@ -0,0 +1,209 @@ +# AISHELL3 RECIPE + +This is the recipe of Mandrain multi-speaker TTS2 model with [aishell3](https://www.openslr.org/93/) corpus. + +See the following pages for running on clusters. They can help you to set the environment and get familiar with ESPNet's repo structure. +- [PSC usage tutorial](https://www.wavlab.org/activities/2022/psc-usage/) +- [Espnet recipe tutorial](https://github.com/espnet/notebook/blob/master/ESPnet2/Course/CMU_SpeechRecognition_Fall2022/recipe_tutorial.ipynb) + + +## Brief on TTS2 + +- In terms of features + + ``tts2`` uses discrete acoustic features instead of continuous features in ``tts1``. Current TEMPLATE supports the discrete FastSpeech2 model training. +- In terms of data + + ``tts2`` additionally requires duration information, which can be obtained from **Speech-Text Alignment Tools** (tacotron teacher model or mfa). According to the [FastSpeech2](https://arxiv.org/pdf/2006.04558) paper, mfa has a higher quality. + + +## Run the Recipe + +🌟 Please notice that most of the ``bash files`` are symbolic linked from the TEMPLATE. It might be updated by later commits using other corpus, so please double check and customize the parameters before your run. + +Here is the basic order for running scripts, followed by more details. + +1. ``./local/run_mfa.sh`` +2. ``./run_train_teacher.sh`` (to stage 8, must use teacher forcing in decoding) +3. ``./run_train_teacher.sh`` (stage 6 only, to extract energy and pitch) +4. ``./run.sh`` (to stage 8, use custom cn_hubert, layer17 for large) +5. 
Train a vocoder with [PWG](https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/egs) (we use a discrete HiFi-GAN here) +6. ``./run.sh`` (stage 9 only) +7. Evaluate the generated wavs using the [scripts here](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#evaluation) + + +### 1. Data Preparation + +* Download the aishell-3 dataset (train set & test set). +* Trim silence to improve efficiency and potentially improve the generated waveform quality by cutting off noise. +* Get the initial ``{dset}_phn`` dictionary. +* Split 250 samples from the train set to form the dev set. + +``` +//{dset}/text sample +SSB00050353 深交所副总经理周明指出 +//{dset}_phn/text sample +SSB00050353 shen1 jiao1 suo3 fu4 zong3 jing1 li3 zhou1 ming2 zhi3 chu1 +``` + + NOTE: Parameters like ``fs`` and ``n_fft`` in ``trim_silence.sh`` don't have to be the same as those in ``run.sh``, since they only determine the precision of silence trimming; different parameter settings give roughly the same outcome (a corpus with minimal remaining silence). + +### 2. Train the teacher model +Following ``tts1``, we train a Tacotron2 model to be the teacher model for FastSpeech2 in ``tts2``. + +Setting ``audio_format=wav`` is recommended, as wav can be processed directly if you want to use x-vectors. You can also use ``flac``, but then take ``egs2/librispeech/asr1/local/data.sh`` as a reference for the ``uttid path-to-utt`` entries. + +Remember to keep ``fs`` and ``hop_size`` (and hence the frame shift) the same for the teacher model and the student model; only then can the soft targets generated by the teacher Tacotron2 align with the FastSpeech2 input. + +More specifically, the script can be executed by: + +``` +# Train the teacher model. Total steps >= 100k is recommended. +./run_train_teacher.sh --stage 2 --stop_stage 7 +``` + +Note that ``test_set`` doesn't need all of the processing here, since only the pseudo labels from ``train_set`` and ``valid_set`` are required. Skipping some steps on ``test_set``, e.g. MFA and teacher-forcing decoding, is feasible. + +However, it is better to still specify ``--test_sets`` in stages 1-3: the test-set phonemes are obtained by grapheme-to-phoneme conversion with the new G2P model trained during MFA, the ``wav.scp`` from stage 2 can be reused in the vocoder part, and the ``spk_emb`` extracted in stage 3 can be used in the overall decoding test. + +Then generate the pseudo labels from ``train_set`` and ``valid_set``. + +``` +# use teacher forcing in decoding +./run_train_teacher.sh --stage 8 --stop_stage 8 \ + --tts_exp exp/tts_train_teacher_raw_phn_none \ + --test_sets "train_no_dev_phn dev_phn" \ + --inference_args "--use_teacher_forcing true" \ + --inference_model 50epoch.pth +``` + +### 3. Extract additional features + +Calculate pitch and energy for FastSpeech2 (still following ``tts1``). +``` +./run_train_teacher.sh --stage 6 --stop_stage 6 \ + --train_config conf/train_fastspeech2.yaml \ + --teacher_dumpdir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch \ + --tts_stats_dir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch/stats \ + --write_collected_feats true +``` + +### 4. Train discrete FastSpeech2 +The datasets include text, durations, speech, discrete speech, pitch, energy, and spkembs. We use cn_hubert (a HuBERT model pretrained on Mandarin) here for discrete TTS feature extraction.
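+
+For intuition, the *discrete speech* entries are frame-level k-means cluster indices computed over SSL features (here, cn_hubert layer 17). The sketch below only illustrates that idea with dummy features and scikit-learn; the recipe itself performs the real extraction through ``scripts/feats/perform_kmeans.sh`` in the commands that follow.
+
+```python
+# Toy illustration of discretizing frame-level SSL features with k-means.
+# Dummy data only; the shapes and n_clusters are assumptions, not the recipe's outputs.
+import numpy as np
+from sklearn.cluster import MiniBatchKMeans
+
+feats = np.random.randn(1000, 1024).astype(np.float32)  # (frames, feat_dim), e.g. HuBERT-large layer 17
+km = MiniBatchKMeans(n_clusters=100, batch_size=256, random_state=0).fit(feats)
+units = km.predict(feats)  # one cluster index per frame -> the "discrete speech" stream
+print(units[:10])
+```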
+ +``` +# Process test_set for stage 6. The discrete unit will be used in the vocoder part. Modify the bash file to avoid reprocessing train_set + +./local/data.sh --stage 1 --stop_stage 1 +./run.sh --stage 2 --stop_stage 2 +``` + +``` +# It is recommended to modify tts2.sh, switching the English HuBERT to the Chinese HuBERT, for aishell3 customization. +./run.sh --stage 5 --stop_stage 6 --s3prl_upstream_name hf_hubert_custom --feature_layer 17 + +./run.sh --stage 8 --stop_stage 8 --s3prl_upstream_name hf_hubert_custom --feature_layer 17 \ + --teacher_dumpdir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch \ + --tts2_stats_dir exp/tts_train_teacher_raw_phn_none/decode_teacher_use_teacher_forcingtrue_50epoch/stats \ + --tts2_exp exp/tts_fastspeech2_raw_phn_none_cn_hubert + +``` + +### 5. Train a vocoder +A vocoder customized for aishell3 discrete features is necessary to generate ``wav`` from the discrete HuBERT features. + +The tts2 vocoder is not a mel-to-wav model, so the goal here is not to reuse a mel-spectrogram vocoder as in ``tts1``, but to train a dedicated vocoder that maps discrete features to waveforms. + +We use the [PWG repo](https://github.com/kan-bayashi/ParallelWaveGAN/tree/master/egs), and here are the detailed steps: + + +* ``git clone https://github.com/kan-bayashi/ParallelWaveGAN.git``, then ``cd ParallelWaveGAN/egs/aishell3/hubert_voc1``. + +* Collect the hubert text files into a single file + + ```shell + cat path/to/train_hubert.txt path/to/dev_hubert.txt path/to/test_hubert.txt > path/to/newfile_all.txt + ``` +* Modify ``hubert_text`` in ./run.sh. Follow the instructions in stage 0 to symlink the (silence-trimmed) data. The ``wav`` format is better supported by kaldiio than ``flac``. Note that aishell3 contains speakers with unknown IDs, so we don't use sid. + +* Modify ``num_embs`` (equal to the number of k-means clusters), ``batch_max_steps`` (as the comment suggests), and other custom parameters in the config file ``conf/hifigan_hubert_24k.v1.yaml``. + +* Start feature extraction and training from stage 1. + + +### 6. Inference +Run the inference stage of the espnet2 recipe with your trained vocoder. Waveforms will be generated directly this time. + +``` +./run.sh --stage 9 --stop_stage 9 --tts2_exp exp/tts_fastspeech2_raw_phn_none_cn_hubert +``` + +### 7. Evaluate model performance +Please follow the [scripts here](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#evaluation). + + +## Other references + +**Speech-Text Alignment Tools** + +The token durations are predicted using speech-text alignment tools, which can be either a forced aligner or an attention-based autoregressive model (e.g., Tacotron2). Please refer to [Alignment from Tacotron2](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#fastspeech-training) and [Montreal Forced Aligner (MFA)](https://github.com/espnet/espnet/tree/master/egs2/TEMPLATE/tts1#new-mfa-aligments-generation) for details. + + +**MFA** + +First, make sure ``mfa`` has been prepared in the environment. +``` +cd ../../../tools +make mfa.done +cd - +``` + +Originally, ``Stage 1`` in ``run.sh`` calls ``local/data.sh``, but here we won't run ``Stage 1``; instead, we use + +``` +./local/run_mfa.sh +``` + +which is an entry point that calls ``scripts/mfa.sh``, which in turn calls ``local/data.sh``. With ``--train false``, this script downloads pretrained G2P and acoustic models; with ``--train true``, it generates the alignments. The generated results are stored in the ``{dset}_phn`` data. + +For aishell-3, we train a new G2P model on the ``mandarin_china_mfa`` dictionary and generate the lexicon. Then we train the MFA speech-text alignment model.
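+
+As a hedged sketch (not the recipe's actual conversion code) of how alignments become the per-phone durations FastSpeech2 consumes: each phone interval in seconds is mapped to a number of acoustic frames using the sampling rate and hop size, which is why ``fs`` and ``hop_size`` must stay consistent across the pipeline. The phone labels, times, and parameter values below are made-up examples.
+
+```python
+# Hypothetical example: converting alignment intervals (seconds) to frame-level durations.
+fs = 16000      # assumed sampling rate
+hop_size = 320  # assumed hop size in samples (20 ms frames)
+
+alignment = [("sh", 0.00, 0.10), ("en1", 0.10, 0.26), ("j", 0.26, 0.40)]  # (phone, start_sec, end_sec)
+durations = [
+    round(end * fs / hop_size) - round(start * fs / hop_size)
+    for _, start, end in alignment
+]
+print(durations)  # [5, 8, 7] frames for this toy alignment
+```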
+ +If you want to use the durations extracted by MFA, you can then continue training with the main script from ``Stage 2``: + +``` +./run.sh --stage 2 --stop_stage 2 --teacher_dumpdir "data" +``` + +### Multi-Speaker tts2 + +In the multi-speaker scenario, adding a speaker ID or a speaker embedding (specified with ``--use_sid`` or ``--use_spk_embed``) helps the model tell speakers apart. But since aishell-3 is not a fixed-speaker corpus, i.e. it contains speakers with unknown IDs, we use speaker embeddings here. + +**Speaker Embeddings** + +ESPnet supports several types of speaker embeddings (kaldi x-vector, speechbrain, espnet_spk). The recently proposed espnet_spk shows state-of-the-art performance on many tasks, so we use it here. + + +### Discrete Speech Challenge Baseline +
+| Model | MCD ⬇️ | Log F0 RMSE ⬇️ | CER ⬇️ | UTMOS ⬆️ |
+| --- | --- | --- | --- | --- |
+| cn_hubert-large-layer17 | 8.5473 ± 0.9407 | 0.3032 ± 0.1354 | 42.3 | 1.7565 ± 0.3628 |
+ +* CER is calculated using openai-whisper-large, on Chinese characters. diff --git a/egs2/aishell3/tts2/cmd.sh b/egs2/aishell3/tts2/cmd.sh new file mode 100644 index 00000000000..2aae6919fef --- /dev/null +++ b/egs2/aishell3/tts2/cmd.sh @@ -0,0 +1,110 @@ +# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== +# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> +# e.g. +# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB +# +# Options: +# --time