From 6a9c21a40882dc59676dc882954b290e0a867960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Mon, 19 Feb 2024 17:05:49 +0800 Subject: [PATCH] aishell example --- ..._paraformer_conformer_12e_6d_2048_256.yaml | 0 examples/aishell/paraformer/run.sh | 38 ++++++++++--------- .../paraformer/finetune.sh | 10 +++-- funasr/auto/auto_frontend.py | 1 - funasr/auto/auto_model.py | 5 ++- funasr/datasets/audio_datasets/scp2jsonl.py | 6 ++- setup.py | 1 - 7 files changed, 34 insertions(+), 27 deletions(-) rename examples/aishell/{ => paraformer}/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml (100%) diff --git a/examples/aishell/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml b/examples/aishell/paraformer/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml similarity index 100% rename from examples/aishell/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml rename to examples/aishell/paraformer/conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml diff --git a/examples/aishell/paraformer/run.sh b/examples/aishell/paraformer/run.sh index 7972a13c0..3f485c207 100755 --- a/examples/aishell/paraformer/run.sh +++ b/examples/aishell/paraformer/run.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -. ./path.sh || exit 1; +workspace=`pwd` # machines configuration CUDA_VISIBLE_DEVICES="0,1" @@ -39,7 +39,7 @@ train_set=train valid_set=dev test_sets="dev test" -asr_config=conf/train_asr_paraformer_conformer_12e_6d_2048_256.yaml +asr_config=train_asr_paraformer_conformer_12e_6d_2048_256.yaml model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}" #inference_config=conf/decode_asr_transformer_noctc_1best.yaml @@ -74,19 +74,21 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text - python funasr/datasets/audio_datasets/scp2jsonl.py \ - ++scp_file_list='["${feats_dir}/data/${x}/wav.scp", "${feats_dir}/data/${x}/text"]' \ + # convert wav.scp text to jsonl + scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'" + python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \ ++data_type_list='["source", "target"]' \ - ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl + ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \ + ${scp_file_list_arg} done fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "stage 1: Feature and CMVN Generation" # utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0 - python funasr/bin/compute_audio_cmvn.py \ - --config-path "/Users/zhifu/funasr1.0/examples/aishell/conf" \ - --config-name "train_asr_paraformer_conformer_12e_6d_2048_256.yaml" \ + python ../../../funasr/bin/compute_audio_cmvn.py \ + --config-path "${workspace}" \ + --config-name "${asr_config}" \ ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \ ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json" \ ++dataset_conf.num_workers=$nj @@ -116,16 +118,16 @@ fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then echo "stage 4: ASR Training" -torchrun \ ---nnodes 1 \ ---nproc_per_node ${gpu_num} \ -funasr/bin/train.py \ ---config-path "/Users/zhifu/funasr1.0/examples/aishell/conf" \ ---config-name "train_asr_paraformer_conformer_12e_6d_2048_256.yaml" \ -++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \ -++cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \ -++token_list="${token_list}" \ -++output_dir="${exp_dir}/exp/${model_dir}" + torchrun \ + --nnodes 1 \ + --nproc_per_node ${gpu_num} \ + ../../../funasr/bin/train.py \ + --config-path "${workspace}" \ + --config-name "${asr_config}" \ + ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \ + ++cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \ + ++token_list="${token_list}" \ + ++output_dir="${exp_dir}/exp/${model_dir}" fi # diff --git a/examples/industrial_data_pretraining/paraformer/finetune.sh b/examples/industrial_data_pretraining/paraformer/finetune.sh index 5fc7481d9..394861b77 100644 --- a/examples/industrial_data_pretraining/paraformer/finetune.sh +++ b/examples/industrial_data_pretraining/paraformer/finetune.sh @@ -6,10 +6,12 @@ #git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git ${local_path} ## generate jsonl from wav.scp and text.txt -#python funasr/datasets/audio_datasets/scp2jsonl.py \ -#++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ -#++data_type_list='["source", "target"]' \ -#++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl +python funasr/datasets/audio_datasets/scp2jsonl.py \ +++scp_file_list='["/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt"]' \ +++data_type_list='["source", "target"]' \ +++jsonl_file_out=/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl + + # torchrun \ # --nnodes 1 \ # --nproc_per_node 1 \ diff --git a/funasr/auto/auto_frontend.py b/funasr/auto/auto_frontend.py index 661f94939..8f2f06920 100644 --- a/funasr/auto/auto_frontend.py +++ b/funasr/auto/auto_frontend.py @@ -19,7 +19,6 @@ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank from funasr.utils.timestamp_tools import timestamp_sentence from funasr.models.campplus.utils import sv_chunk, postprocess, distribute_spk -from funasr.models.campplus.cluster_backend import ClusterBackend from funasr.auto.auto_model import prepare_data_iterator diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py index 13451570f..e95cfd8d1 100644 --- a/funasr/auto/auto_model.py +++ b/funasr/auto/auto_model.py @@ -20,7 +20,10 @@ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank from funasr.utils.timestamp_tools import timestamp_sentence from funasr.models.campplus.utils import sv_chunk, postprocess, distribute_spk -from funasr.models.campplus.cluster_backend import ClusterBackend +try: + from funasr.models.campplus.cluster_backend import ClusterBackend +except: + print("If you want to use the speaker diarization, please `pip install hdbscan`") def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): diff --git a/funasr/datasets/audio_datasets/scp2jsonl.py b/funasr/datasets/audio_datasets/scp2jsonl.py index c60c6f577..b6df34ae3 100644 --- a/funasr/datasets/audio_datasets/scp2jsonl.py +++ b/funasr/datasets/audio_datasets/scp2jsonl.py @@ -19,7 +19,7 @@ def gen_jsonl_from_wav_text_list(path, data_type_list=("source", "target"), json world_size = 1 cpu_cores = os.cpu_count() or 1 - + print(f"convert wav.scp text to jsonl, ncpu: {cpu_cores}") if rank == 0: json_dict = {} for data_type, data_file in zip(data_type_list, path): @@ -65,7 +65,7 @@ def parse_context_length(data_list: list, data_type: str): sample_num = len(waveform) context_len = int(sample_num//16000*1000/10) else: - context_len = len(line) + context_len = len(line.split()) if " " in line else len(line) res[key] = {data_type: line, f"{data_type}_len": context_len} return res @@ -83,6 +83,8 @@ def main_hydra(cfg: DictConfig): kwargs = OmegaConf.to_container(cfg, resolve=True) scp_file_list = kwargs.get("scp_file_list", ("/Users/zhifu/funasr1.0/test_local/wav.scp", "/Users/zhifu/funasr1.0/test_local/text.txt")) + if isinstance(scp_file_list, str): + scp_file_list = eval(scp_file_list) data_type_list = kwargs.get("data_type_list", ("source", "target")) jsonl_file_out = kwargs.get("jsonl_file_out", "/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonl") gen_jsonl_from_wav_text_list(scp_file_list, data_type_list=data_type_list, jsonl_file_out=jsonl_file_out) diff --git a/setup.py b/setup.py index 561dea2b6..f703bb494 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ # "textgrid", # "protobuf", "tqdm", - "hdbscan", "umap_learn", "jaconv", "hydra-core>=1.3.2",