-
Notifications
You must be signed in to change notification settings - Fork 798
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
29 changed files
with
2,454 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
ID0012W0013 当客户风险承受能力评估依据发生变化时 | ||
ID0012W0014 杨涛不得不将工厂关掉 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
ID0012W0013 /Users/zhifu/funasr_github/test_local/aishell2_dev_ios/wav/D0012/ID0012W0013.wav | ||
ID0012W0014 /Users/zhifu/funasr_github/test_local/aishell2_dev_ios/wav/D0012/ID0012W0014.wav |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
|
||
# Path of the training entry script, relative to the directory this launcher
# is run from.
train_entry="funasr/bin/train.py"

# Start training. The config location and every ++key=value override are
# handed to the entry script unchanged; all paths below are machine-specific
# and need to be adapted before running elsewhere.
python "$train_entry" \
  --config-path "/Users/zhifu/funasr_github/test_local/funasr_cli_egs" \
  --config-name "config.yaml" \
  ++token_list="/Users/zhifu/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/tokens.txt" \
  ++train_data_set_list="/Users/zhifu/funasr_github/test_local/aishell2_dev_ios/asr_task_debug_len.jsonl" \
  ++output_dir="/nfs/zhifu.gzf/ckpt/funasr2/exp1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#!/bin/bash

# Copyright 2017   Xingyu Na
# Apache 2.0

# Builds wav.scp and text files for the AISHELL train/dev/test splits.
#
# Usage: aishell_data_prep.sh <audio-path> <text-path> <output-path>
#   <audio-path>   directory searched recursively for *.wav files; a file is
#                  assigned to a split when its path contains wav/train,
#                  wav/dev or wav/test (case-insensitive)
#   <text-path>    directory containing aishell_transcript_v0.8.txt
#   <output-path>  results are written to <output-path>/data/{train,dev,test}
#
# Must be run from a directory that contains utils/filter_scp.pl.
# Exits 0 on success and 1 on any usage, input or processing error.

#. ./path.sh || exit 1;

if [ $# != 3 ]; then
  echo "Usage: $0 <audio-path> <text-path> <output-path>"
  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
  exit 1
fi

aishell_audio_dir=$1
aishell_text=$2/aishell_transcript_v0.8.txt
output_dir=$3

# Validate the inputs before anything is created under <output-path>, so a
# mistyped argument does not leave empty directories behind.
if [ ! -d "$aishell_audio_dir" ] || [ ! -f "$aishell_text" ]; then
  echo "Error: $0: audio directory '$aishell_audio_dir' or transcript file '$aishell_text' not found" >&2
  exit 1
fi

# The filter helper is invoked by relative path; fail early with a clear
# message instead of silently producing empty wav.scp/text files.
if [ ! -x utils/filter_scp.pl ]; then
  echo "Error: $0: utils/filter_scp.pl not found or not executable in $(pwd)" >&2
  exit 1
fi

train_dir=$output_dir/data/local/train
dev_dir=$output_dir/data/local/dev
test_dir=$output_dir/data/local/test
tmp_dir=$output_dir/data/local/tmp

mkdir -p "$train_dir" "$dev_dir" "$test_dir" "$tmp_dir" || exit 1

# find wav audio file for train, dev and test resp.
find "$aishell_audio_dir" -iname "*.wav" > "$tmp_dir/wav.flist"
n=$(wc -l < "$tmp_dir/wav.flist")
n=$((n))  # drop the padding some wc implementations print around the count
if [ "$n" -ne 141925 ]; then
  echo "Warning: expected 141925 data data files, found $n"
fi

# grep exits non-zero when a split has no files, which aborts the script.
grep -i "wav/train" "$tmp_dir/wav.flist" > "$train_dir/wav.flist" || exit 1
grep -i "wav/dev" "$tmp_dir/wav.flist" > "$dev_dir/wav.flist" || exit 1
grep -i "wav/test" "$tmp_dir/wav.flist" > "$test_dir/wav.flist" || exit 1

rm -r "$tmp_dir"

# Transcriptions preparation
for dir in "$train_dir" "$dev_dir" "$test_dir"; do
  echo "Preparing $dir transcriptions"
  # Utterance id = file name without directory and ".wav".
  sed -e 's/\.wav//' "$dir/wav.flist" | awk -F '/' '{print $NF}' > "$dir/utt.list"
  paste -d' ' "$dir/utt.list" "$dir/wav.flist" > "$dir/wav.scp_all"
  # Keep only transcripts whose utterance has audio in this split ...
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$aishell_text" > "$dir/transcripts.txt" || exit 1
  # ... then keep only audio entries that actually have a transcript.
  awk '{print $1}' "$dir/transcripts.txt" > "$dir/utt.list"
  utils/filter_scp.pl -f 1 "$dir/utt.list" "$dir/wav.scp_all" | sort -u > "$dir/wav.scp" || exit 1
  sort -u "$dir/transcripts.txt" > "$dir/text" || exit 1
done

mkdir -p "$output_dir/data/train" "$output_dir/data/dev" "$output_dir/data/test" || exit 1

for f in wav.scp text; do
  cp "$train_dir/$f" "$output_dir/data/train/$f" || exit 1
  cp "$dev_dir/$f" "$output_dir/data/dev/$f" || exit 1
  cp "$test_dir/$f" "$output_dir/data/test/$f" || exit 1
done

echo "$0: AISHELL data preparation succeeded"
exit 0
105 changes: 105 additions & 0 deletions
105
examples/aishell/paraformer/local/download_and_untar.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/usr/bin/env bash

# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
#             2017  Xingyu Na
# Apache 2.0

# Downloads <url-base>/<corpus-part>.tgz into <data-base> (unless an archive
# of a known size is already there), un-tars it, and for data_aishell also
# unpacks the nested *.tar.gz files under <corpus-part>/wav.
#
# Usage: download_and_untar.sh [--remove-archive] <data-base> <url-base> <corpus-part>
#   --remove-archive  delete the .tgz after successful extraction
#   <data-base>       existing directory (absolute or relative) to download into
#   <url-base>        URL prefix; "/<corpus-part>.tgz" is appended
#   <corpus-part>     data_aishell or resource_aishell
#
# A <data-base>/<corpus-part>/.complete marker is written once everything has
# been extracted; later runs exit immediately when it is present.
# Exits 0 on success and 1 on any usage, download or extraction error.

remove_archive=false

if [ "$1" == --remove-archive ]; then
  remove_archive=true
  shift
fi

if [ $# -ne 3 ]; then
  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
  # Stop here: continuing with a wrong argument count would run on garbage.
  exit 1
fi

data=$1
url=$2
part=$3

if [ ! -d "$data" ]; then
  echo "$0: no such directory $data"
  exit 1
fi

# The script changes directory below and keeps referring to $data afterwards,
# so make it absolute; otherwise a relative <data-base> stops resolving.
data=$(cd "$data" && pwd) || exit 1

part_ok=false
list="data_aishell resource_aishell"
for x in $list; do
  if [ "$part" == "$x" ]; then part_ok=true; fi
done
if ! $part_ok; then
  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
  exit 1
fi

if [ -z "$url" ]; then
  echo "$0: empty URL base."
  exit 1
fi

if [ -f "$data/$part/.complete" ]; then
  echo "$0: data part $part was already successfully extracted, nothing to do."
  exit 0
fi

archive=$data/$part.tgz

# sizes of the archive files in bytes.
sizes="15582913665 1246920"

if [ -f "$archive" ]; then
  # -n prints numeric owner/group, so the size is always the fifth field.
  size=$(/bin/ls -ln "$archive" | awk '{print $5}')
  size_ok=false
  for s in $sizes; do
    if [ "$s" == "$size" ]; then size_ok=true; fi
  done
  if ! $size_ok; then
    echo "$0: removing existing file $archive because its size in bytes $size"
    echo "does not equal the size of one of the archives."
    rm "$archive"
  else
    echo "$archive exists and appears to be complete."
  fi
fi

cd "$data" || exit 1

if [ ! -f "$archive" ]; then
  if ! command -v wget >/dev/null; then
    echo "$0: wget is not installed."
    exit 1
  fi
  full_url=$url/$part.tgz
  echo "$0: downloading data from $full_url. This may take some time, please be patient."

  # NOTE(review): --no-check-certificate disables TLS certificate validation;
  # kept for compatibility with the original behaviour.
  if ! wget --no-check-certificate "$full_url"; then
    echo "$0: error executing wget $full_url"
    exit 1
  fi
fi

if ! tar -xvzf "$part.tgz"; then
  echo "$0: error un-tarring archive $archive"
  exit 1
fi

if [ "$part" == "data_aishell" ]; then
  cd "$data/$part/wav" || exit 1
  for wav in ./*.tar.gz; do
    # An unmatched glob stays literal; skip it rather than feed it to tar.
    [ -e "$wav" ] || continue
    echo "Extracting wav from $wav"
    if ! tar -zxf "$wav"; then
      echo "$0: error un-tarring nested archive $wav" >&2
      exit 1
    fi
    rm "$wav"
  done
fi

# Written only after every extraction step succeeded, so an interrupted or
# failed run is not mistaken for a finished one.
touch "$data/$part/.complete"

echo "$0: Successfully downloaded and un-tarred $archive"

if $remove_archive; then
  echo "$0: removing $archive file since --remove-archive option was supplied."
  rm "$archive"
fi

exit 0
Oops, something went wrong.