Merge pull request espnet#5832 from Darshan7575/master
WIP: ESPnet Multi-convformer implementation
sw005320 authored Jul 22, 2024
2 parents f2431f4 + ab1d0a6 commit b8b31c0
Showing 10 changed files with 1,175 additions and 1 deletion.
55 changes: 55 additions & 0 deletions egs2/librispeech/asr1/README.md
@@ -206,6 +206,61 @@
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|65818|97.7|1.6|0.7|0.4|2.7|25.7|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|65101|94.5|3.9|1.5|1.0|6.4|45.1|

# Multiconvformer
- Params: 147.41 M
- ASR config: [conf/tuning/train_asr_multiconvformer_conv_fusion.yaml](conf/tuning/train_asr_multiconvformer_conv_fusion.yaml)
- LM config: [conf/tuning/train_lm_transformer2.yaml](conf/tuning/train_lm_transformer2.yaml)
- Model link: [https://huggingface.co/Darshan7575/librispeech_960_multiconvformer_ctcatt_conv_fusion](https://huggingface.co/Darshan7575/librispeech_960_multiconvformer_ctcatt_conv_fusion)
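A minimal decoding sketch with this checkpoint (assuming `espnet` and `espnet_model_zoo` are installed; `sample.wav` is a placeholder for any 16 kHz mono recording, and the decoding options shown are illustrative, not the recipe defaults):

```python
import soundfile as sf
from espnet2.bin.asr_inference import Speech2Text

# Pull the pretrained Multiconvformer from Hugging Face and build an
# inference wrapper around it.
speech2text = Speech2Text.from_pretrained(
    "Darshan7575/librispeech_960_multiconvformer_ctcatt_conv_fusion",
    beam_size=10,    # illustrative decoding options
    ctc_weight=0.3,
)

speech, rate = sf.read("sample.wav")  # 16 kHz mono, like LibriSpeech
text, tokens, token_ids, hyp = speech2text(speech)[0]
print(text)
```

The librispeech_100 and SLURP checkpoints linked further below load the same way with their respective model tags.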

# RESULTS
## Environments
- date: `Fri Mar 1 15:40:42 UTC 2024`
- python version: `3.9.16 (main, May 15 2023, 23:46:34) [GCC 11.2.0]`
- espnet version: `espnet 202402`
- pytorch version: `pytorch 2.1.2+cu118`
- Git hash: `a50d6a0c8c31b4ef775473a657de031a40be30c1`
- Commit date: `Mon Feb 19 07:37:52 2024 -0500`

## exp/asr_train_asr_multiconvformer_conv_fusion_raw_en_bpe5000_sp
### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_asr_model_valid.acc.ave/dev_clean|2703|54402|98.2|1.6|0.2|0.2|2.0|25.8|
|decode_asr_asr_model_valid.acc.ave/dev_other|2864|50948|95.7|3.9|0.3|0.5|4.7|41.4|
|decode_asr_asr_model_valid.acc.ave/test_clean|2620|52576|98.1|1.7|0.2|0.3|2.2|26.9|
|decode_asr_asr_model_valid.acc.ave/test_other|2939|52343|95.9|3.8|0.3|0.6|4.7|42.6|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_clean|2703|54402|98.4|1.4|0.2|0.2|1.7|23.2|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_other|2864|50948|96.7|2.9|0.3|0.3|3.6|34.3|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|52576|98.3|1.5|0.2|0.2|1.9|23.4|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|52343|96.5|3.1|0.5|0.4|3.9|38.0|
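In these tables, Snt and Wrd are reference sentence and word counts, Corr/Sub/Del/Ins are percentages of the reference words, and Err is their standard combination:

$$\text{Err} = \frac{S + D + I}{N} \times 100,$$

e.g. $1.6 + 0.2 + 0.2 = 2.0$ for dev_clean without the LM. S.Err is the percentage of sentences containing at least one error. The same layout applies to the CER and TER tables below.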

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_asr_model_valid.acc.ave/dev_clean|2703|288456|99.5|0.3|0.2|0.2|0.7|25.8|
|decode_asr_asr_model_valid.acc.ave/dev_other|2864|265951|98.5|0.9|0.6|0.5|2.0|41.4|
|decode_asr_asr_model_valid.acc.ave/test_clean|2620|281530|99.5|0.2|0.2|0.2|0.7|26.9|
|decode_asr_asr_model_valid.acc.ave/test_other|2939|272758|98.7|0.8|0.5|0.6|1.9|42.6|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_clean|2703|288456|99.5|0.2|0.2|0.2|0.6|23.2|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_other|2864|265951|98.7|0.7|0.6|0.4|1.7|34.3|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|281530|99.5|0.2|0.3|0.2|0.7|23.4|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|272758|98.7|0.7|0.6|0.4|1.7|38.0|

### TER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_asr_model_valid.acc.ave/dev_clean|2703|68010|97.7|1.7|0.6|0.3|2.6|25.8|
|decode_asr_asr_model_valid.acc.ave/dev_other|2864|63110|94.7|4.0|1.3|0.8|6.1|41.4|
|decode_asr_asr_model_valid.acc.ave/test_clean|2620|65818|97.6|1.7|0.7|0.3|2.7|26.9|
|decode_asr_asr_model_valid.acc.ave/test_other|2939|65101|95.0|3.6|1.4|0.7|5.7|42.6|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_clean|2703|68010|97.9|1.4|0.7|0.3|2.4|23.2|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_other|2864|63110|95.5|3.1|1.4|0.6|5.1|34.3|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|65818|97.8|1.4|0.8|0.3|2.4|23.4|
|decode_asr_lm_lm_train_lm_transformer2_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|65101|95.5|2.9|1.6|0.5|5.0|38.0|

# E-Branchformer
- Params: 148.92 M
- ASR config: [conf/tuning/train_asr_e_branchformer.yaml](conf/tuning/train_asr_e_branchformer.yaml)
82 changes: 82 additions & 0 deletions egs2/librispeech/asr1/conf/tuning/train_asr_multiconvformer_conv_fusion.yaml
@@ -0,0 +1,82 @@
# Trained with A100 (80 GB) x 2 GPUs. It takes 110 minutes per epoch.
encoder: multiconv_conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    selfattention_layer_type: rel_selfattn
    pos_enc_layer_type: rel_pos
    rel_pos_type: latest
    cgmlp_linear_units: 3072
    multicgmlp_type: concat_fusion
    multicgmlp_kernel_sizes: 7,15,23,31
    use_linear_after_conv: false
    gate_activation: identity
    num_blocks: 18
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    layer_drop_rate: 0.1
    linear_units: 1024
    positionwise_layer_type: linear
    macaron_style: true
    use_cnn_module: true

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

frontend_conf:
    n_fft: 512
    hop_length: 160

use_amp: true
unused_parameters: true
num_workers: 4
batch_type: numel
batch_bins: 70000000
accum_grad: 2
max_epoch: 80
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
nbest_averaging_interval: 10

optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10
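With `scheduler: warmuplr`, the learning rate warms up to the configured `lr` and then decays. Assuming ESPnet's `WarmupLR` schedule (a Noam-style warmup decoupled from the model size), the rate at optimizer step $t$ is

$$\eta(t) = \eta_{\text{peak}} \cdot w^{1/2} \cdot \min\left(t^{-1/2},\ t \cdot w^{-3/2}\right),$$

so with `lr: 0.002` and `warmup_steps: 40000` above, the rate grows linearly to its peak of 0.002 at step 40k and decays as $t^{-1/2}$ afterwards. Note that `accum_grad: 2` additionally doubles the effective batch beyond `batch_bins`.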
43 changes: 42 additions & 1 deletion egs2/librispeech_100/asr1/README.md
@@ -1,3 +1,45 @@
# Multiconvformer
- Params: 37.21 M
- ASR config: [conf/tuning/train_asr_multiconvformer_conv_fusion_linear1024.yaml](conf/tuning/train_asr_multiconvformer_conv_fusion_linear1024.yaml)
- Model link: [https://huggingface.co/Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion](https://huggingface.co/Darshan7575/librispeech_100_multiconvformer_ctcatt_conv_fusion)

# RESULTS
## Environments
- date: `Sun Jan 28 23:50:53 UTC 2024`
- python version: `3.9.16 (main, Mar 8 2023, 14:00:05) [GCC 11.2.0]`
- espnet version: `espnet 202304`
- pytorch version: `pytorch 2.1.2+cu118`
- Git hash: `3651c2e67126c4544820cf148407be7f2679866c`
- Commit date: `Sat Jul 1 14:46:46 2023 +0000`

## exp/librispeech_100_multiconvformer_conv_fusion
### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_clean|2703|54402|94.8|4.8|0.3|0.7|5.9|53.8|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_other|2864|50948|85.4|13.2|1.4|2.0|16.6|78.8|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|52576|94.5|5.0|0.4|0.7|6.2|55.5|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|52343|85.0|13.6|1.5|2.0|17.0|80.5|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_clean|2703|288456|98.3|1.0|0.7|0.6|2.3|53.8|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_other|2864|265951|93.6|4.0|2.4|2.0|8.4|78.8|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|281530|98.3|1.0|0.7|0.6|2.4|55.5|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|272758|93.6|3.8|2.6|1.9|8.2|80.5|

### TER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_clean|2703|69558|92.5|4.7|2.8|0.6|8.1|53.8|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/dev_other|2864|64524|82.0|12.9|5.0|2.4|20.4|78.8|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_clean|2620|66983|92.4|4.8|2.8|0.6|8.2|55.5|
|decode_asr_lm_lm_train_en_bpe5000_valid.loss.ave_asr_model_valid.acc.ave/test_other|2939|66650|81.6|12.9|5.5|2.2|20.6|80.5|

# E-Branchformer
## Environments
- date: `Mon Dec 12 06:50:58 CST 2022`
@@ -40,7 +82,6 @@
|decode_asr_asr_model_valid.acc.ave/test_clean|2620|66983|92.2|4.9|2.9|0.6|8.4|56.1|
|decode_asr_asr_model_valid.acc.ave/test_other|2939|66650|81.5|13.0|5.5|2.2|20.7|80.3|


# E-Branchformer with CTC
## Environments
- date: `Sun Jan 1 15:05:07 CST 2023`
84 changes: 84 additions & 0 deletions egs2/librispeech_100/asr1/conf/tuning/train_asr_multiconvformer_conv_fusion_linear1024.yaml
@@ -0,0 +1,84 @@
# Trained with A100 (80 GB) x 1 GPU. It takes 15 minutes per epoch.
encoder: multiconv_conformer
encoder_conf:
    output_size: 256
    attention_heads: 4
    selfattention_layer_type: rel_selfattn
    pos_enc_layer_type: rel_pos
    rel_pos_type: latest
    cgmlp_linear_units: 1024
    multicgmlp_type: concat_fusion
    multicgmlp_kernel_sizes: 7,15,23,31
    multicgmlp_merge_conv_kernel: 31
    use_linear_after_conv: false
    gate_activation: identity
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    layer_drop_rate: 0.0
    linear_units: 1024
    positionwise_layer_type: linear
    macaron_style: true
    use_cnn_module: true

decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
    layer_drop_rate: 0.0

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

frontend_conf:
    n_fft: 512
    win_length: 400
    hop_length: 160

seed: 2022
num_workers: 4
batch_type: numel
batch_bins: 16000000
accum_grad: 4
max_epoch: 70
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
use_amp: true

optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 15000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 5
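The core Multiconvformer idea configured above (`multicgmlp_type: concat_fusion` with `multicgmlp_kernel_sizes: 7,15,23,31`) runs several convolutional gating branches with different depthwise kernel sizes in parallel and concatenates their outputs before a merge convolution and projection. The toy sketch below shows only that multi-kernel concat-and-merge pattern; it omits the CGMLP gating and is not ESPnet's `MultiConvolutionalGatingMLP` — the class and its defaults are illustrative:

```python
import torch
import torch.nn as nn

class ToyMultiConvFusion(nn.Module):
    """Parallel depthwise convs with different kernels, fused by concatenation."""

    def __init__(self, dim: int, kernel_sizes=(7, 15, 23, 31), merge_kernel: int = 31):
        super().__init__()
        # One depthwise conv per kernel size; odd kernels + k//2 padding keep length.
        self.branches = nn.ModuleList(
            nn.Conv1d(dim, dim, k, padding=k // 2, groups=dim) for k in kernel_sizes
        )
        n = len(kernel_sizes)
        # Depthwise merge conv over the concatenated branches
        # (cf. multicgmlp_merge_conv_kernel: 31 above), then project back to dim.
        self.merge = nn.Conv1d(n * dim, n * dim, merge_kernel,
                               padding=merge_kernel // 2, groups=n * dim)
        self.proj = nn.Linear(n * dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, time, dim) -> conv layout (batch, dim, time)
        h = x.transpose(1, 2)
        h = torch.cat([branch(h) for branch in self.branches], dim=1)
        h = self.merge(h).transpose(1, 2)  # back to (batch, time, n * dim)
        return self.proj(h)

x = torch.randn(2, 100, 256)
print(ToyMultiConvFusion(256)(x).shape)  # torch.Size([2, 100, 256])
```

Each kernel size captures a different local context width, which is why the recipe sweeps from 7 up to 31 rather than using a single Conformer-style kernel.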
26 changes: 26 additions & 0 deletions egs2/slurp_entity/asr1/README.md
@@ -1,3 +1,29 @@
# Multiconvformer
- Params: 108.09 M
- ASR config: [conf/tuning/train_asr_multiconv_e12_mlp3072_linear2048_layerdrop.yaml](conf/tuning/train_asr_multiconv_e12_mlp3072_linear2048_layerdrop.yaml)
- Model link: [https://huggingface.co/Darshan7575/slurp_multiconvformer_conv_fusion](https://huggingface.co/Darshan7575/slurp_multiconvformer_conv_fusion)

# RESULTS
## Environments
- date: `Wed Feb 21 01:04:03 EST 2024`
- python version: `3.9.18 (main, Sep 11 2023, 13:41:44) [GCC 11.2.0]`
- espnet version: `espnet 202310`
- pytorch version: `pytorch 2.1.2+cu118`
- Git hash: `edb6ec64bb5d4f2c68a3b81674f0c2822e2e5b58`
- Commit date: `Fri Feb 9 21:26:35 2024 +0530`

### Intent Classification

- Valid Intent Classification Result: 0.8882623705408516
- Test Intent Classification Result: 0.8737574552683897

### Entity

|Dataset|Precision|Recall|SLU-F1|
|:---:|:---:|:---:|:---:|
| test | 0.8076 | 0.7710 | 0.7889 |
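As a sanity check, the reported SLU-F1 is the harmonic mean of precision and recall:

$$F_1 = \frac{2PR}{P + R} = \frac{2 \times 0.8076 \times 0.7710}{0.8076 + 0.7710} \approx 0.7889.$$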


# E-Branchformer

- ASR config: [conf/tuning/train_asr_e_branchformer_e12_mlp3072_linear1024_layerdrop.yaml](conf/tuning/train_asr_e_branchformer_e12_mlp3072_linear1024_layerdrop.yaml)
Expand Down
78 changes: 78 additions & 0 deletions egs2/slurp_entity/asr1/conf/tuning/train_asr_multiconv_e12_mlp3072_linear2048_layerdrop.yaml
@@ -0,0 +1,78 @@
# network architecture
# encoder related
encoder: multiconv_conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    selfattention_layer_type: rel_selfattn
    pos_enc_layer_type: rel_pos
    rel_pos_type: latest
    cgmlp_linear_units: 3072
    multicgmlp_type: concat_fusion
    multicgmlp_kernel_sizes: 7,15,23,31
    multicgmlp_merge_conv_kernel: 31
    use_linear_after_conv: false
    gate_activation: identity
    num_blocks: 12  # could be increased by one block to match the E-Branchformer setup
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    layer_drop_rate: 0.1
    linear_units: 1152
    positionwise_layer_type: linear
    macaron_style: true
    use_cnn_module: true

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
    layer_drop_rate: 0.2

optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 35000

unused_parameters: true
batch_type: folded
batch_size: 64
accum_grad: 1
max_epoch: 60
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false
    # "false" generates dummy stats files during stats collection (stage 10)
    # instead of extracting features with a frontend forward pass.
    extract_feats_in_collect_stats: false

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 30
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_range:
    - 0
    - 40
    num_time_mask: 2