Merge pull request #102 from X-LANCE/ygr_pr1

Ygr pr1
X-LANCE · Jun 12, 2024 · 1aef5c1 · 1aef5c1
2 parents 806c368 + bacff89
commit 1aef5c1
Show file tree

Hide file tree

Showing 12 changed files with 897 additions and 1 deletion.
diff --git a/.gitignore b/.gitignore
@@ -9,4 +9,11 @@ wandb/
 log/
 *.log
 outputs/
-data/
+data/
+
+.gitignore
+examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
+examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
+examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
+scripts_all
+examples/hotwords_librispeech
diff --git a/examples/asr_librispeech/finetune_asr.py b/examples/asr_librispeech/finetune_asr.py
@@ -15,6 +15,9 @@ class RunConfig:
     fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
     debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
     metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
+    ckpt_path: Optional[str] = field(
+        default=None, metadata={"help": "The path to projector checkpoint"}
+    )
 
 @hydra.main(config_name=None, version_base=None)
 def main_hydra(cfg: DictConfig):

diff --git a/examples/mala_asr_slidespeech/README.md b/examples/mala_asr_slidespeech/README.md
@@ -0,0 +1,35 @@
+# MALA-ASR_SLIDESPEECH
+
+## Performance and checkpoints
+We only train the linear projector in this recipe.
+Encoder | Projector | LLM | dev | test
+|---|---|---|---|---|
+[WavLM-large](https://drive.google.com/file/d/12-cB34qCTvByWT-QtOcZaqwwO21FLSqU/view) | [Linear](https://drive.google.com/file/d/1hYS5UI3W0WVOZRVbqWxDUWIFMO9VgzHk/view?usp=drive_link)(~15.74M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 8.91 | 9.14 
+
+
+## Data preparation
+Refer to the official [SLIDESPEECH CORPUS](https://slidespeech.github.io/).
+
+## Decode with checkpoints
+```
+bash decode_MaLa-ASR_withkeywords_L95.sh
+```
+Before running the shell script, modify the paths in it, including `speech_encoder_path`, `llm_path`, `output_dir`, `ckpt_path` and `decode_log`.
+
+## Train a new model
+
+### Use a self-supervised model (such as WavLM) as the encoder
+```
+bash finetune_MaLa-ASR_withkeywords_L95.sh
+```
+
+##  Citation
+You can refer to the paper for more results. 
+```
+@inproceedings{yang2024malaasr,
+      title={MaLa-ASR: Multimedia-Assisted LLM-Based ASR}, 
+      author={Guanrou Yang and Ziyang Ma and Fan Yu and Zhifu Gao and Shiliang Zhang and Xie Chen},
+      booktitle={Proc. INTERSPEECH},
+      year={2024},
+}
+```
diff --git a/examples/mala_asr_slidespeech/conf/ds_config.json b/examples/mala_asr_slidespeech/conf/ds_config.json
@@ -0,0 +1,19 @@
+{
+    "train_micro_batch_size_per_gpu": 4,
+    "gradient_accumulation_steps": 1,
+    "optimizer": {
+        "type": "Adam",
+        "params": {
+            "lr": 1e-4
+        }
+    },
+    "fp16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu"
+        }
+    }
+}
diff --git a/examples/mala_asr_slidespeech/conf/prompt.yaml b/examples/mala_asr_slidespeech/conf/prompt.yaml
@@ -0,0 +1,4 @@
+dataset_config:
+    # we put the prompt here because the hydra override in the shell script only supports a small subset of characters
+    # prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. "
+    prompt: "Transcribe speech to text. "