
Commit: pr1

yanghaha0908 committed Jun 6, 2024
2 parents 41d5eeb + 806c368 commit 56aa511
Showing 61 changed files with 4,883 additions and 788 deletions.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug.yml
@@ -6,7 +6,7 @@ body:
attributes:
value: >
#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the
existing and past issues](https://github.com/facebookresearch/llama-recipes/issues), the [FAQ](https://github.com/facebookresearch/llama-recipes/blob/main/docs/FAQ.md)
existing and past issues](https://github.com/ddlBoJack/SLAM-LLM/issues).
- type: textarea
id: system-info
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/feature-request.yml
@@ -1,5 +1,5 @@
name: 🚀 Feature request
description: Submit a proposal/request for a new llama-recipes feature
description: Submit a proposal/request for a new slam-llm feature

body:
- type: textarea
2 changes: 1 addition & 1 deletion .gitignore
@@ -16,4 +16,4 @@ examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_noself.sh
examples/asr_librispeech/scripts/decode_hubert_xtralarge_linear_vicuna_7b_copy.sh
examples/vsr_LRS3/scripts/decode_avhubert_vo_vicuna_7b_copy.sh
scripts_all
examples/hotwords_librispeech
examples/hotwords_librispeech
41 changes: 41 additions & 0 deletions Dockerfile
@@ -0,0 +1,41 @@
FROM pytorch/pytorch:2.1.0-cuda11.8-cudnn8-runtime

USER root

ARG DEBIAN_FRONTEND=noninteractive

LABEL github_repo="https://github.com/ddlBoJack/SLAM-LLM"

RUN set -x \
&& apt-get update \
&& apt-get -y install wget curl man git less openssl libssl-dev unzip unar build-essential aria2 tmux vim ninja-build\
&& apt-get install -y openssh-server sox libsox-fmt-all libsox-fmt-mp3 libsndfile1-dev ffmpeg \
&& rm -rf /var/lib/apt/lists/* \
&& apt-get clean

RUN pip install --no-cache-dir packaging editdistance gpustat wandb einops debugpy tqdm soundfile matplotlib scipy sentencepiece pandas \
&& pip install --no-cache-dir torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

WORKDIR /workspace

RUN git clone https://github.com/huggingface/transformers.git \
&& cd transformers \
&& git checkout tags/v4.35.2 \
&& pip install --no-cache-dir -e .

RUN git clone https://github.com/huggingface/peft.git \
&& cd peft \
&& git checkout tags/v0.6.0 \
&& pip install --no-cache-dir -e .

RUN git clone https://github.com/pytorch/fairseq \
&& cd fairseq \
&& pip install --no-cache-dir --editable ./

RUN git clone https://github.com/ddlBoJack/SLAM-LLM.git \
&& cd SLAM-LLM \
&& pip install --no-cache-dir -e .

ENV SHELL=/bin/bash

WORKDIR /workspace/SLAM-LLM
21 changes: 21 additions & 0 deletions LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Ziyang Ma

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
28 changes: 23 additions & 5 deletions README.md
@@ -27,8 +27,14 @@ developers to train custom multimodal large language model (MLLM), focusing on
5. [Acknowledge](#acknowledge)

# News
- [Update Apr. 28, 2024] Recipes for automated audio captioning (AAC) with SOTA performance have been supported.
- [Update Mar. 31, 2024] Recipes for automatic speech recognition (ASR) with SOTA performance have been supported.
- **[CALL FOR EXAMPLES]** We sincerely invite developers and researchers to develop new applications and conduct academic research based on SLAM-LLM, and to submit your examples as pull requests! We also welcome engineering PRs (such as improving and speeding up multi-node training).
- [Update May. 22, 2024] Please join the [Slack](https://join.slack.com/t/slam-llm/shared_invite/zt-2jbuiyqgi-O83DteBG36xYWcjCNcLWqQ) or the [WeChat group](./docs/Wechat.jpg). We will sync our updates and Q&A there.
- [Update May. 21, 2024] Recipes for [Spatial Audio Understanding](examples/seld_spatialsoundqa/README.md) have been supported.
- [Update May. 20, 2024] Recipes for [music caption (MC)](examples/mc_musiccaps/README.md) have been supported.
- [Update May. 8, 2024] Recipes for [visual speech recognition (VSR)](examples/vsr_LRS3/README.md) have been supported.
- [Update May. 4, 2024] Recipes for [zero-shot text-to-speech (TTS)](examples/vallex/README.md) have been supported.
- [Update Apr. 28, 2024] Recipes for [automated audio captioning (AAC)](examples/aac_audiocaps/README.md) have been supported.
- [Update Mar. 31, 2024] Recipes for [automatic speech recognition (ASR)](examples/asr_librispeech/README.md) have been supported.

# Installation
```bash
@@ -39,30 +45,42 @@ pip install -e .
cd ..
git clone https://github.com/huggingface/peft.git
cd peft
git checkout tags/0.6.0
git checkout tags/v0.6.0
pip install -e .
cd ..
pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
git clone git@github.com:ddlBoJack/SLAM-LLM.git
git clone https://github.com/ddlBoJack/SLAM-LLM.git
cd SLAM-LLM
pip install -e .
```

For some examples, you may need `fairseq`; the commands are as follows:
```bash
# you need to install fairseq before SLAM-LLM
git clone https://github.com/pytorch/fairseq
cd fairseq
pip install --editable ./
```
We also provide a Docker image for convenience:
```shell
# build docker image
docker build -t slam-llm:latest .

# run docker image with gpu
docker run -it --gpus all --name slam --shm-size=256g slam-llm:latest /bin/bash
```
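If your datasets and checkpoints live on the host, mount them into the container; a minimal sketch with illustrative host paths:
```shell
docker run -it --gpus all --name slam --shm-size=256g \
    -v /host/data:/workspace/data \
    -v /host/ckpt:/workspace/ckpt \
    slam-llm:latest /bin/bash
```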
# Usage
## List of Recipes
We provide reference implementations of various LLM-based speech, audio, and music tasks:
- **Speech Task**
- [Automatic Speech Recognition (ASR)](examples/asr_librispeech/README.md)
- [Text-to-Speech (TTS)](examples/vallex/README.md)
- [Visual Speech Recognition (VSR)](examples/vsr_LRS3/README.md)
- **Audio Task**
- [Automated Audio Captioning (AAC)](examples/aac_audiocaps/README.md)
- [Spatial Audio Understanding](examples/seld_spatialsoundqa/README.md)
- **Music Task**
- [Music Caption (MC)](examples/mc_musiccaps/README.md)

## Configuration Priority
We provide hierarchical configuration with the following override priority:
@@ -80,4 +98,4 @@ command-line (shell file) > Hydra configuration (yaml file) > dataclass configuration (python file)
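For instance, a value set in all three places resolves to the command-line one. A minimal sketch, with illustrative field and file names rather than ones from a specific recipe:
```bash
# dataclass default (python file):  train_config.lr = 1e-4
# yaml file:                        train_config.lr: 5e-5
# the command line wins over both:
python examples/mc_musiccaps/finetune_mir.py \
    --config-path conf \
    --config-name prompt.yaml \
    ++train_config.lr=1e-5   # effective learning rate: 1e-5
```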
# Acknowledge
- We borrow code from [Llama-Recipes](https://github.com/meta-llama/llama-recipes) for the training process.
- We borrow code from [Fairseq](https://github.com/facebookresearch/fairseq) for deepspeed configuration.
- We thank the contributors for providing diverse recipes.
- We thank the contributors for providing diverse recipes.
Binary file added docs/Wechat.jpg
@@ -55,7 +55,3 @@ python $code_dir/inference_asr_batch.py \
# ++dataset_config.normalize=true \
# ++model_config.encoder_projector=q-former \
# ++dataset_config.fix_length_audio=64 \

python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
@@ -52,7 +52,3 @@ python $code_dir/inference_asr_batch.py \
# ++dataset_config.normalize=true \
# ++model_config.encoder_projector=q-former \
# ++dataset_config.fix_length_audio=64 \

python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
@@ -51,7 +51,3 @@ python $code_dir/inference_asr_batch.py \
# ++dataset_config.normalize=true \
# ++model_config.encoder_projector=q-former \
# ++dataset_config.fix_length_audio=64 \

python src/slam_llm/utils/whisper_tn.py ${decode_log}_gt ${decode_log}_gt.proc
python src/slam_llm/utils/whisper_tn.py ${decode_log}_pred ${decode_log}_pred.proc
python src/slam_llm/utils/compute_wer.py ${decode_log}_gt.proc ${decode_log}_pred.proc ${decode_log}.proc.wer
31 changes: 31 additions & 0 deletions examples/mc_musiccaps/README.md
@@ -0,0 +1,31 @@
# MC_MusicCaps

## Performance and checkpoints
Here is a recipe for music captioning, using MusicFM as the encoder. We only train the linear projector. For more about MusicFM and its checkpoints, please refer to [this repository](https://github.com/minzwon/musicfm).

The following results are obtained by training on the [LP-MusicCaps-MC](https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC) training set and evaluating on the [LP-MusicCaps-MC](https://huggingface.co/datasets/seungheondoh/LP-MusicCaps-MC) test set.
| Encoder | Projector | LLM | BLEU-1 | METEOR | SPICE | SPIDER |
|---|---|---|---|---|---|---|
| [MusicFM (pretrained with MSD)](https://huggingface.co/minzwon/MusicFM/resolve/main/pretrained_msd.pt) | [Linear](https://drive.google.com/file/d/1-9pob6QvJRoq5Dy-LZbiDfF6Q7QRO8Au/view?usp=sharing) (~18.88M) | [vicuna-7b-v1.5](https://huggingface.co/lmsys/vicuna-7b-v1.5) | 25.6 | 10.0 | 8.7 | 6.9 |


## Data preparation
You need to prepare the data as a jsonl file in the following format. Note that you may need to pre-extract the sample rate and duration of the audio files for better loading efficiency; a preparation sketch follows the example.
```
{"key": "[-0Gj8-vB1q4]-[30-40]", "source": "path/to/MusicCaps/wav/[-0Gj8-vB1q4]-[30-40].wav", "target": "The low quality recording features a ballad song that contains sustained strings, mellow piano melody and soft female vocal singing over it. It sounds sad and soulful, like something you would hear at Sunday services.", "duration": 10.0, "sample_rate": 48000}
...
{"key": "[-0vPFx-wRRI]-[30-40]", "source": "path/to/MusicCaps/wav/[-0vPFx-wRRI]-[30-40].wav", "target": "a male voice is singing a melody with changing tempos while snipping his fingers rhythmically. The recording sounds like it has been recorded in an empty room. This song may be playing, practicing snipping and singing along.", "duration": 10.0, "sample_rate": 48000}
```
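A minimal preparation sketch, assuming `soundfile` is available and that the LP-MusicCaps captions have already been mapped to local wav paths; the file locations and the caption lookup are illustrative:
```python
import glob
import json
import os

import soundfile as sf

# hypothetical: fill with key -> caption text from the LP-MusicCaps metadata
captions = {}

with open("musiccaps_train.jsonl", "w") as out:
    for wav in sorted(glob.glob("path/to/MusicCaps/wav/*.wav")):
        key = os.path.splitext(os.path.basename(wav))[0]
        info = sf.info(wav)  # pre-extract duration/sample rate once, at prep time
        record = {
            "key": key,
            "source": wav,
            "target": captions.get(key, ""),
            "duration": info.duration,
            "sample_rate": info.samplerate,
        }
        out.write(json.dumps(record) + "\n")
```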

## Decode with checkpoints
```bash
bash decode_musicfm_linear_vicuna_7b_10s.sh
```
Before running the shell script, modify the paths in it, including `music_encoder_path`, `music_encoder_stat_path`, `music_encoder_config_path` (if specified), `ckpt_path`, `val_data_path`, and `decode_log`; the same values can also be overridden inline, as sketched below.
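A sketch of such inline Hydra overrides, with placeholder paths (the config-group prefixes, e.g. `model_config.`, follow the script):
```bash
python inference_mir_batch.py \
    ++model_config.music_encoder_path=/path/to/pretrained_msd.pt \
    ++ckpt_path=/path/to/mc_linear_projector.pt \
    ++dataset_config.val_data_path=/path/to/musiccaps_test.jsonl \
    ++decode_log=/path/to/decode_log
```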

## Train a new model

### Use MusicFM as the encoder for the music modality
```bash
bash finetune_musicfm_linear_vicuna_7b_10s.sh
```
19 changes: 19 additions & 0 deletions examples/mc_musiccaps/conf/ds_config.json
@@ -0,0 +1,19 @@
{
    "train_micro_batch_size_per_gpu": 4,
    "gradient_accumulation_steps": 1,
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 1e-4
        }
    },
    "fp16": {
        "enabled": true
    },
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {
            "device": "cpu"
        }
    }
}
3 changes: 3 additions & 0 deletions examples/mc_musiccaps/conf/prompt.yaml
@@ -0,0 +1,3 @@
dataset_config:
    # we put the prompt here because the hydra override in the shell script only supports a small subset of characters
    prompt: "Describe this music."
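The prompt yaml is then selected at launch with Hydra's standard flags, and character-safe overrides can still be passed inline; a sketch:
```bash
python finetune_mir.py --config-path conf --config-name prompt.yaml \
    ++train_config.num_epochs=1   # illustrative inline override
```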
47 changes: 47 additions & 0 deletions examples/mc_musiccaps/deepspeed_finetune_mir.py
@@ -0,0 +1,47 @@
from slam_llm.pipeline.finetune_deepspeed import main as train
from slam_llm.utils.deepspeed_utils import deepspeed_main_wrapper

import logging
from dataclasses import dataclass, field
from omegaconf import DictConfig, ListConfig, OmegaConf
from mir_config import ModelConfig, TrainConfig, DataConfig, LogConfig


@dataclass
class RunConfig:
    dataset_config: DataConfig = field(default_factory=DataConfig)
    model_config: ModelConfig = field(default_factory=ModelConfig)
    train_config: TrainConfig = field(default_factory=TrainConfig)
    log_config: LogConfig = field(default_factory=LogConfig)
    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
    deepspeed_config: str = field(default="examples/asr_librispeech/conf/ds_config.json", metadata={"help": "The path to the deepspeed config file"})


@deepspeed_main_wrapper(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    run_config = RunConfig()
    cfg = OmegaConf.merge(run_config, cfg)

    # helper to convert OmegaConf containers to plain python objects (currently unused)
    def to_plain_list(cfg_item):
        if isinstance(cfg_item, ListConfig):
            return OmegaConf.to_container(cfg_item, resolve=True)
        elif isinstance(cfg_item, DictConfig):
            return {k: to_plain_list(v) for k, v in cfg_item.items()}
        else:
            return cfg_item

    # kwargs = to_plain_list(cfg)
    kwargs = cfg
    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())

    logging.basicConfig(level=log_level)

    if kwargs.get("debug", False):
        import pdb
        pdb.set_trace()

    train(kwargs)


if __name__ == "__main__":
    main_hydra()
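A launch sketch for this entry point, assuming the wrapper accepts Hydra's standard flags; the GPU count and the `deepspeed_config` path (the RunConfig field defined above) are illustrative:
```bash
deepspeed --num_gpus=4 deepspeed_finetune_mir.py \
    --config-path conf --config-name prompt.yaml \
    ++deepspeed_config=conf/ds_config.json
```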
45 changes: 45 additions & 0 deletions examples/mc_musiccaps/finetune_mir.py
@@ -0,0 +1,45 @@
from slam_llm.pipeline.finetune import main as train

import hydra
import logging
from dataclasses import dataclass, field
from omegaconf import DictConfig, ListConfig, OmegaConf
from mir_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig


@dataclass
class RunConfig:
    dataset_config: DataConfig = field(default_factory=DataConfig)
    model_config: ModelConfig = field(default_factory=ModelConfig)
    train_config: TrainConfig = field(default_factory=TrainConfig)
    log_config: LogConfig = field(default_factory=LogConfig)
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})


@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    run_config = RunConfig()
    cfg = OmegaConf.merge(run_config, cfg)

    # helper to convert OmegaConf containers to plain python objects (currently unused)
    def to_plain_list(cfg_item):
        if isinstance(cfg_item, ListConfig):
            return OmegaConf.to_container(cfg_item, resolve=True)
        elif isinstance(cfg_item, DictConfig):
            return {k: to_plain_list(v) for k, v in cfg_item.items()}
        else:
            return cfg_item

    # kwargs = to_plain_list(cfg)
    kwargs = cfg
    log_level = getattr(logging, kwargs.get("log_level", "INFO").upper())

    logging.basicConfig(level=log_level)

    if kwargs.get("debug", False):
        import pdb
        pdb.set_trace()

    train(kwargs)


if __name__ == "__main__":
    main_hydra()
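A typical multi-GPU FSDP launch might look like the sketch below; `enable_fsdp` and `llm_path` are assumed field names in `mir_config.py`, not verified here:
```bash
torchrun --nnodes 1 --nproc_per_node 2 finetune_mir.py \
    --config-path conf --config-name prompt.yaml \
    ++train_config.enable_fsdp=true \
    ++model_config.llm_path=/path/to/vicuna-7b-v1.5
```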
53 changes: 53 additions & 0 deletions examples/mc_musiccaps/inference_mir_batch.py
@@ -0,0 +1,53 @@
from slam_llm.pipeline.inference_batch import main as inference

import hydra
import logging
from dataclasses import dataclass, field
from omegaconf import DictConfig, ListConfig, OmegaConf
from typing import Optional
from mir_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig


@dataclass
class RunConfig:
    dataset_config: DataConfig = field(default_factory=DataConfig)
    model_config: ModelConfig = field(default_factory=ModelConfig)
    train_config: TrainConfig = field(default_factory=TrainConfig)
    log_config: LogConfig = field(default_factory=LogConfig)
    fsdp_config: FSDPConfig = field(default_factory=FSDPConfig)
    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})
    decode_log: str = field(
        default="output/decode_log",
        metadata={"help": "The prefix for the decode output"},
    )
    ckpt_path: str = field(
        default="output/model.pt", metadata={"help": "The path to projector checkpoint"}
    )
    peft_ckpt: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to peft checkpoint, should be a directory including adapter_config.json"
        },
    )


@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    run_config = RunConfig()
    cfg = OmegaConf.merge(run_config, cfg)
    # kwargs = to_plain_list(cfg)
    log_level = getattr(logging, cfg.get("log_level", "INFO").upper())

    logging.basicConfig(level=log_level)

    if cfg.get("debug", False):
        import pdb
        pdb.set_trace()

    inference(cfg)


if __name__ == "__main__":
    main_hydra()
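A decoding sketch using the fields defined in `RunConfig` above; the dataset path is a placeholder, and `val_data_path` is assumed to live under `dataset_config`, matching the README:
```bash
python inference_mir_batch.py \
    --config-path conf --config-name prompt.yaml \
    ++ckpt_path=output/model.pt \
    ++decode_log=output/decode_log \
    ++dataset_config.val_data_path=/path/to/musiccaps_test.jsonl
```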