diff --git a/README.md b/README.md
index ad1fe829..0fd38d0a 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ developers to train custom multimodal large language model (MLLM), focusing on <
 # Table of Contents
 1. [News](#news)
 2. [Installation](#installation)
-3. [Uasge](#uasge)
+3. [Usage](#usage)
 - [List of Recipes](#list-of-recipes)
 - [Configuration Priority](#configuration-priority)
 4. [Features](#features)
@@ -129,3 +129,14 @@ SLAM-ASR:
 }
 ```
 
+SLAM-AAC:
+```
+@article{chen2024slam,
+  title={SLAM-AAC: Enhancing Audio Captioning with Paraphrasing Augmentation and CLAP-Refine through LLMs},
+  author={Chen, Wenxi and Ma, Ziyang and Li, Xiquan and Xu, Xuenan and Liang, Yuzhe and Zheng, Zhisheng and Yu, Kai and Chen, Xie},
+  journal={arXiv preprint arXiv:2410.09503},
+  year={2024}
+}
+```
+
+
diff --git a/examples/slam_aac/README.md b/examples/slam_aac/README.md
index eb39d158..7b8ddd37 100644
--- a/examples/slam_aac/README.md
+++ b/examples/slam_aac/README.md
@@ -1,7 +1,6 @@
 # SLAM-AAC
 
-SLAM-AAC is a LLM-based model for Automated Audio Captioning (AAC) task. Inspired by techniques in machine translation and ASR, the model enhances audio captioning by incorporating paraphrasing augmentation and a plug-and-play CLAP-Refine strategy.
-
+SLAM-AAC is an LLM-based model for the Automated Audio Captioning (AAC) task. Inspired by techniques in machine translation and ASR, the model enhances audio captioning by incorporating paraphrasing augmentation and a plug-and-play CLAP-Refine strategy. For more details, please refer to the [paper](https://arxiv.org/abs/2410.09503).
 
 ## Model Architecture
 SLAM-AAC uses EAT as the audio encoder and Vicuna-7B as the LLM decoder. During training, only the Linear Projector and LoRA modules are trainable. For inference, multiple candidates are generated using different beam sizes, which are then refined using the CLAP-Refine strategy.
@@ -81,8 +80,13 @@ If you already have the generated candidates and want to directly refine them us
 bash scripts/clap_refine.sh
 ```
-
+
+## Citation
+```
+@article{chen2024slam,
+  title={SLAM-AAC: Enhancing Audio Captioning with Paraphrasing Augmentation and CLAP-Refine through LLMs},
+  author={Chen, Wenxi and Ma, Ziyang and Li, Xiquan and Xu, Xuenan and Liang, Yuzhe and Zheng, Zhisheng and Yu, Kai and Chen, Xie},
+  journal={arXiv preprint arXiv:2410.09503},
+  year={2024}
+}
+```
diff --git a/examples/slam_aac/aac_config.py b/examples/slam_aac/aac_config.py
index 50fca279..9fb747b3 100644
--- a/examples/slam_aac/aac_config.py
+++ b/examples/slam_aac/aac_config.py
@@ -1,5 +1,9 @@
 from dataclasses import dataclass, field
 from typing import Optional, List
+
+from torch.distributed.fsdp import ShardingStrategy
+
+
 @dataclass
 class ModelConfig:
     file: str = "examples/slam_aac/model/slam_model_aac.py:model_factory"
@@ -125,7 +129,7 @@ class FSDPConfig:
     mixed_precision: bool = True
     use_fp16: bool = False
     # sharding_strategy = "FULL_SHARD" #ShardingStrategy = ShardingStrategy.FULL_SHARD
-    sharding_strategy: str = "NO_SHARD" #ShardingStrategy.NO_SHARD #MZY: set NO_SHARD when use DDP
+    sharding_strategy: ShardingStrategy = ShardingStrategy.NO_SHARD # MZY: set NO_SHARD when using DDP
     checkpoint_type: str = "SHARDED_STATE_DICT" # alternatively can use SHARDED_STATE_DICT save one file per rank, and can resize the world-size.
     fsdp_activation_checkpointing: bool = True
     fsdp_cpu_offload: bool = False
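
On the `FSDPConfig` change in `examples/slam_aac/aac_config.py`: plain dataclasses never coerce a default value to match its annotation, so an enum-typed field should be initialized with the enum member itself (`ShardingStrategy.NO_SHARD`) rather than the string `"NO_SHARD"`. Hydra/OmegaConf structured configs do convert strings to enum members at load time, but the enum default is correct in both settings. Below is a minimal sketch of how the field reaches FSDP, assuming a hypothetical `wrap_model` helper that is not from this repo:

```python
from dataclasses import dataclass

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp import ShardingStrategy


@dataclass
class FSDPConfig:
    # Use the enum member, not the string "NO_SHARD": a plain dataclass
    # would otherwise carry a str at runtime despite the annotation.
    sharding_strategy: ShardingStrategy = ShardingStrategy.NO_SHARD


def wrap_model(model: nn.Module, cfg: FSDPConfig) -> FSDP:
    # Hypothetical helper (requires an initialized process group):
    # FSDP accepts the ShardingStrategy enum directly, so keeping the
    # field enum-typed avoids string-to-enum conversion at call sites.
    return FSDP(model, sharding_strategy=cfg.sharding_strategy)
```

`NO_SHARD` keeps a full replica of the parameters on every rank, which is why the inline comment recommends it when the training setup is effectively DDP.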
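
On the inference pipeline described in `examples/slam_aac/README.md` (multiple candidates generated with different beam sizes, then refined with CLAP-Refine): the sketch below illustrates the general idea only. The `embed_audio` and `embed_text` callables, the beam sizes, and the generation parameters are all placeholders, not the repo's actual implementation or the paper's exact recipe.

```python
from typing import Callable, List, Sequence

import torch
import torch.nn.functional as F
from transformers import PreTrainedModel, PreTrainedTokenizer


def clap_refine(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    inputs: dict,
    embed_audio: Callable[[], torch.Tensor],          # hypothetical: (d,) audio embedding
    embed_text: Callable[[List[str]], torch.Tensor],  # hypothetical: (n, d) text embeddings
    beam_sizes: Sequence[int] = (2, 3, 4, 5, 6, 7, 8),
) -> str:
    # Generate one candidate caption per beam size.
    candidates: List[str] = []
    for k in beam_sizes:
        out = model.generate(**inputs, num_beams=k, max_new_tokens=64)
        candidates.append(tokenizer.decode(out[0], skip_special_tokens=True))

    # Rank candidates by cosine similarity between the audio embedding
    # and each caption's text embedding; keep the best match.
    audio_emb = F.normalize(embed_audio(), dim=-1)
    text_embs = F.normalize(embed_text(candidates), dim=-1)
    scores = text_embs @ audio_emb
    return candidates[int(scores.argmax())]
```

The design point is that beam search at several widths yields diverse candidates cheaply, and a cross-modal audio-text similarity score supplies a selection signal that pure token likelihood does not.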