-
Notifications
You must be signed in to change notification settings - Fork 57
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #51 from ddlBoJack/main
sync
- Loading branch information
Showing
23 changed files
with
1,368 additions
and
185 deletions.
There are no files selected for viewing
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
{ | ||
"train_micro_batch_size_per_gpu": 4, | ||
"gradient_accumulation_steps": 1, | ||
"optimizer": { | ||
"type": "Adam", | ||
"params": { | ||
"lr": 1e-4 | ||
} | ||
}, | ||
"fp16": { | ||
"enabled": true | ||
}, | ||
"zero_optimization": { | ||
"stage": 3, | ||
"offload_optimizer": { | ||
"device": "cpu" | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
dataset_config: | ||
# We put the prompt here because the Hydra override in the shell script only supports a small subset of characters. | ||
prompt: "Transcribe speech to text. Output the transcription directly without redundant content. Ensure that the output is not duplicated. " |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
from slam_llm.pipeline.finetune_deepspeed import main as train | ||
from slam_llm.utils.deepspeed_utils import deepspeed_main_wrapper | ||
|
||
import logging | ||
from dataclasses import dataclass, field | ||
from omegaconf import DictConfig, ListConfig, OmegaConf | ||
from asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig | ||
|
||
|
||
@dataclass
class RunConfig:
    """Top-level structured config for the DeepSpeed fine-tuning entry point.

    An instance of this class supplies the defaults that ``main_hydra``
    merges the incoming config onto via ``OmegaConf.merge``.
    """

    # Nested config groups; their schemas are declared in ``asr_config``.
    dataset_config: DataConfig = field(default_factory=DataConfig)
    model_config: ModelConfig = field(default_factory=ModelConfig)
    train_config: TrainConfig = field(default_factory=TrainConfig)
    log_config: LogConfig = field(default_factory=LogConfig)

    # Read by ``main_hydra``: when true it drops into ``pdb`` before training.
    debug: bool = field(default=False, metadata={"help": "Use pdb when true"})
    metric: str = field(default="acc", metadata={"help": "The metric for evaluation"})

    # The help text previously duplicated the one for ``metric``; it now
    # describes what this field actually holds.
    deepspeed_config: str = field(
        default="examples/asr_librispeech/conf/ds_config.json",
        metadata={"help": "The path to the DeepSpeed config file (JSON)"},
    )
|
||
|
||
@deepspeed_main_wrapper(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    """Merge the incoming config onto ``RunConfig`` defaults and run training.

    Args:
        cfg: Config object handed in by ``deepspeed_main_wrapper``
            (presumably the command-line overrides -- TODO confirm against
            the wrapper's implementation).
    """
    # RunConfig provides defaults; values present in ``cfg`` take precedence.
    cfg = OmegaConf.merge(RunConfig(), cfg)

    # Falls back to INFO when no ``log_level`` key is present.
    log_level = getattr(logging, cfg.get("log_level", "INFO").upper())
    logging.basicConfig(level=log_level)

    if cfg.get("debug", False):
        import pdb

        pdb.set_trace()

    train(cfg)


if __name__ == "__main__":
    main_hydra()
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
from slam_llm.pipeline.inference_batch import main as inference | ||
|
||
import hydra | ||
import logging | ||
from dataclasses import dataclass, field | ||
from omegaconf import DictConfig, ListConfig, OmegaConf | ||
from typing import Optional | ||
from asr_config import ModelConfig, TrainConfig, DataConfig, LogConfig, FSDPConfig | ||
|
||
|
||
@dataclass
class RunConfig:
    """Top-level structured config for the batch-inference entry point.

    ``main_hydra`` instantiates this class for its defaults and merges the
    Hydra-provided config on top of it.
    """

    # Nested config groups; their schemas are declared in ``asr_config``.
    dataset_config: DataConfig = field(
        default_factory=DataConfig,
    )
    model_config: ModelConfig = field(
        default_factory=ModelConfig,
    )
    train_config: TrainConfig = field(
        default_factory=TrainConfig,
    )
    log_config: LogConfig = field(
        default_factory=LogConfig,
    )
    fsdp_config: FSDPConfig = field(
        default_factory=FSDPConfig,
    )

    # Run-level switches.
    debug: bool = field(
        default=False,
        metadata={"help": "Use pdb when true"},
    )
    metric: str = field(
        default="acc",
        metadata={"help": "The metric for evaluation"},
    )

    # Output and checkpoint locations.
    decode_log: str = field(
        default="output/decode_log",
        metadata={"help": "The prefix for the decode output"},
    )
    ckpt_path: str = field(
        default="output/model.pt",
        metadata={"help": "The path to projector checkpoint"},
    )
    peft_ckpt: Optional[str] = field(
        default=None,
        metadata={
            "help": "The path to peft checkpoint, should be a directory including adapter_config.json"
        },
    )
|
||
|
||
@hydra.main(config_name=None, version_base=None)
def main_hydra(cfg: DictConfig):
    """Merge the Hydra config onto ``RunConfig`` defaults and run inference.

    Args:
        cfg: Config object supplied by ``hydra.main``.
    """
    # Defaults come from RunConfig; entries in ``cfg`` override them.
    merged = OmegaConf.merge(RunConfig(), cfg)

    # Resolve the logging level by name, defaulting to INFO.
    level_name = merged.get("log_level", "INFO").upper()
    logging.basicConfig(level=getattr(logging, level_name))

    if merged.get("debug", False):
        import pdb

        pdb.set_trace()

    inference(merged)


if __name__ == "__main__":
    main_hydra()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,155 @@ | ||
import torch | ||
import os | ||
import logging | ||
from slam_llm.models.slam_model import ( | ||
slam_model, | ||
setup_tokenizer, | ||
setup_encoder, | ||
setup_encoder_projector, | ||
setup_llm, | ||
) | ||
from slam_llm.utils.train_utils import print_model_size | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
def model_factory(train_config, model_config, **kwargs):
    """Build the ASR model and its tokenizer.

    Args:
        train_config: Training config; ``enable_fsdp`` and ``enable_ddp`` are
            read here to decide which rank is reported to ``print_model_size``.
        model_config: Model config forwarded to every ``setup_*`` helper.
        **kwargs: Forwarded to the ``setup_*`` helpers and to
            ``slam_model_asr``. An optional ``ckpt_path`` entry names a
            checkpoint to load into the assembled model.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    setup_args = (train_config, model_config)

    # Components are built in the same order as before: tokenizer, encoder,
    # LLM, then the encoder projector.
    tokenizer = setup_tokenizer(*setup_args, **kwargs)
    encoder = setup_encoder(*setup_args, **kwargs)
    llm = setup_llm(*setup_args, **kwargs)
    encoder_projector = setup_encoder_projector(*setup_args, **kwargs)

    model = slam_model_asr(
        encoder, llm, encoder_projector, tokenizer, *setup_args, **kwargs
    )

    # FIX(MZY): load model ckpt (mainly projector, related to
    # model_checkpointing/checkpoint_handler.py: save_model_checkpoint_peft).
    ckpt_path = kwargs.get("ckpt_path", None)
    if ckpt_path is not None:
        logger.info("loading other parts from: {}".format(ckpt_path))
        state = torch.load(ckpt_path, map_location="cpu")
        # strict=False: the checkpoint need not contain every model key.
        model.load_state_dict(state, strict=False)

    # The RANK environment variable is consulted only for FSDP/DDP runs.
    if train_config.enable_fsdp or train_config.enable_ddp:
        rank = int(os.environ["RANK"])
    else:
        rank = 0
    print_model_size(model, train_config, rank)

    return model, tokenizer
|
||
|
||
class slam_model_asr(slam_model):
    """ASR variant of ``slam_model`` with a single-sample ``inference`` helper."""

    def __init__(
        self,
        encoder,
        llm,
        encoder_projector,
        tokenizer,
        train_config,
        model_config,
        **kwargs,
    ):
        """Forward every argument unchanged to ``slam_model.__init__``."""
        super().__init__(
            encoder,
            llm,
            encoder_projector,
            tokenizer,
            train_config,
            model_config,
            **kwargs,
        )

    @torch.no_grad()
    def inference(
        self,
        wav_path=None,
        prompt=None,
        generation_config=None,
        logits_processor=None,
        stopping_criteria=None,
        prefix_allowed_tokens_fn=None,
        synced_gpus=None,
        assistant_model=None,
        streamer=None,
        negative_prompt_ids=None,
        negative_prompt_attention_mask=None,
        **kwargs,
    ):
        """Generate output for one prompt, optionally conditioned on audio.

        Args:
            wav_path: Path to an audio file. When it is ``None`` or does not
                exist, no audio features are used (text-only QA).
            prompt: Text inserted into the ``"USER: {}\\n ASSISTANT:"``
                template.
            generation_config, logits_processor, stopping_criteria,
            prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer,
            negative_prompt_ids, negative_prompt_attention_mask: Accepted for
                signature compatibility; not read by this method.
            **kwargs: ``device`` (default ``"cuda"``) selects where tensors
                are placed. All entries, including ``device``, are forwarded
                to ``self.generate``.

        Returns:
            Whatever ``self.generate`` returns.
        """
        device = kwargs.get("device", "cuda")

        # Locate the token-embedding module once. The LLM may be wrapped in
        # one or two extra ``.model`` layers (presumably by adapter wrappers
        # such as PEFT -- TODO confirm), so probe progressively deeper.
        llm_model = self.llm.model
        if hasattr(llm_model, "embed_tokens"):
            embed_tokens = llm_model.embed_tokens
        elif hasattr(llm_model.model, "embed_tokens"):
            embed_tokens = llm_model.model.embed_tokens
        else:
            embed_tokens = llm_model.model.model.embed_tokens

        # Guard against the default ``wav_path=None``: ``os.path.exists(None)``
        # raises TypeError, which previously made the text-only branch
        # unreachable with default arguments.
        if wav_path is not None and os.path.exists(wav_path):  # Audio-Text QA
            import whisper

            audio_raw = whisper.load_audio(wav_path)
            audio_raw = whisper.pad_or_trim(audio_raw)

            # 80 for large v1 and v2, 128 for large v3
            mel_size = getattr(self.dataset_config, "mel_size", 80)
            audio_mel = (
                whisper.log_mel_spectrogram(audio_raw, n_mels=mel_size)
                .permute(1, 0)[None, :, :]
                .to(device)
            )

            # The encoder is fed the last two axes swapped back again.
            encoder_outs = self.encoder.extract_variable_length_features(
                audio_mel.permute(0, 2, 1)
            )

            # Any other projector type leaves the encoder output unprojected,
            # exactly as before.
            if self.model_config.encoder_projector == "q-former":
                audio_mel_post_mask = torch.ones(
                    encoder_outs.size()[:-1], dtype=torch.long
                ).to(encoder_outs.device)
                encoder_outs = self.encoder_projector(
                    encoder_outs, audio_mel_post_mask
                )
            elif self.model_config.encoder_projector == "linear":
                encoder_outs = self.encoder_projector(encoder_outs)
        else:  # Text QA
            # Zero-length audio prefix. The width comes from the resolved
            # embedding module so wrapped LLMs work in this branch too.
            encoder_outs = torch.empty(1, 0, embed_tokens.embedding_dim).to(device)

        prompt = "USER: {}\n ASSISTANT:".format(prompt)
        prompt_ids = self.tokenizer.encode(prompt)
        prompt_ids = torch.tensor(prompt_ids, dtype=torch.int64).to(device)

        inputs_embeds = embed_tokens(prompt_ids)

        inputs_embeds = torch.cat(
            (encoder_outs, inputs_embeds[None, :, :]), dim=1
        )  # [audio,prompt]

        # Every position is attended to; there is no padding for one sample.
        attention_mask = torch.ones(inputs_embeds.size()[:-1], dtype=torch.long).to(
            inputs_embeds.device
        )

        # generate
        model_outputs = self.generate(
            inputs_embeds=inputs_embeds, attention_mask=attention_mask, **kwargs
        )

        return model_outputs
Oops, something went wrong.