From 1adff7479f4df702e5f08d9c928642d563d3144a Mon Sep 17 00:00:00 2001
From: Abhay Gupta
Date: Wed, 4 Sep 2024 20:25:33 -0700
Subject: [PATCH 1/3] Fix cross attention for blocks (#1512)

---
 llmfoundry/models/layers/blocks.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py
index 82e8e94f74..c88cf33d1b 100644
--- a/llmfoundry/models/layers/blocks.py
+++ b/llmfoundry/models/layers/blocks.py
@@ -170,7 +170,9 @@ def forward(
         extra_kwargs = {}
         if prev_layer_key_value is not None:
             extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
+        if key_value_states is not None:
             extra_kwargs['key_value_states'] = key_value_states
+
         if self.fuse_norm_attn_norm:
             x, m, attn_weights, past_key_value = self.norm_attn_norm(
                 x,
@@ -336,7 +338,9 @@ def forward(
         extra_kwargs = {}
         if prev_layer_key_value is not None:
             extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
+        if key_value_states is not None:
             extra_kwargs['key_value_states'] = key_value_states
+
         b, attn_weights, past_key_value = self.attn(
             a,
             past_key_value=past_key_value,
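The guard added in both hunks gives key_value_states its own None check, so encoder states for cross attention are forwarded to the attention call whenever they are supplied, rather than only alongside prev_layer_key_value. Below is a minimal, self-contained sketch of that kwarg-forwarding pattern; build_extra_kwargs is a hypothetical helper written only for illustration (it is not a function in blocks.py), and the tensor shapes are arbitrary.

# Sketch of the kwarg-forwarding pattern the fix establishes. This helper is
# hypothetical and not part of llm-foundry; it only illustrates that each
# optional input is gated by its own presence check.
from typing import Any, Optional

import torch


def build_extra_kwargs(
    prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
    key_value_states: Optional[torch.Tensor] = None,
) -> dict[str, Any]:
    extra_kwargs: dict[str, Any] = {}
    if prev_layer_key_value is not None:
        extra_kwargs['prev_layer_key_value'] = prev_layer_key_value
    if key_value_states is not None:
        # Cross attention: keys and values come from the provided states even
        # when no previous layer's KV cache is being reused.
        extra_kwargs['key_value_states'] = key_value_states
    return extra_kwargs


# With only encoder states supplied, the cross-attention kwarg is still passed on.
encoder_states = torch.randn(2, 16, 256)  # (batch, seq_len, d_model), arbitrary sizes
assert set(build_extra_kwargs(key_value_states=encoder_states)) == {'key_value_states'}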
From e8eca4fa83f3fec69ad482465f839fb7dcfbfb0d Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Thu, 5 Sep 2024 01:03:16 -0700
Subject: [PATCH 2/3] Put 2.3 image back in release examples (#1513)

---
 mcli/mcli-1b-eval.yaml                | 2 +-
 mcli/mcli-1b-max-seq-len-8k.yaml      | 2 +-
 mcli/mcli-1b.yaml                     | 2 +-
 mcli/mcli-benchmark-mpt.yaml          | 2 +-
 mcli/mcli-convert-composer-to-hf.yaml | 2 +-
 mcli/mcli-hf-eval.yaml                | 2 +-
 mcli/mcli-hf-generate.yaml            | 2 +-
 mcli/mcli-llama2-finetune.yaml        | 2 +-
 mcli/mcli-openai-eval.yaml            | 2 +-
 mcli/mcli-pretokenize-oci-upload.yaml | 2 +-
 10 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml
index 2f48fa5ce1..4fcf8b3cb9 100644
--- a/mcli/mcli-1b-eval.yaml
+++ b/mcli/mcli-1b-eval.yaml
@@ -9,7 +9,7 @@ integrations:
 command: |
   cd llm-foundry/scripts/
   composer eval/eval.py /mnt/config/parameters.yaml
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: mpt-1b-eval
 
 compute:
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
index bb83e2061d..fb96c576e0 100644
--- a/mcli/mcli-1b-max-seq-len-8k.yaml
+++ b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -17,7 +17,7 @@ command: |
     --out_root ./my-copy-c4 --splits train_small val_small \
     --concat_tokens 8192 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
   composer train/train.py /mnt/config/parameters.yaml
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: mpt-1b-ctx-8k-gpus-8
 
 compute:
diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml
index f371051ca0..26255977f4 100644
--- a/mcli/mcli-1b.yaml
+++ b/mcli/mcli-1b.yaml
@@ -21,7 +21,7 @@ command: |
     eval_loader.dataset.split=val_small \
     max_duration=100ba \
     eval_interval=0
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: mpt-1b-gpus-8
 
 compute:
diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml
index b15f3b7eea..3995598fd3 100644
--- a/mcli/mcli-benchmark-mpt.yaml
+++ b/mcli/mcli-benchmark-mpt.yaml
@@ -6,7 +6,7 @@ compute:
   # cluster: TODO # Name of the cluster to use for this run
   # gpu_type: a100_80gb # Type of GPU to use. We use a100_80gb in our experiments
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 
 integrations:
 - integration_type: git_repo
diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml
index 9c5d960a95..7b715f6792 100644
--- a/mcli/mcli-convert-composer-to-hf.yaml
+++ b/mcli/mcli-convert-composer-to-hf.yaml
@@ -13,7 +13,7 @@ command: |
     --hf_output_path s3://bucket/folder/hf/ \
     --output_precision bf16 \
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: convert-composer-hf
 
 compute:
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index 5f6b6c564f..27f5938d67 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -16,7 +16,7 @@ gpu_num: 8
 # gpu_type:
 # cluster: # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml
index dfb9763462..cb3040e4ee 100644
--- a/mcli/mcli-hf-generate.yaml
+++ b/mcli/mcli-hf-generate.yaml
@@ -35,7 +35,7 @@ command: |
     "Here's a quick recipe for baking chocolate chip cookies: Start by" \
     "The best 5 cities to visit in Europe are"
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: hf-generate
 
 compute:
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index 32e8cddbda..7134e6204c 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -9,7 +9,7 @@ integrations:
 command: |
   cd llm-foundry/scripts
   composer train/train.py /mnt/config/parameters.yaml
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 name: llama2-finetune
 
 compute:
diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml
index 4b69827d69..cd04d89f4e 100644
--- a/mcli/mcli-openai-eval.yaml
+++ b/mcli/mcli-openai-eval.yaml
@@ -16,7 +16,7 @@ gpu_num:
 # gpu_type:
 # cluster: # replace with your cluster here!
 
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml
index fafb251aee..5425ce9897 100644
--- a/mcli/mcli-pretokenize-oci-upload.yaml
+++ b/mcli/mcli-pretokenize-oci-upload.yaml
@@ -1,5 +1,5 @@
 name: c4-2k-pre-tokenized
-image: mosaicml/llm-foundry:2.4.0_cu124-latest
+image: mosaicml/llm-foundry:2.3.1_cu121-latest
 
 compute:
   gpus: 8  # Number of GPUs to use
From 8a8de18d31156f1acfbeb3b6853e17aa90573eb9 Mon Sep 17 00:00:00 2001
From: Irene Dea
Date: Tue, 10 Sep 2024 10:07:36 -0700
Subject: [PATCH 3/3] Sort callbacks so that CheckpointSaver goes before HuggingFaceCheckpointer (#1515)

---
 llmfoundry/command_utils/train.py   | 21 +++++++++++++++++++++
 tests/a_scripts/train/test_train.py | 18 ++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py
index 8e6309175a..73fa4c8d5a 100644
--- a/llmfoundry/command_utils/train.py
+++ b/llmfoundry/command_utils/train.py
@@ -10,6 +10,7 @@
 import torch
 import torch.distributed
 from composer import ComposerModel, Trainer
+from composer.callbacks.checkpoint_saver import CheckpointSaver
 from composer.core.callback import Callback
 from composer.profiler import (
     JSONTraceHandler,
@@ -187,6 +188,24 @@ def _initialize_dist_with_barrier(dist_timeout: Union[int, float]):
     log.debug('Barrier test passed with device.')
 
 
+def _sort_callbacks(trainer: Trainer):
+    """Sort callbacks so that checkpoint saving callbacks go first.
+
+    Args:
+        trainer (Trainer): Trainer object
+    """
+
+    def _sort_key(c: Callback) -> int:
+        # CheckpointSaver goes before HuggingFaceCheckpointer because the blocking time is shortest while upload is async.
+        if isinstance(c, CheckpointSaver):
+            return 1
+        if isinstance(c, HuggingFaceCheckpointer):
+            return 2
+        return 0
+
+    trainer.state.callbacks = sorted(trainer.state.callbacks, key=_sort_key)
+
+
 def train(cfg: DictConfig) -> Trainer:
     code_paths = cfg.get('code_paths', [])
     # Import any user provided code
@@ -548,6 +567,8 @@ def train(cfg: DictConfig) -> Trainer:
         spin_dataloaders=train_cfg.spin_dataloaders,
     )
 
+    _sort_callbacks(trainer)
+
     # Optionally just save an HF checkpoint
     if train_cfg.only_hf_checkpoint:
         hf_checkpointer_callbacks = [
diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py
index 1f724a6070..9af96f9868 100644
--- a/tests/a_scripts/train/test_train.py
+++ b/tests/a_scripts/train/test_train.py
@@ -5,14 +5,18 @@
 import os
 import pathlib
 from typing import Optional
+from unittest.mock import Mock
 
 import pytest
+from composer.callbacks import CheckpointSaver
 from composer.loggers import InMemoryLogger
 from omegaconf import DictConfig, ListConfig
 from omegaconf import OmegaConf as om
 
+from llmfoundry.callbacks import HuggingFaceCheckpointer, RunTimeoutCallback
 from llmfoundry.command_utils import TrainConfig  # noqa: E402
 from llmfoundry.command_utils import TRAIN_CONFIG_KEYS, train, validate_config
+from llmfoundry.command_utils.train import _sort_callbacks
 from llmfoundry.utils.config_utils import (
     make_dataclass_and_log_config,
     update_batch_size_info,
@@ -110,6 +114,20 @@ def test_train_gauntlet(averages: Optional[dict], tmp_path: pathlib.Path):
         -1][-1] == 0
 
 
+def test_sort_callbacks():
+    trainer_mock = Mock()
+    trainer_mock.state.callbacks = [
+        CheckpointSaver(),
+        HuggingFaceCheckpointer('save-folder', '1ba'),
+        RunTimeoutCallback(),
+    ]
+    _sort_callbacks(trainer_mock)
+
+    assert isinstance(trainer_mock.state.callbacks[0], RunTimeoutCallback)
+    assert isinstance(trainer_mock.state.callbacks[1], CheckpointSaver)
+    assert isinstance(trainer_mock.state.callbacks[2], HuggingFaceCheckpointer)
+
+
 def test_train_multi_eval(tmp_path: pathlib.Path):
     """Test training run with multiple eval datasets."""
     c4_dataset_name = create_c4_dataset_xxsmall(tmp_path)
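The sort key in the patch above gives ordinary callbacks priority 0, CheckpointSaver priority 1, and HuggingFaceCheckpointer priority 2, so CheckpointSaver ends up ahead of HuggingFaceCheckpointer in trainer.state.callbacks, matching the commit subject; per the in-code comment, this keeps blocking time short since the HF upload is asynchronous. The standalone sketch below illustrates the same ordering rule with stand-in callback classes defined locally for the example, not the real Composer or LLM Foundry implementations. Note that Python's sort is stable, so callbacks sharing a priority keep their original relative order.

# Standalone sketch of the priority sort used by _sort_callbacks. The callback
# classes here are local stand-ins for illustration only.


class Callback:
    pass


class RunTimeoutCallback(Callback):
    pass


class CheckpointSaver(Callback):
    pass


class HuggingFaceCheckpointer(Callback):
    pass


def sort_key(c: Callback) -> int:
    # CheckpointSaver sorts before HuggingFaceCheckpointer; all other callbacks
    # keep priority 0 and stay in front, preserving their relative order.
    if isinstance(c, CheckpointSaver):
        return 1
    if isinstance(c, HuggingFaceCheckpointer):
        return 2
    return 0


callbacks = [CheckpointSaver(), HuggingFaceCheckpointer(), RunTimeoutCallback()]
ordered = sorted(callbacks, key=sort_key)
assert [type(c).__name__ for c in ordered] == [
    'RunTimeoutCallback',
    'CheckpointSaver',
    'HuggingFaceCheckpointer',
]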