From 412688247e12f0a38de825c1d7bead63a715227f Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 6 Nov 2024 15:08:29 -0800
Subject: [PATCH 01/13] commit change

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 33a21fb7c8..29fe51e24d 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
     'accelerate>=0.25,<0.34',  # for HF inference `device_map`
     'transformers>=4.43.2,<4.44',
     'mosaicml-streaming>=0.9.0,<0.10',
-    'torch>=2.4.0,<2.4.1',
+    'torch>=2.5.0,<2.5.1',
     'datasets>=2.19,<2.20',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.2.0',

From 17922e604fa9b9bf1cd3a64185c54e23a61db9a8 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 6 Nov 2024 15:27:26 -0800
Subject: [PATCH 02/13] commit change

---
 .github/workflows/docker.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index c3fc9168ee..0c7c7557ff 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -17,11 +17,11 @@ jobs:
   strategy:
     matrix:
       include:
-        - name: "2.4.0_cu124"
-          base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+        - name: "2.5.1_cu124"
+          base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
           dep_groups: "[all]"
-        - name: "2.4.0_cu124_aws"
-          base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+        - name: "2.5.1_cu124_aws"
+          base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04-aws
           dep_groups: "[all]"

     steps:

From 3cfe4a120c910dced81388330d1653dd0ac2711c Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 6 Nov 2024 15:29:00 -0800
Subject: [PATCH 03/13] commit change

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 22faa70351..49cbc51742 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@
     'transformers>=4.43.2,<4.47',
     'mosaicml-streaming>=0.9.0,<0.10',
     'torch>=2.5.0,<2.5.1',
-    'datasets>=2.19,<2.20',
+    'datasets>=2.20,<2.21',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.2.0',
     'einops==0.8.0',

From f911d54d06fe9821fde82a63878c9553892cb7eb Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 6 Nov 2024 15:29:23 -0800
Subject: [PATCH 04/13] commit change

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 49cbc51742..1760db57d1 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@
     'transformers>=4.43.2,<4.47',
     'mosaicml-streaming>=0.9.0,<0.10',
     'torch>=2.5.0,<2.5.1',
-    'datasets>=2.20,<2.21',
+    'datasets>=2.20.0,<2.21',
    'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.2.0',
     'einops==0.8.0',
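All of the pins above follow the same pattern: an inclusive lower bound plus an exclusive upper bound. A minimal sketch of how such a range resolves, using the third-party packaging library (an assumption; this series does not add it as a dependency):

    from packaging.specifiers import SpecifierSet

    # The torch pin as of PATCH 01: accept 2.5.0 builds, nothing newer.
    spec = SpecifierSet('>=2.5.0,<2.5.1')
    for candidate in ('2.4.1', '2.5.0', '2.5.1'):
        print(candidate, '->', spec.contains(candidate))
    # 2.4.1 -> False, 2.5.0 -> True, 2.5.1 -> False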
From 0762d1a3ee3b3b4c674eb71f0355b3a63d3fab6d Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 6 Nov 2024 15:50:28 -0800
Subject: [PATCH 05/13] commit change

---
 .github/workflows/docker.yaml | 8 ++++----
 setup.py                      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
index 0c7c7557ff..43759724dd 100644
--- a/.github/workflows/docker.yaml
+++ b/.github/workflows/docker.yaml
@@ -17,11 +17,11 @@ jobs:
   strategy:
     matrix:
       include:
-        - name: "2.5.1_cu124"
-          base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
+        - name: "2.5.0_cu124"
+          base_image: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
           dep_groups: "[all]"
-        - name: "2.5.1_cu124_aws"
-          base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04-aws
+        - name: "2.5.0_cu124_aws"
+          base_image: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws
           dep_groups: "[all]"

     steps:

diff --git a/setup.py b/setup.py
index 1760db57d1..6e82203b3a 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
     'accelerate>=0.25,<0.34',  # for HF inference `device_map`
     'transformers>=4.43.2,<4.47',
     'mosaicml-streaming>=0.9.0,<0.10',
-    'torch>=2.5.0,<2.5.1',
+    'torch>=2.5.1,<2.5.2',
     'datasets>=2.20.0,<2.21',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.2.0',

From c75d80e7eae2baf74d85399df70e9f18d5a6fc4b Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Wed, 6 Nov 2024 17:02:42 -0800
Subject: [PATCH 06/13] commit change

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6e82203b3a..34ec7695dd 100644
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
     'accelerate>=0.25,<0.34',  # for HF inference `device_map`
     'transformers>=4.43.2,<4.47',
     'mosaicml-streaming>=0.9.0,<0.10',
-    'torch>=2.5.1,<2.5.2',
+    'torch>=2.5.o,<2.5.2',
     'datasets>=2.20.0,<2.21',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.2.0',
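The specifier written in PATCH 06, torch>=2.5.o,<2.5.2, is not PEP 440 compliant: the trailing 'o' is a typo for a digit, and PATCH 10 below replaces it with 2.5.1. A sketch of a guard that would surface this class of mistake in CI, assuming a recent release of the packaging library (which rejects non-PEP 440 versions inside specifiers):

    from packaging.requirements import InvalidRequirement, Requirement

    for line in ('torch>=2.5.o,<2.5.2', 'torch>=2.5.1,<2.5.2'):
        try:
            Requirement(line)  # parses the name and the specifier set
            print('ok:     ', line)
        except InvalidRequirement as err:
            print('invalid:', line, '-', err)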
From e584302d974d0125312629df78c6f30508b0840a Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 14 Nov 2024 15:34:13 -0800
Subject: [PATCH 07/13] commit change

---
 TUTORIAL.md                                     |  2 +-
 scripts/train/benchmarking/collect_results.py   |  2 +-
 scripts/train/benchmarking/submit_benchmarks.py | 16 ++++++++--------
 .../finetune_example/mpt-7b-arc-easy--gpu.yaml  | 15 ++++++++-------
 scripts/train/yamls/finetune/7b_dolly_sft.yaml  | 15 ++++++++-------
 .../yamls/finetune/t5-small_dolly_sft.yaml      | 15 ++++++++-------
 scripts/train/yamls/pretrain/gpt-neo-125m.yaml  | 15 ++++++++-------
 .../train/yamls/pretrain/gpt-neo-125m_eval.yaml | 15 ++++++++-------
 scripts/train/yamls/pretrain/gpt2-small.yaml    | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-125m.yaml      | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-13b.yaml       | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-1b.yaml        | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-30b.yaml       | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-350m.yaml      | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-3b.yaml        | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-70b.yaml       | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-760m.yaml      | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-7b.yaml        | 15 ++++++++-------
 scripts/train/yamls/pretrain/mpt-small-cpu.yaml | 15 ++++++++-------
 scripts/train/yamls/pretrain/opt-3b.yaml        | 15 ++++++++-------
 scripts/train/yamls/pretrain/testing-moe.yaml   | 17 +++++++++--------
 scripts/train/yamls/pretrain/testing.yaml       | 15 ++++++++-------
 tests/a_scripts/train/test_train_inputs.py      |  2 +-
 tests/models/test_model.py                      |  2 +-
 24 files changed, 165 insertions(+), 146 deletions(-)

diff --git a/TUTORIAL.md b/TUTORIAL.md
index d1751f62e3..13e7e813c9 100644
--- a/TUTORIAL.md
+++ b/TUTORIAL.md
@@ -283,7 +283,7 @@ The purpose of this section is probably pretty self-evident. You’ve got questi
 ### I’m running into an Out-Of-Memory (OOM) error. What do I do?
 - Hardware limitations may simply prevent some training/inference configurations, but here are some steps to troubleshooting OOMs.
 - First, confirm that you are running with the `composer` launcher, e.g. `composer train/train.py ...`, and using all N GPUs? If not, you may be running into OOMs because your model is not being FSDP-sharded across N devices.
-- Second, confirm that you have turned on FSDP for model sharding. For example, YAMLs for the `train.py` script should have a `fsdp_config` section. And you need to use `fsdp_config.sharding_strategy: FULL_SHARD` to enable parameter sharding.
+- Second, confirm that you have turned on FSDP for model sharding. For example, YAMLs for the `train.py` script should have a `parallelism_config` section which you should initialize with: `{'fsdp': fsdp_config}`. Use `fsdp_config.sharding_strategy: FULL_SHARD` to enable parameter sharding.
 - Third, confirm that you are using mixed precision, for example by setting `precision: amp_bf16`.
 - If you are still seeing OOMs, reduce the `device_train_microbatch_size` or `device_eval_batch_size` which will reduce the live activation memory.
 - If OOMs persist with `device_train_microbatch_size: 1` and `device_eval_batch_size: 1`, you may need to use activation checkpointing `fsdp_config.activation_checkpointing: true` (if you are not already) and, as a last resort, activation CPU offloading `fsdp_config.activation_cpu_offload: true`.

diff --git a/scripts/train/benchmarking/collect_results.py b/scripts/train/benchmarking/collect_results.py
index ef9d6ea534..6ba560bde0 100644
--- a/scripts/train/benchmarking/collect_results.py
+++ b/scripts/train/benchmarking/collect_results.py
@@ -129,7 +129,7 @@ def parse_run(run: msdk.Run) -> dict[str, Any]:
         run.submitted_config.parameters['precision']]

     gpu_type = run.gpu_type
-    fsdp_config = run.submitted_config.parameters['fsdp_config']
+    fsdp_config = run.submitted_config.parameters['parallelism_config']['fsdp']

     seq_len = run.submitted_config.parameters['max_seq_len']
     global_train_batch_size = run.submitted_config.parameters[

diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py
index 27f5c26c7d..7f1f17f64d 100644
--- a/scripts/train/benchmarking/submit_benchmarks.py
+++ b/scripts/train/benchmarking/submit_benchmarks.py
@@ -47,7 +47,7 @@ def parse_args():
     parser.add_argument(
         '--image',
         type=str,
-        default='mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04',
+        default='mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04',
     )
     parser.add_argument(
         '--git_branch',
@@ -387,24 +387,24 @@
     parameters['eval_interval'] = eval_interval

     parameters['precision'] = precision
-    parameters['fsdp_config']['mixed_precision'] = fsdp_config_mixed_precision
+    parameters['parallelism_config']['fsdp']['mixed_precision'] = fsdp_config_mixed_precision
     if fsdp_config_activation_checkpointing is not None:
-        parameters['fsdp_config']['activation_checkpointing'
+        parameters['parallelism_config']['fsdp']['activation_checkpointing'
                   ] = fsdp_config_activation_checkpointing
     if fsdp_config_shard_strategy is not None:
-        parameters['fsdp_config']['sharding_strategy'
+        parameters['parallelism_config']['fsdp']['sharding_strategy'
                   ] = fsdp_config_shard_strategy
     if fsdp_config_limit_all_gathers is not None:
-        parameters['fsdp_config']['limit_all_gathers'
+        parameters['parallelism_config']['fsdp']['limit_all_gathers'
                   ] = fsdp_config_limit_all_gathers
     if fsdp_config_forward_prefetch is not None:
-        parameters['fsdp_config']['forward_prefetch'
+        parameters['parallelism_config']['fsdp']['forward_prefetch'
                   ] = fsdp_config_forward_prefetch
     if fsdp_config_backward_prefetch is not None:
-        parameters['fsdp_config']['backward_prefetch'
+        parameters['parallelism_config']['fsdp']['backward_prefetch'
                   ] = fsdp_config_backward_prefetch
     if activation_cpu_offload is not None:
-        parameters['fsdp_config']['activation_cpu_offload'
+        parameters['parallelism_config']['fsdp']['activation_cpu_offload'
                   ] = activation_cpu_offload

     if wandb:
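Every assignment in the hunk above repeats the same two-level lookup. A small helper in this spirit (hypothetical, not part of the diff) would keep the call sites short and tolerate configs that still use the legacy top-level key:

    from typing import Any

    def get_fsdp_config(parameters: dict[str, Any]) -> dict[str, Any]:
        """Return the FSDP sub-config, accepting both the new
        parameters['parallelism_config']['fsdp'] layout and the
        legacy parameters['fsdp_config'] key."""
        if 'parallelism_config' in parameters:
            return parameters['parallelism_config'].setdefault('fsdp', {})
        return parameters.setdefault('fsdp_config', {})

    # Example call site, mirroring mod_parameters above:
    #   get_fsdp_config(parameters)['mixed_precision'] = fsdp_config_mixed_precision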
diff --git a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml
index e017ca54e7..8757303328 100644
--- a/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml
+++ b/scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml
@@ -85,13 +85,14 @@ device_train_microbatch_size: 1
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml
index f9edba3716..9f14d2bf35 100644
--- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml
+++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml
@@ -96,13 +96,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml
index d394018cfc..580b767160 100644
--- a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml
+++ b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml
@@ -74,13 +74,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml
index 5f02ba47e6..54eafb1397 100644
--- a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml
+++ b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml
@@ -90,13 +90,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
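The remaining hunks in this patch apply the same mechanical change to every training YAML: the flat fsdp_config block becomes an fsdp block nested one level down under parallelism_config. A sketch of reading the new layout with omegaconf, which these scripts already use (the path below is just one of the files touched):

    from omegaconf import OmegaConf

    cfg = OmegaConf.load('scripts/train/yamls/pretrain/gpt-neo-125m.yaml')
    fsdp_cfg = OmegaConf.select(cfg, 'parallelism_config.fsdp')  # None if absent
    if fsdp_cfg is not None:
        print(fsdp_cfg.sharding_strategy)          # FULL_SHARD
        print(fsdp_cfg.activation_checkpointing)   # false for this config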
diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
index fe9828b50a..fc1a2c779b 100644
--- a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
+++ b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
@@ -90,13 +90,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/gpt2-small.yaml b/scripts/train/yamls/pretrain/gpt2-small.yaml
index 458f6869da..c62df18761 100644
--- a/scripts/train/yamls/pretrain/gpt2-small.yaml
+++ b/scripts/train/yamls/pretrain/gpt2-small.yaml
@@ -90,13 +90,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml
index 644dfc26c1..7e4687c15e 100644
--- a/scripts/train/yamls/pretrain/mpt-125m.yaml
+++ b/scripts/train/yamls/pretrain/mpt-125m.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 16
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-13b.yaml b/scripts/train/yamls/pretrain/mpt-13b.yaml
index 41002bb45d..bbc60f82ed 100644
--- a/scripts/train/yamls/pretrain/mpt-13b.yaml
+++ b/scripts/train/yamls/pretrain/mpt-13b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-1b.yaml b/scripts/train/yamls/pretrain/mpt-1b.yaml
index 93b2a58a42..18a423818c 100644
--- a/scripts/train/yamls/pretrain/mpt-1b.yaml
+++ b/scripts/train/yamls/pretrain/mpt-1b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
diff --git a/scripts/train/yamls/pretrain/mpt-30b.yaml b/scripts/train/yamls/pretrain/mpt-30b.yaml
index 3627c36dd0..cff2cad567 100644
--- a/scripts/train/yamls/pretrain/mpt-30b.yaml
+++ b/scripts/train/yamls/pretrain/mpt-30b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-350m.yaml b/scripts/train/yamls/pretrain/mpt-350m.yaml
index ebe8da715f..8704014542 100644
--- a/scripts/train/yamls/pretrain/mpt-350m.yaml
+++ b/scripts/train/yamls/pretrain/mpt-350m.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-3b.yaml b/scripts/train/yamls/pretrain/mpt-3b.yaml
index 615f59ee3f..1efc71af67 100644
--- a/scripts/train/yamls/pretrain/mpt-3b.yaml
+++ b/scripts/train/yamls/pretrain/mpt-3b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-70b.yaml b/scripts/train/yamls/pretrain/mpt-70b.yaml
index 55450a8bfc..506804ac91 100644
--- a/scripts/train/yamls/pretrain/mpt-70b.yaml
+++ b/scripts/train/yamls/pretrain/mpt-70b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-760m.yaml b/scripts/train/yamls/pretrain/mpt-760m.yaml
index 5c1f0bdbdc..8a3344346d 100644
--- a/scripts/train/yamls/pretrain/mpt-760m.yaml
+++ b/scripts/train/yamls/pretrain/mpt-760m.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
diff --git a/scripts/train/yamls/pretrain/mpt-7b.yaml b/scripts/train/yamls/pretrain/mpt-7b.yaml
index b97f3f2c9e..ab7236173b 100644
--- a/scripts/train/yamls/pretrain/mpt-7b.yaml
+++ b/scripts/train/yamls/pretrain/mpt-7b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/mpt-small-cpu.yaml b/scripts/train/yamls/pretrain/mpt-small-cpu.yaml
index b579723002..0ea1b31ecd 100644
--- a/scripts/train/yamls/pretrain/mpt-small-cpu.yaml
+++ b/scripts/train/yamls/pretrain/mpt-small-cpu.yaml
@@ -90,13 +90,14 @@ device_train_microbatch_size: 16
 precision: fp32

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/opt-3b.yaml b/scripts/train/yamls/pretrain/opt-3b.yaml
index 31b7bf255b..ee6f8f738f 100644
--- a/scripts/train/yamls/pretrain/opt-3b.yaml
+++ b/scripts/train/yamls/pretrain/opt-3b.yaml
@@ -82,13 +82,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/scripts/train/yamls/pretrain/testing-moe.yaml b/scripts/train/yamls/pretrain/testing-moe.yaml
index ee9483ffd0..a80a4232ed 100644
--- a/scripts/train/yamls/pretrain/testing-moe.yaml
+++ b/scripts/train/yamls/pretrain/testing-moe.yaml
@@ -98,14 +98,15 @@ device_train_microbatch_size: 16
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
-  verbose: false
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true
+    verbose: false

 # Logging
 progress_bar: false
diff --git a/scripts/train/yamls/pretrain/testing.yaml b/scripts/train/yamls/pretrain/testing.yaml
index 2271be5d6d..e32b462645 100644
--- a/scripts/train/yamls/pretrain/testing.yaml
+++ b/scripts/train/yamls/pretrain/testing.yaml
@@ -89,13 +89,14 @@ device_train_microbatch_size: 16
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false

diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py
index 7fbc0e4fc6..a895e26975 100644
--- a/tests/a_scripts/train/test_train_inputs.py
+++ b/tests/a_scripts/train/test_train_inputs.py
@@ -103,7 +103,7 @@ def test_optional_misspelled_params_raise_error(
         'eval_first',
         'autoresume',
         'save_folder',
-        'fsdp_config',
+        'parallelism_config',
         'lora_config',
         'eval_loader',
         'icl_tasks_config',

diff --git a/tests/models/test_model.py b/tests/models/test_model.py
index 8a6290d5c4..4a1006839b 100644
--- a/tests/models/test_model.py
+++ b/tests/models/test_model.py
@@ -93,7 +93,7 @@ def _get_objs(
             test_cfg.model[k] = v

     # Read FSDP Config as a dict
-    fsdp_config = test_cfg.get('fsdp_config', None)
+    fsdp_config = test_cfg.get('parallelism_config', {}).get('fsdp', None)
     fsdp_config = om.to_container(
         fsdp_config,
         resolve=True,

From 71618c89da0898d64f3f773c0d2168bf112d8c86 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Thu, 14 Nov 2024 15:35:26 -0800
Subject: [PATCH 08/13] commit change

---
 setup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 34ec7695dd..b96bbf3a1e 100644
--- a/setup.py
+++ b/setup.py
@@ -52,7 +52,7 @@
 ]

 install_requires = [
-    'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.26.0,<0.27',
+    'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.27.0,<0.28',
     'mlflow>=2.14.1,<2.18',
     'accelerate>=0.25,<0.34',  # for HF inference `device_map`
     'transformers>=4.43.2,<4.47',
@@ -91,7 +91,7 @@
 ]

 extra_deps['databricks'] = [
-    'mosaicml[databricks]>=0.26.0,<0.27',
+    'mosaicml[databricks]>=0.27.0,<0.28',
     'numpy<2',
     'databricks-sql-connector>=3,<4',
     'databricks-connect==14.1.0',
@@ -99,7 +99,7 @@
 ]

 extra_deps['tensorboard'] = [
-    'mosaicml[tensorboard]>=0.26.0,<0.27',
+    'mosaicml[tensorboard]>=0.27.0,<0.28',
 ]

 # Flash 2 group kept for backwards compatibility
@@ -110,7 +110,7 @@
 extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])

 extra_deps['peft'] = [
-    'mosaicml[peft]>=0.26.0,<0.27',
+    'mosaicml[peft]>=0.27.0,<0.28',
 ]

 extra_deps['openai'] = [
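PATCH 08 moves every composer extra to the 0.27 line in lockstep with the torch bump. A quick runtime sanity check for an installed environment, sketched with the standard library's importlib.metadata ('mosaicml' is composer's distribution name on PyPI):

    from importlib.metadata import version

    installed = version('mosaicml')
    major, minor = (int(part) for part in installed.split('.')[:2])
    assert (major, minor) == (0, 27), f'expected composer 0.27.x, found {installed}'
    print(f'composer {installed} satisfies the setup.py pin')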
name: "cpu-2.4.0" pip_deps: "[all-cpu]" - container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04 + container: mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu20.04 markers: "not gpu" pytest_command: "coverage run -m pytest" steps: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 5b91d54442..c709ecf78a 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -23,7 +23,7 @@ jobs: matrix: include: - name: "gpu-2.4.0-1" - container: mosaicml/llm-foundry:2.4.0_cu124-latest + container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04 markers: "gpu" pip_deps: "[all]" pytest_command: "coverage run -m pytest" @@ -51,8 +51,8 @@ jobs: fail-fast: false matrix: include: - - name: "gpu-2.4.0-2" - container: mosaicml/llm-foundry:2.4.0_cu124-latest + - name: "gpu-2.5.0-2" + container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04 markers: "gpu" pip_deps: "[all]" pytest_command: "coverage run -m pytest" @@ -81,7 +81,7 @@ jobs: matrix: include: - name: "gpu-2.4.0-4" - container: mosaicml/llm-foundry:2.4.0_cu124-latest + container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04 markers: "gpu" pip_deps: "[all]" pytest_command: "coverage run -m pytest" From 82af8c73cbb779216ea33227b87a1e1b2e2f8649 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 15 Nov 2024 07:20:18 -0800 Subject: [PATCH 10/13] commit change --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b96bbf3a1e..ba61751573 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,7 @@ 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.47', 'mosaicml-streaming>=0.9.0,<0.10', - 'torch>=2.5.o,<2.5.2', + 'torch>=2.5.1,<2.5.2', 'datasets>=2.20.0,<2.21', 'fsspec==2023.6.0', # newer version results in a bug in datasets that duplicates data 'sentencepiece==0.2.0', From ac8a75d7a52fb2e9aa9e176be5086de06ca410c4 Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 15 Nov 2024 07:20:45 -0800 Subject: [PATCH 11/13] commit change --- .github/workflows/docker.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 43759724dd..0c7c7557ff 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -17,11 +17,11 @@ jobs: strategy: matrix: include: - - name: "2.5.0_cu124" - base_image: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04 + - name: "2.5.1_cu124" + base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04 dep_groups: "[all]" - - name: "2.5.0_cu124_aws" - base_image: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws + - name: "2.5.1_cu124_aws" + base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04-aws dep_groups: "[all]" steps: From f6f1471e14fa668181c22592d37866902e69576a Mon Sep 17 00:00:00 2001 From: Chuck Tang Date: Fri, 15 Nov 2024 07:31:45 -0800 Subject: [PATCH 12/13] commit change --- llmfoundry/models/mpt/modeling_mpt.py | 5 +++-- scripts/train/benchmarking/collect_results.py | 2 +- scripts/train/benchmarking/submit_benchmarks.py | 16 ++++++++-------- scripts/train/benchmarking/sweep.sh | 2 +- tests/data_utils.py | 2 +- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 0afb493844..ede9dfc22d 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1239,8 +1239,9 @@ def activation_checkpointing_fn(self, module: nn.Module) -> 
diff --git a/scripts/train/benchmarking/collect_results.py b/scripts/train/benchmarking/collect_results.py
index 6ba560bde0..ef9d6ea534 100644
--- a/scripts/train/benchmarking/collect_results.py
+++ b/scripts/train/benchmarking/collect_results.py
@@ -129,7 +129,7 @@ def parse_run(run: msdk.Run) -> dict[str, Any]:
         run.submitted_config.parameters['precision']]

     gpu_type = run.gpu_type
-    fsdp_config = run.submitted_config.parameters['parallelism_config']['fsdp']
+    fsdp_config = run.submitted_config.parameters['fsdp_config']

     seq_len = run.submitted_config.parameters['max_seq_len']
     global_train_batch_size = run.submitted_config.parameters[

diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py
index 7f1f17f64d..27f5c26c7d 100644
--- a/scripts/train/benchmarking/submit_benchmarks.py
+++ b/scripts/train/benchmarking/submit_benchmarks.py
@@ -47,7 +47,7 @@ def parse_args():
     parser.add_argument(
         '--image',
         type=str,
-        default='mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04',
+        default='mosaicml/pytorch:1.13.1_cu117-python3.10-ubuntu20.04',
     )
     parser.add_argument(
         '--git_branch',
@@ -387,24 +387,24 @@
     parameters['eval_interval'] = eval_interval

     parameters['precision'] = precision
-    parameters['parallelism_config']['fsdp']['mixed_precision'] = fsdp_config_mixed_precision
+    parameters['fsdp_config']['mixed_precision'] = fsdp_config_mixed_precision
     if fsdp_config_activation_checkpointing is not None:
-        parameters['parallelism_config']['fsdp']['activation_checkpointing'
+        parameters['fsdp_config']['activation_checkpointing'
                   ] = fsdp_config_activation_checkpointing
     if fsdp_config_shard_strategy is not None:
-        parameters['parallelism_config']['fsdp']['sharding_strategy'
+        parameters['fsdp_config']['sharding_strategy'
                   ] = fsdp_config_shard_strategy
     if fsdp_config_limit_all_gathers is not None:
-        parameters['parallelism_config']['fsdp']['limit_all_gathers'
+        parameters['fsdp_config']['limit_all_gathers'
                   ] = fsdp_config_limit_all_gathers
     if fsdp_config_forward_prefetch is not None:
-        parameters['parallelism_config']['fsdp']['forward_prefetch'
+        parameters['fsdp_config']['forward_prefetch'
                   ] = fsdp_config_forward_prefetch
     if fsdp_config_backward_prefetch is not None:
-        parameters['parallelism_config']['fsdp']['backward_prefetch'
+        parameters['fsdp_config']['backward_prefetch'
                   ] = fsdp_config_backward_prefetch
     if activation_cpu_offload is not None:
-        parameters['parallelism_config']['fsdp']['activation_cpu_offload'
+        parameters['fsdp_config']['activation_cpu_offload'
                   ] = activation_cpu_offload

     if wandb:

diff --git a/scripts/train/benchmarking/sweep.sh b/scripts/train/benchmarking/sweep.sh
index 97372ee6fd..5795257922 100755
--- a/scripts/train/benchmarking/sweep.sh
+++ b/scripts/train/benchmarking/sweep.sh
@@ -2,7 +2,7 @@

 PROJECT="tput"
 GIT_COMMIT="v0.0.4"
-IMAGE="mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04"
+IMAGE="mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04"
 CLUSTER_40GB=  # TODO

 for PRECISION in fp8 bf16

diff --git a/tests/data_utils.py b/tests/data_utils.py
index 57da51956e..168fd75717 100644
--- a/tests/data_utils.py
+++ b/tests/data_utils.py
@@ -305,7 +305,7 @@ def gpt_tiny_cfg(dataset_name: str, device: str):

     if device == 'cpu':
         test_cfg.model.init_device = 'cpu'
-        test_cfg.fsdp_config = None
+        test_cfg.parallelism_config = None
         test_cfg.model.attn_config.attn_impl = 'torch'
         test_cfg.model.loss_fn = 'torch_crossentropy'
         test_cfg.precision = 'fp32'
From bcad02114edbad8708952cf59fee276b13ab7d79 Mon Sep 17 00:00:00 2001
From: Chuck Tang
Date: Fri, 15 Nov 2024 07:31:57 -0800
Subject: [PATCH 13/13] commit change

---
 tests/a_scripts/eval/test_eval_inputs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/a_scripts/eval/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py
index 86243ba154..5502671749 100644
--- a/tests/a_scripts/eval/test_eval_inputs.py
+++ b/tests/a_scripts/eval/test_eval_inputs.py
@@ -59,7 +59,7 @@ def test_optional_mispelled_params_raise_error(
         'num_retries',
         'loggers',
         'eval_gauntlet',
-        'fsdp_config',
+        'parallelism_config',
         'eval_loader',
     ]
     old_cfg = copy.deepcopy(cfg)
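Both this test and test_train_inputs.py in PATCH 07 guard against silently ignored keys with the same pattern: start from a known-good config, add one plausible but unsupported key, and expect an error. A distilled sketch of that pattern (the names and the validator are illustrative, not the actual test code):

    import copy

    import pytest

    ALLOWED_TOP_LEVEL_KEYS = {'parallelism_config', 'eval_loader', 'loggers'}

    def validate_keys(cfg: dict) -> None:
        unknown = set(cfg) - ALLOWED_TOP_LEVEL_KEYS
        if unknown:
            raise ValueError(f'unknown config keys: {sorted(unknown)}')

    def test_misspelled_key_raises():
        good_cfg = {'parallelism_config': {'fsdp': {}}}
        bad_cfg = copy.deepcopy(good_cfg)
        bad_cfg['fsdp_confg'] = {}  # stale or misspelled key
        validate_keys(good_cfg)     # passes silently
        with pytest.raises(ValueError):
            validate_keys(bad_cfg)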