Torch 2.5 bump #1659

Closed
wants to merge 14 commits
8 changes: 4 additions & 4 deletions .github/workflows/docker.yaml
@@ -17,11 +17,11 @@ jobs:
     strategy:
       matrix:
         include:
-          - name: "2.4.0_cu124"
-            base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04
+          - name: "2.5.1_cu124"
+            base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
             dep_groups: "[all]"
-          - name: "2.4.0_cu124_aws"
-            base_image: mosaicml/pytorch:2.4.0_cu124-python3.11-ubuntu20.04-aws
+          - name: "2.5.1_cu124_aws"
+            base_image: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04-aws
             dep_groups: "[all]"
     steps:

2 changes: 1 addition & 1 deletion .github/workflows/pr-cpu.yaml
@@ -23,7 +23,7 @@ jobs:
         include:
           - name: "cpu-2.4.0"
             pip_deps: "[all-cpu]"
-            container: mosaicml/pytorch:2.4.0_cpu-python3.11-ubuntu20.04
+            container: mosaicml/pytorch:2.5.1_cpu-python3.11-ubuntu20.04
             markers: "not gpu"
             pytest_command: "coverage run -m pytest"
     steps:
8 changes: 4 additions & 4 deletions .github/workflows/pr-gpu.yaml
@@ -23,7 +23,7 @@ jobs:
       matrix:
         include:
           - name: "gpu-2.4.0-1"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+            container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
@@ -51,8 +51,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: "gpu-2.4.0-2"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+          - name: "gpu-2.5.0-2"
+            container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
@@ -81,7 +81,7 @@ jobs:
       matrix:
         include:
           - name: "gpu-2.4.0-4"
-            container: mosaicml/llm-foundry:2.4.0_cu124-latest
+            container: mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04
             markers: "gpu"
             pip_deps: "[all]"
             pytest_command: "coverage run -m pytest"
2 changes: 1 addition & 1 deletion TUTORIAL.md
@@ -283,7 +283,7 @@ The purpose of this section is probably pretty self-evident. You’ve got questi
 ### I’m running into an Out-Of-Memory (OOM) error. What do I do?
 - Hardware limitations may simply prevent some training/inference configurations, but here are some steps to troubleshooting OOMs.
 - First, confirm that you are running with the `composer` launcher, e.g. `composer train/train.py ...`, and using all N GPUs? If not, you may be running into OOMs because your model is not being FSDP-sharded across N devices.
-- Second, confirm that you have turned on FSDP for model sharding. For example, YAMLs for the `train.py` script should have a `fsdp_config` section. And you need to use `fsdp_config.sharding_strategy: FULL_SHARD` to enable parameter sharding.
+- Second, confirm that you have turned on FSDP for model sharding. For example, YAMLs for the `train.py` script should have a `parallelism_config` section which you should initialize with: `{'fsdp': fsdp_config}`. Use `fsdp_config.sharding_strategy: FULL_SHARD` to enable parameter sharding.
 - Third, confirm that you are using mixed precision, for example by setting `precision: amp_bf16`.
 - If you are still seeing OOMs, reduce the `device_train_microbatch_size` or `device_eval_batch_size` which will reduce the live activation memory.
 - If OOMs persist with `device_train_microbatch_size: 1` and `device_eval_batch_size: 1`, you may need to use activation checkpointing `fsdp_config.activation_checkpointing: true` (if you are not already) and, as a last resort, activation CPU offloading `fsdp_config.activation_cpu_offload: true`.
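
The updated FAQ answer above describes wrapping the existing FSDP settings under a `parallelism_config` key as `{'fsdp': fsdp_config}`. A minimal Python sketch of that wrapping, using the same keys as the YAMLs changed later in this PR (the specific values are illustrative, not prescribed by the PR):

    # Illustrative sketch only (not code from this PR): the FSDP settings that the
    # YAMLs in this PR move under `parallelism_config: fsdp:`.
    fsdp_config = {
        'sharding_strategy': 'FULL_SHARD',            # enables parameter sharding
        'mixed_precision': 'PURE',
        'activation_checkpointing': True,             # try if microbatch size 1 still OOMs
        'activation_checkpointing_reentrant': False,
        'activation_cpu_offload': False,              # last-resort OOM mitigation
        'limit_all_gathers': True,
    }

    # Old style passed `fsdp_config` directly; the new style nests it under 'fsdp'.
    parallelism_config = {'fsdp': fsdp_config}
    print(parallelism_config)
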
5 changes: 3 additions & 2 deletions llmfoundry/models/mpt/modeling_mpt.py
@@ -1239,8 +1239,9 @@ def activation_checkpointing_fn(self, module: nn.Module) -> bool:
             - a list of mixed integers and strings of first-n, middle-m, last-k, range-i-j

             An example in yaml config file:
-                fsdp_config:
-                    activation_checkpointing: true
+                parallelism_config:
+                    fsdp:
+                        activation_checkpointing: true
                 model:
                     activation_checkpointing_target:
                     {
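
The docstring above says `activation_checkpointing_target` accepts a list mixing integers with `first-n`, `middle-m`, `last-k`, and `range-i-j` strings. Purely to illustrate what such a spec denotes (this is not llm-foundry's actual parser, and the exact range and middle semantics are assumptions), a small resolver might look like:

    # Illustrative resolver for the layer-spec strings described in the docstring above.
    def resolve_block_ids(spec: list, n_layers: int) -> set:
        ids: set[int] = set()
        for item in spec:
            if isinstance(item, int):
                ids.add(item)                                  # a single block index
            elif item.startswith('first-'):
                ids.update(range(int(item.split('-')[1])))     # first n blocks
            elif item.startswith('last-'):
                k = int(item.split('-')[1])
                ids.update(range(n_layers - k, n_layers))      # last k blocks
            elif item.startswith('middle-'):
                m = int(item.split('-')[1])
                start = (n_layers - m) // 2
                ids.update(range(start, start + m))            # m centered blocks (assumption)
            elif item.startswith('range-'):
                _, i, j = item.split('-')
                ids.update(range(int(i), int(j)))              # assumption: end-exclusive
            else:
                raise ValueError(f'unrecognized layer spec: {item!r}')
        return ids

    # Example: a 12-block model, checkpointing the first 2, the last 2, and block 5.
    print(sorted(resolve_block_ids(['first-2', 'last-2', 5], n_layers=12)))
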
2 changes: 1 addition & 1 deletion scripts/train/benchmarking/sweep.sh
@@ -2,7 +2,7 @@

 PROJECT="tput"
 GIT_COMMIT="v0.0.4"
-IMAGE="mosaicml/pytorch:2.1.0_cu121-python3.10-ubuntu20.04"
+IMAGE="mosaicml/pytorch:2.5.1_cu124-python3.11-ubuntu20.04"
 CLUSTER_40GB= # TODO

 for PRECISION in fp8 bf16
15 changes: 8 additions & 7 deletions scripts/train/finetune_example/mpt-7b-arc-easy--gpu.yaml
@@ -85,13 +85,14 @@ device_train_microbatch_size: 1
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/finetune/7b_dolly_sft.yaml
@@ -96,13 +96,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/finetune/t5-small_dolly_sft.yaml
@@ -74,13 +74,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/gpt-neo-125m.yaml
@@ -90,13 +90,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
@@ -90,13 +90,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/gpt2-small.yaml
@@ -90,13 +90,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-125m.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 16
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-13b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-1b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-30b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-350m.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-3b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 8
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-70b.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: true
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
15 changes: 8 additions & 7 deletions scripts/train/yamls/pretrain/mpt-760m.yaml
@@ -88,13 +88,14 @@ device_train_microbatch_size: 4
 precision: amp_bf16

 # FSDP
-fsdp_config:
-  sharding_strategy: FULL_SHARD
-  mixed_precision: PURE
-  activation_checkpointing: false
-  activation_checkpointing_reentrant: false
-  activation_cpu_offload: false
-  limit_all_gathers: true
+parallelism_config:
+  fsdp:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true

 # Logging
 progress_bar: false
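
Every training YAML touched by this PR applies the same mechanical change: the former top-level `fsdp_config` block moves under `parallelism_config: fsdp:`. A hypothetical helper (not part of this PR) for checking which style a config file uses, assuming PyYAML is installed:

    # Hypothetical helper: report whether a training YAML uses the old top-level
    # `fsdp_config` block or the new `parallelism_config: fsdp:` nesting.
    import sys

    import yaml  # PyYAML


    def check_fsdp_nesting(path: str) -> None:
        with open(path) as f:
            cfg = yaml.safe_load(f) or {}
        if 'fsdp_config' in cfg:
            print(f'{path}: old style (top-level fsdp_config)')
        elif 'fsdp' in (cfg.get('parallelism_config') or {}):
            print(f'{path}: new style (parallelism_config.fsdp)')
        else:
            print(f'{path}: no FSDP settings found')


    if __name__ == '__main__':
        for yaml_path in sys.argv[1:]:
            check_fsdp_nesting(yaml_path)

For example (script name is hypothetical): python check_fsdp_nesting.py scripts/train/yamls/pretrain/mpt-125m.yaml
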