From 4db2a91dbd689029ae82fbbf0876c14003e5b474 Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Wed, 28 Aug 2024 14:47:50 +0000
Subject: [PATCH 01/68] add tp_strategy registry

---
 llmfoundry/models/utils/tp_strategy.py |  0
 llmfoundry/registry.py                 | 13 ++++++++++++-
 2 files changed, 12 insertions(+), 1 deletion(-)
 create mode 100644 llmfoundry/models/utils/tp_strategy.py

diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py
index cb2455a760..c150488ea6 100644
--- a/llmfoundry/registry.py
+++ b/llmfoundry/registry.py
@@ -1,12 +1,13 @@
 # Copyright 2024 MosaicML LLM Foundry authors
 # SPDX-License-Identifier: Apache-2.0
-from typing import Any, Callable, Iterable, Union
+from typing import Any, Callable, Iterable, Union, Dict
 
 from composer.core import Algorithm, Callback, DataSpec
 from composer.loggers import LoggerDestination
 from composer.models import ComposerModel
 from composer.optim import ComposerScheduler
 from torch.distributed.checkpoint import LoadPlanner, SavePlanner
+from torch.distributed.tensor.parallel.style import ParallelStyle
 from torch.optim import Optimizer
 from torch.utils.data import DataLoader as TorchDataloader
 from torch.utils.data import Dataset
@@ -389,6 +390,16 @@
     description=_save_planners_description,
 )
 
+_tp_strategy_description: str = 'NA'
+
+tp_strategy = create_registry(
+    'llmfoundry',
+    'tp_strategy',
+    generic_type=Callable[ComposerModel, Union[ParallelStyle, Dict[str, ParallelStyle]]],
+    entry_points=True,
+    description=_tp_strategy_description,
+)
+
 __all__ = [
     'loggers',
     'callbacks',

From 9f948b2e53644f1d3148f678a201f6abdc9d8e8e Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Wed, 28 Aug 2024 16:42:42 +0000
Subject: [PATCH 02/68] update

---
 llmfoundry/registry.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py
index c150488ea6..f6fa795317 100644
--- a/llmfoundry/registry.py
+++ b/llmfoundry/registry.py
@@ -390,12 +390,22 @@
     description=_save_planners_description,
 )
 
-_tp_strategy_description: str = 'NA'
+_tp_strategy_description = (
+    """The tp_strategy registry is used to register strategies for tensor parallelism.
+
+    Args:
+        model (ComposerModel): The model.
+
+    Returns:
+        layer_plan (Union[ParallelStyle, Dict[str, ParallelStyle]]): The plan used to parallelize the model.
+        model (ComposerModel): The model.
+    """
+)
 
 tp_strategy = create_registry(
     'llmfoundry',
     'tp_strategy',
-    generic_type=Callable[ComposerModel, Union[ParallelStyle, Dict[str, ParallelStyle]]],
+    generic_type=Callable[ComposerModel, tuple[Union[ParallelStyle, Dict[str, ParallelStyle]], ComposerModel]],
     entry_points=True,
     description=_tp_strategy_description,
 )
@@ -427,4 +437,5 @@
     'config_transforms',
     'load_planners',
     'save_planners',
+    'tp_strategy',
 ]

From 8229133242a1480d0af81fa3033a5f3db541da52 Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Wed, 28 Aug 2024 17:28:02 +0000
Subject: [PATCH 03/68] add ffn tp strategy

---
 llmfoundry/models/utils/tp_strategy.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py
index e69de29bb2..e66f199990 100644
--- a/llmfoundry/models/utils/tp_strategy.py
+++ b/llmfoundry/models/utils/tp_strategy.py
@@ -0,0 +1,24 @@
+from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel
+from torch.distributed._tensor import Replicate, Shard
+
+from llmfoundry.registry import tp_strategy
+
+def ffn_tp_strategy(model):
+    TP_LAYERS = set(['up_proj', 'down_proj'])
+
+    # validate that all TP_LAYERS are in model
+    tp_layers_in_model = set([name for name, _ in model.named_modules() if name in TP_LAYERS])
+    assert tp_layers_in_model == TP_LAYERS, f'This tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.'
+
+    # generate layer plan
+    layer_plan = {}
+    for name, _ in model.named_modules():
+        if 'up_proj' in name:
+            layer_plan[name] = ColwiseParallel(input_layouts=Replicate(), output_layouts=Shard(-1))
+        if 'down_proj' in name:
+            layer_plan[name] = RowwiseParallel(input_layouts=Shard(-1), output_layouts=Shard(0))
+
+    return layer_plan, model
+
+
+tp_strategy.register('ffn', func=ffn_tp_strategy)

From 3b201d87c4677706cbd6101f3dd5d9f34d955496 Mon Sep 17 00:00:00 2001
From: Eitan Turok
Date: Wed, 28 Aug 2024 18:36:27 +0000
Subject: [PATCH 04/68] only do layer_plan for now

---
 llmfoundry/models/utils/tp_strategy.py | 10 +++++++---
 llmfoundry/registry.py                 |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py
index e66f199990..9452c0e8ad 100644
--- a/llmfoundry/models/utils/tp_strategy.py
+++ b/llmfoundry/models/utils/tp_strategy.py
@@ -1,9 +1,13 @@
+from typing import Union, Dict
+
+from composer.models import ComposerModel
 from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel
+from torch.distributed.tensor.parallel.style import ParallelStyle
 from torch.distributed._tensor import Replicate, Shard
 
 from llmfoundry.registry import tp_strategy
 
-def ffn_tp_strategy(model):
+def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, ParallelStyle]]:
     TP_LAYERS = set(['up_proj', 'down_proj'])
 
     # validate that all TP_LAYERS are in model
@@ -11,14 +15,14 @@ def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, Para
     assert tp_layers_in_model == TP_LAYERS, f'This tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.'
# generate layer plan - layer_plan = {} + layer_plan: Dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): if 'up_proj' in name: layer_plan[name] = ColwiseParallel(input_layouts=Replicate(), output_layouts=Shard(-1)) if 'down_proj' in name: layer_plan[name] = RowwiseParallel(input_layouts=Shard(-1), output_layouts=Shard(0)) - return layer_plan, model + return layer_plan tp_strategy.register('ffn', func=ffn_tp_strategy) diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index f6fa795317..b57c77b0cc 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -405,7 +405,7 @@ tp_strategy = create_registry( 'llmfoundry', 'tp_strategy', - generic_type=Callable[ComposerModel, tuple[Union[ParallelStyle, Dict[str, ParallelStyle]], ComposerModel]], + generic_type=Callable[ComposerModel, Union[ParallelStyle, Dict[str, ParallelStyle]]], entry_points=True, description=_tp_strategy_description, ) From b0118f70d34d16808714899bd7bd7d3f514de9a2 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 16:19:10 +0000 Subject: [PATCH 05/68] add tp_config --- llmfoundry/utils/config_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index ba5c5941b8..c4e822cef8 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -120,6 +120,7 @@ class TrainConfig: # Distributed training parameters dist_timeout: Union[int, float] = 600.0 fsdp_config: Optional[dict[str, Any]] = None + tp_config: Optional[dict[str, Any]] = None # Evaluation parameters eval_interval: Union[int, str] = 1 From 4868c17afba85d114db5e2e16cf75e591592b5e0 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 16:19:28 +0000 Subject: [PATCH 06/68] build tp_strategy --- llmfoundry/utils/builders.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index f2d5cfc0f7..60b4dcc51e 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -14,6 +14,7 @@ Iterable, Optional, Union, + Dict, ) import torch @@ -25,6 +26,7 @@ from omegaconf import DictConfig from omegaconf import OmegaConf as om from torch.distributed.checkpoint import LoadPlanner, SavePlanner +from torch.distributed.tensor.parallel.style import ParallelStyle from torch.optim.optimizer import Optimizer from torchmetrics import Metric from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -52,6 +54,7 @@ 'build_tokenizer', 'build_composer_model', 'build_metric', + 'build_tp_strategy', ] @@ -701,3 +704,13 @@ def _validate_cfg(icl_cfg: dict[str, Any]): ) return evaluators, logger_keys + + +def build_tp_strategy( + name: str, +) -> Union[ParallelStyle, Dict[str, ParallelStyle]]: + return construct_from_registry( + name=name, + registry=registry.tp_strategy, + partial_function=True, + ) From f30cda8df68d9264ca7a5a75c84af13b6b391920 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 16:19:42 +0000 Subject: [PATCH 07/68] update --- llmfoundry/command_utils/train.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 8e6309175a..6977f30db8 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -5,7 +5,7 @@ import os import time import warnings -from typing import Any, Optional, Union +from typing import Any, Optional, Union, Dict import torch import torch.distributed @@ -17,7 +17,7 @@ 
TraceHandler, cyclic_schedule, ) -from composer.utils import dist, get_device, reproducibility +from composer.utils import dist, get_device, reproducibility, ParallelismConfig, TPConfig from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -42,6 +42,7 @@ build_save_planner, build_scheduler, build_tokenizer, + build_tp_strategy, ) from llmfoundry.utils.config_utils import ( TRAIN_CONFIG_KEYS, @@ -255,6 +256,8 @@ def train(cfg: DictConfig) -> Trainer: model_config = train_cfg.model train_loader_config = train_cfg.train_loader + parallelism_config: Optional[Union[Dict[str, Any], ParallelismConfig]] = None + # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[dict[str, Any]] = train_cfg.fsdp_config @@ -283,6 +286,14 @@ def train(cfg: DictConfig) -> Trainer: **save_planner_config, ) + # Optional tp config + tp_config: Optional[dict[str, Any]] = train_cfg.tp_config + if tp_config is not None: + if 'strategy' in tp_config: + strategy_layer_plan = build_tp_strategy(tp_config['strategy']) + tp_config['layer_plan'] |= strategy_layer_plan + + eval_loader_config = train_cfg.eval_loader if train_cfg.eval_loader is not None else train_cfg.eval_loaders icl_tasks_config = train_cfg.icl_tasks or train_cfg.icl_tasks_str eval_gauntlet_config = train_cfg.eval_gauntlet or train_cfg.eval_gauntlet_str @@ -527,7 +538,7 @@ def train(cfg: DictConfig) -> Trainer: precision=train_cfg.precision, algorithms=algorithms, device_train_microbatch_size=train_cfg.device_train_microbatch_size, - parallelism_config={'fsdp': fsdp_config}, + parallelism_config=parallelism_config, save_folder=train_cfg.save_folder, save_filename=save_filename, save_latest_filename=save_latest_filename, From 63d236c3b5775b9bb8b35f68ce278c74018fb910 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 18:20:56 +0000 Subject: [PATCH 08/68] update --- scripts/train/train.py | 5 +++++ scripts/train/yamls/pretrain/mpt-125m.yaml | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/scripts/train/train.py b/scripts/train/train.py index 728010d13a..c60f2ef886 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -3,7 +3,12 @@ import sys from llmfoundry.command_utils import train_from_yaml +from llmfoundry.utils.builders import build_tp_strategy, build_config_transforms +from icecream import install +install() if __name__ == '__main__': + build_tokenizer('EleutherAI/gpt-neox-20b', {}) + build_tp_strategy('ffn') yaml_path, args_list = sys.argv[1], sys.argv[2:] train_from_yaml(yaml_path, args_list) diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 644dfc26c1..20fc04766a 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -96,6 +96,10 @@ fsdp_config: activation_cpu_offload: false limit_all_gathers: true +# TP +tp_config: + strategy: ffn + # Logging progress_bar: false log_to_console: true From 6dcf5e69b502efda17ed42f153471bfd15e6b5de Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 18:22:31 +0000 Subject: [PATCH 09/68] update --- llmfoundry/command_utils/train.py | 15 ++++++++++----- llmfoundry/utils/builders.py | 6 +++--- llmfoundry/utils/registry_utils.py | 1 + scripts/train/train.py | 5 ----- tests/test_registry.py | 1 + 5 files changed, 15 insertions(+), 13 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 6977f30db8..92ddca826f 100644 --- a/llmfoundry/command_utils/train.py +++ 
b/llmfoundry/command_utils/train.py @@ -256,8 +256,6 @@ def train(cfg: DictConfig) -> Trainer: model_config = train_cfg.model train_loader_config = train_cfg.train_loader - parallelism_config: Optional[Union[Dict[str, Any], ParallelismConfig]] = None - # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[dict[str, Any]] = train_cfg.fsdp_config @@ -321,16 +319,23 @@ def train(cfg: DictConfig) -> Trainer: changing autoresume default to True...', ) - # Warn if fsdp is enabled but user only has 1 GPU - if dist.get_world_size() == 1 and fsdp_config is not None: + # Warn if fsdp or tp is enabled but user only has 1 GPU + if dist.get_world_size() == 1 and (fsdp_config is not None or tp_config is not None): + parallelism = '' + if fsdp_config is not None: + parallelism += 'FSDP' + if tp_config is not None: + parallelism += '+TP' if fsdp_config is not None else 'TP' warnings.warn( - 'FSDP is not applicable for single-GPU training. Reverting to DDP.', + f'{parallelism} is not applicable for single-GPU training. Reverting to DDP.', ) fsdp_config = None + tp_config = None # Initialize context init_context = process_init_device(model_config, fsdp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) + logged_cfg.update({'tp_config': tp_config}, merge=True) # Build tokenizer log.info('Building tokenizer...') diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 60b4dcc51e..1040d96245 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -14,7 +14,7 @@ Iterable, Optional, Union, - Dict, + Callable ) import torch @@ -708,9 +708,9 @@ def _validate_cfg(icl_cfg: dict[str, Any]): def build_tp_strategy( name: str, -) -> Union[ParallelStyle, Dict[str, ParallelStyle]]: +) -> Callable[ComposerModel, Union[ParallelStyle, dict[str, ParallelStyle]]]: return construct_from_registry( name=name, registry=registry.tp_strategy, - partial_function=True, + partial_function=False, ) diff --git a/llmfoundry/utils/registry_utils.py b/llmfoundry/utils/registry_utils.py index 74ba0996ef..53461f9f24 100644 --- a/llmfoundry/utils/registry_utils.py +++ b/llmfoundry/utils/registry_utils.py @@ -136,6 +136,7 @@ def construct_from_registry( if kwargs is None: kwargs = {} + ic(type(registry), name) registered_constructor = registry.get(name) if pre_validation_function is not None: diff --git a/scripts/train/train.py b/scripts/train/train.py index c60f2ef886..728010d13a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -3,12 +3,7 @@ import sys from llmfoundry.command_utils import train_from_yaml -from llmfoundry.utils.builders import build_tp_strategy, build_config_transforms -from icecream import install -install() if __name__ == '__main__': - build_tokenizer('EleutherAI/gpt-neox-20b', {}) - build_tp_strategy('ffn') yaml_path, args_list = sys.argv[1], sys.argv[2:] train_from_yaml(yaml_path, args_list) diff --git a/tests/test_registry.py b/tests/test_registry.py index 5108a7d46c..6c103d3504 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -47,6 +47,7 @@ def test_expected_registries_exist(): 'config_transforms', 'load_planners', 'save_planners', + 'tp_strategy', } assert existing_registries == expected_registry_names From eac4ad2eaad65ae73cc8b9ab040dc929158cb7eb Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 18:24:02 +0000 Subject: [PATCH 10/68] replace Dict with dict --- llmfoundry/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/registry.py 
b/llmfoundry/registry.py index b57c77b0cc..bbaac6f640 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -1,6 +1,6 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Iterable, Union, Dict +from typing import Any, Callable, Iterable, Union from composer.core import Algorithm, Callback, DataSpec from composer.loggers import LoggerDestination @@ -405,7 +405,7 @@ tp_strategy = create_registry( 'llmfoundry', 'tp_strategy', - generic_type=Callable[ComposerModel, Union[ParallelStyle, Dict[str, ParallelStyle]]], + generic_type=Callable[ComposerModel, Union[ParallelStyle, dict[str, ParallelStyle]]], entry_points=True, description=_tp_strategy_description, ) From 4bcaf9658fe0e6f69576dd329d3bc27ca5a5f0c9 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 18:27:01 +0000 Subject: [PATCH 11/68] update --- llmfoundry/command_utils/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 92ddca826f..5a94b285ad 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -288,7 +288,8 @@ def train(cfg: DictConfig) -> Trainer: tp_config: Optional[dict[str, Any]] = train_cfg.tp_config if tp_config is not None: if 'strategy' in tp_config: - strategy_layer_plan = build_tp_strategy(tp_config['strategy']) + tp_strategy = build_tp_strategy(tp_config['strategy']) + strategy_layer_plan = tp_strategy(ComposerModel) tp_config['layer_plan'] |= strategy_layer_plan From 092f2f272dfe9397b6c6a511ccbd6a7007a063fb Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 18:40:00 +0000 Subject: [PATCH 12/68] works! --- llmfoundry/models/__init__.py | 5 ++++- llmfoundry/models/utils/tp_strategy.py | 3 --- scripts/train/train.py | 5 +++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/llmfoundry/models/__init__.py b/llmfoundry/models/__init__.py index 827fe2ce56..9db62e5c6b 100644 --- a/llmfoundry/models/__init__.py +++ b/llmfoundry/models/__init__.py @@ -15,7 +15,8 @@ MPTModel, MPTPreTrainedModel, ) -from llmfoundry.registry import models +from llmfoundry.models.utils.tp_strategy import ffn_tp_strategy +from llmfoundry.registry import models, tp_strategy models.register('mpt_causal_lm', func=ComposerMPTCausalLM) models.register('hf_causal_lm', func=ComposerHFCausalLM) @@ -24,6 +25,7 @@ models.register('fmapi_causal_lm', func=FMAPICasualLMEvalWrapper) models.register('openai_chat', func=OpenAIChatAPIEvalWrapper) models.register('fmapi_chat', func=FMAPIChatAPIEvalWrapper) +tp_strategy.register('ffn', func=ffn_tp_strategy) __all__ = [ 'ComposerHFCausalLM', @@ -37,4 +39,5 @@ 'FMAPICasualLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', 'FMAPIChatAPIEvalWrapper', + 'ffn_tp_strategy', ] diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 9452c0e8ad..ed327227b3 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -23,6 +23,3 @@ def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, Para layer_plan[name] = RowwiseParallel(input_layouts=Shard(-1), output_layouts=Shard(0)) return layer_plan - - -tp_strategy.register('ffn', func=ffn_tp_strategy) diff --git a/scripts/train/train.py b/scripts/train/train.py index 728010d13a..7840bb0ee3 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -3,7 +3,12 @@ import sys from llmfoundry.command_utils import train_from_yaml +from 
llmfoundry.utils.builders import build_tp_strategy, build_save_planner +from icecream import install +install() if __name__ == '__main__': + # build_save_planner('dummy') + build_tp_strategy('ffn') yaml_path, args_list = sys.argv[1], sys.argv[2:] train_from_yaml(yaml_path, args_list) From a935bed3c96b34b8b56b3dadd5e6d31abb4cea7a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 18:57:09 +0000 Subject: [PATCH 13/68] tp_strategy does not require model --- llmfoundry/command_utils/train.py | 2 +- llmfoundry/models/utils/tp_strategy.py | 2 +- llmfoundry/registry.py | 2 +- llmfoundry/utils/builders.py | 2 +- llmfoundry/utils/registry_utils.py | 1 - scripts/train/train.py | 6 ++---- 6 files changed, 6 insertions(+), 9 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 5a94b285ad..97afc274c9 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -289,7 +289,7 @@ def train(cfg: DictConfig) -> Trainer: if tp_config is not None: if 'strategy' in tp_config: tp_strategy = build_tp_strategy(tp_config['strategy']) - strategy_layer_plan = tp_strategy(ComposerModel) + strategy_layer_plan = tp_strategy() tp_config['layer_plan'] |= strategy_layer_plan diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index ed327227b3..a1dc943aab 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -7,7 +7,7 @@ from llmfoundry.registry import tp_strategy -def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, ParallelStyle]]: +def ffn_tp_strategy() -> Union[ParallelStyle, Dict[str, ParallelStyle]]: TP_LAYERS = set(['up_proj', 'down_proj']) # validate that all TP_LAYERS are in model diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index bbaac6f640..49f3009d88 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -405,7 +405,7 @@ tp_strategy = create_registry( 'llmfoundry', 'tp_strategy', - generic_type=Callable[ComposerModel, Union[ParallelStyle, dict[str, ParallelStyle]]], + generic_type=Callable[[], Union[ParallelStyle, dict[str, ParallelStyle]]], entry_points=True, description=_tp_strategy_description, ) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 1040d96245..c5fbd8376d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -708,7 +708,7 @@ def _validate_cfg(icl_cfg: dict[str, Any]): def build_tp_strategy( name: str, -) -> Callable[ComposerModel, Union[ParallelStyle, dict[str, ParallelStyle]]]: +) -> Callable[[], Union[ParallelStyle, dict[str, ParallelStyle]]]: return construct_from_registry( name=name, registry=registry.tp_strategy, diff --git a/llmfoundry/utils/registry_utils.py b/llmfoundry/utils/registry_utils.py index 53461f9f24..74ba0996ef 100644 --- a/llmfoundry/utils/registry_utils.py +++ b/llmfoundry/utils/registry_utils.py @@ -136,7 +136,6 @@ def construct_from_registry( if kwargs is None: kwargs = {} - ic(type(registry), name) registered_constructor = registry.get(name) if pre_validation_function is not None: diff --git a/scripts/train/train.py b/scripts/train/train.py index 7840bb0ee3..934b868cce 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -3,12 +3,10 @@ import sys from llmfoundry.command_utils import train_from_yaml -from llmfoundry.utils.builders import build_tp_strategy, build_save_planner from icecream import install -install() + if __name__ == '__main__': - # build_save_planner('dummy') - 
build_tp_strategy('ffn') + install() yaml_path, args_list = sys.argv[1], sys.argv[2:] train_from_yaml(yaml_path, args_list) From 11c0492ef0cfe3e37de773f5cf0a62b534edfba8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 19:36:01 +0000 Subject: [PATCH 14/68] tp_strategy accepts model --- llmfoundry/command_utils/train.py | 25 +++++++++++++++---------- llmfoundry/models/utils/tp_strategy.py | 3 +-- llmfoundry/registry.py | 2 +- llmfoundry/utils/builders.py | 4 +++- 4 files changed, 20 insertions(+), 14 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 97afc274c9..ccc190998d 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -284,15 +284,6 @@ def train(cfg: DictConfig) -> Trainer: **save_planner_config, ) - # Optional tp config - tp_config: Optional[dict[str, Any]] = train_cfg.tp_config - if tp_config is not None: - if 'strategy' in tp_config: - tp_strategy = build_tp_strategy(tp_config['strategy']) - strategy_layer_plan = tp_strategy() - tp_config['layer_plan'] |= strategy_layer_plan - - eval_loader_config = train_cfg.eval_loader if train_cfg.eval_loader is not None else train_cfg.eval_loaders icl_tasks_config = train_cfg.icl_tasks or train_cfg.icl_tasks_str eval_gauntlet_config = train_cfg.eval_gauntlet or train_cfg.eval_gauntlet_str @@ -320,7 +311,10 @@ def train(cfg: DictConfig) -> Trainer: changing autoresume default to True...', ) - # Warn if fsdp or tp is enabled but user only has 1 GPU + # Optional tp config + tp_config: Optional[dict[str, Any]] = train_cfg.tp_config + + # Warn if FSDP or TP is enabled but user only has 1 GPU if dist.get_world_size() == 1 and (fsdp_config is not None or tp_config is not None): parallelism = '' if fsdp_config is not None: @@ -500,6 +494,17 @@ def train(cfg: DictConfig) -> Trainer: _log_num_params(model, logged_cfg) + # TP config + if tp_config is not None: + if 'layer_plan' not in tp_config: + tp_config['layer_plan'] = {} + if 'strategy' in tp_config: + strategy = tp_config['strategy'] + strategy_layer_plan = build_tp_strategy(strategy, model) + from icecream import ic + ic(strategy_layer_plan) + tp_config['layer_plan'] |= strategy_layer_plan + # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') optimizer_cfg = train_cfg.optimizer diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index a1dc943aab..e874f005b2 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -5,9 +5,8 @@ from torch.distributed.tensor.parallel.style import ParallelStyle from torch.distributed._tensor import Replicate, Shard -from llmfoundry.registry import tp_strategy -def ffn_tp_strategy() -> Union[ParallelStyle, Dict[str, ParallelStyle]]: +def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, ParallelStyle]]: TP_LAYERS = set(['up_proj', 'down_proj']) # validate that all TP_LAYERS are in model diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 49f3009d88..a0633f139a 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -405,7 +405,7 @@ tp_strategy = create_registry( 'llmfoundry', 'tp_strategy', - generic_type=Callable[[], Union[ParallelStyle, dict[str, ParallelStyle]]], + generic_type=Callable[[ComposerModel], Union[ParallelStyle, dict[str, ParallelStyle]]], entry_points=True, description=_tp_strategy_description, ) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index c5fbd8376d..5c7838206a 100644 --- 
a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -708,9 +708,11 @@ def _validate_cfg(icl_cfg: dict[str, Any]): def build_tp_strategy( name: str, -) -> Callable[[], Union[ParallelStyle, dict[str, ParallelStyle]]]: + model: ComposerModel, +) -> Union[ParallelStyle, dict[str, ParallelStyle]]: return construct_from_registry( name=name, registry=registry.tp_strategy, partial_function=False, + kwargs={'model': model}, ) From bddb165b07b0214cdb42d3437dfd9318b825c3b3 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 19:46:46 +0000 Subject: [PATCH 15/68] fix validation --- llmfoundry/models/utils/tp_strategy.py | 2 +- scripts/train/train.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index e874f005b2..10e984058e 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -10,7 +10,7 @@ def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, Para TP_LAYERS = set(['up_proj', 'down_proj']) # validate that all TP_LAYERS are in model - tp_layers_in_model = set([name for name, _ in model.named_modules() if name in TP_LAYERS]) + tp_layers_in_model = set([layer for layer in TP_LAYERS for name, _ in model.named_modules() if layer in name]) assert tp_layers_in_model == TP_LAYERS, f'This tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' # generate layer plan diff --git a/scripts/train/train.py b/scripts/train/train.py index 934b868cce..850b465bed 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -4,9 +4,8 @@ from llmfoundry.command_utils import train_from_yaml from icecream import install - +install() if __name__ == '__main__': - install() yaml_path, args_list = sys.argv[1], sys.argv[2:] train_from_yaml(yaml_path, args_list) From 309b96cf3e58aa95560524d6f2d325ab6f5c9d7d Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Mon, 9 Sep 2024 20:30:21 +0000 Subject: [PATCH 16/68] updatE --- TEST.py | 10 ++++++++++ llmfoundry/command_utils/train.py | 7 ++++--- llmfoundry/models/utils/tp_strategy.py | 2 +- 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 TEST.py diff --git a/TEST.py b/TEST.py new file mode 100644 index 0000000000..59be956a20 --- /dev/null +++ b/TEST.py @@ -0,0 +1,10 @@ +from torch.distributed.tensor.parallel import ColwiseParallel +from omegaconf import OmegaConf as om +from composer.utils import TPConfig + + +layer_plan = {'up_proj': ColwiseParallel} +tp_config = TPConfig(layer_plan) + +om.to_yaml(tp_config) + diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index ccc190998d..45433b48f9 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -499,12 +499,13 @@ def train(cfg: DictConfig) -> Trainer: if 'layer_plan' not in tp_config: tp_config['layer_plan'] = {} if 'strategy' in tp_config: - strategy = tp_config['strategy'] + strategy = tp_config.pop('strategy') strategy_layer_plan = build_tp_strategy(strategy, model) - from icecream import ic - ic(strategy_layer_plan) tp_config['layer_plan'] |= strategy_layer_plan + # Parallelism config + parallelism_config: ParallelismConfig = {'fsdp': fsdp_config, 'tp': tp_config} + # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') optimizer_cfg = train_cfg.optimizer diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 
10e984058e..f4805c52ed 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -11,7 +11,7 @@ def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, Para # validate that all TP_LAYERS are in model tp_layers_in_model = set([layer for layer in TP_LAYERS for name, _ in model.named_modules() if layer in name]) - assert tp_layers_in_model == TP_LAYERS, f'This tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' + assert tp_layers_in_model == TP_LAYERS, f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' # generate layer plan layer_plan: Dict[str, ParallelStyle] = {} From 42269169b679fe4db814b98ad9cbe174f4fc35e8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 10 Sep 2024 17:48:58 +0000 Subject: [PATCH 17/68] fix logging issue --- llmfoundry/command_utils/train.py | 5 +- llmfoundry/models/utils/tp_strategy.py | 118 +++++++++++++++++++-- llmfoundry/registry.py | 4 +- llmfoundry/utils/builders.py | 2 +- scripts/train/yamls/pretrain/mpt-125m.yaml | 3 +- 5 files changed, 118 insertions(+), 14 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 45433b48f9..81a22cdd3e 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -5,7 +5,8 @@ import os import time import warnings -from typing import Any, Optional, Union, Dict +from copy import deepcopy +from typing import Any, Optional, Union import torch import torch.distributed @@ -330,7 +331,7 @@ def train(cfg: DictConfig) -> Trainer: # Initialize context init_context = process_init_device(model_config, fsdp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) - logged_cfg.update({'tp_config': tp_config}, merge=True) + logged_cfg.update({'tp_config': deepcopy(tp_config)}, merge=True) # Build tokenizer log.info('Building tokenizer...') diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index f4805c52ed..d6e6100aed 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -1,12 +1,100 @@ -from typing import Union, Dict +from typing import Union, Dict, Optional from composer.models import ComposerModel -from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel +from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput from torch.distributed.tensor.parallel.style import ParallelStyle -from torch.distributed._tensor import Replicate, Shard +from torch.distributed._tensor import Replicate, Shard, Placement -def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, ParallelStyle]]: +# class SerializableColwiseParallel(ColwiseParallel): +# @classmethod +# def __struct_hook__(cls, *args, **kwargs): +# return 'torch.distributed.tensor.parallel.ColwiseParallel' + +# def __reduce__(self): +# return (SerializableColwiseParallel, ()) + + +# class SerializableRowwiseParallel(RowwiseParallel): +# @classmethod +# def __struct_hook__(cls, *args, **kwargs): +# return 'torch.distributed.tensor.parallel.RowwiseParallel' + +# def __reduce__(self): +# return (SerializableRowwiseParallel, ()) + + +class GatherColwiseParallel(ColwiseParallel): + """ColwiseParallel layer that allgathers inputs and optionally reshards outputs.""" + def __init__( + self, + *, + #input_layouts: 
Optional[Placement] = None, + output_layouts: Optional[Placement] = None, + use_local_output: bool = True + ): + super().__init__() + # Inputs over the TP dimension are sharded by device batches. + self.input_layouts = (Shard(0), ) + # All-gather inputs so that each GPU now has the same input activations. + self.desired_input_layouts = (Replicate(), ) + self.output_layouts = (output_layouts or Shard(-1), ) + self.use_local_output = use_local_output + + +def retrieve_layer_plan(model): + layer_plan = {} + for name, _ in model.named_modules(): + split_name = name.split('.') + # First block -- allgathers device batches from TP group devices. Residual stream activations + # will be full, allgathered device batches from all TP group devices. + if len(split_name) >= 2 and split_name[-2] == 'blocks' and split_name[-1] == '0': + print(f"using PrepareModuleInput, (in=Shard(0), desired_in=Replicate) for module {name}") + layer_plan[name] = PrepareModuleInput( + input_layouts = Shard(0), + desired_input_layouts = Replicate(), + use_local_output = True, + ) + # Wqkv -- inputs are all samples from TP group, but to keep KV cache unique to each device, + # we need to reshard device batches back to TP group devices. + elif 'Wqkv' in name: + print(f"using ColwiseParallel, (in=Replicate, out=Shard(0)) for module {name}") + layer_plan[name] = ColwiseParallel( + input_layouts = Replicate(), + output_layouts = Shard(0), + ) + # Attn out_proj -- inputs should again be allgathered from TP group devices and remain allgathered. + elif 'out_proj' in name: + print(f"using GatherColwiseParallel, (out=Replicate) for module {name}") + layer_plan[name] = GatherColwiseParallel( + output_layouts = Replicate(), + ) + # FFN up_proj -- inputs are already allgathered but should get sharded along the embedding dimension. + if 'up_proj' in name or 'gate_proj' in name: + print(f"using ColwiseParallel, [Replicate, Shard(-1)] for module {name}") + layer_plan[name] = ColwiseParallel( + input_layouts = Replicate(), + output_layouts = Shard(-1), + ) + # FFN down_proj -- inputs are sharded along the embedding dimension but should get allreduced. + if 'down_proj' in name: + print(f"using RowwiseParallel, [Shard(-1), Replicate] for module {name}") + layer_plan[name] = RowwiseParallel( + input_layouts = Shard(-1), + output_layouts = Replicate(), + ) + # LM head reshards device batches back to TP group devices. 
+ elif 'lm_head' in name: + print(f"using ColwiseParallel, [Replicate, Shard(0)] for module {name}") + layer_plan[name] = ColwiseParallel( + input_layouts = Replicate(), + output_layouts = Shard(0), + ) + return layer_plan + +def ffn_tp_strategy(model: ComposerModel) -> Dict[str, ParallelStyle]: + + return retrieve_layer_plan(model) TP_LAYERS = set(['up_proj', 'down_proj']) # validate that all TP_LAYERS are in model @@ -16,9 +104,23 @@ def ffn_tp_strategy(model: ComposerModel) -> Union[ParallelStyle, Dict[str, Para # generate layer plan layer_plan: Dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): - if 'up_proj' in name: - layer_plan[name] = ColwiseParallel(input_layouts=Replicate(), output_layouts=Shard(-1)) - if 'down_proj' in name: - layer_plan[name] = RowwiseParallel(input_layouts=Shard(-1), output_layouts=Shard(0)) + split_name = name.split('.') + + if len(split_name) >= 2 and split_name[-2] == 'blocks' and split_name[-1] == '0': + layer_plan[name] = PrepareModuleInput( + input_layouts = Shard(0), + desired_input_layouts = Replicate(), + use_local_output = True, + ) + elif 'up_proj' in name: + layer_plan[name] = ColwiseParallel( + input_layouts = Replicate(), + output_layouts = Shard(-1), + ) + elif 'down_proj' in name: + layer_plan[name] = RowwiseParallel( + input_layouts = Shard(-1), + output_layouts = Replicate(), + ) return layer_plan diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index a0633f139a..6c20e30f21 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -397,7 +397,7 @@ model (ComposerModel): The model. Returns: - layer_plan (Union[ParallelStyle, Dict[str, ParallelStyle]]): The plan used to parallelize the model. + layer_plan (Dict[str, ParallelStyle]): The plan used to parallelize the model. model (ComposerModel): The model. 
""" ) @@ -405,7 +405,7 @@ tp_strategy = create_registry( 'llmfoundry', 'tp_strategy', - generic_type=Callable[[ComposerModel], Union[ParallelStyle, dict[str, ParallelStyle]]], + generic_type=Callable[[ComposerModel], dict[str, ParallelStyle]], entry_points=True, description=_tp_strategy_description, ) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 5c7838206a..fd54bfb88f 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -709,7 +709,7 @@ def _validate_cfg(icl_cfg: dict[str, Any]): def build_tp_strategy( name: str, model: ComposerModel, -) -> Union[ParallelStyle, dict[str, ParallelStyle]]: +) -> dict[str, ParallelStyle]: return construct_from_registry( name=name, registry=registry.tp_strategy, diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 20fc04766a..6ca19517e8 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -16,7 +16,7 @@ model: init_device: meta d_model: 768 n_heads: 12 - n_layers: 12 + n_layers: 1 expansion_ratio: 4 max_seq_len: ${variables.max_seq_len} vocab_size: 50368 @@ -99,6 +99,7 @@ fsdp_config: # TP tp_config: strategy: ffn + tensor_parallel_degree: 2 # Logging progress_bar: false From 86b1b81752597857c3ce85470eaed9b22223d768 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 10 Sep 2024 18:27:09 +0000 Subject: [PATCH 18/68] fix yaml --- scripts/train/yamls/pretrain/mpt-125m.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 6ca19517e8..08a4cb9b8c 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -21,7 +21,8 @@ model: max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: - attn_impl: flash + attn_impl: torch + loss_fn: torch_crossentropy # Tokenizer tokenizer: From f2d6571c000784538a1df0c3de72a77cf229201e Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 10 Sep 2024 18:44:30 +0000 Subject: [PATCH 19/68] add error --- llmfoundry/utils/config_utils.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index c4e822cef8..214481149d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -502,7 +502,7 @@ def update_batch_size_info(cfg: dict[str, Any]) -> dict[str, Any]: return cfg -def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict]): +def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict], tp_config: Optional[dict]): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors # when multiple GPUs are available. 
@@ -534,9 +534,12 @@ def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict]): # Set defaults for mixed initialization fsdp_config.setdefault('load_monolith_rank0_only', True) + if tp_config is not None and 'ffn_config' in model_cfg and model_cfg['ffn_config'].get('ffn_type', None) in ffns_with_megablocks: + raise ValueError('Cannot use TP with MoEs.') + # Set ffn_config.device_mesh using fsdp_config - if fsdp_config is not None and 'ffn_config' in model_cfg and model_cfg[ - 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: + if fsdp_config is not None and 'ffn_config' in model_cfg and model_cfg['ffn_config'].get('ffn_type', None) in ffns_with_megablocks: + shard_degree = fsdp_config.get('data_parallel_shard_degree', None) replicate_degree = fsdp_config.get( 'data_parallel_replicate_degree', From c6cee7f42461e8005400793acb3aee7504383d2c Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 10 Sep 2024 19:40:38 +0000 Subject: [PATCH 20/68] it works! --- llmfoundry/command_utils/train.py | 2 +- llmfoundry/models/mpt/modeling_mpt.py | 9 +- llmfoundry/models/utils/tp_strategy.py | 101 +-------------------- scripts/train/yamls/pretrain/mpt-125m.yaml | 2 +- 4 files changed, 12 insertions(+), 102 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 81a22cdd3e..537e7120b2 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -329,7 +329,7 @@ def train(cfg: DictConfig) -> Trainer: tp_config = None # Initialize context - init_context = process_init_device(model_config, fsdp_config) + init_context = process_init_device(model_config, fsdp_config, tp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) logged_cfg.update({'tp_config': deepcopy(tp_config)}, merge=True) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 06b64101c3..f3f37e1ac9 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1337,9 +1337,14 @@ def compute_loss_from_logits( ) -> torch.Tensor: targets = get_targets(labels) if shift_labels else labels + outputs = outputs.logits.view(-1, outputs.logits.size(-1)) + targets =targets.view(-1) + + ic(targets.shape, outputs.shape, labels.shape) + losses = loss_fn( - outputs.logits.view(-1, outputs.logits.size(-1)), - targets.view(-1), + outputs, + targets, ) if torch.all(targets == loss_fn.ignore_index): diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index d6e6100aed..7c5ec24db4 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -1,100 +1,13 @@ from typing import Union, Dict, Optional from composer.models import ComposerModel -from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput +from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel from torch.distributed.tensor.parallel.style import ParallelStyle -from torch.distributed._tensor import Replicate, Shard, Placement +from torch.distributed._tensor import Replicate, Shard -# class SerializableColwiseParallel(ColwiseParallel): -# @classmethod -# def __struct_hook__(cls, *args, **kwargs): -# return 'torch.distributed.tensor.parallel.ColwiseParallel' - -# def __reduce__(self): -# return (SerializableColwiseParallel, ()) - - -# class SerializableRowwiseParallel(RowwiseParallel): -# @classmethod -# def __struct_hook__(cls, *args, **kwargs): -# return 
'torch.distributed.tensor.parallel.RowwiseParallel' - -# def __reduce__(self): -# return (SerializableRowwiseParallel, ()) - - -class GatherColwiseParallel(ColwiseParallel): - """ColwiseParallel layer that allgathers inputs and optionally reshards outputs.""" - def __init__( - self, - *, - #input_layouts: Optional[Placement] = None, - output_layouts: Optional[Placement] = None, - use_local_output: bool = True - ): - super().__init__() - # Inputs over the TP dimension are sharded by device batches. - self.input_layouts = (Shard(0), ) - # All-gather inputs so that each GPU now has the same input activations. - self.desired_input_layouts = (Replicate(), ) - self.output_layouts = (output_layouts or Shard(-1), ) - self.use_local_output = use_local_output - - -def retrieve_layer_plan(model): - layer_plan = {} - for name, _ in model.named_modules(): - split_name = name.split('.') - # First block -- allgathers device batches from TP group devices. Residual stream activations - # will be full, allgathered device batches from all TP group devices. - if len(split_name) >= 2 and split_name[-2] == 'blocks' and split_name[-1] == '0': - print(f"using PrepareModuleInput, (in=Shard(0), desired_in=Replicate) for module {name}") - layer_plan[name] = PrepareModuleInput( - input_layouts = Shard(0), - desired_input_layouts = Replicate(), - use_local_output = True, - ) - # Wqkv -- inputs are all samples from TP group, but to keep KV cache unique to each device, - # we need to reshard device batches back to TP group devices. - elif 'Wqkv' in name: - print(f"using ColwiseParallel, (in=Replicate, out=Shard(0)) for module {name}") - layer_plan[name] = ColwiseParallel( - input_layouts = Replicate(), - output_layouts = Shard(0), - ) - # Attn out_proj -- inputs should again be allgathered from TP group devices and remain allgathered. - elif 'out_proj' in name: - print(f"using GatherColwiseParallel, (out=Replicate) for module {name}") - layer_plan[name] = GatherColwiseParallel( - output_layouts = Replicate(), - ) - # FFN up_proj -- inputs are already allgathered but should get sharded along the embedding dimension. - if 'up_proj' in name or 'gate_proj' in name: - print(f"using ColwiseParallel, [Replicate, Shard(-1)] for module {name}") - layer_plan[name] = ColwiseParallel( - input_layouts = Replicate(), - output_layouts = Shard(-1), - ) - # FFN down_proj -- inputs are sharded along the embedding dimension but should get allreduced. - if 'down_proj' in name: - print(f"using RowwiseParallel, [Shard(-1), Replicate] for module {name}") - layer_plan[name] = RowwiseParallel( - input_layouts = Shard(-1), - output_layouts = Replicate(), - ) - # LM head reshards device batches back to TP group devices. 
- elif 'lm_head' in name: - print(f"using ColwiseParallel, [Replicate, Shard(0)] for module {name}") - layer_plan[name] = ColwiseParallel( - input_layouts = Replicate(), - output_layouts = Shard(0), - ) - return layer_plan - def ffn_tp_strategy(model: ComposerModel) -> Dict[str, ParallelStyle]: - return retrieve_layer_plan(model) TP_LAYERS = set(['up_proj', 'down_proj']) # validate that all TP_LAYERS are in model @@ -104,15 +17,7 @@ def ffn_tp_strategy(model: ComposerModel) -> Dict[str, ParallelStyle]: # generate layer plan layer_plan: Dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): - split_name = name.split('.') - - if len(split_name) >= 2 and split_name[-2] == 'blocks' and split_name[-1] == '0': - layer_plan[name] = PrepareModuleInput( - input_layouts = Shard(0), - desired_input_layouts = Replicate(), - use_local_output = True, - ) - elif 'up_proj' in name: + if 'up_proj' in name: layer_plan[name] = ColwiseParallel( input_layouts = Replicate(), output_layouts = Shard(-1), diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 08a4cb9b8c..f3961ba0e0 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -15,7 +15,7 @@ model: name: mpt_causal_lm init_device: meta d_model: 768 - n_heads: 12 + n_heads: 1 n_layers: 1 expansion_ratio: 4 max_seq_len: ${variables.max_seq_len} From 8040aa7e3f75cbceafd0fb53ab4c5be09e2fb185 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 10 Sep 2024 19:52:17 +0000 Subject: [PATCH 21/68] works with original yaml --- llmfoundry/models/mpt/modeling_mpt.py | 9 ++------- scripts/train/yamls/pretrain/mpt-125m.yaml | 7 +++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index f3f37e1ac9..06b64101c3 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -1337,14 +1337,9 @@ def compute_loss_from_logits( ) -> torch.Tensor: targets = get_targets(labels) if shift_labels else labels - outputs = outputs.logits.view(-1, outputs.logits.size(-1)) - targets =targets.view(-1) - - ic(targets.shape, outputs.shape, labels.shape) - losses = loss_fn( - outputs, - targets, + outputs.logits.view(-1, outputs.logits.size(-1)), + targets.view(-1), ) if torch.all(targets == loss_fn.ignore_index): diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index f3961ba0e0..ea25cac667 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -15,14 +15,13 @@ model: name: mpt_causal_lm init_device: meta d_model: 768 - n_heads: 1 - n_layers: 1 + n_heads: 12 + n_layers: 12 expansion_ratio: 4 max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: - attn_impl: torch - loss_fn: torch_crossentropy + attn_impl: flash # Tokenizer tokenizer: From f384de7bebd8fae7e4ff1d61e780c7991595416a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Tue, 10 Sep 2024 20:36:09 +0000 Subject: [PATCH 22/68] update --- llmfoundry/models/utils/tp_strategy.py | 6 ++---- scripts/train/train.py | 3 +-- scripts/train/yamls/pretrain/mpt-125m.yaml | 5 +++++ 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 7c5ec24db4..9f2d5d4055 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -1,12 +1,10 @@ -from typing import 
Union, Dict, Optional - from composer.models import ComposerModel from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel from torch.distributed.tensor.parallel.style import ParallelStyle from torch.distributed._tensor import Replicate, Shard -def ffn_tp_strategy(model: ComposerModel) -> Dict[str, ParallelStyle]: +def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: TP_LAYERS = set(['up_proj', 'down_proj']) @@ -15,7 +13,7 @@ def ffn_tp_strategy(model: ComposerModel) -> Dict[str, ParallelStyle]: assert tp_layers_in_model == TP_LAYERS, f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' # generate layer plan - layer_plan: Dict[str, ParallelStyle] = {} + layer_plan: dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): if 'up_proj' in name: layer_plan[name] = ColwiseParallel( diff --git a/scripts/train/train.py b/scripts/train/train.py index 850b465bed..288f409abb 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -3,8 +3,7 @@ import sys from llmfoundry.command_utils import train_from_yaml -from icecream import install -install() + if __name__ == '__main__': yaml_path, args_list = sys.argv[1], sys.argv[2:] diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index ea25cac667..65cb079db0 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -113,6 +113,11 @@ callbacks: memory_monitor: {} runtime_estimator: {} +loggers: + mlflow: + experiment_name: tp + # model_registry_prefix: datasets.eitanturok.${run_name} + # loggers: # wandb: {} From 3b5f9355a3fad6b6cb2591bfbf42e3d14d223424 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 12 Sep 2024 15:27:05 +0000 Subject: [PATCH 23/68] delete file --- TEST.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 TEST.py diff --git a/TEST.py b/TEST.py deleted file mode 100644 index 59be956a20..0000000000 --- a/TEST.py +++ /dev/null @@ -1,10 +0,0 @@ -from torch.distributed.tensor.parallel import ColwiseParallel -from omegaconf import OmegaConf as om -from composer.utils import TPConfig - - -layer_plan = {'up_proj': ColwiseParallel} -tp_config = TPConfig(layer_plan) - -om.to_yaml(tp_config) - From 76adc485f2dcc032244b1779d2ccea8d18462c10 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 12 Sep 2024 16:04:26 +0000 Subject: [PATCH 24/68] add replication --- scripts/train/yamls/pretrain/mpt-125m.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 65cb079db0..c85ef41b36 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -35,7 +35,7 @@ train_loader: dataset: local: ${variables.data_local} remote: ${variables.data_remote} - split: train + split: train_small shuffle: true max_seq_len: ${variables.max_seq_len} shuffle_seed: ${variables.global_seed} @@ -47,10 +47,11 @@ eval_loader: dataset: local: ${variables.data_local} remote: ${variables.data_remote} - split: val + split: val_small shuffle: false max_seq_len: ${variables.max_seq_len} shuffle_seed: ${variables.global_seed} + replication: 2 drop_last: false num_workers: 8 @@ -98,7 +99,7 @@ fsdp_config: # TP tp_config: - strategy: ffn + strategy: megatron tensor_parallel_degree: 2 # Logging From 90264eaf66def1a711c90fc41f4c649ddc844310 Mon Sep 17 00:00:00 
2001 From: Eitan Turok Date: Wed, 25 Sep 2024 14:24:16 +0000 Subject: [PATCH 25/68] tp-strat does not crash --- llmfoundry/command_utils/train.py | 8 ++++++-- llmfoundry/models/__init__.py | 1 + llmfoundry/models/utils/tp_strategy.py | 19 +++++++++++++++---- scripts/train/yamls/pretrain/mpt-125m.yaml | 6 +++--- 4 files changed, 25 insertions(+), 9 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index d29865598b..41098d10cf 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -19,7 +19,8 @@ TraceHandler, cyclic_schedule, ) -from composer.utils import dist, get_device, reproducibility, ParallelismConfig, TPConfig +from composer.utils import dist, get_device, reproducibility, ParallelismConfig, TPConfig, FSDPConfig +from icecream import install from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -64,6 +65,7 @@ log = logging.getLogger(__name__) +install() def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" @@ -524,7 +526,9 @@ def train(cfg: DictConfig) -> Trainer: tp_config['layer_plan'] |= strategy_layer_plan # Parallelism config - parallelism_config: ParallelismConfig = {'fsdp': fsdp_config, 'tp': tp_config} + tp = TPConfig(**tp_config) + fsdp = FSDPConfig(**fsdp_config) + parallelism_config = ParallelismConfig(fsdp=fsdp, tp=tp) # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') diff --git a/llmfoundry/models/__init__.py b/llmfoundry/models/__init__.py index 9db62e5c6b..ba60e774ed 100644 --- a/llmfoundry/models/__init__.py +++ b/llmfoundry/models/__init__.py @@ -27,6 +27,7 @@ models.register('fmapi_chat', func=FMAPIChatAPIEvalWrapper) tp_strategy.register('ffn', func=ffn_tp_strategy) + __all__ = [ 'ComposerHFCausalLM', 'ComposerHFT5', diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 9f2d5d4055..c6af5d0a37 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -1,7 +1,9 @@ +from typing import Optional + from composer.models import ComposerModel -from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel +from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput from torch.distributed.tensor.parallel.style import ParallelStyle -from torch.distributed._tensor import Replicate, Shard +from torch.distributed._tensor import Replicate, Shard, Placement def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: @@ -15,15 +17,24 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: # generate layer plan layer_plan: dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): - if 'up_proj' in name: + ic(name) + if name.split('.')[-2:] == ['ffn', 'up_proj']: layer_plan[name] = ColwiseParallel( input_layouts = Replicate(), output_layouts = Shard(-1), ) - elif 'down_proj' in name: + elif name.split('.')[-2:] == ['ffn', 'down_proj']: layer_plan[name] = RowwiseParallel( input_layouts = Shard(-1), output_layouts = Replicate(), ) + elif name.split('.')[-1] == 'ffn': + layer_plan[name] = PrepareModuleInput( + input_layouts = Shard(0), + desired_input_layouts = Replicate(), + use_local_output = True, + ) return layer_plan + + diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index c85ef41b36..5f6f55c06a 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ 
b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -75,10 +75,10 @@ algorithms: clipping_type: norm clipping_threshold: 1.0 -max_duration: 4800ba # ~ 2.5B tokens +max_duration: 100ba eval_interval: 500ba eval_first: false -eval_subset_num_batches: -1 +eval_subset_num_batches: 0 global_train_batch_size: 256 # System @@ -99,7 +99,7 @@ fsdp_config: # TP tp_config: - strategy: megatron + strategy: ffn tensor_parallel_degree: 2 # Logging From 7b73db53eec53b177565db22fb958a5c49083d50 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 14:45:50 +0000 Subject: [PATCH 26/68] debug print --- llmfoundry/command_utils/train.py | 4 +++- llmfoundry/models/layers/blocks.py | 1 + llmfoundry/models/mpt/modeling_mpt.py | 4 ++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 41098d10cf..5cca197415 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -20,7 +20,7 @@ cyclic_schedule, ) from composer.utils import dist, get_device, reproducibility, ParallelismConfig, TPConfig, FSDPConfig -from icecream import install +from icecream import install, ic from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -65,6 +65,8 @@ log = logging.getLogger(__name__) + +ic.configureOutput(includeContext=True) install() def validate_config(train_config: TrainConfig): diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index c88cf33d1b..d191b1e277 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -206,6 +206,7 @@ def forward( m = self.norm_2(x) n = self.apply_ffn(attention_mask, m) + ic(x.shape, x.device, n.shape, n.device) # In the following line we move the `x` tensor to the same devices as the output of ffn layer. This operation should be a no-op during training. # This is done to fix pipeline parallel generation using hf.generate. Please see this comment for details: https://github.com/mosaicml/llm-foundry/pull/1332#issue-2386827204 x = x.to(device=n.device, diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 06b64101c3..770af5a9cf 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -965,6 +965,8 @@ def forward( extra_kwargs = {} if prev_layer_key_value is not None: extra_kwargs['prev_layer_key_value'] = prev_layer_key_value + + ic(type(x), type(past_key_value), type(attn_bias), type(attention_mask)) x, attn_weights, present = block( x, past_key_value=past_key_value, @@ -1142,6 +1144,7 @@ def forward( use_cache if use_cache is not None else self.config.use_cache ) + ic(type(input_ids)) outputs = self.transformer( input_ids=input_ids, past_key_values=past_key_values, @@ -1154,6 +1157,7 @@ def forward( inputs_embeds=inputs_embeds, position_ids=position_ids, ) + ic(outputs) if self.lm_head is not None: logits = self.lm_head(outputs.last_hidden_state) From 19f5477e4ee6ae8909a34fd208a195465e1587be Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 15:12:47 +0000 Subject: [PATCH 27/68] it works! 
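With this change the 'ffn' strategy produces a working layer plan: up_proj is column-sharded, down_proj is row-sharded, and PrepareModuleInput redistributes the block input from a batch shard to a replica before the FFN runs. As a rough illustration of what that sharding means, the snippet below applies the same Colwise/Rowwise pairing to a toy two-layer FFN with PyTorch's parallelize_module. It is a minimal sketch, not Foundry code: the FFN class, the 2-way mesh, the script name, and the torchrun launch are assumptions, and it uses the simpler replicate-in/replicate-out layouts rather than the Shard(0) batch layouts used in this patch.

# sketch: run with `torchrun --nproc_per_node=2 tp_ffn_sketch.py` on 2 GPUs (hypothetical script name)
import os
import torch
import torch.nn as nn
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, parallelize_module
from torch.distributed._tensor import Replicate, Shard

class FFN(nn.Module):  # toy stand-in for an MPT block's ffn, an assumption for this sketch
    def __init__(self, d_model: int = 128, expansion_ratio: int = 4):
        super().__init__()
        self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, bias=False)
        self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.down_proj(torch.relu(self.up_proj(x)))

if __name__ == '__main__':
    torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', 0)))
    mesh = init_device_mesh('cuda', (2,))  # one mesh dimension of size 2 -> 2-way tensor parallelism
    torch.manual_seed(0)  # identical weights and input on every rank
    ffn = FFN().cuda()
    layer_plan = {
        # shard up_proj over columns: replicated input, output sharded on the hidden dim
        'up_proj': ColwiseParallel(input_layouts=Replicate(), output_layouts=Shard(-1)),
        # shard down_proj over rows: hidden-sharded input, allreduced back to a replica
        'down_proj': RowwiseParallel(input_layouts=Shard(-1), output_layouts=Replicate()),
    }
    ffn = parallelize_module(ffn, mesh, layer_plan)
    x = torch.randn(8, 16, 128, device='cuda')
    y = ffn(x)
    print(y.shape)  # (8, 16, 128) on every rank

Each rank ends up holding half of up_proj's output features and half of down_proj's input features, so the matmuls are split while the block's output stays numerically equivalent to the unsharded FFN.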
--- llmfoundry/command_utils/train.py | 2 ++ llmfoundry/models/layers/blocks.py | 2 +- llmfoundry/models/utils/tp_strategy.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 5cca197415..8e05bb390b 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -69,6 +69,8 @@ ic.configureOutput(includeContext=True) install() +ic.disable() + def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" # Validate the rest of the config diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index d191b1e277..cf07e453fa 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -206,7 +206,7 @@ def forward( m = self.norm_2(x) n = self.apply_ffn(attention_mask, m) - ic(x.shape, x.device, n.shape, n.device) + ic(x.shape, x.device, m.shape, m.device, n.shape, n.device) # In the following line we move the `x` tensor to the same devices as the output of ffn layer. This operation should be a no-op during training. # This is done to fix pipeline parallel generation using hf.generate. Please see this comment for details: https://github.com/mosaicml/llm-foundry/pull/1332#issue-2386827204 x = x.to(device=n.device, diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index c6af5d0a37..2e2b253c87 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -26,7 +26,8 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: elif name.split('.')[-2:] == ['ffn', 'down_proj']: layer_plan[name] = RowwiseParallel( input_layouts = Shard(-1), - output_layouts = Replicate(), + # output_layouts = Replicate(), + output_layouts = Shard(0), ) elif name.split('.')[-1] == 'ffn': layer_plan[name] = PrepareModuleInput( From 2b9664ee7b6ef9188b4efba9d955bbd0cd4b443a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 15:27:58 +0000 Subject: [PATCH 28/68] better init for parallelism_config --- llmfoundry/command_utils/train.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 8e05bb390b..fe128158c3 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -530,9 +530,7 @@ def train(cfg: DictConfig) -> Trainer: tp_config['layer_plan'] |= strategy_layer_plan # Parallelism config - tp = TPConfig(**tp_config) - fsdp = FSDPConfig(**fsdp_config) - parallelism_config = ParallelismConfig(fsdp=fsdp, tp=tp) + parallelism_config = dict(fsdp=fsdp_config, tp=tp_config) # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') From 9f7365ed54463c248f9c799d671b941a0fc52287 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 17:39:49 +0000 Subject: [PATCH 29/68] remove comment --- llmfoundry/models/utils/tp_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 2e2b253c87..68fd73762b 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -26,7 +26,6 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: elif name.split('.')[-2:] == ['ffn', 'down_proj']: layer_plan[name] = RowwiseParallel( input_layouts = Shard(-1), - # output_layouts = Replicate(), output_layouts = Shard(0), ) elif name.split('.')[-1] == 
'ffn': From a382b5ef88d1d4e9ed1ad76308fd32dec88afe8d Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 18:44:15 +0000 Subject: [PATCH 30/68] init tests --- llmfoundry/models/utils/tp_strategy.py | 2 +- tests/models/utils/test_tp_strategy.py | 59 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 tests/models/utils/test_tp_strategy.py diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 68fd73762b..158daa6425 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -3,7 +3,7 @@ from composer.models import ComposerModel from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput from torch.distributed.tensor.parallel.style import ParallelStyle -from torch.distributed._tensor import Replicate, Shard, Placement +from torch.distributed._tensor import Replicate, Shard def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py new file mode 100644 index 0000000000..2a53fe895e --- /dev/null +++ b/tests/models/utils/test_tp_strategy.py @@ -0,0 +1,59 @@ +from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput +from torch.distributed._tensor import Replicate, Shard + +from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM +from llmfoundry.utils.builders import build_tp_strategy + + +from icecream import install +install() + +def test_tp_strategy(): + + tp_config = { + 'strategy': 'ffn', + } + + model_cfg = { + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 3, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + 'attn_config': { + 'attn_impl': 'flash', + }, + } + + _expected_layer_plan = { + 'ffn': PrepareModuleInput( + input_layouts = Shard(0), + desired_input_layouts = Replicate(), + use_local_output = True, + ), + 'ffn.down_proj': RowwiseParallel( + input_layouts = Shard(-1), + output_layouts = Shard(0), + ), + 'ffn.up_proj': ColwiseParallel( + input_layouts = Replicate(), + output_layouts = Shard(-1), + ) + } + expected_layer_plan = {f'model.transformer.blocks.{layer_idx}.{name}': layer_plan for name, layer_plan in _expected_layer_plan.items() for layer_idx in range(model_cfg['n_layers'])} + ic(expected_layer_plan) + + model = ComposerMPTCausalLM(**model_cfg) + strategy_layer_plan = build_tp_strategy(tp_config['strategy'], model) + ic(strategy_layer_plan) + + for (n1, lp1), (n2, lp2) in zip(expected_layer_plan.items(), strategy_layer_plan.items()): + ic(n1, n2) + ic(lp1, lp2) + assert n1 == n2 + assert lp1 == lp2 + +if __name__ == '__main__': + test_tp_strategy() From 3fa5189144abc41eeddf9e62e2706d61381a9b9e Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 18:47:48 +0000 Subject: [PATCH 31/68] tests pass --- tests/models/utils/test_tp_strategy.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 2a53fe895e..48d561bb5c 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -43,17 +43,22 @@ def test_tp_strategy(): ) } expected_layer_plan = {f'model.transformer.blocks.{layer_idx}.{name}': layer_plan for name, layer_plan in _expected_layer_plan.items() for layer_idx in range(model_cfg['n_layers'])} - ic(expected_layer_plan) model = 
ComposerMPTCausalLM(**model_cfg) - strategy_layer_plan = build_tp_strategy(tp_config['strategy'], model) - ic(strategy_layer_plan) + layer_plan = build_tp_strategy(tp_config['strategy'], model) - for (n1, lp1), (n2, lp2) in zip(expected_layer_plan.items(), strategy_layer_plan.items()): - ic(n1, n2) - ic(lp1, lp2) + # Compare expected and actual layer plan + for (n1, lp1), (n2, lp2) in zip(sorted(expected_layer_plan.items()), sorted(layer_plan.items())): assert n1 == n2 - assert lp1 == lp2 + assert type(lp1) == type(lp2) + if isinstance(lp1, PrepareModuleInput): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.desired_input_layouts == lp2.desired_input_layouts + assert lp1.use_local_output == lp2.use_local_output + elif isinstance(lp1, ColwiseParallel) or isinstance(lp1, RowwiseParallel): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.output_layouts == lp2.output_layouts + assert lp1.use_local_output == lp2.use_local_output if __name__ == '__main__': test_tp_strategy() From 5e58dbc62bdd04eff838a954cc6cca90fae9d370 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 20:12:54 +0000 Subject: [PATCH 32/68] add test for tp training --- tests/models/utils/test_tp_strategy.py | 65 +++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 48d561bb5c..0d4d04a023 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -1,5 +1,10 @@ +import pytest + +from composer.trainer.trainer import Trainer +from composer.utils import dist from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput from torch.distributed._tensor import Replicate, Shard +from torch.utils.data import DataLoader from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy @@ -8,8 +13,9 @@ from icecream import install install() -def test_tp_strategy(): +def test_ffn_tp_strategy_layer_plan(): + # Actual layer plan tp_config = { 'strategy': 'ffn', } @@ -26,7 +32,10 @@ def test_tp_strategy(): 'attn_impl': 'flash', }, } + model = ComposerMPTCausalLM(**model_cfg) + layer_plan = build_tp_strategy(tp_config['strategy'], model) + # Expected layer plan _expected_layer_plan = { 'ffn': PrepareModuleInput( input_layouts = Shard(0), @@ -44,10 +53,7 @@ def test_tp_strategy(): } expected_layer_plan = {f'model.transformer.blocks.{layer_idx}.{name}': layer_plan for name, layer_plan in _expected_layer_plan.items() for layer_idx in range(model_cfg['n_layers'])} - model = ComposerMPTCausalLM(**model_cfg) - layer_plan = build_tp_strategy(tp_config['strategy'], model) - - # Compare expected and actual layer plan + # Compare expected and actual layer plans for (n1, lp1), (n2, lp2) in zip(sorted(expected_layer_plan.items()), sorted(layer_plan.items())): assert n1 == n2 assert type(lp1) == type(lp2) @@ -60,5 +66,50 @@ def test_tp_strategy(): assert lp1.output_layouts == lp2.output_layouts assert lp1.use_local_output == lp2.use_local_output -if __name__ == '__main__': - test_tp_strategy() + +@pytest.mark.gpu +@pytest.mark.world_size(4) +def test_tp_train(tiny_ft_dataloader: DataLoader): + ic(tiny_ft_dataloader) + + # Actual layer plan + tp_config = { + 'strategy': 'ffn', + } + + model_cfg = { + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 3, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + 'attn_config': { + 
'attn_impl': 'flash', + }, + } + model = ComposerMPTCausalLM(**model_cfg) + layer_plan = build_tp_strategy(tp_config['strategy'], model) + + # dataset = RandomClassificationDataset(size=8) + # dataloader = DataLoader(dataset, batch_size=2, sampler=dist.get_sampler(dataset)) + + trainer = Trainer( + model=model, + train_dataloader=tiny_ft_dataloader, + parallelism_config={ + 'tp': { + 'layer_plan': layer_plan, + 'tensor_parallel_degree': 2, + }, + 'fsdp': {}, + }, + max_duration='3ba', + ) + + trainer.fit() + + +# if __name__ == '__main__': +# test_tp_train() From 3a6dec61ffc9b9b23a1e3f163725712906fb85f2 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 20:13:10 +0000 Subject: [PATCH 33/68] remove test for tp training b/c in composer --- tests/models/utils/test_tp_strategy.py | 48 -------------------------- 1 file changed, 48 deletions(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 0d4d04a023..0e0079ad7e 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -65,51 +65,3 @@ def test_ffn_tp_strategy_layer_plan(): assert lp1.input_layouts == lp2.input_layouts assert lp1.output_layouts == lp2.output_layouts assert lp1.use_local_output == lp2.use_local_output - - -@pytest.mark.gpu -@pytest.mark.world_size(4) -def test_tp_train(tiny_ft_dataloader: DataLoader): - ic(tiny_ft_dataloader) - - # Actual layer plan - tp_config = { - 'strategy': 'ffn', - } - - model_cfg = { - 'name': 'mpt_causal_lm', - 'd_model': 128, - 'n_heads': 4, - 'n_layers': 3, - 'expansion_ratio': 1, - 'max_seq_len': 16, - 'vocab_size': 50368, - 'attn_config': { - 'attn_impl': 'flash', - }, - } - model = ComposerMPTCausalLM(**model_cfg) - layer_plan = build_tp_strategy(tp_config['strategy'], model) - - # dataset = RandomClassificationDataset(size=8) - # dataloader = DataLoader(dataset, batch_size=2, sampler=dist.get_sampler(dataset)) - - trainer = Trainer( - model=model, - train_dataloader=tiny_ft_dataloader, - parallelism_config={ - 'tp': { - 'layer_plan': layer_plan, - 'tensor_parallel_degree': 2, - }, - 'fsdp': {}, - }, - max_duration='3ba', - ) - - trainer.fit() - - -# if __name__ == '__main__': -# test_tp_train() From 1f025d89e58a7ef9f17ca8ff13595f4c2247b5f3 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 20:14:48 +0000 Subject: [PATCH 34/68] remove icecream --- llmfoundry/command_utils/train.py | 1 - tests/models/utils/test_tp_strategy.py | 8 -------- 2 files changed, 9 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index fe128158c3..3b94e57a6c 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -20,7 +20,6 @@ cyclic_schedule, ) from composer.utils import dist, get_device, reproducibility, ParallelismConfig, TPConfig, FSDPConfig -from icecream import install, ic from omegaconf import DictConfig from omegaconf import OmegaConf as om diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 0e0079ad7e..0166c9ea64 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -1,18 +1,10 @@ -import pytest - -from composer.trainer.trainer import Trainer -from composer.utils import dist from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput from torch.distributed._tensor import Replicate, Shard -from torch.utils.data import DataLoader from llmfoundry.models.mpt.modeling_mpt import 
ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy -from icecream import install -install() - def test_ffn_tp_strategy_layer_plan(): # Actual layer plan From c2d309af7fcc9700033f204ad064881537d4f78b Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 20:16:39 +0000 Subject: [PATCH 35/68] remove more icrecream --- llmfoundry/command_utils/train.py | 5 ----- llmfoundry/models/layers/blocks.py | 1 - llmfoundry/models/mpt/modeling_mpt.py | 3 --- llmfoundry/models/utils/tp_strategy.py | 1 - 4 files changed, 10 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 3b94e57a6c..46d7e2c61a 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -65,11 +65,6 @@ log = logging.getLogger(__name__) -ic.configureOutput(includeContext=True) -install() - -ic.disable() - def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" # Validate the rest of the config diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index cf07e453fa..c88cf33d1b 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -206,7 +206,6 @@ def forward( m = self.norm_2(x) n = self.apply_ffn(attention_mask, m) - ic(x.shape, x.device, m.shape, m.device, n.shape, n.device) # In the following line we move the `x` tensor to the same devices as the output of ffn layer. This operation should be a no-op during training. # This is done to fix pipeline parallel generation using hf.generate. Please see this comment for details: https://github.com/mosaicml/llm-foundry/pull/1332#issue-2386827204 x = x.to(device=n.device, diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 770af5a9cf..e3f6f0575e 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -966,7 +966,6 @@ def forward( if prev_layer_key_value is not None: extra_kwargs['prev_layer_key_value'] = prev_layer_key_value - ic(type(x), type(past_key_value), type(attn_bias), type(attention_mask)) x, attn_weights, present = block( x, past_key_value=past_key_value, @@ -1144,7 +1143,6 @@ def forward( use_cache if use_cache is not None else self.config.use_cache ) - ic(type(input_ids)) outputs = self.transformer( input_ids=input_ids, past_key_values=past_key_values, @@ -1157,7 +1155,6 @@ def forward( inputs_embeds=inputs_embeds, position_ids=position_ids, ) - ic(outputs) if self.lm_head is not None: logits = self.lm_head(outputs.last_hidden_state) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 158daa6425..35c87b86dd 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -17,7 +17,6 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: # generate layer plan layer_plan: dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): - ic(name) if name.split('.')[-2:] == ['ffn', 'up_proj']: layer_plan[name] = ColwiseParallel( input_layouts = Replicate(), From 6d65a29136398d82c9bcc9cd3be07c0ce4e52fd5 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 20:23:04 +0000 Subject: [PATCH 36/68] style --- llmfoundry/command_utils/train.py | 8 +++-- llmfoundry/models/__init__.py | 1 - llmfoundry/models/utils/tp_strategy.py | 34 ++++++++++-------- llmfoundry/utils/builders.py | 1 - llmfoundry/utils/config_utils.py | 11 ++++-- scripts/train/train.py | 1 - 
tests/models/utils/test_tp_strategy.py | 48 +++++++++++++++++--------- 7 files changed, 63 insertions(+), 41 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 46d7e2c61a..2eef4bdc30 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -19,7 +19,8 @@ TraceHandler, cyclic_schedule, ) -from composer.utils import dist, get_device, reproducibility, ParallelismConfig, TPConfig, FSDPConfig +from composer.utils import (FSDPConfig, ParallelismConfig, TPConfig, dist, + get_device, reproducibility,) from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -335,7 +336,8 @@ def train(cfg: DictConfig) -> Trainer: tp_config: Optional[dict[str, Any]] = train_cfg.tp_config # Warn if FSDP or TP is enabled but user only has 1 GPU - if dist.get_world_size() == 1 and (fsdp_config is not None or tp_config is not None): + if dist.get_world_size( + ) == 1 and (fsdp_config is not None or tp_config is not None): parallelism = '' if fsdp_config is not None: parallelism += 'FSDP' @@ -524,7 +526,7 @@ def train(cfg: DictConfig) -> Trainer: tp_config['layer_plan'] |= strategy_layer_plan # Parallelism config - parallelism_config = dict(fsdp=fsdp_config, tp=tp_config) + parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') diff --git a/llmfoundry/models/__init__.py b/llmfoundry/models/__init__.py index ba60e774ed..9db62e5c6b 100644 --- a/llmfoundry/models/__init__.py +++ b/llmfoundry/models/__init__.py @@ -27,7 +27,6 @@ models.register('fmapi_chat', func=FMAPIChatAPIEvalWrapper) tp_strategy.register('ffn', func=ffn_tp_strategy) - __all__ = [ 'ComposerHFCausalLM', 'ComposerHFT5', diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 35c87b86dd..1d7a199efc 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -1,17 +1,23 @@ -from typing import Optional + +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 from composer.models import ComposerModel -from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput -from torch.distributed.tensor.parallel.style import ParallelStyle from torch.distributed._tensor import Replicate, Shard +from torch.distributed.tensor.parallel import (ColwiseParallel, + PrepareModuleInput, + RowwiseParallel,) +from torch.distributed.tensor.parallel.style import ParallelStyle def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: - - TP_LAYERS = set(['up_proj', 'down_proj']) + TP_LAYERS = {'up_proj', 'down_proj'} # validate that all TP_LAYERS are in model - tp_layers_in_model = set([layer for layer in TP_LAYERS for name, _ in model.named_modules() if layer in name]) + tp_layers_in_model = set([ + layer for layer in TP_LAYERS for name, _ in model.named_modules() + if layer in name + ]) assert tp_layers_in_model == TP_LAYERS, f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' 
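# Rough sketch of how a strategy registered with tp_strategy.register('ffn', func=ffn_tp_strategy)
# is consumed at train time; the names below come from the train.py and builder changes elsewhere
# in this series, and the exact wiring there may differ slightly:
#
#     strategy = tp_config.pop('strategy')                      # e.g. 'ffn' from the YAML tp_config
#     strategy_layer_plan = build_tp_strategy(strategy, model)  # resolves to ffn_tp_strategy(model)
#     tp_config['layer_plan'] = strategy_layer_plan
#     parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config}
#
# so this function only has to return a {module_name: ParallelStyle} mapping for the trainer to apply.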
# generate layer plan @@ -19,21 +25,19 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: for name, _ in model.named_modules(): if name.split('.')[-2:] == ['ffn', 'up_proj']: layer_plan[name] = ColwiseParallel( - input_layouts = Replicate(), - output_layouts = Shard(-1), + input_layouts=Replicate(), + output_layouts=Shard(-1), ) elif name.split('.')[-2:] == ['ffn', 'down_proj']: layer_plan[name] = RowwiseParallel( - input_layouts = Shard(-1), - output_layouts = Shard(0), + input_layouts=Shard(-1), + output_layouts=Shard(0), ) elif name.split('.')[-1] == 'ffn': layer_plan[name] = PrepareModuleInput( - input_layouts = Shard(0), - desired_input_layouts = Replicate(), - use_local_output = True, + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, ) return layer_plan - - diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index fd54bfb88f..631d25bc60 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -14,7 +14,6 @@ Iterable, Optional, Union, - Callable ) import torch diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 214481149d..3d2d152715 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -502,7 +502,10 @@ def update_batch_size_info(cfg: dict[str, Any]) -> dict[str, Any]: return cfg -def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict], tp_config: Optional[dict]): +def process_init_device( + model_cfg: dict[str, Any], fsdp_config: Optional[dict], + tp_config: Optional[dict] +): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors # when multiple GPUs are available. @@ -534,11 +537,13 @@ def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict], # Set defaults for mixed initialization fsdp_config.setdefault('load_monolith_rank0_only', True) - if tp_config is not None and 'ffn_config' in model_cfg and model_cfg['ffn_config'].get('ffn_type', None) in ffns_with_megablocks: + if tp_config is not None and 'ffn_config' in model_cfg and model_cfg[ + 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: raise ValueError('Cannot use TP with MoEs.') # Set ffn_config.device_mesh using fsdp_config - if fsdp_config is not None and 'ffn_config' in model_cfg and model_cfg['ffn_config'].get('ffn_type', None) in ffns_with_megablocks: + if fsdp_config is not None and 'ffn_config' in model_cfg and model_cfg[ + 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: shard_degree = fsdp_config.get('data_parallel_shard_degree', None) replicate_degree = fsdp_config.get( diff --git a/scripts/train/train.py b/scripts/train/train.py index 288f409abb..728010d13a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -4,7 +4,6 @@ from llmfoundry.command_utils import train_from_yaml - if __name__ == '__main__': yaml_path, args_list = sys.argv[1], sys.argv[2:] train_from_yaml(yaml_path, args_list) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 0166c9ea64..6893550e6a 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -1,16 +1,20 @@ -from torch.distributed.tensor.parallel import ColwiseParallel, RowwiseParallel, PrepareModuleInput +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + from torch.distributed._tensor import Replicate, Shard +from 
torch.distributed.tensor.parallel import (ColwiseParallel, + PrepareModuleInput, + RowwiseParallel,) from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy def test_ffn_tp_strategy_layer_plan(): - # Actual layer plan tp_config = { 'strategy': 'ffn', - } + } model_cfg = { 'name': 'mpt_causal_lm', @@ -29,31 +33,41 @@ def test_ffn_tp_strategy_layer_plan(): # Expected layer plan _expected_layer_plan = { - 'ffn': PrepareModuleInput( - input_layouts = Shard(0), - desired_input_layouts = Replicate(), - use_local_output = True, + 'ffn': + PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, ), - 'ffn.down_proj': RowwiseParallel( - input_layouts = Shard(-1), - output_layouts = Shard(0), + 'ffn.down_proj': + RowwiseParallel( + input_layouts=Shard(-1), + output_layouts=Shard(0), ), - 'ffn.up_proj': ColwiseParallel( - input_layouts = Replicate(), - output_layouts = Shard(-1), - ) + 'ffn.up_proj': + ColwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(-1), + ), + } + expected_layer_plan = { + f'model.transformer.blocks.{layer_idx}.{name}': layer_plan + for name, layer_plan in _expected_layer_plan.items() + for layer_idx in range(model_cfg['n_layers']) } - expected_layer_plan = {f'model.transformer.blocks.{layer_idx}.{name}': layer_plan for name, layer_plan in _expected_layer_plan.items() for layer_idx in range(model_cfg['n_layers'])} # Compare expected and actual layer plans - for (n1, lp1), (n2, lp2) in zip(sorted(expected_layer_plan.items()), sorted(layer_plan.items())): + for (n1, lp1), (n2, lp2) in zip( + sorted(expected_layer_plan.items()), sorted(layer_plan.items()) + ): assert n1 == n2 assert type(lp1) == type(lp2) if isinstance(lp1, PrepareModuleInput): assert lp1.input_layouts == lp2.input_layouts assert lp1.desired_input_layouts == lp2.desired_input_layouts assert lp1.use_local_output == lp2.use_local_output - elif isinstance(lp1, ColwiseParallel) or isinstance(lp1, RowwiseParallel): + elif isinstance(lp1, + ColwiseParallel) or isinstance(lp1, RowwiseParallel): assert lp1.input_layouts == lp2.input_layouts assert lp1.output_layouts == lp2.output_layouts assert lp1.use_local_output == lp2.use_local_output From 3372ec06f02f924fc0957259cafc9497302cd129 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 20:25:34 +0000 Subject: [PATCH 37/68] style more --- llmfoundry/command_utils/train.py | 7 +++++-- llmfoundry/models/utils/tp_strategy.py | 13 +++++++------ llmfoundry/utils/config_utils.py | 5 +++-- tests/models/utils/test_tp_strategy.py | 11 +++++++---- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 2eef4bdc30..ac7f0e7258 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -19,8 +19,11 @@ TraceHandler, cyclic_schedule, ) -from composer.utils import (FSDPConfig, ParallelismConfig, TPConfig, dist, - get_device, reproducibility,) +from composer.utils import ( + dist, + get_device, + reproducibility, +) from omegaconf import DictConfig from omegaconf import OmegaConf as om diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 1d7a199efc..f7929686c6 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -1,12 +1,13 @@ - # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 from composer.models import 
ComposerModel from torch.distributed._tensor import Replicate, Shard -from torch.distributed.tensor.parallel import (ColwiseParallel, - PrepareModuleInput, - RowwiseParallel,) +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, +) from torch.distributed.tensor.parallel.style import ParallelStyle @@ -14,10 +15,10 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: TP_LAYERS = {'up_proj', 'down_proj'} # validate that all TP_LAYERS are in model - tp_layers_in_model = set([ + tp_layers_in_model = { layer for layer in TP_LAYERS for name, _ in model.named_modules() if layer in name - ]) + } assert tp_layers_in_model == TP_LAYERS, f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' # generate layer plan diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 3d2d152715..cc7415ba10 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -503,8 +503,9 @@ def update_batch_size_info(cfg: dict[str, Any]) -> dict[str, Any]: def process_init_device( - model_cfg: dict[str, Any], fsdp_config: Optional[dict], - tp_config: Optional[dict] + model_cfg: dict[str, Any], + fsdp_config: Optional[dict], + tp_config: Optional[dict], ): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 6893550e6a..cb015c53bc 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -2,9 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 from torch.distributed._tensor import Replicate, Shard -from torch.distributed.tensor.parallel import (ColwiseParallel, - PrepareModuleInput, - RowwiseParallel,) +from torch.distributed.tensor.parallel import ( + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, +) from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy @@ -58,7 +60,8 @@ def test_ffn_tp_strategy_layer_plan(): # Compare expected and actual layer plans for (n1, lp1), (n2, lp2) in zip( - sorted(expected_layer_plan.items()), sorted(layer_plan.items()) + sorted(expected_layer_plan.items()), + sorted(layer_plan.items()), ): assert n1 == n2 assert type(lp1) == type(lp2) From d2c91143a0db34e8652dcb549bcb9863f05e3f5f Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 20:31:45 +0000 Subject: [PATCH 38/68] remove extra --- llmfoundry/models/mpt/modeling_mpt.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index e3f6f0575e..06b64101c3 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -965,7 +965,6 @@ def forward( extra_kwargs = {} if prev_layer_key_value is not None: extra_kwargs['prev_layer_key_value'] = prev_layer_key_value - x, attn_weights, present = block( x, past_key_value=past_key_value, From ff36f1777695e6d43d6200896e4c8e651d67fe1d Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 21:17:12 +0000 Subject: [PATCH 39/68] fix type checking --- llmfoundry/utils/config_utils.py | 4 ++-- tests/models/utils/test_tp_strategy.py | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/llmfoundry/utils/config_utils.py 
b/llmfoundry/utils/config_utils.py index cc7415ba10..dd13fa5009 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -504,8 +504,8 @@ def update_batch_size_info(cfg: dict[str, Any]) -> dict[str, Any]: def process_init_device( model_cfg: dict[str, Any], - fsdp_config: Optional[dict], - tp_config: Optional[dict], + fsdp_config: Optional[dict] = None, + tp_config: Optional[dict] = None, ): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index cb015c53bc..aba35689cc 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -65,12 +65,22 @@ def test_ffn_tp_strategy_layer_plan(): ): assert n1 == n2 assert type(lp1) == type(lp2) - if isinstance(lp1, PrepareModuleInput): + if isinstance( + lp1, + PrepareModuleInput, + ) and isinstance(lp2, PrepareModuleInput): assert lp1.input_layouts == lp2.input_layouts assert lp1.desired_input_layouts == lp2.desired_input_layouts assert lp1.use_local_output == lp2.use_local_output - elif isinstance(lp1, - ColwiseParallel) or isinstance(lp1, RowwiseParallel): + elif ( + isinstance(lp1, ColwiseParallel) and + isinstance(lp2, ColwiseParallel) + ) or ( + isinstance(lp1, RowwiseParallel) and + isinstance(lp2, RowwiseParallel) + ): assert lp1.input_layouts == lp2.input_layouts assert lp1.output_layouts == lp2.output_layouts assert lp1.use_local_output == lp2.use_local_output + else: + raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') From 1474a8049e6fbaf1627f2c78f67a46918a748a96 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 21:31:35 +0000 Subject: [PATCH 40/68] make tp yaml --- scripts/train/yamls/pretrain/mpt-125m.yaml | 19 +-- scripts/train/yamls/pretrain/tp-mpt-125m.yaml | 133 ++++++++++++++++++ 2 files changed, 137 insertions(+), 15 deletions(-) create mode 100644 scripts/train/yamls/pretrain/tp-mpt-125m.yaml diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 5f6f55c06a..644dfc26c1 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -35,7 +35,7 @@ train_loader: dataset: local: ${variables.data_local} remote: ${variables.data_remote} - split: train_small + split: train shuffle: true max_seq_len: ${variables.max_seq_len} shuffle_seed: ${variables.global_seed} @@ -47,11 +47,10 @@ eval_loader: dataset: local: ${variables.data_local} remote: ${variables.data_remote} - split: val_small + split: val shuffle: false max_seq_len: ${variables.max_seq_len} shuffle_seed: ${variables.global_seed} - replication: 2 drop_last: false num_workers: 8 @@ -75,10 +74,10 @@ algorithms: clipping_type: norm clipping_threshold: 1.0 -max_duration: 100ba +max_duration: 4800ba # ~ 2.5B tokens eval_interval: 500ba eval_first: false -eval_subset_num_batches: 0 +eval_subset_num_batches: -1 global_train_batch_size: 256 # System @@ -97,11 +96,6 @@ fsdp_config: activation_cpu_offload: false limit_all_gathers: true -# TP -tp_config: - strategy: ffn - tensor_parallel_degree: 2 - # Logging progress_bar: false log_to_console: true @@ -114,11 +108,6 @@ callbacks: memory_monitor: {} runtime_estimator: {} -loggers: - mlflow: - experiment_name: tp - # model_registry_prefix: datasets.eitanturok.${run_name} - # loggers: # wandb: {} diff --git a/scripts/train/yamls/pretrain/tp-mpt-125m.yaml 
b/scripts/train/yamls/pretrain/tp-mpt-125m.yaml new file mode 100644 index 0000000000..5f6f55c06a --- /dev/null +++ b/scripts/train/yamls/pretrain/tp-mpt-125m.yaml @@ -0,0 +1,133 @@ +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 + + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} + +# Model +model: + name: mpt_causal_lm + init_device: meta + d_model: 768 + n_heads: 12 + n_layers: 12 + expansion_ratio: 4 + max_seq_len: ${variables.max_seq_len} + vocab_size: 50368 + attn_config: + attn_impl: flash + +# Tokenizer +tokenizer: + name: EleutherAI/gpt-neox-20b + kwargs: + model_max_length: ${variables.max_seq_len} + +# Dataloaders +train_loader: + name: text + dataset: + local: ${variables.data_local} + remote: ${variables.data_remote} + split: train_small + shuffle: true + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} + drop_last: true + num_workers: 8 + +eval_loader: + name: text + dataset: + local: ${variables.data_local} + remote: ${variables.data_remote} + split: val_small + shuffle: false + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} + replication: 2 + drop_last: false + num_workers: 8 + +# Optimization +scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + +optimizer: + name: decoupled_adamw + lr: 6.0e-4 + betas: + - 0.9 + - 0.95 + eps: 1.0e-08 + weight_decay: 0.0 + +algorithms: + gradient_clipping: + clipping_type: norm + clipping_threshold: 1.0 + +max_duration: 100ba +eval_interval: 500ba +eval_first: false +eval_subset_num_batches: 0 +global_train_batch_size: 256 + +# System +seed: ${variables.global_seed} +device_eval_batch_size: 16 +device_train_microbatch_size: 16 +# device_train_microbatch_size: auto +precision: amp_bf16 + +# FSDP +fsdp_config: + sharding_strategy: FULL_SHARD + mixed_precision: PURE + activation_checkpointing: false + activation_checkpointing_reentrant: false + activation_cpu_offload: false + limit_all_gathers: true + +# TP +tp_config: + strategy: ffn + tensor_parallel_degree: 2 + +# Logging +progress_bar: false +log_to_console: true +console_log_interval: 1ba + +callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + runtime_estimator: {} + +loggers: + mlflow: + experiment_name: tp + # model_registry_prefix: datasets.eitanturok.${run_name} + +# loggers: +# wandb: {} + +# Checkpoint to local filesystem or remote object store +# save_interval: 500ba +# save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +# save_folder: ./{run_name}/checkpoints +# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt From 8996de4fbe6d158a3fe01bf13aa058bf9da87d92 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 23:57:20 +0000 Subject: [PATCH 41/68] no flash attn --- tests/models/utils/test_tp_strategy.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index aba35689cc..f7d7b48a12 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -26,9 +26,6 @@ def test_ffn_tp_strategy_layer_plan(): 'expansion_ratio': 
1, 'max_seq_len': 16, 'vocab_size': 50368, - 'attn_config': { - 'attn_impl': 'flash', - }, } model = ComposerMPTCausalLM(**model_cfg) layer_plan = build_tp_strategy(tp_config['strategy'], model) From 5004fe54c0f505bf1a194b95f880c4fbf42329b0 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Wed, 25 Sep 2024 23:58:48 +0000 Subject: [PATCH 42/68] better comment --- tests/models/utils/test_tp_strategy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index f7d7b48a12..d1e48babcf 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -13,7 +13,8 @@ def test_ffn_tp_strategy_layer_plan(): - # Actual layer plan + + # Actual layer plan from tp_strategy=fnn tp_config = { 'strategy': 'ffn', } From ba2dd0d7f880917d0adeeea9937c4424604a5745 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 00:08:06 +0000 Subject: [PATCH 43/68] docformatter --- tests/models/utils/test_tp_strategy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index d1e48babcf..37c1a2941d 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -13,7 +13,6 @@ def test_ffn_tp_strategy_layer_plan(): - # Actual layer plan from tp_strategy=fnn tp_config = { 'strategy': 'ffn', From 7e3ad7180c38f1d0cec5a58122b5ba14fa615450 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 14:35:57 +0000 Subject: [PATCH 44/68] remove |= --- llmfoundry/command_utils/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 6f47da2cd5..9641aa44e8 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -526,7 +526,7 @@ def train(cfg: DictConfig) -> Trainer: if 'strategy' in tp_config: strategy = tp_config.pop('strategy') strategy_layer_plan = build_tp_strategy(strategy, model) - tp_config['layer_plan'] |= strategy_layer_plan + tp_config['layer_plan'] = strategy_layer_plan # Parallelism config parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} From 39a92ad2d16e12164da219b0e59f2e8b910c358e Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 14:38:46 +0000 Subject: [PATCH 45/68] add runtimeError --- llmfoundry/models/utils/tp_strategy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index f7929686c6..748ccf0c55 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -19,7 +19,10 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: layer for layer in TP_LAYERS for name, _ in model.named_modules() if layer in name } - assert tp_layers_in_model == TP_LAYERS, f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' + if tp_layers_in_model != TP_LAYERS: + raise RuntimeError( + f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' 
+ ) # generate layer plan layer_plan: dict[str, ParallelStyle] = {} From 92719e70271b99b4a88019178d2635b0a1543a3d Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:39:09 -0400 Subject: [PATCH 46/68] Update llmfoundry/models/utils/tp_strategy.py Co-authored-by: Mihir Patel --- llmfoundry/models/utils/tp_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 748ccf0c55..2c48160750 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -14,7 +14,7 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: TP_LAYERS = {'up_proj', 'down_proj'} - # validate that all TP_LAYERS are in model + # Validate that all TP_LAYERS are in model tp_layers_in_model = { layer for layer in TP_LAYERS for name, _ in model.named_modules() if layer in name From 8c0135de9e79f557d31a6cb3cec81ad69b05cf6b Mon Sep 17 00:00:00 2001 From: Eitan Turok <150733043+eitanturok@users.noreply.github.com> Date: Thu, 26 Sep 2024 10:39:16 -0400 Subject: [PATCH 47/68] Update llmfoundry/models/utils/tp_strategy.py Co-authored-by: Mihir Patel --- llmfoundry/models/utils/tp_strategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 2c48160750..7c8754994d 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -24,7 +24,7 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' ) - # generate layer plan + # Generate layer plan layer_plan: dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): if name.split('.')[-2:] == ['ffn', 'up_proj']: From 0156dd2374fdd7be1909b06ac991d52a24b170fc Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 14:58:33 +0000 Subject: [PATCH 48/68] explain with comments --- llmfoundry/models/utils/tp_strategy.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py index 7c8754994d..c75f59d356 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategy.py @@ -21,27 +21,36 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: } if tp_layers_in_model != TP_LAYERS: raise RuntimeError( - f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.' + f'The FFN tensor parallelism strategy requires `model` to have layers {TP_LAYERS}. 
But `model` is missing layers {TP_LAYERS - tp_layers_in_model}.', ) # Generate layer plan layer_plan: dict[str, ParallelStyle] = {} for name, _ in model.named_modules(): - if name.split('.')[-2:] == ['ffn', 'up_proj']: + # Before the ffn layer starts, distribute the input data for proper TP use + # Inputs are currently sharded across the batch dimension (dim 0) as is done in standard DDP + # Inputs will be replicated across hidden dimension (dim 1) via allgather + if name.split('.')[-1] == 'ffn': + layer_plan[name] = PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, + ) + # Shard the ffn.up_proj weight matrix across its columns + # Inputs are already replicated across each TP group + # Outputs will be sharded along the hidden dimension (dim 1) via allgather + elif name.split('.')[-2:] == ['ffn', 'up_proj']: layer_plan[name] = ColwiseParallel( input_layouts=Replicate(), output_layouts=Shard(-1), ) + # Shard the ffn.down_proj weight matrix across its rows + # Inputs are sharded along the hidden dimension (dim 1) + # Outputs will be sharded along batch dimension (dim 0) via allreduce elif name.split('.')[-2:] == ['ffn', 'down_proj']: layer_plan[name] = RowwiseParallel( input_layouts=Shard(-1), output_layouts=Shard(0), ) - elif name.split('.')[-1] == 'ffn': - layer_plan[name] = PrepareModuleInput( - input_layouts=Shard(0), - desired_input_layouts=Replicate(), - use_local_output=True, - ) return layer_plan From c696338dd5c911cb3e1578fde277e76b9a77a958 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 15:13:45 +0000 Subject: [PATCH 49/68] run on GPU --- tests/models/utils/test_tp_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 37c1a2941d..cf3107cc1c 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -11,11 +11,11 @@ from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy - +@pytest.mark.gpu def test_ffn_tp_strategy_layer_plan(): # Actual layer plan from tp_strategy=fnn tp_config = { - 'strategy': 'ffn', + 'strategy': 'ffn' } model_cfg = { From ccdbcf487c7044c612d70d03db0e1b7c81830ceb Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 15:14:28 +0000 Subject: [PATCH 50/68] style --- tests/models/utils/test_tp_strategy.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index cf3107cc1c..d0812bf5ce 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -11,11 +11,12 @@ from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy + @pytest.mark.gpu def test_ffn_tp_strategy_layer_plan(): # Actual layer plan from tp_strategy=fnn tp_config = { - 'strategy': 'ffn' + 'strategy': 'ffn', } model_cfg = { From cb1ab3159d092352714c556cdc0ee1624bf0ff2c Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 18:29:21 +0000 Subject: [PATCH 51/68] test one gpu warning --- .../data_prep/convert_dataset_hf.py | 2 +- tests/data_utils.py | 2 +- tests/models/utils/test_tp_strategy.py | 182 +++++++++++------- 3 files changed, 113 insertions(+), 73 deletions(-) diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py 
index 0ea94ac687..fba062d6f5 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py @@ -158,7 +158,7 @@ def __init__( truncated_samples=100, ) -CONSTS = {'c4': c4constants, 'the_pile': pileconstants} +CONSTS = {'allenai/c4': c4constants, 'the_pile': pileconstants} def build_hf_dataset( diff --git a/tests/data_utils.py b/tests/data_utils.py index 117310b0cf..1f6c26b72e 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -231,7 +231,7 @@ def create_c4_dataset_xxsmall(path: Path) -> str: # Hyperparameters from https://github.com/mosaicml/llm-foundry/blob/340a56658560ebceb2a3aa69d6e37813e415acd0/README.md#L188 convert_dataset_hf( - dataset='c4', + dataset='allenai/c4', data_subset='en', splits=[downloaded_split], out_root=c4_dir, diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index d0812bf5ce..dbe7ef030a 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -1,84 +1,124 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 + +import pytest +from omegaconf import OmegaConf as om from torch.distributed._tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( - ColwiseParallel, - PrepareModuleInput, - RowwiseParallel, + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, ) + +from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg +from llmfoundry.command_utils.train import train from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy + + @pytest.mark.gpu def test_ffn_tp_strategy_layer_plan(): - # Actual layer plan from tp_strategy=fnn - tp_config = { - 'strategy': 'ffn', - } - - model_cfg = { - 'name': 'mpt_causal_lm', - 'd_model': 128, - 'n_heads': 4, - 'n_layers': 3, - 'expansion_ratio': 1, - 'max_seq_len': 16, - 'vocab_size': 50368, - } - model = ComposerMPTCausalLM(**model_cfg) - layer_plan = build_tp_strategy(tp_config['strategy'], model) - - # Expected layer plan - _expected_layer_plan = { - 'ffn': - PrepareModuleInput( - input_layouts=Shard(0), - desired_input_layouts=Replicate(), - use_local_output=True, - ), - 'ffn.down_proj': - RowwiseParallel( - input_layouts=Shard(-1), - output_layouts=Shard(0), - ), - 'ffn.up_proj': - ColwiseParallel( - input_layouts=Replicate(), - output_layouts=Shard(-1), - ), - } - expected_layer_plan = { - f'model.transformer.blocks.{layer_idx}.{name}': layer_plan - for name, layer_plan in _expected_layer_plan.items() - for layer_idx in range(model_cfg['n_layers']) - } - - # Compare expected and actual layer plans - for (n1, lp1), (n2, lp2) in zip( - sorted(expected_layer_plan.items()), - sorted(layer_plan.items()), - ): - assert n1 == n2 - assert type(lp1) == type(lp2) - if isinstance( - lp1, - PrepareModuleInput, - ) and isinstance(lp2, PrepareModuleInput): - assert lp1.input_layouts == lp2.input_layouts - assert lp1.desired_input_layouts == lp2.desired_input_layouts - assert lp1.use_local_output == lp2.use_local_output - elif ( - isinstance(lp1, ColwiseParallel) and - isinstance(lp2, ColwiseParallel) - ) or ( - isinstance(lp1, RowwiseParallel) and - isinstance(lp2, RowwiseParallel) - ): - assert lp1.input_layouts == lp2.input_layouts - assert lp1.output_layouts == lp2.output_layouts - assert lp1.use_local_output == lp2.use_local_output - else: - raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') + # Actual layer plan from 
tp_strategy=fnn + tp_config = { + 'strategy': 'ffn', + } + + + model_cfg = { + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 3, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + } + model = ComposerMPTCausalLM(**model_cfg) + layer_plan = build_tp_strategy(tp_config['strategy'], model) + + + # Expected layer plan + _expected_layer_plan = { + 'ffn': + PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, + ), + 'ffn.down_proj': + RowwiseParallel( + input_layouts=Shard(-1), + output_layouts=Shard(0), + ), + 'ffn.up_proj': + ColwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(-1), + ), + } + expected_layer_plan = { + f'model.transformer.blocks.{layer_idx}.{name}': layer_plan + for name, layer_plan in _expected_layer_plan.items() + for layer_idx in range(model_cfg['n_layers']) + } + + + # Compare expected and actual layer plans + for (n1, lp1), (n2, lp2) in zip( + sorted(expected_layer_plan.items()), + sorted(layer_plan.items()), + ): + assert n1 == n2 + assert type(lp1) == type(lp2) + if isinstance( + lp1, + PrepareModuleInput, + ) and isinstance(lp2, PrepareModuleInput): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.desired_input_layouts == lp2.desired_input_layouts + assert lp1.use_local_output == lp2.use_local_output + elif ( + isinstance(lp1, ColwiseParallel) and + isinstance(lp2, ColwiseParallel) + ) or ( + isinstance(lp1, RowwiseParallel) and + isinstance(lp2, RowwiseParallel) + ): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.output_layouts == lp2.output_layouts + assert lp1.use_local_output == lp2.use_local_output + else: + raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') + + + + +@pytest.mark.gpu +# @pytest.mark.filterwarnings("error::") # treat warnings like errors +def test_tp_one_gpu(): + from icecream import ic + # get train_cfg with tp + train_cfg_path: str = 'scripts/train/yamls/pretrain/mpt-125m.yaml' + with open(train_cfg_path) as f: + train_cfg = om.load(f) + + + tmp_path = '/my-tmp/c4_small' + dataset_name = create_c4_dataset_xxsmall(tmp_path) + train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') + train_cfg.tp_config = {'strategy': 'ffn'} + + with pytest.warns(UserWarning, match='FSDP+TP is not applicable for single-GPU training. 
Reverting to DDP.'): + train(train_cfg) + + + + + + +# if __name__ == '__main__': +# test_tp_one_gpu() From 3921cda2726726f196eba0e93740a9b35ccd00a7 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 19:24:27 +0000 Subject: [PATCH 52/68] test_no_tp_with_one_gpu --- tests/models/utils/test_tp_strategy.py | 195 ++++++++++++------------- 1 file changed, 91 insertions(+), 104 deletions(-) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index dbe7ef030a..f2ef0c951e 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -1,124 +1,111 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from pathlib import Path +from tempfile import TemporaryDirectory import pytest from omegaconf import OmegaConf as om from torch.distributed._tensor import Replicate, Shard from torch.distributed.tensor.parallel import ( - ColwiseParallel, - PrepareModuleInput, - RowwiseParallel, + ColwiseParallel, + PrepareModuleInput, + RowwiseParallel, ) - -from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg from llmfoundry.command_utils.train import train from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy - - +from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @pytest.mark.gpu def test_ffn_tp_strategy_layer_plan(): - # Actual layer plan from tp_strategy=fnn - tp_config = { - 'strategy': 'ffn', - } - - - model_cfg = { - 'name': 'mpt_causal_lm', - 'd_model': 128, - 'n_heads': 4, - 'n_layers': 3, - 'expansion_ratio': 1, - 'max_seq_len': 16, - 'vocab_size': 50368, - } - model = ComposerMPTCausalLM(**model_cfg) - layer_plan = build_tp_strategy(tp_config['strategy'], model) - - - # Expected layer plan - _expected_layer_plan = { - 'ffn': - PrepareModuleInput( - input_layouts=Shard(0), - desired_input_layouts=Replicate(), - use_local_output=True, - ), - 'ffn.down_proj': - RowwiseParallel( - input_layouts=Shard(-1), - output_layouts=Shard(0), - ), - 'ffn.up_proj': - ColwiseParallel( - input_layouts=Replicate(), - output_layouts=Shard(-1), - ), - } - expected_layer_plan = { - f'model.transformer.blocks.{layer_idx}.{name}': layer_plan - for name, layer_plan in _expected_layer_plan.items() - for layer_idx in range(model_cfg['n_layers']) - } - - - # Compare expected and actual layer plans - for (n1, lp1), (n2, lp2) in zip( - sorted(expected_layer_plan.items()), - sorted(layer_plan.items()), - ): - assert n1 == n2 - assert type(lp1) == type(lp2) - if isinstance( - lp1, - PrepareModuleInput, - ) and isinstance(lp2, PrepareModuleInput): - assert lp1.input_layouts == lp2.input_layouts - assert lp1.desired_input_layouts == lp2.desired_input_layouts - assert lp1.use_local_output == lp2.use_local_output - elif ( - isinstance(lp1, ColwiseParallel) and - isinstance(lp2, ColwiseParallel) - ) or ( - isinstance(lp1, RowwiseParallel) and - isinstance(lp2, RowwiseParallel) - ): - assert lp1.input_layouts == lp2.input_layouts - assert lp1.output_layouts == lp2.output_layouts - assert lp1.use_local_output == lp2.use_local_output - else: - raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') - - + # Actual layer plan from tp_strategy=fnn + tp_config = { + 'strategy': 'ffn', + } + + model_cfg = { + 'name': 'mpt_causal_lm', + 'd_model': 128, + 'n_heads': 4, + 'n_layers': 3, + 'expansion_ratio': 1, + 'max_seq_len': 16, + 'vocab_size': 50368, + } + model = ComposerMPTCausalLM(**model_cfg) + layer_plan = 
build_tp_strategy(tp_config['strategy'], model) + + # Expected layer plan + _expected_layer_plan = { + 'ffn': + PrepareModuleInput( + input_layouts=Shard(0), + desired_input_layouts=Replicate(), + use_local_output=True, + ), + 'ffn.down_proj': + RowwiseParallel( + input_layouts=Shard(-1), + output_layouts=Shard(0), + ), + 'ffn.up_proj': + ColwiseParallel( + input_layouts=Replicate(), + output_layouts=Shard(-1), + ), + } + expected_layer_plan = { + f'model.transformer.blocks.{layer_idx}.{name}': layer_plan + for name, layer_plan in _expected_layer_plan.items() + for layer_idx in range(model_cfg['n_layers']) + } + + # Compare expected and actual layer plans + for (n1, lp1), (n2, lp2) in zip( + sorted(expected_layer_plan.items()), + sorted(layer_plan.items()), + ): + assert n1 == n2 + assert type(lp1) == type(lp2) + if isinstance( + lp1, + PrepareModuleInput, + ) and isinstance(lp2, PrepareModuleInput): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.desired_input_layouts == lp2.desired_input_layouts + assert lp1.use_local_output == lp2.use_local_output + elif ( + isinstance(lp1, ColwiseParallel) and + isinstance(lp2, ColwiseParallel) + ) or ( + isinstance(lp1, RowwiseParallel) and + isinstance(lp2, RowwiseParallel) + ): + assert lp1.input_layouts == lp2.input_layouts + assert lp1.output_layouts == lp2.output_layouts + assert lp1.use_local_output == lp2.use_local_output + else: + raise ValueError(f'Layer plan of wrong type: {type(layer_plan)}') @pytest.mark.gpu -# @pytest.mark.filterwarnings("error::") # treat warnings like errors -def test_tp_one_gpu(): - from icecream import ic - # get train_cfg with tp - train_cfg_path: str = 'scripts/train/yamls/pretrain/mpt-125m.yaml' - with open(train_cfg_path) as f: - train_cfg = om.load(f) - - - tmp_path = '/my-tmp/c4_small' - dataset_name = create_c4_dataset_xxsmall(tmp_path) - train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') - train_cfg.tp_config = {'strategy': 'ffn'} - - with pytest.warns(UserWarning, match='FSDP+TP is not applicable for single-GPU training. Reverting to DDP.'): - train(train_cfg) - - - - - - -# if __name__ == '__main__': -# test_tp_one_gpu() +def test_no_tp_with_one_gpu(): + with TemporaryDirectory() as tmp_path: + # train_cfg with ffn tensor parallelism + train_cfg_path: str = 'scripts/train/yamls/pretrain/mpt-125m.yaml' + with open(train_cfg_path, 'r', encoding='utf-8') as f: + train_cfg = om.load(f) + dataset_name = create_c4_dataset_xxsmall(Path(tmp_path)) + train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') + train_cfg.tp_config = {'strategy': 'ffn'} + + # Expect a warning that we use DDP and not FSDP-TP when we have one GPU. + with pytest.warns( + UserWarning, + match= + r'FSDP\+TP is not applicable for single-GPU training. 
Reverting to DDP.', + ): + train(train_cfg) From 4e4b6b9d2ddba33ced26ade07c0702d511a16aa8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 19:56:44 +0000 Subject: [PATCH 53/68] test_no_tp_with_moes --- llmfoundry/utils/config_utils.py | 6 ++++-- tests/models/utils/test_tp_strategy.py | 23 +++++++++++++++++++++-- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index dd13fa5009..c22495993c 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -538,14 +538,16 @@ def process_init_device( # Set defaults for mixed initialization fsdp_config.setdefault('load_monolith_rank0_only', True) + # Check we are not using tensor parallelism with MoEs if tp_config is not None and 'ffn_config' in model_cfg and model_cfg[ 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: - raise ValueError('Cannot use TP with MoEs.') + raise ValueError( + 'Tensor Parallelism is not currently supported for MoE models.', + ) # Set ffn_config.device_mesh using fsdp_config if fsdp_config is not None and 'ffn_config' in model_cfg and model_cfg[ 'ffn_config'].get('ffn_type', None) in ffns_with_megablocks: - shard_degree = fsdp_config.get('data_parallel_shard_degree', None) replicate_degree = fsdp_config.get( 'data_parallel_replicate_degree', diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index f2ef0c951e..073a8ff782 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -16,6 +16,7 @@ from llmfoundry.command_utils.train import train from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM from llmfoundry.utils.builders import build_tp_strategy +from llmfoundry.utils.config_utils import process_init_device from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @@ -94,7 +95,7 @@ def test_ffn_tp_strategy_layer_plan(): @pytest.mark.gpu def test_no_tp_with_one_gpu(): with TemporaryDirectory() as tmp_path: - # train_cfg with ffn tensor parallelism + # Make `train_cfg`` with a tensor parallelism strategy train_cfg_path: str = 'scripts/train/yamls/pretrain/mpt-125m.yaml' with open(train_cfg_path, 'r', encoding='utf-8') as f: train_cfg = om.load(f) @@ -102,10 +103,28 @@ def test_no_tp_with_one_gpu(): train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') train_cfg.tp_config = {'strategy': 'ffn'} - # Expect a warning that we use DDP and not FSDP-TP when we have one GPU. + # Expect a warning to use DDP and not FSDP-TP when we have one GPU. with pytest.warns( UserWarning, match= r'FSDP\+TP is not applicable for single-GPU training. 
Reverting to DDP.', ): train(train_cfg) + + +@pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies +def test_no_tp_with_moes(): + # Make `cfg` for MoE model, fsdp, and tp (tensor parallelism) + train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml' + with open(train_cfg_path, 'r', encoding='utf-8') as f: + train_cfg = om.load(f) + model_cfg = train_cfg.model + fsdp_cfg = train_cfg.fsdp_config + tp_cfg = {'strategy': 'ffn'} + + # Expect an error for using tensor parallelism with MoEs + with pytest.raises( + ValueError, + match='Tensor Parallelism is not currently supported for MoE models.', + ): + process_init_device(model_cfg, fsdp_cfg, tp_cfg) From c9c2455de2e65aaee372379e06379e11edd449b8 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 20:42:51 +0000 Subject: [PATCH 54/68] add experimental_function decorator to tp_strategy --- llmfoundry/utils/builders.py | 2 ++ tests/models/utils/test_tp_strategy.py | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 631d25bc60..dcffbf6caa 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -38,6 +38,7 @@ ) from llmfoundry.utils.config_utils import to_dict_container, to_list_container from llmfoundry.utils.registry_utils import construct_from_registry +from llmfoundry.utils.warnings import experimental_function log = logging.getLogger(__name__) @@ -705,6 +706,7 @@ def _validate_cfg(icl_cfg: dict[str, Any]): return evaluators, logger_keys +@experimental_function('tp_strategy') def build_tp_strategy( name: str, model: ComposerModel, diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategy.py index 073a8ff782..19bad8abfd 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategy.py @@ -21,6 +21,9 @@ @pytest.mark.gpu +@pytest.mark.filterwarnings( + 'ignore:tp_strategy is experimental and may change with future versions.' +) def test_ffn_tp_strategy_layer_plan(): # Actual layer plan from tp_strategy=fnn tp_config = { @@ -128,3 +131,7 @@ def test_no_tp_with_moes(): match='Tensor Parallelism is not currently supported for MoE models.', ): process_init_device(model_cfg, fsdp_cfg, tp_cfg) + + +# if __name__ == '__main__': +# test_ffn_tp_strategy_layer_plan() From 33bbf9bdd3e8773c4b9d97e7be8591f275b947e4 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 21:05:56 +0000 Subject: [PATCH 55/68] simplify trainer --- llmfoundry/command_utils/train.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 9641aa44e8..effec90559 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -521,12 +521,9 @@ def train(cfg: DictConfig) -> Trainer: # TP config if tp_config is not None: - if 'layer_plan' not in tp_config: - tp_config['layer_plan'] = {} - if 'strategy' in tp_config: - strategy = tp_config.pop('strategy') - strategy_layer_plan = build_tp_strategy(strategy, model) - tp_config['layer_plan'] = strategy_layer_plan + strategy = tp_config.pop('strategy', None) + assert isinstance(strategy, str), "`strategy` must be in `tp_config`." 
+ tp_config['layer_plan'] = build_tp_strategy(strategy, model) # Parallelism config parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} From c9e64df81a1f24cc1b03a67d8cde2c191ac6fe62 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 21:16:07 +0000 Subject: [PATCH 56/68] tp_strategy -> tp_stratigies --- llmfoundry/command_utils/train.py | 6 +++--- llmfoundry/models/__init__.py | 8 ++++---- .../utils/{tp_strategy.py => tp_strategies.py} | 2 +- llmfoundry/registry.py | 12 ++++++------ llmfoundry/utils/builders.py | 8 ++++---- ...test_tp_strategy.py => test_tp_strategies.py} | 16 +++++++++------- tests/test_registry.py | 2 +- 7 files changed, 28 insertions(+), 26 deletions(-) rename llmfoundry/models/utils/{tp_strategy.py => tp_strategies.py} (96%) rename tests/models/utils/{test_tp_strategy.py => test_tp_strategies.py} (89%) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index effec90559..0b746c05f8 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -48,7 +48,7 @@ build_save_planner, build_scheduler, build_tokenizer, - build_tp_strategy, + build_tp_strategies, ) from llmfoundry.utils.config_utils import ( TRAIN_CONFIG_KEYS, @@ -522,8 +522,8 @@ def train(cfg: DictConfig) -> Trainer: # TP config if tp_config is not None: strategy = tp_config.pop('strategy', None) - assert isinstance(strategy, str), "`strategy` must be in `tp_config`." - tp_config['layer_plan'] = build_tp_strategy(strategy, model) + assert isinstance(strategy, str), '`strategy` must be in `tp_config`.' + tp_config['layer_plan'] = build_tp_strategies(strategy, model) # Parallelism config parallelism_config = {'fsdp': fsdp_config, 'tp': tp_config} diff --git a/llmfoundry/models/__init__.py b/llmfoundry/models/__init__.py index 9db62e5c6b..569ef116f0 100644 --- a/llmfoundry/models/__init__.py +++ b/llmfoundry/models/__init__.py @@ -15,8 +15,8 @@ MPTModel, MPTPreTrainedModel, ) -from llmfoundry.models.utils.tp_strategy import ffn_tp_strategy -from llmfoundry.registry import models, tp_strategy +from llmfoundry.models.utils.tp_strategies import ffn_tp_strategies +from llmfoundry.registry import models, tp_strategies models.register('mpt_causal_lm', func=ComposerMPTCausalLM) models.register('hf_causal_lm', func=ComposerHFCausalLM) @@ -25,7 +25,7 @@ models.register('fmapi_causal_lm', func=FMAPICasualLMEvalWrapper) models.register('openai_chat', func=OpenAIChatAPIEvalWrapper) models.register('fmapi_chat', func=FMAPIChatAPIEvalWrapper) -tp_strategy.register('ffn', func=ffn_tp_strategy) +tp_strategies.register('ffn', func=ffn_tp_strategies) __all__ = [ 'ComposerHFCausalLM', @@ -39,5 +39,5 @@ 'FMAPICasualLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', 'FMAPIChatAPIEvalWrapper', - 'ffn_tp_strategy', + 'ffn_tp_strategies', ] diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategies.py similarity index 96% rename from llmfoundry/models/utils/tp_strategy.py rename to llmfoundry/models/utils/tp_strategies.py index c75f59d356..5613589c2b 100644 --- a/llmfoundry/models/utils/tp_strategy.py +++ b/llmfoundry/models/utils/tp_strategies.py @@ -11,7 +11,7 @@ from torch.distributed.tensor.parallel.style import ParallelStyle -def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: +def ffn_tp_strategies(model: ComposerModel) -> dict[str, ParallelStyle]: TP_LAYERS = {'up_proj', 'down_proj'} # Validate that all TP_LAYERS are in model diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 
6c20e30f21..850c4f3bbd 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -390,8 +390,8 @@ description=_save_planners_description, ) -_tp_strategy_description = ( - """The tp_strategy registry is used to register strategies for tensor parallelism. +_tp_strategies_description = ( + """The tp_strategies registry is used to register strategies for tensor parallelism. Args: model (ComposerModel): The model. @@ -402,12 +402,12 @@ """ ) -tp_strategy = create_registry( +tp_strategies = create_registry( 'llmfoundry', - 'tp_strategy', + 'tp_strategies', generic_type=Callable[[ComposerModel], dict[str, ParallelStyle]], entry_points=True, - description=_tp_strategy_description, + description=_tp_strategies_description, ) __all__ = [ @@ -437,5 +437,5 @@ 'config_transforms', 'load_planners', 'save_planners', - 'tp_strategy', + 'tp_strategies', ] diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index dcffbf6caa..12e734446c 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -54,7 +54,7 @@ 'build_tokenizer', 'build_composer_model', 'build_metric', - 'build_tp_strategy', + 'build_tp_strategies', ] @@ -706,14 +706,14 @@ def _validate_cfg(icl_cfg: dict[str, Any]): return evaluators, logger_keys -@experimental_function('tp_strategy') -def build_tp_strategy( +@experimental_function('tp_strategies') +def build_tp_strategies( name: str, model: ComposerModel, ) -> dict[str, ParallelStyle]: return construct_from_registry( name=name, - registry=registry.tp_strategy, + registry=registry.tp_strategies, partial_function=False, kwargs={'model': model}, ) diff --git a/tests/models/utils/test_tp_strategy.py b/tests/models/utils/test_tp_strategies.py similarity index 89% rename from tests/models/utils/test_tp_strategy.py rename to tests/models/utils/test_tp_strategies.py index 19bad8abfd..3d0758e928 100644 --- a/tests/models/utils/test_tp_strategy.py +++ b/tests/models/utils/test_tp_strategies.py @@ -15,17 +15,17 @@ from llmfoundry.command_utils.train import train from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM -from llmfoundry.utils.builders import build_tp_strategy +from llmfoundry.utils.builders import build_tp_strategies from llmfoundry.utils.config_utils import process_init_device from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @pytest.mark.gpu @pytest.mark.filterwarnings( - 'ignore:tp_strategy is experimental and may change with future versions.' 
+ 'ignore:tp_strategies is experimental and may change with future versions.', ) -def test_ffn_tp_strategy_layer_plan(): - # Actual layer plan from tp_strategy=fnn +def test_ffn_tp_strategies_layer_plan(): + # Actual layer plan from tp_strategies=fnn tp_config = { 'strategy': 'ffn', } @@ -40,7 +40,7 @@ def test_ffn_tp_strategy_layer_plan(): 'vocab_size': 50368, } model = ComposerMPTCausalLM(**model_cfg) - layer_plan = build_tp_strategy(tp_config['strategy'], model) + layer_plan = build_tp_strategies(tp_config['strategy'], model) # Expected layer plan _expected_layer_plan = { @@ -97,6 +97,7 @@ def test_ffn_tp_strategy_layer_plan(): @pytest.mark.gpu def test_no_tp_with_one_gpu(): + """When we have one GPU, make a warning to use DDP and not FSDP-TP.""" with TemporaryDirectory() as tmp_path: # Make `train_cfg`` with a tensor parallelism strategy train_cfg_path: str = 'scripts/train/yamls/pretrain/mpt-125m.yaml' @@ -106,7 +107,7 @@ def test_no_tp_with_one_gpu(): train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') train_cfg.tp_config = {'strategy': 'ffn'} - # Expect a warning to use DDP and not FSDP-TP when we have one GPU. + # Expect a warning with pytest.warns( UserWarning, match= @@ -117,6 +118,7 @@ def test_no_tp_with_one_gpu(): @pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies def test_no_tp_with_moes(): + """Test that tensor parallelism is not compatible with MoEs.""" # Make `cfg` for MoE model, fsdp, and tp (tensor parallelism) train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml' with open(train_cfg_path, 'r', encoding='utf-8') as f: @@ -134,4 +136,4 @@ def test_no_tp_with_moes(): # if __name__ == '__main__': -# test_ffn_tp_strategy_layer_plan() +# test_ffn_tp_strategies_layer_plan() diff --git a/tests/test_registry.py b/tests/test_registry.py index 6c103d3504..90ef3bfaac 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -47,7 +47,7 @@ def test_expected_registries_exist(): 'config_transforms', 'load_planners', 'save_planners', - 'tp_strategy', + 'tp_strategies', } assert existing_registries == expected_registry_names From df169e89f604649a4ef1423ed5650857b8246c7a Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 21:50:29 +0000 Subject: [PATCH 57/68] make tp dir --- llmfoundry/__init__.py | 2 ++ llmfoundry/models/__init__.py | 5 +---- llmfoundry/tp/__init__.py | 11 +++++++++++ llmfoundry/{models/utils => tp}/tp_strategies.py | 2 +- tests/models/utils/test_tp_strategies.py | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 llmfoundry/tp/__init__.py rename llmfoundry/{models/utils => tp}/tp_strategies.py (97%) diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index b851aaa559..07e8f35747 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -48,6 +48,7 @@ models, optim, tokenizers, + tp, utils, ) from llmfoundry._version import __version__ @@ -87,5 +88,6 @@ 'models', 'optim', 'tokenizers', + 'tp', 'utils', ] diff --git a/llmfoundry/models/__init__.py b/llmfoundry/models/__init__.py index 569ef116f0..827fe2ce56 100644 --- a/llmfoundry/models/__init__.py +++ b/llmfoundry/models/__init__.py @@ -15,8 +15,7 @@ MPTModel, MPTPreTrainedModel, ) -from llmfoundry.models.utils.tp_strategies import ffn_tp_strategies -from llmfoundry.registry import models, tp_strategies +from llmfoundry.registry import models models.register('mpt_causal_lm', func=ComposerMPTCausalLM) models.register('hf_causal_lm', func=ComposerHFCausalLM) @@ -25,7 +24,6 @@ models.register('fmapi_causal_lm', 
func=FMAPICasualLMEvalWrapper) models.register('openai_chat', func=OpenAIChatAPIEvalWrapper) models.register('fmapi_chat', func=FMAPIChatAPIEvalWrapper) -tp_strategies.register('ffn', func=ffn_tp_strategies) __all__ = [ 'ComposerHFCausalLM', @@ -39,5 +37,4 @@ 'FMAPICasualLMEvalWrapper', 'OpenAIChatAPIEvalWrapper', 'FMAPIChatAPIEvalWrapper', - 'ffn_tp_strategies', ] diff --git a/llmfoundry/tp/__init__.py b/llmfoundry/tp/__init__.py new file mode 100644 index 0000000000..55ceda0b76 --- /dev/null +++ b/llmfoundry/tp/__init__.py @@ -0,0 +1,11 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +from llmfoundry.registry import tp_strategies +from llmfoundry.tp.tp_strategies import ffn_tp_strategies + +tp_strategies.register('ffn', func=ffn_tp_strategies) + +__all__ = [ + 'ffn_tp_strategies', +] diff --git a/llmfoundry/models/utils/tp_strategies.py b/llmfoundry/tp/tp_strategies.py similarity index 97% rename from llmfoundry/models/utils/tp_strategies.py rename to llmfoundry/tp/tp_strategies.py index 5613589c2b..7dbfa3b90c 100644 --- a/llmfoundry/models/utils/tp_strategies.py +++ b/llmfoundry/tp/tp_strategies.py @@ -12,7 +12,7 @@ def ffn_tp_strategies(model: ComposerModel) -> dict[str, ParallelStyle]: - TP_LAYERS = {'up_proj', 'down_proj'} + TP_LAYERS = {'ffn', 'ffn.up_proj', 'ffn.down_proj'} # Validate that all TP_LAYERS are in model tp_layers_in_model = { diff --git a/tests/models/utils/test_tp_strategies.py b/tests/models/utils/test_tp_strategies.py index 3d0758e928..357e146f11 100644 --- a/tests/models/utils/test_tp_strategies.py +++ b/tests/models/utils/test_tp_strategies.py @@ -25,7 +25,7 @@ 'ignore:tp_strategies is experimental and may change with future versions.', ) def test_ffn_tp_strategies_layer_plan(): - # Actual layer plan from tp_strategies=fnn + # Create layer plan from fnn tp_strategy tp_config = { 'strategy': 'ffn', } From e6ab9296d0bc0674412eacea92a58f7e57d3eaca Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 22:00:45 +0000 Subject: [PATCH 58/68] rename --- llmfoundry/tp/__init__.py | 6 +++--- llmfoundry/tp/{tp_strategies.py => ffn_tp_strategy.py} | 2 +- tests/models/utils/test_tp_strategies.py | 6 +----- 3 files changed, 5 insertions(+), 9 deletions(-) rename llmfoundry/tp/{tp_strategies.py => ffn_tp_strategy.py} (96%) diff --git a/llmfoundry/tp/__init__.py b/llmfoundry/tp/__init__.py index 55ceda0b76..48b788befa 100644 --- a/llmfoundry/tp/__init__.py +++ b/llmfoundry/tp/__init__.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.registry import tp_strategies -from llmfoundry.tp.tp_strategies import ffn_tp_strategies +from llmfoundry.tp.ffn_tp_strategy import ffn -tp_strategies.register('ffn', func=ffn_tp_strategies) +tp_strategies.register('ffn', func=ffn) __all__ = [ - 'ffn_tp_strategies', + 'ffn', ] diff --git a/llmfoundry/tp/tp_strategies.py b/llmfoundry/tp/ffn_tp_strategy.py similarity index 96% rename from llmfoundry/tp/tp_strategies.py rename to llmfoundry/tp/ffn_tp_strategy.py index 7dbfa3b90c..2804bfc747 100644 --- a/llmfoundry/tp/tp_strategies.py +++ b/llmfoundry/tp/ffn_tp_strategy.py @@ -11,7 +11,7 @@ from torch.distributed.tensor.parallel.style import ParallelStyle -def ffn_tp_strategies(model: ComposerModel) -> dict[str, ParallelStyle]: +def ffn(model: ComposerModel) -> dict[str, ParallelStyle]: TP_LAYERS = {'ffn', 'ffn.up_proj', 'ffn.down_proj'} # Validate that all TP_LAYERS are in model diff --git a/tests/models/utils/test_tp_strategies.py b/tests/models/utils/test_tp_strategies.py 
index 357e146f11..f0c5e40be7 100644 --- a/tests/models/utils/test_tp_strategies.py +++ b/tests/models/utils/test_tp_strategies.py @@ -24,7 +24,7 @@ @pytest.mark.filterwarnings( 'ignore:tp_strategies is experimental and may change with future versions.', ) -def test_ffn_tp_strategies_layer_plan(): +def test_ffn_tp_strategy(): # Create layer plan from fnn tp_strategy tp_config = { 'strategy': 'ffn', @@ -133,7 +133,3 @@ def test_no_tp_with_moes(): match='Tensor Parallelism is not currently supported for MoE models.', ): process_init_device(model_cfg, fsdp_cfg, tp_cfg) - - -# if __name__ == '__main__': -# test_ffn_tp_strategies_layer_plan() From 6caeea9044048500ad7d86022e41159b4c5ac4fa Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 22:07:23 +0000 Subject: [PATCH 59/68] better function names --- llmfoundry/tp/__init__.py | 6 +++--- llmfoundry/tp/ffn_tp_strategy.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llmfoundry/tp/__init__.py b/llmfoundry/tp/__init__.py index 48b788befa..323ae23727 100644 --- a/llmfoundry/tp/__init__.py +++ b/llmfoundry/tp/__init__.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 from llmfoundry.registry import tp_strategies -from llmfoundry.tp.ffn_tp_strategy import ffn +from llmfoundry.tp.ffn_tp_strategy import ffn_tp_strategy -tp_strategies.register('ffn', func=ffn) +tp_strategies.register('ffn', func=ffn_tp_strategy) __all__ = [ - 'ffn', + 'ffn_tp_strategy', ] diff --git a/llmfoundry/tp/ffn_tp_strategy.py b/llmfoundry/tp/ffn_tp_strategy.py index 2804bfc747..1de92ef6ae 100644 --- a/llmfoundry/tp/ffn_tp_strategy.py +++ b/llmfoundry/tp/ffn_tp_strategy.py @@ -11,7 +11,7 @@ from torch.distributed.tensor.parallel.style import ParallelStyle -def ffn(model: ComposerModel) -> dict[str, ParallelStyle]: +def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]: TP_LAYERS = {'ffn', 'ffn.up_proj', 'ffn.down_proj'} # Validate that all TP_LAYERS are in model From 3426ea38c310f663e1c9ab90bf3f749162619f45 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 22:19:19 +0000 Subject: [PATCH 60/68] import fix style --- llmfoundry/command_utils/train.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index 0b746c05f8..29878714f6 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -19,11 +19,7 @@ TraceHandler, cyclic_schedule, ) -from composer.utils import ( - dist, - get_device, - reproducibility, -) +from composer.utils import dist, get_device, reproducibility from omegaconf import DictConfig from omegaconf import OmegaConf as om From 2683c6d959533249870ac9ab9f7adfefe5e4f4dc Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 22:20:09 +0000 Subject: [PATCH 61/68] delete tp yaml --- scripts/train/yamls/pretrain/tp-mpt-125m.yaml | 133 ------------------ 1 file changed, 133 deletions(-) delete mode 100644 scripts/train/yamls/pretrain/tp-mpt-125m.yaml diff --git a/scripts/train/yamls/pretrain/tp-mpt-125m.yaml b/scripts/train/yamls/pretrain/tp-mpt-125m.yaml deleted file mode 100644 index 5f6f55c06a..0000000000 --- a/scripts/train/yamls/pretrain/tp-mpt-125m.yaml +++ /dev/null @@ -1,133 +0,0 @@ -variables: - data_local: ./my-copy-c4 - data_remote: # If blank, files must be present in data_local - max_seq_len: 2048 - global_seed: 17 - - # Run Name - run_name: # If left blank, will be read from env var $RUN_NAME - -max_seq_len: ${variables.max_seq_len} -run_name: 
${variables.run_name} - -# Model -model: - name: mpt_causal_lm - init_device: meta - d_model: 768 - n_heads: 12 - n_layers: 12 - expansion_ratio: 4 - max_seq_len: ${variables.max_seq_len} - vocab_size: 50368 - attn_config: - attn_impl: flash - -# Tokenizer -tokenizer: - name: EleutherAI/gpt-neox-20b - kwargs: - model_max_length: ${variables.max_seq_len} - -# Dataloaders -train_loader: - name: text - dataset: - local: ${variables.data_local} - remote: ${variables.data_remote} - split: train_small - shuffle: true - max_seq_len: ${variables.max_seq_len} - shuffle_seed: ${variables.global_seed} - drop_last: true - num_workers: 8 - -eval_loader: - name: text - dataset: - local: ${variables.data_local} - remote: ${variables.data_remote} - split: val_small - shuffle: false - max_seq_len: ${variables.max_seq_len} - shuffle_seed: ${variables.global_seed} - replication: 2 - drop_last: false - num_workers: 8 - -# Optimization -scheduler: - name: cosine_with_warmup - t_warmup: 100ba - alpha_f: 0.1 - -optimizer: - name: decoupled_adamw - lr: 6.0e-4 - betas: - - 0.9 - - 0.95 - eps: 1.0e-08 - weight_decay: 0.0 - -algorithms: - gradient_clipping: - clipping_type: norm - clipping_threshold: 1.0 - -max_duration: 100ba -eval_interval: 500ba -eval_first: false -eval_subset_num_batches: 0 -global_train_batch_size: 256 - -# System -seed: ${variables.global_seed} -device_eval_batch_size: 16 -device_train_microbatch_size: 16 -# device_train_microbatch_size: auto -precision: amp_bf16 - -# FSDP -fsdp_config: - sharding_strategy: FULL_SHARD - mixed_precision: PURE - activation_checkpointing: false - activation_checkpointing_reentrant: false - activation_cpu_offload: false - limit_all_gathers: true - -# TP -tp_config: - strategy: ffn - tensor_parallel_degree: 2 - -# Logging -progress_bar: false -log_to_console: true -console_log_interval: 1ba - -callbacks: - speed_monitor: - window_size: 10 - lr_monitor: {} - memory_monitor: {} - runtime_estimator: {} - -loggers: - mlflow: - experiment_name: tp - # model_registry_prefix: datasets.eitanturok.${run_name} - -# loggers: -# wandb: {} - -# Checkpoint to local filesystem or remote object store -# save_interval: 500ba -# save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK -# save_folder: ./{run_name}/checkpoints -# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints - -# Load from local filesystem or remote object store -# load_path: ./gpt-125m/checkpoints/latest-rank{rank}.pt -# load_path: s3://my-bucket/my-folder/gpt-125m/checkpoints/latest-rank{rank}.pt From d5779c7900b63e7df791645e6a5a1302f177ad00 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 22:27:34 +0000 Subject: [PATCH 62/68] warn checkpointing does not work --- llmfoundry/utils/builders.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 12e734446c..b1e1a303fe 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -7,14 +7,9 @@ import logging import os import re +import warnings from collections import OrderedDict -from typing import ( - Any, - ContextManager, - Iterable, - Optional, - Union, -) +from typing import Any, ContextManager, Iterable, Optional, Union import torch from composer.core import Algorithm, Callback, Evaluator @@ -711,6 +706,10 @@ def build_tp_strategies( name: str, model: ComposerModel, ) -> dict[str, ParallelStyle]: + + warnings.warn( + 'Checkpointing is not currently supported for tensor parallelism due to this 
pytorch bug: https://github.com/pytorch/pytorch/issues/134095#issuecomment-2345018244' + ) return construct_from_registry( name=name, registry=registry.tp_strategies, From 7ac37bca5eb195d5054fa3d222e0af45c6dff5d6 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Thu, 26 Sep 2024 22:31:47 +0000 Subject: [PATCH 63/68] better description --- llmfoundry/utils/builders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index b1e1a303fe..a1ffe57169 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -701,7 +701,7 @@ def _validate_cfg(icl_cfg: dict[str, Any]): return evaluators, logger_keys -@experimental_function('tp_strategies') +@experimental_function('Tensor Parallelism') def build_tp_strategies( name: str, model: ComposerModel, From 67a1c7ba89b6365e2203fd232b6bc5080319c7dc Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 27 Sep 2024 13:40:42 +0000 Subject: [PATCH 64/68] cleanup --- tests/models/utils/test_tp_strategies.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/models/utils/test_tp_strategies.py b/tests/models/utils/test_tp_strategies.py index f0c5e40be7..fd2fa384ce 100644 --- a/tests/models/utils/test_tp_strategies.py +++ b/tests/models/utils/test_tp_strategies.py @@ -25,6 +25,7 @@ 'ignore:tp_strategies is experimental and may change with future versions.', ) def test_ffn_tp_strategy(): + """Test the FFN tensor parallelism strategy is correct.""" # Create layer plan from fnn tp_strategy tp_config = { 'strategy': 'ffn', @@ -97,12 +98,9 @@ def test_ffn_tp_strategy(): @pytest.mark.gpu def test_no_tp_with_one_gpu(): - """When we have one GPU, make a warning to use DDP and not FSDP-TP.""" + """Test that when we have one GPU, we use DDP and not FSDP-TP.""" with TemporaryDirectory() as tmp_path: # Make `train_cfg`` with a tensor parallelism strategy - train_cfg_path: str = 'scripts/train/yamls/pretrain/mpt-125m.yaml' - with open(train_cfg_path, 'r', encoding='utf-8') as f: - train_cfg = om.load(f) dataset_name = create_c4_dataset_xxsmall(Path(tmp_path)) train_cfg = gpt_tiny_cfg(dataset_name, 'gpu') train_cfg.tp_config = {'strategy': 'ffn'} @@ -119,7 +117,7 @@ def test_no_tp_with_one_gpu(): @pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies def test_no_tp_with_moes(): """Test that tensor parallelism is not compatible with MoEs.""" - # Make `cfg` for MoE model, fsdp, and tp (tensor parallelism) + # Make `cfg` for MoE model, fsdp, and tp train_cfg_path: str = 'scripts/train/yamls/pretrain/testing-moe.yaml' with open(train_cfg_path, 'r', encoding='utf-8') as f: train_cfg = om.load(f) @@ -127,7 +125,7 @@ def test_no_tp_with_moes(): fsdp_cfg = train_cfg.fsdp_config tp_cfg = {'strategy': 'ffn'} - # Expect an error for using tensor parallelism with MoEs + # Expect an error with pytest.raises( ValueError, match='Tensor Parallelism is not currently supported for MoE models.', From eb2b59180a468fdb1491f4fc6e7a780537c800a6 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 27 Sep 2024 13:42:40 +0000 Subject: [PATCH 65/68] tp test directory --- tests/tp/__init__.py | 3 +++ tests/{models/utils => tp}/test_tp_strategies.py | 0 2 files changed, 3 insertions(+) create mode 100644 tests/tp/__init__.py rename tests/{models/utils => tp}/test_tp_strategies.py (100%) diff --git a/tests/tp/__init__.py b/tests/tp/__init__.py new file mode 100644 index 0000000000..0fd8896d0a --- /dev/null +++ b/tests/tp/__init__.py @@ -0,0 +1,3 @@ +# 
Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + diff --git a/tests/models/utils/test_tp_strategies.py b/tests/tp/test_tp_strategies.py similarity index 100% rename from tests/models/utils/test_tp_strategies.py rename to tests/tp/test_tp_strategies.py From 86992f95b4adfbcde4c81807ea45603a241cd6c0 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 27 Sep 2024 13:51:09 +0000 Subject: [PATCH 66/68] style --- llmfoundry/utils/builders.py | 2 +- tests/tp/__init__.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a1ffe57169..687b21b46d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -708,7 +708,7 @@ def build_tp_strategies( ) -> dict[str, ParallelStyle]: warnings.warn( - 'Checkpointing is not currently supported for tensor parallelism due to this pytorch bug: https://github.com/pytorch/pytorch/issues/134095#issuecomment-2345018244' + 'Checkpointing is not currently supported for tensor parallelism due to this pytorch bug: https://github.com/pytorch/pytorch/issues/134095#issuecomment-2345018244', ) return construct_from_registry( name=name, diff --git a/tests/tp/__init__.py b/tests/tp/__init__.py index 0fd8896d0a..80950cb7b4 100644 --- a/tests/tp/__init__.py +++ b/tests/tp/__init__.py @@ -1,3 +1,2 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 - From 24ffeb44b1611fc09267369a468bb01acface9d4 Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 27 Sep 2024 13:55:01 +0000 Subject: [PATCH 67/68] type checking --- llmfoundry/data/finetuning/dataloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 69051a2d51..181528c51f 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -286,6 +286,7 @@ def build_finetuning_dataloader( # Get the preprocessing function. proto_preprocessing_fn = dataset_cfg.get('preprocessing_fn') + assert proto_preprocessing_fn is not None if isinstance(proto_preprocessing_fn, (dict, DictConfig)): preprocessing_fn = dataset_constructor.get_preprocessing_fn_from_dict( dict(proto_preprocessing_fn), From 04da536ef5204f5aebc249f5b824b40903be558d Mon Sep 17 00:00:00 2001 From: Eitan Turok Date: Fri, 27 Sep 2024 14:12:03 +0000 Subject: [PATCH 68/68] remove assert --- llmfoundry/data/finetuning/dataloader.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 181528c51f..69051a2d51 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -286,7 +286,6 @@ def build_finetuning_dataloader( # Get the preprocessing function. proto_preprocessing_fn = dataset_cfg.get('preprocessing_fn') - assert proto_preprocessing_fn is not None if isinstance(proto_preprocessing_fn, (dict, DictConfig)): preprocessing_fn = dataset_constructor.get_preprocessing_fn_from_dict( dict(proto_preprocessing_fn),
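
Usage note: the net effect of the series above is a `tp_strategies` registry, a built-in `ffn` strategy living in `llmfoundry/tp/ffn_tp_strategy.py`, and a `build_tp_strategies` builder that `train.py` calls to turn `tp_config['strategy']` into the `layer_plan` Composer consumes. The sketch below shows how a downstream user could plug a custom strategy into that registry. It is a minimal illustration under stated assumptions: the strategy name `colwise_up_proj` and its layer selection are invented for the example and are not shipped in these patches; only the registry/builder entry points mirror the diffs above, and the experimental-feature and checkpointing caveats added in PATCH 54 and PATCH 62 apply to any such strategy.

    from composer.models import ComposerModel
    from torch.distributed._tensor import Replicate, Shard
    from torch.distributed.tensor.parallel import ColwiseParallel
    from torch.distributed.tensor.parallel.style import ParallelStyle

    from llmfoundry.registry import tp_strategies


    def colwise_up_proj_strategy(model: ComposerModel) -> dict[str, ParallelStyle]:
        """Illustrative strategy: shard only `up_proj` submodules columnwise."""
        layer_plan: dict[str, ParallelStyle] = {}
        for name, _ in model.named_modules():
            # Match on the final component of the fully qualified module name.
            if name.split('.')[-1] == 'up_proj':
                layer_plan[name] = ColwiseParallel(
                    input_layouts=Replicate(),
                    output_layouts=Shard(-1),
                )
        return layer_plan


    # Register under an assumed name, mirroring tp_strategies.register('ffn', ...).
    tp_strategies.register('colwise_up_proj', func=colwise_up_proj_strategy)

    # train.py then resolves the strategy by name and hands the result to Composer:
    #   layer_plan = llmfoundry.utils.builders.build_tp_strategies('colwise_up_proj', model)
    #   tp_config['layer_plan'] = layer_plan

In a run config the strategy is selected the same way the (now deleted) tp-mpt-125m.yaml selected the built-in one, i.e. a `tp_config` block with `strategy` and `tensor_parallel_degree`. Per the tests in this series, single-GPU runs fall back to DDP with a UserWarning, and MoE models (any `ffn_type` in `ffns_with_megablocks`) reject tensor parallelism with a ValueError in `process_init_device`.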