Commit

Fix gpu tests test_tp_train and test_huggingface_conversion_callback_interval (#1642)
irenedea authored Nov 6, 2024
1 parent 066edce commit 1f4de8f
Showing 2 changed files with 43 additions and 49 deletions.
7 changes: 7 additions & 0 deletions tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -152,6 +152,13 @@ def check_hf_tokenizer_equivalence(
tokenizer1.__dict__['init_kwargs'].pop('vocab_file', None)
tokenizer2.__dict__['init_kwargs'].pop('vocab_file', None)

# tokenizer.init_kwargs['merges_file'] is set when the tokenizer is loaded with AutoTokenizer.from_pretrained, but is
# None when the tokenizer is only saved and reloaded.
# When it is set, merges_file is the path the tokenizer was loaded from, which for the reloaded tokenizer is just a
# temporary directory, so we remove it and don't compare it between the two tokenizers.
tokenizer1.__dict__['init_kwargs'].pop('merges_file', None)
tokenizer2.__dict__['init_kwargs'].pop('merges_file', None)

# vocab_file will be the path that the tokenizer was loaded from, which will just be a temporary directory for
# the reloaded tokenizer, so we remove it and don't compare it between the two tokenizers
tokenizer1.__dict__.pop('vocab_file', None)
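
The hunk above drops path-valued entries from init_kwargs before comparing tokenizers. A minimal standalone sketch of that normalization idea (hypothetical helper, not code from this commit), assuming both tokenizers carry an init_kwargs dict:

def normalized_init_kwargs(tokenizer) -> dict:
    # Copy init_kwargs and drop the keys whose values are load-location paths
    # ('vocab_file', 'merges_file'); they point at whatever temporary directory
    # the tokenizer happened to be loaded from, so they are not meaningful when
    # checking that two tokenizers are equivalent.
    kwargs = dict(tokenizer.__dict__.get('init_kwargs', {}))
    for path_key in ('vocab_file', 'merges_file'):
        kwargs.pop(path_key, None)
    return kwargs

# Usage (sketch): assert normalized_init_kwargs(tok1) == normalized_init_kwargs(tok2)
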
85 changes: 36 additions & 49 deletions tests/tp/test_tp_strategies.py
@@ -3,13 +3,12 @@

import os
import pathlib
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional

import pytest
from composer import Trainer
from composer.utils import dist
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from torch.distributed._tensor import Replicate, Shard
@@ -151,58 +150,46 @@ def get_loss_array(trainer: Trainer):
@pytest.mark.world_size(4)
@pytest.mark.parametrize('tp_degree', [2])
@pytest.mark.parametrize('tp_strategy', ['ffn'])
def test_tp_train(tp_degree: int, tp_strategy: str):
def test_tp_train(tp_degree: int, tp_strategy: str, tmp_path: Path):
"""Test that we can train with FSDP-TP."""
my_dir = Path('/my-data-dir')

try:
# create c4 dataset
if my_dir.is_dir() and my_dir.exists():
shutil.rmtree(my_dir)
my_dir.mkdir(parents=True)
tp_dataset_name = create_c4_dataset_xxsmall(my_dir)

# Train model with TP and get loss
tp_cfg = get_cfg(pathlib.Path(tp_dataset_name), tp_strategy, tp_degree)
tp_trainer = train(tp_cfg)
tp_trainer.close()
tp_loss = get_loss_array(tp_trainer)

# Compare loss and expected loss for TP
import numpy as np
expected_tp_loss = np.array([
12.02126884,
11.96996498,
12.02957344,
11.97966957,
11.99677086,
11.96347618,
])
np.testing.assert_allclose(tp_loss, expected_tp_loss)
except Exception as e:
raise e
finally:
# always remove the directory
if os.path.isdir(my_dir):
shutil.rmtree(my_dir)
tp_dataset_name = create_c4_dataset_xxsmall(tmp_path)

tp_dataset_name = dist.all_gather_object(tp_dataset_name)[0]

# Train model with TP and get loss
tp_cfg = get_cfg(pathlib.Path(tp_dataset_name), tp_strategy, tp_degree)
tp_trainer = train(tp_cfg)
tp_trainer.close()
tp_loss = get_loss_array(tp_trainer)

# Compare loss and expected loss for TP
import numpy as np
expected_tp_loss = np.array([
12.02126884,
11.96996498,
12.02957344,
11.97966957,
11.99677086,
11.96347618,
])
np.testing.assert_allclose(tp_loss, expected_tp_loss)


@pytest.mark.gpu
def test_tp_train_with_one_gpu():
def test_tp_train_with_one_gpu(tmp_path: Path):
"""Test that when we have one GPU, we train DDP and not FSDP-TP."""
with TemporaryDirectory() as tmp_path:
# Make `train_cfg`` with a tensor parallelism strategy
dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
train_cfg = gpt_tiny_cfg(dataset_name, 'gpu')
train_cfg.tp_config = {'strategy': 'ffn'}

# Expect a warning
with pytest.warns(
UserWarning,
match=
r'FSDP\+TP is not applicable for single-GPU training. Reverting to DDP.',
):
train(train_cfg)
# Make `train_cfg` with a tensor parallelism strategy
dataset_name = create_c4_dataset_xxsmall(tmp_path)
train_cfg = gpt_tiny_cfg(dataset_name, 'gpu')
train_cfg.tp_config = {'strategy': 'ffn'}

# Expect a warning
with pytest.warns(
UserWarning,
match=
r'FSDP\+TP is not applicable for single-GPU training. Reverting to DDP.',
):
train(train_cfg)


@pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies
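
The test_tp_train rewrite above replaces the hard-coded /my-data-dir and manual shutil cleanup with pytest's tmp_path fixture, and uses dist.all_gather_object so every rank trains on the dataset path created by rank 0. A minimal sketch of that pattern (hypothetical function and dataset name, not code from this commit):

from pathlib import Path

from composer.utils import dist


def shared_dataset_path(tmp_path: Path) -> str:
    # With world_size > 1 each rank can receive a different tmp_path, so
    # gather every rank's candidate path and have all ranks use rank 0's
    # entry; pytest removes the temporary directories afterwards, so no
    # manual shutil.rmtree cleanup is needed.
    local_path = str(tmp_path / 'c4-xxsmall')
    return dist.all_gather_object(local_path)[0]

This assumes all ranks can read rank 0's temporary directory (same node or a shared filesystem), which holds for the single-node GPU tests changed here.
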
