Commit

Fix gpu tests test_tp_train and test_huggingface_conversion_callback_interval (#1642)
irenedea authored Nov 6, 2024
1 parent 066edce commit 1f4de8f
Showing 2 changed files with 43 additions and 49 deletions.
7 changes: 7 additions & 0 deletions tests/a_scripts/inference/test_convert_composer_to_hf.py
@@ -152,6 +152,13 @@ def check_hf_tokenizer_equivalence(
tokenizer1.__dict__['init_kwargs'].pop('vocab_file', None)
tokenizer2.__dict__['init_kwargs'].pop('vocab_file', None)

# tokenizer.init_kwargs['merges_file'] is set when the tokenizer is loaded with AutoTokenizer.from_pretrained, but is
# None when the tokenizer is only saved and reloaded.
# When it is set, merges_file is the path the tokenizer was loaded from, which for the reloaded tokenizer is just a
# temporary directory, so we remove it and don't compare it between the two tokenizers.
tokenizer1.__dict__['init_kwargs'].pop('merges_file', None)
tokenizer2.__dict__['init_kwargs'].pop('merges_file', None)

# vocab_file will be the path that the tokenizer was loaded from, which will just be a temporary directory for
# the reloaded tokenizer, so we remove it and don't compare it between the two tokenizers
tokenizer1.__dict__.pop('vocab_file', None)
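
The hunk above drops path-valued entries from init_kwargs before comparing tokenizers. A minimal standalone sketch of that normalization idea (hypothetical helper, not code from this commit), assuming both tokenizers carry an init_kwargs dict:

def normalized_init_kwargs(tokenizer) -> dict:
    # Copy init_kwargs and drop the keys whose values are load-location paths
    # ('vocab_file', 'merges_file'); they point at whatever temporary directory
    # the tokenizer happened to be loaded from, so they are not meaningful when
    # checking that two tokenizers are equivalent.
    kwargs = dict(tokenizer.__dict__.get('init_kwargs', {}))
    for path_key in ('vocab_file', 'merges_file'):
        kwargs.pop(path_key, None)
    return kwargs

# Usage (sketch): assert normalized_init_kwargs(tok1) == normalized_init_kwargs(tok2)
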
85 changes: 36 additions & 49 deletions tests/tp/test_tp_strategies.py
@@ -3,13 +3,12 @@

import os
import pathlib
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Optional

import pytest
from composer import Trainer
from composer.utils import dist
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from torch.distributed._tensor import Replicate, Shard
@@ -151,58 +150,46 @@ def get_loss_array(trainer: Trainer):
@pytest.mark.world_size(4)
@pytest.mark.parametrize('tp_degree', [2])
@pytest.mark.parametrize('tp_strategy', ['ffn'])
def test_tp_train(tp_degree: int, tp_strategy: str):
def test_tp_train(tp_degree: int, tp_strategy: str, tmp_path: Path):
"""Test that we can train with FSDP-TP."""
my_dir = Path('/my-data-dir')

try:
# create c4 dataset
if my_dir.is_dir() and my_dir.exists():
shutil.rmtree(my_dir)
my_dir.mkdir(parents=True)
tp_dataset_name = create_c4_dataset_xxsmall(my_dir)

# Train model with TP and get loss
tp_cfg = get_cfg(pathlib.Path(tp_dataset_name), tp_strategy, tp_degree)
tp_trainer = train(tp_cfg)
tp_trainer.close()
tp_loss = get_loss_array(tp_trainer)

# Compare loss and expected loss for TP
import numpy as np
expected_tp_loss = np.array([
12.02126884,
11.96996498,
12.02957344,
11.97966957,
11.99677086,
11.96347618,
])
np.testing.assert_allclose(tp_loss, expected_tp_loss)
except Exception as e:
raise e
finally:
# always remove the directory
if os.path.isdir(my_dir):
shutil.rmtree(my_dir)
tp_dataset_name = create_c4_dataset_xxsmall(tmp_path)

tp_dataset_name = dist.all_gather_object(tp_dataset_name)[0]

# Train model with TP and get loss
tp_cfg = get_cfg(pathlib.Path(tp_dataset_name), tp_strategy, tp_degree)
tp_trainer = train(tp_cfg)
tp_trainer.close()
tp_loss = get_loss_array(tp_trainer)

# Compare loss and expected loss for TP
import numpy as np
expected_tp_loss = np.array([
12.02126884,
11.96996498,
12.02957344,
11.97966957,
11.99677086,
11.96347618,
])
np.testing.assert_allclose(tp_loss, expected_tp_loss)


@pytest.mark.gpu
def test_tp_train_with_one_gpu():
def test_tp_train_with_one_gpu(tmp_path: Path):
"""Test that when we have one GPU, we train DDP and not FSDP-TP."""
with TemporaryDirectory() as tmp_path:
# Make `train_cfg`` with a tensor parallelism strategy
dataset_name = create_c4_dataset_xxsmall(Path(tmp_path))
train_cfg = gpt_tiny_cfg(dataset_name, 'gpu')
train_cfg.tp_config = {'strategy': 'ffn'}

# Expect a warning
with pytest.warns(
UserWarning,
match=
r'FSDP\+TP is not applicable for single-GPU training. Reverting to DDP.',
):
train(train_cfg)
# Make `train_cfg` with a tensor parallelism strategy
dataset_name = create_c4_dataset_xxsmall(tmp_path)
train_cfg = gpt_tiny_cfg(dataset_name, 'gpu')
train_cfg.tp_config = {'strategy': 'ffn'}

# Expect a warning
with pytest.warns(
UserWarning,
match=
r'FSDP\+TP is not applicable for single-GPU training. Reverting to DDP.',
):
train(train_cfg)


@pytest.mark.gpu # use gpu because `megablocks` only installed with `gpu` dependencies
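
The test_tp_train rewrite above replaces the hard-coded /my-data-dir and manual shutil cleanup with pytest's tmp_path fixture, and uses dist.all_gather_object so every rank trains on the dataset path created by rank 0. A minimal sketch of that pattern (hypothetical function and dataset name, not code from this commit):

from pathlib import Path

from composer.utils import dist


def shared_dataset_path(tmp_path: Path) -> str:
    # With world_size > 1 each rank can receive a different tmp_path, so
    # gather every rank's candidate path and have all ranks use rank 0's
    # entry; pytest removes the temporary directories afterwards, so no
    # manual shutil.rmtree cleanup is needed.
    local_path = str(tmp_path / 'c4-xxsmall')
    return dist.all_gather_object(local_path)[0]

This assumes all ranks can read rank 0's temporary directory (same node or a shared filesystem), which holds for the single-node GPU tests changed here.
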
