From bf6e6fd05b39d52b4f2a3bbb7d98068d7b60dfff Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Thu, 14 Nov 2024 12:59:00 -0500
Subject: [PATCH] gradient accumulation tests, embeddings w pad_token fix,
 smaller models (#2059)

* add more test cases for gradient accumulation and fix zero3
* swap out for smaller model
* fix missing return
* fix missing pad_token in config
* support concurrency for multigpu testing
* cast empty deepspeed to empty string for zero3 check
* fix temp_dir as fixture so parametrize works properly
* fix test file for multigpu evals
* don't use default
* don't use default for fsdp_state_dict_type
* don't use llama tokenizer w smollm
* also automatically cancel multigpu for concurrency
---
 .github/workflows/multi-gpu-e2e.yml           |   5 +
 cicd/multigpu.sh                              |   4 ++--
 .../config/models/input/v0_4_1/__init__.py   |  19 +++
 src/axolotl/utils/models.py                   |   1 +
 tests/e2e/conftest.py                         |  16 +++
 tests/e2e/multigpu/test_eval.py               |  12 +-
 tests/e2e/multigpu/test_llama.py              | 115 +++++++++---------
 tests/e2e/multigpu/test_qwen2.py              |  21 ++--
 8 files changed, 120 insertions(+), 73 deletions(-)
 create mode 100644 tests/e2e/conftest.py

diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index c2fb5dfb5d..2fecde5a97 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -8,6 +8,11 @@ on:
   schedule:
     - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
 
+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
   test-axolotl-multigpu:
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
diff --git a/cicd/multigpu.sh b/cicd/multigpu.sh
index ff7f9f19a5..05d1bbbf2a 100755
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,4 +2,4 @@
 set -e
 
-# only run one test at a time so as not to OOM the GPU
-pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
+# run at most two tests at a time so as not to OOM the GPUs
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 1feb8aae86..10e80d9f33 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -1291,6 +1291,25 @@ def check_use_reentrant_mismatch(cls, data):
             )
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def warn_qlora_zero3_w_use_reentrant(cls, data):
+        if (
+            data.get("adapter") == "qlora"
+            and data.get("gradient_checkpointing_kwargs", {})
+            and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant")
+            is False
+            and "zero3" in data.get("deepspeed", "")
+        ):
+            # may result in:
+            # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint:
+            # Recomputed values for the following tensors have different metadata
+            # than during the forward pass.
+            LOG.warning(
+                "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
+            )
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def check_val_w_test_datasets(cls, data):
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 41e89dbfb2..db66c65f25 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -238,6 +238,7 @@ def load_tokenizer(cfg):
                     x in cfg.lora_modules_to_save for x in lora_modules_to_save
                 )
             )
+            and k != "pad_token"
         ):
             lora_modules_to_save = ", ".join(
                 [f"`{x}`" for x in lora_modules_to_save]
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
new file mode 100644
index 0000000000..723a44f03a
--- /dev/null
+++ b/tests/e2e/conftest.py
@@ -0,0 +1,16 @@
+"""
+shared pytest fixtures
+"""
+import shutil
+import tempfile
+
+import pytest
+
+
+@pytest.fixture
+def temp_dir():
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)
diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py
index 65d26bb824..068a9220ca 100644
--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -3,28 +3,25 @@
 """
 import logging
 import os
-import unittest
 from pathlib import Path
 
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
 
 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
 
 
-class TestMultiGPUEval(unittest.TestCase):
+class TestMultiGPUEval:
     """
     Test case for MultiGPU Eval Sample Packing
     """
 
-    @with_temp_dir
     def test_eval_sample_packing(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -83,13 +80,14 @@ def test_eval_sample_packing(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
             ]
         )
 
-    @with_temp_dir
     def test_eval(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -148,6 +146,8 @@ def test_eval(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 8087e08e3e..b2c8abc604 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -4,17 +4,17 @@
 
 import logging
 import os
-import unittest
 from pathlib import Path
 
 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from huggingface_hub import snapshot_download
+from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import is_hopper, with_temp_dir
+from ..utils import is_hopper
 
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
@@ -28,18 +28,16 @@ def download_model():
     snapshot_download("TinyLlama/TinyLlama_v1.1")
 
 
-class TestMultiGPULlama(unittest.TestCase):
+class TestMultiGPULlama:
     """
     Test case for Llama models using LoRA
     """
 
-    @with_temp_dir
     def test_lora_ddp(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "TinyLlama/TinyLlama_v1.1",
-                "tokenizer_type": "LlamaTokenizer",
"LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sequence_len": 2048, "adapter": "lora", "lora_r": 8, @@ -48,9 +46,7 @@ def test_lora_ddp(self, temp_dir): "lora_target_linear": True, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -81,19 +77,23 @@ def test_lora_ddp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_lora_ddp_packed(self, temp_dir): + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 4], + ) + def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sequence_len": 2048, "sample_packing": True, "eval_sample_packing": False, @@ -105,9 +105,7 @@ def test_lora_ddp_packed(self, temp_dir): "lora_target_linear": True, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -118,7 +116,7 @@ def test_lora_ddp_packed(self, temp_dir): "num_epochs": 1, "max_steps": 15, "micro_batch_size": 4, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -138,6 +136,8 @@ def test_lora_ddp_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), @@ -145,7 +145,6 @@ def test_lora_ddp_packed(self, temp_dir): ) @pytest.mark.skipif(is_hopper(), reason="h100 doesn't support 8-bit lora") - @with_temp_dir def test_dpo_lora_ddp(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( @@ -210,13 +209,14 @@ def test_dpo_lora_ddp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir def test_dpo_qlora_ddp(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( @@ -278,25 +278,27 @@ def test_dpo_qlora_ddp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_fsdp(self, temp_dir): + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 4], + ) + def test_fsdp(self, temp_dir, gradient_accumulation_steps): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sequence_len": 2048, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -305,9 +307,9 @@ def test_fsdp(self, temp_dir): }, ], "num_epochs": 1, - "max_steps": 15, + "max_steps": 10, "micro_batch_size": 4, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch", @@ -324,7 +326,7 @@ def test_fsdp(self, temp_dir): "fsdp_use_orig_params": False, 
"fsdp_cpu_ram_efficient_loading": False, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", - "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, } @@ -341,28 +343,29 @@ def test_fsdp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_fsdp_packed(self, temp_dir): + @pytest.mark.parametrize( + "fsdp_state_dict_type", + ["FULL_STATE_DICT", "SHARDED_STATE_DICT"], + ) + def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sample_packing": True, - "eval_sample_packing": False, "pad_to_sequence_len": True, "sequence_len": 2048, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -390,7 +393,7 @@ def test_fsdp_packed(self, temp_dir): "fsdp_use_orig_params": False, "fsdp_cpu_ram_efficient_loading": False, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", - "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_state_dict_type": fsdp_state_dict_type, "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, } @@ -407,13 +410,14 @@ def test_fsdp_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir def test_fsdp_qlora_prequant_packed(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( @@ -483,28 +487,29 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_ds_zero3_packed(self, temp_dir): + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 4], + ) + def test_ds_zero3_packed(self, temp_dir, gradient_accumulation_steps): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sample_packing": True, - "eval_sample_packing": False, "pad_to_sequence_len": True, "sequence_len": 2048, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -515,7 +520,7 @@ def test_ds_zero3_packed(self, temp_dir): "num_epochs": 1, "max_steps": 15, "micro_batch_size": 4, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch", @@ -536,19 +541,19 @@ def test_ds_zero3_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir def test_ds_zero3_qlora_packed(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "load_in_4bit": True, "adapter": "qlora", "lora_r": 8, @@ -561,9 +566,7 @@ def 
                 "sequence_len": 2048,
                 "val_set_size": 0.05,
                 "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
+                    "pad_token": "<|endoftext|>",
                 },
                 "datasets": [
                     {
@@ -595,6 +598,8 @@ def test_ds_zero3_qlora_packed(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
diff --git a/tests/e2e/multigpu/test_qwen2.py b/tests/e2e/multigpu/test_qwen2.py
index 393ab7d707..32bb6a3e13 100644
--- a/tests/e2e/multigpu/test_qwen2.py
+++ b/tests/e2e/multigpu/test_qwen2.py
@@ -4,31 +4,30 @@
 
 import logging
 import os
-import unittest
 from pathlib import Path
 
+import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
 
 
-class TestMultiGPUQwen2(unittest.TestCase):
+class TestMultiGPUQwen2:
     """
-    Test case for Llama models using LoRA
+    Test case for Qwen2 models using QLoRA and FSDP
     """
 
-    @with_temp_dir
-    def test_qlora_fsdp_dpo(self, temp_dir):
+    @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
+    def test_qlora_fsdp_dpo(self, base_model, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "Qwen/Qwen2-1.5B",
+                "base_model": base_model,
                 "load_in_4bit": True,
                 "rl": "dpo",
                 "chat_template": "chatml",
@@ -47,9 +46,9 @@ def test_qlora_fsdp_dpo(self, temp_dir):
                 },
             ],
             "num_epochs": 1,
-            "max_steps": 15,
+            "max_steps": 5,
             "warmup_steps": 20,
-            "micro_batch_size": 4,
+            "micro_batch_size": 2,
             "gradient_accumulation_steps": 2,
             "output_dir": temp_dir,
             "learning_rate": 0.00001,
@@ -91,6 +90,8 @@ def test_qlora_fsdp_dpo(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
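
--
Reviewer notes below; these are commentary, not part of the patch.

On the concurrency changes: cicd/multigpu.sh now runs two test processes
(pytest -n2), so two accelerate launches can start at the same time. Without
--main_process_port, both would try to bind the default torch.distributed
rendezvous port and one launch would fail. The tests therefore pass
get_torch_dist_unique_port() from transformers.testing_utils, which hands each
pytest-xdist worker its own port. A minimal sketch of the pattern; the
launch_train helper is hypothetical and not part of the patch:

    from accelerate.test_utils import execute_subprocess_async
    from transformers.testing_utils import get_torch_dist_unique_port


    def launch_train(config_path: str) -> None:
        # hypothetical helper wrapping the launch block repeated in the tests;
        # each xdist worker gets a distinct rendezvous port, so concurrent
        # multi-GPU launches do not collide
        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                config_path,
            ]
        )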
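On the temp_dir change: pytest resolves test arguments by fixture name, so the
old decorator-based with_temp_dir helper, which injected temp_dir positionally,
did not compose with @pytest.mark.parametrize; moving temp_dir into
tests/e2e/conftest.py as a fixture lets parametrized and non-parametrized tests
request it uniformly. A minimal sketch of how the new fixture combines with
parametrize, assuming the conftest.py added above is on the collection path
(the test body is illustrative only):

    import os

    import pytest


    @pytest.mark.parametrize("gradient_accumulation_steps", [1, 4])
    def test_example(temp_dir, gradient_accumulation_steps):
        # temp_dir is supplied by the shared conftest.py fixture; the
        # parametrized value arrives alongside it by name
        assert os.path.isdir(temp_dir)
        assert gradient_accumulation_steps in (1, 4)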
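On the pad_token exemption in src/axolotl/utils/models.py: the new
SmolLM-based configs only add a pad_token, and they point it at
"<|endoftext|>", a token already in the model's vocabulary. Since that adds no
new embedding rows, there is nothing for embed_tokens/lm_head to learn, and
the lora_modules_to_save warning would have been a false positive. A sketch of
the intuition (assumes network access to pull the tokenizer):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
    vocab_before = len(tok)
    # reusing an existing token as pad_token does not grow the vocabulary
    tok.add_special_tokens({"pad_token": "<|endoftext|>"})
    assert len(tok) == vocab_before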