From bf6e6fd05b39d52b4f2a3bbb7d98068d7b60dfff Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Thu, 14 Nov 2024 12:59:00 -0500
Subject: [PATCH] gradient accumulation tests, embeddings w pad_token fix,
 smaller models (#2059)

* add more test cases for gradient accumulation and fix zero3
* swap out for smaller model
* fix missing return
* fix missing pad_token in config
* support concurrency for multigpu testing
* cast empty deepspeed to empty string for zero3 check
* fix temp_dir as fixture so parametrize works properly
* fix test file for multigpu evals
* don't use default
* don't use default for fsdp_state_dict_type
* don't use llama tokenizer w smollm
* also automatically cancel multigpu for concurrency
---
 .github/workflows/multi-gpu-e2e.yml           |   5 +
 cicd/multigpu.sh                              |   4 ++--
 .../config/models/input/v0_4_1/__init__.py   |  19 +++
 src/axolotl/utils/models.py                   |   1 +
 tests/e2e/conftest.py                         |  16 +++
 tests/e2e/multigpu/test_eval.py               |  12 +-
 tests/e2e/multigpu/test_llama.py              | 115 +++++++++---------
 tests/e2e/multigpu/test_qwen2.py              |  21 ++--
 8 files changed, 120 insertions(+), 73 deletions(-)
 create mode 100644 tests/e2e/conftest.py

diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index c2fb5dfb5d..2fecde5a97 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -8,6 +8,11 @@ on:
   schedule:
     - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
 
+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
   test-axolotl-multigpu:
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
diff --git a/cicd/multigpu.sh b/cicd/multigpu.sh
index ff7f9f19a5..05d1bbbf2a 100755
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,4 +2,4 @@
 set -e
 
-# only run one test at a time so as not to OOM the GPU
-pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
+# run at most two tests at a time so as not to OOM the GPUs
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index 1feb8aae86..10e80d9f33 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -1291,6 +1291,25 @@ def check_use_reentrant_mismatch(cls, data):
             )
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def warn_qlora_zero3_w_use_reentrant(cls, data):
+        if (
+            data.get("adapter") == "qlora"
+            and data.get("gradient_checkpointing_kwargs", {})
+            and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant")
+            is False
+            and "zero3" in data.get("deepspeed", "")
+        ):
+            # may result in:
+            # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint:
+            # Recomputed values for the following tensors have different metadata
+            # than during the forward pass.
+            LOG.warning(
+                "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
+            )
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def check_val_w_test_datasets(cls, data):
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 41e89dbfb2..db66c65f25 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -238,6 +238,7 @@ def load_tokenizer(cfg):
                     x in cfg.lora_modules_to_save for x in lora_modules_to_save
                 )
             )
+            and k != "pad_token"
         ):
             lora_modules_to_save = ", ".join(
                 [f"`{x}`" for x in lora_modules_to_save]
diff --git a/tests/e2e/conftest.py b/tests/e2e/conftest.py
new file mode 100644
index 0000000000..723a44f03a
--- /dev/null
+++ b/tests/e2e/conftest.py
@@ -0,0 +1,16 @@
+"""
+shared pytest fixtures
+"""
+import shutil
+import tempfile
+
+import pytest
+
+
+@pytest.fixture
+def temp_dir():
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)
diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py
index 65d26bb824..068a9220ca 100644
--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -3,28 +3,25 @@
 """
 import logging
 import os
-import unittest
 from pathlib import Path
 
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
 
 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
 
 
-class TestMultiGPUEval(unittest.TestCase):
+class TestMultiGPUEval:
     """
     Test case for MultiGPU Eval Sample Packing
     """
 
-    @with_temp_dir
     def test_eval_sample_packing(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -83,13 +80,14 @@ def test_eval_sample_packing(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
             ]
         )
 
-    @with_temp_dir
     def test_eval(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -148,6 +146,8 @@ def test_eval(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 8087e08e3e..b2c8abc604 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -4,17 +4,17 @@
 
 import logging
 import os
-import unittest
 from pathlib import Path
 
 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from huggingface_hub import snapshot_download
+from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import is_hopper, with_temp_dir
+from ..utils import is_hopper
 
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
@@ -28,18 +28,16 @@ def download_model():
     snapshot_download("TinyLlama/TinyLlama_v1.1")
 
 
-class TestMultiGPULlama(unittest.TestCase):
+class TestMultiGPULlama:
     """
     Test case for Llama models using LoRA
     """
 
-    @with_temp_dir
     def test_lora_ddp(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "TinyLlama/TinyLlama_v1.1",
-                "tokenizer_type": "LlamaTokenizer",
"LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sequence_len": 2048, "adapter": "lora", "lora_r": 8, @@ -48,9 +46,7 @@ def test_lora_ddp(self, temp_dir): "lora_target_linear": True, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -81,19 +77,23 @@ def test_lora_ddp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_lora_ddp_packed(self, temp_dir): + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 4], + ) + def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sequence_len": 2048, "sample_packing": True, "eval_sample_packing": False, @@ -105,9 +105,7 @@ def test_lora_ddp_packed(self, temp_dir): "lora_target_linear": True, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -118,7 +116,7 @@ def test_lora_ddp_packed(self, temp_dir): "num_epochs": 1, "max_steps": 15, "micro_batch_size": 4, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_8bit", @@ -138,6 +136,8 @@ def test_lora_ddp_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), @@ -145,7 +145,6 @@ def test_lora_ddp_packed(self, temp_dir): ) @pytest.mark.skipif(is_hopper(), reason="h100 doesn't support 8-bit lora") - @with_temp_dir def test_dpo_lora_ddp(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( @@ -210,13 +209,14 @@ def test_dpo_lora_ddp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir def test_dpo_qlora_ddp(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( @@ -278,25 +278,27 @@ def test_dpo_qlora_ddp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_fsdp(self, temp_dir): + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 4], + ) + def test_fsdp(self, temp_dir, gradient_accumulation_steps): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sequence_len": 2048, - "val_set_size": 0.05, + "val_set_size": 0.01, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -305,9 +307,9 @@ def test_fsdp(self, temp_dir): }, ], "num_epochs": 1, - "max_steps": 15, + "max_steps": 10, "micro_batch_size": 4, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch", @@ -324,7 +326,7 @@ def test_fsdp(self, temp_dir): "fsdp_use_orig_params": False, 
"fsdp_cpu_ram_efficient_loading": False, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", - "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_state_dict_type": "FULL_STATE_DICT", "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, } @@ -341,28 +343,29 @@ def test_fsdp(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_fsdp_packed(self, temp_dir): + @pytest.mark.parametrize( + "fsdp_state_dict_type", + ["FULL_STATE_DICT", "SHARDED_STATE_DICT"], + ) + def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sample_packing": True, - "eval_sample_packing": False, "pad_to_sequence_len": True, "sequence_len": 2048, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -390,7 +393,7 @@ def test_fsdp_packed(self, temp_dir): "fsdp_use_orig_params": False, "fsdp_cpu_ram_efficient_loading": False, "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer", - "fsdp_state_dict_type": "SHARDED_STATE_DICT", + "fsdp_state_dict_type": fsdp_state_dict_type, "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", }, } @@ -407,13 +410,14 @@ def test_fsdp_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir def test_fsdp_qlora_prequant_packed(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( @@ -483,28 +487,29 @@ def test_fsdp_qlora_prequant_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir - def test_ds_zero3_packed(self, temp_dir): + @pytest.mark.parametrize( + "gradient_accumulation_steps", + [1, 4], + ) + def test_ds_zero3_packed(self, temp_dir, gradient_accumulation_steps): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "sample_packing": True, - "eval_sample_packing": False, "pad_to_sequence_len": True, "sequence_len": 2048, "val_set_size": 0.05, "special_tokens": { - "unk_token": "", - "bos_token": "", - "eos_token": "", + "pad_token": "<|endoftext|>", }, "datasets": [ { @@ -515,7 +520,7 @@ def test_ds_zero3_packed(self, temp_dir): "num_epochs": 1, "max_steps": 15, "micro_batch_size": 4, - "gradient_accumulation_steps": 4, + "gradient_accumulation_steps": gradient_accumulation_steps, "output_dir": temp_dir, "learning_rate": 0.00001, "optimizer": "adamw_torch", @@ -536,19 +541,19 @@ def test_ds_zero3_packed(self, temp_dir): "launch", "--num-processes", "2", + "--main_process_port", + f"{get_torch_dist_unique_port()}", "-m", "axolotl.cli.train", str(Path(temp_dir) / "config.yaml"), ] ) - @with_temp_dir def test_ds_zero3_qlora_packed(self, temp_dir): # pylint: disable=duplicate-code cfg = DictDefault( { - "base_model": "TinyLlama/TinyLlama_v1.1", - "tokenizer_type": "LlamaTokenizer", + "base_model": "HuggingFaceTB/SmolLM-135M", "load_in_4bit": True, "adapter": "qlora", "lora_r": 8, @@ -561,9 +566,7 @@ def 
                 "sequence_len": 2048,
                 "val_set_size": 0.05,
                 "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
+                    "pad_token": "<|endoftext|>",
                 },
                 "datasets": [
                     {
@@ -595,6 +598,8 @@ def test_ds_zero3_qlora_packed(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
diff --git a/tests/e2e/multigpu/test_qwen2.py b/tests/e2e/multigpu/test_qwen2.py
index 393ab7d707..32bb6a3e13 100644
--- a/tests/e2e/multigpu/test_qwen2.py
+++ b/tests/e2e/multigpu/test_qwen2.py
@@ -4,31 +4,30 @@
 
 import logging
 import os
-import unittest
 from pathlib import Path
 
+import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
 
 
-class TestMultiGPUQwen2(unittest.TestCase):
+class TestMultiGPUQwen2:
     """
-    Test case for Llama models using LoRA
+    Test case for Qwen2 models using QLoRA and FSDP
     """
 
-    @with_temp_dir
-    def test_qlora_fsdp_dpo(self, temp_dir):
+    @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
+    def test_qlora_fsdp_dpo(self, base_model, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "Qwen/Qwen2-1.5B",
+                "base_model": base_model,
                 "load_in_4bit": True,
                 "rl": "dpo",
                 "chat_template": "chatml",
@@ -47,9 +46,9 @@ def test_qlora_fsdp_dpo(self, temp_dir):
                 },
             ],
             "num_epochs": 1,
-            "max_steps": 15,
+            "max_steps": 5,
             "warmup_steps": 20,
-            "micro_batch_size": 4,
+            "micro_batch_size": 2,
             "gradient_accumulation_steps": 2,
             "output_dir": temp_dir,
             "learning_rate": 0.00001,
@@ -91,6 +90,8 @@ def test_qlora_fsdp_dpo(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
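
--
Reviewer notes below; these are commentary, not part of the patch.

On the concurrency changes: cicd/multigpu.sh now runs two test processes
(pytest -n2), so two accelerate launches can start at the same time. Without
--main_process_port, both would try to bind the default torch.distributed
rendezvous port and one launch would fail. The tests therefore pass
get_torch_dist_unique_port() from transformers.testing_utils, which hands each
pytest-xdist worker its own port. A minimal sketch of the pattern; the
launch_train helper is hypothetical and not part of the patch:

    from accelerate.test_utils import execute_subprocess_async
    from transformers.testing_utils import get_torch_dist_unique_port


    def launch_train(config_path: str) -> None:
        # hypothetical helper wrapping the launch block repeated in the tests;
        # each xdist worker gets a distinct rendezvous port, so concurrent
        # multi-GPU launches do not collide
        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                config_path,
            ]
        )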
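On the temp_dir change: pytest resolves test arguments by fixture name, so the
old decorator-based with_temp_dir helper, which injected temp_dir positionally,
did not compose with @pytest.mark.parametrize; moving temp_dir into
tests/e2e/conftest.py as a fixture lets parametrized and non-parametrized tests
request it uniformly. A minimal sketch of how the new fixture combines with
parametrize, assuming the conftest.py added above is on the collection path
(the test body is illustrative only):

    import os

    import pytest


    @pytest.mark.parametrize("gradient_accumulation_steps", [1, 4])
    def test_example(temp_dir, gradient_accumulation_steps):
        # temp_dir is supplied by the shared conftest.py fixture; the
        # parametrized value arrives alongside it by name
        assert os.path.isdir(temp_dir)
        assert gradient_accumulation_steps in (1, 4)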
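On the pad_token exemption in src/axolotl/utils/models.py: the new
SmolLM-based configs only add a pad_token, and they point it at
"<|endoftext|>", a token already in the model's vocabulary. Since that adds no
new embedding rows, there is nothing for embed_tokens/lm_head to learn, and
the lora_modules_to_save warning would have been a false positive. A sketch of
the intuition (assumes network access to pull the tokenizer):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-135M")
    vocab_before = len(tok)
    # reusing an existing token as pad_token does not grow the vocabulary
    tok.add_special_tokens({"pad_token": "<|endoftext|>"})
    assert len(tok) == vocab_before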