
Commit bf6e6fd

gradient accumulation tests, embeddings w pad_token fix, smaller models (#2059)

* add more test cases for gradient accumulation and fix zero3

* swap out for smaller model

* fix missing return

* fix missing pad_token in config

* support concurrency for multigpu testing

* cast empty deepspeed to empty string for zero3 check

* fix temp_dir as fixture so parametrize works properly

* fix test file for multigpu evals

* don't use default

* don't use default for fsdp_state_dict_type

* don't use llama tokenizer w smollm

* also automatically cancel multigpu for concurrency
winglian authored and bursteratom committed Nov 18, 2024
1 parent a865d3d commit bf6e6fd
Showing 8 changed files with 118 additions and 71 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/multi-gpu-e2e.yml
@@ -8,6 +8,11 @@ on:
   schedule:
     - cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday
 
+# Cancel jobs on the same ref if a new one is triggered
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
 jobs:
   test-axolotl-multigpu:
     if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'axolotl-ai-cloud' }}
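(Gloss, not part of the diff: the concurrency group keys runs by workflow name plus git ref, and cancel-in-progress evaluates to true for every ref except refs/heads/main, so a new push to a PR branch cancels that branch's in-flight run while runs on main are left to finish.)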
2 changes: 1 addition & 1 deletion cicd/multigpu.sh
@@ -2,4 +2,4 @@
 set -e
 
 # only run one test at a time so as not to OOM the GPU
-pytest -n1 /workspace/axolotl/tests/e2e/multigpu/
+pytest -v -n2 /workspace/axolotl/tests/e2e/multigpu/
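(Gloss, not part of the diff: -n2 runs two pytest-xdist workers in parallel and -v adds verbose output; the per-worker --main_process_port change in test_eval.py below keeps the concurrent accelerate launches from colliding.)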
19 changes: 19 additions & 0 deletions src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -1291,6 +1291,25 @@ def check_use_reentrant_mismatch(cls, data):
             )
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def warn_qlora_zero3_w_use_reentrant(cls, data):
+        if (
+            data.get("adapter") == "qlora"
+            and data.get("gradient_checkpointing_kwargs", {})
+            and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant")
+            is False
+            and "zero3" in data.get("deepspeed", "")
+        ):
+            # may result in:
+            # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint:
+            # Recomputed values for the following tensors have different metadata
+            # than during the forward pass.
+            LOG.warning(
+                "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
+            )
+        return data
+
     @model_validator(mode="before")
     @classmethod
     def check_val_w_test_datasets(cls, data):
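To make the trigger condition concrete, here is a minimal standalone sketch (not part of the commit; the deepspeed path and config values are illustrative) that restates the validator's check against a plain dict:

# Minimal sketch, not from the commit: restating the validator's condition.
data = {
    "adapter": "qlora",
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "deepspeed": "deepspeed_configs/zero3_bf16.json",  # hypothetical path
}

triggers_warning = (
    data.get("adapter") == "qlora"
    and data.get("gradient_checkpointing_kwargs", {})
    and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant") is False
    and "zero3" in data.get("deepspeed", "")
)
print(triggers_warning)  # True: qlora + zero3 + use_reentrant=False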
1 change: 1 addition & 0 deletions src/axolotl/utils/models.py
@@ -238,6 +238,7 @@ def load_tokenizer(cfg):
                     x in cfg.lora_modules_to_save for x in lora_modules_to_save
                 )
             )
+            and k != "pad_token"
         ):
             lora_modules_to_save = ", ".join(
                 [f"`{x}`" for x in lora_modules_to_save]
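The practical effect (my reading, not stated in the commit): setting only a pad_token among the special tokens no longer forces embedding modules into lora_modules_to_save, presumably because a pad token is commonly remapped to an existing token rather than newly learned. A hypothetical config illustrating the case that previously tripped the check:

# Hypothetical illustration, not from the commit: with `and k != "pad_token"`,
# a config that only sets a pad_token no longer requires embedding modules
# in lora_modules_to_save.
cfg = {
    "adapter": "qlora",
    "special_tokens": {"pad_token": "<|endoftext|>"},  # hypothetical choice
    # previously, omitting these could trip the check for k == "pad_token":
    # "lora_modules_to_save": ["embed_tokens", "lm_head"],
}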
16 changes: 16 additions & 0 deletions tests/e2e/conftest.py
@@ -0,0 +1,16 @@
+"""
+shared pytest fixtures
+"""
+import shutil
+import tempfile
+
+import pytest
+
+
+@pytest.fixture
+def temp_dir():
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)
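Per the commit message, the decorator-based with_temp_dir was swapped for this fixture so @pytest.mark.parametrize composes cleanly. A hypothetical usage sketch (test name and parameter values are invented):

import pytest

@pytest.mark.parametrize(
    "fsdp_state_dict_type", ["FULL_STATE_DICT", "SHARDED_STATE_DICT"]
)
def test_example(temp_dir, fsdp_state_dict_type):
    # pytest resolves `temp_dir` by argument name, giving each parametrized
    # case a fresh directory, whereas a decorator that appends a positional
    # argument can clash with parametrize's own argument handling.
    assert temp_dir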
12 changes: 6 additions & 6 deletions tests/e2e/multigpu/test_eval.py
@@ -3,28 +3,25 @@
 """
 import logging
 import os
-import unittest
 from pathlib import Path
 
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from ..utils import with_temp_dir
-
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"
 
 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
 
 
-class TestMultiGPUEval(unittest.TestCase):
+class TestMultiGPUEval:
     """
     Test case for MultiGPU Eval Sample Packing
     """
 
-    @with_temp_dir
     def test_eval_sample_packing(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -83,13 +80,14 @@ def test_eval_sample_packing(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
             ]
         )
 
-    @with_temp_dir
     def test_eval(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
@@ -148,6 +146,8 @@ def test_eval(self, temp_dir):
                 "launch",
                 "--num-processes",
                 "2",
+                "--main_process_port",
+                f"{get_torch_dist_unique_port()}",
                 "-m",
                 "axolotl.cli.train",
                 str(Path(temp_dir) / "config.yaml"),
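The --main_process_port addition pairs with the pytest -n2 change in cicd/multigpu.sh: with two xdist workers launching accelerate concurrently, both would otherwise bind the default rendezvous port. A small sketch of the helper's role (my understanding, as an assumption: it derives a per-worker port from the pytest-xdist worker id):

from transformers.testing_utils import get_torch_dist_unique_port

# Returns a port that is unique per pytest-xdist worker, so concurrent
# `accelerate launch` invocations don't collide on the default master port.
port = get_torch_dist_unique_port()
print(port)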
(Diffs for the remaining 2 of the 8 changed files are not shown.)