Merge pull request #350 from instructlab/deepspeed-move
Update Dependencies to Move DeepSpeed to CUDA Extras
mergify[bot] authored Nov 15, 2024
2 parents c04f638 + fdfb4fd commit e19c744
Showing 7 changed files with 29 additions and 19 deletions.
5 changes: 4 additions & 1 deletion requirements-cuda.txt
@@ -1,5 +1,8 @@
 flash-attn>=2.4.0
 bitsandbytes>=0.43.1
 
+# available as an option for NVIDIA, FSDP still default
+deepspeed>=0.14.3
+
 # required for FSDP updates
-accelerate>=0.34.2
+accelerate>=0.34.2,<1.1.0
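With deepspeed now carried only by the CUDA extras, downstream code can no longer assume it is importable. A minimal sketch of how a consumer might probe for it at runtime, using only the standard library (the fallback message is illustrative):

import importlib.util

# deepspeed is optional after this change: it is present only when the
# package was installed with its CUDA extras.
HAS_DEEPSPEED = importlib.util.find_spec("deepspeed") is not None

if not HAS_DEEPSPEED:
    print("deepspeed not installed; FSDP remains the default backend")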
2 changes: 1 addition & 1 deletion requirements-hpu.txt
@@ -1,2 +1,2 @@
 # required for optimum-habana's deps
-accelerate>=0.33.0
+accelerate>=0.33.0,<1.1.0
2 changes: 1 addition & 1 deletion requirements-rocm.txt
@@ -1,3 +1,3 @@
 flash-attn>=2.6.2,<2.7.0
 # required for FSDP updates
-accelerate>=0.34.2
+accelerate>=0.34.2,<1.1.0
3 changes: 0 additions & 3 deletions requirements.txt
@@ -21,7 +21,4 @@ instructlab-dolomite>=0.2.0
 trl>=0.9.4
 peft
 pydantic>=2.7.0
-
-# deepspeed needs to be at the end or it'll break stuff.
-deepspeed>=0.14.3
 aiofiles>=23.2.1
2 changes: 1 addition & 1 deletion src/instructlab/training/config.py
@@ -195,7 +195,7 @@ class TrainingArgs(BaseModel):
             cpu_offload_params=False, sharding_strategy=ShardingStrategies.SHARD_GRAD_OP
         )
     )
-    distributed_backend: DistributedBackend = DistributedBackend.DEEPSPEED
+    distributed_backend: DistributedBackend = DistributedBackend.FSDP
 
     disable_flash_attn: Optional[bool] = False
 
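The default backend flips from DeepSpeed to FSDP here, so DeepSpeed becomes strictly opt-in. A minimal sketch of what selecting it explicitly might look like; the stand-in enum mirrors the DistributedBackend referenced above (its real definition lives in config.py, and the string values here are illustrative):

from enum import Enum

class DistributedBackend(Enum):
    # stand-in for the enum used by TrainingArgs above
    FSDP = "fsdp"
    DEEPSPEED = "deepspeed"

def pick_backend(use_deepspeed: bool = False) -> DistributedBackend:
    # FSDP is the default after this commit; DeepSpeed must be requested.
    return DistributedBackend.DEEPSPEED if use_deepspeed else DistributedBackend.FSDP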
9 changes: 7 additions & 2 deletions src/instructlab/training/setup_accelerator.py
@@ -96,8 +96,13 @@ def get_fsdp_config(args, model: PreTrainedModel):
 
 def setup_accelerator(args, model: PreTrainedModel, grad_accum):
     if args.distributed_training_framework == "deepspeed":
-        # Third Party
-        from deepspeed import DeepSpeedEngine
+        try:
+            # Third Party
+            from deepspeed import DeepSpeedEngine
+        except ImportError as exc:
+            raise ImportError(
+                "DeepSpeed selected as distributed framework, but not installed"
+            ) from exc
 
         # patch deepspeed to work with quantized models.
         if args.lora_quant_bits is not None:
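The same guarded-import pattern, pulled out into a self-contained sketch (the helper name is hypothetical; the error text mirrors the hunk above):

def require_deepspeed():
    """Import deepspeed lazily, failing fast with an actionable message."""
    try:
        # Third Party
        import deepspeed
    except ImportError as exc:
        raise ImportError(
            "DeepSpeed selected as distributed framework, but not installed"
        ) from exc
    return deepspeed

Raising from the original ImportError keeps the real failure (for example, a broken CUDA build) visible in the traceback while still telling the user that the dependency itself is missing.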
25 changes: 15 additions & 10 deletions src/instructlab/training/utils.py
@@ -653,16 +653,21 @@ def prepare_universal_checkpoint_from_latest(output_dir):
 
     start = time.time()
     if torch.distributed.get_rank() == 0:
-        # Third Party
-        from deepspeed.checkpoint import DeepSpeedCheckpoint
-        from deepspeed.checkpoint.ds_to_universal import (
-            PARAM_SHAPES,
-            UNIVERSAL_CHECKPOINT_INFO,
-            _check_for_required_state,
-            _extract_zero_shard_files,
-            _merge_tp_slice_files,
-            _save_optimizer_state,
-        )
+        try:
+            # Third Party
+            from deepspeed.checkpoint import DeepSpeedCheckpoint
+            from deepspeed.checkpoint.ds_to_universal import (
+                PARAM_SHAPES,
+                UNIVERSAL_CHECKPOINT_INFO,
+                _check_for_required_state,
+                _extract_zero_shard_files,
+                _merge_tp_slice_files,
+                _save_optimizer_state,
+            )
+        except ImportError as exc:
+            raise ImportError(
+                "DeepSpeed-specific checkpoints cannot be saved without DeepSpeed>=0.14.3 installed"
+            ) from exc
 
         # read the latest file to get the step folder
         latest_file = output_dir / "latest"
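For context, the surrounding function performs the checkpoint conversion on rank 0 only. A generic sketch of that pattern (it assumes an initialized torch.distributed process group; the trailing barrier is an assumption, since this hunk does not show how the original synchronizes afterwards):

import time

import torch.distributed as dist

def run_on_rank0_then_sync(fn) -> None:
    # One process performs the expensive conversion ...
    start = time.time()
    if dist.get_rank() == 0:
        fn()
    # ... and every rank waits here so none reads a half-written checkpoint.
    dist.barrier()
    if dist.get_rank() == 0:
        print(f"rank 0 finished in {time.time() - start:.1f}s")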
