Merge pull request #350 from instructlab/deepspeed-move
Update Dependencies to Move DeepSpeed to CUDA Extras
mergify[bot] authored Nov 15, 2024
2 parents c04f638 + fdfb4fd commit e19c744
Showing 7 changed files with 29 additions and 19 deletions.
5 changes: 4 additions & 1 deletion requirements-cuda.txt
@@ -1,5 +1,8 @@
 flash-attn>=2.4.0
 bitsandbytes>=0.43.1
 
+# available as an option for NVIDIA, FSDP still default
+deepspeed>=0.14.3
+
 # required for FSDP updates
-accelerate>=0.34.2
+accelerate>=0.34.2,<1.1.0
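With deepspeed now carried only by the CUDA extras, downstream code can no longer assume it is importable. A minimal sketch of how a consumer might probe for it at runtime, using only the standard library (the fallback message is illustrative):

import importlib.util

# deepspeed is optional after this change: it is present only when the
# package was installed with its CUDA extras.
HAS_DEEPSPEED = importlib.util.find_spec("deepspeed") is not None

if not HAS_DEEPSPEED:
    print("deepspeed not installed; FSDP remains the default backend")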
2 changes: 1 addition & 1 deletion requirements-hpu.txt
@@ -1,2 +1,2 @@
 # required for optimum-habana's deps
-accelerate>=0.33.0
+accelerate>=0.33.0,<1.1.0
2 changes: 1 addition & 1 deletion requirements-rocm.txt
@@ -1,3 +1,3 @@
 flash-attn>=2.6.2,<2.7.0
 # required for FSDP updates
-accelerate>=0.34.2
+accelerate>=0.34.2,<1.1.0
3 changes: 0 additions & 3 deletions requirements.txt
@@ -21,7 +21,4 @@ instructlab-dolomite>=0.2.0
 trl>=0.9.4
 peft
 pydantic>=2.7.0
-
-# deepspeed needs to be at the end or it'll break stuff.
-deepspeed>=0.14.3
 aiofiles>=23.2.1
2 changes: 1 addition & 1 deletion src/instructlab/training/config.py
@@ -195,7 +195,7 @@ class TrainingArgs(BaseModel):
             cpu_offload_params=False, sharding_strategy=ShardingStrategies.SHARD_GRAD_OP
         )
     )
-    distributed_backend: DistributedBackend = DistributedBackend.DEEPSPEED
+    distributed_backend: DistributedBackend = DistributedBackend.FSDP
 
     disable_flash_attn: Optional[bool] = False
 
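The default backend flips from DeepSpeed to FSDP here, so DeepSpeed becomes strictly opt-in. A minimal sketch of what selecting it explicitly might look like; the stand-in enum mirrors the DistributedBackend referenced above (its real definition lives in config.py, and the string values here are illustrative):

from enum import Enum

class DistributedBackend(Enum):
    # stand-in for the enum used by TrainingArgs above
    FSDP = "fsdp"
    DEEPSPEED = "deepspeed"

def pick_backend(use_deepspeed: bool = False) -> DistributedBackend:
    # FSDP is the default after this commit; DeepSpeed must be requested.
    return DistributedBackend.DEEPSPEED if use_deepspeed else DistributedBackend.FSDP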
9 changes: 7 additions & 2 deletions src/instructlab/training/setup_accelerator.py
@@ -96,8 +96,13 @@ def get_fsdp_config(args, model: PreTrainedModel):
 
 def setup_accelerator(args, model: PreTrainedModel, grad_accum):
     if args.distributed_training_framework == "deepspeed":
-        # Third Party
-        from deepspeed import DeepSpeedEngine
+        try:
+            # Third Party
+            from deepspeed import DeepSpeedEngine
+        except ImportError as exc:
+            raise ImportError(
+                "DeepSpeed selected as distributed framework, but not installed"
+            ) from exc
 
         # patch deepspeed to work with quantized models.
         if args.lora_quant_bits is not None:
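The same guarded-import pattern, pulled out into a self-contained sketch (the helper name is hypothetical; the error text mirrors the hunk above):

def require_deepspeed():
    """Import deepspeed lazily, failing fast with an actionable message."""
    try:
        # Third Party
        import deepspeed
    except ImportError as exc:
        raise ImportError(
            "DeepSpeed selected as distributed framework, but not installed"
        ) from exc
    return deepspeed

Raising from the original ImportError keeps the real failure (for example, a broken CUDA build) visible in the traceback while still telling the user that the dependency itself is missing.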
25 changes: 15 additions & 10 deletions src/instructlab/training/utils.py
@@ -653,16 +653,21 @@ def prepare_universal_checkpoint_from_latest(output_dir):
 
     start = time.time()
     if torch.distributed.get_rank() == 0:
-        # Third Party
-        from deepspeed.checkpoint import DeepSpeedCheckpoint
-        from deepspeed.checkpoint.ds_to_universal import (
-            PARAM_SHAPES,
-            UNIVERSAL_CHECKPOINT_INFO,
-            _check_for_required_state,
-            _extract_zero_shard_files,
-            _merge_tp_slice_files,
-            _save_optimizer_state,
-        )
+        try:
+            # Third Party
+            from deepspeed.checkpoint import DeepSpeedCheckpoint
+            from deepspeed.checkpoint.ds_to_universal import (
+                PARAM_SHAPES,
+                UNIVERSAL_CHECKPOINT_INFO,
+                _check_for_required_state,
+                _extract_zero_shard_files,
+                _merge_tp_slice_files,
+                _save_optimizer_state,
+            )
+        except ImportError as exc:
+            raise ImportError(
+                "DeepSpeed-specific checkpoints cannot be saved without DeepSpeed>=0.14.3 installed"
+            ) from exc
 
         # read the latest file to get the step folder
         latest_file = output_dir / "latest"
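For context, the surrounding function performs the checkpoint conversion on rank 0 only. A generic sketch of that pattern (it assumes an initialized torch.distributed process group; the trailing barrier is an assumption, since this hunk does not show how the original synchronizes afterwards):

import time

import torch.distributed as dist

def run_on_rank0_then_sync(fn) -> None:
    # One process performs the expensive conversion ...
    start = time.time()
    if dist.get_rank() == 0:
        fn()
    # ... and every rank waits here so none reads a half-written checkpoint.
    dist.barrier()
    if dist.get_rank() == 0:
        print(f"rank 0 finished in {time.time() - start:.1f}s")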
