Fix for optimizers that don't have "step" in their optimizer states
tohtana committed Mar 19, 2024
1 parent 24484ad commit 13effa1
Showing 1 changed file with 5 additions and 1 deletion.
6 changes: 5 additions & 1 deletion deepspeed/checkpoint/ds_to_universal.py
@@ -153,6 +153,9 @@ def _merge_zero_shards(param_base_path, state, tp_degree, slice_shape):
     for tp_index in range(tp_degree):
         prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}")
         paths = sorted(list(glob.glob(f"{prefix_path}.*")))
+        if len(paths) == 0:
+            continue
+
         shards = [torch.load(p) for p in paths]

         if state == "step":
@@ -190,7 +193,8 @@ def get_matched_pattern(patterns_, name_):
         return None

     step_merged = _merge_zero_shards(slice_base_path, "step", tp_degree, shape)
-    _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0])
+    if step_merged:
+        _save_checkpoint(os.path.join(param_base_path, f"step.pt"), step_merged[0])

     for state in ("fp32", "exp_avg", "exp_avg_sq"):
         slices = _merge_zero_shards(slice_base_path, state, tp_degree, shape)
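What the fix addresses: _merge_zero_shards globs for the per-rank shard files of each optimizer state, but an optimizer that keeps no per-parameter "step" entry (the case named in the commit title) writes no step.* files, so the glob matches nothing. Before this commit the caller unconditionally indexed step_merged[0], which raises IndexError on the resulting empty list; the patch skips empty matches inside the merge loop and writes step.pt only when something was actually merged. Below is a minimal, self-contained sketch of that guard pattern; merge_state_shards and the /tmp/ckpt paths are hypothetical stand-ins for illustration, not the actual DeepSpeed helpers.

import glob
import os

import torch


def merge_state_shards(param_base_path, state, tp_degree):
    """Hypothetical stand-in for _merge_zero_shards: gather one optimizer
    state across tensor-parallel ranks, tolerating states that were never
    written to disk (e.g. no per-parameter "step")."""
    merged = []
    for tp_index in range(tp_degree):
        prefix_path = os.path.join(param_base_path, str(tp_index), f"{state}")
        paths = sorted(glob.glob(f"{prefix_path}.*"))
        if len(paths) == 0:
            continue  # this rank has no shards for this state; skip it
        merged.extend(torch.load(p) for p in paths)
    return merged


# Mirrors the second hunk: save "step" only when shards actually existed,
# instead of indexing merged[0] on an empty list and hitting IndexError.
step_merged = merge_state_shards("/tmp/ckpt/param.weight", "step", tp_degree=2)
if step_merged:
    torch.save(step_merged[0], os.path.join("/tmp/ckpt/param.weight", "step.pt"))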
