Don't touch DS
muellerzr committed Jul 9, 2024
1 parent 8f81104 commit 2725029
Showing 1 changed file with 39 additions and 7 deletions.
46 changes: 39 additions & 7 deletions src/transformers/modeling_utils.py
@@ -679,13 +679,45 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
 
     error_msgs = []
 
-    # Adjust and remove our `start_prefix` as we don't need it anymore
-    state_dict = {
-        key[len(start_prefix) :] if key.startswith(start_prefix) else key: value for key, value in state_dict.items()
-    }
-    # By passing in `assign=True`, we can be memory efficient by mapping the tensors directly, using only 1x
-    # the memory of the original state_dict instead of 2.
-    model_to_load.load_state_dict(state_dict, assign=True, strict=False)
+    # Note: for now this is only for DeepSpeed Zero3
+    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
+    # so we need to apply the function recursively.
+    def load(module: nn.Module, state_dict, prefix=""):
+        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+        args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
+        # Parameters of module and children will start with prefix. We can exit early if there are none in this
+        # state_dict
+        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                # In sharded models, each shard has only part of the full state_dict, so only gather
+                # parameters that are in the current state_dict.
+                named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
+                params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
+                if len(params_to_gather) > 0:
+                    # because zero3 puts placeholders in model params, this context
+                    # manager gathers (unpartitions) the params of the current layer, then loads from
+                    # the state dict and then re-partitions them again
+                    with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
+                        if torch.distributed.get_rank() == 0:
+                            module._load_from_state_dict(*args)
+
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, state_dict, prefix + name + ".")
+
+    if is_deepspeed_zero3_enabled():
+        load(model_to_load, state_dict, prefix=start_prefix)
+    else:
+        # Adjust and remove our `start_prefix` as we don't need it anymore
+        state_dict = {
+            key[len(start_prefix) :] if key.startswith(start_prefix) else key: value
+            for key, value in state_dict.items()
+        }
+        # By passing in `assign=True`, we can be memory efficient by mapping the tensors directly, using only 1x
+        # the memory of the original state_dict instead of 2.
+        model_to_load.load_state_dict(state_dict, assign=True, strict=False)
     # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so
     # it's safe to delete it.
     del state_dict
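
The restored `load` helper exists because, as the first added comment notes, PyTorch's `_load_from_state_dict` only handles a module's own direct parameters, not its descendants. Below is a minimal sketch, separate from the commit and using a hypothetical two-layer toy model, of why the recursion over `module._modules` is needed; no DeepSpeed is involved.

# Minimal sketch (not part of the commit): `_load_from_state_dict` only loads a module's
# own direct parameters, so descendants must be visited recursively, which is what the
# restored `load` helper does. The toy model and zeroed state dict are illustrative only.
import torch
from torch import nn

model = nn.Sequential(nn.Linear(2, 2), nn.Linear(2, 2))
zero_sd = {k: torch.zeros_like(v) for k, v in model.state_dict().items()}
error_msgs = []

# Called on the root alone, nothing is copied: the root Sequential has no direct
# parameters, and the children's "0.weight", "1.weight", ... keys are left untouched.
model._load_from_state_dict(zero_sd, "", {}, True, [], [], error_msgs)
print(torch.count_nonzero(model[0].weight.detach()).item())  # still non-zero

# Recursing over `_modules` the way the diff's `load` helper does reaches every child.
def load(module, state_dict, prefix=""):
    module._load_from_state_dict(state_dict, prefix, {}, True, [], [], error_msgs)
    for name, child in module._modules.items():
        if child is not None:
            load(child, state_dict, prefix + name + ".")

load(model, zero_sd)
print(torch.count_nonzero(model[0].weight.detach()).item())  # 0: the zeros were copied in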
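On the non-DeepSpeed branch the commit keeps the `assign=True` fast path described in the comment above. The following is a minimal, self-contained sketch of that behaviour, assuming PyTorch 2.1 or newer so that `nn.Module.load_state_dict` accepts `assign`, and of why it is unsuitable for ZeRO-3's partitioned placeholder parameters.

# Minimal sketch (not part of the commit), assuming PyTorch >= 2.1 so that
# `nn.Module.load_state_dict` accepts `assign=True`.
import torch
from torch import nn

model = nn.Linear(4, 4)
state_dict = {"weight": torch.randn(4, 4), "bias": torch.randn(4)}

# assign=True re-points the module's parameters at the checkpoint tensors instead of
# copying values into the existing parameters, so only one copy lives in memory.
model.load_state_dict(state_dict, assign=True, strict=False)
print(model.weight.data_ptr() == state_dict["weight"].data_ptr())  # True: shared storage

# Under DeepSpeed ZeRO-3 the module's parameters are partitioned placeholders managed by
# DeepSpeed, so replacing them wholesale would bypass the partitioning; that is why the
# commit routes ZeRO-3 through the gather-then-copy `load` helper instead.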
