From 2725029693449d0adaf8c64387cb163b7c21502c Mon Sep 17 00:00:00 2001
From: "[[ -z $EMAIL ]] && read -e -p \"Enter your email (for git configuration): \" EMAIL"
Date: Tue, 9 Jul 2024 14:47:07 -0400
Subject: [PATCH] Don't touch DS

---
 src/transformers/modeling_utils.py | 46 +++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index f0b17b7d5d844a..eff4bc1f3eab78 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -679,13 +679,45 @@ def _load_state_dict_into_model(model_to_load, state_dict, start_prefix):
     error_msgs = []
 
-    # Adjust and remove our `start_prefix` as we don't need it anymore
-    state_dict = {
-        key[len(start_prefix) :] if key.startswith(start_prefix) else key: value for key, value in state_dict.items()
-    }
-    # By passing in `assign=True`, we can be memory efficient by mapping the tensors directly, using only 1x
-    # the memory of the original state_dict instead of 2.
-    model_to_load.load_state_dict(state_dict, assign=True, strict=False)
+    # Note: for now this is only for DeepSpeed Zero3
+    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
+    # so we need to apply the function recursively.
+    def load(module: nn.Module, state_dict, prefix=""):
+        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+        args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
+        # Parameters of module and children will start with prefix. We can exit early if there are none in this
+        # state_dict
+        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
+            if is_deepspeed_zero3_enabled():
+                import deepspeed
+
+                # In sharded models, each shard has only part of the full state_dict, so only gather
+                # parameters that are in the current state_dict.
+                named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
+                params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
+                if len(params_to_gather) > 0:
+                    # because zero3 puts placeholders in model params, this context
+                    # manager gathers (unpartitions) the params of the current layer, then loads from
+                    # the state dict and then re-partitions them again
+                    with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
+                        if torch.distributed.get_rank() == 0:
+                            module._load_from_state_dict(*args)
+
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, state_dict, prefix + name + ".")
+
+    if is_deepspeed_zero3_enabled():
+        load(model_to_load, state_dict, prefix=start_prefix)
+    else:
+        # Adjust and remove our `start_prefix` as we don't need it anymore
+        state_dict = {
+            key[len(start_prefix) :] if key.startswith(start_prefix) else key: value
+            for key, value in state_dict.items()
+        }
+        # By passing in `assign=True`, we can be memory efficient by mapping the tensors directly, using only 1x
+        # the memory of the original state_dict instead of 2.
+        model_to_load.load_state_dict(state_dict, assign=True, strict=False)
 
     # Delete `state_dict` so it could be collected by GC earlier. Note that `state_dict` is a copy of the argument, so
     # it's safe to delete it.
     del state_dict
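
For reference, the snippet below is a minimal standalone sketch (not part of the patch) of the ZeRO-3 gather-then-load pattern that the hunk above restores. The helper name `load_zero3_state_dict` is illustrative; it assumes `deepspeed` is installed, `torch.distributed` is initialized, and the model's parameters are ZeRO-3 partitioned placeholders (e.g. built under `deepspeed.zero.Init`).

import torch
import torch.nn as nn
import deepspeed


def load_zero3_state_dict(module: nn.Module, state_dict: dict, prefix: str = ""):
    # Gather only this module's own parameters (not its children's) that the
    # (possibly sharded) state_dict actually contains.
    named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
    params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters]
    if params_to_gather:
        # GatheredParameters temporarily un-partitions the ZeRO-3 placeholders; with
        # modifier_rank=0, the values written by rank 0 are re-partitioned to all ranks on exit.
        with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
            if torch.distributed.get_rank() == 0:
                module._load_from_state_dict(state_dict, prefix, {}, True, [], [], [])
    # `_load_from_state_dict` only handles this module's own parameters/buffers,
    # so recurse into children exactly as the patch's inner `load()` does.
    for name, child in module._modules.items():
        if child is not None:
            load_zero3_state_dict(child, state_dict, prefix + name + ".")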