Fix HF local module copy contention with a meta init on local rank 0 (#…
dakinggg authored Nov 2, 2023
1 parent ac8e023 commit 6c41241
Showing 1 changed file with 19 additions and 0 deletions.
llmfoundry/models/hf/hf_causal_lm.py (19 additions, 0 deletions)
@@ -5,6 +5,7 @@
 
 import logging
 import os
+import warnings
 from typing import Mapping, Union
 
 # required for loading a python model into composer
@@ -157,6 +158,24 @@ def __init__(self, om_model_config: Union[DictConfig,
         if dist.get_local_rank() != 0 and init_device == 'mixed':
             om_model_config.pretrained = False
 
+        # If the HuggingFace model comes from a local folder, Hugging Face copies its modules into the
+        # transformers modules cache. On some systems this copy causes contention between processes,
+        # so we first create the model (on the meta device) on local rank zero. This populates the
+        # transformers modules cache and avoids the contention on subsequent loads.
+        if dist.get_local_rank() == 0 and os.path.isdir(
+                om_model_config.pretrained_model_name_or_path):
+            with init_empty_weights(include_buffers=False):
+                with warnings.catch_warnings():
+                    warnings.simplefilter('ignore', UserWarning)
+                    AutoModelForCausalLM.from_pretrained(
+                        om_model_config.pretrained_model_name_or_path,
+                        trust_remote_code=trust_remote_code,
+                        use_auth_token=use_auth_token,
+                        config=config,
+                    )
+
+        dist.barrier()
+
         # initialize the model on the correct device
         if resolved_init_device == 'cpu':
             if om_model_config.pretrained:
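For context on the `init_empty_weights` context manager used above: it comes from Hugging Face's `accelerate` library, and any module instantiated inside it has its parameters allocated on PyTorch's meta device, so no real weight memory is touched. A minimal standalone sketch of that behavior, assuming `torch` and `accelerate` are installed:

import torch.nn as nn
from accelerate import init_empty_weights

# Parameters created inside the context live on the meta device; with
# include_buffers=False, buffers (Linear has none) would still be real tensors.
with init_empty_weights(include_buffers=False):
    layer = nn.Linear(4096, 4096)

print(layer.weight.device)  # meta: no real weight tensor was allocated

The commit runs `from_pretrained` under this context purely for its side effect of populating the transformers modules cache on local rank zero; the resulting meta-device model is simply discarded.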
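More generally, the added code follows the standard rank-gated initialization pattern: local rank zero performs a step that mutates shared local state (here, the modules cache) while all processes then synchronize at a barrier. A hedged sketch of the pattern using composer's dist utilities, where `populate_cache` is a hypothetical stand-in for the real work:

from composer.utils import dist

def populate_cache() -> None:
    # hypothetical stand-in for the meta-device from_pretrained call
    ...

# Only one process per node performs the cache-mutating step.
if dist.get_local_rank() == 0:
    populate_cache()

# Every rank must reach the barrier; calling it only on rank zero
# would deadlock, since barriers are collective operations.
dist.barrier()

Note that `dist.barrier()` sits outside the `if`, exactly as in the diff above: all ranks participate in the collective, and the nonzero ranks simply wait until rank zero has finished.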