diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 5d4dfdbf85..ebb7991dde 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -313,7 +313,8 @@ def _build_hf_dataset_from_remote( # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - signal_file_path = os.path.join(finetune_dir, '.the_eagle_has_landed') + signal_file_path = os.path.join( + finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed') if dist.get_local_rank() == 0: try: get_file(path=name, destination=destination, overwrite=True) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 3100478a27..bf6f5288e4 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -164,7 +164,7 @@ def __init__(self, om_model_config: Union[DictConfig, f'init_device="{init_device}" must be either "cpu" or "meta".' ) - signal_file_path = '.local_rank0_completed_autoresume' + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed' if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download')