From fe042dc12091701694ed549261b5c989c085839b Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 26 Sep 2023 11:24:20 -0400 Subject: [PATCH 1/2] add node rank --- llmfoundry/data/finetuning/dataloader.py | 2 +- llmfoundry/models/hf/hf_causal_lm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 5d4dfdbf85..b9ffa3de91 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -313,7 +313,7 @@ def _build_hf_dataset_from_remote( # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - signal_file_path = os.path.join(finetune_dir, '.the_eagle_has_landed') + signal_file_path = os.path.join(finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed') if dist.get_local_rank() == 0: try: get_file(path=name, destination=destination, overwrite=True) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 3100478a27..bf6f5288e4 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -164,7 +164,7 @@ def __init__(self, om_model_config: Union[DictConfig, f'init_device="{init_device}" must be either "cpu" or "meta".' ) - signal_file_path = '.local_rank0_completed_autoresume' + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_completed' if dist.get_local_rank() == 0: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download') From 6973b347edd2e9a2b5c7a945aba517da4eba3577 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Tue, 26 Sep 2023 11:38:07 -0400 Subject: [PATCH 2/2] lint --- llmfoundry/data/finetuning/dataloader.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index b9ffa3de91..ebb7991dde 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -313,7 +313,8 @@ def _build_hf_dataset_from_remote( # Since we don't know exactly what the extension will be, since it is one of a list # use a signal file to wait for instead of the desired file - signal_file_path = os.path.join(finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed') + signal_file_path = os.path.join( + finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed') if dist.get_local_rank() == 0: try: get_file(path=name, destination=destination, overwrite=True)