From 477bbb107f1159b7ff322c7d0f22a7ee769b4d1c Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Mon, 18 Sep 2023 09:05:44 -0700 Subject: [PATCH] Unique subdir --- .github/workflows/test_elastic_resumption.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test_elastic_resumption.py b/.github/workflows/test_elastic_resumption.py index 7fe2b6846f..796cdad99c 100644 --- a/.github/workflows/test_elastic_resumption.py +++ b/.github/workflows/test_elastic_resumption.py @@ -15,6 +15,9 @@ def test_elastic_resumption(cluster: str, save_folder: str, wandb_entity: str, wandb_project: str, git_repo: str, git_branch: str): + commit_hash = subprocess.check_output(['git', 'rev-parse', + 'HEAD']).strip().decode('utf-8') + timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S') def create_run_and_wait(gpus: int, resume: bool, subdir: str): config = RunConfig.from_file( os.path.join(REGRESSIONS_DIR, 'mpt-125m-elastic-resumption.yaml')) @@ -22,9 +25,9 @@ def create_run_and_wait(gpus: int, resume: bool, subdir: str): # Add the command to train our model composer_command = '\ncomposer train/train.py /mnt/config/parameters.yaml' if resume: - composer_command += ' autoresume=true' # TODO: autoresume and save_overwrite can't both be true, but i have to overwrite if i run multiple runs with same save folder + composer_command += ' autoresume=true' else: - composer_command += ' save_overwrite=true autoresume=false' + composer_command += ' autoresume=false' config.command += composer_command # Add suffix to name @@ -38,9 +41,7 @@ def create_run_and_wait(gpus: int, resume: bool, subdir: str): config.compute['gpus'] = gpus config.parameters['save_folder'] = os.path.join(save_folder, subdir) config.parameters['max_duration'] = '20ba' if resume else '10ba' - commit_hash = subprocess.check_output(['git', 'rev-parse', - 'HEAD']).strip().decode('utf-8') - timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S') + wandb_group = 
f'{timestamp}::{commit_hash}' wandb_config = { 'entity': wandb_entity, @@ -61,12 +62,12 @@ def create_run_and_wait(gpus: int, resume: bool, subdir: str): log.info(f'Completed run {run.name}') # Test 1 node => 2 node elastic resumption - subdir = '1_to_2_node' + subdir = f'1_to_2_node_{commit_hash}_{timestamp}' create_run_and_wait(gpus=8, resume=False, subdir=subdir) create_run_and_wait(gpus=16, resume=True, subdir=subdir) # Test 2 node => 1 node elastic resumption - subdir = '2_to_1_node' + subdir = f'2_to_1_node_{commit_hash}_{timestamp}' create_run_and_wait(gpus=16, resume=False, subdir=subdir) create_run_and_wait(gpus=8, resume=True, subdir=subdir)