Skip to content

Commit

Permalink
Unique subdir
Browse files Browse the repository at this point in the history
  • Loading branch information
irenedea committed Sep 18, 2023
1 parent 79f62f4 commit 477bbb1
Showing 1 changed file with 8 additions and 7 deletions.
15 changes: 8 additions & 7 deletions .github/workflows/test_elastic_resumption.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,19 @@

def test_elastic_resumption(cluster: str, save_folder: str, wandb_entity: str,
wandb_project: str, git_repo: str, git_branch: str):
commit_hash = subprocess.check_output(['git', 'rev-parse',
'HEAD']).strip().decode('utf-8')
timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
def create_run_and_wait(gpus: int, resume: bool, subdir: str):
config = RunConfig.from_file(
os.path.join(REGRESSIONS_DIR, 'mpt-125m-elastic-resumption.yaml'))

# Add the command to train our model
composer_command = '\ncomposer train/train.py /mnt/config/parameters.yaml'
if resume:
composer_command += ' autoresume=true' # TODO: autoresume and save_overwrite can't both be true, but i have to overwrite if i run multiple runs with same save folder
composer_command += ' autoresume=true'
else:
composer_command += ' save_overwrite=true autoresume=false'
composer_command += ' autoresume=false'
config.command += composer_command

# Add suffix to name
Expand All @@ -38,9 +41,7 @@ def create_run_and_wait(gpus: int, resume: bool, subdir: str):
config.compute['gpus'] = gpus
config.parameters['save_folder'] = os.path.join(save_folder, subdir)
config.parameters['max_duration'] = '20ba' if resume else '10ba'
commit_hash = subprocess.check_output(['git', 'rev-parse',
'HEAD']).strip().decode('utf-8')
timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')

wandb_group = f'{timestamp}::{commit_hash}'
wandb_config = {
'entity': wandb_entity,
Expand All @@ -61,12 +62,12 @@ def create_run_and_wait(gpus: int, resume: bool, subdir: str):
log.info(f'Completed run {run.name}')

# Test 1 node => 2 node elastic resumption
subdir = '1_to_2_node'
subdir = f'1_to_2_node_{commit_hash}_{timestamp}'
create_run_and_wait(gpus=8, resume=False, subdir=subdir)
create_run_and_wait(gpus=16, resume=True, subdir=subdir)

# Test 2 node => 1 node elastic resumption
subdir = '2_to_1_node'
subdir = f'2_to_1_node_{commit_hash}_{timestamp}'
create_run_and_wait(gpus=16, resume=False, subdir=subdir)
create_run_and_wait(gpus=8, resume=True, subdir=subdir)

Expand Down

0 comments on commit 477bbb1

Please sign in to comment.