diff --git a/.github/workflows/regression_yamls/mpt-125m-elastic-resumption.yaml b/.github/workflows/regression_yamls/mpt-125m-elastic-resumption.yaml
new file mode 100644
index 0000000000..bb6bb85123
--- /dev/null
+++ b/.github/workflows/regression_yamls/mpt-125m-elastic-resumption.yaml
@@ -0,0 +1,119 @@
+integrations:
+- integration_type: git_repo
+  git_repo: mosaicml/llm-foundry
+  git_branch: main
+  pip_install: -e .[gpu]
+
+command: |
+  cd llm-foundry/scripts
+  python data_prep/convert_dataset_hf.py \
+    --dataset c4 --data_subset en \
+    --out_root ./my-copy-c4 --splits train_small val_small \
+    --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
+  # Composer command will be inserted below:
+
+image: mosaicml/llm-foundry:2.0.1_cu118-latest
+name: mpt-125m-elastic-resumption
+
+compute:
+  gpus:  # Set per run by regressions.py.
+
+parameters:
+  data_local: ./my-copy-c4
+  data_remote:  # If blank, files must be present in data_local.
+  max_seq_len: 2048
+  global_seed: 17
+
+  model:
+    name: mpt_causal_lm
+    init_device: meta
+    d_model: 768
+    n_heads: 12
+    n_layers: 12
+    expansion_ratio: 4
+    max_seq_len: ${max_seq_len}
+    vocab_size: 50368
+    attn_config:
+      attn_impl: triton
+
+  tokenizer:
+    name: EleutherAI/gpt-neox-20b
+    kwargs:
+      model_max_length: ${max_seq_len}
+
+  train_loader:
+    name: text
+    dataset:
+      local: ${data_local}
+      remote: ${data_remote}
+      split: train_small
+      shuffle: true
+      max_seq_len: ${max_seq_len}
+      shuffle_seed: ${global_seed}
+    drop_last: true
+    num_workers: 8
+
+  eval_loader:
+    name: text
+    dataset:
+      local: ${data_local}
+      remote: ${data_remote}
+      split: val_small
+      shuffle: false
+      max_seq_len: ${max_seq_len}
+      shuffle_seed: ${global_seed}
+    drop_last: false
+    num_workers: 8
+
+  scheduler:
+    name: cosine_with_warmup
+    t_warmup: 100ba
+    alpha_f: 0.1
+
+  optimizer:
+    name: decoupled_adamw
+    lr: 6.0e-4
+    betas:
+    - 0.9
+    - 0.95
+    eps: 1.0e-08
+    weight_decay: 0.0
+
+  algorithms:
+    gradient_clipping:
+      clipping_type: norm
+      clipping_threshold: 1.0
+
+  max_duration:  # Set per run by regressions.py: 10ba fresh, 20ba on resume.
+  eval_interval: 500ba
+  eval_first: false
+  eval_subset_num_batches: -1
+  global_train_batch_size: 256
+
+  seed: ${global_seed}
+  device_eval_batch_size: 16
+  device_train_microbatch_size: auto
+  precision: amp_bf16
+
+  fsdp_config:
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: false
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true
+    verbose: false
+    state_dict_type: sharded
+
+  progress_bar: false
+  log_to_console: true
+  console_log_interval: 1ba
+
+  callbacks:
+    speed_monitor:
+      window_size: 10
+    lr_monitor: {}
+    memory_monitor: {}
+    runtime_estimator: {}
+
+  save_interval: 10ba
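
Note: gpus and max_duration are intentionally left blank above, and save_folder
is omitted entirely; regressions.py (below) injects all three per run. Likewise
the yaml command ends after dataset preparation, and the script appends the
training invocation, which for the resumed runs comes out as:

    composer train/train.py /mnt/config/parameters.yaml autoresume=true

Because state_dict_type: sharded and save_interval: 10ba are set, the first
10-batch run leaves a sharded checkpoint that the resumed run reloads under a
different GPU count.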
diff --git a/.github/workflows/regressions.py b/.github/workflows/regressions.py
index 9211df1908..a582d41e97 100644
--- a/.github/workflows/regressions.py
+++ b/.github/workflows/regressions.py
@@ -6,25 +6,99 @@
 import os
 import subprocess
 
+from mcli import RunConfig, RunStatus, create_run, wait_for_run_status
+
 DIR_PATH = os.path.dirname(os.path.abspath(__file__))
 REGRESSIONS_DIR = os.path.join(DIR_PATH, 'regression_yamls')
 
-from mcli import RunConfig, create_run
+COMMIT_HASH = subprocess.check_output(['git', 'rev-parse',
+                                       'HEAD']).strip().decode('utf-8')
+TIMESTAMP = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
+
+
+def _get_regression_config(yaml_name: str) -> RunConfig:
+    """Get the yaml config from the regressions directory."""
+    return RunConfig.from_file(os.path.join(REGRESSIONS_DIR, yaml_name))
+
+
+def _set_general_configs(config: RunConfig, cluster: str, wandb_entity: str,
+                         wandb_project: str, git_repo: str, git_branch: str):
+    """Set the configuration arguments shared by all regression runs."""
+    config.cluster = cluster
+    wandb_group = f'{TIMESTAMP}::{COMMIT_HASH}'
+    wandb_config = {
+        'entity': wandb_entity,
+        'project': wandb_project,
+        'group': wandb_group
+    }
+    config.parameters['loggers'] = config.parameters.get('loggers', {})
+    config.parameters['loggers']['wandb'] = wandb_config
+    config.integrations[0]['git_repo'] = git_repo
+    config.integrations[0]['git_branch'] = git_branch
+
+
+def test_elastic_resumption(cluster: str, save_folder: str, wandb_entity: str,
+                            wandb_project: str, git_repo: str, git_branch: str):
+    """Regression test for elastic resumption."""
+
+    def create_run_and_wait(gpus: int, resume: bool, subdir: str):
+        config = _get_regression_config('mpt-125m-elastic-resumption.yaml')
+
+        # Add the command to train our model
+        composer_command = '\ncomposer train/train.py /mnt/config/parameters.yaml'
+        if resume:
+            composer_command += ' autoresume=true'
+        else:
+            composer_command += ' autoresume=false'
+        config.command += composer_command
+        # Add suffix to name
+        name_suffix = f'-{gpus}'
+        if resume:
+            name_suffix += '-resume'
+        config.name += name_suffix
 
 
-def get_configs(cluster: str, mpt_7b_ckpt_path: str, wandb_entity: str,
-                wandb_project: str, git_repo: str, git_branch: str):
-    print(f'Running regression tests on {git_repo} {git_branch}.')
-    eval_7b_hf = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'eval-7b-hf.yaml'))
-    eval_7b_composer = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'eval-7b-composer.yaml'))
-    llama2_finetune = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'llama2-finetune.yaml'))
-    mpt_125m_chinchilla = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'mpt-125m-chinchilla.yaml'))
-    mpt_125m_sharded_resumption = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'mpt-125m-sharded-resumption.yaml'))
+        # Set other parameters
+        config.compute['gpus'] = gpus
+        config.parameters['save_folder'] = os.path.join(save_folder, subdir)
+        config.parameters['max_duration'] = '20ba' if resume else '10ba'
+
+        _set_general_configs(config,
+                             cluster=cluster,
+                             wandb_entity=wandb_entity,
+                             wandb_project=wandb_project,
+                             git_repo=git_repo,
+                             git_branch=git_branch)
+
+        # Start run
+        run = create_run(config)
+        wait_for_run_status(
+            run,
+            RunStatus.COMPLETED)  # Wait for the run to complete or terminate.
+        if run.status != RunStatus.COMPLETED:
+            raise Exception(
+                f'Failure on run {run.name}. Run status is {run.status}. ' +
+                'Terminating elastic resumption regression test.')
+
+    # Test 1 node => 2 node elastic resumption
+    subdir = f'1_to_2_node_{TIMESTAMP}_{COMMIT_HASH}'
+    create_run_and_wait(gpus=8, resume=False, subdir=subdir)
+    create_run_and_wait(gpus=16, resume=True, subdir=subdir)
+
+    # Test 2 node => 1 node elastic resumption
+    subdir = f'2_to_1_node_{TIMESTAMP}_{COMMIT_HASH}'
+    create_run_and_wait(gpus=16, resume=False, subdir=subdir)
+    create_run_and_wait(gpus=8, resume=True, subdir=subdir)
+
+
+def test_basic(cluster: str, mpt_7b_ckpt_path: str, wandb_entity: str,
+               wandb_project: str, git_repo: str, git_branch: str):
+    eval_7b_hf = _get_regression_config('eval-7b-hf.yaml')
+    eval_7b_composer = _get_regression_config('eval-7b-composer.yaml')
+    llama2_finetune = _get_regression_config('llama2-finetune.yaml')
+    mpt_125m_chinchilla = _get_regression_config('mpt-125m-chinchilla.yaml')
+    mpt_125m_sharded_resumption = _get_regression_config(
+        'mpt-125m-sharded-resumption.yaml')
 
     # make specific changes
     eval_7b_composer.parameters['models'][0]['load_path'] = mpt_7b_ckpt_path
@@ -34,25 +108,14 @@ def get_configs(cluster: str, mpt_7b_ckpt_path: str, wandb_entity: str,
         mpt_125m_sharded_resumption
     ]
 
-    commit_hash = subprocess.check_output(['git', 'rev-parse',
-                                           'HEAD']).strip().decode('utf-8')
-    timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
-    wandb_group = f'{timestamp}::{commit_hash}'
-
-    # make general changes
-    wandb_config = {
-        'entity': wandb_entity,
-        'project': wandb_project,
-        'group': wandb_group
-    }
     for config in all_configs:
-        config.cluster = cluster
-        config.parameters['loggers'] = config.parameters.get('loggers', {})
-        config.parameters['loggers']['wandb'] = wandb_config
-        config.integrations[0]['git_repo'] = git_repo
-        config.integrations[0]['git_branch'] = git_branch
-
-    return all_configs, []
+        _set_general_configs(config,
+                             cluster=cluster,
+                             wandb_entity=wandb_entity,
+                             wandb_project=wandb_project,
+                             git_repo=git_repo,
+                             git_branch=git_branch)
+        create_run(config)
 
 
 if __name__ == '__main__':
@@ -61,13 +124,16 @@ def get_configs(cluster: str, mpt_7b_ckpt_path: str, wandb_entity: str,
     parser.add_argument('--mpt-7b-ckpt-path', type=str)
     parser.add_argument('--wandb-entity', type=str)
     parser.add_argument('--wandb-project', type=str)
+    parser.add_argument('--remote-save-folder', type=str)
     parser.add_argument('--git-repo', type=str, default='mosaicml/llm-foundry')
     parser.add_argument('--git-branch', type=str, default='main')
 
     args = parser.parse_args()
 
-    run_configs, _ = get_configs(args.cluster, args.mpt_7b_ckpt_path,
-                                 args.wandb_entity, args.wandb_project,
-                                 args.git_repo, args.git_branch)
-    for run_config in run_configs:
-        run = create_run(run_config)
+    print(f'Running regression tests on {args.git_repo} {args.git_branch}.')
+
+    test_basic(args.cluster, args.mpt_7b_ckpt_path, args.wandb_entity,
+               args.wandb_project, args.git_repo, args.git_branch)
+    test_elastic_resumption(args.cluster, args.remote_save_folder,
+                            args.wandb_entity, args.wandb_project,
+                            args.git_repo, args.git_branch)
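
For reference, test_elastic_resumption launches four runs in sequence, each
pair sharing a save_folder subdirectory so the second run can reload the
first's sharded checkpoint. With the run name fixed above, they come out
roughly as:

    mpt-125m-elastic-resumption-8          fresh,  10ba,  8 GPUs (1 node)
    mpt-125m-elastic-resumption-16-resume  resume, 20ba, 16 GPUs (2 nodes)
    mpt-125m-elastic-resumption-16         fresh,  10ba, 16 GPUs
    mpt-125m-elastic-resumption-8-resume   resume, 20ba,  8 GPUs

wait_for_run_status blocks until each run reaches a terminal state; any
outcome other than COMPLETED raises and aborts the remaining runs.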
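Example invocation (illustrative only; the cluster, checkpoint path, and save
folder are placeholders):

    python .github/workflows/regressions.py \
        --cluster <cluster> \
        --mpt-7b-ckpt-path <path-to-mpt-7b-composer-checkpoint> \
        --wandb-entity <entity> \
        --wandb-project <project> \
        --remote-save-folder <remote-folder-for-elastic-checkpoints>

All runs from a single invocation share the W&B group
'<timestamp>::<commit hash>', so one regression sweep can be filtered as a
unit.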