Elastic Resumption Regression Test #605

Closed · wants to merge 4 commits
119 changes: 119 additions & 0 deletions .github/workflows/regression_yamls/mpt-125m-elastic-resumption.yaml
@@ -0,0 +1,119 @@
integrations:
- integration_type: git_repo
  git_repo: mosaicml/llm-foundry
  git_branch: main
  pip_install: -e .[gpu]

command: |
  cd llm-foundry/scripts
  python data_prep/convert_dataset_hf.py \
    --dataset c4 --data_subset en \
    --out_root ./my-copy-c4 --splits train_small val_small \
    --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b --eos_text '<|endoftext|>'
  # Composer command will be inserted below:

image: mosaicml/llm-foundry:2.0.1_cu118-latest
name: mpt-125m-elastic-resumption

compute:
  gpus: <INPUT>
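  # <INPUT> placeholders are filled in programmatically by regressions.py:
  # the elastic resumption test launches this config with 8 GPUs (1 node)
  # and again with 16 GPUs (2 nodes).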

parameters:
  data_local: ./my-copy-c4
  data_remote:
  max_seq_len: 2048
  global_seed: 17

  model:
    name: mpt_causal_lm
    init_device: meta
    d_model: 768
    n_heads: 12
    n_layers: 12
    expansion_ratio: 4
    max_seq_len: ${max_seq_len}
    vocab_size: 50368
    attn_config:
      attn_impl: triton

  tokenizer:
    name: EleutherAI/gpt-neox-20b
    kwargs:
      model_max_length: ${max_seq_len}

  train_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: train_small
      shuffle: true
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: true
    num_workers: 8

  eval_loader:
    name: text
    dataset:
      local: ${data_local}
      remote: ${data_remote}
      split: val_small
      shuffle: false
      max_seq_len: ${max_seq_len}
      shuffle_seed: ${global_seed}
    drop_last: false
    num_workers: 8

  scheduler:
    name: cosine_with_warmup
    t_warmup: 100ba
    alpha_f: 0.1

  optimizer:
    name: decoupled_adamw
    lr: 6.0e-4
    betas:
    - 0.9
    - 0.95
    eps: 1.0e-08
    weight_decay: 0.0

  algorithms:
    gradient_clipping:
      clipping_type: norm
      clipping_threshold: 1.0

  max_duration: <INPUT>
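  # Set by regressions.py: '10ba' for the initial run and '20ba' for the
  # autoresumed run, so the second run continues from the first run's
  # final checkpoint.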
  eval_interval: 500ba
  eval_first: false
  eval_subset_num_batches: -1
  global_train_batch_size: 256

  seed: ${global_seed}
  device_eval_batch_size: 16
  device_train_microbatch_size: auto
  precision: amp_bf16

  fsdp_config:
    sharding_strategy: FULL_SHARD
    mixed_precision: PURE
    activation_checkpointing: false
    activation_checkpointing_reentrant: false
    activation_cpu_offload: false
    limit_all_gathers: true
    verbose: false
    state_dict_type: sharded
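    # Sharded checkpoints are what make elastic resumption possible here:
    # each rank saves its own shard, so (with autoresume) training can be
    # restarted on a different number of GPUs.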

  progress_bar: false
  log_to_console: true
  console_log_interval: 1ba

  callbacks:
    speed_monitor:
      window_size: 10
    lr_monitor: {}
    memory_monitor: {}
    runtime_estimator: {}

  save_interval: 10ba
140 changes: 103 additions & 37 deletions .github/workflows/regressions.py
@@ -6,25 +6,99 @@
 import os
 import subprocess
 
+from mcli import RunConfig, RunStatus, create_run, wait_for_run_status
+
 DIR_PATH = os.path.dirname(os.path.abspath(__file__))
 REGRESSIONS_DIR = os.path.join(DIR_PATH, 'regression_yamls')
 
-from mcli import RunConfig, create_run
+COMMIT_HASH = subprocess.check_output(['git', 'rev-parse',
+                                       'HEAD']).strip().decode('utf-8')
+TIMESTAMP = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
 
 
+def _get_regression_config(yaml_name: str) -> RunConfig:
+    """Get the yaml config from regressions directory."""
+    return RunConfig.from_file(os.path.join(REGRESSIONS_DIR, yaml_name))
+
+
+def _set_general_configs(config: RunConfig, cluster: str, wandb_entity: str,
+                         wandb_project: str, git_repo: str, git_branch: str):
+    """Set general configuration arguments."""
+    config.cluster = cluster
+    wandb_group = f'{TIMESTAMP}::{COMMIT_HASH}'
+    wandb_config = {
+        'entity': wandb_entity,
+        'project': wandb_project,
+        'group': wandb_group
+    }
+    config.parameters['loggers'] = config.parameters.get('loggers', {})
+    config.parameters['loggers']['wandb'] = wandb_config
+    config.integrations[0]['git_repo'] = git_repo
+    config.integrations[0]['git_branch'] = git_branch
+
+
+def test_elastic_resumption(cluster: str, save_folder: str, wandb_entity: str,
+                            wandb_project: str, git_repo: str, git_branch: str):
+    """Regression test for elastic resumption."""
+
+    def create_run_and_wait(gpus: int, resume: bool, subdir: str):
+        config = _get_regression_config('mpt-125m-elastic-resumption.yaml')
+
+        # Add the command to train our model
+        composer_command = '\ncomposer train/train.py /mnt/config/parameters.yaml'
+        if resume:
+            composer_command += ' autoresume=true'
+        else:
+            composer_command += ' autoresume=false'
+        config.command += composer_command
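+        # (autoresume controls whether Composer resumes from the latest
+        # checkpoint found in save_folder or starts a fresh run.)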

+        # Add suffix to name
+        name_suffix = f'-{gpus}'
+        if resume:
+            name_suffix += '-resume'
+        config.name += name_suffix
+
-def get_configs(cluster: str, mpt_7b_ckpt_path: str, wandb_entity: str,
-                wandb_project: str, git_repo: str, git_branch: str):
-    print(f'Running regression tests on {git_repo} {git_branch}.')
-    eval_7b_hf = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'eval-7b-hf.yaml'))
-    eval_7b_composer = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'eval-7b-composer.yaml'))
-    llama2_finetune = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'llama2-finetune.yaml'))
-    mpt_125m_chinchilla = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'mpt-125m-chinchilla.yaml'))
-    mpt_125m_sharded_resumption = RunConfig.from_file(
-        os.path.join(REGRESSIONS_DIR, 'mpt-125m-sharded-resumption.yaml'))
+        # Set other parameters
+        config.compute['gpus'] = gpus
+        config.parameters['save_folder'] = os.path.join(save_folder, subdir)
+        config.parameters['max_duration'] = '20ba' if resume else '10ba'
+
+        _set_general_configs(config,
+                             cluster=cluster,
+                             wandb_entity=wandb_entity,
+                             wandb_project=wandb_project,
+                             git_repo=git_repo,
+                             git_branch=git_branch)
+
+        # Start run
+        run = create_run(config)
+        wait_for_run_status(
+            run,
+            RunStatus.COMPLETED)  # Wait for the run to complete or terminate.
+        if run.status != RunStatus.COMPLETED:
+            raise Exception(
+                f'Failure on run {run.name}. Run status is {run.status}. ' +
+                'Terminating elastic resumption regression test.')
+
+    # Test 1 node => 2 node elastic resumption
+    subdir = f'1_to_2_node_{TIMESTAMP}_{COMMIT_HASH}'
+    create_run_and_wait(gpus=8, resume=False, subdir=subdir)
+    create_run_and_wait(gpus=16, resume=True, subdir=subdir)
+
+    # Test 2 node => 1 node elastic resumption
+    subdir = f'2_to_1_node_{TIMESTAMP}_{COMMIT_HASH}'
+    create_run_and_wait(gpus=16, resume=False, subdir=subdir)
+    create_run_and_wait(gpus=8, resume=True, subdir=subdir)
+
+
+def test_basic(cluster: str, mpt_7b_ckpt_path: str, wandb_entity: str,
+               wandb_project: str, git_repo: str, git_branch: str):
+    eval_7b_hf = _get_regression_config('eval-7b-hf.yaml')
+    eval_7b_composer = _get_regression_config('eval-7b-composer.yaml')
+    llama2_finetune = _get_regression_config('llama2-finetune.yaml')
+    mpt_125m_chinchilla = _get_regression_config('mpt-125m-chinchilla.yaml')
+    mpt_125m_sharded_resumption = _get_regression_config(
+        'mpt-125m-sharded-resumption.yaml')
 
     # make specific changes
     eval_7b_composer.parameters['models'][0]['load_path'] = mpt_7b_ckpt_path
@@ -34,25 +108,14 @@ def get_configs(cluster: str, mpt_7b_ckpt_path: str, wandb_entity: str,
         mpt_125m_sharded_resumption
     ]
 
-    commit_hash = subprocess.check_output(['git', 'rev-parse',
-                                           'HEAD']).strip().decode('utf-8')
-    timestamp = datetime.datetime.now().strftime('%m-%d-%Y::%H:%M:%S')
-    wandb_group = f'{timestamp}::{commit_hash}'
-
-    # make general changes
-    wandb_config = {
-        'entity': wandb_entity,
-        'project': wandb_project,
-        'group': wandb_group
-    }
     for config in all_configs:
-        config.cluster = cluster
-        config.parameters['loggers'] = config.parameters.get('loggers', {})
-        config.parameters['loggers']['wandb'] = wandb_config
-        config.integrations[0]['git_repo'] = git_repo
-        config.integrations[0]['git_branch'] = git_branch
-
-    return all_configs, []
+        _set_general_configs(config,
+                             cluster=cluster,
+                             wandb_entity=wandb_entity,
+                             wandb_project=wandb_project,
+                             git_repo=git_repo,
+                             git_branch=git_branch)
+        create_run(config)
 
 
 if __name__ == '__main__':
@@ -61,13 +124,16 @@
     parser.add_argument('--mpt-7b-ckpt-path', type=str)
     parser.add_argument('--wandb-entity', type=str)
     parser.add_argument('--wandb-project', type=str)
+    parser.add_argument('--remote-save-folder', type=str)
     parser.add_argument('--git-repo', type=str, default='mosaicml/llm-foundry')
     parser.add_argument('--git-branch', type=str, default='main')
 
     args = parser.parse_args()
 
-    run_configs, _ = get_configs(args.cluster, args.mpt_7b_ckpt_path,
-                                 args.wandb_entity, args.wandb_project,
-                                 args.git_repo, args.git_branch)
-    for run_config in run_configs:
-        run = create_run(run_config)
+    print(f'Running regression tests on {args.git_repo} {args.git_branch}.')
+
+    test_basic(args.cluster, args.mpt_7b_ckpt_path, args.wandb_entity,
+               args.wandb_project, args.git_repo, args.git_branch)
+    test_elastic_resumption(args.cluster, args.remote_save_folder,
+                            args.wandb_entity, args.wandb_project,
+                            args.git_repo, args.git_branch)
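
For reference, a rough sketch of how the updated script would be invoked; all angle-bracketed values are placeholders rather than values from this PR, and --remote-save-folder should point at storage reachable from every node so the resumed run can find the sharded checkpoints:

python .github/workflows/regressions.py \
    --cluster <cluster> \
    --mpt-7b-ckpt-path <path/to/mpt-7b-checkpoint> \
    --wandb-entity <wandb-entity> \
    --wandb-project <wandb-project> \
    --remote-save-folder <remote/checkpoint/folder> \
    --git-repo mosaicml/llm-foundry \
    --git-branch main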