diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 8f3b6bac4e..d675cb9767 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -21,9 +21,12 @@ compute: # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: - tokenizer_name: meta-llama/Llama-2-7b-hf - max_seq_len: 4096 - global_seed: 17 + variables: + tokenizer_name: meta-llama/Llama-2-7b-hf + global_seed: 17 + max_seq_len: 4096 + + max_seq_len: ${variables.max_seq_len} # Run Name run_name: # If left blank, will be read from env var $RUN_NAME @@ -42,9 +45,9 @@ parameters: # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: @@ -52,7 +55,7 @@ parameters: dataset: hf_name: mosaicml/dolly_hhrlhf split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true shuffle: true @@ -75,7 +78,7 @@ parameters: dataset: hf_name: mosaicml/dolly_hhrlhf split: test - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true # packing_ratio: @@ -114,7 +117,7 @@ parameters: global_train_batch_size: 64 # System - seed: ${global_seed} + seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: auto precision: amp_bf16 diff --git a/mcli/mcli-llama3-70b-instruct-finetune.yaml b/mcli/mcli-llama3-70b-instruct-finetune.yaml new file mode 100644 index 0000000000..1bb3f17b01 --- /dev/null +++ b/mcli/mcli-llama3-70b-instruct-finetune.yaml @@ -0,0 +1,158 @@ +integrations: +- integration_type: git_repo + git_repo: mosaicml/llm-foundry + git_branch: v0.15.0 + # git_commit: # OR use your commit hash + pip_install: .[gpu] + ssh_clone: false # Should be true if using a private repo + +command: | + cd llm-foundry/scripts + composer train/train.py /mnt/config/parameters.yaml +image: mosaicml/llm-foundry:2.5.1_cu124-latest +name: llama3.1-70b-finetune + +compute: + # Note: Finetuning the 70b model requires at least 16x80GB GPUs + gpus: 16 # Number of GPUs to use + ## These configurations are optional + # cluster: TODO # Name of the cluster to use for this run + # gpu_type: h100_80gb # Type of GPU to use. We use h100_80gb in our experiments + +# The below is injected as a YAML file: /mnt/config/parameters.yaml +parameters: + variables: + tokenizer_name: meta-llama/Llama-3.1-70B-Instruct + global_seed: 17 + max_seq_len: 4096 + + max_seq_len: ${variables.max_seq_len} + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + + max_split_size_mb: 512 + + # Model + model: + name: hf_causal_lm + init_device: mixed + pretrained_model_name_or_path: meta-llama/Llama-3.1-70B-Instruct + pretrained: true + # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models + use_auth_token: true + use_flash_attention_2: true + + # Tokenizer + tokenizer: + name: ${variables.tokenizer_name} + kwargs: + model_max_length: ${variables.max_seq_len} + # Dataloaders + train_loader: + name: finetuning + dataset: + hf_name: mosaicml/dolly_hhrlhf + split: train + max_seq_len: ${variables.max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + shuffle: true + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # to profile this run's optimal packing_ratio as it depends on GPU count, + # # batch size, sequence length + # packing_ratio: auto + drop_last: true + num_workers: 8 + pin_memory: false + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + + eval_loader: + name: finetuning + dataset: + hf_name: mosaicml/dolly_hhrlhf + split: test + max_seq_len: ${variables.max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + # packing_ratio: + shuffle: false + drop_last: true + num_workers: 8 + pin_memory: false + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + + # Optimization + scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + + # Note: You may want to change learning rate, betas, weight decay + optimizer: + name: decoupled_lionw + lr: 5.0e-7 + betas: + - 0.9 + - 0.95 + weight_decay: 0.0 + + algorithms: + gradient_clipping: + clipping_type: norm + clipping_threshold: 1.0 + + max_duration: 1ep + eval_first: false + eval_interval: 1ep + eval_subset_num_batches: -1 + global_train_batch_size: 16 + + # System + seed: ${variables.global_seed} + device_eval_batch_size: 1 + device_train_microbatch_size: 1 + precision: amp_bf16 + + # FSDP + fsdp_config: + state_dict_type: sharded # Note: we enable sharded checkpointing to avoid GPU OOM + sharding_strategy: FULL_SHARD + mixed_precision: PURE + activation_checkpointing: true + activation_checkpointing_reentrant: false + activation_cpu_offload: false + limit_all_gathers: true + + # Logging + progress_bar: false + log_to_console: true + console_log_interval: 1ba + + callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + runtime_estimator: {} + + load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc + +# loggers: +# wandb: {} + +# Checkpoint to local filesystem or remote object store +# save_interval: 2000ba +# save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +# save_folder: ./{run_name}/checkpoints +# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt