diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 661729ff8a..fce694f160 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -336,6 +336,7 @@ def build_finetuning_dataloader( replication_factor if replication_factor > 1 else None, rank=dist.get_global_rank() // replication_factor if replication_factor > 1 else None, + seed=dataset_cfg.get('shuffle_seed', 0), ) assert streaming_dataset is not None # for pyright diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml index 35a1165ae0..b4f7d7ae42 100644 --- a/mcli/mcli-1b-eval.yaml +++ b/mcli/mcli-1b-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index fb83165b75..dc0c80f488 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index f88d6cbac2..37df667ce0 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index 916337cb7b..27c96c466f 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.5.1_cu124-latest integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu] diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index ab02024280..90029c8d56 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: . ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index ab72e99f97..146848555f 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index b885568f66..1dbf6afdd6 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 8f3b6bac4e..1b52826e6f 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -1,13 +1,14 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo command: | cd llm-foundry/scripts + export HF_HUB_ENABLE_HF_TRANSFER=1 composer train/train.py /mnt/config/parameters.yaml image: mosaicml/llm-foundry:2.5.1_cu124-latest name: llama2-finetune @@ -21,9 +22,12 @@ compute: # The below is injected as a YAML file: /mnt/config/parameters.yaml parameters: - tokenizer_name: meta-llama/Llama-2-7b-hf - max_seq_len: 4096 - global_seed: 17 + variables: + tokenizer_name: meta-llama/Llama-2-7b-hf + global_seed: 17 + max_seq_len: 4096 + + max_seq_len: ${variables.max_seq_len} # Run Name run_name: # If left blank, will be read from env var $RUN_NAME @@ -42,9 +46,9 @@ parameters: # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: @@ -52,7 +56,7 @@ parameters: dataset: hf_name: mosaicml/dolly_hhrlhf split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true shuffle: true @@ -75,7 +79,7 @@ parameters: dataset: hf_name: mosaicml/dolly_hhrlhf split: test - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true # packing_ratio: @@ -114,7 +118,7 @@ parameters: global_train_batch_size: 64 # System - seed: ${global_seed} + seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: auto precision: amp_bf16 diff --git a/mcli/mcli-llama3-70b-instruct-finetune.yaml b/mcli/mcli-llama3-70b-instruct-finetune.yaml new file mode 100644 index 0000000000..c84f20473b --- /dev/null +++ b/mcli/mcli-llama3-70b-instruct-finetune.yaml @@ -0,0 +1,160 @@ +integrations: +- integration_type: git_repo + git_repo: mosaicml/llm-foundry + git_branch: v0.15.1 + # git_commit: # OR use your commit hash + pip_install: .[gpu] + ssh_clone: false # Should be true if using a private repo + +command: | + cd llm-foundry/scripts + export HF_HUB_ENABLE_HF_TRANSFER=1 + composer train/train.py /mnt/config/parameters.yaml +image: mosaicml/llm-foundry:2.5.1_cu124-latest +name: llama3.1-70b-finetune + +compute: + # Note: Finetuning the 70b model requires at least 16x80GB GPUs + gpus: 16 # Number of GPUs to use + ## These configurations are optional + # cluster: TODO # Name of the cluster to use for this run + # gpu_type: h100_80gb # Type of GPU to use. We use h100_80gb in our experiments + +# The below is injected as a YAML file: /mnt/config/parameters.yaml +parameters: + variables: + tokenizer_name: meta-llama/Llama-3.1-70B-Instruct + global_seed: 17 + max_seq_len: 4096 + + max_seq_len: ${variables.max_seq_len} + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + + max_split_size_mb: 512 + dist_timeout: 3600 # set to avoid NCCL timeouts + + # Model + model: + name: hf_causal_lm + init_device: mixed + pretrained_model_name_or_path: meta-llama/Llama-3.1-70B-Instruct + pretrained: true + # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models + use_auth_token: true + use_flash_attention_2: true + + # Tokenizer + tokenizer: + name: ${variables.tokenizer_name} + kwargs: + model_max_length: ${variables.max_seq_len} + # Dataloaders + train_loader: + name: finetuning + dataset: + hf_name: mosaicml/dolly_hhrlhf + split: train + max_seq_len: ${variables.max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + shuffle: true + # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with + # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion + # # of the dataset. + # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...` + # # to profile this run's optimal packing_ratio as it depends on GPU count, + # # batch size, sequence length + # packing_ratio: auto + drop_last: true + num_workers: 8 + pin_memory: false + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + + eval_loader: + name: finetuning + dataset: + hf_name: mosaicml/dolly_hhrlhf + split: test + max_seq_len: ${variables.max_seq_len} + allow_pad_trimming: false + decoder_only_format: true + # packing_ratio: + shuffle: false + drop_last: true + num_workers: 8 + pin_memory: false + prefetch_factor: 2 + persistent_workers: true + timeout: 0 + + # Optimization + scheduler: + name: cosine_with_warmup + t_warmup: 100ba + alpha_f: 0.1 + + # Note: You may want to change learning rate, betas, weight decay + optimizer: + name: decoupled_lionw + lr: 5.0e-7 + betas: + - 0.9 + - 0.95 + weight_decay: 0.0 + + algorithms: + gradient_clipping: + clipping_type: norm + clipping_threshold: 1.0 + + max_duration: 1ep + eval_first: false + eval_interval: 1ep + eval_subset_num_batches: -1 + global_train_batch_size: 16 + + # System + seed: ${variables.global_seed} + device_eval_batch_size: 1 + device_train_microbatch_size: 1 + precision: amp_bf16 + + # FSDP + fsdp_config: + state_dict_type: sharded # Note: we enable sharded checkpointing to avoid GPU OOM + sharding_strategy: FULL_SHARD + mixed_precision: PURE + activation_checkpointing: true + activation_checkpointing_reentrant: false + activation_cpu_offload: false + limit_all_gathers: true + + # Logging + progress_bar: false + log_to_console: true + console_log_interval: 1ba + + callbacks: + speed_monitor: + window_size: 10 + lr_monitor: {} + memory_monitor: {} + runtime_estimator: {} + + load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc + +# loggers: +# wandb: {} + +# Checkpoint to local filesystem or remote object store +# save_interval: 2000ba +# save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK +# save_folder: ./{run_name}/checkpoints +# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints + +# Load from local filesystem or remote object store +# load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt +# load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index 563eb6b8c2..818afff545 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: .[gpu,openai] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index f344531049..7991bf5e13 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -14,7 +14,7 @@ integrations: - oci-cli==3.23.2 - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.15.0 + git_branch: v0.15.1 # git_commit: # OR use your commit hash pip_install: . ssh_clone: false # Should be true if using a private repo diff --git a/setup.py b/setup.py index 22a968246c..57acaddf5a 100644 --- a/setup.py +++ b/setup.py @@ -52,7 +52,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.27.0,<0.28', + 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.28.0,<0.29', 'mlflow>=2.14.1,<2.19', 'accelerate>=0.25,<1.2', # for HF inference `device_map` 'transformers>=4.43.2,<4.47', @@ -91,7 +91,7 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.27.0,<0.28', + 'mosaicml[databricks]>=0.28.0,<0.29', 'numpy<2', 'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', @@ -99,7 +99,7 @@ ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.27.0,<0.28', + 'mosaicml[tensorboard]>=0.28.0,<0.29', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.27.0,<0.28', + 'mosaicml[peft]>=0.28.0,<0.29', ] extra_deps['openai'] = [