From fc20db8873c058e82460166146b9590f03256f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:07:49 +0200 Subject: [PATCH] Clean configs documentation (#1944) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Clean BCO * Optional[int] * fix sft config * alignprop config * upadte tempfile to work with output_dir * clean kto config * intro docstring * style * reward config * orpo config * warning in trainer, not in config * cpo config * ppo v2 * model config * ddpo and per_device_train_batch_size (instead of (train_batch_size) * rloo * Online config * tmp_dir in test_ddpo * style * remove to_dict and fix post-init * batch size in test ddpo * dpo * style * `Args` -> `Parameters` * parameters * ppo config * dont overwrite world size * style * outputdir in test ppo * output dir in ppo config * revert non-core change (1/n) * revert non-core changes (2/n) * revert non-core change (3/n) * uniform max_length * fix uniform max_length * beta uniform * style * link to `ConstantLengthDataset` * uniform `dataset_num_proc` * uniform `disable_dropout` * `eval_packing` doc * try latex and α in doc * try title first * doesn't work * reorganize doc * overview * better latex * is_encoder_decoder uniform * proper ticks * fix latex * uniform generate_during_eval * uniform truncation_mode * ref_model_mixup_alpha * ref_model_mixup_alpha and ref_model_sync_steps * Uniform `model_init_kwargs` and `ref_model_init_kwargs` * rpo_alpha * Update maximum length argument names in config files * Update loss_type descriptions in config files * Update max_target_length to max_completion_length in CPOConfig and CPOTrainer * Update padding value in config files * Update precompute_ref_log_probs flag documentation * Fix typos and update comments in dpo_config.py and sft_config.py * post init warning for `max_target_length` --- docs/source/_toctree.yml | 68 ++++++------ tests/test_trainers_args.py | 6 +- trl/trainer/alignprop_config.py | 124 +++++++++++++-------- trl/trainer/bco_config.py | 102 ++++++++--------- trl/trainer/bco_trainer.py | 7 +- trl/trainer/cpo_config.py | 99 +++++++++-------- trl/trainer/cpo_trainer.py | 14 +-- trl/trainer/ddpo_config.py | 142 ++++++++++++++++-------- trl/trainer/dpo_config.py | 117 +++++++++++++------- trl/trainer/dpo_trainer.py | 16 +-- trl/trainer/kto_config.py | 103 +++++++++-------- trl/trainer/model_config.py | 153 ++++++++++++------------- trl/trainer/online_dpo_config.py | 18 +-- trl/trainer/orpo_config.py | 68 ++++++------ trl/trainer/ppo_config.py | 184 ++++++++++++++++++++----------- trl/trainer/ppov2_config.py | 44 +++++--- trl/trainer/reward_config.py | 20 ++-- trl/trainer/rloo_config.py | 35 ++++-- trl/trainer/sft_config.py | 82 +++++++------- trl/trainer/utils.py | 85 +++++++++----- 20 files changed, 851 insertions(+), 636 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index d9cad99ad3..4f8f875562 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,44 +17,46 @@ title: Understanding Logs title: Get started - sections: + - sections: + - local: trainer + title: Overview + - local: alignprop_trainer + title: AlignProp + - local: bco_trainer + title: BCO + - local: cpo_trainer + title: CPO + - local: ddpo_trainer + title: DDPO + - local: dpo_trainer + title: DPO + - local: online_dpo_trainer + title: Online DPO + - local: orpo_trainer + title: ORPO + - local: kto_trainer + title: KTO + - local: 
ppo_trainer + title: PPO + - local: ppov2_trainer + title: PPOv2 + - local: rloo_trainer + title: RLOO + - local: sft_trainer + title: SFT + - local: iterative_sft_trainer + title: Iterative SFT + - local: reward_trainer + title: Reward Model + title: Trainers - local: models title: Model Classes - - local: trainer - title: Trainer Classes - - local: reward_trainer - title: Reward Model Training - - local: sft_trainer - title: Supervised Fine-Tuning - - local: ppo_trainer - title: PPO Trainer - - local: ppov2_trainer - title: PPOv2 Trainer - - local: rloo_trainer - title: RLOO Trainer - local: best_of_n title: Best of N Sampling - - local: dpo_trainer - title: DPO Trainer - - local: online_dpo_trainer - title: Online DPO Trainer - - local: kto_trainer - title: KTO Trainer - - local: bco_trainer - title: BCO Trainer - - local: cpo_trainer - title: CPO Trainer - - local: ddpo_trainer - title: Denoising Diffusion Policy Optimization - - local: alignprop_trainer - title: AlignProp Trainer - - local: orpo_trainer - title: ORPO Trainer - - local: iterative_sft_trainer - title: Iterative Supervised Fine-Tuning - - local: callbacks - title: Callback Classes - local: judges - title: Judge Classes + title: Judges + - local: callbacks + title: Callbacks - local: text_environments title: Text Environments title: API diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py index a8b4ef028c..cd6de6bd0a 100644 --- a/tests/test_trainers_args.py +++ b/tests/test_trainers_args.py @@ -77,7 +77,6 @@ def test_cpo(self): max_length=256, max_prompt_length=64, max_completion_length=64, - max_target_length=64, beta=0.5, label_smoothing=0.5, loss_type="hinge", @@ -96,7 +95,6 @@ def test_cpo(self): self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) self.assertEqual(trainer.args.max_completion_length, 64) - self.assertEqual(trainer.args.max_target_length, 64) self.assertEqual(trainer.args.beta, 0.5) self.assertEqual(trainer.args.label_smoothing, 0.5) self.assertEqual(trainer.args.loss_type, "hinge") @@ -127,7 +125,7 @@ def test_dpo(self): truncation_mode="keep_start", max_length=256, max_prompt_length=64, - max_target_length=64, + max_completion_length=64, is_encoder_decoder=True, disable_dropout=False, # generate_during_eval=True, # ignore this one, it requires wandb @@ -155,7 +153,7 @@ def test_dpo(self): self.assertEqual(trainer.args.truncation_mode, "keep_start") self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) - self.assertEqual(trainer.args.max_target_length, 64) + self.assertEqual(trainer.args.max_completion_length, 64) self.assertEqual(trainer.args.is_encoder_decoder, True) self.assertEqual(trainer.args.disable_dropout, False) # self.assertEqual(trainer.args.generate_during_eval, True) diff --git a/trl/trainer/alignprop_config.py b/trl/trainer/alignprop_config.py index 7bd4cd32bd..b5f56eecfa 100644 --- a/trl/trainer/alignprop_config.py +++ b/trl/trainer/alignprop_config.py @@ -2,7 +2,7 @@ import sys import warnings from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import Any, Dict, Literal, Optional, Tuple from ..core import flatten_dict from ..import_utils import is_bitsandbytes_available, is_torchvision_available @@ -10,77 +10,109 @@ @dataclass class AlignPropConfig: - """ - Configuration class for AlignPropTrainer + r""" + Configuration class for the [`AlignPropTrainer`]. 
+ + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`): + Name of this experiment (defaults to the file name without the extension). + run_name (`str`, *optional*, defaults to `""`): + Name of this run. + log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + Log with either `"wandb"` or `"tensorboard"`. Check + [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. + log_image_freq (`int`, *optional*, defaults to `1`): + Frequency for logging images. + tracker_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g., `wandb_project`). + accelerator_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g., `logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + logdir (`str`, *optional*, defaults to `"logs"`): + Top-level logging directory for checkpoint saving. + num_epochs (`int`, *optional*, defaults to `100`): + Number of epochs to train. + save_freq (`int`, *optional*, defaults to `1`): + Number of epochs between saving model checkpoints. + num_checkpoint_limit (`int`, *optional*, defaults to `5`): + Number of checkpoints to keep before overwriting old ones. + mixed_precision (`str`, *optional*, defaults to `"fp16"`): + Mixed precision training. + allow_tf32 (`bool`, *optional*, defaults to `True`): + Allow `tf32` on Ampere GPUs. + resume_from (`str`, *optional*, defaults to `""`): + Path to resume training from a checkpoint. + sample_num_steps (`int`, *optional*, defaults to `50`): + Number of sampler inference steps. + sample_eta (`float`, *optional*, defaults to `1.0`): + Eta parameter for the DDIM sampler. + sample_guidance_scale (`float`, *optional*, defaults to `5.0`): + Classifier-free guidance weight. + train_use_8bit_adam (`bool`, *optional*, defaults to `False`): + Whether to use the 8bit Adam optimizer from `bitsandbytes`. + train_learning_rate (`float`, *optional*, defaults to `1e-3`): + Learning rate. + train_adam_beta1 (`float`, *optional*, defaults to `0.9`): + Beta1 for Adam optimizer. + train_adam_beta2 (`float`, *optional*, defaults to `0.999`): + Beta2 for Adam optimizer. + train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`): + Weight decay for Adam optimizer. + train_adam_epsilon (`float`, *optional*, defaults to `1e-8`): + Epsilon value for Adam optimizer. + train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + train_max_grad_norm (`float`, *optional*, defaults to `1.0`): + Maximum gradient norm for gradient clipping. + negative_prompts (`Optional[str]`, *optional*, defaults to `None`): + Comma-separated list of prompts to use as negative examples. + truncated_backprop_rand (`bool`, *optional*, defaults to `True`): + If `True`, randomized truncation to different diffusion timesteps is used. + truncated_backprop_timestep (`int`, *optional*, defaults to `49`): + Absolute timestep to which the gradients are backpropagated. Used only if `truncated_backprop_rand=False`. 
+ truncated_rand_backprop_minmax (`Tuple[int, int]`, *optional*, defaults to `(0, 50)`): + Range of diffusion timesteps for randomized truncated backpropagation. """ - # common parameters exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - """the name of this experiment (by default is the file name without the extension name)""" - run_name: Optional[str] = "" - """Run name for wandb logging and checkpoint saving.""" + run_name: str = "" seed: int = 0 - """Seed value for random generations""" log_with: Optional[Literal["wandb", "tensorboard"]] = None - """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details""" - log_image_freq = 1 - """Logging Frequency for images""" - tracker_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the tracker (e.g. wandb_project)""" - accelerator_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator""" - project_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator project config (e.g. `logging_dir`)""" + log_image_freq: int = 1 + tracker_kwargs: Dict[str, Any] = field(default_factory=dict) + accelerator_kwargs: Dict[str, Any] = field(default_factory=dict) + project_kwargs: Dict[str, Any] = field(default_factory=dict) tracker_project_name: str = "trl" - """Name of project to use for tracking""" logdir: str = "logs" - """Top-level logging directory for checkpoint saving.""" - - # hyperparameters num_epochs: int = 100 - """Number of epochs to train.""" save_freq: int = 1 - """Number of epochs between saving model checkpoints.""" num_checkpoint_limit: int = 5 - """Number of checkpoints to keep before overwriting old ones.""" mixed_precision: str = "fp16" - """Mixed precision training.""" allow_tf32: bool = True - """Allow tf32 on Ampere GPUs.""" - resume_from: Optional[str] = "" - """Resume training from a checkpoint.""" + resume_from: str = "" sample_num_steps: int = 50 - """Number of sampler inference steps.""" sample_eta: float = 1.0 - """Eta parameter for the DDIM sampler.""" sample_guidance_scale: float = 5.0 - """Classifier-free guidance weight.""" train_batch_size: int = 1 - """Batch size (per GPU!) to use for training.""" train_use_8bit_adam: bool = False - """Whether to use the 8bit Adam optimizer from bitsandbytes.""" train_learning_rate: float = 1e-3 - """Learning rate.""" train_adam_beta1: float = 0.9 - """Adam beta1.""" train_adam_beta2: float = 0.999 - """Adam beta2.""" train_adam_weight_decay: float = 1e-4 - """Adam weight decay.""" train_adam_epsilon: float = 1e-8 - """Adam epsilon.""" train_gradient_accumulation_steps: int = 1 - """Number of gradient accumulation steps.""" train_max_grad_norm: float = 1.0 - """Maximum gradient norm for gradient clipping.""" - negative_prompts: Optional[str] = "" - """Comma-separated list of prompts to use as negative examples.""" + negative_prompts: Optional[str] = None truncated_backprop_rand: bool = True - """Truncated Randomized Backpropation randomizes truncation to different diffusion timesteps""" truncated_backprop_timestep: int = 49 - """Absolute timestep to which the gradients are being backpropagated. 
If truncated_backprop_rand is False""" - truncated_rand_backprop_minmax: tuple = (0, 50) - """Range of diffusion timesteps for randomized truncated backprop.""" + truncated_rand_backprop_minmax: Tuple[int, int] = (0, 50) def to_dict(self): output_dict = {} diff --git a/trl/trainer/bco_config.py b/trl/trainer/bco_config.py index 05d4b0b8fe..2d7a3f7a09 100644 --- a/trl/trainer/bco_config.py +++ b/trl/trainer/bco_config.py @@ -12,87 +12,77 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Optional +from typing import Any, Dict, Optional from transformers import TrainingArguments -from ..import_utils import is_sklearn_available - @dataclass class BCOConfig(TrainingArguments): r""" - BCOConfig collects all training arguments related to the [`BCOTrainer`] class. + Configuration class for the [`BCOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, *optional*, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`int`, *optional*, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in BCO loss. Higher beta means less divergence from the initial policy. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - padding_value (`int`, defaults to `0`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - precompute_ref_log_probs (`bool`, defaults to `False`): - Flag to precompute reference model log probabilities for training and evaluation datasets. This is useful if you want to train - without the reference model and reduce the total GPU memory needed. - model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string. - ref_model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the ref model from a string. - dataset_num_proc: (`Optional[int]`, *optional*, defaults to `None`): - Number of processes to use for processing the datasets. - prompt_sample_size: (`int`, defaults to 1024): + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. 
+ max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`Optional[int]`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from both the model and the reference model to W&B during + evaluation. + is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): + Whether to precompute reference model log probabilities for training and evaluation datasets. This is + useful when training without the reference model to reduce the total GPU memory needed. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model + from a string. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + prompt_sample_size (`int`, *optional*, defaults to `1024`): Number of prompts that are fed to density ratio classifier. - min_density_ratio: (`float`, defaults to 0.5): - The minimum value of the density ratio. The estimated density ratio is clamped to this value. - max_density_ratio: (`float`, defaults to 10.0): - The maximum value of the density ratio. The estimated density ratio is clamped to this value. + min_density_ratio (`float`, *optional*, defaults to `0.5`): + Minimum value of the density ratio. The estimated density ratio is clamped to this value. + max_density_ratio (`float`, *optional*, defaults to `10.0`): + Maximum value of the density ratio. The estimated density ratio is clamped to this value. """ max_length: Optional[int] = None - """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" max_prompt_length: Optional[int] = None - """The maximum length of the prompt. This argument is required if you want to use the default data collator.""" max_completion_length: Optional[int] = None - """The maximum length of the target. 
This argument is required if you want to use the default data collator and your model is an encoder-decoder.""" beta: float = 0.1 - """The beta factor in BCO loss. Higher beta means less divergence from the initial policy.""" - label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None precompute_ref_log_probs: bool = False - model_init_kwargs: Optional[Dict] = None - ref_model_init_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + ref_model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None - - # BCO config prompt_sample_size: int = 1024 min_density_ratio: float = 0.5 max_density_ratio: float = 10.0 - - def __post_init__(self): - super().__post_init__() - - if not is_sklearn_available(): - raise ImportError( - "You need to install scikit-learn to use `BCOTrainer` " - "You can install it with `pip install scikit-learn`." - ) diff --git a/trl/trainer/bco_trainer.py b/trl/trainer/bco_trainer.py index 3d74501a10..75b5553d08 100644 --- a/trl/trainer/bco_trainer.py +++ b/trl/trainer/bco_trainer.py @@ -327,8 +327,13 @@ def __init__( embedding_func: Optional[Callable] = None, embedding_tokenizer: Optional[PreTrainedTokenizerBase] = None, ): + if not is_sklearn_available(): + raise ImportError( + "BCOTrainer requires the scikit-learn library. Please install it with `pip install scikit-learn`." + ) + if type(args) is TrainingArguments: - raise ValueError("Please use `BCOConfig` instead TrainingArguments.") + raise ValueError("Please use `BCOConfig` instead `TrainingArguments`.") if args.model_init_kwargs is None: model_init_kwargs = {} diff --git a/trl/trainer/cpo_config.py b/trl/trainer/cpo_config.py index 5ba874b7a7..f61672de4e 100644 --- a/trl/trainer/cpo_config.py +++ b/trl/trainer/cpo_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Literal, Optional +from typing import Any, Dict, Literal, Optional from transformers import TrainingArguments @@ -20,70 +20,73 @@ @dataclass class CPOConfig(TrainingArguments): r""" - CPOConfig collects all training arguments related to the [`CPOTrainer`] class. + Configuration class for the [`CPOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_target_length (`int`, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in CPO loss. - label_smoothing (`float`, defaults to 0): - The label smoothing factor. This argument is required if you want to use the default data collator. - loss_type (`str`, defaults to `sigmoid`): - The type of loss to use. 
This argument is required if you want to use the default data collator. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - cpo_alpha (`float`, defaults to `1.0`): - A hyperparameter that controls the strength of the BC regularizer in CPO training. - simpo_gamma (`float`, defaults to `0.5`): - A target reward margin for the SimPO loss, used only when the "simpo" option is enabled. - padding_value (`int`, defaults to `None`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - disable_dropout (`bool`, defaults to `True`): - Whether or not to disable dropouts in `model`. - model_init_kwargs (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string - dataset_num_proc (`Optional[int]`, *optional*): - The number of workers to use to tokenize the data. Defaults to None. + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). + label_smoothing (`float`, *optional*, defaults to `0.0`): + Label smoothing factor. This argument is required if you want to use the default data collator. + loss_type (`str`, *optional*, defaults to `"sigmoid"`): + Type of loss to use. Possible values are: + + - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"hinge"`: hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. + - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper. + + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + cpo_alpha (`float`, *optional*, defaults to `1.0`): + Weight of the BC regularizer in CPO training. + simpo_gamma (`float`, *optional*, defaults to `0.5`): + Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. 
+ padding_value (`Optional[int]`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`,*optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B during evaluation. + is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. """ max_length: Optional[int] = None max_prompt_length: Optional[int] = None max_completion_length: Optional[int] = None - max_target_length: Optional[int] = None - beta: float = 0.1 - label_smoothing: float = 0 + label_smoothing: float = 0.0 loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid" disable_dropout: bool = True cpo_alpha: float = 1.0 simpo_gamma: float = 0.5 - label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None - - model_init_kwargs: Optional[Dict] = None - + model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None - - def __post_init__(self): - if self.loss_type == "kto_pair": - raise ValueError("Support for kto_pair has been removed in CPOTrainer. 
Please use KTOTrainer.") - return super().__post_init__() diff --git a/trl/trainer/cpo_trainer.py b/trl/trainer/cpo_trainer.py index 459c06c6d3..9771296cea 100644 --- a/trl/trainer/cpo_trainer.py +++ b/trl/trainer/cpo_trainer.py @@ -225,15 +225,15 @@ def make_inputs_require_grad(module, input, output): else: max_prompt_length = args.max_prompt_length - if args.max_target_length is None and self.is_encoder_decoder: + if args.max_completion_length is None and self.is_encoder_decoder: warnings.warn( - "When using an encoder decoder architecture, you should set `max_target_length` in the CPOConfig's init" + "When using an encoder decoder architecture, you should set `max_completion_length` in the CPOConfig's init" " it will default to `128` by default, but you should do it yourself in the future.", UserWarning, ) - max_target_length = 128 + max_completion_length = 128 else: - max_target_length = args.max_target_length + max_completion_length = args.max_completion_length if data_collator is None: data_collator = DPODataCollatorWithPadding( @@ -264,7 +264,7 @@ def make_inputs_require_grad(module, input, output): self.padding_value = args.padding_value if args.padding_value is not None else tokenizer.pad_token_id self.max_prompt_length = max_prompt_length self.truncation_mode = args.truncation_mode - self.max_target_length = max_target_length + self.max_completion_length = max_completion_length self.tokenizer = tokenizer if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0: @@ -495,10 +495,10 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module else: chosen_tokens = self.tokenizer( - chosen, truncation=True, max_length=self.max_target_length, add_special_tokens=True + chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True ) rejected_tokens = self.tokenizer( - rejected, truncation=True, max_length=self.max_target_length, add_special_tokens=True + rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True ) prompt_tokens = self.tokenizer( prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True diff --git a/trl/trainer/ddpo_config.py b/trl/trainer/ddpo_config.py index b73bd58d05..f186f662f8 100644 --- a/trl/trainer/ddpo_config.py +++ b/trl/trainer/ddpo_config.py @@ -10,93 +10,137 @@ @dataclass class DDPOConfig: - """ - Configuration class for DDPOTrainer + r""" + Configuration class for the [`DDPOTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`): + Name of this experiment (by default is the file name without the extension name). + run_name (`str`, *optional*, defaults to `""`): + Name of this run. + seed (`int`, *optional*, defaults to `0`): + Random seed. + log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + Log with either 'wandb' or 'tensorboard', check + https://huggingface.co/docs/accelerate/usage_guides/tracking for more details. + tracker_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g. wandb_project). + accelerator_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g. 
`logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + logdir (`str`, *optional*, defaults to `"logs"`): + Top-level logging directory for checkpoint saving. + num_epochs (`int`, *optional*, defaults to `100`): + Number of epochs to train. + save_freq (`int`, *optional*, defaults to `1`): + Number of epochs between saving model checkpoints. + num_checkpoint_limit (`int`, *optional*, defaults to `5`): + Number of checkpoints to keep before overwriting old ones. + mixed_precision (`str`, *optional*, defaults to `"fp16"`): + Mixed precision training. + allow_tf32 (`bool`, *optional*, defaults to `True`): + Allow `tf32` on Ampere GPUs. + resume_from (`str`, *optional*, defaults to `""`): + Resume training from a checkpoint. + sample_num_steps (`int`, *optional*, defaults to `50`): + Number of sampler inference steps. + sample_eta (`float`, *optional*, defaults to `1.0`): + Eta parameter for the DDIM sampler. + sample_guidance_scale (`float`, *optional*, defaults to `5.0`): + Classifier-free guidance weight. + sample_batch_size (`int`, *optional*, defaults to `1`): + Batch size (per GPU) to use for sampling. + sample_num_batches_per_epoch (`int`, *optional*, defaults to `2`): + Number of batches to sample per epoch. + train_batch_size (`int`, *optional*, defaults to `1`): + Batch size (per GPU) to use for training. + train_use_8bit_adam (`bool`, *optional*, defaults to `False`): + Use 8bit Adam optimizer from bitsandbytes. + train_learning_rate (`float`, *optional*, defaults to `3e-4`): + Learning rate. + train_adam_beta1 (`float`, *optional*, defaults to `0.9`): + Adam beta1. + train_adam_beta2 (`float`, *optional*, defaults to `0.999`): + Adam beta2. + train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`): + Adam weight decay. + train_adam_epsilon (`float`, *optional*, defaults to `1e-8`): + Adam epsilon. + train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + train_max_grad_norm (`float`, *optional*, defaults to `1.0`): + Maximum gradient norm for gradient clipping. + train_num_inner_epochs (`int`, *optional*, defaults to `1`): + Number of inner epochs per outer epoch. + train_cfg (`bool`, *optional*, defaults to `True`): + Whether or not to use classifier-free guidance during training. + train_adv_clip_max (`float`, *optional*, defaults to `5.0`): + Clip advantages to the range. + train_clip_range (`float`, *optional*, defaults to `1e-4`): + PPO clip range. + train_timestep_fraction (`float`, *optional*, defaults to `1.0`): + Fraction of timesteps to train on. + per_prompt_stat_tracking (`bool`, *optional*, defaults to `False`): + Whether to track statistics for each prompt separately. + per_prompt_stat_tracking_buffer_size (`int`, *optional*, defaults to `16`): + Number of reward values to store in the buffer for each prompt. + per_prompt_stat_tracking_min_count (`int`, *optional*, defaults to `16`): + Minimum number of reward values to store in the buffer. + async_reward_computation (`bool`, *optional*, defaults to `False`): + Whether to compute rewards asynchronously. + max_workers (`int`, *optional*, defaults to `2`): + Maximum number of workers to use for async reward computation. + negative_prompts (`Optional[str]`, *optional*, defaults to `""`): + Comma-separated list of prompts to use as negative examples. 
""" - # common parameters exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - """the name of this experiment (by default is the file name without the extension name)""" - run_name: Optional[str] = "" - """Run name for wandb logging and checkpoint saving.""" + run_name: str = "" seed: int = 0 - """Seed value for random generations""" log_with: Optional[Literal["wandb", "tensorboard"]] = None - """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details""" tracker_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the tracker (e.g. wandb_project)""" accelerator_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator""" project_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator project config (e.g. `logging_dir`)""" tracker_project_name: str = "trl" - """Name of project to use for tracking""" logdir: str = "logs" - """Top-level logging directory for checkpoint saving.""" - - # hyperparameters num_epochs: int = 100 - """Number of epochs to train.""" save_freq: int = 1 - """Number of epochs between saving model checkpoints.""" num_checkpoint_limit: int = 5 - """Number of checkpoints to keep before overwriting old ones.""" mixed_precision: str = "fp16" - """Mixed precision training.""" allow_tf32: bool = True - """Allow tf32 on Ampere GPUs.""" - resume_from: Optional[str] = "" - """Resume training from a checkpoint.""" + resume_from: str = "" sample_num_steps: int = 50 - """Number of sampler inference steps.""" sample_eta: float = 1.0 - """Eta parameter for the DDIM sampler.""" sample_guidance_scale: float = 5.0 - """Classifier-free guidance weight.""" sample_batch_size: int = 1 - """Batch size (per GPU!) to use for sampling.""" sample_num_batches_per_epoch: int = 2 - """Number of batches to sample per epoch.""" train_batch_size: int = 1 - """Batch size (per GPU!) 
to use for training.""" train_use_8bit_adam: bool = False - """Whether to use the 8bit Adam optimizer from bitsandbytes.""" train_learning_rate: float = 3e-4 - """Learning rate.""" train_adam_beta1: float = 0.9 - """Adam beta1.""" train_adam_beta2: float = 0.999 - """Adam beta2.""" train_adam_weight_decay: float = 1e-4 - """Adam weight decay.""" train_adam_epsilon: float = 1e-8 - """Adam epsilon.""" train_gradient_accumulation_steps: int = 1 - """Number of gradient accumulation steps.""" train_max_grad_norm: float = 1.0 - """Maximum gradient norm for gradient clipping.""" train_num_inner_epochs: int = 1 - """Number of inner epochs per outer epoch.""" train_cfg: bool = True - """Whether or not to use classifier-free guidance during training.""" - train_adv_clip_max: float = 5 - """Clip advantages to the range.""" + train_adv_clip_max: float = 5.0 train_clip_range: float = 1e-4 - """The PPO clip range.""" train_timestep_fraction: float = 1.0 - """The fraction of timesteps to train on.""" per_prompt_stat_tracking: bool = False - """Whether to track statistics for each prompt separately.""" per_prompt_stat_tracking_buffer_size: int = 16 - """Number of reward values to store in the buffer for each prompt.""" per_prompt_stat_tracking_min_count: int = 16 - """The minimum number of reward values to store in the buffer.""" async_reward_computation: bool = False - """Whether to compute rewards asynchronously.""" max_workers: int = 2 - """The maximum number of workers to use for async reward computation.""" - negative_prompts: Optional[str] = "" - """Comma-separated list of prompts to use as negative examples.""" + negative_prompts: str = "" def to_dict(self): output_dict = {} diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py index 67acb14ad5..a517c3cbfe 100644 --- a/trl/trainer/dpo_config.py +++ b/trl/trainer/dpo_config.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import warnings from dataclasses import dataclass from enum import Enum -from typing import Dict, Literal, Optional +from typing import Any, Dict, Literal, Optional from transformers import TrainingArguments @@ -32,15 +33,22 @@ class FDivergenceConstants: @dataclass class DPOConfig(TrainingArguments): r""" - Initialize DPOConfig. + Configuration class for the [`DPOTrainer`]. - Args: + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: beta (`float`, *optional*, defaults to `0.1`): - The beta factor in DPO loss. Higher beta means less divergence from the initial policy. For the IPO loss, beta is the regularization parameter denoted by tau in the paper. + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). label_smoothing (`float`, *optional*, defaults to `0.0`): - The robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/cdpo.pdf) report and [Robust DPO](https://huggingface.co/papers/2403.00409) paper that should be between 0 and 0.5. 
+ + Robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/cdpo.pdf) report and + [Robust DPO](https://huggingface.co/papers/2403.00409) paper that should be between `0.0` and `0.5`. loss_type (`str`, *optional*, defaults to `"sigmoid"`): - The type of DPO loss to use. Possible values are: + Type of loss to use. Possible values are: - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. - `"hinge"`: hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. @@ -56,56 +64,74 @@ class DPOConfig(TrainingArguments): - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper. label_pad_token_id (`int`, *optional*, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. + Label pad token id. This argument is required if you want to use the default data collator. padding_value (`Optional[int]`, *optional*, defaults to `None`): - The padding value if it is different to the tokenizer's pad_token_id. + Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`, *optional*, defaults to `"keep_end"`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. + Truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the + default data collator. max_length (`Optional[int]`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_target_length (`Optional[int]`, *optional*, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator and + your model is an encoder-decoder. is_encoder_decoder(`Optional[int]`, *optional*, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. disable_dropout (`bool`, *optional*, defaults to `True`): - Whether or not to disable dropouts in `model` and `ref_model`. + Whether to disable dropout in the model and reference model. generate_during_eval (`bool`, *optional*, defaults to `False`): - Whether to sample and log generations during evaluation step. + If `True`, generates and logs completions from both the model and the reference model to W&B during + evaluation. 
precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): - Flag to precompute reference model log probabilities for training and evaluation datasets. This is useful if you want to train - without the reference model and reduce the total GPU memory needed. + Whether to precompute reference model log probabilities for training and evaluation datasets. This is + useful when training without the reference model to reduce the total GPU memory needed. dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): - The number of workers to use to tokenize the data. Defaults to None. - model_init_kwargs (`Optional[Dict]`, *optional*, defaults to `None`): - Dict of Optional kwargs to pass when instantiating the model from a string - ref_model_init_kwargs (`Optional[Dict]`, *optional*, defaults to `None`): - Dict of Optional kwargs to pass when instantiating the ref model from a string + Number of processes to use for processing the dataset. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model + from a string. model_adapter_name (`Optional[str]`, *optional*, defaults to `None`): Name of the train target PEFT adapter, when using LoRA with multiple adapters. ref_adapter_name (`Optional[str]`, *optional*, defaults to `None`): Name of the reference PEFT adapter, when using LoRA with multiple adapters. reference_free (`bool`, *optional*, defaults to `False`): - If True, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal probability to all responses. + If `True`, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal + probability to all responses. force_use_ref_model (`bool`, *optional*, defaults to `False`): - In case one passes a PEFT model for the active model and you want to use a different model for the ref_model, set this flag to `True`. - f_divergence_type (`FDivergenceType`, *optional*, defaults to `FDivergenceType.REVERSE_KL`): - The type of f-divergence regularization function to compute divergence between policy and reference model. This argument is optional, defaults to `FDivergenceType.REVERSE_KL`. + In case one passes a PEFT model for the active model and you want to use a different model for the + ref_model, set this flag to `True`. + f_divergence_type (`str`, *optional*, defaults to `FDivergenceType.REVERSE_KL`): + Type of f-divergence regularization function to compute divergence between policy and reference model. f_alpha_divergence_coef (`float`, *optional*, defaults to `1.0`): - The alpha coef in alpha-divergence(u^-alpha) regularization function for DPO loss. - sync_ref_model ('bool', *optional*, defaults to `False`): - The flag for syncing reference model during training from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper. - ref_model_mixup_alpha ('float', *optional*, defaults to `1.0`): - The alpha parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper. - ref_model_sync_steps ('int', *optional*, defaults to `2`): - The tau parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper. 
- rpo_alpha ('float', *optional*, defaults to `None`): - The alpha parameter from the [RPO](https://huggingface.co/papers/2404.19733) paper V3. If None, no weighting is applied and the loss is the same as the DPO loss. The paper recommends `rpo_alpha=1.0`. + α coefficient in the α-divergence \\(u^{-\\alpha}\\) regularization function for DPO loss. + sync_ref_model (`bool`, *optional*, defaults to `False`): + When set to `True`, the reference model is synchronized with the active model every `ref_model_sync_steps` + steps, using the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.9`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. + To use this parameter, you must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `64`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + rpo_alpha (`float`, *optional*, defaults to `None`): + α parameter from the [RPO](https://huggingface.co/papers/2404.19733) paper (v3), which controls the + weighting of the NLL term in the loss. If `None`, no weighting is applied and the loss is the same as the + DPO loss. The paper recommends `rpo_alpha=1.0`. """ beta: float = 0.1 - label_smoothing: float = 0 + label_smoothing: float = 0.0 loss_type: Literal[ "sigmoid", "hinge", @@ -125,14 +151,15 @@ class DPOConfig(TrainingArguments): truncation_mode: str = "keep_end" max_length: Optional[int] = None max_prompt_length: Optional[int] = None - max_target_length: Optional[int] = None + max_target_length: Optional[int] = None # deprecated in favor of max_completion_length + max_completion_length: Optional[int] = None is_encoder_decoder: Optional[bool] = None disable_dropout: bool = True generate_during_eval: bool = False precompute_ref_log_probs: bool = False dataset_num_proc: Optional[int] = None - model_init_kwargs: Optional[Dict] = None - ref_model_init_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + ref_model_init_kwargs: Optional[Dict[str, Any]] = None model_adapter_name: Optional[str] = None ref_adapter_name: Optional[str] = None reference_free: bool = False @@ -145,6 +172,12 @@ class DPOConfig(TrainingArguments): rpo_alpha: Optional[float] = None def __post_init__(self): - if self.loss_type == "kto_pair": - raise ValueError("Support for kto_pair has been removed in DPOTrainer. 
Please use KTOTrainer.") + if self.max_target_length is not None: + warnings.warn( + "The `max_target_length` argument is deprecated in favor of `max_completion_length` and will be removed in a future version.", + FutureWarning, + ) + if self.max_completion_length is None: + self.max_completion_length = self.max_target_length + return super().__post_init__() diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py index eccc9496c2..f54538962d 100644 --- a/trl/trainer/dpo_trainer.py +++ b/trl/trainer/dpo_trainer.py @@ -254,8 +254,10 @@ def _tokenize_encoder_decoder( args: DPOConfig, model: Optional[PreTrainedModel], ) -> None: - chosen_tokens = tokenizer(chosen, truncation=True, max_length=args.max_target_length, add_special_tokens=True) - rejected_tokens = tokenizer(rejected, truncation=True, max_length=args.max_target_length, add_special_tokens=True) + chosen_tokens = tokenizer(chosen, truncation=True, max_length=args.max_completion_length, add_special_tokens=True) + rejected_tokens = tokenizer( + rejected, truncation=True, max_length=args.max_completion_length, add_special_tokens=True + ) prompt_tokens = tokenizer(prompt, truncation=True, max_length=args.max_prompt_length, add_special_tokens=True) batch["chosen_labels"] = chosen_tokens["input_ids"] @@ -692,14 +694,14 @@ def make_inputs_require_grad(module, input, output): warnings.warn( "You passed `max_target_length` to the DPOTrainer, the value you passed will override the one in the `DPOConfig`." ) - args.max_target_length = max_target_length - if args.max_target_length is None and self.is_encoder_decoder: + args.max_completion_length = max_target_length + if args.max_completion_length is None and self.is_encoder_decoder: warnings.warn( - "When using an encoder decoder architecture, you should set `max_target_length` in the DPOConfig's init" + "When using an encoder decoder architecture, you should set `max_completion_length` in the DPOConfig's init" " it will default to `128` by default, but you should do it yourself in the future.", UserWarning, ) - args.max_target_length = 128 + args.max_completion_length = 128 if label_pad_token_id != -100: warnings.warn( @@ -752,7 +754,7 @@ def make_inputs_require_grad(module, input, output): ) args.truncation_mode = truncation_mode self.truncation_mode = args.truncation_mode - self.max_target_length = args.max_target_length + self.max_completion_length = args.max_completion_length self.precompute_ref_log_probs = args.precompute_ref_log_probs # Since ref_logs are precomputed on the first call to get_train/eval_dataloader diff --git a/trl/trainer/kto_config.py b/trl/trainer/kto_config.py index c5a6893fce..64b030b585 100644 --- a/trl/trainer/kto_config.py +++ b/trl/trainer/kto_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Literal, Optional +from typing import Any, Dict, Literal, Optional from transformers import TrainingArguments @@ -20,74 +20,73 @@ @dataclass class KTOConfig(TrainingArguments): r""" - KTOConfig collects all training arguments related to the [`KTOTrainer`] class. + Configuration class for the [`KTOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. 
Parameters: + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. loss_type (`str`, *optional*, defaults to `"kto"`): - The type of unpaired loss to use. Possible values are: + Type of loss to use. Possible values are: - - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper. - - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper. - max_length (`int`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, *optional*, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`int`, *optional*, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in KTO loss. Higher beta means less divergence from the initial policy. - desirable_weight (`float`, *optional*, defaults to 1.0): - The desirable losses are weighed by this factor to counter unequal number of desirable and undesirable paris. - undesirable_weight (`float`, *optional*, defaults to 1.0): - The undesirable losses are weighed by this factor to counter unequal number of desirable and undesirable pairs. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - padding_value (`int`, defaults to `0`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - precompute_ref_log_probs (`bool`, defaults to `False`): - Flag to precompute reference model log probabilities for training and evaluation datasets. This is useful if you want to train - without the reference model and reduce the total GPU memory needed. - model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string. - ref_model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the ref model from a string. + - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper. 
+ - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+
+ desirable_weight (`float`, *optional*, defaults to `1.0`):
+ Desirable losses are weighed by this factor to counter the unequal number of desirable and undesirable pairs.
+ undesirable_weight (`float`, *optional*, defaults to `1.0`):
+ Undesirable losses are weighed by this factor to counter the unequal number of desirable and undesirable pairs.
+ label_pad_token_id (`int`, *optional*, defaults to `-100`):
+ Label pad token id. This argument is required if you want to use the default data collator.
+ padding_value (`Optional[int]`, *optional*, defaults to `None`):
+ Padding value to use. If `None`, the padding value of the tokenizer is used.
+ truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+ Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+ This argument is required if you want to use the default data collator.
+ generate_during_eval (`bool`, *optional*, defaults to `False`):
+ If `True`, generates and logs completions from both the model and the reference model to W&B during
+ evaluation.
+ is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+ When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+ you need to specify if the model returned by the callable is an encoder-decoder model.
+ precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
+ Whether to precompute reference model log probabilities for training and evaluation datasets. This is
+ useful when training without the reference model to reduce the total GPU memory needed.
+ model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+ string.
+ ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
+ from a string.
dataset_num_proc: (`Optional[int]`, *optional*, defaults to `None`):
- Number of processes to use for processing the datasets.
+ Number of processes to use for processing the dataset.
"""
- loss_type: Literal[
- "kto",
- "apo_zero_unpaired",
- ] = "kto"
max_length: Optional[int] = None
- """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator."""
max_prompt_length: Optional[int] = None
- """The maximum length of the prompt. This argument is required if you want to use the default data collator."""
max_completion_length: Optional[int] = None
- """The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder."""
beta: float = 0.1
- """The beta factor in KTO loss.
Higher beta means less divergence from the initial policy.""" - desirable_weight: Optional[float] = 1.0 - """The desirable losses are weighed by this factor.""" - undesirable_weight: Optional[float] = 1.0 - """The undesirable losses are weighed by this factor.""" - + loss_type: Literal["kto", "apo_zero_unpaired"] = "kto" + desirable_weight: float = 1.0 + undesirable_weight: float = 1.0 label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None precompute_ref_log_probs: bool = False - model_init_kwargs: Optional[Dict] = None - ref_model_init_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + ref_model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py index bc0caf93d2..2bca3f84bf 100644 --- a/trl/trainer/model_config.py +++ b/trl/trainer/model_config.py @@ -1,95 +1,80 @@ -from dataclasses import dataclass, field -from typing import List, Optional - -from ..core import flatten_dict +from dataclasses import dataclass +from typing import List, Literal, Optional @dataclass class ModelConfig: """ - Arguments which define the model and tokenizer to load. - """ + Configuration class for the models. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. - model_name_or_path: Optional[str] = field( - default=None, - metadata={"help": ("The model checkpoint for weights initialization.")}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - torch_dtype: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." 
- ), - "choices": ["auto", "bfloat16", "float16", "float32"], - }, - ) - trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."}) - attn_implementation: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`" - ) - }, - ) - use_peft: bool = field( - default=False, - metadata={"help": ("Whether to use PEFT or not for training.")}, - ) - lora_r: Optional[int] = field( - default=16, - metadata={"help": ("LoRA R value.")}, - ) - lora_alpha: Optional[int] = field( - default=32, - metadata={"help": ("LoRA alpha.")}, - ) - lora_dropout: Optional[float] = field( - default=0.05, - metadata={"help": ("LoRA dropout.")}, - ) - lora_target_modules: Optional[List[str]] = field( - default=None, - metadata={"help": ("LoRA target modules.")}, - ) - lora_modules_to_save: Optional[List[str]] = field( - default=None, - metadata={"help": ("Model layers to unfreeze & train")}, - ) - lora_task_type: str = field( - default="CAUSAL_LM", metadata={"help": "The task_type to pass for LoRA (use SEQ_CLS for reward modeling)"} - ) - use_rslora: bool = field( - default=False, - metadata={ - "help": ( - "Use Rank-Stabilized LoRA (https://huggingface.co/papers/2312.03732), which sets the adapter " - "scaling factor to lora_alpha/√r, instead of the original default value of `lora_alpha/r`." - ) - }, - ) - load_in_8bit: bool = field( - default=False, metadata={"help": "use 8 bit precision for the base model - works only with LoRA"} - ) - load_in_4bit: bool = field( - default=False, metadata={"help": "use 4 bit precision for the base model - works only with LoRA"} - ) + Parameters: + model_name_or_path (`Optional[str]`, *optional*, defaults to `None`): + Model checkpoint for weights initialization. + model_revision (`str`, *optional*, defaults to `"main"`): + Specific model version to use. It can be a branch name, a tag name, or a commit id. + torch_dtype (`Optional[Literal["auto", "bfloat16", "float16", "float32"]]`, *optional*, defaults to `None`): + Override the default `torch.dtype` and load the model under this dtype. Possible values are - bnb_4bit_quant_type: Optional[str] = field( - default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"} - ) - use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"}) + - `"bfloat16"`: `torch.bfloat16` + - `"float16"`: `torch.float16` + - `"float32"`: `torch.float32` + - `"auto"`: Automatically derive the dtype from the model's weights. + + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether to allow for custom models defined on the Hub in their own modeling files. This option should only + be set to `True` for repositories you trust and in which you have read the code, as it will execute code + present on the Hub on your local machine. + attn_implementation (`Optional[str]`, *optional*, defaults to `None`): + Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case + you must install this manually by running `pip install flash-attn --no-build-isolation`. + use_peft (`bool`, *optional*, defaults to `False`): + Whether to use PEFT for training. + lora_r (`int`, *optional*, defaults to `16`): + LoRA R value. + lora_alpha (`int`, *optional*, defaults to `32`): + LoRA alpha. 
+ lora_dropout (`float`, *optional*, defaults to `0.05`): + LoRA dropout. + lora_target_modules (`Optional[Union[str, List[str]]]`, *optional*, defaults to `None`): + LoRA target modules. + lora_modules_to_save (`Optional[List[str]]`, *optional*, defaults to `None`): + Model layers to unfreeze & train. + lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`): + Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling). + use_rslora (`bool`, *optional*, defaults to `False`): + Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/√r`, instead of + the original default value of `lora_alpha/r`. + load_in_8bit (`bool`, *optional*, defaults to `False`): + Whether to use 8 bit precision for the base model. Works only with LoRA. + load_in_4bit (`bool`, *optional*, defaults to `False`): + Whether to use 4 bit precision for the base model. Works only with LoRA. + bnb_4bit_quant_type (`str`, *optional*, defaults to `"nf4"`): + Quantization type (`"fp4"` or `"nf4"`). + use_bnb_nested_quant (`bool`, *optional*, defaults to `False`): + Whether to use nested quantization. + """ - def to_dict(self): - output_dict = {} - for key, value in self.__dict__.items(): - output_dict[key] = value - return flatten_dict(output_dict) + model_name_or_path: Optional[str] = None + model_revision: str = "main" + torch_dtype: Optional[Literal["auto", "bfloat16", "float16", "float32"]] = None + trust_remote_code: bool = False + attn_implementation: Optional[str] = None + use_peft: bool = False + lora_r: int = 16 + lora_alpha: int = 32 + lora_dropout: float = 0.05 + lora_target_modules: Optional[List[str]] = None + lora_modules_to_save: Optional[List[str]] = None + lora_task_type: str = "CAUSAL_LM" + use_rslora: bool = False + load_in_8bit: bool = False + load_in_4bit: bool = False + bnb_4bit_quant_type: Literal["fp4", "nf4"] = "nf4" + use_bnb_nested_quant: bool = False def __post_init__(self): if self.load_in_8bit and self.load_in_4bit: diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py index f1c1ccab4a..fca33c4dcc 100644 --- a/trl/trainer/online_dpo_config.py +++ b/trl/trainer/online_dpo_config.py @@ -13,29 +13,33 @@ class OnlineDPOConfig(TrainingArguments): [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. - Args: + Parameters: reward_model_path (`Optional[str]`, *optional*, defaults to `None`): Path to the reward model. max_new_tokens (`int`, *optional*, defaults to `64`): - The maximum number of tokens to generate per completion. + Maximum number of tokens to generate per completion. temperature (`float`, *optional*, defaults to `0.9`): Temperature for sampling. The higher the temperature, the more random the completions. missing_eos_penalty (`Optional[float]`, *optional*, defaults to `None`): - Penalty when the model fails to generate an EOS token. + Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage + to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive + value. beta (`float`, *optional*, defaults to `0.1`): - Beta parameter for the DPO loss. + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). 
loss_type (`str`, *optional*, defaults to `"sigmoid"`): - Type of DPO loss to use. Possible values are: + Type of loss to use. Possible values are: - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): - Number of workers to use to process the data. + Number of processes to use for processing the dataset. """ reward_model_path: Optional[str] = None - max_new_tokens: int = 53 + max_new_tokens: int = 64 temperature: float = 0.9 missing_eos_penalty: Optional[float] = None beta: float = 0.1 diff --git a/trl/trainer/orpo_config.py b/trl/trainer/orpo_config.py index 14be7ee1a2..6cc54ed919 100644 --- a/trl/trainer/orpo_config.py +++ b/trl/trainer/orpo_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Optional +from typing import Any, Dict, Optional from transformers import TrainingArguments @@ -20,52 +20,54 @@ @dataclass class ORPOConfig(TrainingArguments): r""" - ORPOConfig collects all training arguments related to the [`ORPOTrainer`] class. + Configuration class for the [`ORPOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`int`, defaults to `None`): - The maximum length of the completions. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in ORPO loss (lambda/alpha in paper/code) that is the weight of the relative loss ratio in the SFT loss. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - padding_value (`int`, defaults to `None`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - disable_dropout (`bool`, defaults to `True`): - Whether or not to disable dropouts in `model`. - model_init_kwargs (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string - dataset_num_proc (`Optional[int]`, *optional*): - The number of workers to use to tokenize the data. Defaults to None. + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. 
This argument is required if you want + to use the default data collator. + max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the relative ratio loss weight in the ORPO loss. In the [paper](https://huggingface.co/papers/2403.07691), + it is denoted by λ. In the [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`Optional[int]`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B during evaluation. + is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. """ max_length: Optional[int] = None max_prompt_length: Optional[int] = None max_completion_length: Optional[int] = None - beta: float = 0.1 disable_dropout: bool = True - label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None - - model_init_kwargs: Optional[Dict] = None - + model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py index 38aee8e2b3..147ab5720b 100644 --- a/trl/trainer/ppo_config.py +++ b/trl/trainer/ppo_config.py @@ -33,112 +33,164 @@ @dataclass class PPOConfig: - """ - Configuration class for PPOTrainer + r""" + Configuration class for the [`PPOTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`): + Name of this experiment. + seed (`int`, *optional*, defaults to `0`): + Random seed. + log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + Log with either `"wandb"` or `"tensorboard"`. 
Check + [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. + task_name (`Optional[str]`, *optional*, defaults to `None`): + Name of task to use - used only for tracking purposes. + model_name (`Optional[str]`, *optional*, defaults to `"gpt2"`): + Name of model to use - used only for tracking purposes. + query_dataset (`Optional[str]`, *optional*, defaults to `"imdb"`): + Name of dataset to query - used only for tracking purposes. + reward_model (`Optional[str]`, *optional*, defaults to `"sentiment-analysis:lvwerra/distilbert-imdb"`): + Reward model to use - used only for tracking purposes. + remove_unused_columns (`bool`, *optional*, defaults to `True`): + Remove unused columns from the dataset. + tracker_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g. `python ppo.py --tracker_kwargs='{"wandb": {"entity": "my_wandb_entity", "name": "my_exp_name"}}'`. + accelerator_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g. `logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + push_to_hub_if_best_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for pushing model to the hub during training (e.g. repo_id). + steps (`int`, *optional*, defaults to `20000`): + Number of training steps. + learning_rate (`float`, *optional*, defaults to `1.41e-5`): + Learning rate for the optimizer. + adap_kl_ctrl (`bool`, *optional*, defaults to `True`): + Use adaptive KL control, otherwise linear. + init_kl_coef (`Optional[float]`, *optional*, defaults to `0.2`): + Initial KL penalty coefficient (used for adaptive and linear control). + kl_penalty (`Literal["kl", "abs", "mse", "full"]`, *optional*, defaults to `"kl"`): + kl penalty options. Possible values are: + + - `"kl"`: model_logp - ref_logp + - `"abs"`: abs(kl) + - `"mse"`: mean squared error mse(kl) + - `"full"`: the actual kl for all tokens in the distribution. + + target (`float`, *optional*, defaults to `6.0`): + Target KL value for adaptive KL control. + horizon (`float`, *optional*, defaults to `10000.0`): + Horizon for adaptive KL control. + gamma (`float`, *optional*, defaults to `1.0`): + Gamma parameter for advantage calculation. + lam (`float`, *optional*, defaults to `0.95`): + Lambda parameter for advantage calculation. + cliprange (`float`, *optional*, defaults to `0.2`): + Range for clipping in PPO policy gradient loss. + cliprange_value (`float`, *optional*, defaults to `0.2`): + Range for clipping values in loss calculation. + vf_coef (`float`, *optional*, defaults to `0.1`): + Scaling factor for value loss. + batch_size (`int`, *optional*, defaults to `128`): + Number of samples per optimisation step. + forward_batch_size (`Optional[int]`, *optional*, defaults to `None`): + DEPRECATED: use `mini_batch_size` instead, which does the same thing. + mini_batch_size (`int`, *optional*, defaults to `128`): + Number of samples optimized in each mini batch. + gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + world_size (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for distributed training. + ppo_epochs (`int`, *optional*, defaults to `4`): + Number of optimisation epochs per batch of samples. 
+
+ optimize_device_cache (`bool`, *optional*, defaults to `False`):
+ Optimize device cache for slightly more memory-efficient training.
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether to stop the PPO optimization loop early if the KL is too high.
+ target_kl (`float`, *optional*, defaults to `1.0`):
+ Stop early if we exceed this value by over 50%.
+ compare_steps (`int`, *optional*, defaults to `1`):
+ Number of steps between comparison of the current reward with the best seen so far.
+ ratio_threshold (`float`, *optional*, defaults to `10.0`):
+ Skip mini-batches with high PPO ratios that can cause loss spikes.
+ use_score_scaling (`bool`, *optional*, defaults to `False`):
+ Use score scaling.
+ use_score_norm (`bool`, *optional*, defaults to `False`):
+ Use score normalization. Only applicable if `use_score_scaling` is `True`.
+ score_clip (`Optional[float]`, *optional*, defaults to `None`):
+ Score clipping.
+ whiten_rewards (`bool`, *optional*, defaults to `False`):
+ Whiten the rewards before computing advantages.
+ is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+ When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+ you need to specify if the model returned by the callable is an encoder-decoder model.
+ is_peft_model (`Optional[bool]`, *optional*, defaults to `None`):
+ Whether the model is a PEFT model.
+ backward_batch_size (`Optional[int]`, *optional*, defaults to `None`):
+ Number of samples optimized in an `optimizer.step()` call.
+ global_backward_batch_size (`Optional[int]`, *optional*, defaults to `None`):
+ Effective `backward_batch_size` across all processes.
+ global_batch_size (`Optional[int]`, *optional*, defaults to `None`):
+ Effective `batch_size` across all processes.
+ dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
+ Number of processes to use for processing the dataset.
"""
- # common parameters
exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")]
- """the name of this experiment (by default is the file name without the extension name)"""
seed: int = 0
- """Seed value for random generations"""
log_with: Optional[Literal["wandb", "tensorboard"]] = None
- """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details"""
task_name: Optional[str] = None
- """Name of task to use - used only for tracking purposes"""
- model_name: Optional[str] = "gpt2"
- """Name of model to use - used only for tracking purposes"""
- query_dataset: Optional[str] = "imdb"
- """Name of dataset to query - used only for tracking purposes"""
- reward_model: Optional[str] = "sentiment-analysis:lvwerra/distilbert-imdb"
- """The reward model to use - used only for tracking purposes"""
+ model_name: str = "gpt2"
+ query_dataset: str = "imdb"
+ reward_model: str = "sentiment-analysis:lvwerra/distilbert-imdb"
remove_unused_columns: bool = True
- """Remove unused columns from the dataset if `datasets.Dataset` is used"""
tracker_kwargs: JSONDict = field(default_factory=dict)
- """Keyword arguments for the tracker (e.g. python ppo.py --tracker_kwargs='{"wandb": {"entity": "my_wandb_entity", "name": "my_exp_name"}}'"""
accelerator_kwargs: JSONDict = field(default_factory=dict)
- """Keyword arguments for the accelerator"""
project_kwargs: JSONDict = field(default_factory=dict)
- """Keyword arguments for the accelerator project config (e.g.
`logging_dir`)""" tracker_project_name: str = "trl" - """Name of project to use for tracking""" push_to_hub_if_best_kwargs: JSONDict = field(default_factory=dict) - """Keyword arguments for pushing model to the hub during training (e.g. repo_id)""" - - # hyperparameters steps: int = 20000 - """Number of training steps""" learning_rate: float = 1.41e-5 - """Adam learning rate""" adap_kl_ctrl: bool = True - """Use adaptive KL control, otherwise linear""" - init_kl_coef: Optional[float] = 0.2 - """Initial KL penalty coefficient (used for adaptive and linear control)""" + init_kl_coef: float = 0.2 kl_penalty: Literal["kl", "abs", "mse", "full"] = "kl" - """kl penalty options: 'kl': model_logp - ref_logp, 'abs': abs(kl), 'mse': mean squared error mse(kl) and 'full': the actual kl for all tokens in the distribution""" - target: Optional[float] = 6 - """Target KL value for adaptive KL control""" - horizon: Optional[float] = 10000 - """Horizon for adaptive KL control""" - gamma: float = 1 - """Gamma parameter for advantage calculation""" + target: float = 6.0 + horizon: float = 10000.0 + gamma: float = 1.0 lam: float = 0.95 - """Lambda parameter for advantage calculation""" cliprange: float = 0.2 - """Range for clipping in PPO policy gradient loss""" cliprange_value: float = 0.2 - """Range for clipping values in loss calculation""" vf_coef: float = 0.1 - """Scaling factor for value loss""" batch_size: int = 128 - """Number of samples per optimisation step""" forward_batch_size: Optional[int] = None - """DEPRECATED: use `mini_batch_size` instead, which does the same thing.""" mini_batch_size: int = 128 - """Number of samples optimized in each mini batch""" gradient_accumulation_steps: int = 1 - """The number of gradient accumulation steps""" world_size: tyro.conf.Suppress[int] = None - """The world size for distributed training""" ppo_epochs: int = 4 - """Number of optimisation epochs per batch of samples""" max_grad_norm: Optional[float] = None - """Maximum gradient norm for gradient clipping""" optimize_cuda_cache: Optional[bool] = None - """DEPRECATED: use `optimize_device_cache` instead, which does the same thing.""" - optimize_device_cache: Optional[bool] = False - """Optimize device cache for slightly more memory-efficient training""" + optimize_device_cache: bool = False early_stopping: bool = False - """Whether to stop the PPO optimization loop early is the KL too high""" - target_kl: float = 1 - """Stop early if we exceed this value by over 50%""" + target_kl: float = 1.0 compare_steps: int = 1 - """Number of steps between comparison of the current reward with the best seen so far""" ratio_threshold: float = 10.0 - """Skip mini-batches with high PPO ratios that can cause loss spikes""" use_score_scaling: bool = False - """Use score scaling""" use_score_norm: bool = False - """Use score normalization. 
Only applicable if use_score_scaling is True""" score_clip: Optional[float] = None - """Score clipping""" whiten_rewards: bool = False - """Whiten the rewards before compute advantages""" gradient_checkpointing: bool = False - """Enable gradient checkpointing""" - - # computed hyperparameters at runtime; we use `tyro.conf.Suppress` to hide them from the help text is_encoder_decoder: Optional[tyro.conf.Suppress[bool]] = None - """TO BE FILLED In RUNTIME: Whether the model is an encoder-decoder model""" is_peft_model: Optional[tyro.conf.Suppress[bool]] = None - """TO BE FILLED In RUNTIME: Whether the model is a PEFT model""" backward_batch_size: tyro.conf.Suppress[int] = None - """TO BE FILLED In RUNTIME: Number of samples optimized in an `optimizer.step()` call""" - global_backward_batch_size: tyro.conf.Suppress[int] = None - """TO BE FILLED In RUNTIME: the effective `backward_batch_size` across all processes""" + global_backward_batch_size: Optional[tyro.conf.Suppress[int]] = None global_batch_size: tyro.conf.Suppress[int] = None - """TO BE FILLED In RUNTIME: the effective `batch_size` across all processes""" - dataset_num_proc: Optional[int] = None if optimize_cuda_cache is not None: diff --git a/trl/trainer/ppov2_config.py b/trl/trainer/ppov2_config.py index 05247f4fae..944d247e7c 100644 --- a/trl/trainer/ppov2_config.py +++ b/trl/trainer/ppov2_config.py @@ -6,25 +6,43 @@ @dataclass class PPOv2Config(OnPolicyConfig): + r""" + Configuration class for the [`PPOv2Trainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`): + Name of this experiment. + reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the reward model. + num_ppo_epochs (`int`, *optional*, defaults to `4`): + Number of epochs to train. + whiten_rewards (`bool`, *optional*, defaults to `False`): + Whether to whiten the rewards. + kl_coef (`float`, *optional*, defaults to `0.05`): + KL coefficient. + cliprange (`float`, *optional*, defaults to `0.2`): + Clip range. + vf_coef (`float`, *optional*, defaults to `0.1`): + Value function coefficient. + cliprange_value (`float`, *optional*, defaults to `0.2`): + Clip range for the value function. + gamma (`float`, *optional*, defaults to `1.0`): + Discount factor. + lam (`float`, *optional*, defaults to `0.95`): + Lambda value for GAE. 
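A rough usage sketch of the `PPOv2Config` fields documented above; the values simply spell out the documented defaults, `output_dir` is a hypothetical path, and the import path assumes the `trl` version targeted by this patch:

```python
# Sketch: instantiate PPOv2Config with the documented defaults made explicit.
from trl import PPOv2Config

config = PPOv2Config(
    output_dir="ppov2_out",  # inherited from TrainingArguments via OnPolicyConfig
    reward_model_path="EleutherAI/pythia-160m",
    num_ppo_epochs=4,
    whiten_rewards=False,
    kl_coef=0.05,
    cliprange=0.2,
    vf_coef=0.1,
    cliprange_value=0.2,
    gamma=1.0,
    lam=0.95,
)
```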
+ """ + exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" reward_model_path: str = "EleutherAI/pythia-160m" - """the path to the reward model""" - - # ppo config num_ppo_epochs: int = 4 - """the number of epochs to train""" whiten_rewards: bool = False - """whether to whiten the rewards""" kl_coef: float = 0.05 - """the KL coefficient""" cliprange: float = 0.2 - """the clip range""" vf_coef: float = 0.1 - """the value function coefficient""" cliprange_value: float = 0.2 - """the clip range for the value function""" - gamma: float = 1 - """the discount factor""" + gamma: float = 1.0 lam: float = 0.95 - """the lambda value for GAE""" diff --git a/trl/trainer/reward_config.py b/trl/trainer/reward_config.py index 6e3975b42d..8eaa0bdcba 100644 --- a/trl/trainer/reward_config.py +++ b/trl/trainer/reward_config.py @@ -20,22 +20,24 @@ @dataclass class RewardConfig(TrainingArguments): - """ - RewardConfig collects all training arguments related to the [`RewardTrainer`] class. + r""" + Configuration class for the [`RewardTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - gradient_checkpointing (`bool`, *optional*, defaults to `True`): - If True, use gradient checkpointing to save memory at the expense of slower backward pass. + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + dataset_num_proc (`int`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + center_rewards_coefficient (`float`, *optional*, defaults to `None`): + Coefficient to incentivize the reward model to output mean-zero rewards (proposed by + https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`. """ max_length: Optional[int] = None - """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" dataset_num_proc: Optional[int] = None - """Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.""" center_rewards_coefficient: Optional[float] = None diff --git a/trl/trainer/rloo_config.py b/trl/trainer/rloo_config.py index e629d84afa..ee0e3f7b60 100644 --- a/trl/trainer/rloo_config.py +++ b/trl/trainer/rloo_config.py @@ -6,21 +6,34 @@ @dataclass class RLOOConfig(OnPolicyConfig): + r""" + Configuration class for the [`RLOOTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`): + Name of this experiment. + reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the reward model. + num_ppo_epochs (`int`, *optional*, defaults to `4`): + Number of epochs to train. 
+ whiten_rewards (`bool`, *optional*, defaults to `False`): + Whether to whiten the rewards. + kl_coef (`float`, *optional*, defaults to `0.05`): + KL coefficient. + cliprange (`float`, *optional*, defaults to `0.2`): + Clip range. + rloo_k (`int`, *optional*, defaults to `2`): + REINFORCE Leave-One-Out (RLOO) number of online samples per prompt. + """ + exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" reward_model_path: str = "EleutherAI/pythia-160m" - """the path to the reward model""" - - # ppo config num_ppo_epochs: int = 4 - """the number of epochs to train""" whiten_rewards: bool = False - """whether to whiten the rewards""" kl_coef: float = 0.05 - """the KL coefficient""" cliprange: float = 0.2 - """the clip range""" - - # rloo config rloo_k: int = 2 - """REINFORCE Leave-One-Out (RLOO) number of online samples per prompt""" diff --git a/trl/trainer/sft_config.py b/trl/trainer/sft_config.py index 132a0c69d9..f0f2df1985 100644 --- a/trl/trainer/sft_config.py +++ b/trl/trainer/sft_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Optional +from typing import Any, Dict, Optional from transformers import TrainingArguments @@ -20,50 +20,56 @@ @dataclass class SFTConfig(TrainingArguments): r""" - Initialize SFTConfig. + Configuration class for the [`SFTTrainer`]. - Args: - dataset_text_field (`Optional[str]`): - The name of the text field of the dataset, in case this is passed by a user, the trainer will automatically create a - `ConstantLengthDataset` based on the `dataset_text_field` argument. Defaults to None. - packing (`Optional[bool]`): - Used only in case `dataset_text_field` is passed. This argument is used by the `ConstantLengthDataset` to pack the sequences - of the dataset. Defaults to False. - max_seq_length (`Optional[int]`): - The maximum sequence length to use for the `ConstantLengthDataset` and for automatically creating the Dataset. Defaults to min of the smaller of the `tokenizer.model_max_length` and `1024`. - dataset_num_proc (`Optional[int]`): - The number of workers to use to tokenize the data. Only used when `packing=False`. Defaults to None. - dataset_batch_size (`int`): - The number of examples to tokenize per batch. If batch_size <= 0 or batch_size == None, - tokenize the full dataset as a single batch. Defaults to 1000. - neftune_noise_alpha (`Optional[float]`): - If not `None`, this will activate NEFTune noise embeddings. This has been proven to drastically improve model performances for instruction - fine-tuning. Check out the original paper here: https://huggingface.co/papers/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune - model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string. - dataset_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when creating packed or non-packed datasets - eval_packing: (`Optional[bool]`, *optional*): - Whether to pack the eval dataset as well. Defaults to `packing` if `None` is passed. - num_of_sequences (`Optional[int]`): - The number of sequences to use for the `ConstantLengthDataset`. Defaults to `1024`. - chars_per_token (`Optional[float]`): - The number of characters per token to use for the `ConstantLengthDataset`. Defaults to `3.6`. 
You can check how this is computed in the - stack-llama example: - [chars_token_ratio](https://github.com/huggingface/trl/blob/08f550674c553c36c51d1027613c29f14f3676a5/examples/stack_llama/scripts/supervised_finetuning.py#L53). - use_liger (`Optional[bool]`): + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + dataset_text_field (`Optional[str]`, *optional*, defaults to `None`): + Name of the text field of the dataset. If provided, the trainer will automatically create a + [`ConstantLengthDataset`] based on `dataset_text_field`. + packing (`bool`, *optional*, defaults to `False`): + Used only when `dataset_text_field` is provided. Controls whether the [`ConstantLengthDataset`] packs + the sequences of the dataset. + max_seq_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum sequence length for the [`ConstantLengthDataset`] and for automatically creating the dataset. If + `None`, it uses the smaller value between `tokenizer.model_max_length` and `1024`. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. Only used when `packing=False`. + dataset_batch_size (`Union[int, None]`, *optional*, defaults to `1000`): + Number of examples to tokenize per batch. If `dataset_batch_size <= 0` or `dataset_batch_size is None`, + tokenizes the full dataset as a single batch. + neftune_noise_alpha (`Optional[float]`, *optional*, defaults to `None`): + Scale of the noise for NEFTune embeddings. The [NEFTune paper](https://huggingface.co/papers/2310.05914) + suggests using values between `5` and `15`. If set to `None`, NEFTune is not activated. Activating NEFTune + can significantly improve model performance for instruction fine-tuning. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Dictionary of optional keyword arguments to pass when creating packed or non-packed datasets. + eval_packing (`Optional[bool]`, *optional*, defaults to `None`): + Whether to pack the eval dataset. If `None`, uses the same value as `packing`. + num_of_sequences (`int`, *optional*, defaults to `1024`): + Number of sequences to use for the [`ConstantLengthDataset`]. + chars_per_token (`float`, *optional*, defaults to `3.6`): + Number of characters per token to use for the [`ConstantLengthDataset`]. See + [chars_token_ratio](https://github.com/huggingface/trl/blob/08f550674c553c36c51d1027613c29f14f3676a5/examples/stack_llama/scripts/supervised_finetuning.py#L53) for more details. + use_liger (`bool`, *optional*, defaults to `False`): Monkey patch the model with Liger kernels to increase throughput and reduce memory usage. 
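A short sketch of how the packing-related `SFTConfig` fields above fit together; the dataset column name and `output_dir` are hypothetical, and only fields that appear in this diff are used:

```python
# Sketch: SFTConfig with packing enabled, driving a ConstantLengthDataset under the hood.
from trl import SFTConfig

sft_config = SFTConfig(
    output_dir="sft_out",
    dataset_text_field="text",  # dataset column holding the raw text
    packing=True,               # pack sequences into constant-length chunks
    max_seq_length=512,
    num_of_sequences=1024,      # buffer size used by ConstantLengthDataset
    chars_per_token=3.6,        # characters-per-token estimate for the buffer
)
```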
""" dataset_text_field: Optional[str] = None - packing: Optional[bool] = False + packing: bool = False max_seq_length: Optional[int] = None dataset_num_proc: Optional[int] = None dataset_batch_size: int = 1000 neftune_noise_alpha: Optional[float] = None - model_init_kwargs: Optional[Dict] = None - dataset_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + dataset_kwargs: Optional[Dict[str, Any]] = None eval_packing: Optional[bool] = None - num_of_sequences: Optional[int] = 1024 - chars_per_token: Optional[float] = 3.6 - use_liger: Optional[bool] = False + num_of_sequences: int = 1024 + chars_per_token: float = 3.6 + use_liger: bool = False diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py index fb180a9da8..b6b709d41a 100644 --- a/trl/trainer/utils.py +++ b/trl/trainer/utils.py @@ -470,11 +470,11 @@ class ConstantLengthDataset(IterableDataset): Number of characters per token used to estimate number of tokens in text buffer. eos_token_id (`int`, *optional*, defaults to `0`): Id of the end of sequence token if the passed tokenizer does not have an EOS token. - shuffle ('bool', *optional*, defaults to True) + shuffle (`bool`, *optional*, defaults to True) Shuffle the examples before they are returned - append_concat_token ('bool', *optional*, defaults to True) + append_concat_token (`bool`, *optional*, defaults to True) If true, appends `eos_token_id` at the end of each sample being packed. - add_special_tokens ('bool', *optional*, defaults to True) + add_special_tokens (`bool`, *optional*, defaults to True) If true, tokenizers adds special tokens to each sample being packed. """ @@ -890,54 +890,79 @@ class OnlineTrainerState(TrainerState): @dataclass class OnPolicyConfig(TrainingArguments): - # common config + r""" + Base configuration class for on-policy trainers. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + run_name (`Optional[str]`, *optional*, defaults to `None`): + Name of the run. + sanity_check (`bool`, *optional*, defaults to `False`): + Whether to run in debug mode. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + num_mini_batches (`int`, *optional*, defaults to `1`): + Number of minibatches to split a batch into. + total_episodes (`Optional[int]`, *optional*, defaults to `None`): + Total number of episodes in the dataset. + local_rollout_forward_batch_size (`int`, *optional*, defaults to `64`): + Per rank no grad forward pass in the rollout phase. + num_sample_generations (`int`, *optional*, defaults to `10`): + Number of debugging samples generations (i.e., `generate_completions` calls) throughout training. + response_length (`int`, *optional*, defaults to `53`): + Length of the response. + stop_token (`Optional[str]`, *optional*, defaults to `None`): + Stop token. + stop_token_id (`Optional[int]`, *optional*, defaults to `None`): + Truncation token id. + temperature (`float`, *optional*, defaults to `0.7`): + Sampling temperature. + penalty_reward_value (`int`, *optional*, defaults to `-1`): + Reward value for responses that do not contain `stop_token_id`. + non_eos_penalty (`bool`, *optional*, defaults to `False`): + Whether to penalize responses that do not contain `stop_token_id`. + sft_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the SFT model. 
+ world_size (`Optional[int]`, *optional*, defaults to `None`): + Number of processes (GPUs) to use for the training. + num_total_batches (`Optional[int]`, *optional*, defaults to `None`): + Number of total batches to train. + micro_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`). + local_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`). + batch_size (`Optional[int]`, *optional*, defaults to `None`): + Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * `gradient_accumulation_steps`). + local_mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Mini batch size per GPU. + mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Mini batch size across GPUs. + """ + run_name: Optional[str] = None - """a unique name of this run""" sanity_check: bool = False - """wether to run in debug mode""" dataset_num_proc: Optional[int] = None - - # batch size related config num_mini_batches: int = 1 - """Number of minibatches to split a batch into""" total_episodes: Optional[int] = None - """The total number of episodes in the dataset""" local_rollout_forward_batch_size: int = 64 - """per rank no grad forward pass in the rollout phase""" num_sample_generations: int = 10 - """the number of debugging samples generations (i.e., `generate_completions` calls) throughout training""" - - # other config response_length: int = 53 - """the length of the response""" stop_token: Optional[Literal["eos"]] = None - """the stop token""" stop_token_id: Optional[int] = None - """the truncation token id""" temperature: float = 0.7 - """the sampling temperature""" penalty_reward_value: int = -1 - """the reward value for responses that do not contain `stop_token_id`""" non_eos_penalty: bool = False - """whether to penalize responses that do not contain `stop_token_id`""" sft_model_path: str = "EleutherAI/pythia-160m" - """the path to the sft model""" - - # various batch sizes world_size: Optional[int] = None - """The number of processes (GPUs) to use""" num_total_batches: Optional[int] = None - """The number of total batches to train""" micro_batch_size: Optional[int] = None - """The micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`)""" local_batch_size: Optional[int] = None - """The batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`)""" batch_size: Optional[int] = None - """The batch size across devices (HF's `per_device_train_batch_size` * `world_size` * `gradient_accumulation_steps`)""" local_mini_batch_size: Optional[int] = None - """the mini batch size per GPU""" mini_batch_size: Optional[int] = None - """the mini batch size across GPUs""" def first_true_indices(bools: torch.Tensor, dtype=torch.long):
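To make the batch-size bookkeeping documented for `OnPolicyConfig` concrete, here is a small worked sketch; the numeric values are assumptions, and the `num_mini_batches` split is how the on-policy trainers appear to derive the mini-batch sizes (treat it as illustrative, not as the library's exact code path):

```python
# Worked sketch of the derived batch sizes described in the OnPolicyConfig docstring.
per_device_train_batch_size = 8  # assumed
gradient_accumulation_steps = 4  # assumed
world_size = 2                   # assumed number of processes/GPUs
num_mini_batches = 1             # default documented above

micro_batch_size = per_device_train_batch_size * world_size                          # 16
local_batch_size = per_device_train_batch_size * gradient_accumulation_steps         # 32
batch_size = per_device_train_batch_size * world_size * gradient_accumulation_steps  # 64
local_mini_batch_size = local_batch_size // num_mini_batches                          # 32
mini_batch_size = batch_size // num_mini_batches                                      # 64
print(micro_batch_size, local_batch_size, batch_size, local_mini_batch_size, mini_batch_size)
```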