From fc20db8873c058e82460166146b9590f03256f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Wed, 4 Sep 2024 10:07:49 +0200 Subject: [PATCH] Clean configs documentation (#1944) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Clean BCO * Optional[int] * fix sft config * alignprop config * upadte tempfile to work with output_dir * clean kto config * intro docstring * style * reward config * orpo config * warning in trainer, not in config * cpo config * ppo v2 * model config * ddpo and per_device_train_batch_size (instead of (train_batch_size) * rloo * Online config * tmp_dir in test_ddpo * style * remove to_dict and fix post-init * batch size in test ddpo * dpo * style * `Args` -> `Parameters` * parameters * ppo config * dont overwrite world size * style * outputdir in test ppo * output dir in ppo config * revert non-core change (1/n) * revert non-core changes (2/n) * revert non-core change (3/n) * uniform max_length * fix uniform max_length * beta uniform * style * link to `ConstantLengthDataset` * uniform `dataset_num_proc` * uniform `disable_dropout` * `eval_packing` doc * try latex and α in doc * try title first * doesn't work * reorganize doc * overview * better latex * is_encoder_decoder uniform * proper ticks * fix latex * uniform generate_during_eval * uniform truncation_mode * ref_model_mixup_alpha * ref_model_mixup_alpha and ref_model_sync_steps * Uniform `model_init_kwargs` and `ref_model_init_kwargs` * rpo_alpha * Update maximum length argument names in config files * Update loss_type descriptions in config files * Update max_target_length to max_completion_length in CPOConfig and CPOTrainer * Update padding value in config files * Update precompute_ref_log_probs flag documentation * Fix typos and update comments in dpo_config.py and sft_config.py * post init warning for `max_target_length` --- docs/source/_toctree.yml | 68 ++++++------ tests/test_trainers_args.py | 6 +- trl/trainer/alignprop_config.py | 124 +++++++++++++-------- trl/trainer/bco_config.py | 102 ++++++++--------- trl/trainer/bco_trainer.py | 7 +- trl/trainer/cpo_config.py | 99 +++++++++-------- trl/trainer/cpo_trainer.py | 14 +-- trl/trainer/ddpo_config.py | 142 ++++++++++++++++-------- trl/trainer/dpo_config.py | 117 +++++++++++++------- trl/trainer/dpo_trainer.py | 16 +-- trl/trainer/kto_config.py | 103 +++++++++-------- trl/trainer/model_config.py | 153 ++++++++++++------------- trl/trainer/online_dpo_config.py | 18 +-- trl/trainer/orpo_config.py | 68 ++++++------ trl/trainer/ppo_config.py | 184 ++++++++++++++++++++----------- trl/trainer/ppov2_config.py | 44 +++++--- trl/trainer/reward_config.py | 20 ++-- trl/trainer/rloo_config.py | 35 ++++-- trl/trainer/sft_config.py | 82 +++++++------- trl/trainer/utils.py | 85 +++++++++----- 20 files changed, 851 insertions(+), 636 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index d9cad99ad3..4f8f875562 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -17,44 +17,46 @@ title: Understanding Logs title: Get started - sections: + - sections: + - local: trainer + title: Overview + - local: alignprop_trainer + title: AlignProp + - local: bco_trainer + title: BCO + - local: cpo_trainer + title: CPO + - local: ddpo_trainer + title: DDPO + - local: dpo_trainer + title: DPO + - local: online_dpo_trainer + title: Online DPO + - local: orpo_trainer + title: ORPO + - local: kto_trainer + title: KTO + - local: 
ppo_trainer + title: PPO + - local: ppov2_trainer + title: PPOv2 + - local: rloo_trainer + title: RLOO + - local: sft_trainer + title: SFT + - local: iterative_sft_trainer + title: Iterative SFT + - local: reward_trainer + title: Reward Model + title: Trainers - local: models title: Model Classes - - local: trainer - title: Trainer Classes - - local: reward_trainer - title: Reward Model Training - - local: sft_trainer - title: Supervised Fine-Tuning - - local: ppo_trainer - title: PPO Trainer - - local: ppov2_trainer - title: PPOv2 Trainer - - local: rloo_trainer - title: RLOO Trainer - local: best_of_n title: Best of N Sampling - - local: dpo_trainer - title: DPO Trainer - - local: online_dpo_trainer - title: Online DPO Trainer - - local: kto_trainer - title: KTO Trainer - - local: bco_trainer - title: BCO Trainer - - local: cpo_trainer - title: CPO Trainer - - local: ddpo_trainer - title: Denoising Diffusion Policy Optimization - - local: alignprop_trainer - title: AlignProp Trainer - - local: orpo_trainer - title: ORPO Trainer - - local: iterative_sft_trainer - title: Iterative Supervised Fine-Tuning - - local: callbacks - title: Callback Classes - local: judges - title: Judge Classes + title: Judges + - local: callbacks + title: Callbacks - local: text_environments title: Text Environments title: API diff --git a/tests/test_trainers_args.py b/tests/test_trainers_args.py index a8b4ef028c..cd6de6bd0a 100644 --- a/tests/test_trainers_args.py +++ b/tests/test_trainers_args.py @@ -77,7 +77,6 @@ def test_cpo(self): max_length=256, max_prompt_length=64, max_completion_length=64, - max_target_length=64, beta=0.5, label_smoothing=0.5, loss_type="hinge", @@ -96,7 +95,6 @@ def test_cpo(self): self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) self.assertEqual(trainer.args.max_completion_length, 64) - self.assertEqual(trainer.args.max_target_length, 64) self.assertEqual(trainer.args.beta, 0.5) self.assertEqual(trainer.args.label_smoothing, 0.5) self.assertEqual(trainer.args.loss_type, "hinge") @@ -127,7 +125,7 @@ def test_dpo(self): truncation_mode="keep_start", max_length=256, max_prompt_length=64, - max_target_length=64, + max_completion_length=64, is_encoder_decoder=True, disable_dropout=False, # generate_during_eval=True, # ignore this one, it requires wandb @@ -155,7 +153,7 @@ def test_dpo(self): self.assertEqual(trainer.args.truncation_mode, "keep_start") self.assertEqual(trainer.args.max_length, 256) self.assertEqual(trainer.args.max_prompt_length, 64) - self.assertEqual(trainer.args.max_target_length, 64) + self.assertEqual(trainer.args.max_completion_length, 64) self.assertEqual(trainer.args.is_encoder_decoder, True) self.assertEqual(trainer.args.disable_dropout, False) # self.assertEqual(trainer.args.generate_during_eval, True) diff --git a/trl/trainer/alignprop_config.py b/trl/trainer/alignprop_config.py index 7bd4cd32bd..b5f56eecfa 100644 --- a/trl/trainer/alignprop_config.py +++ b/trl/trainer/alignprop_config.py @@ -2,7 +2,7 @@ import sys import warnings from dataclasses import dataclass, field -from typing import Literal, Optional +from typing import Any, Dict, Literal, Optional, Tuple from ..core import flatten_dict from ..import_utils import is_bitsandbytes_available, is_torchvision_available @@ -10,77 +10,109 @@ @dataclass class AlignPropConfig: - """ - Configuration class for AlignPropTrainer + r""" + Configuration class for the [`AlignPropTrainer`]. 
+ + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`): + Name of this experiment (defaults to the file name without the extension). + run_name (`str`, *optional*, defaults to `""`): + Name of this run. + log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + Log with either `"wandb"` or `"tensorboard"`. Check + [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. + log_image_freq (`int`, *optional*, defaults to `1`): + Frequency for logging images. + tracker_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g., `wandb_project`). + accelerator_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`Dict[str, Any]`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g., `logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + logdir (`str`, *optional*, defaults to `"logs"`): + Top-level logging directory for checkpoint saving. + num_epochs (`int`, *optional*, defaults to `100`): + Number of epochs to train. + save_freq (`int`, *optional*, defaults to `1`): + Number of epochs between saving model checkpoints. + num_checkpoint_limit (`int`, *optional*, defaults to `5`): + Number of checkpoints to keep before overwriting old ones. + mixed_precision (`str`, *optional*, defaults to `"fp16"`): + Mixed precision training. + allow_tf32 (`bool`, *optional*, defaults to `True`): + Allow `tf32` on Ampere GPUs. + resume_from (`str`, *optional*, defaults to `""`): + Path to resume training from a checkpoint. + sample_num_steps (`int`, *optional*, defaults to `50`): + Number of sampler inference steps. + sample_eta (`float`, *optional*, defaults to `1.0`): + Eta parameter for the DDIM sampler. + sample_guidance_scale (`float`, *optional*, defaults to `5.0`): + Classifier-free guidance weight. + train_use_8bit_adam (`bool`, *optional*, defaults to `False`): + Whether to use the 8bit Adam optimizer from `bitsandbytes`. + train_learning_rate (`float`, *optional*, defaults to `1e-3`): + Learning rate. + train_adam_beta1 (`float`, *optional*, defaults to `0.9`): + Beta1 for Adam optimizer. + train_adam_beta2 (`float`, *optional*, defaults to `0.999`): + Beta2 for Adam optimizer. + train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`): + Weight decay for Adam optimizer. + train_adam_epsilon (`float`, *optional*, defaults to `1e-8`): + Epsilon value for Adam optimizer. + train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + train_max_grad_norm (`float`, *optional*, defaults to `1.0`): + Maximum gradient norm for gradient clipping. + negative_prompts (`Optional[str]`, *optional*, defaults to `None`): + Comma-separated list of prompts to use as negative examples. + truncated_backprop_rand (`bool`, *optional*, defaults to `True`): + If `True`, randomized truncation to different diffusion timesteps is used. + truncated_backprop_timestep (`int`, *optional*, defaults to `49`): + Absolute timestep to which the gradients are backpropagated. Used only if `truncated_backprop_rand=False`. 
+ truncated_rand_backprop_minmax (`Tuple[int, int]`, *optional*, defaults to `(0, 50)`): + Range of diffusion timesteps for randomized truncated backpropagation. """ - # common parameters exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - """the name of this experiment (by default is the file name without the extension name)""" - run_name: Optional[str] = "" - """Run name for wandb logging and checkpoint saving.""" + run_name: str = "" seed: int = 0 - """Seed value for random generations""" log_with: Optional[Literal["wandb", "tensorboard"]] = None - """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details""" - log_image_freq = 1 - """Logging Frequency for images""" - tracker_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the tracker (e.g. wandb_project)""" - accelerator_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator""" - project_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator project config (e.g. `logging_dir`)""" + log_image_freq: int = 1 + tracker_kwargs: Dict[str, Any] = field(default_factory=dict) + accelerator_kwargs: Dict[str, Any] = field(default_factory=dict) + project_kwargs: Dict[str, Any] = field(default_factory=dict) tracker_project_name: str = "trl" - """Name of project to use for tracking""" logdir: str = "logs" - """Top-level logging directory for checkpoint saving.""" - - # hyperparameters num_epochs: int = 100 - """Number of epochs to train.""" save_freq: int = 1 - """Number of epochs between saving model checkpoints.""" num_checkpoint_limit: int = 5 - """Number of checkpoints to keep before overwriting old ones.""" mixed_precision: str = "fp16" - """Mixed precision training.""" allow_tf32: bool = True - """Allow tf32 on Ampere GPUs.""" - resume_from: Optional[str] = "" - """Resume training from a checkpoint.""" + resume_from: str = "" sample_num_steps: int = 50 - """Number of sampler inference steps.""" sample_eta: float = 1.0 - """Eta parameter for the DDIM sampler.""" sample_guidance_scale: float = 5.0 - """Classifier-free guidance weight.""" train_batch_size: int = 1 - """Batch size (per GPU!) to use for training.""" train_use_8bit_adam: bool = False - """Whether to use the 8bit Adam optimizer from bitsandbytes.""" train_learning_rate: float = 1e-3 - """Learning rate.""" train_adam_beta1: float = 0.9 - """Adam beta1.""" train_adam_beta2: float = 0.999 - """Adam beta2.""" train_adam_weight_decay: float = 1e-4 - """Adam weight decay.""" train_adam_epsilon: float = 1e-8 - """Adam epsilon.""" train_gradient_accumulation_steps: int = 1 - """Number of gradient accumulation steps.""" train_max_grad_norm: float = 1.0 - """Maximum gradient norm for gradient clipping.""" - negative_prompts: Optional[str] = "" - """Comma-separated list of prompts to use as negative examples.""" + negative_prompts: Optional[str] = None truncated_backprop_rand: bool = True - """Truncated Randomized Backpropation randomizes truncation to different diffusion timesteps""" truncated_backprop_timestep: int = 49 - """Absolute timestep to which the gradients are being backpropagated. 
If truncated_backprop_rand is False""" - truncated_rand_backprop_minmax: tuple = (0, 50) - """Range of diffusion timesteps for randomized truncated backprop.""" + truncated_rand_backprop_minmax: Tuple[int, int] = (0, 50) def to_dict(self): output_dict = {} diff --git a/trl/trainer/bco_config.py b/trl/trainer/bco_config.py index 05d4b0b8fe..2d7a3f7a09 100644 --- a/trl/trainer/bco_config.py +++ b/trl/trainer/bco_config.py @@ -12,87 +12,77 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Optional +from typing import Any, Dict, Optional from transformers import TrainingArguments -from ..import_utils import is_sklearn_available - @dataclass class BCOConfig(TrainingArguments): r""" - BCOConfig collects all training arguments related to the [`BCOTrainer`] class. + Configuration class for the [`BCOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, *optional*, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`int`, *optional*, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in BCO loss. Higher beta means less divergence from the initial policy. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - padding_value (`int`, defaults to `0`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - precompute_ref_log_probs (`bool`, defaults to `False`): - Flag to precompute reference model log probabilities for training and evaluation datasets. This is useful if you want to train - without the reference model and reduce the total GPU memory needed. - model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string. - ref_model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the ref model from a string. - dataset_num_proc: (`Optional[int]`, *optional*, defaults to `None`): - Number of processes to use for processing the datasets. - prompt_sample_size: (`int`, defaults to 1024): + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. 
+ max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`Optional[int]`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from both the model and the reference model to W&B during + evaluation. + is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): + Whether to precompute reference model log probabilities for training and evaluation datasets. This is + useful when training without the reference model to reduce the total GPU memory needed. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model + from a string. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + prompt_sample_size (`int`, *optional*, defaults to `1024`): Number of prompts that are fed to density ratio classifier. - min_density_ratio: (`float`, defaults to 0.5): - The minimum value of the density ratio. The estimated density ratio is clamped to this value. - max_density_ratio: (`float`, defaults to 10.0): - The maximum value of the density ratio. The estimated density ratio is clamped to this value. + min_density_ratio (`float`, *optional*, defaults to `0.5`): + Minimum value of the density ratio. The estimated density ratio is clamped to this value. + max_density_ratio (`float`, *optional*, defaults to `10.0`): + Maximum value of the density ratio. The estimated density ratio is clamped to this value. """ max_length: Optional[int] = None - """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" max_prompt_length: Optional[int] = None - """The maximum length of the prompt. This argument is required if you want to use the default data collator.""" max_completion_length: Optional[int] = None - """The maximum length of the target. 
This argument is required if you want to use the default data collator and your model is an encoder-decoder.""" beta: float = 0.1 - """The beta factor in BCO loss. Higher beta means less divergence from the initial policy.""" - label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None precompute_ref_log_probs: bool = False - model_init_kwargs: Optional[Dict] = None - ref_model_init_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + ref_model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None - - # BCO config prompt_sample_size: int = 1024 min_density_ratio: float = 0.5 max_density_ratio: float = 10.0 - - def __post_init__(self): - super().__post_init__() - - if not is_sklearn_available(): - raise ImportError( - "You need to install scikit-learn to use `BCOTrainer` " - "You can install it with `pip install scikit-learn`." - ) diff --git a/trl/trainer/bco_trainer.py b/trl/trainer/bco_trainer.py index 3d74501a10..75b5553d08 100644 --- a/trl/trainer/bco_trainer.py +++ b/trl/trainer/bco_trainer.py @@ -327,8 +327,13 @@ def __init__( embedding_func: Optional[Callable] = None, embedding_tokenizer: Optional[PreTrainedTokenizerBase] = None, ): + if not is_sklearn_available(): + raise ImportError( + "BCOTrainer requires the scikit-learn library. Please install it with `pip install scikit-learn`." + ) + if type(args) is TrainingArguments: - raise ValueError("Please use `BCOConfig` instead TrainingArguments.") + raise ValueError("Please use `BCOConfig` instead `TrainingArguments`.") if args.model_init_kwargs is None: model_init_kwargs = {} diff --git a/trl/trainer/cpo_config.py b/trl/trainer/cpo_config.py index 5ba874b7a7..f61672de4e 100644 --- a/trl/trainer/cpo_config.py +++ b/trl/trainer/cpo_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Literal, Optional +from typing import Any, Dict, Literal, Optional from transformers import TrainingArguments @@ -20,70 +20,73 @@ @dataclass class CPOConfig(TrainingArguments): r""" - CPOConfig collects all training arguments related to the [`CPOTrainer`] class. + Configuration class for the [`CPOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_target_length (`int`, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in CPO loss. - label_smoothing (`float`, defaults to 0): - The label smoothing factor. This argument is required if you want to use the default data collator. - loss_type (`str`, defaults to `sigmoid`): - The type of loss to use. 
This argument is required if you want to use the default data collator. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - cpo_alpha (`float`, defaults to `1.0`): - A hyperparameter that controls the strength of the BC regularizer in CPO training. - simpo_gamma (`float`, defaults to `0.5`): - A target reward margin for the SimPO loss, used only when the "simpo" option is enabled. - padding_value (`int`, defaults to `None`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - disable_dropout (`bool`, defaults to `True`): - Whether or not to disable dropouts in `model`. - model_init_kwargs (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string - dataset_num_proc (`Optional[int]`, *optional*): - The number of workers to use to tokenize the data. Defaults to None. + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). + label_smoothing (`float`, *optional*, defaults to `0.0`): + Label smoothing factor. This argument is required if you want to use the default data collator. + loss_type (`str`, *optional*, defaults to `"sigmoid"`): + Type of loss to use. Possible values are: + + - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. + - `"hinge"`: hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. + - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. + - `"simpo"`: SimPO loss from the [SimPO](https://huggingface.co/papers/2405.14734) paper. + + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + cpo_alpha (`float`, *optional*, defaults to `1.0`): + Weight of the BC regularizer in CPO training. + simpo_gamma (`float`, *optional*, defaults to `0.5`): + Target reward margin for the SimPO loss, used only when the `loss_type="simpo"`. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. 
+ padding_value (`Optional[int]`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`,*optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B during evaluation. + is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. """ max_length: Optional[int] = None max_prompt_length: Optional[int] = None max_completion_length: Optional[int] = None - max_target_length: Optional[int] = None - beta: float = 0.1 - label_smoothing: float = 0 + label_smoothing: float = 0.0 loss_type: Literal["sigmoid", "hinge", "ipo", "simpo"] = "sigmoid" disable_dropout: bool = True cpo_alpha: float = 1.0 simpo_gamma: float = 0.5 - label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None - - model_init_kwargs: Optional[Dict] = None - + model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None - - def __post_init__(self): - if self.loss_type == "kto_pair": - raise ValueError("Support for kto_pair has been removed in CPOTrainer. 
Please use KTOTrainer.") - return super().__post_init__() diff --git a/trl/trainer/cpo_trainer.py b/trl/trainer/cpo_trainer.py index 459c06c6d3..9771296cea 100644 --- a/trl/trainer/cpo_trainer.py +++ b/trl/trainer/cpo_trainer.py @@ -225,15 +225,15 @@ def make_inputs_require_grad(module, input, output): else: max_prompt_length = args.max_prompt_length - if args.max_target_length is None and self.is_encoder_decoder: + if args.max_completion_length is None and self.is_encoder_decoder: warnings.warn( - "When using an encoder decoder architecture, you should set `max_target_length` in the CPOConfig's init" + "When using an encoder decoder architecture, you should set `max_completion_length` in the CPOConfig's init" " it will default to `128` by default, but you should do it yourself in the future.", UserWarning, ) - max_target_length = 128 + max_completion_length = 128 else: - max_target_length = args.max_target_length + max_completion_length = args.max_completion_length if data_collator is None: data_collator = DPODataCollatorWithPadding( @@ -264,7 +264,7 @@ def make_inputs_require_grad(module, input, output): self.padding_value = args.padding_value if args.padding_value is not None else tokenizer.pad_token_id self.max_prompt_length = max_prompt_length self.truncation_mode = args.truncation_mode - self.max_target_length = max_target_length + self.max_completion_length = max_completion_length self.tokenizer = tokenizer if args.loss_type in ["hinge", "ipo"] and args.label_smoothing > 0: @@ -495,10 +495,10 @@ def tokenize_row(self, feature, model: Optional[Union[PreTrainedModel, nn.Module else: chosen_tokens = self.tokenizer( - chosen, truncation=True, max_length=self.max_target_length, add_special_tokens=True + chosen, truncation=True, max_length=self.max_completion_length, add_special_tokens=True ) rejected_tokens = self.tokenizer( - rejected, truncation=True, max_length=self.max_target_length, add_special_tokens=True + rejected, truncation=True, max_length=self.max_completion_length, add_special_tokens=True ) prompt_tokens = self.tokenizer( prompt, truncation=True, max_length=self.max_prompt_length, add_special_tokens=True diff --git a/trl/trainer/ddpo_config.py b/trl/trainer/ddpo_config.py index b73bd58d05..f186f662f8 100644 --- a/trl/trainer/ddpo_config.py +++ b/trl/trainer/ddpo_config.py @@ -10,93 +10,137 @@ @dataclass class DDPOConfig: - """ - Configuration class for DDPOTrainer + r""" + Configuration class for the [`DDPOTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(sys.argv[0])[: -len(".py")]`): + Name of this experiment (by default is the file name without the extension name). + run_name (`str`, *optional*, defaults to `""`): + Name of this run. + seed (`int`, *optional*, defaults to `0`): + Random seed. + log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + Log with either 'wandb' or 'tensorboard', check + https://huggingface.co/docs/accelerate/usage_guides/tracking for more details. + tracker_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g. wandb_project). + accelerator_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`Dict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g. 
`logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + logdir (`str`, *optional*, defaults to `"logs"`): + Top-level logging directory for checkpoint saving. + num_epochs (`int`, *optional*, defaults to `100`): + Number of epochs to train. + save_freq (`int`, *optional*, defaults to `1`): + Number of epochs between saving model checkpoints. + num_checkpoint_limit (`int`, *optional*, defaults to `5`): + Number of checkpoints to keep before overwriting old ones. + mixed_precision (`str`, *optional*, defaults to `"fp16"`): + Mixed precision training. + allow_tf32 (`bool`, *optional*, defaults to `True`): + Allow `tf32` on Ampere GPUs. + resume_from (`str`, *optional*, defaults to `""`): + Resume training from a checkpoint. + sample_num_steps (`int`, *optional*, defaults to `50`): + Number of sampler inference steps. + sample_eta (`float`, *optional*, defaults to `1.0`): + Eta parameter for the DDIM sampler. + sample_guidance_scale (`float`, *optional*, defaults to `5.0`): + Classifier-free guidance weight. + sample_batch_size (`int`, *optional*, defaults to `1`): + Batch size (per GPU) to use for sampling. + sample_num_batches_per_epoch (`int`, *optional*, defaults to `2`): + Number of batches to sample per epoch. + train_batch_size (`int`, *optional*, defaults to `1`): + Batch size (per GPU) to use for training. + train_use_8bit_adam (`bool`, *optional*, defaults to `False`): + Use 8bit Adam optimizer from bitsandbytes. + train_learning_rate (`float`, *optional*, defaults to `3e-4`): + Learning rate. + train_adam_beta1 (`float`, *optional*, defaults to `0.9`): + Adam beta1. + train_adam_beta2 (`float`, *optional*, defaults to `0.999`): + Adam beta2. + train_adam_weight_decay (`float`, *optional*, defaults to `1e-4`): + Adam weight decay. + train_adam_epsilon (`float`, *optional*, defaults to `1e-8`): + Adam epsilon. + train_gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + train_max_grad_norm (`float`, *optional*, defaults to `1.0`): + Maximum gradient norm for gradient clipping. + train_num_inner_epochs (`int`, *optional*, defaults to `1`): + Number of inner epochs per outer epoch. + train_cfg (`bool`, *optional*, defaults to `True`): + Whether or not to use classifier-free guidance during training. + train_adv_clip_max (`float`, *optional*, defaults to `5.0`): + Clip advantages to the range. + train_clip_range (`float`, *optional*, defaults to `1e-4`): + PPO clip range. + train_timestep_fraction (`float`, *optional*, defaults to `1.0`): + Fraction of timesteps to train on. + per_prompt_stat_tracking (`bool`, *optional*, defaults to `False`): + Whether to track statistics for each prompt separately. + per_prompt_stat_tracking_buffer_size (`int`, *optional*, defaults to `16`): + Number of reward values to store in the buffer for each prompt. + per_prompt_stat_tracking_min_count (`int`, *optional*, defaults to `16`): + Minimum number of reward values to store in the buffer. + async_reward_computation (`bool`, *optional*, defaults to `False`): + Whether to compute rewards asynchronously. + max_workers (`int`, *optional*, defaults to `2`): + Maximum number of workers to use for async reward computation. + negative_prompts (`Optional[str]`, *optional*, defaults to `""`): + Comma-separated list of prompts to use as negative examples. 
""" - # common parameters exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")] - """the name of this experiment (by default is the file name without the extension name)""" - run_name: Optional[str] = "" - """Run name for wandb logging and checkpoint saving.""" + run_name: str = "" seed: int = 0 - """Seed value for random generations""" log_with: Optional[Literal["wandb", "tensorboard"]] = None - """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details""" tracker_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the tracker (e.g. wandb_project)""" accelerator_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator""" project_kwargs: dict = field(default_factory=dict) - """Keyword arguments for the accelerator project config (e.g. `logging_dir`)""" tracker_project_name: str = "trl" - """Name of project to use for tracking""" logdir: str = "logs" - """Top-level logging directory for checkpoint saving.""" - - # hyperparameters num_epochs: int = 100 - """Number of epochs to train.""" save_freq: int = 1 - """Number of epochs between saving model checkpoints.""" num_checkpoint_limit: int = 5 - """Number of checkpoints to keep before overwriting old ones.""" mixed_precision: str = "fp16" - """Mixed precision training.""" allow_tf32: bool = True - """Allow tf32 on Ampere GPUs.""" - resume_from: Optional[str] = "" - """Resume training from a checkpoint.""" + resume_from: str = "" sample_num_steps: int = 50 - """Number of sampler inference steps.""" sample_eta: float = 1.0 - """Eta parameter for the DDIM sampler.""" sample_guidance_scale: float = 5.0 - """Classifier-free guidance weight.""" sample_batch_size: int = 1 - """Batch size (per GPU!) to use for sampling.""" sample_num_batches_per_epoch: int = 2 - """Number of batches to sample per epoch.""" train_batch_size: int = 1 - """Batch size (per GPU!) 
to use for training.""" train_use_8bit_adam: bool = False - """Whether to use the 8bit Adam optimizer from bitsandbytes.""" train_learning_rate: float = 3e-4 - """Learning rate.""" train_adam_beta1: float = 0.9 - """Adam beta1.""" train_adam_beta2: float = 0.999 - """Adam beta2.""" train_adam_weight_decay: float = 1e-4 - """Adam weight decay.""" train_adam_epsilon: float = 1e-8 - """Adam epsilon.""" train_gradient_accumulation_steps: int = 1 - """Number of gradient accumulation steps.""" train_max_grad_norm: float = 1.0 - """Maximum gradient norm for gradient clipping.""" train_num_inner_epochs: int = 1 - """Number of inner epochs per outer epoch.""" train_cfg: bool = True - """Whether or not to use classifier-free guidance during training.""" - train_adv_clip_max: float = 5 - """Clip advantages to the range.""" + train_adv_clip_max: float = 5.0 train_clip_range: float = 1e-4 - """The PPO clip range.""" train_timestep_fraction: float = 1.0 - """The fraction of timesteps to train on.""" per_prompt_stat_tracking: bool = False - """Whether to track statistics for each prompt separately.""" per_prompt_stat_tracking_buffer_size: int = 16 - """Number of reward values to store in the buffer for each prompt.""" per_prompt_stat_tracking_min_count: int = 16 - """The minimum number of reward values to store in the buffer.""" async_reward_computation: bool = False - """Whether to compute rewards asynchronously.""" max_workers: int = 2 - """The maximum number of workers to use for async reward computation.""" - negative_prompts: Optional[str] = "" - """Comma-separated list of prompts to use as negative examples.""" + negative_prompts: str = "" def to_dict(self): output_dict = {} diff --git a/trl/trainer/dpo_config.py b/trl/trainer/dpo_config.py index 67acb14ad5..a517c3cbfe 100644 --- a/trl/trainer/dpo_config.py +++ b/trl/trainer/dpo_config.py @@ -11,9 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import warnings from dataclasses import dataclass from enum import Enum -from typing import Dict, Literal, Optional +from typing import Any, Dict, Literal, Optional from transformers import TrainingArguments @@ -32,15 +33,22 @@ class FDivergenceConstants: @dataclass class DPOConfig(TrainingArguments): r""" - Initialize DPOConfig. + Configuration class for the [`DPOTrainer`]. - Args: + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: beta (`float`, *optional*, defaults to `0.1`): - The beta factor in DPO loss. Higher beta means less divergence from the initial policy. For the IPO loss, beta is the regularization parameter denoted by tau in the paper. + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). label_smoothing (`float`, *optional*, defaults to `0.0`): - The robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/cdpo.pdf) report and [Robust DPO](https://huggingface.co/papers/2403.00409) paper that should be between 0 and 0.5. 
+ + Robust DPO label smoothing parameter from the [cDPO](https://ericmitchell.ai/cdpo.pdf) report and + [Robust DPO](https://huggingface.co/papers/2403.00409) paper that should be between `0.0` and `0.5`. loss_type (`str`, *optional*, defaults to `"sigmoid"`): - The type of DPO loss to use. Possible values are: + Type of loss to use. Possible values are: - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. - `"hinge"`: hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. @@ -56,56 +64,74 @@ class DPOConfig(TrainingArguments): - `"apo_down"`: APO-down loss from the [APO](https://huggingface.co/papers/2408.06266) paper. label_pad_token_id (`int`, *optional*, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. + Label pad token id. This argument is required if you want to use the default data collator. padding_value (`Optional[int]`, *optional*, defaults to `None`): - The padding value if it is different to the tokenizer's pad_token_id. + Padding value to use. If `None`, the padding value of the tokenizer is used. truncation_mode (`str`, *optional*, defaults to `"keep_end"`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. + Truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the + default data collator. max_length (`Optional[int]`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_target_length (`Optional[int]`, *optional*, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator and + your model is an encoder-decoder. is_encoder_decoder(`Optional[int]`, *optional*, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. disable_dropout (`bool`, *optional*, defaults to `True`): - Whether or not to disable dropouts in `model` and `ref_model`. + Whether to disable dropout in the model and reference model. generate_during_eval (`bool`, *optional*, defaults to `False`): - Whether to sample and log generations during evaluation step. + If `True`, generates and logs completions from both the model and the reference model to W&B during + evaluation. 
precompute_ref_log_probs (`bool`, *optional*, defaults to `False`): - Flag to precompute reference model log probabilities for training and evaluation datasets. This is useful if you want to train - without the reference model and reduce the total GPU memory needed. + Whether to precompute reference model log probabilities for training and evaluation datasets. This is + useful when training without the reference model to reduce the total GPU memory needed. dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): - The number of workers to use to tokenize the data. Defaults to None. - model_init_kwargs (`Optional[Dict]`, *optional*, defaults to `None`): - Dict of Optional kwargs to pass when instantiating the model from a string - ref_model_init_kwargs (`Optional[Dict]`, *optional*, defaults to `None`): - Dict of Optional kwargs to pass when instantiating the ref model from a string + Number of processes to use for processing the dataset. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model + from a string. model_adapter_name (`Optional[str]`, *optional*, defaults to `None`): Name of the train target PEFT adapter, when using LoRA with multiple adapters. ref_adapter_name (`Optional[str]`, *optional*, defaults to `None`): Name of the reference PEFT adapter, when using LoRA with multiple adapters. reference_free (`bool`, *optional*, defaults to `False`): - If True, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal probability to all responses. + If `True`, we ignore the _provided_ reference model and implicitly use a reference model that assigns equal + probability to all responses. force_use_ref_model (`bool`, *optional*, defaults to `False`): - In case one passes a PEFT model for the active model and you want to use a different model for the ref_model, set this flag to `True`. - f_divergence_type (`FDivergenceType`, *optional*, defaults to `FDivergenceType.REVERSE_KL`): - The type of f-divergence regularization function to compute divergence between policy and reference model. This argument is optional, defaults to `FDivergenceType.REVERSE_KL`. + In case one passes a PEFT model for the active model and you want to use a different model for the + ref_model, set this flag to `True`. + f_divergence_type (`str`, *optional*, defaults to `FDivergenceType.REVERSE_KL`): + Type of f-divergence regularization function to compute divergence between policy and reference model. f_alpha_divergence_coef (`float`, *optional*, defaults to `1.0`): - The alpha coef in alpha-divergence(u^-alpha) regularization function for DPO loss. - sync_ref_model ('bool', *optional*, defaults to `False`): - The flag for syncing reference model during training from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper. - ref_model_mixup_alpha ('float', *optional*, defaults to `1.0`): - The alpha parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper. - ref_model_sync_steps ('int', *optional*, defaults to `2`): - The tau parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper. 
- rpo_alpha ('float', *optional*, defaults to `None`): - The alpha parameter from the [RPO](https://huggingface.co/papers/2404.19733) paper V3. If None, no weighting is applied and the loss is the same as the DPO loss. The paper recommends `rpo_alpha=1.0`. + α coefficient in the α-divergence \\(u^{-\\alpha}\\) regularization function for DPO loss. + sync_ref_model (`bool`, *optional*, defaults to `False`): + When set to `True`, the reference model is synchronized with the active model every `ref_model_sync_steps` + steps, using the `ref_model_mixup_alpha` parameter. This synchronization originates from the + [TR-DPO](https://huggingface.co/papers/2404.09656) paper. + ref_model_mixup_alpha (`float`, *optional*, defaults to `0.9`): + α parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which controls the mix + between the current policy and the previous reference policy during updates. The reference policy is + updated according to the equation: `π_ref = α * π_θ + (1 - α) * π_ref_prev`. + To use this parameter, you must set `sync_ref_model=True`. + ref_model_sync_steps (`int`, *optional*, defaults to `64`): + τ parameter from the [TR-DPO](https://huggingface.co/papers/2404.09656) paper, which determines how + frequently the current policy is synchronized with the reference policy. To use this parameter, you must + set `sync_ref_model=True`. + rpo_alpha (`float`, *optional*, defaults to `None`): + α parameter from the [RPO](https://huggingface.co/papers/2404.19733) paper (v3), which controls the + weighting of the NLL term in the loss. If `None`, no weighting is applied and the loss is the same as the + DPO loss. The paper recommends `rpo_alpha=1.0`. """ beta: float = 0.1 - label_smoothing: float = 0 + label_smoothing: float = 0.0 loss_type: Literal[ "sigmoid", "hinge", @@ -125,14 +151,15 @@ class DPOConfig(TrainingArguments): truncation_mode: str = "keep_end" max_length: Optional[int] = None max_prompt_length: Optional[int] = None - max_target_length: Optional[int] = None + max_target_length: Optional[int] = None # deprecated in favor of max_completion_length + max_completion_length: Optional[int] = None is_encoder_decoder: Optional[bool] = None disable_dropout: bool = True generate_during_eval: bool = False precompute_ref_log_probs: bool = False dataset_num_proc: Optional[int] = None - model_init_kwargs: Optional[Dict] = None - ref_model_init_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + ref_model_init_kwargs: Optional[Dict[str, Any]] = None model_adapter_name: Optional[str] = None ref_adapter_name: Optional[str] = None reference_free: bool = False @@ -145,6 +172,12 @@ class DPOConfig(TrainingArguments): rpo_alpha: Optional[float] = None def __post_init__(self): - if self.loss_type == "kto_pair": - raise ValueError("Support for kto_pair has been removed in DPOTrainer. 
Please use KTOTrainer.") + if self.max_target_length is not None: + warnings.warn( + "The `max_target_length` argument is deprecated in favor of `max_completion_length` and will be removed in a future version.", + FutureWarning, + ) + if self.max_completion_length is None: + self.max_completion_length = self.max_target_length + return super().__post_init__() diff --git a/trl/trainer/dpo_trainer.py b/trl/trainer/dpo_trainer.py index eccc9496c2..f54538962d 100644 --- a/trl/trainer/dpo_trainer.py +++ b/trl/trainer/dpo_trainer.py @@ -254,8 +254,10 @@ def _tokenize_encoder_decoder( args: DPOConfig, model: Optional[PreTrainedModel], ) -> None: - chosen_tokens = tokenizer(chosen, truncation=True, max_length=args.max_target_length, add_special_tokens=True) - rejected_tokens = tokenizer(rejected, truncation=True, max_length=args.max_target_length, add_special_tokens=True) + chosen_tokens = tokenizer(chosen, truncation=True, max_length=args.max_completion_length, add_special_tokens=True) + rejected_tokens = tokenizer( + rejected, truncation=True, max_length=args.max_completion_length, add_special_tokens=True + ) prompt_tokens = tokenizer(prompt, truncation=True, max_length=args.max_prompt_length, add_special_tokens=True) batch["chosen_labels"] = chosen_tokens["input_ids"] @@ -692,14 +694,14 @@ def make_inputs_require_grad(module, input, output): warnings.warn( "You passed `max_target_length` to the DPOTrainer, the value you passed will override the one in the `DPOConfig`." ) - args.max_target_length = max_target_length - if args.max_target_length is None and self.is_encoder_decoder: + args.max_completion_length = max_target_length + if args.max_completion_length is None and self.is_encoder_decoder: warnings.warn( - "When using an encoder decoder architecture, you should set `max_target_length` in the DPOConfig's init" + "When using an encoder decoder architecture, you should set `max_completion_length` in the DPOConfig's init" " it will default to `128` by default, but you should do it yourself in the future.", UserWarning, ) - args.max_target_length = 128 + args.max_completion_length = 128 if label_pad_token_id != -100: warnings.warn( @@ -752,7 +754,7 @@ def make_inputs_require_grad(module, input, output): ) args.truncation_mode = truncation_mode self.truncation_mode = args.truncation_mode - self.max_target_length = args.max_target_length + self.max_completion_length = args.max_completion_length self.precompute_ref_log_probs = args.precompute_ref_log_probs # Since ref_logs are precomputed on the first call to get_train/eval_dataloader diff --git a/trl/trainer/kto_config.py b/trl/trainer/kto_config.py index c5a6893fce..64b030b585 100644 --- a/trl/trainer/kto_config.py +++ b/trl/trainer/kto_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Literal, Optional +from typing import Any, Dict, Literal, Optional from transformers import TrainingArguments @@ -20,74 +20,73 @@ @dataclass class KTOConfig(TrainingArguments): r""" - KTOConfig collects all training arguments related to the [`KTOTrainer`] class. + Configuration class for the [`KTOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. 
Parameters: + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. loss_type (`str`, *optional*, defaults to `"kto"`): - The type of unpaired loss to use. Possible values are: + Type of loss to use. Possible values are: - - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper. - - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper. - max_length (`int`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, *optional*, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`int`, *optional*, defaults to `None`): - The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in KTO loss. Higher beta means less divergence from the initial policy. - desirable_weight (`float`, *optional*, defaults to 1.0): - The desirable losses are weighed by this factor to counter unequal number of desirable and undesirable paris. - undesirable_weight (`float`, *optional*, defaults to 1.0): - The undesirable losses are weighed by this factor to counter unequal number of desirable and undesirable pairs. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - padding_value (`int`, defaults to `0`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - precompute_ref_log_probs (`bool`, defaults to `False`): - Flag to precompute reference model log probabilities for training and evaluation datasets. This is useful if you want to train - without the reference model and reduce the total GPU memory needed. - model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string. - ref_model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the ref model from a string. + - `"kto"`: KTO loss from the [KTO](https://huggingface.co/papers/2402.01306) paper. 
+ - `"apo_zero_unpaired"`: Unpaired variant of APO-zero loss from the [APO](https://huggingface.co/papers/2408.06266) paper.
+
+ desirable_weight (`float`, *optional*, defaults to `1.0`):
+ Desirable losses are weighed by this factor to counter the unequal number of desirable and undesirable pairs.
+ undesirable_weight (`float`, *optional*, defaults to `1.0`):
+ Undesirable losses are weighed by this factor to counter the unequal number of desirable and undesirable pairs.
+ label_pad_token_id (`int`, *optional*, defaults to `-100`):
+ Label pad token id. This argument is required if you want to use the default data collator.
+ padding_value (`Optional[int]`, *optional*, defaults to `None`):
+ Padding value to use. If `None`, the padding value of the tokenizer is used.
+ truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
+ Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
+ This argument is required if you want to use the default data collator.
+ generate_during_eval (`bool`, *optional*, defaults to `False`):
+ If `True`, generates and logs completions from both the model and the reference model to W&B during
+ evaluation.
+ is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+ When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+ you need to specify if the model returned by the callable is an encoder-decoder model.
+ precompute_ref_log_probs (`bool`, *optional*, defaults to `False`):
+ Whether to precompute reference model log probabilities for training and evaluation datasets. This is
+ useful when training without the reference model to reduce the total GPU memory needed.
+ model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a
+ string.
+ ref_model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`):
+ Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the reference model
+ from a string.
dataset_num_proc: (`Optional[int]`, *optional*, defaults to `None`):
- Number of processes to use for processing the datasets.
+ Number of processes to use for processing the dataset.
"""
- loss_type: Literal[
- "kto",
- "apo_zero_unpaired",
- ] = "kto"
max_length: Optional[int] = None
- """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator."""
max_prompt_length: Optional[int] = None
- """The maximum length of the prompt. This argument is required if you want to use the default data collator."""
max_completion_length: Optional[int] = None
- """The maximum length of the target. This argument is required if you want to use the default data collator and your model is an encoder-decoder."""
beta: float = 0.1
- """The beta factor in KTO loss.
Higher beta means less divergence from the initial policy.""" - desirable_weight: Optional[float] = 1.0 - """The desirable losses are weighed by this factor.""" - undesirable_weight: Optional[float] = 1.0 - """The undesirable losses are weighed by this factor.""" - + loss_type: Literal["kto", "apo_zero_unpaired"] = "kto" + desirable_weight: float = 1.0 + undesirable_weight: float = 1.0 label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None precompute_ref_log_probs: bool = False - model_init_kwargs: Optional[Dict] = None - ref_model_init_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + ref_model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None diff --git a/trl/trainer/model_config.py b/trl/trainer/model_config.py index bc0caf93d2..2bca3f84bf 100644 --- a/trl/trainer/model_config.py +++ b/trl/trainer/model_config.py @@ -1,95 +1,80 @@ -from dataclasses import dataclass, field -from typing import List, Optional - -from ..core import flatten_dict +from dataclasses import dataclass +from typing import List, Literal, Optional @dataclass class ModelConfig: """ - Arguments which define the model and tokenizer to load. - """ + Configuration class for the models. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. - model_name_or_path: Optional[str] = field( - default=None, - metadata={"help": ("The model checkpoint for weights initialization.")}, - ) - model_revision: str = field( - default="main", - metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, - ) - torch_dtype: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Override the default `torch.dtype` and load the model under this dtype. If `auto` is passed, the " - "dtype will be automatically derived from the model's weights." 
- ), - "choices": ["auto", "bfloat16", "float16", "float32"], - }, - ) - trust_remote_code: bool = field(default=False, metadata={"help": "Trust remote code when loading a model."}) - attn_implementation: Optional[str] = field( - default=None, - metadata={ - "help": ( - "Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`" - ) - }, - ) - use_peft: bool = field( - default=False, - metadata={"help": ("Whether to use PEFT or not for training.")}, - ) - lora_r: Optional[int] = field( - default=16, - metadata={"help": ("LoRA R value.")}, - ) - lora_alpha: Optional[int] = field( - default=32, - metadata={"help": ("LoRA alpha.")}, - ) - lora_dropout: Optional[float] = field( - default=0.05, - metadata={"help": ("LoRA dropout.")}, - ) - lora_target_modules: Optional[List[str]] = field( - default=None, - metadata={"help": ("LoRA target modules.")}, - ) - lora_modules_to_save: Optional[List[str]] = field( - default=None, - metadata={"help": ("Model layers to unfreeze & train")}, - ) - lora_task_type: str = field( - default="CAUSAL_LM", metadata={"help": "The task_type to pass for LoRA (use SEQ_CLS for reward modeling)"} - ) - use_rslora: bool = field( - default=False, - metadata={ - "help": ( - "Use Rank-Stabilized LoRA (https://huggingface.co/papers/2312.03732), which sets the adapter " - "scaling factor to lora_alpha/√r, instead of the original default value of `lora_alpha/r`." - ) - }, - ) - load_in_8bit: bool = field( - default=False, metadata={"help": "use 8 bit precision for the base model - works only with LoRA"} - ) - load_in_4bit: bool = field( - default=False, metadata={"help": "use 4 bit precision for the base model - works only with LoRA"} - ) + Parameters: + model_name_or_path (`Optional[str]`, *optional*, defaults to `None`): + Model checkpoint for weights initialization. + model_revision (`str`, *optional*, defaults to `"main"`): + Specific model version to use. It can be a branch name, a tag name, or a commit id. + torch_dtype (`Optional[Literal["auto", "bfloat16", "float16", "float32"]]`, *optional*, defaults to `None`): + Override the default `torch.dtype` and load the model under this dtype. Possible values are - bnb_4bit_quant_type: Optional[str] = field( - default="nf4", metadata={"help": "precise the quantization type (fp4 or nf4)"} - ) - use_bnb_nested_quant: bool = field(default=False, metadata={"help": "use nested quantization"}) + - `"bfloat16"`: `torch.bfloat16` + - `"float16"`: `torch.float16` + - `"float32"`: `torch.float32` + - `"auto"`: Automatically derive the dtype from the model's weights. + + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether to allow for custom models defined on the Hub in their own modeling files. This option should only + be set to `True` for repositories you trust and in which you have read the code, as it will execute code + present on the Hub on your local machine. + attn_implementation (`Optional[str]`, *optional*, defaults to `None`): + Which attention implementation to use. You can run `--attn_implementation=flash_attention_2`, in which case + you must install this manually by running `pip install flash-attn --no-build-isolation`. + use_peft (`bool`, *optional*, defaults to `False`): + Whether to use PEFT for training. + lora_r (`int`, *optional*, defaults to `16`): + LoRA R value. + lora_alpha (`int`, *optional*, defaults to `32`): + LoRA alpha. 
+ lora_dropout (`float`, *optional*, defaults to `0.05`): + LoRA dropout. + lora_target_modules (`Optional[Union[str, List[str]]]`, *optional*, defaults to `None`): + LoRA target modules. + lora_modules_to_save (`Optional[List[str]]`, *optional*, defaults to `None`): + Model layers to unfreeze & train. + lora_task_type (`str`, *optional*, defaults to `"CAUSAL_LM"`): + Task type to pass for LoRA (use `"SEQ_CLS"` for reward modeling). + use_rslora (`bool`, *optional*, defaults to `False`): + Whether to use Rank-Stabilized LoRA, which sets the adapter scaling factor to `lora_alpha/√r`, instead of + the original default value of `lora_alpha/r`. + load_in_8bit (`bool`, *optional*, defaults to `False`): + Whether to use 8 bit precision for the base model. Works only with LoRA. + load_in_4bit (`bool`, *optional*, defaults to `False`): + Whether to use 4 bit precision for the base model. Works only with LoRA. + bnb_4bit_quant_type (`str`, *optional*, defaults to `"nf4"`): + Quantization type (`"fp4"` or `"nf4"`). + use_bnb_nested_quant (`bool`, *optional*, defaults to `False`): + Whether to use nested quantization. + """ - def to_dict(self): - output_dict = {} - for key, value in self.__dict__.items(): - output_dict[key] = value - return flatten_dict(output_dict) + model_name_or_path: Optional[str] = None + model_revision: str = "main" + torch_dtype: Optional[Literal["auto", "bfloat16", "float16", "float32"]] = None + trust_remote_code: bool = False + attn_implementation: Optional[str] = None + use_peft: bool = False + lora_r: int = 16 + lora_alpha: int = 32 + lora_dropout: float = 0.05 + lora_target_modules: Optional[List[str]] = None + lora_modules_to_save: Optional[List[str]] = None + lora_task_type: str = "CAUSAL_LM" + use_rslora: bool = False + load_in_8bit: bool = False + load_in_4bit: bool = False + bnb_4bit_quant_type: Literal["fp4", "nf4"] = "nf4" + use_bnb_nested_quant: bool = False def __post_init__(self): if self.load_in_8bit and self.load_in_4bit: diff --git a/trl/trainer/online_dpo_config.py b/trl/trainer/online_dpo_config.py index f1c1ccab4a..fca33c4dcc 100644 --- a/trl/trainer/online_dpo_config.py +++ b/trl/trainer/online_dpo_config.py @@ -13,29 +13,33 @@ class OnlineDPOConfig(TrainingArguments): [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. - Args: + Parameters: reward_model_path (`Optional[str]`, *optional*, defaults to `None`): Path to the reward model. max_new_tokens (`int`, *optional*, defaults to `64`): - The maximum number of tokens to generate per completion. + Maximum number of tokens to generate per completion. temperature (`float`, *optional*, defaults to `0.9`): Temperature for sampling. The higher the temperature, the more random the completions. missing_eos_penalty (`Optional[float]`, *optional*, defaults to `None`): - Penalty when the model fails to generate an EOS token. + Penalty applied to the score when the model fails to generate an EOS token. This is useful to encourage + to generate completions shorter than the maximum length (`max_new_tokens`). The penalty must be a positive + value. beta (`float`, *optional*, defaults to `0.1`): - Beta parameter for the DPO loss. + Parameter controlling the deviation from the reference model. Higher β means less deviation from the + reference model. For the IPO loss (`loss_type="ipo"`), β is the regularization parameter denoted by τ in + the [paper](https://huggingface.co/papers/2310.12036). 
loss_type (`str`, *optional*, defaults to `"sigmoid"`): - Type of DPO loss to use. Possible values are: + Type of loss to use. Possible values are: - `"sigmoid"`: sigmoid loss from the original [DPO](https://huggingface.co/papers/2305.18290) paper. - `"ipo"`: IPO loss from the [IPO](https://huggingface.co/papers/2310.12036) paper. dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): - Number of workers to use to process the data. + Number of processes to use for processing the dataset. """ reward_model_path: Optional[str] = None - max_new_tokens: int = 53 + max_new_tokens: int = 64 temperature: float = 0.9 missing_eos_penalty: Optional[float] = None beta: float = 0.1 diff --git a/trl/trainer/orpo_config.py b/trl/trainer/orpo_config.py index 14be7ee1a2..6cc54ed919 100644 --- a/trl/trainer/orpo_config.py +++ b/trl/trainer/orpo_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Optional +from typing import Any, Dict, Optional from transformers import TrainingArguments @@ -20,52 +20,54 @@ @dataclass class ORPOConfig(TrainingArguments): r""" - ORPOConfig collects all training arguments related to the [`ORPOTrainer`] class. + Configuration class for the [`ORPOTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - max_prompt_length (`int`, defaults to `None`): - The maximum length of the prompt. This argument is required if you want to use the default data collator. - max_completion_length (`int`, defaults to `None`): - The maximum length of the completions. This argument is required if you want to use the default data collator and your model is an encoder-decoder. - beta (`float`, defaults to 0.1): - The beta factor in ORPO loss (lambda/alpha in paper/code) that is the weight of the relative loss ratio in the SFT loss. - label_pad_token_id (`int`, defaults to `-100`): - The label pad token id. This argument is required if you want to use the default data collator. - padding_value (`int`, defaults to `None`): - The padding value if it is different to the tokenizer's pad_token_id. - truncation_mode (`str`, defaults to `keep_end`): - The truncation mode to use, either `keep_end` or `keep_start`. This argument is required if you want to use the default data collator. - generate_during_eval (`bool`, defaults to `False`): - Whether to sample and log generations during evaluation step. - is_encoder_decoder (`Optional[bool]`, `optional`, defaults to `None`): - If no model is provided, we need to know if the model_init returns an encoder-decoder. - disable_dropout (`bool`, defaults to `True`): - Whether or not to disable dropouts in `model`. - model_init_kwargs (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string - dataset_num_proc (`Optional[int]`, *optional*): - The number of workers to use to tokenize the data. Defaults to None. + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. 
This argument is required if you want + to use the default data collator. + max_prompt_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the prompt. This argument is required if you want to use the default data collator. + max_completion_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the completion. This argument is required if you want to use the default data collator + and your model is an encoder-decoder. + beta (`float`, *optional*, defaults to `0.1`): + Parameter controlling the relative ratio loss weight in the ORPO loss. In the [paper](https://huggingface.co/papers/2403.07691), + it is denoted by λ. In the [code](https://github.com/xfactlab/orpo), it is denoted by `alpha`. + disable_dropout (`bool`, *optional*, defaults to `True`): + Whether to disable dropout in the model. + label_pad_token_id (`int`, *optional*, defaults to `-100`): + Label pad token id. This argument is required if you want to use the default data collator. + padding_value (`Optional[int]`, *optional*, defaults to `None`): + Padding value to use. If `None`, the padding value of the tokenizer is used. + truncation_mode (`str`, *optional*, defaults to `"keep_end"`): + Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`. + This argument is required if you want to use the default data collator. + generate_during_eval (`bool`, *optional*, defaults to `False`): + If `True`, generates and logs completions from the model to W&B during evaluation. + is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`): + When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument, + you need to specify if the model returned by the callable is an encoder-decoder model. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. """ max_length: Optional[int] = None max_prompt_length: Optional[int] = None max_completion_length: Optional[int] = None - beta: float = 0.1 disable_dropout: bool = True - label_pad_token_id: int = -100 - padding_value: int = None + padding_value: Optional[int] = None truncation_mode: str = "keep_end" generate_during_eval: bool = False is_encoder_decoder: Optional[bool] = None - - model_init_kwargs: Optional[Dict] = None - + model_init_kwargs: Optional[Dict[str, Any]] = None dataset_num_proc: Optional[int] = None diff --git a/trl/trainer/ppo_config.py b/trl/trainer/ppo_config.py index 38aee8e2b3..147ab5720b 100644 --- a/trl/trainer/ppo_config.py +++ b/trl/trainer/ppo_config.py @@ -33,112 +33,164 @@ @dataclass class PPOConfig: - """ - Configuration class for PPOTrainer + r""" + Configuration class for the [`PPOTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`): + Name of this experiment. + seed (`int`, *optional*, defaults to `0`): + Random seed. + log_with (`Optional[Literal["wandb", "tensorboard"]]`, *optional*, defaults to `None`): + Log with either `"wandb"` or `"tensorboard"`. 
Check + [tracking](https://huggingface.co/docs/accelerate/usage_guides/tracking) for more details. + task_name (`Optional[str]`, *optional*, defaults to `None`): + Name of task to use - used only for tracking purposes. + model_name (`Optional[str]`, *optional*, defaults to `"gpt2"`): + Name of model to use - used only for tracking purposes. + query_dataset (`Optional[str]`, *optional*, defaults to `"imdb"`): + Name of dataset to query - used only for tracking purposes. + reward_model (`Optional[str]`, *optional*, defaults to `"sentiment-analysis:lvwerra/distilbert-imdb"`): + Reward model to use - used only for tracking purposes. + remove_unused_columns (`bool`, *optional*, defaults to `True`): + Remove unused columns from the dataset. + tracker_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for the tracker (e.g. `python ppo.py --tracker_kwargs='{"wandb": {"entity": "my_wandb_entity", "name": "my_exp_name"}}'`. + accelerator_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator. + project_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for the accelerator project config (e.g. `logging_dir`). + tracker_project_name (`str`, *optional*, defaults to `"trl"`): + Name of project to use for tracking. + push_to_hub_if_best_kwargs (`JSONDict`, *optional*, defaults to `{}`): + Keyword arguments for pushing model to the hub during training (e.g. repo_id). + steps (`int`, *optional*, defaults to `20000`): + Number of training steps. + learning_rate (`float`, *optional*, defaults to `1.41e-5`): + Learning rate for the optimizer. + adap_kl_ctrl (`bool`, *optional*, defaults to `True`): + Use adaptive KL control, otherwise linear. + init_kl_coef (`Optional[float]`, *optional*, defaults to `0.2`): + Initial KL penalty coefficient (used for adaptive and linear control). + kl_penalty (`Literal["kl", "abs", "mse", "full"]`, *optional*, defaults to `"kl"`): + kl penalty options. Possible values are: + + - `"kl"`: model_logp - ref_logp + - `"abs"`: abs(kl) + - `"mse"`: mean squared error mse(kl) + - `"full"`: the actual kl for all tokens in the distribution. + + target (`float`, *optional*, defaults to `6.0`): + Target KL value for adaptive KL control. + horizon (`float`, *optional*, defaults to `10000.0`): + Horizon for adaptive KL control. + gamma (`float`, *optional*, defaults to `1.0`): + Gamma parameter for advantage calculation. + lam (`float`, *optional*, defaults to `0.95`): + Lambda parameter for advantage calculation. + cliprange (`float`, *optional*, defaults to `0.2`): + Range for clipping in PPO policy gradient loss. + cliprange_value (`float`, *optional*, defaults to `0.2`): + Range for clipping values in loss calculation. + vf_coef (`float`, *optional*, defaults to `0.1`): + Scaling factor for value loss. + batch_size (`int`, *optional*, defaults to `128`): + Number of samples per optimisation step. + forward_batch_size (`Optional[int]`, *optional*, defaults to `None`): + DEPRECATED: use `mini_batch_size` instead, which does the same thing. + mini_batch_size (`int`, *optional*, defaults to `128`): + Number of samples optimized in each mini batch. + gradient_accumulation_steps (`int`, *optional*, defaults to `1`): + Number of gradient accumulation steps. + world_size (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for distributed training. + ppo_epochs (`int`, *optional*, defaults to `4`): + Number of optimisation epochs per batch of samples. 
+
+ optimize_device_cache (`bool`, *optional*, defaults to `False`):
+ Optimize device cache for slightly more memory-efficient training.
+ early_stopping (`bool`, *optional*, defaults to `False`):
+ Whether to stop the PPO optimization loop early if the KL is too high.
+ target_kl (`float`, *optional*, defaults to `1.0`):
+ Stop early if we exceed this value by over 50%.
+ compare_steps (`int`, *optional*, defaults to `1`):
+ Number of steps between comparison of the current reward with the best seen so far.
+ ratio_threshold (`float`, *optional*, defaults to `10.0`):
+ Skip mini-batches with high PPO ratios that can cause loss spikes.
+ use_score_scaling (`bool`, *optional*, defaults to `False`):
+ Use score scaling.
+ use_score_norm (`bool`, *optional*, defaults to `False`):
+ Use score normalization. Only applicable if `use_score_scaling` is `True`.
+ score_clip (`Optional[float]`, *optional*, defaults to `None`):
+ Score clipping.
+ whiten_rewards (`bool`, *optional*, defaults to `False`):
+ Whiten the rewards before computing advantages.
+ is_encoder_decoder (`Optional[bool]`, *optional*, defaults to `None`):
+ When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
+ you need to specify if the model returned by the callable is an encoder-decoder model.
+ is_peft_model (`Optional[bool]`, *optional*, defaults to `None`):
+ Whether the model is a PEFT model.
+ backward_batch_size (`Optional[int]`, *optional*, defaults to `None`):
+ Number of samples optimized in an `optimizer.step()` call.
+ global_backward_batch_size (`Optional[int]`, *optional*, defaults to `None`):
+ Effective `backward_batch_size` across all processes.
+ global_batch_size (`Optional[int]`, *optional*, defaults to `None`):
+ Effective `batch_size` across all processes.
+ dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`):
+ Number of processes to use for processing the dataset.
"""
- # common parameters
exp_name: str = os.path.basename(sys.argv[0])[: -len(".py")]
- """the name of this experiment (by default is the file name without the extension name)"""
seed: int = 0
- """Seed value for random generations"""
log_with: Optional[Literal["wandb", "tensorboard"]] = None
- """Log with either 'wandb' or 'tensorboard', check https://huggingface.co/docs/accelerate/usage_guides/tracking for more details"""
task_name: Optional[str] = None
- """Name of task to use - used only for tracking purposes"""
- model_name: Optional[str] = "gpt2"
- """Name of model to use - used only for tracking purposes"""
- query_dataset: Optional[str] = "imdb"
- """Name of dataset to query - used only for tracking purposes"""
- reward_model: Optional[str] = "sentiment-analysis:lvwerra/distilbert-imdb"
- """The reward model to use - used only for tracking purposes"""
+ model_name: str = "gpt2"
+ query_dataset: str = "imdb"
+ reward_model: str = "sentiment-analysis:lvwerra/distilbert-imdb"
remove_unused_columns: bool = True
- """Remove unused columns from the dataset if `datasets.Dataset` is used"""
tracker_kwargs: JSONDict = field(default_factory=dict)
- """Keyword arguments for the tracker (e.g. python ppo.py --tracker_kwargs='{"wandb": {"entity": "my_wandb_entity", "name": "my_exp_name"}}'"""
accelerator_kwargs: JSONDict = field(default_factory=dict)
- """Keyword arguments for the accelerator"""
project_kwargs: JSONDict = field(default_factory=dict)
- """Keyword arguments for the accelerator project config (e.g.
`logging_dir`)""" tracker_project_name: str = "trl" - """Name of project to use for tracking""" push_to_hub_if_best_kwargs: JSONDict = field(default_factory=dict) - """Keyword arguments for pushing model to the hub during training (e.g. repo_id)""" - - # hyperparameters steps: int = 20000 - """Number of training steps""" learning_rate: float = 1.41e-5 - """Adam learning rate""" adap_kl_ctrl: bool = True - """Use adaptive KL control, otherwise linear""" - init_kl_coef: Optional[float] = 0.2 - """Initial KL penalty coefficient (used for adaptive and linear control)""" + init_kl_coef: float = 0.2 kl_penalty: Literal["kl", "abs", "mse", "full"] = "kl" - """kl penalty options: 'kl': model_logp - ref_logp, 'abs': abs(kl), 'mse': mean squared error mse(kl) and 'full': the actual kl for all tokens in the distribution""" - target: Optional[float] = 6 - """Target KL value for adaptive KL control""" - horizon: Optional[float] = 10000 - """Horizon for adaptive KL control""" - gamma: float = 1 - """Gamma parameter for advantage calculation""" + target: float = 6.0 + horizon: float = 10000.0 + gamma: float = 1.0 lam: float = 0.95 - """Lambda parameter for advantage calculation""" cliprange: float = 0.2 - """Range for clipping in PPO policy gradient loss""" cliprange_value: float = 0.2 - """Range for clipping values in loss calculation""" vf_coef: float = 0.1 - """Scaling factor for value loss""" batch_size: int = 128 - """Number of samples per optimisation step""" forward_batch_size: Optional[int] = None - """DEPRECATED: use `mini_batch_size` instead, which does the same thing.""" mini_batch_size: int = 128 - """Number of samples optimized in each mini batch""" gradient_accumulation_steps: int = 1 - """The number of gradient accumulation steps""" world_size: tyro.conf.Suppress[int] = None - """The world size for distributed training""" ppo_epochs: int = 4 - """Number of optimisation epochs per batch of samples""" max_grad_norm: Optional[float] = None - """Maximum gradient norm for gradient clipping""" optimize_cuda_cache: Optional[bool] = None - """DEPRECATED: use `optimize_device_cache` instead, which does the same thing.""" - optimize_device_cache: Optional[bool] = False - """Optimize device cache for slightly more memory-efficient training""" + optimize_device_cache: bool = False early_stopping: bool = False - """Whether to stop the PPO optimization loop early is the KL too high""" - target_kl: float = 1 - """Stop early if we exceed this value by over 50%""" + target_kl: float = 1.0 compare_steps: int = 1 - """Number of steps between comparison of the current reward with the best seen so far""" ratio_threshold: float = 10.0 - """Skip mini-batches with high PPO ratios that can cause loss spikes""" use_score_scaling: bool = False - """Use score scaling""" use_score_norm: bool = False - """Use score normalization. 
Only applicable if use_score_scaling is True""" score_clip: Optional[float] = None - """Score clipping""" whiten_rewards: bool = False - """Whiten the rewards before compute advantages""" gradient_checkpointing: bool = False - """Enable gradient checkpointing""" - - # computed hyperparameters at runtime; we use `tyro.conf.Suppress` to hide them from the help text is_encoder_decoder: Optional[tyro.conf.Suppress[bool]] = None - """TO BE FILLED In RUNTIME: Whether the model is an encoder-decoder model""" is_peft_model: Optional[tyro.conf.Suppress[bool]] = None - """TO BE FILLED In RUNTIME: Whether the model is a PEFT model""" backward_batch_size: tyro.conf.Suppress[int] = None - """TO BE FILLED In RUNTIME: Number of samples optimized in an `optimizer.step()` call""" - global_backward_batch_size: tyro.conf.Suppress[int] = None - """TO BE FILLED In RUNTIME: the effective `backward_batch_size` across all processes""" + global_backward_batch_size: Optional[tyro.conf.Suppress[int]] = None global_batch_size: tyro.conf.Suppress[int] = None - """TO BE FILLED In RUNTIME: the effective `batch_size` across all processes""" - dataset_num_proc: Optional[int] = None if optimize_cuda_cache is not None: diff --git a/trl/trainer/ppov2_config.py b/trl/trainer/ppov2_config.py index 05247f4fae..944d247e7c 100644 --- a/trl/trainer/ppov2_config.py +++ b/trl/trainer/ppov2_config.py @@ -6,25 +6,43 @@ @dataclass class PPOv2Config(OnPolicyConfig): + r""" + Configuration class for the [`PPOv2Trainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`): + Name of this experiment. + reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the reward model. + num_ppo_epochs (`int`, *optional*, defaults to `4`): + Number of epochs to train. + whiten_rewards (`bool`, *optional*, defaults to `False`): + Whether to whiten the rewards. + kl_coef (`float`, *optional*, defaults to `0.05`): + KL coefficient. + cliprange (`float`, *optional*, defaults to `0.2`): + Clip range. + vf_coef (`float`, *optional*, defaults to `0.1`): + Value function coefficient. + cliprange_value (`float`, *optional*, defaults to `0.2`): + Clip range for the value function. + gamma (`float`, *optional*, defaults to `1.0`): + Discount factor. + lam (`float`, *optional*, defaults to `0.95`): + Lambda value for GAE. 
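A rough usage sketch of the `PPOv2Config` fields documented above; the values simply spell out the documented defaults, `output_dir` is a hypothetical path, and the import path assumes the `trl` version targeted by this patch:

```python
# Sketch: instantiate PPOv2Config with the documented defaults made explicit.
from trl import PPOv2Config

config = PPOv2Config(
    output_dir="ppov2_out",  # inherited from TrainingArguments via OnPolicyConfig
    reward_model_path="EleutherAI/pythia-160m",
    num_ppo_epochs=4,
    whiten_rewards=False,
    kl_coef=0.05,
    cliprange=0.2,
    vf_coef=0.1,
    cliprange_value=0.2,
    gamma=1.0,
    lam=0.95,
)
```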
+ """ + exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" reward_model_path: str = "EleutherAI/pythia-160m" - """the path to the reward model""" - - # ppo config num_ppo_epochs: int = 4 - """the number of epochs to train""" whiten_rewards: bool = False - """whether to whiten the rewards""" kl_coef: float = 0.05 - """the KL coefficient""" cliprange: float = 0.2 - """the clip range""" vf_coef: float = 0.1 - """the value function coefficient""" cliprange_value: float = 0.2 - """the clip range for the value function""" - gamma: float = 1 - """the discount factor""" + gamma: float = 1.0 lam: float = 0.95 - """the lambda value for GAE""" diff --git a/trl/trainer/reward_config.py b/trl/trainer/reward_config.py index 6e3975b42d..8eaa0bdcba 100644 --- a/trl/trainer/reward_config.py +++ b/trl/trainer/reward_config.py @@ -20,22 +20,24 @@ @dataclass class RewardConfig(TrainingArguments): - """ - RewardConfig collects all training arguments related to the [`RewardTrainer`] class. + r""" + Configuration class for the [`RewardTrainer`]. - Using [`HfArgumentParser`] we can turn this class into + Using [`~transformers.HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - max_length (`int`, *optional*, defaults to `None`): - The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. - gradient_checkpointing (`bool`, *optional*, defaults to `True`): - If True, use gradient checkpointing to save memory at the expense of slower backward pass. + max_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum length of the sequences (prompt + completion) in the batch. This argument is required if you want + to use the default data collator. + dataset_num_proc (`int`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + center_rewards_coefficient (`float`, *optional*, defaults to `None`): + Coefficient to incentivize the reward model to output mean-zero rewards (proposed by + https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`. """ max_length: Optional[int] = None - """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" dataset_num_proc: Optional[int] = None - """Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`.""" center_rewards_coefficient: Optional[float] = None diff --git a/trl/trainer/rloo_config.py b/trl/trainer/rloo_config.py index e629d84afa..ee0e3f7b60 100644 --- a/trl/trainer/rloo_config.py +++ b/trl/trainer/rloo_config.py @@ -6,21 +6,34 @@ @dataclass class RLOOConfig(OnPolicyConfig): + r""" + Configuration class for the [`RLOOTrainer`]. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[: -len(".py")]`): + Name of this experiment. + reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the reward model. + num_ppo_epochs (`int`, *optional*, defaults to `4`): + Number of epochs to train. 
+ whiten_rewards (`bool`, *optional*, defaults to `False`): + Whether to whiten the rewards. + kl_coef (`float`, *optional*, defaults to `0.05`): + KL coefficient. + cliprange (`float`, *optional*, defaults to `0.2`): + Clip range. + rloo_k (`int`, *optional*, defaults to `2`): + REINFORCE Leave-One-Out (RLOO) number of online samples per prompt. + """ + exp_name: str = os.path.basename(__file__)[: -len(".py")] - """the name of this experiment""" reward_model_path: str = "EleutherAI/pythia-160m" - """the path to the reward model""" - - # ppo config num_ppo_epochs: int = 4 - """the number of epochs to train""" whiten_rewards: bool = False - """whether to whiten the rewards""" kl_coef: float = 0.05 - """the KL coefficient""" cliprange: float = 0.2 - """the clip range""" - - # rloo config rloo_k: int = 2 - """REINFORCE Leave-One-Out (RLOO) number of online samples per prompt""" diff --git a/trl/trainer/sft_config.py b/trl/trainer/sft_config.py index 132a0c69d9..f0f2df1985 100644 --- a/trl/trainer/sft_config.py +++ b/trl/trainer/sft_config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass -from typing import Dict, Optional +from typing import Any, Dict, Optional from transformers import TrainingArguments @@ -20,50 +20,56 @@ @dataclass class SFTConfig(TrainingArguments): r""" - Initialize SFTConfig. + Configuration class for the [`SFTTrainer`]. - Args: - dataset_text_field (`Optional[str]`): - The name of the text field of the dataset, in case this is passed by a user, the trainer will automatically create a - `ConstantLengthDataset` based on the `dataset_text_field` argument. Defaults to None. - packing (`Optional[bool]`): - Used only in case `dataset_text_field` is passed. This argument is used by the `ConstantLengthDataset` to pack the sequences - of the dataset. Defaults to False. - max_seq_length (`Optional[int]`): - The maximum sequence length to use for the `ConstantLengthDataset` and for automatically creating the Dataset. Defaults to min of the smaller of the `tokenizer.model_max_length` and `1024`. - dataset_num_proc (`Optional[int]`): - The number of workers to use to tokenize the data. Only used when `packing=False`. Defaults to None. - dataset_batch_size (`int`): - The number of examples to tokenize per batch. If batch_size <= 0 or batch_size == None, - tokenize the full dataset as a single batch. Defaults to 1000. - neftune_noise_alpha (`Optional[float]`): - If not `None`, this will activate NEFTune noise embeddings. This has been proven to drastically improve model performances for instruction - fine-tuning. Check out the original paper here: https://huggingface.co/papers/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune - model_init_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when instantiating the model from a string. - dataset_kwargs: (`Optional[Dict]`, *optional*): - Dict of Optional kwargs to pass when creating packed or non-packed datasets - eval_packing: (`Optional[bool]`, *optional*): - Whether to pack the eval dataset as well. Defaults to `packing` if `None` is passed. - num_of_sequences (`Optional[int]`): - The number of sequences to use for the `ConstantLengthDataset`. Defaults to `1024`. - chars_per_token (`Optional[float]`): - The number of characters per token to use for the `ConstantLengthDataset`. Defaults to `3.6`. 
You can check how this is computed in the - stack-llama example: - [chars_token_ratio](https://github.com/huggingface/trl/blob/08f550674c553c36c51d1027613c29f14f3676a5/examples/stack_llama/scripts/supervised_finetuning.py#L53). - use_liger (`Optional[bool]`): + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + dataset_text_field (`Optional[str]`, *optional*, defaults to `None`): + Name of the text field of the dataset. If provided, the trainer will automatically create a + [`ConstantLengthDataset`] based on `dataset_text_field`. + packing (`bool`, *optional*, defaults to `False`): + Used only when `dataset_text_field` is provided. Controls whether the [`ConstantLengthDataset`] packs + the sequences of the dataset. + max_seq_length (`Optional[int]`, *optional*, defaults to `None`): + Maximum sequence length for the [`ConstantLengthDataset`] and for automatically creating the dataset. If + `None`, it uses the smaller value between `tokenizer.model_max_length` and `1024`. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. Only used when `packing=False`. + dataset_batch_size (`Union[int, None]`, *optional*, defaults to `1000`): + Number of examples to tokenize per batch. If `dataset_batch_size <= 0` or `dataset_batch_size is None`, + tokenizes the full dataset as a single batch. + neftune_noise_alpha (`Optional[float]`, *optional*, defaults to `None`): + Scale of the noise for NEFTune embeddings. The [NEFTune paper](https://huggingface.co/papers/2310.05914) + suggests using values between `5` and `15`. If set to `None`, NEFTune is not activated. Activating NEFTune + can significantly improve model performance for instruction fine-tuning. + model_init_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Keyword arguments to pass to `AutoModelForCausalLM.from_pretrained` when instantiating the model from a + string. + dataset_kwargs (`Optional[Dict[str, Any]]`, *optional*, defaults to `None`): + Dictionary of optional keyword arguments to pass when creating packed or non-packed datasets. + eval_packing (`Optional[bool]`, *optional*, defaults to `None`): + Whether to pack the eval dataset. If `None`, uses the same value as `packing`. + num_of_sequences (`int`, *optional*, defaults to `1024`): + Number of sequences to use for the [`ConstantLengthDataset`]. + chars_per_token (`float`, *optional*, defaults to `3.6`): + Number of characters per token to use for the [`ConstantLengthDataset`]. See + [chars_token_ratio](https://github.com/huggingface/trl/blob/08f550674c553c36c51d1027613c29f14f3676a5/examples/stack_llama/scripts/supervised_finetuning.py#L53) for more details. + use_liger (`bool`, *optional*, defaults to `False`): Monkey patch the model with Liger kernels to increase throughput and reduce memory usage. 
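A short sketch of how the packing-related `SFTConfig` fields above fit together; the dataset column name and `output_dir` are hypothetical, and only fields that appear in this diff are used:

```python
# Sketch: SFTConfig with packing enabled, driving a ConstantLengthDataset under the hood.
from trl import SFTConfig

sft_config = SFTConfig(
    output_dir="sft_out",
    dataset_text_field="text",  # dataset column holding the raw text
    packing=True,               # pack sequences into constant-length chunks
    max_seq_length=512,
    num_of_sequences=1024,      # buffer size used by ConstantLengthDataset
    chars_per_token=3.6,        # characters-per-token estimate for the buffer
)
```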
""" dataset_text_field: Optional[str] = None - packing: Optional[bool] = False + packing: bool = False max_seq_length: Optional[int] = None dataset_num_proc: Optional[int] = None dataset_batch_size: int = 1000 neftune_noise_alpha: Optional[float] = None - model_init_kwargs: Optional[Dict] = None - dataset_kwargs: Optional[Dict] = None + model_init_kwargs: Optional[Dict[str, Any]] = None + dataset_kwargs: Optional[Dict[str, Any]] = None eval_packing: Optional[bool] = None - num_of_sequences: Optional[int] = 1024 - chars_per_token: Optional[float] = 3.6 - use_liger: Optional[bool] = False + num_of_sequences: int = 1024 + chars_per_token: float = 3.6 + use_liger: bool = False diff --git a/trl/trainer/utils.py b/trl/trainer/utils.py index fb180a9da8..b6b709d41a 100644 --- a/trl/trainer/utils.py +++ b/trl/trainer/utils.py @@ -470,11 +470,11 @@ class ConstantLengthDataset(IterableDataset): Number of characters per token used to estimate number of tokens in text buffer. eos_token_id (`int`, *optional*, defaults to `0`): Id of the end of sequence token if the passed tokenizer does not have an EOS token. - shuffle ('bool', *optional*, defaults to True) + shuffle (`bool`, *optional*, defaults to True) Shuffle the examples before they are returned - append_concat_token ('bool', *optional*, defaults to True) + append_concat_token (`bool`, *optional*, defaults to True) If true, appends `eos_token_id` at the end of each sample being packed. - add_special_tokens ('bool', *optional*, defaults to True) + add_special_tokens (`bool`, *optional*, defaults to True) If true, tokenizers adds special tokens to each sample being packed. """ @@ -890,54 +890,79 @@ class OnlineTrainerState(TrainerState): @dataclass class OnPolicyConfig(TrainingArguments): - # common config + r""" + Base configuration class for on-policy trainers. + + Using [`~transformers.HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + run_name (`Optional[str]`, *optional*, defaults to `None`): + Name of the run. + sanity_check (`bool`, *optional*, defaults to `False`): + Whether to run in debug mode. + dataset_num_proc (`Optional[int]`, *optional*, defaults to `None`): + Number of processes to use for processing the dataset. + num_mini_batches (`int`, *optional*, defaults to `1`): + Number of minibatches to split a batch into. + total_episodes (`Optional[int]`, *optional*, defaults to `None`): + Total number of episodes in the dataset. + local_rollout_forward_batch_size (`int`, *optional*, defaults to `64`): + Per rank no grad forward pass in the rollout phase. + num_sample_generations (`int`, *optional*, defaults to `10`): + Number of debugging samples generations (i.e., `generate_completions` calls) throughout training. + response_length (`int`, *optional*, defaults to `53`): + Length of the response. + stop_token (`Optional[str]`, *optional*, defaults to `None`): + Stop token. + stop_token_id (`Optional[int]`, *optional*, defaults to `None`): + Truncation token id. + temperature (`float`, *optional*, defaults to `0.7`): + Sampling temperature. + penalty_reward_value (`int`, *optional*, defaults to `-1`): + Reward value for responses that do not contain `stop_token_id`. + non_eos_penalty (`bool`, *optional*, defaults to `False`): + Whether to penalize responses that do not contain `stop_token_id`. + sft_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`): + Path to the SFT model. 
+ world_size (`Optional[int]`, *optional*, defaults to `None`): + Number of processes (GPUs) to use for the training. + num_total_batches (`Optional[int]`, *optional*, defaults to `None`): + Number of total batches to train. + micro_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`). + local_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`). + batch_size (`Optional[int]`, *optional*, defaults to `None`): + Batch size across devices (HF's `per_device_train_batch_size` * `world_size` * `gradient_accumulation_steps`). + local_mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Mini batch size per GPU. + mini_batch_size (`Optional[int]`, *optional*, defaults to `None`): + Mini batch size across GPUs. + """ + run_name: Optional[str] = None - """a unique name of this run""" sanity_check: bool = False - """wether to run in debug mode""" dataset_num_proc: Optional[int] = None - - # batch size related config num_mini_batches: int = 1 - """Number of minibatches to split a batch into""" total_episodes: Optional[int] = None - """The total number of episodes in the dataset""" local_rollout_forward_batch_size: int = 64 - """per rank no grad forward pass in the rollout phase""" num_sample_generations: int = 10 - """the number of debugging samples generations (i.e., `generate_completions` calls) throughout training""" - - # other config response_length: int = 53 - """the length of the response""" stop_token: Optional[Literal["eos"]] = None - """the stop token""" stop_token_id: Optional[int] = None - """the truncation token id""" temperature: float = 0.7 - """the sampling temperature""" penalty_reward_value: int = -1 - """the reward value for responses that do not contain `stop_token_id`""" non_eos_penalty: bool = False - """whether to penalize responses that do not contain `stop_token_id`""" sft_model_path: str = "EleutherAI/pythia-160m" - """the path to the sft model""" - - # various batch sizes world_size: Optional[int] = None - """The number of processes (GPUs) to use""" num_total_batches: Optional[int] = None - """The number of total batches to train""" micro_batch_size: Optional[int] = None - """The micro batch size across devices (HF's `per_device_train_batch_size` * `world_size`)""" local_batch_size: Optional[int] = None - """The batch size per GPU (HF's `per_device_train_batch_size` * `gradient_accumulation_steps`)""" batch_size: Optional[int] = None - """The batch size across devices (HF's `per_device_train_batch_size` * `world_size` * `gradient_accumulation_steps`)""" local_mini_batch_size: Optional[int] = None - """the mini batch size per GPU""" mini_batch_size: Optional[int] = None - """the mini batch size across GPUs""" def first_true_indices(bools: torch.Tensor, dtype=torch.long):
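To make the batch-size bookkeeping documented for `OnPolicyConfig` concrete, here is a small worked sketch; the numeric values are assumptions, and the `num_mini_batches` split is how the on-policy trainers appear to derive the mini-batch sizes (treat it as illustrative, not as the library's exact code path):

```python
# Worked sketch of the derived batch sizes described in the OnPolicyConfig docstring.
per_device_train_batch_size = 8  # assumed
gradient_accumulation_steps = 4  # assumed
world_size = 2                   # assumed number of processes/GPUs
num_mini_batches = 1             # default documented above

micro_batch_size = per_device_train_batch_size * world_size                          # 16
local_batch_size = per_device_train_batch_size * gradient_accumulation_steps         # 32
batch_size = per_device_train_batch_size * world_size * gradient_accumulation_steps  # 64
local_mini_batch_size = local_batch_size // num_mini_batches                          # 32
mini_batch_size = batch_size // num_mini_batches                                      # 64
print(micro_batch_size, local_batch_size, batch_size, local_mini_batch_size, mini_batch_size)
```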