From 9c376c571f2660342b965a5e417fe0010ba3ff4f Mon Sep 17 00:00:00 2001
From: Kashif Rasul
Date: Thu, 24 Oct 2024 16:47:10 +0200
Subject: [PATCH] [Judges] use the pair-judges in online-preference trainers
 (#2243)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* use the pair-judges

* add test

* Update trl/trainer/online_dpo_trainer.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* Update trl/trainer/online_dpo_trainer.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* decode and skip special characters

* initial nash

* return tensors

* Update trl/trainer/online_dpo_trainer.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* Update trl/trainer/online_dpo_trainer.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* Update trl/trainer/online_dpo_trainer.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* add back the logging

* use batch_decode

* add judges api to XPO trainer

* Update tests/test_online_dpo_trainer.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* judge in examples

* judge in config

* add back logs when using reward model

* typo

* add back model_scores logging when using reward model

* log scores for reward model only

* better cond on what to log

* same for rlhf reward

* Update trl/trainer/online_dpo_trainer.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* use decode_and_strip_padding

* error if both reward and judge or none are set

* remove unused check

* Uniform way to pass conversation into judge

* heading -> leading

* LogCompletionsCallback compat with online method

* Update Online DPO doc

* check if data is conversational for judges

* update example

* remove comment

* use zip

* fix stats xpo

* Replace judge with PairRMJudge and import AutoModelForSequenceClassification

* update xpo documentation

* Remove doc duplication

* update nash doc

* XPO trl chat

* nash md doc

* HfPairwiseJudge

---------

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec
---
 docs/source/nash_md_trainer.md    | 69 +++++++++++++++--------
 docs/source/online_dpo_trainer.md | 81 +++++++++++++++------------
 docs/source/xpo_trainer.mdx       | 71 ++++++++++++++++--------
 examples/scripts/dpo_online.py    | 27 +++++++--
 examples/scripts/nash_md.py       | 26 ++++++++-
 examples/scripts/xpo.py           | 27 ++++++++-
 tests/test_nash_md_trainer.py     | 34 +++++++++++-
 tests/test_online_dpo_trainer.py  | 28 +++++++++-
 tests/test_xpo_trainer.py         | 34 +++++++++++-
 trl/trainer/callbacks.py          |  2 +
 trl/trainer/judges.py             |  4 +-
 trl/trainer/nash_md_trainer.py    | 92 ++++++++++++++++++++++++-------
 trl/trainer/online_dpo_config.py  |  5 +-
 trl/trainer/online_dpo_trainer.py | 78 ++++++++++++++++++--------
 trl/trainer/xpo_trainer.py        | 85 ++++++++++++++++++++++------
 15 files changed, 502 insertions(+), 161 deletions(-)

diff --git a/docs/source/nash_md_trainer.md b/docs/source/nash_md_trainer.md
index 38e955639c..881e57e69c 100644
--- a/docs/source/nash_md_trainer.md
+++ b/docs/source/nash_md_trainer.md
@@ -14,7 +14,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
 
 ## Quick start
 
-This example demonstrates how to train a model using the Nash-MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and the [Qwen 0.5B reward model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward) as the reward model. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
+This example demonstrates how to train a model using the Nash-MD method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) as the base model and [`PairRMJudge`] as a judge. We use the prompts from the [UltraFeedback dataset](https://huggingface.co/datasets/openbmb/UltraFeedback). You can view the prompts in the dataset here:
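For readers of this patch, here is roughly what the rewritten quick start amounts to in code. This is an illustrative sketch, not part of the diff: it assumes the judge-based `NashMDTrainer` API this PR introduces (`judge=` replacing `reward_model=`) and the `trl-lib/ultrafeedback-prompt` prompt-only dataset used elsewhere in the TRL docs; the tokenizer keyword (`tokenizer=` vs. `processing_class=`) differs across TRL versions.

```python
# Hypothetical quick-start script (train_nash_md.py), not part of the diff.
# PairRMJudge wraps llm-blender's PairRM, so it needs `pip install llm-blender`.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import NashMDConfig, NashMDTrainer, PairRMJudge

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# The pairwise judge stands in for the reward model: it ranks two completions per prompt.
judge = PairRMJudge()

# Prompt-only UltraFeedback split; the exact dataset id is an assumption.
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")

training_args = NashMDConfig(output_dir="Qwen2-0.5B-NashMD", logging_steps=10)
trainer = NashMDTrainer(
    model=model,
    judge=judge,  # this PR: pass a judge instead of reward_model=...
    args=training_args,
    tokenizer=tokenizer,  # later TRL versions rename this to processing_class=
    train_dataset=train_dataset,
)
trainer.train()
```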