From 641d7a24a69952a008671cae02d980fe99fe6d69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 24 Sep 2024 11:30:37 +0000 Subject: [PATCH 1/6] update test --- tests/test_bco_trainer.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/tests/test_bco_trainer.py b/tests/test_bco_trainer.py index 8d5e8b65a5..14e97c1c82 100644 --- a/tests/test_bco_trainer.py +++ b/tests/test_bco_trainer.py @@ -49,19 +49,19 @@ def setUp(self): @parameterized.expand( [ - ["gpt2", True, True], - ["gpt2", True, False], - ["gpt2", False, True], - ["gpt2", False, False], + ["gpt2", True, True, "standard_unpaired_preference"], + ["gpt2", True, False, "standard_unpaired_preference"], + ["gpt2", False, True, "standard_unpaired_preference"], + ["gpt2", False, False, "standard_unpaired_preference"], + ["gpt2", True, True, "conversational_unpaired_preference"], ] ) - def test_bco_trainer(self, name, pre_compute, eval_dataset): + def test_bco_trainer(self, name, pre_compute, eval_dataset, config_name): with tempfile.TemporaryDirectory() as tmp_dir: training_args = BCOConfig( output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=3, - remove_unused_columns=False, gradient_accumulation_steps=1, learning_rate=9e-1, eval_strategy="steps", @@ -70,7 +70,7 @@ def test_bco_trainer(self, name, pre_compute, eval_dataset): report_to="none", ) - dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference") + dummy_dataset = load_dataset("trl-internal-testing/zen", config_name) if name == "gpt2": model = self.model @@ -129,7 +129,6 @@ def test_tokenize_and_process_tokens(self): output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=3, - remove_unused_columns=False, gradient_accumulation_steps=1, learning_rate=9e-1, eval_strategy="steps", @@ -192,7 +191,6 @@ def test_bco_trainer_without_providing_ref_model(self): output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=3, - remove_unused_columns=False, gradient_accumulation_steps=4, learning_rate=9e-1, eval_strategy="steps", @@ -230,7 +228,6 @@ def test_bco_trainer_udm(self): output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=3, - remove_unused_columns=False, gradient_accumulation_steps=4, learning_rate=9e-1, eval_strategy="steps", @@ -289,7 +286,6 @@ def test_bco_trainer_without_providing_ref_model_with_lora(self): output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=3, - remove_unused_columns=False, gradient_accumulation_steps=4, learning_rate=9e-1, eval_strategy="steps", @@ -330,7 +326,6 @@ def test_bco_trainer_generate_during_eval_no_wandb(self): output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=3, - remove_unused_columns=False, gradient_accumulation_steps=1, learning_rate=9e-1, eval_strategy="steps", @@ -376,7 +371,6 @@ def test_bco_lora_save(self): output_dir=tmp_dir, per_device_train_batch_size=2, max_steps=3, - remove_unused_columns=False, gradient_accumulation_steps=4, learning_rate=9e-1, eval_strategy="steps", From 351ed55a846b5ac4783505086e4b4d6b25177417 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 24 Sep 2024 11:30:51 +0000 Subject: [PATCH 2/6] maybe_apply_chat_template --- trl/trainer/bco_trainer.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/trl/trainer/bco_trainer.py b/trl/trainer/bco_trainer.py index acfa9285e7..3751bbe5d5 100644 --- a/trl/trainer/bco_trainer.py +++ b/trl/trainer/bco_trainer.py @@ -46,6 +46,7 @@ from transformers.trainer_utils import EvalLoopOutput, has_length from 
transformers.utils import is_peft_available +from ..data_utils import maybe_apply_chat_template from ..models import PreTrainedModelWrapper, create_reference_model from .bco_config import BCOConfig from .utils import ( @@ -562,6 +563,14 @@ def make_inputs_require_grad(module, input, output): self.embedding_tokenizer = embedding_tokenizer with PartialState().local_main_process_first(): + # Apply the chat template if needed + train_dataset = train_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=args.dataset_num_proc + ) + if eval_dataset is not None: + eval_dataset = eval_dataset.map( + maybe_apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=args.dataset_num_proc + ) # Shuffle the datasets train_dataset = train_dataset.shuffle(seed=args.data_seed) if eval_dataset is not None: From d273e43084508595646c0ddb2c7e6973c957b4b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 24 Sep 2024 11:53:34 +0000 Subject: [PATCH 3/6] simplify bco example --- examples/scripts/bco.py | 99 ++++------------------------------------- 1 file changed, 8 insertions(+), 91 deletions(-) diff --git a/examples/scripts/bco.py b/examples/scripts/bco.py index 5ac9ed4a0d..66cc57dc29 100644 --- a/examples/scripts/bco.py +++ b/examples/scripts/bco.py @@ -18,6 +18,8 @@ # Full training: python examples/scripts/bco.py \ --model_name_or_path=nnheui/stablelm-2-1_6b-sft-full \ + --trust_remote_code \ + --dataset_name trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness \ --per_device_train_batch_size 16 \ --per_device_eval_batch_size 32 \ --num_train_epochs 1 \ @@ -66,88 +68,15 @@ --lora_alpha=16 """ -import logging -from dataclasses import dataclass from functools import partial -from typing import Literal, Optional import torch import torch.nn.functional as F -from accelerate import Accelerator, PartialState -from datasets import Dataset, load_dataset +from accelerate import Accelerator +from datasets import load_dataset from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, PreTrainedModel -from trl import BCOConfig, BCOTrainer, ModelConfig, get_peft_config, setup_chat_format - - -# Define and parse arguments. -@dataclass -class ScriptArguments: - """ - The arguments for the BCO training script. - """ - - llm_name: Literal["gpt-3.5-turbo", "llama-2-7b-chat", "llama-2-70b-chat"] = "gpt-3.5-turbo" - - -def build_helpfulness_dataset(llm_name: str, num_proc: Optional[int] = None) -> Dataset: - """ - Filter `llm_name` completions and binarize given their helpfulness score. - If helpfulness score is 5, it is desirable. Otherwise, it is undesirable. 
- """ - - def get_model_rating(example, metric: str, llm_name: str): - try: - model_index = example["models"].index(llm_name) - return {metric: int(example["completions"][model_index]["annotations"][metric]["Rating"])} - except ValueError as e: - logging.warning(e) - return -1 - - def get_model_response(example, llm_name: str): - try: - model_index = example["models"].index(llm_name) - return {"response": example["completions"][model_index]["response"]} - except ValueError as e: - logging.warning(e) - return -1 - - dataset = load_dataset("openbmb/UltraFeedback")["train"] - - dataset = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=num_proc) - dataset = dataset.filter( - lambda example: len(example["models"]) == len(example["completions"]), batched=False, num_proc=num_proc - ) - - METRIC = "helpfulness" - - dataset = dataset.map( - get_model_rating, - batched=False, - fn_kwargs={"metric": METRIC, "llm_name": llm_name}, - num_proc=num_proc, - ) - - dataset = dataset.map( - get_model_response, - batched=False, - fn_kwargs={"llm_name": llm_name}, - num_proc=num_proc, - ) - - dataset = dataset.select_columns(["source", "instruction", "response", "helpfulness"]) - - dataset = dataset.rename_columns({"instruction": "prompt", "response": "completion"}) - dataset = dataset.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=num_proc) - - dataset = dataset.map( - lambda example: {"prompt": [{"role": "user", "content": example["prompt"]}]}, - batched=False, - num_proc=num_proc, - ) - dataset = dataset.train_test_split(test_size=0.05, seed=42) - - return dataset +from trl import BCOConfig, BCOTrainer, DPOScriptArguments, ModelConfig, get_peft_config, setup_chat_format def embed_prompt(input_ids: torch.LongTensor, attention_mask: torch.LongTensor, model: PreTrainedModel): @@ -174,8 +103,8 @@ def mean_pooling(model_output, attention_mask): if __name__ == "__main__": - parser = HfArgumentParser((ScriptArguments, BCOConfig, ModelConfig)) - script_args, training_args, model_args = parser.parse_args_into_dataclasses() + parser = HfArgumentParser((DPOScriptArguments, BCOConfig, ModelConfig)) + args, training_args, model_args = parser.parse_args_into_dataclasses() training_args.gradient_checkpointing_kwargs = {"use_reentrant": True} @@ -197,19 +126,7 @@ def mean_pooling(model_output, attention_mask): if tokenizer.chat_template is None: model, tokenizer = setup_chat_format(model, tokenizer) - # Apply chat template - def format_dataset(example): - example["prompt"] = tokenizer.apply_chat_template( - example["prompt"], tokenize=False, add_generation_prompt=True - ) - return example - - # Compute that only on the main process for faster data processing. 
- # see: https://github.com/huggingface/trl/pull/1255 - with PartialState().local_main_process_first(): - # Load the dataset - dataset = build_helpfulness_dataset(script_args.llm_name, num_proc=training_args.dataset_num_proc) - dataset = dataset.map(format_dataset, batched=False, num_proc=training_args.dataset_num_proc) + dataset = load_dataset(args.dataset_name) accelerator = Accelerator() embedding_model = AutoModel.from_pretrained( From 8cc574c2997df8a65dad4524d6eeb9e42946c128 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= Date: Tue, 24 Sep 2024 11:58:30 +0000 Subject: [PATCH 4/6] Update documentation --- docs/source/bco_trainer.mdx | 45 ++----------------------------------- 1 file changed, 2 insertions(+), 43 deletions(-) diff --git a/docs/source/bco_trainer.mdx b/docs/source/bco_trainer.mdx index ae094c142b..b19f499577 100644 --- a/docs/source/bco_trainer.mdx +++ b/docs/source/bco_trainer.mdx @@ -6,49 +6,8 @@ For a full example have a look at [`examples/scripts/bco.py`]. ## Expected dataset format -The BCO trainer expects a very specific format for the dataset as it does not require pairwise preferences. Since the model will be trained to directly optimize examples that consist of a prompt, model completion, and a label to indicate whether the completion is "good" or "bad", we expect a dataset with the following columns: - -- `prompt` -- `completion` -- `label` - -for example: - -``` -bco_dataset_dict = { - "prompt": [ - "Hey, hello", - "How are you", - "What is your name?", - "What is your name?", - "Which is the best programming language?", - "Which is the best programming language?", - "Which is the best programming language?", - ], - "completion": [ - "hi nice to meet you", - "leave me alone", - "I don't have a name", - "My name is Mary", - "Python", - "C++", - "Java", - ], - "label": [ - True, - False, - False, - True, - True, - False, - False, - ], -} -``` - -where the `prompt` contains the context inputs, `completion` contains the corresponding responses and `label` contains the corresponding flag that indicates if the generated completion is desired (`True`) or undesired (`False`). -A prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays. It is required that the dataset contains at least one desirable and one undesirable completion. - +The [`BCOTrainer`] requires a [unpaired preference dataset](dataset_formats#unpaired-preference). +The [`BCOTrainer`] supports both [conversational](dataset_formats#conversational-dataset-format) and [standard](dataset_formats#standard-dataset-format) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset. ## Expected model format The BCO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function. 
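
Reviewer note on patches 1–4: the docs now point to the shared dataset-formats guide instead of an inline example dict, and patch 2/6 makes `BCOTrainer` apply the chat template itself via `maybe_apply_chat_template`. The sketch below is illustrative only — the records are stand-ins, and the Qwen checkpoint is borrowed from patch 5's example command rather than the test suite — but it shows the two unpaired-preference layouts the docs reference and the map the trainer now runs internally.

```python
# Illustrative sketch (not part of the patch): the two unpaired-preference layouts
# the updated docs refer to, and the chat-template step PATCH 2/6 adds to BCOTrainer.
from datasets import Dataset
from transformers import AutoTokenizer

from trl.data_utils import maybe_apply_chat_template  # imported as `..data_utils` in the patch

# Standard layout: plain strings plus a desirability label.
standard = Dataset.from_dict(
    {
        "prompt": ["What color is the sky?"],
        "completion": ["It is blue."],
        "label": [True],
    }
)

# Conversational layout: prompt and completion are lists of chat messages.
conversational = Dataset.from_dict(
    {
        "prompt": [[{"role": "user", "content": "What color is the sky?"}]],
        "completion": [[{"role": "assistant", "content": "It is blue."}]],
        "label": [True],
    }
)

# Checkpoint taken from the updated example command; any tokenizer with a chat template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# This is the map the trainer now performs on its own; on a standard dataset it is a no-op.
flattened = conversational.map(maybe_apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
print(flattened[0]["prompt"])  # chat-templated prompt string instead of a message list
```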
From 0058f476243056e1f55df0c8761c9aee87246c27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:54:04 +0200 Subject: [PATCH 5/6] Update examples/scripts/bco.py --- examples/scripts/bco.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/scripts/bco.py b/examples/scripts/bco.py index 66cc57dc29..3c9a4ce13a 100644 --- a/examples/scripts/bco.py +++ b/examples/scripts/bco.py @@ -17,7 +17,7 @@ # Full training: python examples/scripts/bco.py \ - --model_name_or_path=nnheui/stablelm-2-1_6b-sft-full \ + --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \ --trust_remote_code \ --dataset_name trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness \ --per_device_train_batch_size 16 \ From d545d0deeb21b82520223efd879a44c2fdd633fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?= <45557362+qgallouedec@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:54:15 +0200 Subject: [PATCH 6/6] Update docs/source/bco_trainer.mdx Co-authored-by: lewtun --- docs/source/bco_trainer.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/bco_trainer.mdx b/docs/source/bco_trainer.mdx index b19f499577..299a1e78a0 100644 --- a/docs/source/bco_trainer.mdx +++ b/docs/source/bco_trainer.mdx @@ -6,7 +6,7 @@ For a full example have a look at [`examples/scripts/bco.py`]. ## Expected dataset format -The [`BCOTrainer`] requires a [unpaired preference dataset](dataset_formats#unpaired-preference). +The [`BCOTrainer`] requires an [unpaired preference dataset](dataset_formats#unpaired-preference). The [`BCOTrainer`] supports both [conversational](dataset_formats#conversational-dataset-format) and [standard](dataset_formats#standard-dataset-format) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset. ## Expected model format
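
Taken together, the series lets a conversational unpaired-preference dataset be passed straight to `BCOTrainer`, which is what the new parameterized test case exercises. A rough standalone equivalent is sketched below: the model checkpoint and dataset names come from the patches themselves, the hyperparameters mirror the test's smoke-run settings, and the keyword arguments assume this era of TRL (later releases renamed `tokenizer` to `processing_class`), so treat it as a sanity check rather than a training recipe.

```python
# Hedged end-to-end sketch of the behaviour the series enables; not a training recipe.
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from trl import BCOConfig, BCOTrainer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # checkpoint from the updated example command
model = AutoModelForCausalLM.from_pretrained(model_id)
ref_model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Same dataset/config pair the new test case loads.
dataset = load_dataset("trl-internal-testing/zen", "conversational_unpaired_preference")

# Smoke-run values mirroring the test; note the tests no longer pass remove_unused_columns=False.
training_args = BCOConfig(
    output_dir="bco-conversational-smoke-test",
    per_device_train_batch_size=2,
    max_steps=3,
    report_to="none",
)

# No manual chat-template step here: the trainer applies it internally as of PATCH 2/6.
trainer = BCOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()
```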