From 44a06fc48739e6f5e6a1d7e85a06c6e128746b09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Quentin=20Gallou=C3=A9dec?=
 <45557362+qgallouedec@users.noreply.github.com>
Date: Tue, 24 Sep 2024 18:15:57 +0200
Subject: [PATCH] `BCOTrainer` conversational dataset support (#2107)

* update test

* maybe_apply_chat_template

* simplify bco example

* Update documentation

* Update examples/scripts/bco.py

* Update docs/source/bco_trainer.mdx

Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>

---------

Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
---
 docs/source/bco_trainer.mdx |  45 +---------------
 examples/scripts/bco.py     | 101 ++++--------------------------------
 tests/test_bco_trainer.py   |  20 +++----
 trl/trainer/bco_trainer.py  |   9 ++++
 4 files changed, 27 insertions(+), 148 deletions(-)

diff --git a/docs/source/bco_trainer.mdx b/docs/source/bco_trainer.mdx
index ae094c142b..299a1e78a0 100644
--- a/docs/source/bco_trainer.mdx
+++ b/docs/source/bco_trainer.mdx
@@ -6,49 +6,8 @@ For a full example have a look at  [`examples/scripts/bco.py`].
 
 ## Expected dataset format
 
-The BCO trainer expects a very specific format for the dataset as it does not require pairwise preferences. Since the model will be trained to directly optimize examples that consist of a prompt, model completion, and a label to indicate whether the completion is "good" or "bad", we expect a dataset with the following columns:
-
-- `prompt`
-- `completion`
-- `label`
-
-for example:
-
-```
-bco_dataset_dict = {
-    "prompt": [
-        "Hey, hello",
-        "How are you",
-        "What is your name?",
-        "What is your name?",
-        "Which is the best programming language?",
-        "Which is the best programming language?",
-        "Which is the best programming language?",
-    ],
-    "completion": [
-        "hi nice to meet you",
-        "leave me alone",
-        "I don't have a name",
-        "My name is Mary",
-        "Python",
-        "C++",
-        "Java",
-    ],
-    "label": [
-        True,
-        False,
-        False,
-        True,
-        True,
-        False,
-        False,
-    ],
-}
-```
-
-where the `prompt` contains the context inputs, `completion` contains the corresponding responses and `label` contains the corresponding flag that indicates if the generated completion is desired (`True`) or undesired (`False`).
-A prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays. It is required that the dataset contains at least one desirable and one undesirable completion.
-
+The [`BCOTrainer`] requires an [unpaired preference dataset](dataset_formats#unpaired-preference).
+The [`BCOTrainer`] supports both [conversational](dataset_formats#conversational-dataset-format) and [standard](dataset_formats#standard-dataset-format) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
 
 ## Expected model format
 The BCO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
diff --git a/examples/scripts/bco.py b/examples/scripts/bco.py
index 5ac9ed4a0d..3c9a4ce13a 100644
--- a/examples/scripts/bco.py
+++ b/examples/scripts/bco.py
@@ -17,7 +17,9 @@
 
 # Full training:
 python examples/scripts/bco.py \
-    --model_name_or_path=nnheui/stablelm-2-1_6b-sft-full \
+    --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
+    --trust_remote_code \
+    --dataset_name trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness \
     --per_device_train_batch_size 16 \
     --per_device_eval_batch_size 32 \
     --num_train_epochs 1 \
@@ -66,88 +68,15 @@
     --lora_alpha=16
 """
 
-import logging
-from dataclasses import dataclass
 from functools import partial
-from typing import Literal, Optional
 
 import torch
 import torch.nn.functional as F
-from accelerate import Accelerator, PartialState
-from datasets import Dataset, load_dataset
+from accelerate import Accelerator
+from datasets import load_dataset
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, PreTrainedModel
 
-from trl import BCOConfig, BCOTrainer, ModelConfig, get_peft_config, setup_chat_format
-
-
-# Define and parse arguments.
-@dataclass
-class ScriptArguments:
-    """
-    The arguments for the BCO training script.
-    """
-
-    llm_name: Literal["gpt-3.5-turbo", "llama-2-7b-chat", "llama-2-70b-chat"] = "gpt-3.5-turbo"
-
-
-def build_helpfulness_dataset(llm_name: str, num_proc: Optional[int] = None) -> Dataset:
-    """
-    Filter `llm_name` completions and binarize given their helpfulness score.
-    If helpfulness score is 5, it is desirable. Otherwise, it is undesirable.
-    """
-
-    def get_model_rating(example, metric: str, llm_name: str):
-        try:
-            model_index = example["models"].index(llm_name)
-            return {metric: int(example["completions"][model_index]["annotations"][metric]["Rating"])}
-        except ValueError as e:
-            logging.warning(e)
-            return -1
-
-    def get_model_response(example, llm_name: str):
-        try:
-            model_index = example["models"].index(llm_name)
-            return {"response": example["completions"][model_index]["response"]}
-        except ValueError as e:
-            logging.warning(e)
-            return -1
-
-    dataset = load_dataset("openbmb/UltraFeedback")["train"]
-
-    dataset = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=num_proc)
-    dataset = dataset.filter(
-        lambda example: len(example["models"]) == len(example["completions"]), batched=False, num_proc=num_proc
-    )
-
-    METRIC = "helpfulness"
-
-    dataset = dataset.map(
-        get_model_rating,
-        batched=False,
-        fn_kwargs={"metric": METRIC, "llm_name": llm_name},
-        num_proc=num_proc,
-    )
-
-    dataset = dataset.map(
-        get_model_response,
-        batched=False,
-        fn_kwargs={"llm_name": llm_name},
-        num_proc=num_proc,
-    )
-
-    dataset = dataset.select_columns(["source", "instruction", "response", "helpfulness"])
-
-    dataset = dataset.rename_columns({"instruction": "prompt", "response": "completion"})
-    dataset = dataset.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=num_proc)
-
-    dataset = dataset.map(
-        lambda example: {"prompt": [{"role": "user", "content": example["prompt"]}]},
-        batched=False,
-        num_proc=num_proc,
-    )
-    dataset = dataset.train_test_split(test_size=0.05, seed=42)
-
-    return dataset
+from trl import BCOConfig, BCOTrainer, DPOScriptArguments, ModelConfig, get_peft_config, setup_chat_format
 
 
 def embed_prompt(input_ids: torch.LongTensor, attention_mask: torch.LongTensor, model: PreTrainedModel):
@@ -174,8 +103,8 @@ def mean_pooling(model_output, attention_mask):
 
 
 if __name__ == "__main__":
-    parser = HfArgumentParser((ScriptArguments, BCOConfig, ModelConfig))
-    script_args, training_args, model_args = parser.parse_args_into_dataclasses()
+    parser = HfArgumentParser((DPOScriptArguments, BCOConfig, ModelConfig))
+    args, training_args, model_args = parser.parse_args_into_dataclasses()
 
     training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
 
@@ -197,19 +126,7 @@ def mean_pooling(model_output, attention_mask):
     if tokenizer.chat_template is None:
         model, tokenizer = setup_chat_format(model, tokenizer)
 
-    # Apply chat template
-    def format_dataset(example):
-        example["prompt"] = tokenizer.apply_chat_template(
-            example["prompt"], tokenize=False, add_generation_prompt=True
-        )
-        return example
-
-    # Compute that only on the main process for faster data processing.
-    # see: https://github.com/huggingface/trl/pull/1255
-    with PartialState().local_main_process_first():
-        # Load the dataset
-        dataset = build_helpfulness_dataset(script_args.llm_name, num_proc=training_args.dataset_num_proc)
-        dataset = dataset.map(format_dataset, batched=False, num_proc=training_args.dataset_num_proc)
+    dataset = load_dataset(args.dataset_name)
 
     accelerator = Accelerator()
     embedding_model = AutoModel.from_pretrained(
diff --git a/tests/test_bco_trainer.py b/tests/test_bco_trainer.py
index 8d5e8b65a5..14e97c1c82 100644
--- a/tests/test_bco_trainer.py
+++ b/tests/test_bco_trainer.py
@@ -49,19 +49,19 @@ def setUp(self):
 
     @parameterized.expand(
         [
-            ["gpt2", True, True],
-            ["gpt2", True, False],
-            ["gpt2", False, True],
-            ["gpt2", False, False],
+            ["gpt2", True, True, "standard_unpaired_preference"],
+            ["gpt2", True, False, "standard_unpaired_preference"],
+            ["gpt2", False, True, "standard_unpaired_preference"],
+            ["gpt2", False, False, "standard_unpaired_preference"],
+            ["gpt2", True, True, "conversational_unpaired_preference"],
         ]
     )
-    def test_bco_trainer(self, name, pre_compute, eval_dataset):
+    def test_bco_trainer(self, name, pre_compute, eval_dataset, config_name):
         with tempfile.TemporaryDirectory() as tmp_dir:
             training_args = BCOConfig(
                 output_dir=tmp_dir,
                 per_device_train_batch_size=2,
                 max_steps=3,
-                remove_unused_columns=False,
                 gradient_accumulation_steps=1,
                 learning_rate=9e-1,
                 eval_strategy="steps",
@@ -70,7 +70,7 @@ def test_bco_trainer(self, name, pre_compute, eval_dataset):
                 report_to="none",
             )
 
-            dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_unpaired_preference")
+            dummy_dataset = load_dataset("trl-internal-testing/zen", config_name)
 
             if name == "gpt2":
                 model = self.model
@@ -129,7 +129,6 @@ def test_tokenize_and_process_tokens(self):
                 output_dir=tmp_dir,
                 per_device_train_batch_size=2,
                 max_steps=3,
-                remove_unused_columns=False,
                 gradient_accumulation_steps=1,
                 learning_rate=9e-1,
                 eval_strategy="steps",
@@ -192,7 +191,6 @@ def test_bco_trainer_without_providing_ref_model(self):
                 output_dir=tmp_dir,
                 per_device_train_batch_size=2,
                 max_steps=3,
-                remove_unused_columns=False,
                 gradient_accumulation_steps=4,
                 learning_rate=9e-1,
                 eval_strategy="steps",
@@ -230,7 +228,6 @@ def test_bco_trainer_udm(self):
                 output_dir=tmp_dir,
                 per_device_train_batch_size=2,
                 max_steps=3,
-                remove_unused_columns=False,
                 gradient_accumulation_steps=4,
                 learning_rate=9e-1,
                 eval_strategy="steps",
@@ -289,7 +286,6 @@ def test_bco_trainer_without_providing_ref_model_with_lora(self):
                 output_dir=tmp_dir,
                 per_device_train_batch_size=2,
                 max_steps=3,
-                remove_unused_columns=False,
                 gradient_accumulation_steps=4,
                 learning_rate=9e-1,
                 eval_strategy="steps",
@@ -330,7 +326,6 @@ def test_bco_trainer_generate_during_eval_no_wandb(self):
                 output_dir=tmp_dir,
                 per_device_train_batch_size=2,
                 max_steps=3,
-                remove_unused_columns=False,
                 gradient_accumulation_steps=1,
                 learning_rate=9e-1,
                 eval_strategy="steps",
@@ -376,7 +371,6 @@ def test_bco_lora_save(self):
                 output_dir=tmp_dir,
                 per_device_train_batch_size=2,
                 max_steps=3,
-                remove_unused_columns=False,
                 gradient_accumulation_steps=4,
                 learning_rate=9e-1,
                 eval_strategy="steps",
diff --git a/trl/trainer/bco_trainer.py b/trl/trainer/bco_trainer.py
index acfa9285e7..3751bbe5d5 100644
--- a/trl/trainer/bco_trainer.py
+++ b/trl/trainer/bco_trainer.py
@@ -46,6 +46,7 @@
 from transformers.trainer_utils import EvalLoopOutput, has_length
 from transformers.utils import is_peft_available
 
+from ..data_utils import maybe_apply_chat_template
 from ..models import PreTrainedModelWrapper, create_reference_model
 from .bco_config import BCOConfig
 from .utils import (
@@ -562,6 +563,14 @@ def make_inputs_require_grad(module, input, output):
         self.embedding_tokenizer = embedding_tokenizer
 
         with PartialState().local_main_process_first():
+            # Apply the chat template if needed
+            train_dataset = train_dataset.map(
+                maybe_apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=args.dataset_num_proc
+            )
+            if eval_dataset is not None:
+                eval_dataset = eval_dataset.map(
+                    maybe_apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=args.dataset_num_proc
+                )
             # Shuffle the datasets
             train_dataset = train_dataset.shuffle(seed=args.data_seed)
             if eval_dataset is not None: