Merge branch 'main' into field
qgallouedec authored Jan 6, 2025
2 parents 2ea9cb9 + d9ee2fd commit b0b6f51
Showing 30 changed files with 439 additions and 201 deletions.
4 changes: 2 additions & 2 deletions docs/source/dpo_trainer.mdx
@@ -278,6 +278,6 @@ dpo_trainer = DPOTrainer(

[[autodoc]] DPOConfig

## PreferenceCollator
## DataCollatorForPreference

[[autodoc]] trainer.dpo_trainer.PreferenceCollator
[[autodoc]] trainer.dpo_trainer.DataCollatorForPreference
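
A minimal usage sketch of the renamed collator, grounded in the tests added in this commit (`tests/test_collators.py`); the example inputs and the expected padding behavior are taken directly from those tests:

```python
from trl.trainer.dpo_trainer import DataCollatorForPreference

collator = DataCollatorForPreference(pad_token_id=0)
examples = [
    {"prompt_input_ids": [1, 2, 3], "chosen_input_ids": [4, 5], "rejected_input_ids": [6]},
    {"prompt_input_ids": [7, 8], "chosen_input_ids": [9, 10], "rejected_input_ids": [11, 12, 13]},
]
batch = collator.torch_call(examples)

# Prompts are left-padded, completions are right-padded
print(batch["prompt_input_ids"].tolist())    # [[1, 2, 3], [0, 7, 8]]
print(batch["rejected_input_ids"].tolist())  # [[6, 0, 0], [11, 12, 13]]
```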
35 changes: 34 additions & 1 deletion docs/source/reducing_memory_usage.md
@@ -51,4 +51,37 @@ training_args = SFTConfig(..., max_length=...)
```

</hfoption>
</hfoptions>

## Packing

<Tip>

This technique applies only to SFT.

</Tip>


[Truncation](#truncation) has several drawbacks:
1. **Loss of information**: Key data at the end of a sequence may be discarded.
2. **Choosing truncation length**: Too short loses data; too long undermines efficiency.

Packing, introduced in [Raffel et al., 2020](https://huggingface.co/papers/1910.10683), addresses these issues by grouping sequences instead of truncating them: dataset sequences are concatenated and then split into chunks of the desired length.

<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing.png" alt="Packing" width="600"/>
</div>
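
To make the transform concrete, here is a minimal sketch of packing (illustrative only, not TRL's internal implementation, which operates on tokenized datasets):

```python
# Concatenate tokenized sequences into one stream, then split the stream
# into chunks of the desired length; the last chunk may be shorter.
def pack(sequences, chunk_length):
    stream = [token for seq in sequences for token in seq]
    return [stream[i : i + chunk_length] for i in range(0, len(stream), chunk_length)]

print(pack([[1, 2, 3], [4, 5], [6, 7, 8, 9]], chunk_length=4))
# [[1, 2, 3, 4], [5, 6, 7, 8], [9]]
```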

Packing eliminates padding, preserves all sequence information, and allows for flexible sequence lengths, making it a more efficient alternative to truncation. To enable packing, use `packing=True` in the [`SFTConfig`]:

```python
from trl import SFTConfig

training_args = SFTConfig(..., packing=True, max_seq_length=512)
```

<Tip warning={true}>

Packing may cause batch contamination, where adjacent sequences influence one another. This can be problematic for some applications. For more details, see [#1230](https://github.com/huggingface/trl/issues/1230).

</Tip>
4 changes: 2 additions & 2 deletions tests/test_bco_trainer.py
@@ -346,8 +346,8 @@ def test_bco_trainer_generate_during_eval_no_wandb(self):

with self.assertRaisesRegex(
ValueError,
expected_regex="`generate_during_eval=True` requires Weights and Biases to be installed."
" Please install with `pip install wandb` to resolve.",
expected_regex="`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
" Please install `wandb` or `comet-ml` to resolve.",
):
BCOTrainer(
model=self.model,
45 changes: 42 additions & 3 deletions tests/test_callbacks.py
@@ -24,7 +24,7 @@
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import is_peft_available

from tests.testing_utils import require_mergekit
from tests.testing_utils import require_comet, require_mergekit
from trl import BasePairwiseJudge, DPOConfig, DPOTrainer, LogCompletionsCallback, MergeModelCallback, WinRateCallback
from trl.mergekit_utils import MergeConfig

@@ -216,7 +216,6 @@ def test_lora(self):
self.assertListEqual(winrate_history, self.expected_winrates)


@require_wandb
class LogCompletionsCallbackTester(unittest.TestCase):
def setUp(self):
self.model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
Expand All @@ -234,7 +233,8 @@ def tokenize_function(examples):

self.generation_config = GenerationConfig(max_length=32)

def test_basic(self):
@require_wandb
def test_basic_wandb(self):
import wandb

with tempfile.TemporaryDirectory() as tmp_dir:
@@ -271,6 +271,45 @@ def test_basic(self):
# Check that the prompt is in the log
self.assertIn(self.dataset["test"][0]["prompt"], completions["data"][0])

@require_comet
def test_basic_comet(self):
import comet_ml

with tempfile.TemporaryDirectory() as tmp_dir:
training_args = TrainingArguments(
output_dir=tmp_dir,
eval_strategy="steps",
eval_steps=2, # evaluate every 2 steps
per_device_train_batch_size=2, # 8 samples in total so 4 batches of 2 per epoch
per_device_eval_batch_size=2,
report_to="comet_ml",
)
trainer = Trainer(
model=self.model,
args=training_args,
train_dataset=self.dataset["train"],
eval_dataset=self.dataset["test"],
processing_class=self.tokenizer,
)
completions_callback = LogCompletionsCallback(trainer, self.generation_config, num_prompts=2)
trainer.add_callback(completions_callback)
trainer.train()

# close the experiment to make sure all pending data is flushed
experiment = comet_ml.get_running_experiment()
assert experiment is not None
experiment.end()

# get the experiment assets and check that all required tables were logged
steps = len(self.dataset["train"]) + len(self.dataset["test"])
tables_logged = int(steps / 2) + 1 # +1 to include zero step

api_experiment = comet_ml.APIExperiment(previous_experiment=experiment.id)
tables = api_experiment.get_asset_list("dataframe")
assert tables is not None
assert len(tables) == tables_logged
assert all(table["fileName"] == "completions.csv" for table in tables)


# On Windows, temporary directory cleanup fails when using the MergeModelCallback.
# This is not an issue with the functionality of the code itself, but it can cause the test to fail
74 changes: 74 additions & 0 deletions tests/test_collators.py
@@ -0,0 +1,74 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import torch

from trl.trainer.dpo_trainer import DataCollatorForPreference


class TestDataCollatorForPreference(unittest.TestCase):
def setUp(self):
self.collator = DataCollatorForPreference(pad_token_id=0)

def assertTensorEqual(self, tensor1, tensor2):
self.assertTrue(torch.equal(tensor1, tensor2), f"Tensors are not equal:\n{tensor1}\n{tensor2}")

def test_padding_behavior(self):
examples = [
{"prompt_input_ids": [1, 2, 3], "chosen_input_ids": [4, 5], "rejected_input_ids": [6]},
{"prompt_input_ids": [7, 8], "chosen_input_ids": [9, 10], "rejected_input_ids": [11, 12, 13]},
]
output = self.collator.torch_call(examples)
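# The collator left-pads prompts and right-pads completions, as the expected tensors below show.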

expected_prompt_input_ids = torch.tensor([[1, 2, 3], [0, 7, 8]])
expected_prompt_attention_mask = torch.tensor([[1, 1, 1], [0, 1, 1]])
expected_chosen_input_ids = torch.tensor([[4, 5], [9, 10]])
expected_chosen_attention_mask = torch.tensor([[1, 1], [1, 1]])
expected_rejected_input_ids = torch.tensor([[6, 0, 0], [11, 12, 13]])
expected_rejected_attention_mask = torch.tensor([[1, 0, 0], [1, 1, 1]])

self.assertTensorEqual(output["prompt_input_ids"], expected_prompt_input_ids)
self.assertTensorEqual(output["prompt_attention_mask"], expected_prompt_attention_mask)
self.assertTensorEqual(output["chosen_input_ids"], expected_chosen_input_ids)
self.assertTensorEqual(output["chosen_attention_mask"], expected_chosen_attention_mask)
self.assertTensorEqual(output["rejected_input_ids"], expected_rejected_input_ids)
self.assertTensorEqual(output["rejected_attention_mask"], expected_rejected_attention_mask)

def test_optional_fields(self):
examples = [
{
"prompt_input_ids": [1],
"chosen_input_ids": [2],
"rejected_input_ids": [3],
"pixel_values": [[[0.1, 0.2], [0.3, 0.4]]], # Example 3D tensor (1x2x2)
},
{
"prompt_input_ids": [4],
"chosen_input_ids": [5],
"rejected_input_ids": [6],
"pixel_values": [[[0.5, 0.6], [0.7, 0.8]]], # Example 3D tensor (1x2x2)
},
]
output = self.collator.torch_call(examples)

expected_pixel_values = torch.tensor(
[
[[[0.1, 0.2], [0.3, 0.4]]],
[[[0.5, 0.6], [0.7, 0.8]]],
]
) # Shape: (2, 1, 2, 2)

self.assertTensorEqual(output["pixel_values"], expected_pixel_values)
20 changes: 19 additions & 1 deletion tests/test_data_collator_completion_only.py
@@ -114,7 +114,7 @@ def test_padding_free(self):
inst1 = "### System: You are a helpful assistant.\n\n### User: How much is 2+2?\n\n### Assistant: 2+2 equals 4"
inst2 = "### System: You are a honest and helpful assistant.\n\n### User: What is the answer of 22x22?\n\n### Assistant: 22x22 equals 484"

response_template = "\n### Assistant:"
response_template = "\n\n### Assistant:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
collator_paddingfree = DataCollatorForCompletionOnlyLM(
response_template, tokenizer=tokenizer, padding_free=True
@@ -143,3 +143,21 @@ def test_padding_free(self):
self.assertTrue((input_ids_remove_pad == batch_paddingfree["input_ids"]).all())
self.assertTrue((expected_position_ids == batch_paddingfree["position_ids"]).all())
self.assertTrue((expected_labels == batch_paddingfree["labels"]).all())

def test_data_collator_for_completion_only_lm(self):
# The tokenizer isn't used, but the collator requires it to be provided.
tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")

collator = DataCollatorForCompletionOnlyLM(tokenizer.decode(9999), tokenizer=tokenizer, padding_free=True)

tokenized_instruction = [
{"input_ids": [1, 2, 3, 9999, 4, 5], "attention_mask": [1, 1, 1, 1, 1, 1]},
{"input_ids": [6, 7, 8, 9, 9999, 10, 11], "attention_mask": [1, 1, 1, 1, 1, 1, 1]},
]
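# With padding_free=True, both sequences are flattened into a single row; their boundaries
# are recovered from position_ids and the cumulative sequence lengths asserted below.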
batch = collator(tokenized_instruction)

self.assertEqual(batch["position_ids"].tolist(), [[0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 6]]) # flat pos ids
self.assertEqual(batch["cu_seq_lens_q"].tolist(), [0, 6, 13]) # start index of each sequence, then the total number of tokens
self.assertEqual(batch["cu_seq_lens_k"].tolist(), [0, 6, 13]) # idem
self.assertEqual(batch["max_length_k"], 7) # max length in batch, here 7 (second sequence)
self.assertEqual(batch["max_length_q"], 7) # idem
4 changes: 2 additions & 2 deletions tests/test_dpo_trainer.py
@@ -571,8 +571,8 @@ def test_dpo_trainer_generate_during_eval_no_wandb(self):

with self.assertRaisesRegex(
ValueError,
expected_regex="`generate_during_eval=True` requires Weights and Biases to be installed."
" Please install `wandb` to resolve.",
expected_regex="`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
" Please install `wandb` or `comet-ml` to resolve.",
):
DPOTrainer(
model=self.model,
4 changes: 2 additions & 2 deletions tests/test_kto_trainer.py
@@ -316,8 +316,8 @@ def test_kto_trainer_generate_during_eval_no_wandb(self):

with self.assertRaisesRegex(
ValueError,
expected_regex="`generate_during_eval=True` requires Weights and Biases to be installed."
" Please install with `pip install wandb` to resolve.",
expected_regex="`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
" Please install `wandb` or `comet-ml` to resolve.",
):
KTOTrainer(
model=self.model,
30 changes: 0 additions & 30 deletions tests/test_rloo_trainer.py
@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import platform
import subprocess
import tempfile
import unittest

@@ -24,34 +22,6 @@
from trl import RLOOConfig, RLOOTrainer


def test():
command = """\
python examples/scripts/rloo/rloo.py \
--dataset_name trl-internal-testing/descriptiveness-sentiment-trl-style \
--dataset_train_split descriptiveness \
--learning_rate 3e-6 \
--output_dir models/minimal/rloo \
--per_device_train_batch_size 4 \
--gradient_accumulation_steps 1 \
--total_episodes 10 \
--model_name_or_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
--sft_model_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
--reward_model_path trl-internal-testing/tiny-Qwen2ForCausalLM-2.5 \
--missing_eos_penalty 1.0 \
--save_strategy no \
--stop_token eos
"""
if platform.system() == "Windows":
# windows CI does not work with subprocesses for some reason
# e.g., https://github.com/huggingface/trl/actions/runs/9600036224/job/26475286210?pr=1743
return
subprocess.run(
command,
shell=True,
check=True,
)


class RLOOTrainerTester(unittest.TestCase):
def setUp(self):
self.model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
9 changes: 8 additions & 1 deletion tests/testing_utils.py
@@ -15,7 +15,7 @@
import random
import unittest

from transformers import is_bitsandbytes_available, is_sklearn_available, is_wandb_available
from transformers import is_bitsandbytes_available, is_comet_available, is_sklearn_available, is_wandb_available

from trl import BaseBinaryJudge, BasePairwiseJudge, is_diffusers_available, is_llm_blender_available
from trl.import_utils import is_mergekit_available
@@ -65,6 +65,13 @@ def require_sklearn(test_case):
return unittest.skipUnless(is_sklearn_available(), "test requires sklearn")(test_case)


def require_comet(test_case):
"""
Decorator marking a test that requires Comet. Skips the test if Comet is not available.
"""
return unittest.skipUnless(is_comet_available(), "test requires comet_ml")(test_case)


class RandomBinaryJudge(BaseBinaryJudge):
"""
Random binary judge, for testing purposes.
10 changes: 9 additions & 1 deletion trl/trainer/bco_config.py
@@ -46,8 +46,10 @@ class BCOConfig(TrainingArguments):
truncation_mode (`str`, *optional*, defaults to `"keep_end"`):
Truncation mode to use when the prompt is too long. Possible values are `"keep_end"` or `"keep_start"`.
This argument is required if you want to use the default data collator.
disable_dropout (`bool`, *optional*, defaults to `True`):
Whether to disable dropout in the model and reference model.
generate_during_eval (`bool`, *optional*, defaults to `False`):
If `True`, generates and logs completions from both the model and the reference model to W&B during
If `True`, generates and logs completions from both the model and the reference model to W&B or Comet during
evaluation.
is_encoder_decoder (`bool` or `None`, *optional*, defaults to `None`):
When using the `model_init` argument (callable) to instantiate the model instead of the `model` argument,
@@ -117,6 +119,12 @@ class BCOConfig(TrainingArguments):
"default data collator."
},
)
disable_dropout: bool = field(
default=True,
metadata={
"help": "Whether to disable dropout in the model and reference model."
},
)
generate_during_eval: bool = field(
default=False,
metadata={
