From 65936cfb8e3bebcfb7c8b515323b0999d3ba8e0d Mon Sep 17 00:00:00 2001
From: Abhay Gupta
Date: Thu, 22 Aug 2024 14:02:22 -0700
Subject: [PATCH] Make sample tokenization extensible (#1478)

---
 llmfoundry/data/finetuning/tasks.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index aaaa5e145a..801813b3ff 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -659,6 +659,9 @@ def __init__(
         self.max_seq_len = max_seq_len
         self.packing_ratio = packing_ratio
 
+    def tokenize_example(self, example: Example) -> TokenizedExample:
+        return tokenize_formatted_example(example, self.tokenizer)
+
     # How to process a sample
     def __getitem__(self, idx: int) -> dict[str, Any]:
         sample = super().__getitem__(idx)
@@ -687,7 +690,7 @@ def __getitem__(self, idx: int) -> dict[str, Any]:
             )
             # Convert to latest format by wrapping sample as a "turn"
             return {'turns': [sample]}
-        return tokenize_formatted_example(sample, tokenizer=self.tokenizer)
+        return self.tokenize_example(sample)
 
     def state_dict(self, num_samples: int,
                    from_beginning: bool) -> dict[str, Any]:
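
Usage note: before this patch, tokenize_formatted_example was called directly
inside __getitem__, so customizing tokenization meant overriding and
re-implementing all of __getitem__. With the tokenize_example hook, a subclass
only overrides that one method. A minimal sketch, assuming the enclosing class
in tasks.py is StreamingFinetuningDataset and using a hypothetical mask_pii
helper (not part of llmfoundry):

    from llmfoundry.data.finetuning.tasks import (
        Example,
        StreamingFinetuningDataset,
        TokenizedExample,
    )

    class PIIMaskedFinetuningDataset(StreamingFinetuningDataset):
        """Hypothetical subclass that pre-processes each sample."""

        def tokenize_example(self, example: Example) -> TokenizedExample:
            # mask_pii is a hypothetical user-supplied helper that scrubs
            # sensitive strings from the raw example before tokenization.
            example = mask_pii(example)
            # Delegate to the default path added in this patch.
            return super().tokenize_example(example)

Because __getitem__ still handles format detection and the "turns" wrapping,
the override sees the same normalized example the default path would, and the
rest of the dataloading pipeline is unchanged.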