From 65936cfb8e3bebcfb7c8b515323b0999d3ba8e0d Mon Sep 17 00:00:00 2001
From: Abhay Gupta
Date: Thu, 22 Aug 2024 14:02:22 -0700
Subject: [PATCH] Make sample tokenization extensible (#1478)

---
 llmfoundry/data/finetuning/tasks.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index aaaa5e145a..801813b3ff 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -659,6 +659,9 @@ def __init__(
         self.max_seq_len = max_seq_len
         self.packing_ratio = packing_ratio
 
+    def tokenize_example(self, example: Example) -> TokenizedExample:
+        return tokenize_formatted_example(example, self.tokenizer)
+
     # How to process a sample
     def __getitem__(self, idx: int) -> dict[str, Any]:
         sample = super().__getitem__(idx)
@@ -687,7 +690,7 @@ def __getitem__(self, idx: int) -> dict[str, Any]:
             )
             # Convert to latest format by wrapping sample as a "turn"
             return {'turns': [sample]}
-        return tokenize_formatted_example(sample, tokenizer=self.tokenizer)
+        return self.tokenize_example(sample)
 
     def state_dict(self, num_samples: int,
                    from_beginning: bool) -> dict[str, Any]:
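
Usage note: before this patch, tokenize_formatted_example was called directly
inside __getitem__, so customizing tokenization meant overriding and
re-implementing all of __getitem__. With the tokenize_example hook, a subclass
only overrides that one method. A minimal sketch, assuming the enclosing class
in tasks.py is StreamingFinetuningDataset and using a hypothetical mask_pii
helper (not part of llmfoundry):

    from llmfoundry.data.finetuning.tasks import (
        Example,
        StreamingFinetuningDataset,
        TokenizedExample,
    )

    class PIIMaskedFinetuningDataset(StreamingFinetuningDataset):
        """Hypothetical subclass that pre-processes each sample."""

        def tokenize_example(self, example: Example) -> TokenizedExample:
            # mask_pii is a hypothetical user-supplied helper that scrubs
            # sensitive strings from the raw example before tokenization.
            example = mask_pii(example)
            # Delegate to the default path added in this patch.
            return super().tokenize_example(example)

Because __getitem__ still handles format detection and the "turns" wrapping,
the override sees the same normalized example the default path would, and the
rest of the dataloading pipeline is unchanged.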