Skip to content

Commit

Permalink
Make sample tokenization extensible (#1478)
Browse files Browse the repository at this point in the history
  • Loading branch information
gupta-abhay authored Aug 22, 2024
1 parent e235f42 commit 65936cf
Showing 1 changed file with 4 additions and 1 deletion.
5 changes: 4 additions & 1 deletion llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,9 @@ def __init__(
self.max_seq_len = max_seq_len
self.packing_ratio = packing_ratio

def tokenize_example(self, example: Example) -> TokenizedExample:
    """Tokenize one formatted example with this dataset's tokenizer.

    Subclasses may override this hook to customize how a raw formatted
    example is converted into tokenized form.

    Args:
        example (Example): A formatted (prompt/response or chat-style) example.

    Returns:
        TokenizedExample: The tokenized representation of ``example``.
    """
    tokenized = tokenize_formatted_example(example, tokenizer=self.tokenizer)
    return tokenized
# How to process a sample
def __getitem__(self, idx: int) -> dict[str, Any]:
sample = super().__getitem__(idx)
Expand Down Expand Up @@ -687,7 +690,7 @@ def __getitem__(self, idx: int) -> dict[str, Any]:
)
# Convert to latest format by wrapping sample as a "turn"
return {'turns': [sample]}
return tokenize_formatted_example(sample, tokenizer=self.tokenizer)
return self.tokenize_example(sample)

def state_dict(self, num_samples: int,
from_beginning: bool) -> dict[str, Any]:
Expand Down

0 comments on commit 65936cf

Please sign in to comment.