Skip to content

Commit

Permalink
attempt
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg committed Jun 13, 2024
1 parent 5571101 commit 08a505d
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 0 deletions.
1 change: 1 addition & 0 deletions llmfoundry/data/finetuning/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def build_finetuning_dataloader(
max_seq_len=dataset_cfg['max_seq_len'],
allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False),
replication=replication_factor,
packing_ratio=dataloader_batch_size / dataset_batch_size,
)

else:
Expand Down
11 changes: 11 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ def __init__(
max_seq_len: int = 2048,
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
packing_ratio: Optional[float] = None,
**kwargs: Any,
):

Expand Down Expand Up @@ -644,6 +645,7 @@ def __init__(

self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
self.packing_ratio = packing_ratio

# How to process a sample
def __getitem__(self, idx: int) -> Dict[str, Any]:
Expand Down Expand Up @@ -675,6 +677,15 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
return {'turns': [sample]}
return tokenize_formatted_example(sample, tokenizer=self.tokenizer)

def state_dict(self, num_samples: int,
               from_beginning: bool) -> Dict[str, Any]:
    """Return the parent class's state dict, scaling the sample count first.

    When ``self.packing_ratio`` is set, ``num_samples`` is multiplied by it
    before being forwarded to ``super().state_dict``.
    NOTE(review): presumably this converts a count of packed samples back
    into a count of underlying dataset samples -- confirm against the
    dataloader that supplies ``packing_ratio``.

    Args:
        num_samples: Sample count reported by the caller.
        from_beginning: Forwarded unchanged to the parent implementation.

    Returns:
        Whatever ``super().state_dict`` returns for the (possibly scaled)
        sample count.
    """
    if self.packing_ratio is not None:
        # packing_ratio is a float, so the product is a float even when it
        # is mathematically whole. Convert back to int to honour the
        # ``num_samples: int`` contract of this method and its parent.
        # Truncation rounds down, so any fractional remainder under-counts
        # rather than over-counts the samples reported.
        num_samples = int(self.packing_ratio * num_samples)

    return super().state_dict(
        num_samples=num_samples,
        from_beginning=from_beginning,
    )


class DatasetConstructor:

Expand Down

0 comments on commit 08a505d

Please sign in to comment.