Skip to content

Commit

Permalink
Merge branch 'main' into dbfs-hf
Browse files Browse the repository at this point in the history
  • Loading branch information
KuuCi authored Jun 14, 2024
2 parents 6dcf61f + dbd798e commit d852cbb
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 1 deletion.
4 changes: 3 additions & 1 deletion llmfoundry/data/finetuning/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def build_finetuning_dataloader(
cache_limit=dataset_cfg.get('cache_limit', None),
partition_algo=dataset_cfg.get('partition_algo', 'relaxed'),
num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None),
batch_size=dataset_batch_size,
batch_size=dataloader_batch_size,
shuffle=dataset_cfg.get('shuffle', False),
shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'),
shuffle_seed=dataset_cfg.get('shuffle_seed', 9176),
Expand All @@ -233,6 +233,7 @@ def build_finetuning_dataloader(
max_seq_len=dataset_cfg['max_seq_len'],
allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False),
replication=replication_factor,
packing_ratio=dataloader_batch_size / dataset_batch_size,
)

else:
Expand Down Expand Up @@ -390,6 +391,7 @@ def _validate_config(
'allow_pad_trimming',
'seq_parallel_replication',
'auto_packing_replication',
'max_leftover_bins_to_keep',
}
if not set(kwargs.keys()).issubset(allowed_additional_kwargs):
raise ValueError(
Expand Down
12 changes: 12 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,7 @@ def __init__(
max_seq_len: int = 2048,
allow_unsafe_types: bool = False,
replication: Optional[int] = None,
packing_ratio: Optional[float] = None,
**kwargs: Any,
):

Expand Down Expand Up @@ -644,6 +645,7 @@ def __init__(

self.tokenizer = tokenizer
self.max_seq_len = max_seq_len
self.packing_ratio = packing_ratio

# How to process a sample
def __getitem__(self, idx: int) -> Dict[str, Any]:
Expand Down Expand Up @@ -675,6 +677,16 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
return {'turns': [sample]}
return tokenize_formatted_example(sample, tokenizer=self.tokenizer)

def state_dict(self, num_samples: int,
               from_beginning: bool) -> Dict[str, Any]:
    """Build the dataset state dict, scaling the sample count when packing.

    When ``self.packing_ratio`` is set, ``num_samples`` is multiplied by it
    and truncated to an ``int`` before being handed to the parent class.
    When it is ``None``, ``num_samples`` is forwarded unchanged.

    Args:
        num_samples (int): Sample count reported by the caller.
            NOTE(review): presumably this counts packed samples and the
            scaling converts it to raw dataset samples -- confirm.
        from_beginning (bool): Forwarded untouched to the parent class.

    Returns:
        Dict[str, Any]: Whatever the parent class's ``state_dict`` returns
            for the (possibly scaled) sample count.
    """
    ratio = self.packing_ratio
    # int() truncates toward zero, so fractional products are rounded down.
    effective_samples = (
        num_samples if ratio is None else int(ratio * num_samples)
    )

    return super().state_dict(
        num_samples=effective_samples,
        from_beginning=from_beginning,
    )


class DatasetConstructor:

Expand Down
14 changes: 14 additions & 0 deletions llmfoundry/utils/builders.py
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,20 @@ def _validate_cfg(icl_cfg: Dict[str, Any]):
icl_constructor_kwargs['pad_tok_id'] = pad_tok_id
icl_constructor_kwargs['num_fewshot'] = num_fewshot

# Backwards compatibility: accept "question_prelimiter" as an alternate name for "prelimiter".
if 'question_prelimiter' in icl_constructor_kwargs:
if 'prelimiter' in icl_constructor_kwargs:
raise ValueError(
'Both "question_prelimiter" and "prelimiter" are specified in the ICL task config. '
+
'Please only specify one of them, as they map to the same argument.',
)
else:
icl_constructor_kwargs['prelimiter'
] = icl_constructor_kwargs.pop(
'question_prelimiter',
)

assert early_stopping_criteria is None or isinstance(
early_stopping_criteria,
list,
Expand Down
10 changes: 10 additions & 0 deletions tests/data/test_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from torch.utils.data import DataLoader

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from llmfoundry.data.finetuning.tasks import StreamingFinetuningDataset
from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio
from llmfoundry.utils.builders import build_tokenizer

Expand Down Expand Up @@ -206,6 +207,15 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
if batch_ix >= 3:
break

assert isinstance(loader, DataLoader)
assert isinstance(loader.dataset, StreamingFinetuningDataset)
assert loader.dataset.packing_ratio is not None
assert isinstance(loader.batch_size, int)
assert loader.dataset.packing_ratio == int(loader.batch_size / 6)

state_dict = loader.dataset.state_dict(num_samples=2, from_beginning=False)
assert state_dict['sample_in_epoch'] == 2 * loader.dataset.packing_ratio


@pytest.mark.parametrize('packing_ratio', ['auto', 2.0])
@patch(
Expand Down
40 changes: 40 additions & 0 deletions tests/eval/test_in_context_learning_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
InContextLearningLMAccuracy,
InContextLearningMultipleChoiceAccuracy,
)
from llmfoundry.utils.builders import build_icl_evaluators


def test_strip_data():
Expand Down Expand Up @@ -2588,3 +2589,42 @@ def test_hf_dataloading_custom_parsing(
)
assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:')
assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:')


@pytest.mark.parametrize(
    'prelimiter_key_name',
    ['prelimiter', 'question_prelimiter'],
)
def test_bc_question_prelimiter(
    mpt_tokenizer: transformers.PreTrainedTokenizerBase,
    prelimiter_key_name: str,
):
    """Check that either prelimiter key name ends up on the ICL dataset.

    An ICL task config is built with the prelimiter text stored under
    ``prelimiter_key_name`` (parametrized over ``'prelimiter'`` and
    ``'question_prelimiter'``). After ``build_icl_evaluators`` runs, the
    single resulting evaluator's dataset must expose that text as its
    ``prelimiter`` attribute.
    """
    expected_prelimiter = 'This is a question: '
    local_data = os.path.join(os.path.dirname(__file__), 'local_data')

    # Fixed task settings first; the key under test is added last so the
    # resulting dict has the same contents and key order for both parameters.
    task_cfg = {
        'dataset_uri': f'{local_data}/piqa_small.jsonl',
        'label': 'piqa',
        'icl_task_type': 'multiple_choice',
        'max_seq_len': 64,
        'pad_tok_id': mpt_tokenizer.eos_token_id,
        'num_fewshot': [0],
        'prompt_string': '',
        'example_delimiter': '\n',
        'continuation_delimiter': ': ',
    }
    task_cfg[prelimiter_key_name] = expected_prelimiter

    built_evaluators, _ = build_icl_evaluators(
        icl_tasks=[task_cfg],
        tokenizer=mpt_tokenizer,
        default_batch_size=2,
        default_max_seq_len=128,
    )

    assert len(built_evaluators) == 1
    only_evaluator = built_evaluators[0]
    icl_dataset = only_evaluator.dataloader.dataloader.dataset  # type: ignore
    assert icl_dataset.prelimiter == expected_prelimiter  # type: ignore

0 comments on commit d852cbb

Please sign in to comment.