From 43509903e208828a29e39363a72ed7b56171bbba Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Fri, 14 Jun 2024 14:24:27 -0700
Subject: [PATCH 1/2] Fix backwards compatibility for ICL arg (#1286)

---
 llmfoundry/utils/builders.py                  | 14 +++++++
 .../eval/test_in_context_learning_datasets.py | 40 +++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py
index f9e84aab45..ada553c52f 100644
--- a/llmfoundry/utils/builders.py
+++ b/llmfoundry/utils/builders.py
@@ -595,6 +595,20 @@ def _validate_cfg(icl_cfg: Dict[str, Any]):
     icl_constructor_kwargs['pad_tok_id'] = pad_tok_id
     icl_constructor_kwargs['num_fewshot'] = num_fewshot
 
+    # Support backwards compatibility for the naming of "prelimiter" as "question_prelimiter"
+    if 'question_prelimiter' in icl_constructor_kwargs:
+        if 'prelimiter' in icl_constructor_kwargs:
+            raise ValueError(
+                'Both "question_prelimiter" and "prelimiter" are specified in the ICL task config. '
+                +
+                'Please only specify one of them, as they map to the same argument.',
+            )
+        else:
+            icl_constructor_kwargs['prelimiter'
+                                  ] = icl_constructor_kwargs.pop(
+                                      'question_prelimiter',
+                                  )
+
     assert early_stopping_criteria is None or isinstance(
         early_stopping_criteria,
         list,
diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py
index b5eacdeb0f..81769a18e6 100644
--- a/tests/eval/test_in_context_learning_datasets.py
+++ b/tests/eval/test_in_context_learning_datasets.py
@@ -37,6 +37,7 @@
     InContextLearningLMAccuracy,
     InContextLearningMultipleChoiceAccuracy,
 )
+from llmfoundry.utils.builders import build_icl_evaluators
 
 
 def test_strip_data():
@@ -2588,3 +2589,42 @@ def test_hf_dataloading_custom_parsing(
     )
     assert decoded_batch[0].endswith('Orbs: quas wex exort\nSpell:')
     assert decoded_batch[1].endswith('Orbs: quas quas quas\nSpell:')
+
+
+@pytest.mark.parametrize(
+    'prelimiter_key_name',
+    ['prelimiter', 'question_prelimiter'],
+)
+def test_bc_question_prelimiter(
+    mpt_tokenizer: transformers.PreTrainedTokenizerBase,
+    prelimiter_key_name: str,
+):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    dataset_uri = f'{local_data}/piqa_small.jsonl'
+
+    icl_tasks = [
+        {
+            'dataset_uri': dataset_uri,
+            'label': 'piqa',
+            'icl_task_type': 'multiple_choice',
+            'max_seq_len': 64,
+            'pad_tok_id': mpt_tokenizer.eos_token_id,
+            'num_fewshot': [0],
+            'prompt_string': '',
+            'example_delimiter': '\n',
+            'continuation_delimiter': ': ',
+            prelimiter_key_name: 'This is a question: ',
+        },
+    ]
+
+    evaluators, _ = build_icl_evaluators(
+        icl_tasks=icl_tasks,
+        tokenizer=mpt_tokenizer,
+        default_batch_size=2,
+        default_max_seq_len=128,
+    )
+
+    assert len(evaluators) == 1
+    evaluator = evaluators[0]
+    assert evaluator.dataloader.dataloader.dataset.prelimiter == 'This is a question: '  # type: ignore

From dbd798e00c474cba5f2d53c1bca476c241f1aa84 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Fri, 14 Jun 2024 14:58:13 -0700
Subject: [PATCH 2/2] Fix packing + streaming + resumption (#1277)

---
 llmfoundry/data/finetuning/dataloader.py |  4 +++-
 llmfoundry/data/finetuning/tasks.py      | 12 ++++++++++++
 tests/data/test_packing.py               | 10 ++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index 639beba6f0..160e9bfe3b 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -222,7 +222,7 @@ def build_finetuning_dataloader(
             cache_limit=dataset_cfg.get('cache_limit', None),
             partition_algo=dataset_cfg.get('partition_algo', 'relaxed'),
             num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None),
-            batch_size=dataset_batch_size,
+            batch_size=dataloader_batch_size,
             shuffle=dataset_cfg.get('shuffle', False),
             shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'),
             shuffle_seed=dataset_cfg.get('shuffle_seed', 9176),
@@ -233,6 +233,7 @@ def build_finetuning_dataloader(
             max_seq_len=dataset_cfg['max_seq_len'],
             allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False),
             replication=replication_factor,
+            packing_ratio=dataloader_batch_size / dataset_batch_size,
         )
 
     else:
@@ -390,6 +391,7 @@ def _validate_config(
         'allow_pad_trimming',
         'seq_parallel_replication',
         'auto_packing_replication',
+        'max_leftover_bins_to_keep',
     }
     if not set(kwargs.keys()).issubset(allowed_additional_kwargs):
         raise ValueError(
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index 40f178fb6e..9a0f680bd7 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -592,6 +592,7 @@ def __init__(
         max_seq_len: int = 2048,
         allow_unsafe_types: bool = False,
         replication: Optional[int] = None,
+        packing_ratio: Optional[float] = None,
         **kwargs: Any,
     ):
 
@@ -644,6 +645,7 @@ def __init__(
 
         self.tokenizer = tokenizer
         self.max_seq_len = max_seq_len
+        self.packing_ratio = packing_ratio
 
     # How to process a sample
     def __getitem__(self, idx: int) -> Dict[str, Any]:
@@ -675,6 +677,16 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
             return {'turns': [sample]}
         return tokenize_formatted_example(sample, tokenizer=self.tokenizer)
 
+    def state_dict(self, num_samples: int,
+                   from_beginning: bool) -> Dict[str, Any]:
+        if self.packing_ratio is not None:
+            num_samples = int(self.packing_ratio * num_samples)
+
+        return super().state_dict(
+            num_samples=num_samples,
+            from_beginning=from_beginning,
+        )
+
 
 class DatasetConstructor:
 
diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py
index b910b8c5ff..d181dbde0b 100644
--- a/tests/data/test_packing.py
+++ b/tests/data/test_packing.py
@@ -14,6 +14,7 @@
 from torch.utils.data import DataLoader
 
 from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
+from llmfoundry.data.finetuning.tasks import StreamingFinetuningDataset
 from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio
 from llmfoundry.utils.builders import build_tokenizer
 
@@ -206,6 +207,15 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
         if batch_ix >= 3:
             break
 
+    assert isinstance(loader, DataLoader)
+    assert isinstance(loader.dataset, StreamingFinetuningDataset)
+    assert loader.dataset.packing_ratio is not None
+    assert isinstance(loader.batch_size, int)
+    assert loader.dataset.packing_ratio == int(loader.batch_size / 6)
+
+    state_dict = loader.dataset.state_dict(num_samples=2, from_beginning=False)
+    assert state_dict['sample_in_epoch'] == 2 * loader.dataset.packing_ratio
+
 
 @pytest.mark.parametrize('packing_ratio', ['auto', 2.0])
 @patch(
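For reference, the backwards-compatibility shim added to builders.py in the first patch is an instance of a common deprecated-kwarg remapping pattern: accept the old name, reject configs that set both names, and pop the old key onto the new one. A minimal standalone sketch of that pattern follows; remap_deprecated_kwarg and the example config below are illustrative stand-ins, not llm-foundry APIs.

# Minimal sketch of the deprecated-kwarg remapping pattern from the first
# patch. remap_deprecated_kwarg is a hypothetical helper, not llm-foundry code.
from typing import Any, Dict


def remap_deprecated_kwarg(
    kwargs: Dict[str, Any],
    old_name: str,
    new_name: str,
) -> Dict[str, Any]:
    if old_name in kwargs:
        if new_name in kwargs:
            # Both spellings present: the config is ambiguous, so fail loudly.
            raise ValueError(
                f'Both "{old_name}" and "{new_name}" are specified. '
                'Please only specify one of them, as they map to the same argument.',
            )
        # Old spelling only: silently translate it to the new name.
        kwargs[new_name] = kwargs.pop(old_name)
    return kwargs


# Usage: an old-style ICL task config still resolves to the new argument name.
cfg = {'question_prelimiter': 'This is a question: ', 'num_fewshot': 0}
cfg = remap_deprecated_kwarg(cfg, 'question_prelimiter', 'prelimiter')
assert cfg == {'prelimiter': 'This is a question: ', 'num_fewshot': 0}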
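The second patch's resumption fix rests on one observation: when packing is enabled, every sample the trainer counts corresponds to roughly packing_ratio raw streaming samples, so the sample count written into the resumption state must be scaled up by that ratio before the parent streaming dataset records it. Below is a rough sketch of that scaling under a stand-in parent class; _FakeStreamingDataset and PackedFinetuningDataset are illustrative, not the llm-foundry or streaming implementations.

# Rough sketch of packing-aware resumption state. _FakeStreamingDataset and
# PackedFinetuningDataset are stand-ins for streaming's StreamingDataset and
# llm-foundry's StreamingFinetuningDataset.
from typing import Any, Dict, Optional


class _FakeStreamingDataset:

    def state_dict(self, num_samples: int, from_beginning: bool) -> Dict[str, Any]:
        # The real parent persists how far into the epoch the stream has advanced.
        return {'sample_in_epoch': num_samples, 'from_beginning': from_beginning}


class PackedFinetuningDataset(_FakeStreamingDataset):

    def __init__(self, packing_ratio: Optional[float] = None):
        self.packing_ratio = packing_ratio

    def state_dict(self, num_samples: int, from_beginning: bool) -> Dict[str, Any]:
        # Each packed sample consumed ~packing_ratio raw samples, so scale the
        # trainer-visible count back up before recording epoch progress.
        if self.packing_ratio is not None:
            num_samples = int(self.packing_ratio * num_samples)
        return super().state_dict(
            num_samples=num_samples,
            from_beginning=from_beginning,
        )


# Usage mirrors the assertion in test_packing.py: 2 trainer-visible samples at
# a packing ratio of 4 fast-forward the stream by 8 raw samples on resumption.
ds = PackedFinetuningDataset(packing_ratio=4)
assert ds.state_dict(num_samples=2, from_beginning=False)['sample_in_epoch'] == 8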