NOTE(review): this patch was recovered from a whitespace-collapsed paste. Line
structure was rebuilt from the hunk headers; the indentation inside each hunk is
inferred from Python structure -- verify against the target files before applying.

diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
index a68a611c52..179f017fd9 100644
--- a/llmfoundry/data/finetuning/tasks.py
+++ b/llmfoundry/data/finetuning/tasks.py
@@ -77,6 +77,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     IncorrectMessageKeyQuantityError,
     InvalidContentTypeError,
     InvalidConversationError,
+    InvalidDatasetError,
     InvalidExampleTypeError,
     InvalidFileExtensionError,
     InvalidLastChatMessageRoleError,
@@ -997,6 +998,12 @@ def dataset_mapper(example: dict):
                     +
                     'the prompt or response was empty, or the response was all padding tokens.',
                 )
+            if len(filtered_dataset) == 0:
+                raise InvalidDatasetError(
+                    f'No valid examples found after filtering out prompts longer than {max_seq_len}, '
+                    +
+                    'examples with empty prompts or responses, and examples with responses that are all padding tokens.',
+                )
         except Exception as e:
             error = e
         # Now local rank 0 indicates to the other ranks that it is done
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index 1fbda7c495..1b9feb9a10 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -397,6 +397,14 @@ def __init__(self, dataset_name: str, split: str) -> None:
         super().__init__(message, dataset_name=dataset_name, split=split)
 
 
+class InvalidDatasetError(UserError):
+    """Error thrown when a dataset contains no valid samples for training."""
+
+    def __init__(self, reason: str) -> None:
+        message = f'Dataset contains no valid samples for training. {reason}'
+        super().__init__(message, reason=reason)
+
+
 class DatasetTooSmallError(UserError):
     """Error thrown when the dataset is too small to be processed."""
 
diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
index 7239bfe958..d7f979713a 100644
--- a/tests/data/test_dataloader.py
+++ b/tests/data/test_dataloader.py
@@ -437,7 +437,7 @@ def test_finetuning_dataloader_safe_load(
         'dataset': {
             'hf_name': hf_name,
             'split': 'train',
-            'max_seq_len': 8,
+            'max_seq_len': 100,
             'decoder_only_format': True,
             'shuffle': True,
             'safe_load': True,