Error if filtered dataset contains 0 examples (#1585)

mosaicml · Oct 11, 2024 · e6e74a2
1 parent 85b251f
commit e6e74a2
Show file tree

Hide file tree

Showing 3 changed files with 16 additions and 1 deletion.
diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py
@@ -77,6 +77,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
     IncorrectMessageKeyQuantityError,
     InvalidContentTypeError,
     InvalidConversationError,
+    InvalidDatasetError,
     InvalidExampleTypeError,
     InvalidFileExtensionError,
     InvalidLastChatMessageRoleError,
@@ -997,6 +998,12 @@ def dataset_mapper(example: dict):
                     +
                     'the prompt or response was empty, or the response was all padding tokens.',
                 )
+            if len(filtered_dataset) == 0:
+                raise InvalidDatasetError(
+                    f'No valid examples found after filtering out prompts longer than {max_seq_len}, '
+                    +
+                    'examples with empty prompts or responses, and examples with responses that are all padding tokens.',
+                )
         except Exception as e:
             error = e
         # Now local rank 0 indicates to the other ranks that it is done

diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
@@ -397,6 +397,14 @@ def __init__(self, dataset_name: str, split: str) -> None:
         super().__init__(message, dataset_name=dataset_name, split=split)
 
 
+class InvalidDatasetError(UserError):
+    """Error thrown when a dataset contains no valid samples for training."""
+    # `reason` is appended to a fixed message prefix and also passed as a kwarg.
+    def __init__(self, reason: str) -> None:
+        message = f'Dataset contains no valid samples for training. {reason}'
+        super().__init__(message, reason=reason)
+
+
 class DatasetTooSmallError(UserError):
     """Error thrown when the dataset is too small to be processed."""
 

diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py
@@ -437,7 +437,7 @@ def test_finetuning_dataloader_safe_load(
         'dataset': {
             'hf_name': hf_name,
             'split': 'train',
-            'max_seq_len': 8,
+            'max_seq_len': 100,
             'decoder_only_format': True,
             'shuffle': True,
             'safe_load': True,