Skip to content

Commit

Permalink
Error if filtered dataset contains 0 examples (#1585)
Browse files Browse the repository at this point in the history
  • Loading branch information
irenedea authored Oct 11, 2024
1 parent 85b251f commit e6e74a2
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 1 deletion.
7 changes: 7 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
IncorrectMessageKeyQuantityError,
InvalidContentTypeError,
InvalidConversationError,
InvalidDatasetError,
InvalidExampleTypeError,
InvalidFileExtensionError,
InvalidLastChatMessageRoleError,
Expand Down Expand Up @@ -997,6 +998,12 @@ def dataset_mapper(example: dict):
+
'the prompt or response was empty, or the response was all padding tokens.',
)
if len(filtered_dataset) == 0:
raise InvalidDatasetError(
f'No valid examples found after filtering out prompts longer than {max_seq_len}, '
+
'examples with empty prompts or responses, and examples with responses that are all padding tokens.',
)
except Exception as e:
error = e
# Now local rank 0 indicates to the other ranks that it is done
Expand Down
8 changes: 8 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,14 @@ def __init__(self, dataset_name: str, split: str) -> None:
super().__init__(message, dataset_name=dataset_name, split=split)


class InvalidDatasetError(UserError):
    """Raised when a dataset has no valid samples left for training.

    Args:
        reason (str): Explanation of why no samples are valid. It is appended
            to the error message and also forwarded to the base class as the
            ``reason`` keyword argument.
    """

    def __init__(self, reason: str) -> None:
        # Build the message inline and hand both it and the raw reason to the
        # base class in a single call.
        super().__init__(
            f'Dataset contains no valid samples for training. {reason}',
            reason=reason,
        )


class DatasetTooSmallError(UserError):
"""Error thrown when the dataset is too small to be processed."""

Expand Down
2 changes: 1 addition & 1 deletion tests/data/test_dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ def test_finetuning_dataloader_safe_load(
'dataset': {
'hf_name': hf_name,
'split': 'train',
'max_seq_len': 8,
'max_seq_len': 100,
'decoder_only_format': True,
'shuffle': True,
'safe_load': True,
Expand Down

0 comments on commit e6e74a2

Please sign in to comment.