Skip to content

Commit

Permalink
fix
Browse files Browse the repository at this point in the history
  • Loading branch information
milocress committed Dec 8, 2024
1 parent 7b8bf5f commit 08a5230
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 0 deletions.
9 changes: 9 additions & 0 deletions llmfoundry/data/finetuning/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
ALLOWED_MESSAGES_KEYS,
ALLOWED_PROMPT_KEYS,
ALLOWED_RESPONSE_KEYS,
BadDatasetSplitError,
ChatTemplateError,
ConsecutiveRepeatedChatRolesError,
DatasetTooSmallError,
Expand Down Expand Up @@ -1047,6 +1048,14 @@ def dataset_mapper(example: dict):
dataset_name=dataset_name,
split=split,
) from error
elif isinstance(error, ValueError) and 'Split name should match' in str(
error,
):
log.error('Huggingface split ValueError during data prep.')
raise BadDatasetSplitError(
dataset_name=dataset_name,
split=split,
) from error
if error is not None:
log.error('Error during data prep')
raise error
Expand Down
12 changes: 12 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,18 @@ def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
super().__init__(message, dataset_name=dataset_name, split=split)


class BadDatasetSplitError(UserError):
    """Error thrown when a HuggingFace dataset has an invalid split name.

    The message tells the user which dataset (and split, when known) is
    affected and quotes the pattern a split name is expected to match.

    Args:
        dataset_name (str): Name of the dataset being loaded.
        split (Optional[str]): The offending split name, if known. When
            ``None``, the message omits the split and reports the dataset as
            misconfigured.
    """

    def __init__(self, dataset_name: str, split: Optional[str] = None) -> None:
        # Single backslashes on purpose: inside a raw string ``\\w`` is two
        # literal backslashes, which would show users a pattern that differs
        # from the one split names are actually validated against.
        split_pattern = r'^\w+(\.\w+)*$'
        hint = f'Please check your split name to make sure it matches the pattern "{split_pattern}"'
        if split is not None:
            message = f'Your dataset (name={dataset_name}, split={split}) has an invalid split. ' + hint
        else:
            message = f'Your dataset (name={dataset_name}) is misconfigured. ' + hint
        super().__init__(message, dataset_name=dataset_name, split=split)


class InvalidDatasetError(UserError):
"""Error thrown when a dataset contains no valid samples for training."""

Expand Down

0 comments on commit 08a5230

Please sign in to comment.