Skip to content

Commit

Permalink
Error on text dataset file not found (#1534)
Browse files Browse the repository at this point in the history
  • Loading branch information
milocress authored Sep 22, 2024
1 parent d7c7822 commit 14cff66
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 5 deletions.
15 changes: 10 additions & 5 deletions llmfoundry/command_utils/data_prep/convert_text_to_mds.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
CannotUnicodeDecodeFile,
DatasetTooSmallError,
InputFolderMissingDataError,
InputFolderNotFound,
OutputFolderNotEmptyError,
)

Expand Down Expand Up @@ -125,11 +126,15 @@ def get_object_names(input_folder: str) -> list[str]:
object_store = maybe_create_object_store_from_uri(input_folder)
if object_store is not None:
_, _, folder_prefix = parse_uri(input_folder)
names = [
name for name in object_store.list_objects(folder_prefix)
if name.endswith('.txt')
]
log.info(f'Found {len(names)} text files in remote storage')
try:
names = [
name for name in object_store.list_objects(folder_prefix)
if name.endswith('.txt')
]
log.info(f'Found {len(names)} text files in remote storage')
except FileNotFoundError:
raise InputFolderNotFound(folder_prefix)

else:
# input_folder is a local folder
names = [
Expand Down
11 changes: 11 additions & 0 deletions llmfoundry/utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,17 @@ def __init__(self, input_folder: str) -> None:
super().__init__(message, input_folder=input_folder)


class InputFolderNotFound(UserError):
    """Error thrown when a folder is not found."""

    def __init__(self, folder_that_was_not_found: str) -> None:
        # The folder is embedded in the human-readable message and is also
        # forwarded to the base class as a keyword argument.
        super().__init__(
            f'{folder_that_was_not_found} not found.',
            folder_that_was_not_found=folder_that_was_not_found,
        )


class CannotUnicodeDecodeFile(UserError):
"""Error thrown when the input folder is missing data."""

Expand Down

0 comments on commit 14cff66

Please sign in to comment.