diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index f7c502a53e..7c40a7e698 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -29,6 +29,7 @@
     merge_shard_groups,
 )
 from llmfoundry.utils.exceptions import (
+    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -71,31 +72,35 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
                 buffer += self.bos_tokens
                 first_chunk = True
                 # Read the file in 1MB chunks to avoid memory issues
-                for chunk in iter(partial(f.read, 1000000), ''):
-                    # Tokenize the chunk
-                    encoded = self.tokenizer(
-                        chunk,
-                        truncation=False,
-                        padding=False,
-                    )
-                    iids = encoded['input_ids']
-
-                    # If this is not the first chunk, remove the BOS token
-                    if not first_chunk:
-                        if iids[0] == self.tokenizer.bos_token_id:
-                            iids = iids[1:]
-
-                    # Add the tokens to the buffer
-                    buffer += iids
-                    while len(buffer) >= self.max_length:
-                        concat_sample = buffer[:self.max_length]
-                        buffer = buffer[self.
-                                        max_length:] if self.should_wrap else []
-                        yield {
-                            'tokens': np.asarray(concat_sample, dtype=np.int32),
-                        }
-
-                    first_chunk = False
+                try:
+                    for chunk in iter(partial(f.read, 1000000), ''):
+                        # Tokenize the chunk
+                        encoded = self.tokenizer(
+                            chunk,
+                            truncation=False,
+                            padding=False,
+                        )
+                        iids = encoded['input_ids']
+
+                        # If this is not the first chunk, remove the BOS token
+                        if not first_chunk:
+                            if iids[0] == self.tokenizer.bos_token_id:
+                                iids = iids[1:]
+
+                        # Add the tokens to the buffer
+                        buffer += iids
+                        while len(buffer) >= self.max_length:
+                            concat_sample = buffer[:self.max_length]
+                            buffer = buffer[self.max_length:
+                                           ] if self.should_wrap else []
+                            yield {
+                                'tokens':
+                                    np.asarray(concat_sample, dtype=np.int32),
+                            }
+
+                        first_chunk = False
+                except UnicodeDecodeError:
+                    raise CannotUnicodeDecodeFile(text_file=file)
 
                 # Add the EOS token to the buffer to separate files.
                 buffer += self.eos_tokens
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index b2e5cc06e8..206095f28b 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -348,6 +348,14 @@ def __init__(self, input_folder: str) -> None:
         super().__init__(message, input_folder=input_folder)
 
 
+class CannotUnicodeDecodeFile(UserError):
+    """Error thrown when a text file cannot be decoded with utf-8."""
+
+    def __init__(self, text_file: str) -> None:
+        message = f'Text file {text_file} contains chars that cannot be utf-8 decoded. Please remove or replace these chars.'
+        super().__init__(message, text_file=text_file)
+
+
 class OutputFolderNotEmptyError(UserError):
     """Error thrown when the output folder is not empty."""
 
diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
index 302a540217..d604565e59 100644
--- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -22,6 +22,7 @@
     write_done_file,
 )
 from llmfoundry.utils.exceptions import (
+    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -290,6 +291,28 @@ def test_dataset_too_small(tmp_path: pathlib.Path):
     )
 
 
+def test_decode_invalid_unicode(tmp_path: pathlib.Path):
+    input_folder = tmp_path / 'input'
+    os.makedirs(input_folder, exist_ok=True)
+    with open(input_folder / 'test.txt', 'w', encoding='utf-16') as f:
+        f.write('HELLO WORLD')
+    with pytest.raises(CannotUnicodeDecodeFile):
+        convert_text_to_mds(
+            tokenizer_name='mosaicml/mpt-7b',
+            output_folder=str(tmp_path / 'output'),
+            input_folder=str(input_folder),
+            concat_tokens=1,
+            eos_text='',
+            bos_text='',
+            no_wrap=False,
+            compression='zstd',
+            processes=1,
+            args_str='Namespace()',
+            reprocess=False,
+            trust_remote_code=False,
+        )
+
+
 def test_is_already_processed(tmp_path: pathlib.Path):
     tmp_path_str = str(tmp_path)
     args_str = 'Namespace(x = 5)'