Replace in case of unicode errors #1528

Closed · wants to merge 3 commits

This PR swaps strict UTF-8 decoding for `errors='replace'` when `convert_text_to_mds` reads input text files: bytes that cannot be decoded are substituted with U+FFFD instead of aborting the run with `CannotUnicodeDecodeFile`. The now-unreachable exception class is removed, and the regression test is updated to assert that conversion completes rather than raises.
llmfoundry/command_utils/data_prep/convert_text_to_mds.py (24 additions, 29 deletions)
```diff
@@ -29,7 +29,6 @@
     merge_shard_groups,
 )
 from llmfoundry.utils.exceptions import (
-    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -68,39 +67,35 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
         buffer = []
         for file in self.files:
             log.info(f'Processing file: {file}')
-            with open(file, 'r') as f:
+            with open(file, 'r', errors='replace') as f:
                 buffer += self.bos_tokens
                 first_chunk = True
                 # Read the file in 1MB chunks to avoid memory issues
-                try:
-                    for chunk in iter(partial(f.read, 1000000), ''):
-                        # Tokenize the chunk
-                        encoded = self.tokenizer(
-                            chunk,
-                            truncation=False,
-                            padding=False,
-                        )
-                        iids = encoded['input_ids']
+                for chunk in iter(partial(f.read, 1000000), ''):
+                    # Tokenize the chunk
+                    encoded = self.tokenizer(
+                        chunk,
+                        truncation=False,
+                        padding=False,
+                    )
+                    iids = encoded['input_ids']
 
-                        # If this is not the first chunk, remove the BOS token
-                        if not first_chunk:
-                            if iids[0] == self.tokenizer.bos_token_id:
-                                iids = iids[1:]
+                    # If this is not the first chunk, remove the BOS token
+                    if not first_chunk:
+                        if iids[0] == self.tokenizer.bos_token_id:
+                            iids = iids[1:]
 
-                        # Add the tokens to the buffer
-                        buffer += iids
-                        while len(buffer) >= self.max_length:
-                            concat_sample = buffer[:self.max_length]
-                            buffer = buffer[self.max_length:
-                                           ] if self.should_wrap else []
-                            yield {
-                                'tokens':
-                                    np.asarray(concat_sample, dtype=np.int32),
-                            }
-
-                        first_chunk = False
-                except UnicodeDecodeError:
-                    raise CannotUnicodeDecodeFile(text_file=file)
+                    # Add the tokens to the buffer
+                    buffer += iids
+                    while len(buffer) >= self.max_length:
+                        concat_sample = buffer[:self.max_length]
+                        buffer = buffer[self.
+                            max_length:] if self.should_wrap else []
+                        yield {
+                            'tokens': np.asarray(concat_sample, dtype=np.int32),
+                        }
+
+                    first_chunk = False
 
             # Add the EOS token to the buffer to separate files.
             buffer += self.eos_tokens
```
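The whole change hinges on one behavior of Python's text-mode I/O. Below is a minimal sketch of the before/after decoding semantics; the scratch file and the explicit `encoding='utf-8'` are illustrative assumptions (the PR itself opens files with the platform default encoding):

```python
# Sketch: what errors='replace' changes when reading a file containing
# bytes that are not valid UTF-8.
import tempfile

with tempfile.NamedTemporaryFile(suffix='.txt', delete=False) as tmp:
    tmp.write(b'hello \xff\xfe world')  # \xff and \xfe never occur in valid UTF-8
    path = tmp.name

# Pre-PR behavior: strict decoding raises partway through the read.
try:
    with open(path, 'r', encoding='utf-8') as f:
        f.read()
except UnicodeDecodeError as e:
    print('strict decode raises:', e.reason)  # 'invalid start byte'

# Post-PR behavior: invalid bytes decode to U+FFFD and the read succeeds.
with open(path, 'r', encoding='utf-8', errors='replace') as f:
    print(f.read())  # 'hello \ufffd\ufffd world'
```

One consequence worth noting: U+FFFD flows into the tokenizer like any other character, so mangled bytes now end up tokenized into the dataset rather than stopping the job. Also, because `f.read(1000000)` is a text-mode read, Python's incremental decoder already handles multi-byte characters that straddle the 1MB chunk boundary, so `errors='replace'` only fires on genuinely invalid bytes.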
llmfoundry/utils/exceptions.py (0 additions, 8 deletions)
```diff
@@ -348,14 +348,6 @@ def __init__(self, input_folder: str) -> None:
         super().__init__(message, input_folder=input_folder)
 
 
-class CannotUnicodeDecodeFile(UserError):
-    """Error thrown when the input folder is missing data."""
-
-    def __init__(self, text_file: str) -> None:
-        message = f'Text file {text_file} contains chars that cannot be utf-8 decoded. Please remove or replace these chars.'
-        super().__init__(message, text_file=text_file)
-
-
 class OutputFolderNotEmptyError(UserError):
     """Error thrown when the output folder is not empty."""
 
```
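With the exception class gone, `convert_text_to_mds` no longer signals that a file contained undecodable bytes. If a caller still wants that signal, it can be recovered by counting replacement characters after the fact. The helper below is a hypothetical sketch, not part of llm-foundry; the name, chunk size, and any threshold are assumptions:

```python
# Hypothetical helper: estimate how badly a file was mangled by counting
# the U+FFFD characters produced by errors='replace'.
def replacement_ratio(path: str, chunk_size: int = 1_000_000) -> float:
    total = bad = 0
    with open(path, 'r', encoding='utf-8', errors='replace') as f:
        for chunk in iter(lambda: f.read(chunk_size), ''):
            total += len(chunk)
            bad += chunk.count('\ufffd')
    return bad / total if total else 0.0
```

A caller could, for instance, log a warning before tokenizing any file whose ratio exceeds a chosen threshold. (A legitimate U+FFFD already present in the text is indistinguishable from one produced by the decoder.)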
tests/a_scripts/data_prep/test_convert_text_to_mds.py (3 additions, 2 deletions)
```diff
@@ -22,7 +22,6 @@
     write_done_file,
 )
 from llmfoundry.utils.exceptions import (
-    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -296,7 +295,7 @@ def test_decode_invalid_unicode(tmp_path: pathlib.Path):
     os.makedirs(input_folder, exist_ok=True)
     with open(input_folder / 'test.txt', 'w', encoding='utf-16') as f:
         f.write('HELLO WORLD')
-    with pytest.raises(CannotUnicodeDecodeFile):
+    try:
         convert_text_to_mds(
             tokenizer_name='mosaicml/mpt-7b',
             output_folder=str(tmp_path / 'output'),
@@ -311,6 +310,8 @@
         reprocess=False,
         trust_remote_code=False,
     )
+    except UnicodeDecodeError:
+        pytest.fail('UnicodeDecodeError raised')
 
 
 def test_is_already_processed(tmp_path: pathlib.Path):
```
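To see why the UTF-16 fixture exercises this path: 'HELLO WORLD' written with `encoding='utf-16'` begins with a byte-order mark that is never valid UTF-8. A small sketch (byte values shown for little-endian platforms, which emit the `b'\xff\xfe'` BOM):

```python
raw = 'HELLO WORLD'.encode('utf-16')
print(raw[:4])  # b'\xff\xfeH\x00' on little-endian machines

# Pre-PR: strict UTF-8 decoding fails on the BOM immediately.
try:
    raw.decode('utf-8')
except UnicodeDecodeError as e:
    print('strict:', e.reason)  # 'invalid start byte'

# Post-PR: the BOM bytes become U+FFFD and decoding continues, so
# convert_text_to_mds completes instead of raising.
print(raw.decode('utf-8', errors='replace')[:8])  # '\ufffd\ufffdH\x00E\x00L\x00'
```

Note that the rewritten test only fails explicitly on `UnicodeDecodeError`; any other exception from `convert_text_to_mds` still propagates and fails the test as an error.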