From ddccd12b8e5bbca153826878e5c937c91c4a5ea8 Mon Sep 17 00:00:00 2001
From: Irene Dea <deaairene@gmail.com>
Date: Fri, 16 Aug 2024 13:38:55 -0700
Subject: [PATCH] Add user error for UnicodeDecodeError in convert text to mds
 (#1457)

---
 .../data_prep/convert_text_to_mds.py          | 55 ++++++++++---------
 llmfoundry/utils/exceptions.py                |  8 +++
 .../data_prep/test_convert_text_to_mds.py     | 23 ++++++++
 3 files changed, 61 insertions(+), 25 deletions(-)

diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
index f7c502a53e..7c40a7e698 100644
--- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
+++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -29,6 +29,7 @@
     merge_shard_groups,
 )
 from llmfoundry.utils.exceptions import (
+    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -71,31 +72,35 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
                 buffer += self.bos_tokens
                 first_chunk = True
                 # Read the file in 1MB chunks to avoid memory issues
-                for chunk in iter(partial(f.read, 1000000), ''):
-                    # Tokenize the chunk
-                    encoded = self.tokenizer(
-                        chunk,
-                        truncation=False,
-                        padding=False,
-                    )
-                    iids = encoded['input_ids']
-
-                    # If this is not the first chunk, remove the BOS token
-                    if not first_chunk:
-                        if iids[0] == self.tokenizer.bos_token_id:
-                            iids = iids[1:]
-
-                    # Add the tokens to the buffer
-                    buffer += iids
-                    while len(buffer) >= self.max_length:
-                        concat_sample = buffer[:self.max_length]
-                        buffer = buffer[self.
-                                        max_length:] if self.should_wrap else []
-                        yield {
-                            'tokens': np.asarray(concat_sample, dtype=np.int32),
-                        }
-
-                    first_chunk = False
+                try:
+                    for chunk in iter(partial(f.read, 1000000), ''):
+                        # Tokenize the chunk
+                        encoded = self.tokenizer(
+                            chunk,
+                            truncation=False,
+                            padding=False,
+                        )
+                        iids = encoded['input_ids']
+
+                        # If this is not the first chunk, remove the BOS token
+                        if not first_chunk:
+                            if iids[0] == self.tokenizer.bos_token_id:
+                                iids = iids[1:]
+
+                        # Add the tokens to the buffer
+                        buffer += iids
+                        while len(buffer) >= self.max_length:
+                            concat_sample = buffer[:self.max_length]
+                            buffer = buffer[self.max_length:
+                                           ] if self.should_wrap else []
+                            yield {
+                                'tokens':
+                                    np.asarray(concat_sample, dtype=np.int32),
+                            }
+
+                        first_chunk = False
+                except UnicodeDecodeError:
+                    raise CannotUnicodeDecodeFile(text_file=file)
 
                 # Add the EOS token to the buffer to separate files.
                 buffer += self.eos_tokens
diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py
index b2e5cc06e8..206095f28b 100644
--- a/llmfoundry/utils/exceptions.py
+++ b/llmfoundry/utils/exceptions.py
@@ -348,6 +348,14 @@ def __init__(self, input_folder: str) -> None:
         super().__init__(message, input_folder=input_folder)
 
 
+class CannotUnicodeDecodeFile(UserError):
+    """Error thrown when a text file cannot be decoded as UTF-8."""
+
+    def __init__(self, text_file: str) -> None:
+        message = f'Text file {text_file} contains chars that cannot be utf-8 decoded. Please remove or replace these chars.'
+        super().__init__(message, text_file=text_file)
+
+
 class OutputFolderNotEmptyError(UserError):
     """Error thrown when the output folder is not empty."""
 
diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
index 302a540217..d604565e59 100644
--- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py
+++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -22,6 +22,7 @@
     write_done_file,
 )
 from llmfoundry.utils.exceptions import (
+    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -290,6 +291,28 @@ def test_dataset_too_small(tmp_path: pathlib.Path):
         )
 
 
+def test_decode_invalid_unicode(tmp_path: pathlib.Path):
+    input_folder = tmp_path / 'input'
+    os.makedirs(input_folder, exist_ok=True)
+    with open(input_folder / 'test.txt', 'w', encoding='utf-16') as f:
+        f.write('HELLO WORLD')
+    with pytest.raises(CannotUnicodeDecodeFile):
+        convert_text_to_mds(
+            tokenizer_name='mosaicml/mpt-7b',
+            output_folder=str(tmp_path / 'output'),
+            input_folder=str(input_folder),
+            concat_tokens=1,
+            eos_text='',
+            bos_text='',
+            no_wrap=False,
+            compression='zstd',
+            processes=1,
+            args_str='Namespace()',
+            reprocess=False,
+            trust_remote_code=False,
+        )
+
+
 def test_is_already_processed(tmp_path: pathlib.Path):
     tmp_path_str = str(tmp_path)
     args_str = 'Namespace(x = 5)'