Merge branch 'main' into envlogger
josejg authored Aug 17, 2024
2 parents 370ceaf + ddccd12 commit 7cee757
Showing 3 changed files with 61 additions and 25 deletions.
55 changes: 30 additions & 25 deletions llmfoundry/command_utils/data_prep/convert_text_to_mds.py
@@ -29,6 +29,7 @@
     merge_shard_groups,
 )
 from llmfoundry.utils.exceptions import (
+    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -71,31 +72,35 @@ def __iter__(self) -> Iterable[dict[str, NDArray]]:
                 buffer += self.bos_tokens
                 first_chunk = True
                 # Read the file in 1MB chunks to avoid memory issues
-                for chunk in iter(partial(f.read, 1000000), ''):
-                    # Tokenize the chunk
-                    encoded = self.tokenizer(
-                        chunk,
-                        truncation=False,
-                        padding=False,
-                    )
-                    iids = encoded['input_ids']
-
-                    # If this is not the first chunk, remove the BOS token
-                    if not first_chunk:
-                        if iids[0] == self.tokenizer.bos_token_id:
-                            iids = iids[1:]
-
-                    # Add the tokens to the buffer
-                    buffer += iids
-                    while len(buffer) >= self.max_length:
-                        concat_sample = buffer[:self.max_length]
-                        buffer = buffer[self.
-                                        max_length:] if self.should_wrap else []
-                        yield {
-                            'tokens': np.asarray(concat_sample, dtype=np.int32),
-                        }
-
-                    first_chunk = False
+                try:
+                    for chunk in iter(partial(f.read, 1000000), ''):
+                        # Tokenize the chunk
+                        encoded = self.tokenizer(
+                            chunk,
+                            truncation=False,
+                            padding=False,
+                        )
+                        iids = encoded['input_ids']
+
+                        # If this is not the first chunk, remove the BOS token
+                        if not first_chunk:
+                            if iids[0] == self.tokenizer.bos_token_id:
+                                iids = iids[1:]
+
+                        # Add the tokens to the buffer
+                        buffer += iids
+                        while len(buffer) >= self.max_length:
+                            concat_sample = buffer[:self.max_length]
+                            buffer = buffer[self.max_length:
+                                           ] if self.should_wrap else []
+                            yield {
+                                'tokens':
+                                    np.asarray(concat_sample, dtype=np.int32),
+                            }
+
+                        first_chunk = False
+                except UnicodeDecodeError:
+                    raise CannotUnicodeDecodeFile(text_file=file)
 
                 # Add the EOS token to the buffer to separate files.
                 buffer += self.eos_tokens
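Note on the hunk above: opening a file in text mode decodes lazily, so a non-UTF-8 byte surfaces as a UnicodeDecodeError raised from f.read() partway through the loop; the new try/except converts that into the user-facing CannotUnicodeDecodeFile. A minimal standalone sketch of the failure mode, not llmfoundry code: the file name 'bad.txt' and its latin-1 content are illustrative, chosen only to produce a byte (0xE9) that is invalid as UTF-8.

from functools import partial

# Write a hypothetical non-UTF-8 file: latin-1 encodes 'é' as the lone byte 0xE9.
with open('bad.txt', 'w', encoding='latin-1') as f:
    f.write('caf\xe9')

# Read it back with the same 1MB chunked-read idiom as the loop above.
with open('bad.txt', 'r', encoding='utf-8') as f:
    try:
        for chunk in iter(partial(f.read, 1_000_000), ''):
            pass
    except UnicodeDecodeError as err:
        print(f'bad.txt is not valid UTF-8: {err}')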
8 changes: 8 additions & 0 deletions llmfoundry/utils/exceptions.py
@@ -348,6 +348,14 @@ def __init__(self, input_folder: str) -> None:
         super().__init__(message, input_folder=input_folder)
 
 
+class CannotUnicodeDecodeFile(UserError):
+    """Error thrown when a text file cannot be decoded as UTF-8."""
+
+    def __init__(self, text_file: str) -> None:
+        message = f'Text file {text_file} contains characters that cannot be UTF-8 decoded. Please remove or replace these characters.'
+        super().__init__(message, text_file=text_file)
+
+
 class OutputFolderNotEmptyError(UserError):
     """Error thrown when the output folder is not empty."""
 
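For completeness, a hedged sketch of how callers would encounter the new error class; the file path is made up, and the message format follows the f-string in the diff above.

from llmfoundry.utils.exceptions import CannotUnicodeDecodeFile

try:
    raise CannotUnicodeDecodeFile(text_file='corpus/shard_0.txt')  # hypothetical path
except CannotUnicodeDecodeFile as err:
    print(err)  # message names the offending file so users can fix or drop it

Because CannotUnicodeDecodeFile subclasses UserError, it can also be caught as UserError alongside the other input-validation errors in this module.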
23 changes: 23 additions & 0 deletions tests/a_scripts/data_prep/test_convert_text_to_mds.py
@@ -22,6 +22,7 @@
     write_done_file,
 )
 from llmfoundry.utils.exceptions import (
+    CannotUnicodeDecodeFile,
     DatasetTooSmallError,
     InputFolderMissingDataError,
     OutputFolderNotEmptyError,
@@ -290,6 +291,28 @@ def test_dataset_too_small(tmp_path: pathlib.Path):
     )
 
 
+def test_decode_invalid_unicode(tmp_path: pathlib.Path):
+    input_folder = tmp_path / 'input'
+    os.makedirs(input_folder, exist_ok=True)
+    with open(input_folder / 'test.txt', 'w', encoding='utf-16') as f:
+        f.write('HELLO WORLD')
+    with pytest.raises(CannotUnicodeDecodeFile):
+        convert_text_to_mds(
+            tokenizer_name='mosaicml/mpt-7b',
+            output_folder=str(tmp_path / 'output'),
+            input_folder=str(input_folder),
+            concat_tokens=1,
+            eos_text='',
+            bos_text='',
+            no_wrap=False,
+            compression='zstd',
+            processes=1,
+            args_str='Namespace()',
+            reprocess=False,
+            trust_remote_code=False,
+        )
+
+
 def test_is_already_processed(tmp_path: pathlib.Path):
     tmp_path_str = str(tmp_path)
     args_str = 'Namespace(x = 5)'
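Why writing the test file as UTF-16 reliably triggers the error, assuming the converter reads text as UTF-8: Python's str.encode('utf-16') prepends a byte-order mark, and 0xFF can never appear in well-formed UTF-8, so decoding fails on the very first byte. An illustrative, standalone check:

data = 'HELLO WORLD'.encode('utf-16')
print(data[:2])  # b'\xff\xfe' little-endian byte-order mark
try:
    data.decode('utf-8')
except UnicodeDecodeError as err:
    print(f'decode failed as the test expects: {err}')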
