Merge branch 'main' into dbfs-hf
KuuCi authored Jun 14, 2024
2 parents 99ed590 + 1a2fac0 commit 35d2aaa
Showing 33 changed files with 959 additions and 560 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/docker.yaml
@@ -23,6 +23,12 @@ jobs:
  - name: "2.3.0_cu121_flash2_aws"
    base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws
    dep_groups: "[gpu-flash2]"
+ - name: "2.3.1_cu121"
+   base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+   dep_groups: "[gpu]"
+ - name: "2.3.1_cu121_aws"
+   base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+   dep_groups: "[gpu]"
steps:
  - name: Maximize Build Space on Worker
    uses: easimon/maximize-build-space@v4
4 changes: 4 additions & 0 deletions .github/workflows/pr-cpu.yaml
@@ -23,6 +23,10 @@ jobs:
    container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04
    markers: "not gpu"
    pytest_command: "coverage run -m pytest"
+ - name: "cpu-2.3.1"
+   container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+   markers: "not gpu"
+   pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
5 changes: 5 additions & 0 deletions .github/workflows/pr-gpu.yaml
@@ -24,6 +24,11 @@ jobs:
    markers: "gpu"
    pytest_command: "coverage run -m pytest"
    pip_deps: "[all]"
+ - name: "gpu-2.3.1"
+   container: mosaicml/llm-foundry:2.3.1_cu121_flash2-latest
+   markers: "gpu"
+   pytest_command: "coverage run -m pytest"
+   pip_deps: "[all]"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
2 changes: 1 addition & 1 deletion Dockerfile
@@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py
RUN rm setup.py

# Install TransformerEngine
-RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c
+RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
2 changes: 1 addition & 1 deletion README.md
@@ -230,7 +230,7 @@ python data_prep/convert_dataset_hf.py \
# Train an MPT-125m model for 10 batches
composer train/train.py \
train/yamls/pretrain/mpt-125m.yaml \
-data_local=my-copy-c4 \
+variables.data_local=my-copy-c4 \
train_loader.dataset.split=train_small \
eval_loader.dataset.split=val_small \
max_duration=10ba \
2 changes: 1 addition & 1 deletion llmfoundry/__init__.py
@@ -71,4 +71,4 @@
    'utils',
]

-__version__ = '0.9.0.dev0'
+__version__ = '0.10.0.dev0'
9 changes: 8 additions & 1 deletion llmfoundry/data/__init__.py
@@ -1,7 +1,12 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

-from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
+from llmfoundry.data.data import (
+    SUPPORTED_MDS_ENCODING_TYPES,
+    ConcatTokensDataset,
+    NoConcatDataset,
+    stream_remote_local_validate,
+)
from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.data.finetuning import (
Seq2SeqFinetuningCollator,
@@ -55,4 +60,6 @@
    'auto_packing_ratio',
    'profile_packing',
    'ConcatenatedSequenceCollatorWrapper',
+    'stream_remote_local_validate',
+    'SUPPORTED_MDS_ENCODING_TYPES',
]
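Both new names are re-exported from the package root, so downstream code can consume them directly. A minimal sketch of the import path (the calls below are illustrative, not from this commit):

```
from llmfoundry.data import (
    SUPPORTED_MDS_ENCODING_TYPES,
    stream_remote_local_validate,
)

# Guard a dtype choice against the supported MDS token encodings.
assert 'int32' in SUPPORTED_MDS_ENCODING_TYPES

# No-op when there is nothing to check (remote and local both unset).
stream_remote_local_validate(remote=None, local=None, split=None)
```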
50 changes: 43 additions & 7 deletions llmfoundry/data/data.py
@@ -5,16 +5,31 @@
import os
import warnings
from abc import ABC, abstractmethod
-from typing import Dict, Iterable, Union
+from typing import Dict, Iterable, Optional, Union

import datasets as hf_datasets
import numpy as np
+from numpy.typing import NDArray
from torch.utils.data import IterableDataset
from transformers import PreTrainedTokenizerBase

+__all__ = [
+    'AbstractConcatTokensDataset',
+    'ConcatTokensDataset',
+    'NoConcatDataset',
+    'stream_remote_local_validate',
+    'SUPPORTED_MDS_ENCODING_TYPES',
+]
+
+SUPPORTED_MDS_ENCODING_TYPES = [
+    'int8',
+    'int16',
+    'int32',
+    'int64',
+    'uint8',
+    'uint16',
+    'uint32',
+    'uint64',
+]


@@ -97,14 +112,14 @@ def __init__(
        )

    @abstractmethod
-    def __iter__(self) -> Iterable[Dict[str, bytes]]:
+    def __iter__(self) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]:
        pass


class ConcatTokensDataset(AbstractConcatTokensDataset):
    """An IterableDataset that returns token samples for MDSWriter.

-    Returns dicts of {'tokens': bytes}
+    Returns dicts of {'tokens': ndarray:int32}

    To use data created by this class and written to MDS format:
@@ -119,7 +134,7 @@ class ConcatTokensDataset(AbstractConcatTokensDataset):
    # note, you need to copy the numpy array because the original is non-writeable
    # and torch does not support non-writeable tensors, so you get a scary warning and
    # if you do try to write to the tensor you get undefined behavior
-    tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int64).copy())
+    tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int32).copy())
    print(tokenizer.decode(tokens))
    ```
"""
@@ -136,7 +151,7 @@ def __init__(
        self.hf_dataset = hf_dataset
        super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap)

-    def __iter__(self) -> Iterable[Dict[str, bytes]]:
+    def __iter__(self) -> Iterable[Dict[str, NDArray]]:
        buffer = []
        for sample in self.hf_dataset:
            encoded = self.tokenizer(
@@ -150,6 +165,27 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]:
                concat_sample = buffer[:self.max_length]
                buffer = buffer[self.max_length:] if self.should_wrap else []
                yield {
-                    # convert to bytes to store in MDS binary format
-                    'tokens': np.asarray(concat_sample).tobytes(),
+                    # convert to ndarray to store in MDS format
+                    'tokens': np.asarray(concat_sample, dtype=np.int32),
                }


+def stream_remote_local_validate(
+    remote: Optional[str],
+    local: Optional[str],
+    split: Optional[str],
+):
+    """Check that, if needed, the local/split directory exists.
+
+    Args:
+        remote (Optional[str]): Remote path to the dataset.
+        local (Optional[str]): Local path to the dataset.
+        split (Optional[str]): Subdirectory specifying which dataset split to use, if any.
+    """
+    if remote is None or (local == remote):
+        if local is not None and os.path.isdir(local):
+            contents = set(os.listdir(local))
+            if split is not None and split not in contents:
+                raise ValueError(
+                    f'Local directory {local} does not contain split {split}',
+                )
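With this change, `ConcatTokensDataset` yields typed int32 ndarrays rather than raw bytes. A minimal sketch of the resulting write/read round trip, assuming the `streaming` package; the output directory and token values below are placeholders:

```
import numpy as np
from streaming import MDSWriter, StreamingDataset

# Tokens are now stored as typed ndarrays instead of raw bytes.
columns = {'tokens': 'ndarray:int32'}

with MDSWriter(out='/tmp/mds-out', columns=columns) as writer:
    writer.write({'tokens': np.asarray([101, 2023, 2003, 102], dtype=np.int32)})

ds = StreamingDataset(local='/tmp/mds-out', shuffle=False)
print(ds[0]['tokens'].dtype)  # int32; no np.frombuffer round trip needed
```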
4 changes: 2 additions & 2 deletions llmfoundry/data/dataloader.py
@@ -3,7 +3,7 @@

"""Dataloader builder utilities."""

-from typing import Any, Dict
+from typing import Any, Dict, Union

from composer import DataSpec
from transformers import PreTrainedTokenizerBase
@@ -19,7 +19,7 @@
def build_dataloader(
    cfg: Dict[str, Any],
    tokenizer: PreTrainedTokenizerBase,
-    device_batch_size: int,
+    device_batch_size: Union[int, float],
) -> DataSpec:
    """Builds a dataloader from a config.
36 changes: 18 additions & 18 deletions llmfoundry/data/finetuning/tasks.py
@@ -59,6 +59,10 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
from streaming import Stream, StreamingDataset
from transformers import PreTrainedTokenizerBase

+from llmfoundry.data import (
+    SUPPORTED_MDS_ENCODING_TYPES,
+    stream_remote_local_validate,
+)
from llmfoundry.data.finetuning.collator import (
    _HF_IGNORE_INDEX,
    stitch_turns_decoder_only,
@@ -494,26 +498,15 @@ def is_valid_ift_example(
    return True


-def _stream_remote_local_validate(
-    remote: Optional[str],
-    local: Optional[str],
-    split: Optional[str],
-):
-    if remote is None or (local == remote):
-        if local is not None and os.path.isdir(local):
-            contents = set(os.listdir(local))
-            if split is not None and split not in contents:
-                raise ValueError(
-                    f'Local directory {local} does not contain split {split}',
-                )
-
-
class StreamingFinetuningDataset(StreamingDataset):
    """Finetuning dataset with flexible tokenization using StreamingDataset.

    Args:
        tokenizer (Tokenizer): The name of the HuggingFace tokenizer to use to
            tokenize samples.
+        token_encoding_type (str): The encoding type of the tokenized samples. This is only used
+            for legacy datasets that have been written directly as 'bytes' instead of numpy
+            arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'.
        streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from,
            which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or
            ``remote``/``local``. Defaults to ``None``.
@@ -574,6 +567,7 @@ class StreamingFinetuningDataset(StreamingDataset):
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
+        token_encoding_type: str = 'int64',
        streams: Optional[Sequence[Stream]] = None,
        local: Optional[str] = None,
        remote: Optional[str] = None,
@@ -606,11 +600,17 @@ def __init__(
                f'StreamingFinetuningDataset() got an unexpected keyword argument: {kwargs}',
            )

+        if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES:
+            raise ValueError(
+                f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}',
+            )
+        self.token_encoding_type = token_encoding_type
+
        if streams is None:
-            _stream_remote_local_validate(remote, local, split)
+            stream_remote_local_validate(remote, local, split)
        else:
            for stream in streams:
-                _stream_remote_local_validate(
+                stream_remote_local_validate(
                    stream.remote,
                    stream.local,
                    split,
@@ -656,11 +656,11 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
        if isinstance(sample['input_ids'], bytes):
            sample['input_ids'] = np.frombuffer(
                sample['input_ids'],
-                dtype=np.int64,
+                dtype=getattr(np, self.token_encoding_type),
            )[:self.max_seq_len].tolist().copy()
            sample['labels'] = np.frombuffer(
                sample['labels'],
-                dtype=np.int64,
+                dtype=getattr(np, self.token_encoding_type),
            )[:self.max_seq_len].tolist().copy()
        elif isinstance(sample['input_ids'], np.ndarray):
            sample['input_ids'] = sample['input_ids'][:self.max_seq_len
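A hypothetical construction showing where the new `token_encoding_type` argument fits: a legacy finetuning dataset whose `input_ids`/`labels` were serialized as raw bytes with a 32-bit dtype can now be decoded correctly. The tokenizer name and local path below are placeholders:

```
from transformers import AutoTokenizer

from llmfoundry.data.finetuning.tasks import StreamingFinetuningDataset

tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b')
dataset = StreamingFinetuningDataset(
    tokenizer=tokenizer,
    local='/tmp/legacy-ift-data',  # placeholder path
    split='train',
    max_seq_len=2048,
    token_encoding_type='int32',  # legacy bytes were written as int32
)
```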
47 changes: 35 additions & 12 deletions llmfoundry/data/text_data.py
@@ -4,7 +4,6 @@
"""Build a StreamingTextDataset dataset and dataloader for training."""

import inspect
import os
from itertools import islice
from typing import (
Any,
@@ -25,6 +24,10 @@
from transformers import PreTrainedTokenizerBase

from llmfoundry import registry
+from llmfoundry.data import (
+    SUPPORTED_MDS_ENCODING_TYPES,
+    stream_remote_local_validate,
+)
from llmfoundry.utils.registry_utils import construct_from_registry

__all__ = [
@@ -41,6 +44,9 @@ class StreamingTextDataset(StreamingDataset):
        tokenizer (Tokenizer): HuggingFace tokenizer to
            tokenize samples.
        max_seq_len (int): The max sequence length of each sample.
+        token_encoding_type (str): The encoding type of the tokenized samples. This is only used
+            for legacy datasets that have been written directly as 'bytes' instead of numpy
+            arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'.
        streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from,
            which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or
            ``remote``/``local``. Defaults to ``None``.
@@ -106,6 +112,7 @@ def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_seq_len: int,
+        token_encoding_type: str = 'int64',
        streams: Optional[Sequence[Stream]] = None,
        remote: Optional[str] = None,
        local: Optional[str] = None,
@@ -137,13 +144,21 @@ def __init__(
            f'StreamingTextDataset() got an unexpected keyword argument: {kwargs}',
        )

-        if local is not None and (remote is None or (local == remote)):
-            if os.path.isdir(local):
-                contents = set(os.listdir(local))
-                if split not in contents:
-                    raise ValueError(
-                        f'local directory {local} does not contain split {split}',
-                    )
+        if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES:
+            raise ValueError(
+                f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}',
+            )
+        self.token_encoding_type = token_encoding_type
+
+        if streams is None:
+            stream_remote_local_validate(remote, local, split)
+        else:
+            for stream in streams:
+                stream_remote_local_validate(
+                    stream.remote,
+                    stream.local,
+                    split,
+                )

# TODO: discover where yamls are being converted incorrect, but temporary workaround
if isinstance(shuffle_block_size, float):
@@ -197,10 +212,18 @@ def _read_binary_tokenized_sample(
        self,
        sample: Dict[str, Any],
    ) -> torch.Tensor:
-        return torch.from_numpy(
-            np.frombuffer(sample['tokens'],
-                          dtype=np.int64)[:self.max_seq_len].copy(),
-        )
+        # Modeling code still expects int64 tensors.
+        if isinstance(sample['tokens'], np.ndarray):
+            return torch.from_numpy(
+                sample['tokens'][:self.max_seq_len].copy(),
+            ).to(torch.int64)
+        else:
+            return torch.from_numpy(
+                np.frombuffer(
+                    sample['tokens'],
+                    dtype=getattr(np, self.token_encoding_type),
+                )[:self.max_seq_len].copy(),
+            ).to(torch.int64)

    # How to process a sample
    def __getitem__(self,
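The two decoding paths above can be exercised in isolation. A self-contained sketch that restates the logic of `_read_binary_tokenized_sample` as a free function, for illustration only:

```
import numpy as np
import torch

def read_tokens(
    tokens,
    max_seq_len: int,
    token_encoding_type: str = 'int64',
) -> torch.Tensor:
    # New-style samples arrive as typed ndarrays: truncate, copy, upcast.
    if isinstance(tokens, np.ndarray):
        return torch.from_numpy(tokens[:max_seq_len].copy()).to(torch.int64)
    # Legacy samples are raw bytes: decode with the configured dtype first.
    arr = np.frombuffer(tokens, dtype=getattr(np, token_encoding_type))
    return torch.from_numpy(arr[:max_seq_len].copy()).to(torch.int64)

legacy = np.asarray([1, 2, 3], dtype=np.int32).tobytes()
print(read_tokens(legacy, 2, 'int32'))  # tensor([1, 2])
```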
8 changes: 0 additions & 8 deletions llmfoundry/data/utils.py
@@ -26,14 +26,6 @@ def _validate_cfg(
    eos_token_id = dataset_cfg.get('eos_token_id', None)
    bos_token_id = dataset_cfg.get('bos_token_id', None)

-    if eos_token_id is None and bos_token_id is None and (
-        hasattr(tokenizer, 'eos_token_id') or
-        hasattr(tokenizer, 'bos_token_id')
-    ):
-        log.warning(
-            'The user has not provided an eos_token_id or bos_token_id, but the tokenizer has an eos_token_id or a bos_token_id.',
-        )
-
    tokenizer_eos_token_id = getattr(tokenizer, 'eos_token_id', None)
    if eos_token_id is not None and eos_token_id != tokenizer_eos_token_id:
        eos_mismatch_str = f'Provided {eos_token_id=} does not match the eos_token_id of the tokenizer={tokenizer_eos_token_id}.'
(Diffs for the remaining changed files are not shown here.)