Merge branch 'main' into dbfs-hf
KuuCi authored Jun 14, 2024
2 parents 99ed590 + 1a2fac0 commit 35d2aaa
Showing 33 changed files with 959 additions and 560 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/docker.yaml
@@ -23,6 +23,12 @@ jobs:
  - name: "2.3.0_cu121_flash2_aws"
    base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws
    dep_groups: "[gpu-flash2]"
+ - name: "2.3.1_cu121"
+   base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
+   dep_groups: "[gpu]"
+ - name: "2.3.1_cu121_aws"
+   base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws
+   dep_groups: "[gpu]"
steps:
  - name: Maximize Build Space on Worker
    uses: easimon/maximize-build-space@v4
4 changes: 4 additions & 0 deletions .github/workflows/pr-cpu.yaml
@@ -23,6 +23,10 @@ jobs:
    container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04
    markers: "not gpu"
    pytest_command: "coverage run -m pytest"
+ - name: "cpu-2.3.1"
+   container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
+   markers: "not gpu"
+   pytest_command: "coverage run -m pytest"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
5 changes: 5 additions & 0 deletions .github/workflows/pr-gpu.yaml
@@ -24,6 +24,11 @@ jobs:
    markers: "gpu"
    pytest_command: "coverage run -m pytest"
    pip_deps: "[all]"
+ - name: "gpu-2.3.1"
+   container: mosaicml/llm-foundry:2.3.1_cu121_flash2-latest
+   markers: "gpu"
+   pytest_command: "coverage run -m pytest"
+   pip_deps: "[all]"
name: ${{ matrix.name }}
if: github.repository_owner == 'mosaicml'
with:
2 changes: 1 addition & 1 deletion Dockerfile
@@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py
RUN rm setup.py

# Install TransformerEngine
-RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c
+RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335

# Install and uninstall foundry to cache foundry requirements
RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git
2 changes: 1 addition & 1 deletion README.md
@@ -230,7 +230,7 @@ python data_prep/convert_dataset_hf.py \
# Train an MPT-125m model for 10 batches
composer train/train.py \
train/yamls/pretrain/mpt-125m.yaml \
-data_local=my-copy-c4 \
+variables.data_local=my-copy-c4 \
train_loader.dataset.split=train_small \
eval_loader.dataset.split=val_small \
max_duration=10ba \
2 changes: 1 addition & 1 deletion llmfoundry/__init__.py
@@ -71,4 +71,4 @@
    'utils',
]

-__version__ = '0.9.0.dev0'
+__version__ = '0.10.0.dev0'
9 changes: 8 additions & 1 deletion llmfoundry/data/__init__.py
@@ -1,7 +1,12 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

-from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset
+from llmfoundry.data.data import (
+    SUPPORTED_MDS_ENCODING_TYPES,
+    ConcatTokensDataset,
+    NoConcatDataset,
+    stream_remote_local_validate,
+)
from llmfoundry.data.dataloader import build_dataloader
from llmfoundry.data.finetuning import (
Seq2SeqFinetuningCollator,
@@ -55,4 +60,6 @@
    'auto_packing_ratio',
    'profile_packing',
    'ConcatenatedSequenceCollatorWrapper',
+    'stream_remote_local_validate',
+    'SUPPORTED_MDS_ENCODING_TYPES',
]
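Both new names are re-exported from the package root, so downstream code can consume them directly. A minimal sketch of the import path (the calls below are illustrative, not from this commit):

```
from llmfoundry.data import (
    SUPPORTED_MDS_ENCODING_TYPES,
    stream_remote_local_validate,
)

# Guard a dtype choice against the supported MDS token encodings.
assert 'int32' in SUPPORTED_MDS_ENCODING_TYPES

# No-op when there is nothing to check (remote and local both unset).
stream_remote_local_validate(remote=None, local=None, split=None)
```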
50 changes: 43 additions & 7 deletions llmfoundry/data/data.py
@@ -5,16 +5,31 @@
import os
import warnings
from abc import ABC, abstractmethod
-from typing import Dict, Iterable, Union
+from typing import Dict, Iterable, Optional, Union

import datasets as hf_datasets
import numpy as np
+from numpy.typing import NDArray
from torch.utils.data import IterableDataset
from transformers import PreTrainedTokenizerBase

+__all__ = [
+    'AbstractConcatTokensDataset',
+    'ConcatTokensDataset',
+    'NoConcatDataset',
+    'stream_remote_local_validate',
+    'SUPPORTED_MDS_ENCODING_TYPES',
+]
+
+SUPPORTED_MDS_ENCODING_TYPES = [
+    'int8',
+    'int16',
+    'int32',
+    'int64',
+    'uint8',
+    'uint16',
+    'uint32',
+    'uint64',
+]


@@ -97,14 +112,14 @@ def __init__(
        )

    @abstractmethod
-    def __iter__(self) -> Iterable[Dict[str, bytes]]:
+    def __iter__(self) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]:
        pass


class ConcatTokensDataset(AbstractConcatTokensDataset):
    """An IterableDataset that returns token samples for MDSWriter.

-    Returns dicts of {'tokens': bytes}
+    Returns dicts of {'tokens': ndarray:int32}

    To use data created by this class and written to MDS format:
@@ -119,7 +134,7 @@ class ConcatTokensDataset(AbstractConcatTokensDataset):
    # note, you need to copy the numpy array because the original is non-writeable
    # and torch does not support non-writeable tensors, so you get a scary warning and
    # if you do try to write to the tensor you get undefined behavior
-    tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int64).copy())
+    tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int32).copy())
    print(tokenizer.decode(tokens))
    ```
"""
@@ -136,7 +151,7 @@ def __init__(
        self.hf_dataset = hf_dataset
        super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap)

-    def __iter__(self) -> Iterable[Dict[str, bytes]]:
+    def __iter__(self) -> Iterable[Dict[str, NDArray]]:
        buffer = []
        for sample in self.hf_dataset:
            encoded = self.tokenizer(
@@ -150,6 +165,27 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]:
                concat_sample = buffer[:self.max_length]
                buffer = buffer[self.max_length:] if self.should_wrap else []
                yield {
-                    # convert to bytes to store in MDS binary format
-                    'tokens': np.asarray(concat_sample).tobytes(),
+                    # convert to ndarray to store in MDS format
+                    'tokens': np.asarray(concat_sample, dtype=np.int32),
                }


+def stream_remote_local_validate(
+    remote: Optional[str],
+    local: Optional[str],
+    split: Optional[str],
+):
+    """Check that, if needed, the local/split directory exists.
+
+    Args:
+        remote (Optional[str]): Remote path to the dataset.
+        local (Optional[str]): Local path to the dataset.
+        split (Optional[str]): Subdirectory specifying which dataset split to use, if any.
+    """
+    if remote is None or (local == remote):
+        if local is not None and os.path.isdir(local):
+            contents = set(os.listdir(local))
+            if split is not None and split not in contents:
+                raise ValueError(
+                    f'Local directory {local} does not contain split {split}',
+                )
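With this change, `ConcatTokensDataset` yields typed int32 ndarrays rather than raw bytes. A minimal sketch of the resulting write/read round trip, assuming the `streaming` package; the output directory and token values below are placeholders:

```
import numpy as np
from streaming import MDSWriter, StreamingDataset

# Tokens are now stored as typed ndarrays instead of raw bytes.
columns = {'tokens': 'ndarray:int32'}

with MDSWriter(out='/tmp/mds-out', columns=columns) as writer:
    writer.write({'tokens': np.asarray([101, 2023, 2003, 102], dtype=np.int32)})

ds = StreamingDataset(local='/tmp/mds-out', shuffle=False)
print(ds[0]['tokens'].dtype)  # int32; no np.frombuffer round trip needed
```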
4 changes: 2 additions & 2 deletions llmfoundry/data/dataloader.py
@@ -3,7 +3,7 @@

"""Dataloader builder utilities."""

-from typing import Any, Dict
+from typing import Any, Dict, Union

from composer import DataSpec
from transformers import PreTrainedTokenizerBase
@@ -19,7 +19,7 @@
def build_dataloader(
    cfg: Dict[str, Any],
    tokenizer: PreTrainedTokenizerBase,
-    device_batch_size: int,
+    device_batch_size: Union[int, float],
) -> DataSpec:
    """Builds a dataloader from a config.
36 changes: 18 additions & 18 deletions llmfoundry/data/finetuning/tasks.py
@@ -59,6 +59,10 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]:
from streaming import Stream, StreamingDataset
from transformers import PreTrainedTokenizerBase

+from llmfoundry.data import (
+    SUPPORTED_MDS_ENCODING_TYPES,
+    stream_remote_local_validate,
+)
from llmfoundry.data.finetuning.collator import (
    _HF_IGNORE_INDEX,
    stitch_turns_decoder_only,
@@ -494,26 +498,15 @@ def is_valid_ift_example(
    return True


-def _stream_remote_local_validate(
-    remote: Optional[str],
-    local: Optional[str],
-    split: Optional[str],
-):
-    if remote is None or (local == remote):
-        if local is not None and os.path.isdir(local):
-            contents = set(os.listdir(local))
-            if split is not None and split not in contents:
-                raise ValueError(
-                    f'Local directory {local} does not contain split {split}',
-                )
-
-
class StreamingFinetuningDataset(StreamingDataset):
    """Finetuning dataset with flexible tokenization using StreamingDataset.

    Args:
        tokenizer (Tokenizer): The name of the HuggingFace tokenizer to use to
            tokenize samples.
+        token_encoding_type (str): The encoding type of the tokenized samples. This is only used
+            for legacy datasets that have been written directly as 'bytes' instead of numpy
+            arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'.
        streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from,
            which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or
            ``remote``/``local``. Defaults to ``None``.
@@ -574,6 +567,7 @@ class StreamingFinetuningDataset(StreamingDataset):
    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
+        token_encoding_type: str = 'int64',
        streams: Optional[Sequence[Stream]] = None,
        local: Optional[str] = None,
        remote: Optional[str] = None,
@@ -606,11 +600,17 @@ def __init__(
                f'StreamingFinetuningDataset() got an unexpected keyword argument: {kwargs}',
            )

+        if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES:
+            raise ValueError(
+                f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}',
+            )
+        self.token_encoding_type = token_encoding_type
+
        if streams is None:
-            _stream_remote_local_validate(remote, local, split)
+            stream_remote_local_validate(remote, local, split)
        else:
            for stream in streams:
-                _stream_remote_local_validate(
+                stream_remote_local_validate(
                    stream.remote,
                    stream.local,
                    split,
@@ -656,11 +656,11 @@ def __getitem__(self, idx: int) -> Dict[str, Any]:
        if isinstance(sample['input_ids'], bytes):
            sample['input_ids'] = np.frombuffer(
                sample['input_ids'],
-                dtype=np.int64,
+                dtype=getattr(np, self.token_encoding_type),
            )[:self.max_seq_len].tolist().copy()
            sample['labels'] = np.frombuffer(
                sample['labels'],
-                dtype=np.int64,
+                dtype=getattr(np, self.token_encoding_type),
            )[:self.max_seq_len].tolist().copy()
        elif isinstance(sample['input_ids'], np.ndarray):
            sample['input_ids'] = sample['input_ids'][:self.max_seq_len
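A hypothetical construction showing where the new `token_encoding_type` argument fits: a legacy finetuning dataset whose `input_ids`/`labels` were serialized as raw bytes with a 32-bit dtype can now be decoded correctly. The tokenizer name and local path below are placeholders:

```
from transformers import AutoTokenizer

from llmfoundry.data.finetuning.tasks import StreamingFinetuningDataset

tokenizer = AutoTokenizer.from_pretrained('mosaicml/mpt-7b')
dataset = StreamingFinetuningDataset(
    tokenizer=tokenizer,
    local='/tmp/legacy-ift-data',  # placeholder path
    split='train',
    max_seq_len=2048,
    token_encoding_type='int32',  # legacy bytes were written as int32
)
```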
47 changes: 35 additions & 12 deletions llmfoundry/data/text_data.py
@@ -4,7 +4,6 @@
"""Build a StreamingTextDataset dataset and dataloader for training."""

import inspect
import os
from itertools import islice
from typing import (
Any,
@@ -25,6 +24,10 @@
from transformers import PreTrainedTokenizerBase

from llmfoundry import registry
+from llmfoundry.data import (
+    SUPPORTED_MDS_ENCODING_TYPES,
+    stream_remote_local_validate,
+)
from llmfoundry.utils.registry_utils import construct_from_registry

__all__ = [
@@ -41,6 +44,9 @@ class StreamingTextDataset(StreamingDataset):
        tokenizer (Tokenizer): HuggingFace tokenizer to
            tokenize samples.
        max_seq_len (int): The max sequence length of each sample.
+        token_encoding_type (str): The encoding type of the tokenized samples. This is only used
+            for legacy datasets that have been written directly as 'bytes' instead of numpy
+            arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'.
        streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from,
            which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or
            ``remote``/``local``. Defaults to ``None``.
@@ -106,6 +112,7 @@ def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_seq_len: int,
+        token_encoding_type: str = 'int64',
        streams: Optional[Sequence[Stream]] = None,
        remote: Optional[str] = None,
        local: Optional[str] = None,
@@ -137,13 +144,21 @@ def __init__(
            f'StreamingTextDataset() got an unexpected keyword argument: {kwargs}',
        )

-        if local is not None and (remote is None or (local == remote)):
-            if os.path.isdir(local):
-                contents = set(os.listdir(local))
-                if split not in contents:
-                    raise ValueError(
-                        f'local directory {local} does not contain split {split}',
-                    )
+        if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES:
+            raise ValueError(
+                f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}',
+            )
+        self.token_encoding_type = token_encoding_type
+
+        if streams is None:
+            stream_remote_local_validate(remote, local, split)
+        else:
+            for stream in streams:
+                stream_remote_local_validate(
+                    stream.remote,
+                    stream.local,
+                    split,
+                )

# TODO: discover where yamls are being converted incorrect, but temporary workaround
if isinstance(shuffle_block_size, float):
@@ -197,10 +212,18 @@ def _read_binary_tokenized_sample(
        self,
        sample: Dict[str, Any],
    ) -> torch.Tensor:
-        return torch.from_numpy(
-            np.frombuffer(sample['tokens'],
-                          dtype=np.int64)[:self.max_seq_len].copy(),
-        )
+        # Modeling code still expects int64 tensors.
+        if isinstance(sample['tokens'], np.ndarray):
+            return torch.from_numpy(
+                sample['tokens'][:self.max_seq_len].copy(),
+            ).to(torch.int64)
+        else:
+            return torch.from_numpy(
+                np.frombuffer(
+                    sample['tokens'],
+                    dtype=getattr(np, self.token_encoding_type),
+                )[:self.max_seq_len].copy(),
+            ).to(torch.int64)

    # How to process a sample
    def __getitem__(self,
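The two decoding paths above can be exercised in isolation. A self-contained sketch that restates the logic of `_read_binary_tokenized_sample` as a free function, for illustration only:

```
import numpy as np
import torch

def read_tokens(
    tokens,
    max_seq_len: int,
    token_encoding_type: str = 'int64',
) -> torch.Tensor:
    # New-style samples arrive as typed ndarrays: truncate, copy, upcast.
    if isinstance(tokens, np.ndarray):
        return torch.from_numpy(tokens[:max_seq_len].copy()).to(torch.int64)
    # Legacy samples are raw bytes: decode with the configured dtype first.
    arr = np.frombuffer(tokens, dtype=getattr(np, token_encoding_type))
    return torch.from_numpy(arr[:max_seq_len].copy()).to(torch.int64)

legacy = np.asarray([1, 2, 3], dtype=np.int32).tobytes()
print(read_tokens(legacy, 2, 'int32'))  # tensor([1, 2])
```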
8 changes: 0 additions & 8 deletions llmfoundry/data/utils.py
@@ -26,14 +26,6 @@ def _validate_cfg(
    eos_token_id = dataset_cfg.get('eos_token_id', None)
    bos_token_id = dataset_cfg.get('bos_token_id', None)

-    if eos_token_id is None and bos_token_id is None and (
-        hasattr(tokenizer, 'eos_token_id') or
-        hasattr(tokenizer, 'bos_token_id')
-    ):
-        log.warning(
-            'The user has not provided an eos_token_id or bos_token_id, but the tokenizer has an eos_token_id or a bos_token_id.',
-        )
-
    tokenizer_eos_token_id = getattr(tokenizer, 'eos_token_id', None)
    if eos_token_id is not None and eos_token_id != tokenizer_eos_token_id:
        eos_mismatch_str = f'Provided {eos_token_id=} does not match the eos_token_id of the tokenizer={tokenizer_eos_token_id}.'
(Diffs for the remaining changed files are not shown here.)