From 67928cb4d5def4996afe8bc91d2be3bbe42b9aba Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 6 Jun 2024 02:01:54 -0400 Subject: [PATCH 01/16] Fix MPT HF conversion (#1257) --- llmfoundry/utils/huggingface_hub_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llmfoundry/utils/huggingface_hub_utils.py b/llmfoundry/utils/huggingface_hub_utils.py index 3f7b3a0f55..3038014d7f 100644 --- a/llmfoundry/utils/huggingface_hub_utils.py +++ b/llmfoundry/utils/huggingface_hub_utils.py @@ -280,6 +280,9 @@ def edit_files_for_hf_compatibility( for f in files_processed_and_queued } for entrypoint in entrypoint_files: + file_path = os.path.join(folder, entrypoint) + if not os.path.exists(file_path): + continue existing_relative_imports = get_all_relative_imports( os.path.join(folder, entrypoint), ) From 3966f0efe5f6c216834a8ed5f5e319d9335fe49b Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 6 Jun 2024 03:40:12 -0400 Subject: [PATCH 02/16] remove warning (#1258) --- llmfoundry/data/utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llmfoundry/data/utils.py b/llmfoundry/data/utils.py index a5fe3a1022..206e884f70 100644 --- a/llmfoundry/data/utils.py +++ b/llmfoundry/data/utils.py @@ -26,14 +26,6 @@ def _validate_cfg( eos_token_id = dataset_cfg.get('eos_token_id', None) bos_token_id = dataset_cfg.get('bos_token_id', None) - if eos_token_id is None and bos_token_id is None and ( - hasattr(tokenizer, 'eos_token_id') or - hasattr(tokenizer, 'bos_token_id') - ): - log.warning( - 'The user has not provided an eos_token_id or bos_token_id, but the tokenizer has an eos_token_id or a bos_token_id.', - ) - tokenizer_eos_token_id = getattr(tokenizer, 'eos_token_id', None) if eos_token_id is not None and eos_token_id != tokenizer_eos_token_id: eos_mismatch_str = f'Provided {eos_token_id=} does not match the eos_token_id of the tokenizer={tokenizer_eos_token_id}.' 
From 42c2d9a003d697a060eae76c0bf54a0ffbf7722a Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Thu, 6 Jun 2024 11:52:32 -0700 Subject: [PATCH 03/16] Adding more token encoding types (#1254) * add more token encoing types * add more token encoing types * add tests * add tests * ft support, tests * linting is shortening my lifespan * linting is shortening my lifespan * long tensor * long tensor * long tensor * feedbacc * import * import --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/data/__init__.py | 9 +- llmfoundry/data/data.py | 50 ++++- llmfoundry/data/finetuning/tasks.py | 36 +-- llmfoundry/data/text_data.py | 47 +++- scripts/data_prep/README.md | 17 ++ scripts/data_prep/convert_dataset_hf.py | 12 +- scripts/data_prep/convert_dataset_json.py | 30 +-- scripts/data_prep/convert_text_to_mds.py | 13 +- .../data_prep/test_convert_text_to_mds.py | 3 +- tests/data/test_data_encodings.py | 205 ++++++++++++++++++ tests/data/test_dataloader.py | 6 +- 11 files changed, 350 insertions(+), 78 deletions(-) create mode 100644 tests/data/test_data_encodings.py diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py index 966ca90c86..5710be0c55 100644 --- a/llmfoundry/data/__init__.py +++ b/llmfoundry/data/__init__.py @@ -1,7 +1,12 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset +from llmfoundry.data.data import ( + SUPPORTED_MDS_ENCODING_TYPES, + ConcatTokensDataset, + NoConcatDataset, + stream_remote_local_validate, +) from llmfoundry.data.dataloader import build_dataloader from llmfoundry.data.finetuning import ( Seq2SeqFinetuningCollator, @@ -55,4 +60,6 @@ 'auto_packing_ratio', 'profile_packing', 'ConcatenatedSequenceCollatorWrapper', + 'stream_remote_local_validate', + 'SUPPORTED_MDS_ENCODING_TYPES', ] diff --git a/llmfoundry/data/data.py b/llmfoundry/data/data.py index 04eb6d345d..bde68a6998 100644 --- a/llmfoundry/data/data.py +++ b/llmfoundry/data/data.py @@ -5,16 +5,31 @@ import os import warnings from abc import ABC, abstractmethod -from typing import Dict, Iterable, Union +from typing import Dict, Iterable, Optional, Union import datasets as hf_datasets import numpy as np +from numpy.typing import NDArray from torch.utils.data import IterableDataset from transformers import PreTrainedTokenizerBase __all__ = [ + 'AbstractConcatTokensDataset', 'ConcatTokensDataset', 'NoConcatDataset', + 'stream_remote_local_validate', + 'SUPPORTED_MDS_ENCODING_TYPES', +] + +SUPPORTED_MDS_ENCODING_TYPES = [ + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + 'uint16', + 'uint32', + 'uint64', ] @@ -97,14 +112,14 @@ def __init__( ) @abstractmethod - def __iter__(self) -> Iterable[Dict[str, bytes]]: + def __iter__(self) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]: pass class ConcatTokensDataset(AbstractConcatTokensDataset): """An IterableDataset that returns token samples for MDSWriter. 
- Returns dicts of {'tokens': bytes} + Returns dicts of {'tokens': ndarray:int32} To use data created by this class and written to MDS format: @@ -119,7 +134,7 @@ class ConcatTokensDataset(AbstractConcatTokensDataset): # note, you need to copy the numpy array because the original is non-writeable # and torch does not support non-writeable tensors, so you get a scary warning and # if you do try to write to the tensor you get undefined behavior - tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int64).copy()) + tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int32).copy()) print(tokenizer.decode(tokens)) ``` """ @@ -136,7 +151,7 @@ def __init__( self.hf_dataset = hf_dataset super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) - def __iter__(self) -> Iterable[Dict[str, bytes]]: + def __iter__(self) -> Iterable[Dict[str, NDArray]]: buffer = [] for sample in self.hf_dataset: encoded = self.tokenizer( @@ -150,6 +165,27 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]: concat_sample = buffer[:self.max_length] buffer = buffer[self.max_length:] if self.should_wrap else [] yield { - # convert to bytes to store in MDS binary format - 'tokens': np.asarray(concat_sample).tobytes(), + # convert to ndarray to store in MDS format + 'tokens': np.asarray(concat_sample, dtype=np.int32), } + + +def stream_remote_local_validate( + remote: Optional[str], + local: Optional[str], + split: Optional[str], +): + """Check that, if needed, the local/split directory exists. + + Args: + remote (Optional[str]): Remote path to the dataset. + local (Optional[str]): Local path to the dataset. + split (Optional[str]): Subdirectory specifying which dataset split to use, if any. + """ + if remote is None or (local == remote): + if local is not None and os.path.isdir(local): + contents = set(os.listdir(local)) + if split is not None and split not in contents: + raise ValueError( + f'Local directory {local} does not contain split {split}', + ) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index b7cce4d20a..40f178fb6e 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -59,6 +59,10 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: from streaming import Stream, StreamingDataset from transformers import PreTrainedTokenizerBase +from llmfoundry.data import ( + SUPPORTED_MDS_ENCODING_TYPES, + stream_remote_local_validate, +) from llmfoundry.data.finetuning.collator import ( _HF_IGNORE_INDEX, stitch_turns_decoder_only, @@ -494,26 +498,15 @@ def is_valid_ift_example( return True -def _stream_remote_local_validate( - remote: Optional[str], - local: Optional[str], - split: Optional[str], -): - if remote is None or (local == remote): - if local is not None and os.path.isdir(local): - contents = set(os.listdir(local)) - if split is not None and split not in contents: - raise ValueError( - f'Local directory {local} does not contain split {split}', - ) - - class StreamingFinetuningDataset(StreamingDataset): """Finetuning dataset with flexible tokenization using StreamingDataset. Args: tokenizer (Tokenizer): The name of the HuggingFace tokenizer to use to tokenize samples. + token_encoding_type (str): The encoding type of the tokenized samples. This is only used + for legacy datasets that have been written directly as 'bytes' instead of numpy + arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'. 
streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from, which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or ``remote``/``local``. Defaults to ``None``. @@ -574,6 +567,7 @@ class StreamingFinetuningDataset(StreamingDataset): def __init__( self, tokenizer: PreTrainedTokenizerBase, + token_encoding_type: str = 'int64', streams: Optional[Sequence[Stream]] = None, local: Optional[str] = None, remote: Optional[str] = None, @@ -606,11 +600,17 @@ def __init__( f'StreamingFinetuningDataset() got an unexpected keyword argument: {kwargs}', ) + if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES: + raise ValueError( + f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}', + ) + self.token_encoding_type = token_encoding_type + if streams is None: - _stream_remote_local_validate(remote, local, split) + stream_remote_local_validate(remote, local, split) else: for stream in streams: - _stream_remote_local_validate( + stream_remote_local_validate( stream.remote, stream.local, split, @@ -656,11 +656,11 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: if isinstance(sample['input_ids'], bytes): sample['input_ids'] = np.frombuffer( sample['input_ids'], - dtype=np.int64, + dtype=getattr(np, self.token_encoding_type), )[:self.max_seq_len].tolist().copy() sample['labels'] = np.frombuffer( sample['labels'], - dtype=np.int64, + dtype=getattr(np, self.token_encoding_type), )[:self.max_seq_len].tolist().copy() elif isinstance(sample['input_ids'], np.ndarray): sample['input_ids'] = sample['input_ids'][:self.max_seq_len diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 60b81cd145..86d5edbaf4 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -4,7 +4,6 @@ """Build a StreamingTextDataset dataset and dataloader for training.""" import inspect -import os from itertools import islice from typing import ( Any, @@ -25,6 +24,10 @@ from transformers import PreTrainedTokenizerBase from llmfoundry import registry +from llmfoundry.data import ( + SUPPORTED_MDS_ENCODING_TYPES, + stream_remote_local_validate, +) from llmfoundry.utils.registry_utils import construct_from_registry __all__ = [ @@ -41,6 +44,9 @@ class StreamingTextDataset(StreamingDataset): tokenizer (Tokenizer): HuggingFace tokenizer to tokenize samples. max_seq_len (int): The max sequence length of each sample. + token_encoding_type (str): The encoding type of the tokenized samples. This is only used + for legacy datasets that have been written directly as 'bytes' instead of numpy + arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'. streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from, which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or ``remote``/``local``. Defaults to ``None``. 
@@ -106,6 +112,7 @@ def __init__( self, tokenizer: PreTrainedTokenizerBase, max_seq_len: int, + token_encoding_type: str = 'int64', streams: Optional[Sequence[Stream]] = None, remote: Optional[str] = None, local: Optional[str] = None, @@ -137,13 +144,21 @@ def __init__( f'StreamingTextDataset() got an unexpected keyword argument: {kwargs}', ) - if local is not None and (remote is None or (local == remote)): - if os.path.isdir(local): - contents = set(os.listdir(local)) - if split not in contents: - raise ValueError( - f'local directory {local} does not contain split {split}', - ) + if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES: + raise ValueError( + f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}', + ) + self.token_encoding_type = token_encoding_type + + if streams is None: + stream_remote_local_validate(remote, local, split) + else: + for stream in streams: + stream_remote_local_validate( + stream.remote, + stream.local, + split, + ) # TODO: discover where yamls are being converted incorrect, but temporary workaround if isinstance(shuffle_block_size, float): @@ -197,10 +212,18 @@ def _read_binary_tokenized_sample( self, sample: Dict[str, Any], ) -> torch.Tensor: - return torch.from_numpy( - np.frombuffer(sample['tokens'], - dtype=np.int64)[:self.max_seq_len].copy(), - ) + # Modeling code still expects int64 tensors. + if isinstance(sample['tokens'], np.ndarray): + return torch.from_numpy( + sample['tokens'][:self.max_seq_len].copy(), + ).to(torch.int64) + else: + return torch.from_numpy( + np.frombuffer( + sample['tokens'], + dtype=getattr(np, self.token_encoding_type), + )[:self.max_seq_len].copy(), + ).to(torch.int64) # How to process a sample def __getitem__(self, diff --git a/scripts/data_prep/README.md b/scripts/data_prep/README.md index 7881298b2f..3601cc865f 100644 --- a/scripts/data_prep/README.md +++ b/scripts/data_prep/README.md @@ -35,6 +35,23 @@ python convert_dataset_json.py \ Where `--path` can be a single json file, or a folder containing json files. `--split` denotes the intended split (hf defaults to `train`). +### Raw text files + +Using the `convert_text_to_mds.py` script, we convert a [text file](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) containing the complete works of William Shakespeare. + + +```bash +# Convert json dataset to StreamingDataset format +mkdir shakespeare && cd shakespeare +curl -O https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt +cd .. 
+python convert_text_to_mds.py \ + --output_folder my-copy-shakespeare \ + --input_folder shakespeare \ + --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b \ + --compression zstd +``` + ## Converting a finetuning dataset Using the `convert_finetuning_dataset.py` script you can run a command such as: diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py index d7aaa52193..bf7f145610 100644 --- a/scripts/data_prep/convert_dataset_hf.py +++ b/scripts/data_prep/convert_dataset_hf.py @@ -12,6 +12,8 @@ import datasets as hf_datasets import psutil +import torch +from numpy.typing import NDArray from streaming import MDSWriter from torch.utils.data import DataLoader, Dataset, IterableDataset from tqdm import tqdm @@ -338,7 +340,7 @@ def build_dataloader( def generate_samples( loader: DataLoader, truncate_num_samples: Optional[int] = None, -) -> Iterable[Dict[str, bytes]]: +) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]: """Generator over samples of a dataloader. Args: @@ -356,7 +358,11 @@ def generate_samples( if truncate_num_samples is not None and n_samples == truncate_num_samples: return n_samples += 1 - yield {k: v[idx] for k, v in batch.items()} + yield { + k: + v[idx].numpy() if isinstance(v[idx], torch.Tensor) else v[idx] + for k, v in batch.items() + } def main(args: Namespace) -> None: @@ -377,7 +383,7 @@ def main(args: Namespace) -> None: tokenizer = build_tokenizer(args.tokenizer, args.tokenizer_kwargs) # we will enforce length, so suppress warnings about sequences too long for the model tokenizer.model_max_length = int(1e30) - columns = {'tokens': 'bytes'} + columns = {'tokens': 'ndarray:int32'} else: mode = ConcatMode.NO_CONCAT tokenizer = None diff --git a/scripts/data_prep/convert_dataset_json.py b/scripts/data_prep/convert_dataset_json.py index fb117ddef3..37b0465692 100644 --- a/scripts/data_prep/convert_dataset_json.py +++ b/scripts/data_prep/convert_dataset_json.py @@ -6,11 +6,11 @@ from argparse import ArgumentParser, Namespace from enum import Enum from glob import glob -from typing import Dict, Iterable, Optional +from typing import Optional import datasets as hf_datasets from streaming import MDSWriter -from torch.utils.data import DataLoader, IterableDataset +from torch.utils.data import IterableDataset from tqdm import tqdm from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -140,30 +140,6 @@ def build_hf_dataset( return dataset -def generate_samples( - loader: DataLoader, - truncate_num_samples: Optional[int] = None, -) -> Iterable[Dict[str, bytes]]: - """Generator over samples of a dataloader. - - Args: - loader (DataLoader): A dataloader emitting batches like {key: [sample0_bytes, sample1_bytes, sample2_bytes, ...]} - truncate_num_samples (Optional[int]): An optional # of samples to stop at. - - Yields: - Sample dicts. - """ - n_samples = 0 - for batch in loader: - keys = list(batch.keys()) - current_bs = len(batch[keys[0]]) - for idx in range(current_bs): - if truncate_num_samples is not None and n_samples == truncate_num_samples: - return - n_samples += 1 - yield {k: v[idx] for k, v in batch.items()} - - def main(args: Namespace) -> None: """Main: create C4/pile streaming dataset. 
@@ -175,7 +151,7 @@ def main(args: Namespace) -> None: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) # we will enforce length, so suppress warnings about sequences too long for the model tokenizer.model_max_length = int(1e30) - columns = {'tokens': 'bytes'} + columns = {'tokens': 'ndarray:int32'} else: mode = ConcatMode.NO_CONCAT tokenizer = None diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 365cc9b71d..b2f0b0e7b4 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -18,6 +18,7 @@ maybe_create_object_store_from_uri, parse_uri, ) +from numpy.typing import NDArray from streaming import MDSWriter from tqdm import tqdm from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -42,7 +43,7 @@ class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset): """An IterableDataset that returns token samples for MDSWriter from files. - Returns dicts of {'tokens': bytes} + Returns dicts of {'tokens': ndarray:int32} Each file is considered a sequence. """ @@ -59,7 +60,7 @@ def __init__( self.files = files super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) - def __iter__(self) -> Iterable[Dict[str, bytes]]: + def __iter__(self) -> Iterable[Dict[str, NDArray]]: buffer = [] for file in self.files: @@ -87,7 +88,9 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]: concat_sample = buffer[:self.max_length] buffer = buffer[self. max_length:] if self.should_wrap else [] - yield {'tokens': np.asarray(concat_sample).tobytes()} + yield { + 'tokens': np.asarray(concat_sample, dtype=np.int32), + } first_chunk = False @@ -98,7 +101,7 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]: while len(buffer) >= self.max_length: concat_sample = buffer[:self.max_length] buffer = buffer[self.max_length:] if self.should_wrap else [] - yield {'tokens': np.asarray(concat_sample).tobytes()} + yield {'tokens': np.asarray(concat_sample, dtype=np.int32)} def parse_args() -> Namespace: @@ -356,7 +359,7 @@ def download_and_convert( no_wrap=no_wrap, ) - columns = {'tokens': 'bytes'} + columns = {'tokens': 'ndarray:int32'} log.info('Converting to MDS format...') with MDSWriter( diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index df4309e13d..8dac151f55 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -9,7 +9,6 @@ from typing import Callable, Iterable, List from unittest.mock import Mock, patch -import numpy as np import pytest from streaming import StreamingDataset from transformers import AutoTokenizer @@ -194,7 +193,7 @@ def call_convert_text_to_mds() -> None: n_tokens = 0 for i in range(dataset.num_samples): sample = dataset[i] - tokens = np.frombuffer(sample['tokens'], dtype=int) + tokens = sample['tokens'] if i == 0: # For the first sample, check that the decoded sample matches the text_content decoded = tokenizer.decode(tokens) assert decoded == text_content[:len(decoded)] diff --git a/tests/data/test_data_encodings.py b/tests/data/test_data_encodings.py new file mode 100644 index 0000000000..a45bfbcb88 --- /dev/null +++ b/tests/data/test_data_encodings.py @@ -0,0 +1,205 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +import pathlib + +import numpy as np +import pytest +import torch +from streaming import MDSWriter + +from llmfoundry.data import SUPPORTED_MDS_ENCODING_TYPES, 
StreamingTextDataset +from llmfoundry.data.finetuning.tasks import StreamingFinetuningDataset + + +@pytest.mark.parametrize( + 'token_encoding_type', + SUPPORTED_MDS_ENCODING_TYPES + ['default'], +) +@pytest.mark.parametrize('use_bytes', [True, False]) +@pytest.mark.parametrize('samples', [10]) +@pytest.mark.parametrize('max_seq_len', [2048]) +def test_encoding_types_text( + tmp_path: pathlib.Path, + token_encoding_type: str, + use_bytes: bool, + samples: int, + max_seq_len: int, +): + dataset_local_path = str(tmp_path) + if token_encoding_type != 'default': + encoding_dtype = getattr(np, token_encoding_type) + else: + encoding_dtype = None + + if use_bytes: + columns = { + 'tokens': 'bytes', + } + else: + columns = { + 'tokens': + 'ndarray:' + token_encoding_type + if token_encoding_type != 'default' else 'ndarray', + } + + with MDSWriter(out=dataset_local_path, columns=columns) as writer: + for _ in range(samples): + if token_encoding_type != 'default': + tokens = np.random.randint( + 0, + np.iinfo(encoding_dtype).max, + max_seq_len, + dtype=encoding_dtype, + ) + else: + tokens = np.random.randint( + 0, + 200, + max_seq_len, + ) + if use_bytes: + tokens = tokens.tobytes() + writer.write({'tokens': tokens}) + + if use_bytes and token_encoding_type != 'default': + dataset = StreamingTextDataset( + tokenizer=None, + token_encoding_type=token_encoding_type, + max_seq_len=max_seq_len, + local=dataset_local_path, + batch_size=1, + ) + else: + # There should be no need to pass in the token encoding type if writing out ndarrays, + # or if using the default token encoding type. + dataset = StreamingTextDataset( + tokenizer=None, + max_seq_len=max_seq_len, + local=dataset_local_path, + batch_size=1, + ) + + for _, sample in enumerate(dataset): + # StreamingTextDataset should return an int64 torch Tensor + assert sample.dtype == torch.int64 + assert sample.shape == (max_seq_len,) + + +@pytest.mark.parametrize( + 'token_encoding_type', + SUPPORTED_MDS_ENCODING_TYPES + ['default'], +) +@pytest.mark.parametrize('use_bytes', [True, False]) +@pytest.mark.parametrize('samples', [10]) +@pytest.mark.parametrize('max_seq_len', [2048]) +def test_encoding_types_finetuning( + tmp_path: pathlib.Path, + token_encoding_type: str, + use_bytes: bool, + samples: int, + max_seq_len: int, +): + dataset_local_path = str(tmp_path) + if token_encoding_type != 'default': + encoding_dtype = getattr(np, token_encoding_type) + else: + encoding_dtype = None + + if use_bytes: + columns = { + 'input_ids': 'bytes', + 'labels': 'bytes', + } + else: + columns = { + 'input_ids': + 'ndarray:' + token_encoding_type + if token_encoding_type != 'default' else 'ndarray', + 'labels': + 'ndarray:' + token_encoding_type + if token_encoding_type != 'default' else 'ndarray', + } + + with MDSWriter(out=dataset_local_path, columns=columns) as writer: + for _ in range(samples): + if token_encoding_type != 'default': + input_ids = np.random.randint( + 0, + np.iinfo(encoding_dtype).max, + max_seq_len, + dtype=encoding_dtype, + ) + labels = np.random.randint( + 0, + np.iinfo(encoding_dtype).max, + max_seq_len, + dtype=encoding_dtype, + ) + else: + input_ids = np.random.randint( + 0, + 200, + max_seq_len, + ) + labels = np.random.randint( + 0, + 200, + max_seq_len, + ) + if use_bytes: + input_ids = input_ids.tobytes() + labels = labels.tobytes() + writer.write({'input_ids': input_ids, 'labels': labels}) + + if use_bytes and token_encoding_type != 'default': + dataset = StreamingFinetuningDataset( + tokenizer=None, + 
token_encoding_type=token_encoding_type, + local=dataset_local_path, + max_seq_len=max_seq_len, + batch_size=1, + ) + else: + # There should be no need to pass in the token encoding type if writing out ndarrays, + # or if using the default token encoding type. + dataset = StreamingFinetuningDataset( + tokenizer=None, + local=dataset_local_path, + max_seq_len=max_seq_len, + batch_size=1, + ) + + for _, sample in enumerate(dataset): + # StreamingFinetuningDataset puts samples in a list, and converts arrays to lists too. + assert isinstance(sample['turns'][0]['input_ids'][0], int) + assert len(sample['turns'][0]['input_ids']) == max_seq_len + assert isinstance(sample['turns'][0]['labels'][0], int) + assert len(sample['turns'][0]['labels']) == max_seq_len + + +@pytest.mark.parametrize( + 'token_encoding_type', + ['int17', 'float32', 'complex', 'int4'], +) +@pytest.mark.parametrize('use_finetuning', [True, False]) +def test_unsupported_encoding_type( + token_encoding_type: str, + use_finetuning: bool, +): + with pytest.raises(ValueError, match='The token_encoding_type*'): + if use_finetuning: + StreamingFinetuningDataset( + tokenizer=None, + token_encoding_type=token_encoding_type, + local='dataset/path', + max_seq_len=2048, + batch_size=1, + ) + else: + StreamingTextDataset( + tokenizer=None, + token_encoding_type=token_encoding_type, + max_seq_len=2048, + local='dataset/path', + batch_size=1, + ) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 7c8e808bab..ec27df8121 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -114,8 +114,8 @@ def build_mock_ft_streaming_dataset( columns = {'input_ids': 'bytes', 'labels': 'bytes'} else: columns = { - 'input_ids': 'ndarray:uint32', - 'labels': 'ndarray:uint32', + 'input_ids': 'ndarray:int32', + 'labels': 'ndarray:int32', } else: columns = {'prompt': 'str', 'response': 'str'} @@ -142,7 +142,7 @@ def build_mock_ft_streaming_dataset( else: sample_to_write[key] = np.asarray( sample[key], - dtype=np.uint32, + dtype=np.int32, ) output_writer.write(sample_to_write) else: From 14f296c340f85dea04970ad191ef5abd2aaf4326 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 6 Jun 2024 19:36:45 -0400 Subject: [PATCH 04/16] Bump Composer to 0.23.0 (#1259) --- scripts/train/train.py | 2 +- setup.py | 8 ++++---- tests/models/test_model.py | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index c9e2d67bf4..3cf3d9551d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -504,7 +504,7 @@ def main(cfg: DictConfig) -> Trainer: precision=train_cfg.precision, algorithms=algorithms, device_train_microbatch_size=train_cfg.device_train_microbatch_size, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, save_folder=train_cfg.save_folder, save_filename=save_filename, save_latest_filename=save_latest_filename, diff --git a/setup.py b/setup.py index 78182976d4..0556050de9 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs]>=0.22.0,<0.23', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.23.0,<0.24', 'mlflow>=2.12.1,<2.13', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.40,<4.41', @@ -92,14 +92,14 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.22.0,<0.23', + 'mosaicml[databricks]>=0.23.0,<0.24', 'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', 'lz4>=4,<5', ] extra_deps['tensorboard'] = [ - 
'mosaicml[tensorboard]>=0.22.0,<0.23', + 'mosaicml[tensorboard]>=0.23.0,<0.24', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.22.0,<0.23', + 'mosaicml[peft]>=0.23.0,<0.24', ] extra_deps['openai'] = [ diff --git a/tests/models/test_model.py b/tests/models/test_model.py index a62a7dd114..2f93b1d3ce 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -13,10 +13,15 @@ import torch.nn as nn from accelerate import init_empty_weights from composer.core.precision import Precision, get_precision_context +from composer.distributed.dist_strategy import prepare_fsdp_module from composer.models.huggingface import maybe_get_underlying_model from composer.optim import DecoupledAdamW -from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device, reproducibility +from composer.utils import ( + FSDPConfig, + dist, + get_device, + reproducibility, +) from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import ( @@ -2538,7 +2543,14 @@ def test_hf_init( betas=(0.9, 0.99), ) - prepare_fsdp_module(model, optimizer, fsdp_config, precision, device, False) + prepare_fsdp_module( + model, + optimizer, + FSDPConfig(**fsdp_config), + precision, + device, + False, + ) model = HuggingFaceModelWithFSDP(model, tokenizer) From bea61fb5d979d855f7025ae218de6dbd68857cc6 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 6 Jun 2024 20:25:18 -0400 Subject: [PATCH 05/16] Bump Version to 0.10.0.dev0 (#1255) * bump version * typo * Update config_utils.py These changes are necessary as the deprecation broke compatibility with `update_batch_size`. * Update config_utils.py fix typo * typo * typo I * update tests * typo II * typo III * bump composer version * undo composer bump for seperate pr * fix test * fix tests II * yolo * tye-o * pyrite * we resolve later * revert new . 
syntax --------- Co-authored-by: v-chen_data Co-authored-by: Milo Cress Co-authored-by: Saaketh Narayan Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/__init__.py | 2 +- llmfoundry/utils/config_utils.py | 13 ++++++++----- scripts/train/train.py | 1 - tests/a_scripts/eval/test_eval.py | 10 +++++++++- tests/a_scripts/eval/test_eval_inputs.py | 15 ++++----------- tests/a_scripts/train/test_train_inputs.py | 18 ++++++------------ 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index c9666566bf..5e2795f9c9 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -71,4 +71,4 @@ 'utils', ] -__version__ = '0.9.0.dev0' +__version__ = '0.10.0.dev0' diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index b6a5acf6d9..5ab148bbe8 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -67,6 +67,7 @@ class EvalConfig: # Logging parameters python_log_level: Optional[str] = 'debug' loggers: Optional[Dict[str, Any]] = None + console_log_interval: Union[int, str] = '1ba' log_config: bool = True # Model/run parameters @@ -180,6 +181,11 @@ class TrainConfig: # Variables to ignore variables: Optional[Dict[str, Any]] = None + # Fields created by `update_batch_size_info` + n_gpus: int = MISSING + device_train_batch_size: int = MISSING + device_train_grad_accum: str = MISSING + TRAIN_CONFIG_KEYS = {field.name for field in fields(TrainConfig)} @@ -242,7 +248,6 @@ def make_dataclass_and_log_config( icl_tasks_required: bool = False, ) -> Tuple[Dict[str, Any], T]: """Converts a DictConfig to a dataclass and creates a logged config.""" - # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) assert isinstance(unstructured_config, dict) assert all(isinstance(k, str) for k in unstructured_config.keys()) @@ -289,11 +294,9 @@ def make_dataclass_and_log_config( unstructured_config['variables'] = {} for key in extraneous_keys: - warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. Top-level variables are deprecated and will not be supported in future releases. Please place any variables under the `variables` key.', - category=DeprecationWarning, + raise ValueError( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. 
Please place any variables under the `variables` key.', ) - unstructured_config['variables'][key] = unstructured_config.pop(key) dataclass_dict_config: DictConfig = om.structured( dataclass_constructor(**unstructured_config), diff --git a/scripts/train/train.py b/scripts/train/train.py index 3cf3d9551d..f2a70b526d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -553,6 +553,5 @@ def main(cfg: DictConfig) -> Trainer: yaml_cfg = om.load(f) cli_cfg = om.from_cli(args_list) cfg = om.merge(yaml_cfg, cli_cfg) - om.resolve(cfg) assert isinstance(cfg, DictConfig) main(cfg) diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index a56778538c..01f3760d26 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -13,7 +13,7 @@ from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_dict_container +from llmfoundry.utils.config_utils import EVAL_CONFIG_KEYS, to_dict_container from scripts.eval.eval import main # noqa: E402 from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @@ -134,6 +134,14 @@ def test_loader_eval( test_cfg.eval_interval = '1ba' test_cfg.loggers = om.DictConfig({'inmemory': om.DictConfig({})}) + # This test uses a training yaml with training-only keys present. + # We exclude these keys before calling `main` from the eval script. + allowed_keys = EVAL_CONFIG_KEYS + present_keys = set(test_cfg.keys()) + keys_to_pop = present_keys.difference(allowed_keys) + + [test_cfg.pop(key) for key in keys_to_pop] + trainers, eval_gauntlet_df = main(test_cfg) assert eval_gauntlet_df is None diff --git a/tests/a_scripts/eval/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py index 98b15743b3..0ca5765a26 100644 --- a/tests/a_scripts/eval/test_eval_inputs.py +++ b/tests/a_scripts/eval/test_eval_inputs.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import copy import os -import warnings import omegaconf import pytest @@ -42,12 +41,13 @@ def test_mispelled_mandatory_params_fail(self, cfg: DictConfig) -> None: omegaconf.errors.InterpolationKeyError, omegaconf.errors.MissingMandatoryValue, TypeError, + ValueError, )): cfg[p + '-mispelled'] = cfg.pop(p) main(cfg) cfg[p] = cfg.pop(p + '-mispelled') - def test_optional_mispelled_params_raise_warning( + def test_optional_mispelled_params_raise_error( self, cfg: DictConfig, ) -> None: @@ -67,15 +67,8 @@ def test_optional_mispelled_params_raise_warning( orig_value = cfg.pop(param, None) updated_param = param + '-mispelling' cfg[updated_param] = orig_value - with warnings.catch_warnings(record=True) as warning_list: - try: - main(cfg) - except: - pass - assert any( - f'Unused parameter {updated_param} found in cfg.' in - str(warning.message) for warning in warning_list - ) + with pytest.raises(ValueError): + main(cfg) # restore configs. 
cfg = copy.deepcopy(old_cfg) diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index 5a3b21dc3b..5901d53e94 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -3,7 +3,6 @@ import copy import json import os -import warnings import omegaconf import pytest @@ -63,7 +62,9 @@ def cfg(self, foundry_dir: str) -> DictConfig: def test_misspelled_mandatory_params_fail(self, cfg: DictConfig) -> None: """Check that mandatory misspelled inputs fail to train.""" cfg.trai_loader = cfg.pop('train_loader') - with pytest.raises((omegaconf.errors.MissingMandatoryValue, TypeError)): + with pytest.raises( + (omegaconf.errors.MissingMandatoryValue, TypeError, ValueError), + ): main(cfg) def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: @@ -89,7 +90,7 @@ def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: main(cfg) cfg[param] = orig_param - def test_optional_misspelled_params_raise_warning( + def test_optional_misspelled_params_raise_error( self, cfg: DictConfig, ) -> None: @@ -113,15 +114,8 @@ def test_optional_misspelled_params_raise_warning( orig_value = cfg.pop(param, None) updated_param = param + '-misspelling' cfg[updated_param] = orig_value - with warnings.catch_warnings(record=True) as warning_list: - try: - main(cfg) - except: - pass - assert any( - f'Unused parameter {updated_param} found in cfg.' in - str(warning.message) for warning in warning_list - ) + with pytest.raises(ValueError): + main(cfg) # restore configs. cfg = copy.deepcopy(old_cfg) From e4b8b571b82933d382aee69fe74e9d8171163d83 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 7 Jun 2024 00:08:14 -0700 Subject: [PATCH 06/16] Fix typo in setup.py (#1263) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0556050de9..3fefc0426e 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ 'beautifulsoup4>=4.12.2,<5', # required for model download utils 'tenacity>=8.2.3,<9', 'catalogue>=2,<3', - 'typer[all]<1', + 'typer<1', ] extra_deps = {} From db7013516133849b9e2cab3f9e66bf9ad0882a39 Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Fri, 7 Jun 2024 10:47:38 -0700 Subject: [PATCH 07/16] Update TE Dockerfile (#1265) Update Dockerfile with TE main --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 253a5b6cd8..73b6d7fb07 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install TransformerEngine -RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c +RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@0edf30b87159e82048b5f248e4b379aebb8f364a # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git From 4e53e7449471c2671a512364fd6241fad83c3cdd Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Fri, 7 Jun 2024 12:15:20 -0700 Subject: [PATCH 08/16] Revert "Update TE Dockerfile (#1265)" (#1266) This reverts commit db7013516133849b9e2cab3f9e66bf9ad0882a39. 
--- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 73b6d7fb07..253a5b6cd8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install TransformerEngine -RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@0edf30b87159e82048b5f248e4b379aebb8f364a +RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git From dddb9b81cdde35f3768ff5f112150916ca1ba379 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 7 Jun 2024 16:35:51 -0400 Subject: [PATCH 09/16] revert to nvidia code (#1267) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 253a5b6cd8..ca684dca2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install TransformerEngine -RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c +RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335 # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git From dd92abf78a1927ac1ec8674b670fe3744f759be2 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 7 Jun 2024 20:46:55 -0700 Subject: [PATCH 10/16] Bump composer to 0.23.2 (#1269) --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 3fefc0426e..f81b1cd0f1 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs]>=0.23.0,<0.24', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.23.2,<0.24', 'mlflow>=2.12.1,<2.13', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.40,<4.41', @@ -92,14 +92,14 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.23.0,<0.24', + 'mosaicml[databricks]>=0.23.2,<0.24', 'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', 'lz4>=4,<5', ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.23.0,<0.24', + 'mosaicml[tensorboard]>=0.23.2,<0.24', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.23.0,<0.24', + 'mosaicml[peft]>=0.23.2,<0.24', ] extra_deps['openai'] = [ From 5571101a50804406ef0fe23e7ea6795b3c4a1bcb Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 9 Jun 2024 10:50:54 -0400 Subject: [PATCH 11/16] fix linting (#1270) * fix linting * fix --- llmfoundry/data/dataloader.py | 4 ++-- llmfoundry/utils/config_utils.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 83a9a7d8ea..e7521bc343 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -3,7 +3,7 @@ """Dataloader builder 
utilities.""" -from typing import Any, Dict +from typing import Any, Dict, Union from composer import DataSpec from transformers import PreTrainedTokenizerBase @@ -19,7 +19,7 @@ def build_dataloader( cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, - device_batch_size: int, + device_batch_size: Union[int, float], ) -> DataSpec: """Builds a dataloader from a config. diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 5ab148bbe8..5c1ec9114a 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -100,7 +100,7 @@ class TrainConfig: optimizer: Dict[str, Any] = MISSING scheduler: Dict[str, Any] = MISSING train_loader: Dict[str, Any] = MISSING - device_train_batch_size: int = MISSING + device_train_batch_size: Union[int, float] = MISSING device_eval_batch_size: int = MISSING max_duration: Union[int, str] = MISSING eval_interval: Union[int, str] = MISSING @@ -183,7 +183,6 @@ class TrainConfig: # Fields created by `update_batch_size_info` n_gpus: int = MISSING - device_train_batch_size: int = MISSING device_train_grad_accum: str = MISSING From ffec54b491bd7c1bd3de236707a6e9f5aadcbb51 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 13 Jun 2024 09:59:23 -0700 Subject: [PATCH 12/16] Add torch 2.3.1 docker images (#1275) --- .github/workflows/docker.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 6ca10fcd47..89aa917809 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -23,6 +23,12 @@ jobs: - name: "2.3.0_cu121_flash2_aws" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws dep_groups: "[gpu-flash2]" + - name: "2.3.1_cu121" + base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + dep_groups: "[gpu]" + - name: "2.3.1_cu121_aws" + base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws + dep_groups: "[gpu]" steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 From c30856f96949a298d307219c4f13e7bd6aeddbab Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:24:01 -0400 Subject: [PATCH 13/16] Make expandable segments on by default (#1278) --- llmfoundry/utils/config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 5c1ec9114a..f91ae79404 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -115,7 +115,7 @@ class TrainConfig: # Cuda allocation configuration max_split_size_mb: Optional[int] = None - expandable_segments: bool = False + expandable_segments: bool = True cuda_load_lazy: bool = False # Distributed training parameters From 630fc6879f721ead6064d501cd70b5cc69807386 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 13 Jun 2024 19:25:03 -0700 Subject: [PATCH 14/16] Add CI for torch 2.3.1 (#1281) --- .github/workflows/pr-cpu.yaml | 4 ++++ .github/workflows/pr-gpu.yaml | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 93612b7983..78faea8e44 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -23,6 +23,10 @@ jobs: container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04 markers: "not gpu" pytest_command: "coverage run -m pytest" + - name: "cpu-2.3.1" + container: 
mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + markers: "not gpu" + pytest_command: "coverage run -m pytest" name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 31af66e51f..335d049306 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -24,6 +24,11 @@ jobs: markers: "gpu" pytest_command: "coverage run -m pytest" pip_deps: "[all]" + - name: "gpu-2.3.1" + container: mosaicml/llm-foundry:2.3.1_cu121_flash2-latest + markers: "gpu" + pytest_command: "coverage run -m pytest" + pip_deps: "[all]" name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: From 9b9fc24b86c156e45d2e54b64f3dbc7a68235c1e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 13 Jun 2024 23:06:19 -0400 Subject: [PATCH 15/16] Update README.md to use variables (#1282) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 70436271dd..c92c252395 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ python data_prep/convert_dataset_hf.py \ # Train an MPT-125m model for 10 batches composer train/train.py \ train/yamls/pretrain/mpt-125m.yaml \ - data_local=my-copy-c4 \ + variables.data_local=my-copy-c4 \ train_loader.dataset.split=train_small \ eval_loader.dataset.split=val_small \ max_duration=10ba \ From 1a2fac0c25be354c3e1531301ed69202af66c085 Mon Sep 17 00:00:00 2001 From: sanjari-orb <137819448+sanjari-orb@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:43:14 -0700 Subject: [PATCH 16/16] Add registry for ICL datasets (#1252) --- llmfoundry/eval/datasets/__init__.py | 12 + .../in_context_learning_evaluation.py | 519 ++++++++++-------- llmfoundry/registry.py | 17 + llmfoundry/utils/builders.py | 53 +- .../eval/test_in_context_learning_datasets.py | 363 ++++++------ tests/test_registry.py | 1 + 6 files changed, 537 insertions(+), 428 deletions(-) diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py index 02a2b88b21..a3a36053da 100644 --- a/llmfoundry/eval/datasets/__init__.py +++ b/llmfoundry/eval/datasets/__init__.py @@ -22,6 +22,18 @@ tokenizer_needs_prefix_space, trim_context, ) +from llmfoundry.registry import icl_datasets + +icl_datasets.register( + 'multiple_choice', + func=InContextLearningMultipleChoiceTaskDataset, +) +icl_datasets.register('schema', func=InContextLearningSchemaTaskDataset) +icl_datasets.register('language_modeling', func=InContextLearningLMTaskDataset) +icl_datasets.register( + 'generation_task_with_answers', + func=InContextLearningGenerationTaskWithAnswersDataset, +) __all__ = [ 'InContextLearningDataset', diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index debb0dbc6f..c87b38b09a 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -19,6 +19,7 @@ from datasets import IterableDataset, load_dataset from torch.utils.data import DataLoader, Dataset +from llmfoundry import registry from llmfoundry.eval.datasets.utils import ( convert_tokens_to_tensors, get_continuation_span, @@ -29,6 +30,7 @@ tokenizer_needs_prefix_space, trim_context, ) +from llmfoundry.utils.registry_utils import construct_from_registry log = logging.getLogger(__name__) @@ -114,11 +116,11 @@ def __init__( max_seq_len: int, pad_tok_id: int, num_fewshot: int, - fewshot_random_seed: int, - prompt_string: str, - 
example_delimiter: str, - continuation_delimiter: str, destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', prelimiter: str = '', context_key: str = 'context', answer_key: str = 'answer', @@ -189,6 +191,20 @@ def __len__(self) -> int: def get_num_samples_in_batch(self, batch: Dict) -> int: return batch['input_ids'].shape[0] + def get_effective_batch_size(self, batch_size: int) -> int: + r"""Returns effective batch size computed for given ICL task. + + The effective batch size may not be equal to the configured evaluation + batch size because for certain ICL tasks, >1 prompts can get created + for every input query depending on the number of choices/continuations. + This requires the effective batch size to be reduced to prevent larger batches than expected during eval. For example, + check InContextLearningMultipleChoiceTaskDataset. + + Args: + batch_size (int): Original batch size configured for ICL evaluations + """ + return batch_size + def update_generation_kwargs(self, generation_kwargs: Dict) -> None: r"""Updates self.base_batch with the passed in generation_kwargs. @@ -519,46 +535,12 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def split_batch(self, batch: Any, - microbatch_size: Union[int, float]) -> Sequence[Any]: - """Handling for certain specialty columns that must be split into. - - batches in different formats. - - Args: - batch (Dict): Batch of data - microbatch_size (int | float): Size of microbatches - - Returns: - List: List of chunked batches - """ - # Don't split kwargs that don't change - # Normally split torch tensors - # List split lists of strings - if isinstance(microbatch_size, float): - raise ValueError( - 'split_batch does not support floating point microbatch_size.', - ) - chunked = {} - for k, v in batch.items(): - if k in self.static_keys: - # Defer broadcasting until we know num_chunks - pass - elif k in self.list_keys: - chunked[k] = _split_list(v, microbatch_size) - elif k in self.tensor_keys: - chunked[k] = _default_split_batch(v, microbatch_size) - else: - raise ValueError(f'Unexpected key {k} in batch splitting') - num_chunks = len(chunked['input_ids']) - for k, v in batch.items(): - if k in self.static_keys: - chunked[k] = [v] * num_chunks - - batched_list = [{k: v[idx] - for k, v in chunked.items()} - for idx in range(num_chunks)] - return batched_list + def split_batch( + self, + batch: Any, + microbatch_size: Union[int, float], + ) -> Sequence[Any]: + return _default_split_batch(batch, microbatch_size) class InContextLearningGenerationTaskWithAnswersDataset( @@ -584,13 +566,31 @@ class InContextLearningGenerationTaskWithAnswersDataset( def __init__( self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + context_key: str = 'context', + answer_key: str = 'answer', + strip_dataset: bool = True, + padding_size: Optional[int] = None, + base_batch: Optional[Dict] = None, + batch_mapping: Optional[Dict] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, cot_delimiter: str = '', early_stopping_criteria: 
Optional[List[str]] = None, do_normalization: bool = True, - *args: Any, - **kwargs: Any, ): - if kwargs['tokenizer'].eos_token_id is None: + if tokenizer.eos_token_id is None: raise ValueError( '`InContextLearningGenerationTaskWithAnswersDataset` tokenizer must have non-null `eos_token_id`', ) @@ -607,13 +607,32 @@ def __init__( tensor_keys = ['input_ids', 'attention_mask'] list_keys = ['labels'] super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + context_key=context_key, + answer_key=answer_key, + strip_dataset=strip_dataset, + padding_size=padding_size, + base_batch=base_batch, + batch_mapping=batch_mapping, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + # specific to ICL dataset padding_side='left', tokenize_labels=False, static_keys=static_keys, list_keys=list_keys, tensor_keys=tensor_keys, - *args, - **kwargs, ) # NOTE: set these after init call because they take class vars self.early_stopping_criteria = early_stopping_criteria @@ -635,8 +654,8 @@ def __init__( 'input_ids': self.context_key, 'labels': 'aliases', } - if 'generation_kwargs' in kwargs: - self.update_generation_kwargs(kwargs['generation_kwargs']) + if generation_kwargs: + self.update_generation_kwargs(generation_kwargs) def read_dataset( self, @@ -765,6 +784,45 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: batch['generation_kwargs']['stopping_criteria'] = stopping_criteria return batch + def split_batch(self, batch: Any, + microbatch_size: Union[int, float]) -> Sequence[Any]: + """Split batch handling for special columns. + + Args: + batch (Dict): Batch of data + microbatch_size (int | float): Size of microbatches + + Returns: + List: List of chunked batches + """ + # Don't split kwargs that don't change + # Normally split torch tensors + # List split lists of strings + if isinstance(microbatch_size, float): + raise ValueError( + 'split_batch does not support floating point microbatch_size.', + ) + chunked = {} + for k, v in batch.items(): + if k in self.static_keys: + # Defer broadcasting until we know num_chunks + pass + elif k in self.list_keys: + chunked[k] = _split_list(v, microbatch_size) + elif k in self.tensor_keys: + chunked[k] = _default_split_batch(v, microbatch_size) + else: + raise ValueError(f'Unexpected key {k} in batch splitting') + num_chunks = len(chunked['input_ids']) + for k, v in batch.items(): + if k in self.static_keys: + chunked[k] = [v] * num_chunks + + batched_list = [{k: v[idx] + for k, v in chunked.items()} + for idx in range(num_chunks)] + return batched_list + class InContextLearningLMTaskDataset(InContextLearningDataset): """A dataset that constructs batches for in-context learning language. @@ -779,8 +837,50 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): See InContextLearningDataset for more details. 
""" - def __init__(self, *args: Any, **kwargs: Any): + def __init__( + self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + context_key: str = 'context', + strip_dataset: bool = True, + tokenize_labels: bool = True, + padding_size: Optional[int] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, + static_keys: Optional[List] = None, + list_keys: Optional[List] = None, + ): super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + context_key=context_key, + strip_dataset=strip_dataset, + tokenize_labels=tokenize_labels, + padding_size=padding_size, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + list_keys=list_keys, + # specific to ICL dataset answer_key='continuation', static_keys=['mode'], tensor_keys=[ @@ -800,8 +900,6 @@ def __init__(self, *args: Any, **kwargs: Any): 'labels': 'context', }, padding_side='right', - *args, - **kwargs, ) @@ -833,13 +931,33 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): def __init__( self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + context_key: str = 'query', + tensor_keys: Optional[List] = None, + answer_key: str = 'answer', + strip_dataset: bool = True, + tokenize_labels: bool = True, + padding_size: Optional[int] = None, + batch_mapping: Optional[Dict] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, + list_keys: Optional[List] = None, choices_key: str = 'choices', static_keys: Optional[List] = None, list_of_tensors_keys: Optional[List] = None, list_of_tuples_keys: Optional[List] = None, list_of_primitives: Optional[List] = None, - *args: Any, - **kwargs: Any, ): self.choices_key = choices_key base_batch = { @@ -850,25 +968,42 @@ def __init__( 'gold_indices': [], 'choice_groupings': [], } - context_key = kwargs.pop('context_key', 'query') - static_keys = kwargs.pop('static_keys', ['mode', 'generation_kwargs']) - tensor_keys = kwargs.pop( - 'tensor_keys', - ['input_ids', 'labels', 'attention_mask'], - ) + if not static_keys: + static_keys = ['mode', 'generation_kwargs'] + if not tensor_keys: + tensor_keys = ['input_ids', 'labels', 'attention_mask'] self.list_of_tensors_keys = list_of_tensors_keys or [ 'continuation_indices', ] self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings'] self.list_of_primitives = list_of_primitives or ['gold_indices'] super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + 
prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + answer_key=answer_key, + strip_dataset=strip_dataset, + tokenize_labels=tokenize_labels, + padding_size=padding_size, + batch_mapping=batch_mapping, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + list_keys=list_keys, + # specific to ICL dataset context_key=context_key, base_batch=base_batch, static_keys=static_keys, tensor_keys=tensor_keys, padding_side='right', - *args, - **kwargs, ) self.num_choices = len(self.dataset[0][self.choices_key]) self.batch_mapping_per_choice = { @@ -877,6 +1012,11 @@ def __init__( } self.batch_map_per_example = {'gold_indices': 'gold'} + def get_effective_batch_size(self, batch_size: int) -> int: + batch_size = max(self.num_choices, batch_size) + effective_batchsize = batch_size // self.num_choices + return effective_batchsize + def get_answer_from_example( self, example: Dict, @@ -1095,21 +1235,58 @@ class InContextLearningSchemaTaskDataset( def __init__( self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + answer_key: str = 'answer', + strip_dataset: bool = True, + tokenize_labels: bool = True, + padding_size: Optional[int] = None, + batch_mapping: Optional[Dict] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, + list_keys: Optional[List] = None, choices_key: str = 'context_options', - *args: Any, - **kwargs: Any, ): static_keys = ['mode'] tensor_keys = ['input_ids', 'labels', 'attention_mask'] list_of_tensors_keys = ['continuation_indices'] super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + answer_key=answer_key, + strip_dataset=strip_dataset, + tokenize_labels=tokenize_labels, + padding_size=padding_size, + batch_mapping=batch_mapping, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + list_keys=list_keys, + # specific to ICL dataset choices_key=choices_key, context_key=choices_key, static_keys=static_keys, tensor_keys=tensor_keys, list_of_tensors_keys=list_of_tensors_keys, - *args, - **kwargs, ) self.base_batch = { 'input_ids': [], @@ -1120,6 +1297,11 @@ def __init__( 'choice_groupings': [], } + def get_effective_batch_size(self, batch_size: int) -> int: + batch_size = max(self.num_choices, batch_size) + effective_batchsize = batch_size // self.num_choices + return effective_batchsize + def construct_context( self, example: Dict[str, Any], @@ -1294,23 +1476,10 @@ def build_icl_dataloader( dataset_uri: str, tokenizer: transformers.PreTrainedTokenizerBase, batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. 
'' hf_loading_vars: Dict, hf_parsing_map: Dict, - destination_path: str, - prelimiter: str, # e.g. 'Question: ' - cot_delimiter: str, # e.g. ' ### ' - fewshot_random_seed: int, - pass_at_k: int, - generations_per_sample: int, - generation_kwargs: Dict, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True, + destination_path: str = '', + kwargs: Optional[Dict[str, Any]] = None, ) -> DataSpec: """Factory method that builds the specific dataset for the specified. @@ -1323,108 +1492,36 @@ def build_icl_dataloader( this might be different) 3. set the `split_batch` function if necessary """ - if icl_task_type == 'multiple_choice': - dataset = InContextLearningMultipleChoiceTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - batch_size = max(dataset.num_choices, batch_size) - effective_batchsize = batch_size // dataset.num_choices - elif icl_task_type == 'schema': - dataset = InContextLearningSchemaTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - batch_size = max(dataset.num_choices, batch_size) - effective_batchsize = batch_size // dataset.num_choices - elif icl_task_type == 'language_modeling': - dataset = InContextLearningLMTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size - elif icl_task_type == 'generation_task_with_answers': - dataset = InContextLearningGenerationTaskWithAnswersDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - cot_delimiter=cot_delimiter, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size - else: - raise Exception(f'Unrecognized ICL task type: {icl_task_type}') - + # Add named parameters to kwargs + if kwargs is None: + kwargs = {} + kwargs.update({ + 'dataset_uri': dataset_uri, + 'tokenizer': tokenizer, + 'hf_loading_vars': hf_loading_vars, + 
'hf_parsing_map': hf_parsing_map, + 'destination_path': destination_path, + }) + dataset = construct_from_registry( + name=icl_task_type, + registry=registry.icl_datasets, + partial_function=False, + pre_validation_function=None, + post_validation_function=None, + kwargs=kwargs, + ) sampler = dist.get_sampler(dataset, drop_last=False, shuffle=False) - split_batch = None - if isinstance( - dataset, - ( - InContextLearningMultipleChoiceTaskDataset, - InContextLearningGenerationTaskWithAnswersDataset, - ), - ): - split_batch = dataset.split_batch - return DataSpec( DataLoader( dataset, - batch_size=effective_batchsize, + batch_size=dataset.get_effective_batch_size(batch_size), sampler=sampler, collate_fn=dataset.collate_fn, ), device_transforms=None, get_num_samples_in_batch=dataset.get_num_samples_in_batch, - split_batch=split_batch, + split_batch=dataset.split_batch, ) @@ -1514,24 +1611,11 @@ def get_icl_task_dataloader( tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str = '', - destination_path: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, - cot_delimiter: str = '', has_categories: bool = False, hf_loading_vars: Optional[Dict] = None, hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True, + destination_path: str = '', + kwargs: Optional[Dict[str, Any]] = None, ) -> Union[DataSpec, Dict[str, DataSpec]]: r"""Constructs a dataloader (or dataloaders if has_categories is True) @@ -1588,28 +1672,12 @@ def get_icl_task_dataloader( The default keys expected are "context" and "answer". tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids. batch_size (int): Size of a batch used for eval - max_seq_len (int): The maximum sequence length supported by the model. - pad_tok_id (int): The special token used for padding batches. - num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples. - prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.'). - example_delimiter (str, default = '\\n'): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting. - continuation_delimiter: (str, default = ' '): Separator inserted between context and answer in each example (e.g. '\\nA: '). - destination_path: (str, default = ''): This is the local file where remote datasets will be saved. - question_prelimiter: (str, default = ''): Text to be prepended before each context, including few shot examples (e.g. "Question: "). - fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling - pass_at_k (int): k for how many chances the model gets to write passing code. - generations_per_sample (int): How many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict. - cot_delimiter (str): Delimiter to place between chain of thoughts and continuations. 
has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. - generation_kwargs (Dict, default = None): A dictionary containing keyword arguments to be passed along to the model's generate function. Overwrites any previously specified generation - keyword args in this function (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig - for more details) - early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time. - Used in generation tasks with CoT - do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningGenerationTaskWithAnswersDataset. Only used in generation tasks. + kwargs (Dict[str, Any], default=None): Dictionary containing a mapping + from ICL dataset constructor's parameter names and their desired values. Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. @@ -1618,11 +1686,6 @@ def get_icl_task_dataloader( hf_loading_vars = {} if hf_parsing_map is None: hf_parsing_map = {} - if generation_kwargs is None: - generation_kwargs = {} - if early_stopping_criteria is None: - early_stopping_criteria = [] - if has_categories: result_dls = {} output_files = partition_dataset_by_category( @@ -1639,23 +1702,10 @@ def get_icl_task_dataloader( dataset_uri=partition_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, + kwargs=kwargs, ) return result_dls else: @@ -1664,21 +1714,8 @@ def get_icl_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - continuation_delimiter=continuation_delimiter, destination_path=destination_path, - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - generation_kwargs=generation_kwargs, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, + kwargs=kwargs, ) diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 
0c8e64b759..f36f53fffa 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -8,6 +8,7 @@ from composer.optim import ComposerScheduler from torch.optim import Optimizer from torch.utils.data import DataLoader as TorchDataloader +from torch.utils.data import Dataset from torchmetrics import Metric from transformers import PreTrainedTokenizerBase @@ -206,6 +207,21 @@ description=_metrics_description, ) +_icl_datasets_description = ( + 'The ICL datasets registry is used to register an torch.utils.data.Dataset class which can be used for ICL tasks.' +) +icl_datasets = create_registry( + 'llmfoundry', + 'icl_datasets', + # TODO: Change type from Dataset to + # llmfoundry.eval.InContextLearningDataset. + # Using ICL dataset here introduces a circular import dependency between + # the registry and eval packages right now, thus needs some refactoring. + generic_type=Type[Dataset], + entry_points=True, + description=_icl_datasets_description, +) + __all__ = [ 'loggers', 'callbacks', @@ -228,4 +244,5 @@ 'attention_classes', 'attention_implementations', 'fcs', + 'icl_datasets', ] diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 73eb026d98..f9e84aab45 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import contextlib +import copy import functools import logging import os @@ -545,22 +546,10 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg["icl_task_type"]}.', ) - if 'prompt_string' not in icl_cfg: - icl_cfg['prompt_string'] = '' - if 'example_delimiter' not in icl_cfg: - icl_cfg['example_delimiter'] = '\n' - if 'continuation_delimiter' not in icl_cfg: - icl_cfg['continuation_delimiter'] = ' ' if 'max_seq_len' not in icl_cfg: icl_cfg['max_seq_len'] = default_max_seq_len if 'batch_size' not in icl_cfg: icl_cfg['batch_size'] = default_batch_size - if 'pass_at_k' not in icl_cfg: - icl_cfg['pass_at_k'] = 1 - if 'fewshot_random_seed' not in icl_cfg: - icl_cfg['fewshot_random_seed'] = 1234 - if 'generations_per_sample' not in icl_cfg: - icl_cfg['generations_per_sample'] = 1 if 'num_beams' in icl_cfg: raise ValueError( @@ -579,6 +568,7 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): pad_tok_id = tokenizer.eos_token_id else: pad_tok_id = tokenizer.pad_token_id + label = f'{icl_cfg["label"]}/{num_fewshot}-shot' metric_names = list(icl_cfg['metric_names']) # TODO: fix Composer bug when copying local paths and destination exists @@ -589,38 +579,37 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): hf_parsing_map = icl_cfg.get('hf_parsing_map', {}) hf_loading_vars = icl_cfg.get('hf_loading_vars', {}) - early_stopping_criteria = icl_cfg.get( 'early_stopping_criteria', - None, + [], ) + # TODO: fix manual removal of non-constructor fields + icl_constructor_kwargs = copy.deepcopy(icl_cfg) + icl_constructor_kwargs.pop('label', None) + icl_constructor_kwargs.pop('metric_names', None) + icl_constructor_kwargs.pop('icl_task_type', None) + icl_constructor_kwargs.pop('batch_size', None) + icl_constructor_kwargs.pop('has_categories', None) + + # Add custom constructor arguments + icl_constructor_kwargs['pad_tok_id'] = pad_tok_id + icl_constructor_kwargs['num_fewshot'] = num_fewshot + assert early_stopping_criteria is None or isinstance( early_stopping_criteria, list, ) + dataloaders = get_icl_task_dataloader( - icl_cfg['icl_task_type'], - icl_cfg['dataset_uri'], - tokenizer, + icl_task_type=icl_cfg['icl_task_type'], + 
dataset_uri=icl_cfg['dataset_uri'], + tokenizer=tokenizer, batch_size=icl_cfg['batch_size'], - max_seq_len=icl_cfg['max_seq_len'], - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=icl_cfg['prompt_string'], - example_delimiter=icl_cfg['example_delimiter'], hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - continuation_delimiter=icl_cfg['continuation_delimiter'], - question_prelimiter=icl_cfg.get('question_prelimiter', ''), - destination_path=destination_path, - fewshot_random_seed=icl_cfg['fewshot_random_seed'], - pass_at_k=icl_cfg['pass_at_k'], - generations_per_sample=icl_cfg['generations_per_sample'], has_categories=icl_cfg.get('has_categories', False), - cot_delimiter=icl_cfg.get('cot_delimiter', ''), - generation_kwargs=icl_cfg.get('generation_kwargs', {}), - early_stopping_criteria=early_stopping_criteria, - do_normalization=icl_cfg.get('do_normalization', True), + destination_path=destination_path, + kwargs=icl_constructor_kwargs, ) if 'has_categories' in icl_cfg and icl_cfg[ 'has_categories'] and isinstance(dataloaders, dict): diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py index a3c3e88364..b5eacdeb0f 100644 --- a/tests/eval/test_in_context_learning_datasets.py +++ b/tests/eval/test_in_context_learning_datasets.py @@ -1090,15 +1090,22 @@ def test_mc_task_dataloader_subcategories( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=2, - prompt_string= - 'The following are multiple choice questions (with answers).\n', - example_delimiter='\n', - continuation_delimiter='Answer: ', - destination_path=str(tmp_path / 'icl.jsonl'), has_categories=True, + destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'num_fewshot': + 2, + 'max_seq_len': + seqlen, + 'pad_tok_id': + tokenizer.eos_token_id, + 'prompt_string': + 'The following are multiple choice questions (with answers).\n', + 'example_delimiter': + '\n', + 'continuation_delimiter': + 'Answer: ', + }, ) assert isinstance(dls, dict) @@ -1142,13 +1149,15 @@ def test_lm_task_dataloader_extra_space( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=10, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 10, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1192,13 +1201,15 @@ def test_lm_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 0, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': '', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1241,14 +1252,16 @@ def test_schema_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - 
example_delimiter='\n', - question_prelimiter=prelimiter, - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'example_delimiter': '\n', + 'prelimiter': prelimiter, + 'continuation_delimiter': '', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) @@ -1300,13 +1313,15 @@ def test_schema_task_dataloader_sentpiece_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) @@ -1358,13 +1373,15 @@ def test_lm_task_dataloader_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': '', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1410,13 +1427,15 @@ def test_mc_task_dataloader_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1473,13 +1492,15 @@ def test_mc_split_batch( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1550,13 +1571,15 @@ def test_qa_split_batch( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=8, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 0, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dl, DataSpec) # pyright @@ -1612,14 +1635,16 @@ def test_qa_task_dataloader_w_null_eos( dataset_uri, tokenizer, batch_size, 
- max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter='\nA:', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': prompt_string, + 'example_delimiter': '\n', + 'prelimiter': 'Q: ', + 'continuation_delimiter': '\nA:', + }, ) @@ -1647,14 +1672,16 @@ def test_qa_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter='\nA:', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': prompt_string, + 'example_delimiter': '\n', + 'prelimiter': 'Q: ', + 'continuation_delimiter': '\nA:', + }, ) assert isinstance(dl, DataSpec) @@ -1714,15 +1741,17 @@ def test_qa_task_with_cot_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter="\nA: Let's think step by step. ", - cot_delimiter=' #### ', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'prelimiter': 'Q: ', + 'continuation_delimiter': "\nA: Let's think step by step. 
", + 'cot_delimiter': ' #### ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1779,14 +1808,16 @@ def test_mc_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - question_prelimiter=prelimiter, - example_delimiter=example_delimiter, - continuation_delimiter='\nA: ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'prelimiter': prelimiter, + 'example_delimiter': example_delimiter, + 'continuation_delimiter': '\nA: ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1851,13 +1882,15 @@ def test_lm_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': '', + }, ) evaluator = Evaluator( @@ -1903,13 +1936,15 @@ def test_schema_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -1968,14 +2003,16 @@ def test_mc_task_evaluation_subcategories( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), has_categories=True, + kwargs={ + 'max_seq_len': max_seq_len, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dls, dict) @@ -2039,13 +2076,15 @@ def test_mc_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=64, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 64, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -2107,13 +2146,15 @@ def test_qa_task_evaluation_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 
num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -2168,14 +2209,16 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter="A: Let's think step by step. ", - cot_delimiter=' #### ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': "A: Let's think step by step. ", + 'cot_delimiter': ' #### ', + }, ) evaluator = Evaluator( @@ -2228,13 +2271,15 @@ def test_qa_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -2288,14 +2333,16 @@ def test_qa_task_with_cot_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter="A: Let's think step by step", - cot_delimiter=' #### ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': "A: Let's think step by step", + 'cot_delimiter': ' #### ', + }, ) evaluator = Evaluator( @@ -2339,13 +2386,15 @@ def test_lm_spacing_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' UNIQUE ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' UNIQUE ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -2409,15 +2458,17 @@ def test_hf_dataloading_lm_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 0, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -2490,16 +2541,18 @@ def test_hf_dataloading_custom_parsing( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - 
pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': prompt_string, + 'example_delimiter': '\n', + 'prelimiter': 'Orbs: ', + 'continuation_delimiter': '\nSpell:', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright diff --git a/tests/test_registry.py b/tests/test_registry.py index 87881450d4..3bdf5a800f 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -42,6 +42,7 @@ def test_expected_registries_exist(): 'attention_classes', 'attention_implementations', 'fcs', + 'icl_datasets', } assert existing_registries == expected_registry_names
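
For readers skimming the diff, a minimal end-to-end sketch of how the refactored ICL evaluation API in this patch series is meant to be used: dataset classes are now resolved through the new `icl_datasets` registry, and all dataset-constructor arguments travel through a single `kwargs` dict instead of individual named parameters on `get_icl_task_dataloader`. This is a sketch, not part of the patch itself; the registry key `my_lm_task`, the data paths, and the exact import paths are assumptions for illustration, while the call shape mirrors the updated tests above.

# Hypothetical usage sketch; names flagged in comments are made up for illustration.
from transformers import AutoTokenizer

from llmfoundry import registry  # assumed import path for the registry module
from llmfoundry.eval.datasets import (  # assumed import path for the eval datasets
    InContextLearningLMTaskDataset,
    get_icl_task_dataloader,
)

# New in this patch: ICL dataset classes are looked up via the `icl_datasets`
# registry rather than a hard-coded if/elif chain in build_icl_dataloader.
# 'my_lm_task' is a hypothetical registry key; the built-in task types are
# assumed to be registered elsewhere in the package.
registry.icl_datasets.register('my_lm_task', func=InContextLearningLMTaskDataset)

tokenizer = AutoTokenizer.from_pretrained('gpt2')

dl = get_icl_task_dataloader(
    icl_task_type='my_lm_task',          # registry key selects the dataset class
    dataset_uri='./my_eval_data.jsonl',  # hypothetical local dataset file
    tokenizer=tokenizer,
    batch_size=8,
    destination_path='/tmp/icl.jsonl',
    # Constructor arguments for the dataset class now go in this dict; per the
    # patch, build_icl_dataloader adds dataset_uri, tokenizer, hf_loading_vars,
    # hf_parsing_map, and destination_path to it before constructing the
    # dataset from the registry.
    kwargs={
        'max_seq_len': 1024,
        'pad_tok_id': tokenizer.eos_token_id,
        'num_fewshot': 0,
        'prompt_string': '',
        'example_delimiter': '\n',
        'continuation_delimiter': ' ',
    },
)

Two related behaviors from the patch matter when registering a custom class: the builder now calls `dataset.get_effective_batch_size(batch_size)` to size the `DataLoader` (the multiple-choice and schema datasets override it to return `max(num_choices, batch_size) // num_choices`), and it always wires `dataset.split_batch` into the returned `DataSpec`, so any registered dataset is expected to provide both.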