From 38dcf1e6e25a05dbd4275402f168bac299b45d6d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sat, 3 Aug 2024 08:42:38 -0700 Subject: [PATCH 01/13] Remove type ignore (#1421) --- llmfoundry/utils/builders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 000155f1a4..a1d84601b3 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -127,8 +127,7 @@ def build_eval_loaders( # Load the eval data to fail fast. metrics will get added # later in add_metrics_to_eval_loaders, after the model is loaded metric_names=[], - # TODO: Fix type in Composer - device_eval_microbatch_size=device_eval_batch_size, # type: ignore + device_eval_microbatch_size=device_eval_batch_size, ) evaluators.append(eval_loader) return evaluators From 32f84bc8cf0bc302d52d9f9c73af1f02dadb2035 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:34:12 -0700 Subject: [PATCH 02/13] Update pytest-cov requirement from <5,>=4 to >=4,<6 (#1423) Updates the requirements on [pytest-cov](https://github.com/pytest-dev/pytest-cov) to permit the latest version. - [Changelog](https://github.com/pytest-dev/pytest-cov/blob/master/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest-cov/compare/v4.0.0...v5.0.0) --- updated-dependencies: - dependency-name: pytest-cov dependency-type: direct:development ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 185d3970f7..2b47928419 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ 'pre-commit>=3.4.0,<4', 'pytest>=7.2.1,<8', 'pytest_codeblocks>=0.16.1,<0.17', - 'pytest-cov>=4,<5', + 'pytest-cov>=4,<6', 'pyright==1.1.256', 'toml>=0.10.2,<0.11', 'packaging>=21,<23', From 32803b751137760d826c734b331fb3ef779746bc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:43:24 -0400 Subject: [PATCH 03/13] Bump onnx from 1.16.1 to 1.16.2 (#1425) Bumps [onnx](https://github.com/onnx/onnx) from 1.16.1 to 1.16.2. - [Release notes](https://github.com/onnx/onnx/releases) - [Changelog](https://github.com/onnx/onnx/blob/main/docs/Changelog-ml.md) - [Commits](https://github.com/onnx/onnx/compare/v1.16.1...v1.16.2) --- updated-dependencies: - dependency-name: onnx dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2b47928419..19e5cee2d6 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ 'omegaconf>=2.2.3,<3', 'slack-sdk<4', 'mosaicml-cli>=0.6.10,<1', - 'onnx==1.16.1', + 'onnx==1.16.2', 'onnxruntime==1.18.1', 'boto3>=1.21.45,<2', 'huggingface-hub>=0.19.0,<0.25', From 6dcc18a5d1db104b48eacce68144db5a99da37d7 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 5 Aug 2024 15:56:35 -0400 Subject: [PATCH 04/13] Add transforms to logged config (#1428) Co-authored-by: Saaketh Narayan --- llmfoundry/utils/config_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index f10fe32735..eb54fabc3d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -332,15 +332,15 @@ def make_dataclass_and_log_config( 'icl_tasks must be specified in the config', ) - # Create copy of config for logging - logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) - # Apply transforms to the unstructured config before constructing dataclass unstructured_config = apply_transforms_to_config( unstructured_config, transforms, ) + # Create copy of config for logging + logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) + arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, dataclass_fields) From 8fe57eb1ea7b1794a214ea6256965210c770c639 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:09:52 -0700 Subject: [PATCH 05/13] Install all deps in Docker images (#1431) --- .github/workflows/docker.yaml | 4 ++-- Dockerfile | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 17bb976a5d..0bb0b4087a 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -19,10 +19,10 @@ jobs: include: - name: "2.3.1_cu121" base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 - dep_groups: "[gpu]" + dep_groups: "[all]" - name: "2.3.1_cu121_aws" base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws - dep_groups: "[gpu]" + dep_groups: "[all]" steps: - name: Checkout diff --git a/Dockerfile b/Dockerfile index 9366d7dbcd..cee7063cdd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ FROM $BASE_IMAGE ARG BRANCH_NAME ARG DEP_GROUPS +ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0" + # Check for changes in setup.py. # If there are changes, the docker cache is invalidated and a fresh pip installation is triggered. 
ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py From 420ea30106f015d41a823ce7eb5592d3a144e999 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 6 Aug 2024 13:37:12 -0400 Subject: [PATCH 06/13] Raise error when not enough data when converting text to MDS (#1430) * test * precommit * precommit * custom error * rm input folder * Update llmfoundry/utils/exceptions.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --------- Co-authored-by: v-chen_data Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../data_prep/convert_text_to_mds.py | 7 ++++++ llmfoundry/utils/exceptions.py | 9 ++++++++ .../data_prep/test_convert_text_to_mds.py | 23 +++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 336c82a5e7..94bdc16526 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -1,6 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import json import logging import math import os @@ -28,6 +29,7 @@ merge_shard_groups, ) from llmfoundry.utils.exceptions import ( + DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, ) @@ -468,6 +470,11 @@ def convert_text_to_mds( trust_remote_code, ) + index_path = os.path.join(local_output_folder, 'index.json') + with open(index_path, 'r') as index_file: + if not json.load(index_file)['shards']: + raise DatasetTooSmallError() + # Write a done file with the args and object names write_done_file(local_output_folder, args_str, object_names) diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 76f378f8c6..140bf8540b 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -28,6 +28,7 @@ 'InputFolderMissingDataError', 'OutputFolderNotEmptyError', 'MisconfiguredHfDatasetError', + 'DatasetTooSmallError', 'RunTimeoutError', ] @@ -348,6 +349,14 @@ def __init__(self, dataset_name: str, split: str) -> None: super().__init__(message, dataset_name=dataset_name, split=split) +class DatasetTooSmallError(UserError): + """Error thrown when the dataset is too small to be processed.""" + + def __init__(self) -> None: + message = f'Your dataset is too small and produced no complete samples during preprocessing. Please provide more data.' 
+ super().__init__(message) + + class RunTimeoutError(InternalError): """Error thrown when a run times out.""" diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index f4c160790a..6ba14d62e4 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -22,6 +22,7 @@ write_done_file, ) from llmfoundry.utils.exceptions import ( + DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, ) @@ -267,6 +268,28 @@ def test_input_folder_not_exist(tmp_path: pathlib.Path): ) +def test_dataset_too_small(tmp_path: pathlib.Path): + input_folder = tmp_path / 'input' + os.makedirs(input_folder, exist_ok=True) + with open(input_folder / 'test.txt', 'w') as f: + f.write('a') + with pytest.raises(DatasetTooSmallError): + convert_text_to_mds( + tokenizer_name='mosaicml/mpt-7b', + output_folder=str(tmp_path / 'output'), + input_folder=str(input_folder), + concat_tokens=2048, + eos_text='', + bos_text='', + no_wrap=False, + compression='zstd', + processes=1, + args_str='Namespace()', + reprocess=False, + trust_remote_code=False, + ) + + def test_is_already_processed(tmp_path: pathlib.Path): tmp_path_str = str(tmp_path) args_str = 'Namespace(x = 5)' From 0ba263adb7a649f07606e2c1b9a404005364d58e Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:09:41 -0700 Subject: [PATCH 07/13] Bump yaml versions (#1433) --- mcli/mcli-1b-eval.yaml | 2 +- mcli/mcli-1b-max-seq-len-8k.yaml | 2 +- mcli/mcli-1b.yaml | 2 +- mcli/mcli-benchmark-mpt.yaml | 2 +- mcli/mcli-convert-composer-to-hf.yaml | 2 +- mcli/mcli-hf-eval.yaml | 2 +- mcli/mcli-hf-generate.yaml | 2 +- mcli/mcli-llama2-finetune.yaml | 2 +- mcli/mcli-openai-eval.yaml | 2 +- mcli/mcli-pretokenize-oci-upload.yaml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml index d8ef42d5d5..4bfa301f8e 100644 --- a/mcli/mcli-1b-eval.yaml +++ b/mcli/mcli-1b-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index 4b8eb601b2..2dc83d36a9 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 20d41fb2cc..69b2295011 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index a7d44239b5..7a3ea2cbe9 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.3.1_cu121-latest integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: 
# OR use your commit hash pip_install: .[gpu] diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index 60c708e2f6..fefaf8e1a3 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: . ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index e69e6dadda..e58d42483a 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index 8b382c41f0..02c49d84c3 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 443429ca0a..47c163faf8 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index 38b02a6019..c372014165 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu,openai] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index 0749dcc86e..a4496503cd 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -14,7 +14,7 @@ integrations: - oci-cli==3.23.2 - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: . 
ssh_clone: false # Should be true if using a private repo From 84cb2ed43b3eccd4eca747ac0ad69d54348c8224 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:06:48 -0700 Subject: [PATCH 08/13] Automatically get the portion of the dataset config that is constructor args (#1434) --- llmfoundry/data/finetuning/dataloader.py | 136 ++++++++++------------- llmfoundry/data/finetuning/tasks.py | 33 ++++-- tests/data/test_dataloader.py | 20 +++- 3 files changed, 98 insertions(+), 91 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index d9450bc657..771033a703 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -1,5 +1,6 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import inspect import logging import os from typing import Any, Dict, Optional, Tuple, Union @@ -17,6 +18,8 @@ validate_target_settings, ) from llmfoundry.data.finetuning.tasks import ( + DEFAULT_TARGET_PROMPTS, + DEFAULT_TARGET_RESPONSES, DOWNLOADED_FT_DATASETS_DIRPATH, SUPPORTED_EXTENSIONS, dataset_constructor, @@ -39,9 +42,15 @@ # HuggingFace hardcodes the ignore index to -100 _HF_IGNORE_INDEX = -100 -# Default settings to use for target responses and target prompts -_DEFAULT_TARGET_RESPONSES = 'last' -_DEFAULT_TARGET_PROMPTS = 'none' +# Extra keys present in the dataset config dictionary beyond the constructor keys +_ALLOWED_DATASET_KEYS = { + 'shuffle', + 'packing_ratio', + 'allow_pad_trimming', + 'seq_parallel_replication', + 'auto_packing_replication', + 'max_leftover_bins_to_keep', +} def build_finetuning_dataloader( @@ -171,7 +180,26 @@ def build_finetuning_dataloader( given a starting workload YAML. 
""" dataset_cfg = dataset - _validate_config(**dataset_cfg) + is_streaming = ( + dataset_cfg.get('remote') is not None or + dataset_cfg.get('streams') is not None + ) + if is_streaming: + dataset_constructor_keys = inspect.signature( + dataset_constructor.streaming_dataset_class, + ).parameters.keys() + else: + dataset_constructor_keys = inspect.signature( + dataset_constructor.build_from_hf, + ).parameters.keys() + + allowed_dataset_config_keys = set( + dataset_constructor_keys, + ).union(_ALLOWED_DATASET_KEYS) + _validate_config( + **dataset_cfg, + allowed_dataset_keys=allowed_dataset_config_keys, + ) # Use EOS as the pad token if none exists if tokenizer.pad_token is None: # type: ignore (sometimes it's none and that's ok) @@ -213,9 +241,7 @@ def build_finetuning_dataloader( streaming_dataset = None # for pyright sampler = None - if dataset_cfg.get( - 'remote', - ) is not None or dataset_cfg.get('streams') is not None: + if is_streaming: # Build streaming dataloader streams_cfg = dataset_cfg.get('streams', None) streams_cfg = to_dict_container( @@ -225,34 +251,20 @@ def build_finetuning_dataloader( streams_cfg, ) if streams_cfg is not None else None - # note: we don't need to use ** here because we're setting default values for almost all arguments + # Take the constructor args from above, minus args that have been created separately + dataset_constructor_args = { + k: v + for k, v in dataset_cfg.items() + if k in dataset_constructor_keys and + k not in {'streams', 'packing_ratio'} + } streaming_dataset = dataset_constructor.build_from_streaming( tokenizer=tokenizer, streams=streams, - local=dataset_cfg.get('local', None), - remote=dataset_cfg.get('remote', None), - split=dataset_cfg.get('split', None), - download_retry=dataset_cfg.get('download_retry', 2), - download_timeout=dataset_cfg.get('download_timeout', 60), - validate_hash=dataset_cfg.get('validate_hash', None), - keep_zip=dataset_cfg.get('keep_zip', False), - epoch_size=dataset_cfg.get('epoch_size', None), - predownload=dataset_cfg.get('predownload', None), - cache_limit=dataset_cfg.get('cache_limit', None), - partition_algo=dataset_cfg.get('partition_algo', 'relaxed'), - num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None), batch_size=dataloader_batch_size, - shuffle=dataset_cfg.get('shuffle', False), - shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'), - shuffle_seed=dataset_cfg.get('shuffle_seed', 9176), - shuffle_block_size=dataset_cfg.get('shuffle_block_size', None), - sampling_method=dataset_cfg.get('sampling_method', 'balanced'), - sampling_granularity=dataset_cfg.get('sampling_granularity', 1), - batching_method=dataset_cfg.get('batching_method', 'random'), - max_seq_len=dataset_cfg['max_seq_len'], - allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False), replication=replication_factor, packing_ratio=dataloader_batch_size / dataset_batch_size, + **dataset_constructor_args, ) else: @@ -283,24 +295,19 @@ def build_finetuning_dataloader( dataset_name_or_path, ) - # Build dataset from HF. 
+ # Take the constructor args from above, minus args that have been created separately + dataset_constructor_args = { + k: v + for k, v in dataset_cfg.items() + if k in dataset_constructor_keys and + k not in {'split', 'preprocessing_fn'} + } streaming_dataset = dataset_constructor.build_from_hf( dataset_name=dataset_name_or_path, split=split, - safe_load=dataset_cfg.get('safe_load', False), - max_seq_len=dataset_cfg['max_seq_len'], preprocessing_fn=preprocessing_fn, tokenizer=tokenizer, - target_prompts=dataset_cfg.get( - 'target_prompts', - _DEFAULT_TARGET_PROMPTS, - ), - target_responses=dataset_cfg.get( - 'target_responses', - _DEFAULT_TARGET_RESPONSES, - ), - decoder_only_format=dataset_cfg['decoder_only_format'], - hf_kwargs=dataset_cfg.get('hf_kwargs', {}), + **dataset_constructor_args, ) # Ensure dataset is large enough. @@ -367,6 +374,7 @@ def _validate_config( streams: Optional[Dict[str, Any]] = None, target_prompts: Optional[str] = None, target_responses: Optional[str] = None, + allowed_dataset_keys: set[str] = _ALLOWED_DATASET_KEYS, **kwargs: Dict[str, Any], ) -> None: """Validates the dataset configuration. @@ -417,6 +425,7 @@ def _validate_config( Defaults to "last", meaning only the final response in multi-turn examples will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for details. + allowed_dataset_keys (set[str], optional): The set of allowed keys for the dataset config. kwargs (DictConfig, optional): Additional kwargs to pass to `datasets.load_dataset`, which can be used to load a dataset from local files. @@ -424,41 +433,10 @@ def _validate_config( Raises: ValueError: If the dataset configuration does not meet the requirements. """ - # Check for extraneous keys in the dataset config - allowed_additional_kwargs = { - 'local', - 'remote', - 'split', - 'download_retry', - 'download_timeout', - 'validate_hash', - 'keep_zip', - 'epoch_size', - 'predownload', - 'cache_limit', - 'partition_algo', - 'num_canonical_nodes', - 'batch_size', - 'shuffle', - 'shuffle_algo', - 'shuffle_seed', - 'shuffle_block_size', - 'sampling_method', - 'sampling_granularity', - 'batching_method', - 'max_seq_len', - 'allow_unsafe_types', - 'replication', - 'packing_ratio', - 'allow_pad_trimming', - 'seq_parallel_replication', - 'auto_packing_replication', - 'max_leftover_bins_to_keep', - } - if not set(kwargs.keys()).issubset(allowed_additional_kwargs): + if not set(kwargs.keys()).issubset(allowed_dataset_keys): raise ValueError( 'The dataset config contains the following extraneous keys: ' +\ - ', '.join(set(kwargs.keys()) - allowed_additional_kwargs), + ', '.join(set(kwargs.keys()) - allowed_dataset_keys), ) if hf_name is not None: @@ -542,9 +520,9 @@ def _validate_config( # Raise an error if the target_prompts + target_responses + decoder_only_format settings # are invalid if target_prompts is None: - target_prompts = _DEFAULT_TARGET_PROMPTS + target_prompts = DEFAULT_TARGET_PROMPTS if target_responses is None: - target_responses = _DEFAULT_TARGET_RESPONSES + target_responses = DEFAULT_TARGET_RESPONSES target_prompts, target_responses = target_prompts.lower( ), target_responses.lower() validate_target_settings( @@ -646,9 +624,9 @@ def build_collate_fn( dataset_cfg = dataloader_cfg['dataset'] target_responses = dataset_cfg.get( 'target_responses', - _DEFAULT_TARGET_RESPONSES, + DEFAULT_TARGET_RESPONSES, ) - target_prompts = dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS) + target_prompts = dataset_cfg.get('target_prompts', DEFAULT_TARGET_PROMPTS) 
max_seq_len = dataset_cfg['max_seq_len'] decoder_only_format = dataset_cfg['decoder_only_format'] allow_pad_trimming = dataset_cfg.get('allow_pad_trimming', False) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 397b619e73..dd9b495ce4 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -47,6 +47,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: Optional, Sequence, Tuple, + Type, Union, cast, ) @@ -115,6 +116,8 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: ) SUPPORTED_EXTENSIONS = ['.csv', '.json', '.jsonl', '.parquet'] HUGGINGFACE_FOLDER_EXTENSIONS = ['.lock', '.metadata'] +DEFAULT_TARGET_RESPONSES = 'last' +DEFAULT_TARGET_PROMPTS = 'none' PromptResponseDict = Mapping[str, str] ChatFormattedDict = Mapping[str, List[Dict[str, str]]] @@ -805,14 +808,14 @@ def build_from_hf( self, dataset_name: str, split: str, - safe_load: bool, - max_seq_len: int, - preprocessing_fn: Optional[Callable[[dict[str, Any]], Example]], - tokenizer: PreTrainedTokenizerBase, - target_prompts: str, - target_responses: str, - decoder_only_format: bool, - hf_kwargs: Dict[str, Any], + safe_load: bool = False, + max_seq_len: int = 2048, + preprocessing_fn: Optional[Callable[[dict[str, Any]], Example]] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + target_prompts: str = DEFAULT_TARGET_PROMPTS, + target_responses: str = DEFAULT_TARGET_RESPONSES, + decoder_only_format: bool = True, + hf_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset, hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]: """Load a HuggingFace Datasets, preprocess, and tokenize. @@ -851,6 +854,14 @@ def build_from_hf( Returns: Dataset: The tokenized dataset. """ + if hf_kwargs is None: + hf_kwargs = {} + + # None is checked in the function, because argument defaults were added after the function was written and we want + # to preserve the ordering of the arguments for backwards compatibility. + if tokenizer is None: + raise ValueError('A tokenizer must be provided.') + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. 
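The core idea of this change -- reading the set of accepted dataset config keys from a callable's signature with inspect.signature and forwarding only those keys, instead of maintaining a hand-written allow-list -- can be sketched in isolation. The sketch below is illustrative only; example_constructor, split_config, and the sample config are hypothetical names, not part of llm-foundry.

import inspect
from typing import Any, Dict, Optional, Set


def example_constructor(
    local: str,
    remote: Optional[str] = None,
    shuffle: bool = False,
) -> None:
    """Stand-in for a dataset constructor."""
    print(local, remote, shuffle)


def split_config(
    config: Dict[str, Any],
    constructor: Any,
    extra_allowed_keys: Set[str],
) -> Dict[str, Any]:
    # Keys the constructor actually accepts, read from its signature.
    constructor_keys = set(inspect.signature(constructor).parameters.keys())
    # Anything outside the signature or the explicit allow-list is an error.
    unknown_keys = set(config.keys()) - constructor_keys - extra_allowed_keys
    if unknown_keys:
        raise ValueError(f'Extraneous config keys: {sorted(unknown_keys)}')
    # Forward only the keys the constructor understands.
    return {k: v for k, v in config.items() if k in constructor_keys}


cfg = {'local': '/tmp/data', 'shuffle': True, 'packing_ratio': 2.0}
kwargs = split_config(cfg, example_constructor, extra_allowed_keys={'packing_ratio'})
example_constructor(**kwargs)

Keys handled elsewhere in the dataloader (here, packing_ratio) go in the extra allow-list rather than being forwarded, mirroring how the patch excludes keys such as streams and packing_ratio from the constructor kwargs.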
@@ -999,12 +1010,16 @@ def dataset_mapper(example: Dict): assert filtered_dataset is not None return filtered_dataset + @property + def streaming_dataset_class(self) -> Type[StreamingFinetuningDataset]: + return StreamingFinetuningDataset + def build_from_streaming( self, *args: Any, **kwargs: Any, ) -> StreamingFinetuningDataset: - return StreamingFinetuningDataset(*args, **kwargs) + return self.streaming_dataset_class(*args, **kwargs) dataset_constructor = DatasetConstructor() diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 8e92658194..1a43e12536 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -7,7 +7,7 @@ import shutil from contextlib import nullcontext as does_not_raise from pathlib import Path -from typing import ContextManager, Literal, Optional, Union +from typing import Any, Callable, ContextManager, Dict, Literal, Optional, Union from unittest.mock import MagicMock, patch import catalogue @@ -1220,6 +1220,21 @@ def test_token_counting_func_dataloader_setting( 'timeout': 0, } + def build_from_hf( + self, # type: ignore + dataset_name: str, + split: str, + safe_load: bool = False, + max_seq_len: int = 2048, + preprocessing_fn: Optional[Callable] = None, + tokenizer: transformers.PreTrainedTokenizerBase = None, + target_prompts: str = 'last', + target_responses: str = 'none', + decoder_only_format: bool = True, + hf_kwargs: Optional[Dict[str, Any]] = None, + ): + return [] + if dataloader_type == 'finetuning-hf': cfg = DictConfig({ 'dataset': { @@ -1235,8 +1250,7 @@ def test_token_counting_func_dataloader_setting( }) monkeypatch.setattr( 'llmfoundry.data.finetuning.tasks.DatasetConstructor.build_from_hf', - lambda *args, - **kwargs: [], + build_from_hf, ) dl = build_finetuning_dataloader( tokenizer=gptt, From c262341173a8ac31e8c77063d94534c8d7a9168d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:41:28 -0700 Subject: [PATCH 09/13] Remove flash patching for HF (#1436) --- llmfoundry/models/hf/hf_causal_lm.py | 23 ++++----------- tests/models/hf/test_hf_config.py | 44 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 34ce22d694..f1f38e2f7d 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -256,23 +256,6 @@ def build_inner_model( False, # Necessary due to https://github.com/huggingface/transformers/issues/28056 ) - # This is not ideal, however Hugging Face's _autoset_attn_implementation function - # forces you to load the model in fp16/bf16 if you want to use flash attention. Rather than loading - # the model and then casting it back to fp32, we are monkeypatching their check. 
- # https://github.com/huggingface/transformers/issues/28052 - def _autoset_attn_implementation_monkeypatch( - cls, # type: ignore - config, # type: ignore - *args, # type: ignore - **kwargs, # type: ignore - ): # type: ignore - config._attn_implementation = requested_attention_implementation - return config - - PreTrainedModel._autoset_attn_implementation = classmethod( - _autoset_attn_implementation_monkeypatch, - ) - set_config_overrides(config, config_overrides) # We need to have all non-zero local ranks be not-pretrained @@ -293,6 +276,8 @@ def _autoset_attn_implementation_monkeypatch( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, + attn_implementation= + requested_attention_implementation, config=config, ) else: @@ -300,6 +285,7 @@ def _autoset_attn_implementation_monkeypatch( AutoModelForCausalLM.from_config( config, trust_remote_code=trust_remote_code, + attn_implementation=requested_attention_implementation, ) dist.barrier() @@ -312,12 +298,14 @@ def _autoset_attn_implementation_monkeypatch( trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, load_in_8bit=load_in_8bit, + attn_implementation=requested_attention_implementation, config=config, ) else: model = AutoModelForCausalLM.from_config( config, trust_remote_code=trust_remote_code, + attn_implementation=requested_attention_implementation, ) elif resolved_init_device == 'meta': if pretrained: @@ -328,6 +316,7 @@ def _autoset_attn_implementation_monkeypatch( model = AutoModelForCausalLM.from_config( config, trust_remote_code=trust_remote_code, + attn_implementation=requested_attention_implementation, ) else: raise ValueError( diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index d0ec544de8..844ccd7fe5 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -7,9 +7,11 @@ from unittest.mock import Mock, patch import pytest +import torch from omegaconf import OmegaConf as om from transformers import PretrainedConfig +from llmfoundry.models.hf.hf_fsdp import rgetattr from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model @@ -235,3 +237,45 @@ def test_nested_override(): assert isinstance(model.config.ffn_config, PretrainedConfig) # Ensure the other values still exist and are not set back to their defaults assert model.config.ffn_config.moe_num_experts == 16 + + +@pytest.mark.gpu +def test_use_flash(): + model_cfg = { + 'name': 'hf_causal_lm', + 'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf', + 'config_overrides': { + 'num_hidden_layers': 2, + 'hidden_size': 32, + 'intermediate_size': 64, + 'torch_dtype': 'bfloat16', + }, + 'pretrained': False, + 'init_device': 'cpu', + 'use_flash_attention_2': True, + } + + name = model_cfg.pop('name') + model = build_composer_model( + name=name, + cfg=model_cfg, + tokenizer=None, # type: ignore + ) + + from transformers.models.llama.modeling_llama import ( + LlamaFlashAttention2, + ) + flash_attn_class = LlamaFlashAttention2 + attention_layers_attr = 'model.model.layers' + attention_attr = 'self_attn' + + # check that it actually used flash attention 2 + assert model.model.config._attn_implementation == ('flash_attention_2') + attention_layer = rgetattr( + rgetattr(model, attention_layers_attr)[0], + attention_attr, + ) + assert isinstance(attention_layer, flash_attn_class) + + # Make sure that HF has not cast the parameters to bf16 + assert 
next(model.parameters()).dtype == torch.float32 From 0f4476d874ef1b0b4c9317b0815fec7dfe9c1161 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Wed, 7 Aug 2024 14:27:40 -0700 Subject: [PATCH 10/13] Fix the context size in long context gauntlet for wikiqa (#1439) --- scripts/eval/yamls/long_context_tasks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index 153e3b9df6..221635da87 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -105,7 +105,7 @@ icl_tasks: icl_task_type: generation_task_with_answers hf_loading_vars: name: wikiqa - context_length: 2048 + context_length: 4096 split: test - label: wikiqa_8k @@ -114,7 +114,7 @@ icl_tasks: icl_task_type: generation_task_with_answers hf_loading_vars: name: wikiqa - context_length: 2048 + context_length: 8192 split: test - label: hotpotqa_beginning_2k From f006d07ce814576adff1c36dc2d1b3e75b3ae2f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 05:41:56 +0000 Subject: [PATCH 11/13] Update mlflow requirement from <2.15,>=2.14.1 to >=2.14.1,<2.16 (#1424) Updates the requirements on [mlflow](https://github.com/mlflow/mlflow) to permit the latest version. - [Release notes](https://github.com/mlflow/mlflow/releases) - [Changelog](https://github.com/mlflow/mlflow/blob/master/CHANGELOG.md) - [Commits](https://github.com/mlflow/mlflow/compare/v2.14.1...v2.15.0) --- updated-dependencies: - dependency-name: mlflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 19e5cee2d6..04c28d8f70 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.23.4,<0.24', - 'mlflow>=2.14.1,<2.15', + 'mlflow>=2.14.1,<2.16', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', 'mosaicml-streaming>=0.8.0,<0.9', From 805cf83c709732e0d99b952dd51ab528f109814d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 8 Aug 2024 13:17:02 -0400 Subject: [PATCH 12/13] Add special errors for bad chat/ift types (#1437) --- llmfoundry/data/finetuning/tasks.py | 21 ++++++++------------- llmfoundry/utils/exceptions.py | 16 ++++++++++++++++ tests/data/test_template_tokenization.py | 18 ++++++++++++++++-- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index dd9b495ce4..e8175b4446 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -78,8 +78,10 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: ConsecutiveRepeatedChatRolesError, IncorrectMessageKeyQuantityError, InvalidContentTypeError, + InvalidExampleTypeError, InvalidFileExtensionError, InvalidLastChatMessageRoleError, + InvalidMessageTypeError, InvalidPromptResponseKeysError, InvalidPromptTypeError, InvalidResponseTypeError, @@ -139,9 +141,7 @@ def _get_example_type(example: Example) -> ExampleType: KeyError: If the example type is unknown. 
""" if not isinstance(example, Mapping): - raise TypeError( - f'Expected example to be a Mapping, but found {type(example)}', - ) + raise InvalidExampleTypeError(str(type(example))) if ( len(example.keys()) == 1 and any( allowed_message_key in example @@ -156,7 +156,8 @@ def _get_example_type(example: Example) -> ExampleType: ): return 'prompt_response' else: - raise UnknownExampleTypeError(str(example.keys())) + keys = str(set(example.keys())) + raise UnknownExampleTypeError(keys) def _is_empty_or_nonexistent(dirpath: str) -> bool: @@ -173,23 +174,17 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool: def _get_key(dictionary: Mapping[str, Any], allowed_keys: set[str]): if not isinstance(dictionary, Mapping): - raise TypeError( - f'Expected dictionary to be a mapping, but found {type(dictionary)}', - ) + raise InvalidExampleTypeError(str(type(dictionary))) desired_keys = allowed_keys.intersection(dictionary.keys()) return list(desired_keys)[0] def _validate_chat_formatted_example(example: ChatFormattedDict): if not isinstance(example, Mapping): - raise TypeError( - f'Expected example to be a mapping, but found {type(example)}', - ) + raise InvalidExampleTypeError(str(type(example))) messages = example[_get_key(example, ALLOWED_MESSAGES_KEYS)] if not isinstance(messages, List): - raise TypeError( - f'Expected messages to be an iterable, but found {type(messages)}', - ) + raise InvalidMessageTypeError(str(type(messages))) if len(messages) <= 1: raise NotEnoughChatDataError() diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 140bf8540b..c6a667697d 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -161,6 +161,22 @@ def __init__( ## Tasks exceptions +class InvalidExampleTypeError(UserError): + """Error thrown when a message type is not a `Mapping`.""" + + def __init__(self, example_type: str) -> None: + message = f'Expected example to be a `Mapping`, but found type {example_type}' + super().__init__(message, example_type=example_type) + + +class InvalidMessageTypeError(UserError): + """Error thrown when a message type is not an `Iterable`.""" + + def __init__(self, message_type: str) -> None: + message = f'Expected message to be an `Iterable`, but found type {message_type}' + super().__init__(message, message_type=message_type) + + class UnknownExampleTypeError(UserError): """Error thrown when an unknown example type is used in a task.""" diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py index 16447d6623..9f44739b6b 100644 --- a/tests/data/test_template_tokenization.py +++ b/tests/data/test_template_tokenization.py @@ -16,6 +16,8 @@ ALLOWED_PROMPT_KEYS, ALLOWED_RESPONSE_KEYS, ChatTemplateError, + InvalidExampleTypeError, + InvalidMessageTypeError, ) @@ -48,13 +50,13 @@ def test_tokenize_chat_example_malformed(): 'content': 'user message not followed by an assistant label', }], } - wrong_type = {'messages': 'this is not a list of messages'} + wrong_example_type = ['this is not a dictionary'] + wrong_messages_type = {'messages': 'this is not a list of messages'} malformed_chat_examples = [ too_few_messages, no_content, ends_with_user_role, no_assistant_message, - wrong_type, ] my_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) for example in malformed_chat_examples: @@ -63,6 +65,18 @@ def test_tokenize_chat_example_malformed(): example, my_tokenizer, ) # type: ignore (the typing here is supposed to be malformed) + with pytest.raises(InvalidExampleTypeError): + # Ignore 
the type here because it's the mistyping that we're + # trying to test. + tokenize_formatted_example( # type: ignore + wrong_example_type, # type: ignore + my_tokenizer, # type: ignore + ) + with pytest.raises(InvalidMessageTypeError): + tokenize_formatted_example( + wrong_messages_type, + my_tokenizer, + ) def test_tokenize_chat_example_well_formed(): From 44b09f0d5b7d844cfb913d41c82b2c35bef21112 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:13:23 -0400 Subject: [PATCH 13/13] Make autopacking faster (#1435) --- llmfoundry/data/finetuning/collator.py | 68 +++-- llmfoundry/data/finetuning/dataloader.py | 3 + llmfoundry/data/packing.py | 349 +++++++++++++---------- 3 files changed, 242 insertions(+), 178 deletions(-) diff --git a/llmfoundry/data/finetuning/collator.py b/llmfoundry/data/finetuning/collator.py index 42af7e9375..68ebb9d21d 100644 --- a/llmfoundry/data/finetuning/collator.py +++ b/llmfoundry/data/finetuning/collator.py @@ -224,6 +224,10 @@ class Seq2SeqFinetuningCollator: sizes. Default: ``False`` ensures that all sequences are max_seq_len. batch_metadata (dict, optional): A dictionary of metadata which will be added to the batch. + pad_to_longest (bool, optional): Whether to pad to the longest sequence, + which may result in smaller but inconsistent batch sizes. This is + primarily used to profile packing. + Default: ``False`` ensures that all sequences are max_seq_len. """ def __init__( @@ -235,6 +239,7 @@ def __init__( target_prompts: str = 'none', allow_pad_trimming: bool = False, batch_metadata: Optional[Dict[str, Any]] = None, + pad_to_longest: bool = False, ): self.tokenizer = tokenizer self.max_seq_len = max_seq_len @@ -247,6 +252,8 @@ def __init__( self._allow_pad_trimming = allow_pad_trimming self._seen_first_batch = False + self._pad_to_longest = pad_to_longest + illegal_keys = [ 'input_ids', 'labels', @@ -320,24 +327,34 @@ def _process_and_batch_decoder_only( ) -> Dict[str, torch.Tensor]: # Steps explained in comments processed_examples = [] - for example in examples: - input_ids, labels = stitch_turns_decoder_only( + input_ids_and_labels = [ + stitch_turns_decoder_only( example_turns=example['turns'], target_prompts=self.target_prompts, target_responses=self.target_responses, eos_token_id=self.tokenizer.eos_token_id, - ) + ) for example in examples + ] + + if self._pad_to_longest: + max_seq_len = max([ + len(input_ids) for input_ids, _ in input_ids_and_labels + ]) + max_seq_len = min(max_seq_len, self.max_seq_len) + else: + max_seq_len = self.max_seq_len + for input_ids, labels in input_ids_and_labels: orig_size = len(input_ids) # We may need to truncate the input_ids / labels in order to maintain max_seq_len - if orig_size > self.max_seq_len: - input_ids = input_ids[:self.max_seq_len] - labels = labels[:self.max_seq_len] + if orig_size > max_seq_len: + input_ids = input_ids[:max_seq_len] + labels = labels[:max_seq_len] # Check to make sure there are still loss-generating tokens. Error if not. if len([l for l in labels if l != _HF_IGNORE_INDEX]) == 0: raise ValueError( - f'Truncating to max_seq_len={self.max_seq_len} has removed all loss-generating tokens. ' +\ + f'Truncating to max_seq_len={max_seq_len} has removed all loss-generating tokens. ' +\ f'Pre-truncation sequence length was {orig_size}. ' +\ 'This sample should have been filtered out before reaching the collator. 
If using ' +\ 'pre-tokenized streaming data, this may have resulted from using different ' +\ @@ -348,7 +365,7 @@ def _process_and_batch_decoder_only( # Still issue a warning when truncating if not self._warned_truncated: warnings.warn( - f'Truncating sequence of length={orig_size} to fit max_seq_len={self.max_seq_len}. ' +\ + f'Truncating sequence of length={orig_size} to fit max_seq_len={max_seq_len}. ' +\ f'If truncation is a problem, consider increasing max_seq_len.', ) self._warned_truncated = True @@ -358,7 +375,7 @@ def _process_and_batch_decoder_only( # Annoyingly, we need to pad everything but input_ids # and attention_mask ourselves n_total = len(input_ids) - i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - n_total) + i_pad = [_HF_IGNORE_INDEX] * (max_seq_len - n_total) if self.tokenizer.padding_side == 'left': labels = i_pad + labels else: @@ -376,7 +393,7 @@ def _process_and_batch_decoder_only( batch = self.tokenizer.pad( processed_examples, padding='max_length', - max_length=self.max_seq_len, + max_length=max_seq_len, return_tensors='pt', ) @@ -410,35 +427,44 @@ def _process_and_batch_encoder_decoder( # The encoder-decoder case is has some gotchas. # Steps are explained in comments. processed_examples = [] - for example in examples: - context, target = stitch_turns_encoder_decoder( + contexts_and_targets = [ + stitch_turns_encoder_decoder( example_turns=example['turns'], eos_token_id=self.tokenizer.eos_token_id, - ) + ) for example in examples + ] + + if self._pad_to_longest: + max_seq_len = 0 + for context, target in contexts_and_targets: + max_seq_len = max(max_seq_len, len(context), len(target)) + else: + max_seq_len = self.max_seq_len + for context, target in contexts_and_targets: # We need to pad labels ourselves. Because HF. - if len(target) < self.max_seq_len: - i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - len(target)) + if len(target) < max_seq_len: + i_pad = [_HF_IGNORE_INDEX] * (max_seq_len - len(target)) target = target + i_pad else: if not self._warned_target: warnings.warn( f'Truncating TARGET sequence of length={len(target)} ' +\ - f'to max_seq_len={self.max_seq_len}. If truncation is ' +\ + f'to max_seq_len={max_seq_len}. If truncation is ' +\ f'a problem, consider increasing max_seq_len.') self._warned_target = True - target = target[:self.max_seq_len - + target = target[:max_seq_len - 1] + [self.tokenizer.eos_token_id] # We might need to truncate the context. Preserve the beginning. - if len(context) > self.max_seq_len: + if len(context) > max_seq_len: if not self._warned_context: warnings.warn( f'Truncating CONTEXT sequence of length={len(context)} ' +\ - f'to max_seq_len={self.max_seq_len}. If truncation is ' +\ + f'to max_seq_len={max_seq_len}. 
If truncation is ' +\ f'a problem, consider increasing max_seq_len.') self._warned_context = True - context = context[:self.max_seq_len - + context = context[:max_seq_len - 1] + [self.tokenizer.eos_token_id] # Back into the example @@ -454,7 +480,7 @@ def _process_and_batch_encoder_decoder( batch = self.tokenizer.pad( processed_examples, padding='max_length', - max_length=self.max_seq_len, + max_length=max_seq_len, return_tensors='pt', ) # We're still missing decoder_input_ids and decoder_attention_mask diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 771033a703..6aecadb6bb 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -50,6 +50,7 @@ 'seq_parallel_replication', 'auto_packing_replication', 'max_leftover_bins_to_keep', + 'pad_to_longest', } @@ -630,6 +631,7 @@ def build_collate_fn( max_seq_len = dataset_cfg['max_seq_len'] decoder_only_format = dataset_cfg['decoder_only_format'] allow_pad_trimming = dataset_cfg.get('allow_pad_trimming', False) + pad_to_longest = dataset_cfg.get('pad_to_longest', False) collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, @@ -638,6 +640,7 @@ def build_collate_fn( target_responses=target_responses, target_prompts=target_prompts, allow_pad_trimming=allow_pad_trimming, + pad_to_longest=pad_to_longest, ) packing_ratio = dataset_cfg.get('packing_ratio') diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 5579066f89..77e166c474 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -20,7 +20,20 @@ class BinPackCollator: - """Utility collator for packing to reduce padding.""" + """Utility collator for packing to reduce padding. + + Args: + collator (Callable): The base collator to use. + target_batch_size (int): The number of bins. + max_seq_len(int): The maximum sequence length of a bin. + pad_token_id (int): The padding token id. + padding_side (Literal['left', 'right']): The side to pad on. + max_leftover_bins_to_keep (Optional[int]): The number of leftover bins + to keep. + is_profiling (bool): Whether the collator is being used for profiling. + In profiling mode, packing and padding the example tensors is + avoided for efficiency, and the returned batch will be None. 
+ """ def __init__( self, @@ -30,6 +43,7 @@ def __init__( pad_token_id: int, padding_side: Literal['left', 'right'], max_leftover_bins_to_keep: Optional[int] = None, + is_profiling: bool = False, ): self.base_collator = collator self.out_size = int(target_batch_size) @@ -56,6 +70,8 @@ def __init__( self._leftover_bins: List[Tuple[int, Dict[str, torch.Tensor]]] = [] + self._is_profiling = is_profiling + @property def waste(self) -> float: return 1 - (self.n_packed_tokens / self.n_total_tokens) @@ -74,6 +90,9 @@ def __call__( return self.pack(batch) def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + if self._is_profiling: + raise ValueError('Cannot pack in profiling mode.') + assert 'attention_mask' in batch assert 'input_ids' in batch @@ -86,13 +105,15 @@ def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: ] # Cut everything down to size sizes, trimmed_examples = _trim_batch(batch) - return self._pack_trimmed_examples(trimmed_examples, sizes) + packed_batch = self._pack_trimmed_examples(trimmed_examples, sizes) + assert packed_batch is not None + return packed_batch def _pack_trimmed_examples( self, trimmed_examples: List[Dict[str, torch.Tensor]], sizes: List[int], - ) -> Dict[str, torch.Tensor]: + ) -> Optional[Dict[str, torch.Tensor]]: """Packs trimmed examples into fixed-size bins and repads them. Args: @@ -100,10 +121,12 @@ def _pack_trimmed_examples( sizes (List[int]): The sizes of the trimmed examples. Returns: - Dict[str, torch.Tensor]: A batch of repadded examples ready for processing + Optional[Dict[str, torch.Tensor]]: A batch of repadded examples + ready for processing. If the collator is in profiling mode, + returns None. """ # Apply our CS 101 bin packing algorithm. - packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = _first_fit_bin_packing( + packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = self._first_fit_bin_packing( sizes=sizes, examples=trimmed_examples, num_bins=self.out_size, @@ -115,15 +138,145 @@ def _pack_trimmed_examples( self.n_packed_examples += self.out_size self._leftover_bins = leftover_bins[:self.max_leftover_bins_to_keep] + if self._is_profiling: + return None + # Re-pad to max_seq_len and batch - batch = _repad( - packed_examples, - max_seq_len=self.max_seq_len, - pad_token_id=self.pad_token_id, - padding_side=self.padding_side, - ) + batch = self._convert_to_batch(packed_examples) + return batch + + def _convert_to_batch( + self, + packed_examples: List[Dict[str, torch.Tensor]], + ) -> Dict[str, torch.Tensor]: + + pad_vals = { + 'input_ids': self.pad_token_id, + 'labels': -100, + 'attention_mask': 0, + 'sequence_id': -1, + } + keys = packed_examples[0].keys() + batch = {} + for key in keys: + batch[key] = torch.stack([ + _pad_tensor( + example[key], + pad_vals[key], + self.max_seq_len, + self.padding_side, + ) for example in packed_examples + ]) return batch + def _first_fit_bin_packing( + self, + sizes: List[int], + examples: List[Dict[str, torch.Tensor]], + num_bins: int, + max_bin_size: int, + existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]], + ) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[ + str, torch.Tensor]]]]: + + # Will contain tuples (bin_size_size, packed_example) + bins: List[Tuple[int, Dict[str, torch.Tensor]]] = existing_bins + + starting_total_bin_sizes = sum([bin_size for bin_size, _ in bins]) + + sizes_and_examples = list(zip(sizes, examples)) + sorted_sizes_and_examples = sorted( + sizes_and_examples, + key=lambda x: x[0], + 
reverse=True, + ) + + required_num_examples = max(0, num_bins - len(bins)) + num_examples = len(sizes) + if num_examples < required_num_examples: + for size, example in sorted_sizes_and_examples: + # Can't keep packing. All remaining items get their own bin. + bins.append((size, example)) + + total_bin_sizes = sum([bin_size for bin_size, _ in bins]) + total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes + total_example_sizes = sum(sizes) + if total_new_bin_sizes != total_example_sizes: + raise AssertionError( + f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.', + ) + + sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) + bin_sizes, packed_examples = [], [] + for bin_size, packed_example in sorted_bins: + bin_sizes.append(bin_size) + packed_examples.append(packed_example) + + # Return: + # - the num_bins largest packed examples + # - the total tokens in those examples + # - the total size of all new examples + # - leftover bins + return packed_examples[:num_bins], sum( + bin_sizes[:num_bins], + ), sum(sizes), sorted_bins[num_bins:] + + # Go through each item from longest to shortest. + # Note: all items will either go into an existing or new bin. + for i, (size, example) in enumerate(sorted_sizes_and_examples): + # If we can't keep packing, all remaining items get their own bin. + required_num_examples = max(0, num_bins - len(bins)) + n_remaining = num_examples - i + assert n_remaining >= required_num_examples + if n_remaining == required_num_examples: + # Can't keep packing. All remaining items get their own bin. + bins.append((size, example)) + continue + + # Add it to the first bin it fits in + added = False + for bidx in range(len(bins)): + if bins[bidx][0] + size <= max_bin_size: + bin_size, packed_example = bins.pop(bidx) + bin_size = bin_size + size + # When profiling, don't combine the tensor for efficiency + # and return the first example which isn't "correct" + if not self._is_profiling: + packed_example = _combine_in_place( + packed_example, + example, + ) + + bins.append((bin_size, packed_example)) + added = True + break + # If it didn't fit anywhere, open a new bin + if not added: + bins.append((size, example)) + + total_bin_sizes = sum([bin_size for bin_size, _ in bins]) + total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes + total_example_sizes = sum(sizes) + if total_new_bin_sizes != total_example_sizes: + raise AssertionError( + f'Error in packing. 
{total_example_sizes=} does not equal {total_new_bin_sizes=}.', + ) + + sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) + bin_sizes, packed_examples = [], [] + for bin_size, packed_example in sorted_bins: + bin_sizes.append(bin_size) + packed_examples.append(packed_example) + + # Return: + # - the num_bins largest packed examples + # - the total tokens in those examples + # - the total size of all new examples + # - leftover bins + return packed_examples[:num_bins], sum( + bin_sizes[:num_bins], + ), sum(sizes), sorted_bins[num_bins:] + def _trim_batch( batch: Dict[str, torch.Tensor], @@ -177,143 +330,25 @@ def _combine_in_place( return example -def _first_fit_bin_packing( - sizes: List[int], - examples: List[Dict[str, torch.Tensor]], - num_bins: int, - max_bin_size: int, - existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]], -) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[ - str, torch.Tensor]]]]: - - # Will contain tuples (bin_size_size, packed_example) - bins: List[Tuple[int, Dict[str, torch.Tensor]]] = existing_bins - - starting_total_bin_sizes = sum([bin_size for bin_size, _ in bins]) - - sizes_and_examples = list(zip(sizes, examples)) - sorted_sizes_and_examples = sorted( - sizes_and_examples, - key=lambda x: x[0], - reverse=True, - ) - - required_num_examples = max(0, num_bins - len(bins)) - num_examples = len(sizes) - if num_examples < required_num_examples: - for size, example in sorted_sizes_and_examples: - # Can't keep packing. All remaining items get their own bin. - bins.append((size, example)) - - total_bin_sizes = sum([bin_size for bin_size, _ in bins]) - total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes - total_example_sizes = sum(sizes) - if total_new_bin_sizes != total_example_sizes: - raise AssertionError( - f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.', - ) - - sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) - bin_sizes, packed_examples = [], [] - for bin_size, packed_example in sorted_bins: - bin_sizes.append(bin_size) - packed_examples.append(packed_example) - - # Return: - # - the num_bins largest packed examples - # - the total tokens in those examples - # - the total size of all new examples - # - leftover bins - return packed_examples[:num_bins], sum( - bin_sizes[:num_bins], - ), sum(sizes), sorted_bins[num_bins:] - - # Go through each item from longest to shortest. - # Note: all items will either go into an existing or new bin. - for i, (size, example) in enumerate(sorted_sizes_and_examples): - # If we can't keep packing, all remaining items get their own bin. - required_num_examples = max(0, num_bins - len(bins)) - n_remaining = num_examples - i - assert n_remaining >= required_num_examples - if n_remaining == required_num_examples: - # Can't keep packing. All remaining items get their own bin. 
- bins.append((size, example)) - continue - - # Add it to the first bin it fits in - added = False - for bidx in range(len(bins)): - if bins[bidx][0] + size <= max_bin_size: - bin_size, packed_example = bins.pop(bidx) - bin_size = bin_size + size - packed_example = _combine_in_place(packed_example, example) - bins.append((bin_size, packed_example)) - added = True - break - # If it didn't fit anywhere, open a new bin - if not added: - bins.append((size, example)) - - total_bin_sizes = sum([bin_size for bin_size, _ in bins]) - total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes - total_example_sizes = sum(sizes) - if total_new_bin_sizes != total_example_sizes: - raise AssertionError( - f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.', - ) - - sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) - bin_sizes, packed_examples = [], [] - for bin_size, packed_example in sorted_bins: - bin_sizes.append(bin_size) - packed_examples.append(packed_example) - - # Return: - # - the num_bins largest packed examples - # - the total tokens in those examples - # - the total size of all new examples - # - leftover bins - return packed_examples[:num_bins], sum( - bin_sizes[:num_bins], - ), sum(sizes), sorted_bins[num_bins:] - - -def _repad( - packed_examples: List[Dict[str, torch.Tensor]], +def _pad_tensor( + tensor: torch.Tensor, + pad_value: int, max_seq_len: int, - pad_token_id: int, padding_side: str, -) -> Dict[str, torch.Tensor]: - - def pad_tensor(tensor: torch.Tensor, pad_value: int): - if len(tensor) == max_seq_len: - return tensor - t = torch.full((max_seq_len,), - pad_value, - dtype=tensor.dtype, - device=tensor.device) - if padding_side == 'left': - t[-len(tensor):] = tensor - elif padding_side == 'right': - t[:len(tensor)] = tensor - else: - raise ValueError(f'Unknown {padding_side=}') - return t - - pad_vals = { - 'input_ids': pad_token_id, - 'labels': -100, - 'attention_mask': 0, - 'sequence_id': -1, - } - keys = packed_examples[0].keys() - batch = {} - for key in keys: - batch[key] = torch.stack([ - pad_tensor(example[key], pad_vals[key]) - for example in packed_examples - ]) - return batch +) -> torch.Tensor: + if len(tensor) == max_seq_len: + return tensor + t = torch.full((max_seq_len,), + pad_value, + dtype=tensor.dtype, + device=tensor.device) + if padding_side == 'left': + t[-len(tensor):] = tensor + elif padding_side == 'right': + t[:len(tensor)] = tensor + else: + raise ValueError(f'Unknown {padding_side=}') + return t def auto_packing_ratio( @@ -428,24 +463,23 @@ def profile_packing( 'prefetch_factor': None, 'persistent_workers': False, }) - dataloader_cfg['dataset']['packing_ratio'] = 1.0 - dataloader_cfg['dataset']['auto_packing_replication' - ] = dataloader_cfg['dataset'].get( - 'seq_parallel_replication', - 1, - ) or 1 - dataloader_cfg['dataset']['seq_parallel_replication'] = 1 + dataset_cfg = dataloader_cfg['dataset'] + dataset_cfg['packing_ratio'] = 1.0 + seq_parallel_replication = dataset_cfg.get('seq_parallel_replication', 1) + dataset_cfg['auto_packing_replication'] = seq_parallel_replication or 1 + dataset_cfg['seq_parallel_replication'] = 1 + dataset_cfg['pad_to_longest'] = True # If streaming dataset, use a temporary local folder for profiling local_rank_zero = dist.get_global_rank() - dist.get_local_rank() - if dataloader_cfg['dataset'].get('remote') is not None: + if dataset_cfg.get('remote') is not None: tmp_path_to_broadcast = tempfile.TemporaryDirectory().name gathered_paths = 
dist.all_gather_object(tmp_path_to_broadcast) tmp_path = gathered_paths[local_rank_zero] - dataloader_cfg['dataset']['local'] = tmp_path + dataset_cfg['local'] = tmp_path - if dataloader_cfg['dataset'].get('streams') is not None: - for stream_config in dataloader_cfg['dataset']['streams'].values(): + if dataset_cfg.get('streams') is not None: + for stream_config in dataset_cfg['streams'].values(): tmp_path_to_broadcast = tempfile.TemporaryDirectory().name gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) tmp_path = gathered_paths[local_rank_zero] @@ -492,6 +526,7 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: pad_token_id=0, # <-- Doesn't need to be correct for profiling padding_side='left', # <-- Doesn't need to be correct for profiling max_leftover_bins_to_keep=max_leftovers_to_keep, + is_profiling=True, ) # Simulate feeding the packing collator a bunch of data
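To make the packing logic above concrete, here is a simplified, self-contained sketch of the first-fit-decreasing strategy that BinPackCollator applies, operating on sequence lengths only rather than on token tensors. It illustrates the algorithm under simplified assumptions and is not the production implementation, which also carries leftover bins across batches and, in profiling mode, skips tensor packing and padding entirely.

from typing import List, Tuple


def first_fit_pack(sizes: List[int], max_bin_size: int) -> List[List[int]]:
    """Greedily pack sequence lengths into bins of at most max_bin_size tokens."""
    bins: List[Tuple[int, List[int]]] = []  # (tokens used, packed lengths)
    for size in sorted(sizes, reverse=True):  # longest sequences first
        for idx, (used, lengths) in enumerate(bins):
            if used + size <= max_bin_size:
                bins[idx] = (used + size, lengths + [size])
                break
        else:
            # Did not fit in any existing bin; open a new one.
            bins.append((size, [size]))
    return [lengths for _, lengths in bins]


lengths = [1800, 900, 700, 400, 300, 120]
packed = first_fit_pack(lengths, max_bin_size=2048)
# Fraction of the padded batch that would still be padding after packing.
padding_waste = 1 - sum(lengths) / (len(packed) * 2048)
print(packed)
print(f'bins={len(packed)} padding_waste={padding_waste:.2%}')

With these hypothetical lengths the six sequences pack into three bins instead of six, which is the effect profile_packing measures when it searches for a good packing_ratio.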