From 38dcf1e6e25a05dbd4275402f168bac299b45d6d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Sat, 3 Aug 2024 08:42:38 -0700 Subject: [PATCH 01/13] Remove type ignore (#1421) --- llmfoundry/utils/builders.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 000155f1a4..a1d84601b3 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -127,8 +127,7 @@ def build_eval_loaders( # Load the eval data to fail fast. metrics will get added # later in add_metrics_to_eval_loaders, after the model is loaded metric_names=[], - # TODO: Fix type in Composer - device_eval_microbatch_size=device_eval_batch_size, # type: ignore + device_eval_microbatch_size=device_eval_batch_size, ) evaluators.append(eval_loader) return evaluators From 32f84bc8cf0bc302d52d9f9c73af1f02dadb2035 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 09:34:12 -0700 Subject: [PATCH 02/13] Update pytest-cov requirement from <5,>=4 to >=4,<6 (#1423) Updates the requirements on [pytest-cov](https://github.com/pytest-dev/pytest-cov) to permit the latest version. - [Changelog](https://github.com/pytest-dev/pytest-cov/blob/master/CHANGELOG.rst) - [Commits](https://github.com/pytest-dev/pytest-cov/compare/v4.0.0...v5.0.0) --- updated-dependencies: - dependency-name: pytest-cov dependency-type: direct:development ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 185d3970f7..2b47928419 100644 --- a/setup.py +++ b/setup.py @@ -82,7 +82,7 @@ 'pre-commit>=3.4.0,<4', 'pytest>=7.2.1,<8', 'pytest_codeblocks>=0.16.1,<0.17', - 'pytest-cov>=4,<5', + 'pytest-cov>=4,<6', 'pyright==1.1.256', 'toml>=0.10.2,<0.11', 'packaging>=21,<23', From 32803b751137760d826c734b331fb3ef779746bc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Aug 2024 14:43:24 -0400 Subject: [PATCH 03/13] Bump onnx from 1.16.1 to 1.16.2 (#1425) Bumps [onnx](https://github.com/onnx/onnx) from 1.16.1 to 1.16.2. - [Release notes](https://github.com/onnx/onnx/releases) - [Changelog](https://github.com/onnx/onnx/blob/main/docs/Changelog-ml.md) - [Commits](https://github.com/onnx/onnx/compare/v1.16.1...v1.16.2) --- updated-dependencies: - dependency-name: onnx dependency-type: direct:production update-type: version-update:semver-patch ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2b47928419..19e5cee2d6 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ 'omegaconf>=2.2.3,<3', 'slack-sdk<4', 'mosaicml-cli>=0.6.10,<1', - 'onnx==1.16.1', + 'onnx==1.16.2', 'onnxruntime==1.18.1', 'boto3>=1.21.45,<2', 'huggingface-hub>=0.19.0,<0.25', From 6dcc18a5d1db104b48eacce68144db5a99da37d7 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Mon, 5 Aug 2024 15:56:35 -0400 Subject: [PATCH 04/13] Add transforms to logged config (#1428) Co-authored-by: Saaketh Narayan --- llmfoundry/utils/config_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index f10fe32735..eb54fabc3d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -332,15 +332,15 @@ def make_dataclass_and_log_config( 'icl_tasks must be specified in the config', ) - # Create copy of config for logging - logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) - # Apply transforms to the unstructured config before constructing dataclass unstructured_config = apply_transforms_to_config( unstructured_config, transforms, ) + # Create copy of config for logging + logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) + arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, dataclass_fields) From 8fe57eb1ea7b1794a214ea6256965210c770c639 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 6 Aug 2024 10:09:52 -0700 Subject: [PATCH 05/13] Install all deps in Docker images (#1431) --- .github/workflows/docker.yaml | 4 ++-- Dockerfile | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 17bb976a5d..0bb0b4087a 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -19,10 +19,10 @@ jobs: include: - name: "2.3.1_cu121" base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 - dep_groups: "[gpu]" + dep_groups: "[all]" - name: "2.3.1_cu121_aws" base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws - dep_groups: "[gpu]" + dep_groups: "[all]" steps: - name: Checkout diff --git a/Dockerfile b/Dockerfile index 9366d7dbcd..cee7063cdd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,6 +7,8 @@ FROM $BASE_IMAGE ARG BRANCH_NAME ARG DEP_GROUPS +ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.7 8.9 9.0" + # Check for changes in setup.py. # If there are changes, the docker cache is invalidated and a fresh pip installation is triggered. 
ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py setup.py From 420ea30106f015d41a823ce7eb5592d3a144e999 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Tue, 6 Aug 2024 13:37:12 -0400 Subject: [PATCH 06/13] Raise error when not enough data when converting text to MDS (#1430) * test * precommit * precommit * custom error * rm input folder * Update llmfoundry/utils/exceptions.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --------- Co-authored-by: v-chen_data Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- .../data_prep/convert_text_to_mds.py | 7 ++++++ llmfoundry/utils/exceptions.py | 9 ++++++++ .../data_prep/test_convert_text_to_mds.py | 23 +++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 336c82a5e7..94bdc16526 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -1,6 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import json import logging import math import os @@ -28,6 +29,7 @@ merge_shard_groups, ) from llmfoundry.utils.exceptions import ( + DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, ) @@ -468,6 +470,11 @@ def convert_text_to_mds( trust_remote_code, ) + index_path = os.path.join(local_output_folder, 'index.json') + with open(index_path, 'r') as index_file: + if not json.load(index_file)['shards']: + raise DatasetTooSmallError() + # Write a done file with the args and object names write_done_file(local_output_folder, args_str, object_names) diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 76f378f8c6..140bf8540b 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -28,6 +28,7 @@ 'InputFolderMissingDataError', 'OutputFolderNotEmptyError', 'MisconfiguredHfDatasetError', + 'DatasetTooSmallError', 'RunTimeoutError', ] @@ -348,6 +349,14 @@ def __init__(self, dataset_name: str, split: str) -> None: super().__init__(message, dataset_name=dataset_name, split=split) +class DatasetTooSmallError(UserError): + """Error thrown when the dataset is too small to be processed.""" + + def __init__(self) -> None: + message = f'Your dataset is too small and produced no complete samples during preprocessing. Please provide more data.' 
+ super().__init__(message) + + class RunTimeoutError(InternalError): """Error thrown when a run times out.""" diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index f4c160790a..6ba14d62e4 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -22,6 +22,7 @@ write_done_file, ) from llmfoundry.utils.exceptions import ( + DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, ) @@ -267,6 +268,28 @@ def test_input_folder_not_exist(tmp_path: pathlib.Path): ) +def test_dataset_too_small(tmp_path: pathlib.Path): + input_folder = tmp_path / 'input' + os.makedirs(input_folder, exist_ok=True) + with open(input_folder / 'test.txt', 'w') as f: + f.write('a') + with pytest.raises(DatasetTooSmallError): + convert_text_to_mds( + tokenizer_name='mosaicml/mpt-7b', + output_folder=str(tmp_path / 'output'), + input_folder=str(input_folder), + concat_tokens=2048, + eos_text='', + bos_text='', + no_wrap=False, + compression='zstd', + processes=1, + args_str='Namespace()', + reprocess=False, + trust_remote_code=False, + ) + + def test_is_already_processed(tmp_path: pathlib.Path): tmp_path_str = str(tmp_path) args_str = 'Namespace(x = 5)' From 0ba263adb7a649f07606e2c1b9a404005364d58e Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:09:41 -0700 Subject: [PATCH 07/13] Bump yaml versions (#1433) --- mcli/mcli-1b-eval.yaml | 2 +- mcli/mcli-1b-max-seq-len-8k.yaml | 2 +- mcli/mcli-1b.yaml | 2 +- mcli/mcli-benchmark-mpt.yaml | 2 +- mcli/mcli-convert-composer-to-hf.yaml | 2 +- mcli/mcli-hf-eval.yaml | 2 +- mcli/mcli-hf-generate.yaml | 2 +- mcli/mcli-llama2-finetune.yaml | 2 +- mcli/mcli-openai-eval.yaml | 2 +- mcli/mcli-pretokenize-oci-upload.yaml | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml index d8ef42d5d5..4bfa301f8e 100644 --- a/mcli/mcli-1b-eval.yaml +++ b/mcli/mcli-1b-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index 4b8eb601b2..2dc83d36a9 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 20d41fb2cc..69b2295011 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index a7d44239b5..7a3ea2cbe9 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.3.1_cu121-latest integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: 
# OR use your commit hash pip_install: .[gpu] diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index 60c708e2f6..fefaf8e1a3 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: . ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index e69e6dadda..e58d42483a 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index 8b382c41f0..02c49d84c3 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 443429ca0a..47c163faf8 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index 38b02a6019..c372014165 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: .[gpu,openai] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index 0749dcc86e..a4496503cd 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -14,7 +14,7 @@ integrations: - oci-cli==3.23.2 - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.8.0 + git_branch: v0.10.0 # git_commit: # OR use your commit hash pip_install: . 
ssh_clone: false # Should be true if using a private repo From 84cb2ed43b3eccd4eca747ac0ad69d54348c8224 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:06:48 -0700 Subject: [PATCH 08/13] Automatically get the portion of the dataset config that is constructor args (#1434) --- llmfoundry/data/finetuning/dataloader.py | 136 ++++++++++------------- llmfoundry/data/finetuning/tasks.py | 33 ++++-- tests/data/test_dataloader.py | 20 +++- 3 files changed, 98 insertions(+), 91 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index d9450bc657..771033a703 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -1,5 +1,6 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import inspect import logging import os from typing import Any, Dict, Optional, Tuple, Union @@ -17,6 +18,8 @@ validate_target_settings, ) from llmfoundry.data.finetuning.tasks import ( + DEFAULT_TARGET_PROMPTS, + DEFAULT_TARGET_RESPONSES, DOWNLOADED_FT_DATASETS_DIRPATH, SUPPORTED_EXTENSIONS, dataset_constructor, @@ -39,9 +42,15 @@ # HuggingFace hardcodes the ignore index to -100 _HF_IGNORE_INDEX = -100 -# Default settings to use for target responses and target prompts -_DEFAULT_TARGET_RESPONSES = 'last' -_DEFAULT_TARGET_PROMPTS = 'none' +# Extra keys present in the dataset config dictionary beyond the constructor keys +_ALLOWED_DATASET_KEYS = { + 'shuffle', + 'packing_ratio', + 'allow_pad_trimming', + 'seq_parallel_replication', + 'auto_packing_replication', + 'max_leftover_bins_to_keep', +} def build_finetuning_dataloader( @@ -171,7 +180,26 @@ def build_finetuning_dataloader( given a starting workload YAML. 
""" dataset_cfg = dataset - _validate_config(**dataset_cfg) + is_streaming = ( + dataset_cfg.get('remote') is not None or + dataset_cfg.get('streams') is not None + ) + if is_streaming: + dataset_constructor_keys = inspect.signature( + dataset_constructor.streaming_dataset_class, + ).parameters.keys() + else: + dataset_constructor_keys = inspect.signature( + dataset_constructor.build_from_hf, + ).parameters.keys() + + allowed_dataset_config_keys = set( + dataset_constructor_keys, + ).union(_ALLOWED_DATASET_KEYS) + _validate_config( + **dataset_cfg, + allowed_dataset_keys=allowed_dataset_config_keys, + ) # Use EOS as the pad token if none exists if tokenizer.pad_token is None: # type: ignore (sometimes it's none and that's ok) @@ -213,9 +241,7 @@ def build_finetuning_dataloader( streaming_dataset = None # for pyright sampler = None - if dataset_cfg.get( - 'remote', - ) is not None or dataset_cfg.get('streams') is not None: + if is_streaming: # Build streaming dataloader streams_cfg = dataset_cfg.get('streams', None) streams_cfg = to_dict_container( @@ -225,34 +251,20 @@ def build_finetuning_dataloader( streams_cfg, ) if streams_cfg is not None else None - # note: we don't need to use ** here because we're setting default values for almost all arguments + # Take the constructor args from above, minus args that have been created separately + dataset_constructor_args = { + k: v + for k, v in dataset_cfg.items() + if k in dataset_constructor_keys and + k not in {'streams', 'packing_ratio'} + } streaming_dataset = dataset_constructor.build_from_streaming( tokenizer=tokenizer, streams=streams, - local=dataset_cfg.get('local', None), - remote=dataset_cfg.get('remote', None), - split=dataset_cfg.get('split', None), - download_retry=dataset_cfg.get('download_retry', 2), - download_timeout=dataset_cfg.get('download_timeout', 60), - validate_hash=dataset_cfg.get('validate_hash', None), - keep_zip=dataset_cfg.get('keep_zip', False), - epoch_size=dataset_cfg.get('epoch_size', None), - predownload=dataset_cfg.get('predownload', None), - cache_limit=dataset_cfg.get('cache_limit', None), - partition_algo=dataset_cfg.get('partition_algo', 'relaxed'), - num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None), batch_size=dataloader_batch_size, - shuffle=dataset_cfg.get('shuffle', False), - shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'), - shuffle_seed=dataset_cfg.get('shuffle_seed', 9176), - shuffle_block_size=dataset_cfg.get('shuffle_block_size', None), - sampling_method=dataset_cfg.get('sampling_method', 'balanced'), - sampling_granularity=dataset_cfg.get('sampling_granularity', 1), - batching_method=dataset_cfg.get('batching_method', 'random'), - max_seq_len=dataset_cfg['max_seq_len'], - allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False), replication=replication_factor, packing_ratio=dataloader_batch_size / dataset_batch_size, + **dataset_constructor_args, ) else: @@ -283,24 +295,19 @@ def build_finetuning_dataloader( dataset_name_or_path, ) - # Build dataset from HF. 
+ # Take the constructor args from above, minus args that have been created separately + dataset_constructor_args = { + k: v + for k, v in dataset_cfg.items() + if k in dataset_constructor_keys and + k not in {'split', 'preprocessing_fn'} + } streaming_dataset = dataset_constructor.build_from_hf( dataset_name=dataset_name_or_path, split=split, - safe_load=dataset_cfg.get('safe_load', False), - max_seq_len=dataset_cfg['max_seq_len'], preprocessing_fn=preprocessing_fn, tokenizer=tokenizer, - target_prompts=dataset_cfg.get( - 'target_prompts', - _DEFAULT_TARGET_PROMPTS, - ), - target_responses=dataset_cfg.get( - 'target_responses', - _DEFAULT_TARGET_RESPONSES, - ), - decoder_only_format=dataset_cfg['decoder_only_format'], - hf_kwargs=dataset_cfg.get('hf_kwargs', {}), + **dataset_constructor_args, ) # Ensure dataset is large enough. @@ -367,6 +374,7 @@ def _validate_config( streams: Optional[Dict[str, Any]] = None, target_prompts: Optional[str] = None, target_responses: Optional[str] = None, + allowed_dataset_keys: set[str] = _ALLOWED_DATASET_KEYS, **kwargs: Dict[str, Any], ) -> None: """Validates the dataset configuration. @@ -417,6 +425,7 @@ def _validate_config( Defaults to "last", meaning only the final response in multi-turn examples will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for details. + allowed_dataset_keys (set[str], optional): The set of allowed keys for the dataset config. kwargs (DictConfig, optional): Additional kwargs to pass to `datasets.load_dataset`, which can be used to load a dataset from local files. @@ -424,41 +433,10 @@ def _validate_config( Raises: ValueError: If the dataset configuration does not meet the requirements. """ - # Check for extraneous keys in the dataset config - allowed_additional_kwargs = { - 'local', - 'remote', - 'split', - 'download_retry', - 'download_timeout', - 'validate_hash', - 'keep_zip', - 'epoch_size', - 'predownload', - 'cache_limit', - 'partition_algo', - 'num_canonical_nodes', - 'batch_size', - 'shuffle', - 'shuffle_algo', - 'shuffle_seed', - 'shuffle_block_size', - 'sampling_method', - 'sampling_granularity', - 'batching_method', - 'max_seq_len', - 'allow_unsafe_types', - 'replication', - 'packing_ratio', - 'allow_pad_trimming', - 'seq_parallel_replication', - 'auto_packing_replication', - 'max_leftover_bins_to_keep', - } - if not set(kwargs.keys()).issubset(allowed_additional_kwargs): + if not set(kwargs.keys()).issubset(allowed_dataset_keys): raise ValueError( 'The dataset config contains the following extraneous keys: ' +\ - ', '.join(set(kwargs.keys()) - allowed_additional_kwargs), + ', '.join(set(kwargs.keys()) - allowed_dataset_keys), ) if hf_name is not None: @@ -542,9 +520,9 @@ def _validate_config( # Raise an error if the target_prompts + target_responses + decoder_only_format settings # are invalid if target_prompts is None: - target_prompts = _DEFAULT_TARGET_PROMPTS + target_prompts = DEFAULT_TARGET_PROMPTS if target_responses is None: - target_responses = _DEFAULT_TARGET_RESPONSES + target_responses = DEFAULT_TARGET_RESPONSES target_prompts, target_responses = target_prompts.lower( ), target_responses.lower() validate_target_settings( @@ -646,9 +624,9 @@ def build_collate_fn( dataset_cfg = dataloader_cfg['dataset'] target_responses = dataset_cfg.get( 'target_responses', - _DEFAULT_TARGET_RESPONSES, + DEFAULT_TARGET_RESPONSES, ) - target_prompts = dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS) + target_prompts = dataset_cfg.get('target_prompts', DEFAULT_TARGET_PROMPTS) 
max_seq_len = dataset_cfg['max_seq_len'] decoder_only_format = dataset_cfg['decoder_only_format'] allow_pad_trimming = dataset_cfg.get('allow_pad_trimming', False) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 397b619e73..dd9b495ce4 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -47,6 +47,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: Optional, Sequence, Tuple, + Type, Union, cast, ) @@ -115,6 +116,8 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: ) SUPPORTED_EXTENSIONS = ['.csv', '.json', '.jsonl', '.parquet'] HUGGINGFACE_FOLDER_EXTENSIONS = ['.lock', '.metadata'] +DEFAULT_TARGET_RESPONSES = 'last' +DEFAULT_TARGET_PROMPTS = 'none' PromptResponseDict = Mapping[str, str] ChatFormattedDict = Mapping[str, List[Dict[str, str]]] @@ -805,14 +808,14 @@ def build_from_hf( self, dataset_name: str, split: str, - safe_load: bool, - max_seq_len: int, - preprocessing_fn: Optional[Callable[[dict[str, Any]], Example]], - tokenizer: PreTrainedTokenizerBase, - target_prompts: str, - target_responses: str, - decoder_only_format: bool, - hf_kwargs: Dict[str, Any], + safe_load: bool = False, + max_seq_len: int = 2048, + preprocessing_fn: Optional[Callable[[dict[str, Any]], Example]] = None, + tokenizer: Optional[PreTrainedTokenizerBase] = None, + target_prompts: str = DEFAULT_TARGET_PROMPTS, + target_responses: str = DEFAULT_TARGET_RESPONSES, + decoder_only_format: bool = True, + hf_kwargs: Optional[Dict[str, Any]] = None, ) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset, hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]: """Load a HuggingFace Datasets, preprocess, and tokenize. @@ -851,6 +854,14 @@ def build_from_hf( Returns: Dataset: The tokenized dataset. """ + if hf_kwargs is None: + hf_kwargs = {} + + # None is checked in the function, because argument defaults were added after the function was written and we want + # to preserve the ordering of the arguments for backwards compatibility. + if tokenizer is None: + raise ValueError('A tokenizer must be provided.') + signal_file_path = f'.node_{dist.get_node_rank()}_local_rank0_data_prep_completed' # Non local rank 0 ranks will wait here for local rank 0 to finish the data processing. 
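The core idea of this change -- reading the set of accepted dataset config keys from a callable's signature with inspect.signature and forwarding only those keys, instead of maintaining a hand-written allow-list -- can be sketched in isolation. The sketch below is illustrative only; example_constructor, split_config, and the sample config are hypothetical names, not part of llm-foundry.

import inspect
from typing import Any, Dict, Optional, Set


def example_constructor(
    local: str,
    remote: Optional[str] = None,
    shuffle: bool = False,
) -> None:
    """Stand-in for a dataset constructor."""
    print(local, remote, shuffle)


def split_config(
    config: Dict[str, Any],
    constructor: Any,
    extra_allowed_keys: Set[str],
) -> Dict[str, Any]:
    # Keys the constructor actually accepts, read from its signature.
    constructor_keys = set(inspect.signature(constructor).parameters.keys())
    # Anything outside the signature or the explicit allow-list is an error.
    unknown_keys = set(config.keys()) - constructor_keys - extra_allowed_keys
    if unknown_keys:
        raise ValueError(f'Extraneous config keys: {sorted(unknown_keys)}')
    # Forward only the keys the constructor understands.
    return {k: v for k, v in config.items() if k in constructor_keys}


cfg = {'local': '/tmp/data', 'shuffle': True, 'packing_ratio': 2.0}
kwargs = split_config(cfg, example_constructor, extra_allowed_keys={'packing_ratio'})
example_constructor(**kwargs)

Keys handled elsewhere in the dataloader (here, packing_ratio) go in the extra allow-list rather than being forwarded, mirroring how the patch excludes keys such as streams and packing_ratio from the constructor kwargs.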
@@ -999,12 +1010,16 @@ def dataset_mapper(example: Dict): assert filtered_dataset is not None return filtered_dataset + @property + def streaming_dataset_class(self) -> Type[StreamingFinetuningDataset]: + return StreamingFinetuningDataset + def build_from_streaming( self, *args: Any, **kwargs: Any, ) -> StreamingFinetuningDataset: - return StreamingFinetuningDataset(*args, **kwargs) + return self.streaming_dataset_class(*args, **kwargs) dataset_constructor = DatasetConstructor() diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 8e92658194..1a43e12536 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -7,7 +7,7 @@ import shutil from contextlib import nullcontext as does_not_raise from pathlib import Path -from typing import ContextManager, Literal, Optional, Union +from typing import Any, Callable, ContextManager, Dict, Literal, Optional, Union from unittest.mock import MagicMock, patch import catalogue @@ -1220,6 +1220,21 @@ def test_token_counting_func_dataloader_setting( 'timeout': 0, } + def build_from_hf( + self, # type: ignore + dataset_name: str, + split: str, + safe_load: bool = False, + max_seq_len: int = 2048, + preprocessing_fn: Optional[Callable] = None, + tokenizer: transformers.PreTrainedTokenizerBase = None, + target_prompts: str = 'last', + target_responses: str = 'none', + decoder_only_format: bool = True, + hf_kwargs: Optional[Dict[str, Any]] = None, + ): + return [] + if dataloader_type == 'finetuning-hf': cfg = DictConfig({ 'dataset': { @@ -1235,8 +1250,7 @@ def test_token_counting_func_dataloader_setting( }) monkeypatch.setattr( 'llmfoundry.data.finetuning.tasks.DatasetConstructor.build_from_hf', - lambda *args, - **kwargs: [], + build_from_hf, ) dl = build_finetuning_dataloader( tokenizer=gptt, From c262341173a8ac31e8c77063d94534c8d7a9168d Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Wed, 7 Aug 2024 13:41:28 -0700 Subject: [PATCH 09/13] Remove flash patching for HF (#1436) --- llmfoundry/models/hf/hf_causal_lm.py | 23 ++++----------- tests/models/hf/test_hf_config.py | 44 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 17 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 34ce22d694..f1f38e2f7d 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -256,23 +256,6 @@ def build_inner_model( False, # Necessary due to https://github.com/huggingface/transformers/issues/28056 ) - # This is not ideal, however Hugging Face's _autoset_attn_implementation function - # forces you to load the model in fp16/bf16 if you want to use flash attention. Rather than loading - # the model and then casting it back to fp32, we are monkeypatching their check. 
- # https://github.com/huggingface/transformers/issues/28052 - def _autoset_attn_implementation_monkeypatch( - cls, # type: ignore - config, # type: ignore - *args, # type: ignore - **kwargs, # type: ignore - ): # type: ignore - config._attn_implementation = requested_attention_implementation - return config - - PreTrainedModel._autoset_attn_implementation = classmethod( - _autoset_attn_implementation_monkeypatch, - ) - set_config_overrides(config, config_overrides) # We need to have all non-zero local ranks be not-pretrained @@ -293,6 +276,8 @@ def _autoset_attn_implementation_monkeypatch( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, + attn_implementation= + requested_attention_implementation, config=config, ) else: @@ -300,6 +285,7 @@ def _autoset_attn_implementation_monkeypatch( AutoModelForCausalLM.from_config( config, trust_remote_code=trust_remote_code, + attn_implementation=requested_attention_implementation, ) dist.barrier() @@ -312,12 +298,14 @@ def _autoset_attn_implementation_monkeypatch( trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, load_in_8bit=load_in_8bit, + attn_implementation=requested_attention_implementation, config=config, ) else: model = AutoModelForCausalLM.from_config( config, trust_remote_code=trust_remote_code, + attn_implementation=requested_attention_implementation, ) elif resolved_init_device == 'meta': if pretrained: @@ -328,6 +316,7 @@ def _autoset_attn_implementation_monkeypatch( model = AutoModelForCausalLM.from_config( config, trust_remote_code=trust_remote_code, + attn_implementation=requested_attention_implementation, ) else: raise ValueError( diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index d0ec544de8..844ccd7fe5 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -7,9 +7,11 @@ from unittest.mock import Mock, patch import pytest +import torch from omegaconf import OmegaConf as om from transformers import PretrainedConfig +from llmfoundry.models.hf.hf_fsdp import rgetattr from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model @@ -235,3 +237,45 @@ def test_nested_override(): assert isinstance(model.config.ffn_config, PretrainedConfig) # Ensure the other values still exist and are not set back to their defaults assert model.config.ffn_config.moe_num_experts == 16 + + +@pytest.mark.gpu +def test_use_flash(): + model_cfg = { + 'name': 'hf_causal_lm', + 'pretrained_model_name_or_path': 'codellama/CodeLlama-7b-hf', + 'config_overrides': { + 'num_hidden_layers': 2, + 'hidden_size': 32, + 'intermediate_size': 64, + 'torch_dtype': 'bfloat16', + }, + 'pretrained': False, + 'init_device': 'cpu', + 'use_flash_attention_2': True, + } + + name = model_cfg.pop('name') + model = build_composer_model( + name=name, + cfg=model_cfg, + tokenizer=None, # type: ignore + ) + + from transformers.models.llama.modeling_llama import ( + LlamaFlashAttention2, + ) + flash_attn_class = LlamaFlashAttention2 + attention_layers_attr = 'model.model.layers' + attention_attr = 'self_attn' + + # check that it actually used flash attention 2 + assert model.model.config._attn_implementation == ('flash_attention_2') + attention_layer = rgetattr( + rgetattr(model, attention_layers_attr)[0], + attention_attr, + ) + assert isinstance(attention_layer, flash_attn_class) + + # Make sure that HF has not cast the parameters to bf16 + assert 
next(model.parameters()).dtype == torch.float32 From 0f4476d874ef1b0b4c9317b0815fec7dfe9c1161 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Wed, 7 Aug 2024 14:27:40 -0700 Subject: [PATCH 10/13] Fix the context size in long context gauntlet for wikiqa (#1439) --- scripts/eval/yamls/long_context_tasks.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval/yamls/long_context_tasks.yaml b/scripts/eval/yamls/long_context_tasks.yaml index 153e3b9df6..221635da87 100644 --- a/scripts/eval/yamls/long_context_tasks.yaml +++ b/scripts/eval/yamls/long_context_tasks.yaml @@ -105,7 +105,7 @@ icl_tasks: icl_task_type: generation_task_with_answers hf_loading_vars: name: wikiqa - context_length: 2048 + context_length: 4096 split: test - label: wikiqa_8k @@ -114,7 +114,7 @@ icl_tasks: icl_task_type: generation_task_with_answers hf_loading_vars: name: wikiqa - context_length: 2048 + context_length: 8192 split: test - label: hotpotqa_beginning_2k From f006d07ce814576adff1c36dc2d1b3e75b3ae2f6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 8 Aug 2024 05:41:56 +0000 Subject: [PATCH 11/13] Update mlflow requirement from <2.15,>=2.14.1 to >=2.14.1,<2.16 (#1424) Updates the requirements on [mlflow](https://github.com/mlflow/mlflow) to permit the latest version. - [Release notes](https://github.com/mlflow/mlflow/releases) - [Changelog](https://github.com/mlflow/mlflow/blob/master/CHANGELOG.md) - [Commits](https://github.com/mlflow/mlflow/compare/v2.14.1...v2.15.0) --- updated-dependencies: - dependency-name: mlflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 19e5cee2d6..04c28d8f70 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,7 @@ install_requires = [ 'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.23.4,<0.24', - 'mlflow>=2.14.1,<2.15', + 'mlflow>=2.14.1,<2.16', 'accelerate>=0.25,<0.34', # for HF inference `device_map` 'transformers>=4.43.2,<4.44', 'mosaicml-streaming>=0.8.0,<0.9', From 805cf83c709732e0d99b952dd51ab528f109814d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 8 Aug 2024 13:17:02 -0400 Subject: [PATCH 12/13] Add special errors for bad chat/ift types (#1437) --- llmfoundry/data/finetuning/tasks.py | 21 ++++++++------------- llmfoundry/utils/exceptions.py | 16 ++++++++++++++++ tests/data/test_template_tokenization.py | 18 ++++++++++++++++-- 3 files changed, 40 insertions(+), 15 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index dd9b495ce4..e8175b4446 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -78,8 +78,10 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: ConsecutiveRepeatedChatRolesError, IncorrectMessageKeyQuantityError, InvalidContentTypeError, + InvalidExampleTypeError, InvalidFileExtensionError, InvalidLastChatMessageRoleError, + InvalidMessageTypeError, InvalidPromptResponseKeysError, InvalidPromptTypeError, InvalidResponseTypeError, @@ -139,9 +141,7 @@ def _get_example_type(example: Example) -> ExampleType: KeyError: If the example type is unknown. 
""" if not isinstance(example, Mapping): - raise TypeError( - f'Expected example to be a Mapping, but found {type(example)}', - ) + raise InvalidExampleTypeError(str(type(example))) if ( len(example.keys()) == 1 and any( allowed_message_key in example @@ -156,7 +156,8 @@ def _get_example_type(example: Example) -> ExampleType: ): return 'prompt_response' else: - raise UnknownExampleTypeError(str(example.keys())) + keys = str(set(example.keys())) + raise UnknownExampleTypeError(keys) def _is_empty_or_nonexistent(dirpath: str) -> bool: @@ -173,23 +174,17 @@ def _is_empty_or_nonexistent(dirpath: str) -> bool: def _get_key(dictionary: Mapping[str, Any], allowed_keys: set[str]): if not isinstance(dictionary, Mapping): - raise TypeError( - f'Expected dictionary to be a mapping, but found {type(dictionary)}', - ) + raise InvalidExampleTypeError(str(type(dictionary))) desired_keys = allowed_keys.intersection(dictionary.keys()) return list(desired_keys)[0] def _validate_chat_formatted_example(example: ChatFormattedDict): if not isinstance(example, Mapping): - raise TypeError( - f'Expected example to be a mapping, but found {type(example)}', - ) + raise InvalidExampleTypeError(str(type(example))) messages = example[_get_key(example, ALLOWED_MESSAGES_KEYS)] if not isinstance(messages, List): - raise TypeError( - f'Expected messages to be an iterable, but found {type(messages)}', - ) + raise InvalidMessageTypeError(str(type(messages))) if len(messages) <= 1: raise NotEnoughChatDataError() diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index 140bf8540b..c6a667697d 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -161,6 +161,22 @@ def __init__( ## Tasks exceptions +class InvalidExampleTypeError(UserError): + """Error thrown when a message type is not a `Mapping`.""" + + def __init__(self, example_type: str) -> None: + message = f'Expected example to be a `Mapping`, but found type {example_type}' + super().__init__(message, example_type=example_type) + + +class InvalidMessageTypeError(UserError): + """Error thrown when a message type is not an `Iterable`.""" + + def __init__(self, message_type: str) -> None: + message = f'Expected message to be an `Iterable`, but found type {message_type}' + super().__init__(message, message_type=message_type) + + class UnknownExampleTypeError(UserError): """Error thrown when an unknown example type is used in a task.""" diff --git a/tests/data/test_template_tokenization.py b/tests/data/test_template_tokenization.py index 16447d6623..9f44739b6b 100644 --- a/tests/data/test_template_tokenization.py +++ b/tests/data/test_template_tokenization.py @@ -16,6 +16,8 @@ ALLOWED_PROMPT_KEYS, ALLOWED_RESPONSE_KEYS, ChatTemplateError, + InvalidExampleTypeError, + InvalidMessageTypeError, ) @@ -48,13 +50,13 @@ def test_tokenize_chat_example_malformed(): 'content': 'user message not followed by an assistant label', }], } - wrong_type = {'messages': 'this is not a list of messages'} + wrong_example_type = ['this is not a dictionary'] + wrong_messages_type = {'messages': 'this is not a list of messages'} malformed_chat_examples = [ too_few_messages, no_content, ends_with_user_role, no_assistant_message, - wrong_type, ] my_tokenizer = build_tokenizer('mosaicml/mpt-7b-8k-chat', {}) for example in malformed_chat_examples: @@ -63,6 +65,18 @@ def test_tokenize_chat_example_malformed(): example, my_tokenizer, ) # type: ignore (the typing here is supposed to be malformed) + with pytest.raises(InvalidExampleTypeError): + # Ignore 
the type here because it's the mistyping that we're + # trying to test. + tokenize_formatted_example( # type: ignore + wrong_example_type, # type: ignore + my_tokenizer, # type: ignore + ) + with pytest.raises(InvalidMessageTypeError): + tokenize_formatted_example( + wrong_messages_type, + my_tokenizer, + ) def test_tokenize_chat_example_well_formed(): From 44b09f0d5b7d844cfb913d41c82b2c35bef21112 Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Thu, 8 Aug 2024 19:13:23 -0400 Subject: [PATCH 13/13] Make autopacking faster (#1435) --- llmfoundry/data/finetuning/collator.py | 68 +++-- llmfoundry/data/finetuning/dataloader.py | 3 + llmfoundry/data/packing.py | 349 +++++++++++++---------- 3 files changed, 242 insertions(+), 178 deletions(-) diff --git a/llmfoundry/data/finetuning/collator.py b/llmfoundry/data/finetuning/collator.py index 42af7e9375..68ebb9d21d 100644 --- a/llmfoundry/data/finetuning/collator.py +++ b/llmfoundry/data/finetuning/collator.py @@ -224,6 +224,10 @@ class Seq2SeqFinetuningCollator: sizes. Default: ``False`` ensures that all sequences are max_seq_len. batch_metadata (dict, optional): A dictionary of metadata which will be added to the batch. + pad_to_longest (bool, optional): Whether to pad to the longest sequence, + which may result in smaller but inconsistent batch sizes. This is + primarily used to profile packing. + Default: ``False`` ensures that all sequences are max_seq_len. """ def __init__( @@ -235,6 +239,7 @@ def __init__( target_prompts: str = 'none', allow_pad_trimming: bool = False, batch_metadata: Optional[Dict[str, Any]] = None, + pad_to_longest: bool = False, ): self.tokenizer = tokenizer self.max_seq_len = max_seq_len @@ -247,6 +252,8 @@ def __init__( self._allow_pad_trimming = allow_pad_trimming self._seen_first_batch = False + self._pad_to_longest = pad_to_longest + illegal_keys = [ 'input_ids', 'labels', @@ -320,24 +327,34 @@ def _process_and_batch_decoder_only( ) -> Dict[str, torch.Tensor]: # Steps explained in comments processed_examples = [] - for example in examples: - input_ids, labels = stitch_turns_decoder_only( + input_ids_and_labels = [ + stitch_turns_decoder_only( example_turns=example['turns'], target_prompts=self.target_prompts, target_responses=self.target_responses, eos_token_id=self.tokenizer.eos_token_id, - ) + ) for example in examples + ] + + if self._pad_to_longest: + max_seq_len = max([ + len(input_ids) for input_ids, _ in input_ids_and_labels + ]) + max_seq_len = min(max_seq_len, self.max_seq_len) + else: + max_seq_len = self.max_seq_len + for input_ids, labels in input_ids_and_labels: orig_size = len(input_ids) # We may need to truncate the input_ids / labels in order to maintain max_seq_len - if orig_size > self.max_seq_len: - input_ids = input_ids[:self.max_seq_len] - labels = labels[:self.max_seq_len] + if orig_size > max_seq_len: + input_ids = input_ids[:max_seq_len] + labels = labels[:max_seq_len] # Check to make sure there are still loss-generating tokens. Error if not. if len([l for l in labels if l != _HF_IGNORE_INDEX]) == 0: raise ValueError( - f'Truncating to max_seq_len={self.max_seq_len} has removed all loss-generating tokens. ' +\ + f'Truncating to max_seq_len={max_seq_len} has removed all loss-generating tokens. ' +\ f'Pre-truncation sequence length was {orig_size}. ' +\ 'This sample should have been filtered out before reaching the collator. 
If using ' +\ 'pre-tokenized streaming data, this may have resulted from using different ' +\ @@ -348,7 +365,7 @@ def _process_and_batch_decoder_only( # Still issue a warning when truncating if not self._warned_truncated: warnings.warn( - f'Truncating sequence of length={orig_size} to fit max_seq_len={self.max_seq_len}. ' +\ + f'Truncating sequence of length={orig_size} to fit max_seq_len={max_seq_len}. ' +\ f'If truncation is a problem, consider increasing max_seq_len.', ) self._warned_truncated = True @@ -358,7 +375,7 @@ def _process_and_batch_decoder_only( # Annoyingly, we need to pad everything but input_ids # and attention_mask ourselves n_total = len(input_ids) - i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - n_total) + i_pad = [_HF_IGNORE_INDEX] * (max_seq_len - n_total) if self.tokenizer.padding_side == 'left': labels = i_pad + labels else: @@ -376,7 +393,7 @@ def _process_and_batch_decoder_only( batch = self.tokenizer.pad( processed_examples, padding='max_length', - max_length=self.max_seq_len, + max_length=max_seq_len, return_tensors='pt', ) @@ -410,35 +427,44 @@ def _process_and_batch_encoder_decoder( # The encoder-decoder case is has some gotchas. # Steps are explained in comments. processed_examples = [] - for example in examples: - context, target = stitch_turns_encoder_decoder( + contexts_and_targets = [ + stitch_turns_encoder_decoder( example_turns=example['turns'], eos_token_id=self.tokenizer.eos_token_id, - ) + ) for example in examples + ] + + if self._pad_to_longest: + max_seq_len = 0 + for context, target in contexts_and_targets: + max_seq_len = max(max_seq_len, len(context), len(target)) + else: + max_seq_len = self.max_seq_len + for context, target in contexts_and_targets: # We need to pad labels ourselves. Because HF. - if len(target) < self.max_seq_len: - i_pad = [_HF_IGNORE_INDEX] * (self.max_seq_len - len(target)) + if len(target) < max_seq_len: + i_pad = [_HF_IGNORE_INDEX] * (max_seq_len - len(target)) target = target + i_pad else: if not self._warned_target: warnings.warn( f'Truncating TARGET sequence of length={len(target)} ' +\ - f'to max_seq_len={self.max_seq_len}. If truncation is ' +\ + f'to max_seq_len={max_seq_len}. If truncation is ' +\ f'a problem, consider increasing max_seq_len.') self._warned_target = True - target = target[:self.max_seq_len - + target = target[:max_seq_len - 1] + [self.tokenizer.eos_token_id] # We might need to truncate the context. Preserve the beginning. - if len(context) > self.max_seq_len: + if len(context) > max_seq_len: if not self._warned_context: warnings.warn( f'Truncating CONTEXT sequence of length={len(context)} ' +\ - f'to max_seq_len={self.max_seq_len}. If truncation is ' +\ + f'to max_seq_len={max_seq_len}. 
If truncation is ' +\ f'a problem, consider increasing max_seq_len.') self._warned_context = True - context = context[:self.max_seq_len - + context = context[:max_seq_len - 1] + [self.tokenizer.eos_token_id] # Back into the example @@ -454,7 +480,7 @@ def _process_and_batch_encoder_decoder( batch = self.tokenizer.pad( processed_examples, padding='max_length', - max_length=self.max_seq_len, + max_length=max_seq_len, return_tensors='pt', ) # We're still missing decoder_input_ids and decoder_attention_mask diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 771033a703..6aecadb6bb 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -50,6 +50,7 @@ 'seq_parallel_replication', 'auto_packing_replication', 'max_leftover_bins_to_keep', + 'pad_to_longest', } @@ -630,6 +631,7 @@ def build_collate_fn( max_seq_len = dataset_cfg['max_seq_len'] decoder_only_format = dataset_cfg['decoder_only_format'] allow_pad_trimming = dataset_cfg.get('allow_pad_trimming', False) + pad_to_longest = dataset_cfg.get('pad_to_longest', False) collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, @@ -638,6 +640,7 @@ def build_collate_fn( target_responses=target_responses, target_prompts=target_prompts, allow_pad_trimming=allow_pad_trimming, + pad_to_longest=pad_to_longest, ) packing_ratio = dataset_cfg.get('packing_ratio') diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 5579066f89..77e166c474 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -20,7 +20,20 @@ class BinPackCollator: - """Utility collator for packing to reduce padding.""" + """Utility collator for packing to reduce padding. + + Args: + collator (Callable): The base collator to use. + target_batch_size (int): The number of bins. + max_seq_len(int): The maximum sequence length of a bin. + pad_token_id (int): The padding token id. + padding_side (Literal['left', 'right']): The side to pad on. + max_leftover_bins_to_keep (Optional[int]): The number of leftover bins + to keep. + is_profiling (bool): Whether the collator is being used for profiling. + In profiling mode, packing and padding the example tensors is + avoided for efficiency, and the returned batch will be None. 
+ """ def __init__( self, @@ -30,6 +43,7 @@ def __init__( pad_token_id: int, padding_side: Literal['left', 'right'], max_leftover_bins_to_keep: Optional[int] = None, + is_profiling: bool = False, ): self.base_collator = collator self.out_size = int(target_batch_size) @@ -56,6 +70,8 @@ def __init__( self._leftover_bins: List[Tuple[int, Dict[str, torch.Tensor]]] = [] + self._is_profiling = is_profiling + @property def waste(self) -> float: return 1 - (self.n_packed_tokens / self.n_total_tokens) @@ -74,6 +90,9 @@ def __call__( return self.pack(batch) def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + if self._is_profiling: + raise ValueError('Cannot pack in profiling mode.') + assert 'attention_mask' in batch assert 'input_ids' in batch @@ -86,13 +105,15 @@ def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: ] # Cut everything down to size sizes, trimmed_examples = _trim_batch(batch) - return self._pack_trimmed_examples(trimmed_examples, sizes) + packed_batch = self._pack_trimmed_examples(trimmed_examples, sizes) + assert packed_batch is not None + return packed_batch def _pack_trimmed_examples( self, trimmed_examples: List[Dict[str, torch.Tensor]], sizes: List[int], - ) -> Dict[str, torch.Tensor]: + ) -> Optional[Dict[str, torch.Tensor]]: """Packs trimmed examples into fixed-size bins and repads them. Args: @@ -100,10 +121,12 @@ def _pack_trimmed_examples( sizes (List[int]): The sizes of the trimmed examples. Returns: - Dict[str, torch.Tensor]: A batch of repadded examples ready for processing + Optional[Dict[str, torch.Tensor]]: A batch of repadded examples + ready for processing. If the collator is in profiling mode, + returns None. """ # Apply our CS 101 bin packing algorithm. - packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = _first_fit_bin_packing( + packed_examples, n_packed_tokens, n_total_tokens, leftover_bins = self._first_fit_bin_packing( sizes=sizes, examples=trimmed_examples, num_bins=self.out_size, @@ -115,15 +138,145 @@ def _pack_trimmed_examples( self.n_packed_examples += self.out_size self._leftover_bins = leftover_bins[:self.max_leftover_bins_to_keep] + if self._is_profiling: + return None + # Re-pad to max_seq_len and batch - batch = _repad( - packed_examples, - max_seq_len=self.max_seq_len, - pad_token_id=self.pad_token_id, - padding_side=self.padding_side, - ) + batch = self._convert_to_batch(packed_examples) + return batch + + def _convert_to_batch( + self, + packed_examples: List[Dict[str, torch.Tensor]], + ) -> Dict[str, torch.Tensor]: + + pad_vals = { + 'input_ids': self.pad_token_id, + 'labels': -100, + 'attention_mask': 0, + 'sequence_id': -1, + } + keys = packed_examples[0].keys() + batch = {} + for key in keys: + batch[key] = torch.stack([ + _pad_tensor( + example[key], + pad_vals[key], + self.max_seq_len, + self.padding_side, + ) for example in packed_examples + ]) return batch + def _first_fit_bin_packing( + self, + sizes: List[int], + examples: List[Dict[str, torch.Tensor]], + num_bins: int, + max_bin_size: int, + existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]], + ) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[ + str, torch.Tensor]]]]: + + # Will contain tuples (bin_size_size, packed_example) + bins: List[Tuple[int, Dict[str, torch.Tensor]]] = existing_bins + + starting_total_bin_sizes = sum([bin_size for bin_size, _ in bins]) + + sizes_and_examples = list(zip(sizes, examples)) + sorted_sizes_and_examples = sorted( + sizes_and_examples, + key=lambda x: x[0], + 
reverse=True, + ) + + required_num_examples = max(0, num_bins - len(bins)) + num_examples = len(sizes) + if num_examples < required_num_examples: + for size, example in sorted_sizes_and_examples: + # Can't keep packing. All remaining items get their own bin. + bins.append((size, example)) + + total_bin_sizes = sum([bin_size for bin_size, _ in bins]) + total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes + total_example_sizes = sum(sizes) + if total_new_bin_sizes != total_example_sizes: + raise AssertionError( + f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.', + ) + + sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) + bin_sizes, packed_examples = [], [] + for bin_size, packed_example in sorted_bins: + bin_sizes.append(bin_size) + packed_examples.append(packed_example) + + # Return: + # - the num_bins largest packed examples + # - the total tokens in those examples + # - the total size of all new examples + # - leftover bins + return packed_examples[:num_bins], sum( + bin_sizes[:num_bins], + ), sum(sizes), sorted_bins[num_bins:] + + # Go through each item from longest to shortest. + # Note: all items will either go into an existing or new bin. + for i, (size, example) in enumerate(sorted_sizes_and_examples): + # If we can't keep packing, all remaining items get their own bin. + required_num_examples = max(0, num_bins - len(bins)) + n_remaining = num_examples - i + assert n_remaining >= required_num_examples + if n_remaining == required_num_examples: + # Can't keep packing. All remaining items get their own bin. + bins.append((size, example)) + continue + + # Add it to the first bin it fits in + added = False + for bidx in range(len(bins)): + if bins[bidx][0] + size <= max_bin_size: + bin_size, packed_example = bins.pop(bidx) + bin_size = bin_size + size + # When profiling, don't combine the tensor for efficiency + # and return the first example which isn't "correct" + if not self._is_profiling: + packed_example = _combine_in_place( + packed_example, + example, + ) + + bins.append((bin_size, packed_example)) + added = True + break + # If it didn't fit anywhere, open a new bin + if not added: + bins.append((size, example)) + + total_bin_sizes = sum([bin_size for bin_size, _ in bins]) + total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes + total_example_sizes = sum(sizes) + if total_new_bin_sizes != total_example_sizes: + raise AssertionError( + f'Error in packing. 
{total_example_sizes=} does not equal {total_new_bin_sizes=}.', + ) + + sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) + bin_sizes, packed_examples = [], [] + for bin_size, packed_example in sorted_bins: + bin_sizes.append(bin_size) + packed_examples.append(packed_example) + + # Return: + # - the num_bins largest packed examples + # - the total tokens in those examples + # - the total size of all new examples + # - leftover bins + return packed_examples[:num_bins], sum( + bin_sizes[:num_bins], + ), sum(sizes), sorted_bins[num_bins:] + def _trim_batch( batch: Dict[str, torch.Tensor], @@ -177,143 +330,25 @@ def _combine_in_place( return example -def _first_fit_bin_packing( - sizes: List[int], - examples: List[Dict[str, torch.Tensor]], - num_bins: int, - max_bin_size: int, - existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]], -) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[ - str, torch.Tensor]]]]: - - # Will contain tuples (bin_size_size, packed_example) - bins: List[Tuple[int, Dict[str, torch.Tensor]]] = existing_bins - - starting_total_bin_sizes = sum([bin_size for bin_size, _ in bins]) - - sizes_and_examples = list(zip(sizes, examples)) - sorted_sizes_and_examples = sorted( - sizes_and_examples, - key=lambda x: x[0], - reverse=True, - ) - - required_num_examples = max(0, num_bins - len(bins)) - num_examples = len(sizes) - if num_examples < required_num_examples: - for size, example in sorted_sizes_and_examples: - # Can't keep packing. All remaining items get their own bin. - bins.append((size, example)) - - total_bin_sizes = sum([bin_size for bin_size, _ in bins]) - total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes - total_example_sizes = sum(sizes) - if total_new_bin_sizes != total_example_sizes: - raise AssertionError( - f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.', - ) - - sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) - bin_sizes, packed_examples = [], [] - for bin_size, packed_example in sorted_bins: - bin_sizes.append(bin_size) - packed_examples.append(packed_example) - - # Return: - # - the num_bins largest packed examples - # - the total tokens in those examples - # - the total size of all new examples - # - leftover bins - return packed_examples[:num_bins], sum( - bin_sizes[:num_bins], - ), sum(sizes), sorted_bins[num_bins:] - - # Go through each item from longest to shortest. - # Note: all items will either go into an existing or new bin. - for i, (size, example) in enumerate(sorted_sizes_and_examples): - # If we can't keep packing, all remaining items get their own bin. - required_num_examples = max(0, num_bins - len(bins)) - n_remaining = num_examples - i - assert n_remaining >= required_num_examples - if n_remaining == required_num_examples: - # Can't keep packing. All remaining items get their own bin. 
- bins.append((size, example)) - continue - - # Add it to the first bin it fits in - added = False - for bidx in range(len(bins)): - if bins[bidx][0] + size <= max_bin_size: - bin_size, packed_example = bins.pop(bidx) - bin_size = bin_size + size - packed_example = _combine_in_place(packed_example, example) - bins.append((bin_size, packed_example)) - added = True - break - # If it didn't fit anywhere, open a new bin - if not added: - bins.append((size, example)) - - total_bin_sizes = sum([bin_size for bin_size, _ in bins]) - total_new_bin_sizes = total_bin_sizes - starting_total_bin_sizes - total_example_sizes = sum(sizes) - if total_new_bin_sizes != total_example_sizes: - raise AssertionError( - f'Error in packing. {total_example_sizes=} does not equal {total_new_bin_sizes=}.', - ) - - sorted_bins = sorted(bins, key=lambda x: x[0], reverse=True) - bin_sizes, packed_examples = [], [] - for bin_size, packed_example in sorted_bins: - bin_sizes.append(bin_size) - packed_examples.append(packed_example) - - # Return: - # - the num_bins largest packed examples - # - the total tokens in those examples - # - the total size of all new examples - # - leftover bins - return packed_examples[:num_bins], sum( - bin_sizes[:num_bins], - ), sum(sizes), sorted_bins[num_bins:] - - -def _repad( - packed_examples: List[Dict[str, torch.Tensor]], +def _pad_tensor( + tensor: torch.Tensor, + pad_value: int, max_seq_len: int, - pad_token_id: int, padding_side: str, -) -> Dict[str, torch.Tensor]: - - def pad_tensor(tensor: torch.Tensor, pad_value: int): - if len(tensor) == max_seq_len: - return tensor - t = torch.full((max_seq_len,), - pad_value, - dtype=tensor.dtype, - device=tensor.device) - if padding_side == 'left': - t[-len(tensor):] = tensor - elif padding_side == 'right': - t[:len(tensor)] = tensor - else: - raise ValueError(f'Unknown {padding_side=}') - return t - - pad_vals = { - 'input_ids': pad_token_id, - 'labels': -100, - 'attention_mask': 0, - 'sequence_id': -1, - } - keys = packed_examples[0].keys() - batch = {} - for key in keys: - batch[key] = torch.stack([ - pad_tensor(example[key], pad_vals[key]) - for example in packed_examples - ]) - return batch +) -> torch.Tensor: + if len(tensor) == max_seq_len: + return tensor + t = torch.full((max_seq_len,), + pad_value, + dtype=tensor.dtype, + device=tensor.device) + if padding_side == 'left': + t[-len(tensor):] = tensor + elif padding_side == 'right': + t[:len(tensor)] = tensor + else: + raise ValueError(f'Unknown {padding_side=}') + return t def auto_packing_ratio( @@ -428,24 +463,23 @@ def profile_packing( 'prefetch_factor': None, 'persistent_workers': False, }) - dataloader_cfg['dataset']['packing_ratio'] = 1.0 - dataloader_cfg['dataset']['auto_packing_replication' - ] = dataloader_cfg['dataset'].get( - 'seq_parallel_replication', - 1, - ) or 1 - dataloader_cfg['dataset']['seq_parallel_replication'] = 1 + dataset_cfg = dataloader_cfg['dataset'] + dataset_cfg['packing_ratio'] = 1.0 + seq_parallel_replication = dataset_cfg.get('seq_parallel_replication', 1) + dataset_cfg['auto_packing_replication'] = seq_parallel_replication or 1 + dataset_cfg['seq_parallel_replication'] = 1 + dataset_cfg['pad_to_longest'] = True # If streaming dataset, use a temporary local folder for profiling local_rank_zero = dist.get_global_rank() - dist.get_local_rank() - if dataloader_cfg['dataset'].get('remote') is not None: + if dataset_cfg.get('remote') is not None: tmp_path_to_broadcast = tempfile.TemporaryDirectory().name gathered_paths = 
dist.all_gather_object(tmp_path_to_broadcast) tmp_path = gathered_paths[local_rank_zero] - dataloader_cfg['dataset']['local'] = tmp_path + dataset_cfg['local'] = tmp_path - if dataloader_cfg['dataset'].get('streams') is not None: - for stream_config in dataloader_cfg['dataset']['streams'].values(): + if dataset_cfg.get('streams') is not None: + for stream_config in dataset_cfg['streams'].values(): tmp_path_to_broadcast = tempfile.TemporaryDirectory().name gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) tmp_path = gathered_paths[local_rank_zero] @@ -492,6 +526,7 @@ def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: pad_token_id=0, # <-- Doesn't need to be correct for profiling padding_side='left', # <-- Doesn't need to be correct for profiling max_leftover_bins_to_keep=max_leftovers_to_keep, + is_profiling=True, ) # Simulate feeding the packing collator a bunch of data
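To make the packing logic above concrete, here is a simplified, self-contained sketch of the first-fit-decreasing strategy that BinPackCollator applies, operating on sequence lengths only rather than on token tensors. It illustrates the algorithm under simplified assumptions and is not the production implementation, which also carries leftover bins across batches and, in profiling mode, skips tensor packing and padding entirely.

from typing import List, Tuple


def first_fit_pack(sizes: List[int], max_bin_size: int) -> List[List[int]]:
    """Greedily pack sequence lengths into bins of at most max_bin_size tokens."""
    bins: List[Tuple[int, List[int]]] = []  # (tokens used, packed lengths)
    for size in sorted(sizes, reverse=True):  # longest sequences first
        for idx, (used, lengths) in enumerate(bins):
            if used + size <= max_bin_size:
                bins[idx] = (used + size, lengths + [size])
                break
        else:
            # Did not fit in any existing bin; open a new one.
            bins.append((size, [size]))
    return [lengths for _, lengths in bins]


lengths = [1800, 900, 700, 400, 300, 120]
packed = first_fit_pack(lengths, max_bin_size=2048)
# Fraction of the padded batch that would still be padding after packing.
padding_waste = 1 - sum(lengths) / (len(packed) * 2048)
print(packed)
print(f'bins={len(packed)} padding_waste={padding_waste:.2%}')

With these hypothetical lengths the six sequences pack into three bins instead of six, which is the effect profile_packing measures when it searches for a good packing_ratio.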