From 67928cb4d5def4996afe8bc91d2be3bbe42b9aba Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 6 Jun 2024 02:01:54 -0400 Subject: [PATCH 01/16] Fix MPT HF conversion (#1257) --- llmfoundry/utils/huggingface_hub_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llmfoundry/utils/huggingface_hub_utils.py b/llmfoundry/utils/huggingface_hub_utils.py index 3f7b3a0f55..3038014d7f 100644 --- a/llmfoundry/utils/huggingface_hub_utils.py +++ b/llmfoundry/utils/huggingface_hub_utils.py @@ -280,6 +280,9 @@ def edit_files_for_hf_compatibility( for f in files_processed_and_queued } for entrypoint in entrypoint_files: + file_path = os.path.join(folder, entrypoint) + if not os.path.exists(file_path): + continue existing_relative_imports = get_all_relative_imports( os.path.join(folder, entrypoint), ) From 3966f0efe5f6c216834a8ed5f5e319d9335fe49b Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 6 Jun 2024 03:40:12 -0400 Subject: [PATCH 02/16] remove warning (#1258) --- llmfoundry/data/utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llmfoundry/data/utils.py b/llmfoundry/data/utils.py index a5fe3a1022..206e884f70 100644 --- a/llmfoundry/data/utils.py +++ b/llmfoundry/data/utils.py @@ -26,14 +26,6 @@ def _validate_cfg( eos_token_id = dataset_cfg.get('eos_token_id', None) bos_token_id = dataset_cfg.get('bos_token_id', None) - if eos_token_id is None and bos_token_id is None and ( - hasattr(tokenizer, 'eos_token_id') or - hasattr(tokenizer, 'bos_token_id') - ): - log.warning( - 'The user has not provided an eos_token_id or bos_token_id, but the tokenizer has an eos_token_id or a bos_token_id.', - ) - tokenizer_eos_token_id = getattr(tokenizer, 'eos_token_id', None) if eos_token_id is not None and eos_token_id != tokenizer_eos_token_id: eos_mismatch_str = f'Provided {eos_token_id=} does not match the eos_token_id of the tokenizer={tokenizer_eos_token_id}.' 
From 42c2d9a003d697a060eae76c0bf54a0ffbf7722a Mon Sep 17 00:00:00 2001 From: Saaketh Narayan Date: Thu, 6 Jun 2024 11:52:32 -0700 Subject: [PATCH 03/16] Adding more token encoding types (#1254) * add more token encoing types * add more token encoing types * add tests * add tests * ft support, tests * linting is shortening my lifespan * linting is shortening my lifespan * long tensor * long tensor * long tensor * feedbacc * import * import --------- Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/data/__init__.py | 9 +- llmfoundry/data/data.py | 50 ++++- llmfoundry/data/finetuning/tasks.py | 36 +-- llmfoundry/data/text_data.py | 47 +++- scripts/data_prep/README.md | 17 ++ scripts/data_prep/convert_dataset_hf.py | 12 +- scripts/data_prep/convert_dataset_json.py | 30 +-- scripts/data_prep/convert_text_to_mds.py | 13 +- .../data_prep/test_convert_text_to_mds.py | 3 +- tests/data/test_data_encodings.py | 205 ++++++++++++++++++ tests/data/test_dataloader.py | 6 +- 11 files changed, 350 insertions(+), 78 deletions(-) create mode 100644 tests/data/test_data_encodings.py diff --git a/llmfoundry/data/__init__.py b/llmfoundry/data/__init__.py index 966ca90c86..5710be0c55 100644 --- a/llmfoundry/data/__init__.py +++ b/llmfoundry/data/__init__.py @@ -1,7 +1,12 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from llmfoundry.data.data import ConcatTokensDataset, NoConcatDataset +from llmfoundry.data.data import ( + SUPPORTED_MDS_ENCODING_TYPES, + ConcatTokensDataset, + NoConcatDataset, + stream_remote_local_validate, +) from llmfoundry.data.dataloader import build_dataloader from llmfoundry.data.finetuning import ( Seq2SeqFinetuningCollator, @@ -55,4 +60,6 @@ 'auto_packing_ratio', 'profile_packing', 'ConcatenatedSequenceCollatorWrapper', + 'stream_remote_local_validate', + 'SUPPORTED_MDS_ENCODING_TYPES', ] diff --git a/llmfoundry/data/data.py b/llmfoundry/data/data.py index 04eb6d345d..bde68a6998 100644 --- a/llmfoundry/data/data.py +++ b/llmfoundry/data/data.py @@ -5,16 +5,31 @@ import os import warnings from abc import ABC, abstractmethod -from typing import Dict, Iterable, Union +from typing import Dict, Iterable, Optional, Union import datasets as hf_datasets import numpy as np +from numpy.typing import NDArray from torch.utils.data import IterableDataset from transformers import PreTrainedTokenizerBase __all__ = [ + 'AbstractConcatTokensDataset', 'ConcatTokensDataset', 'NoConcatDataset', + 'stream_remote_local_validate', + 'SUPPORTED_MDS_ENCODING_TYPES', +] + +SUPPORTED_MDS_ENCODING_TYPES = [ + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + 'uint16', + 'uint32', + 'uint64', ] @@ -97,14 +112,14 @@ def __init__( ) @abstractmethod - def __iter__(self) -> Iterable[Dict[str, bytes]]: + def __iter__(self) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]: pass class ConcatTokensDataset(AbstractConcatTokensDataset): """An IterableDataset that returns token samples for MDSWriter. 
- Returns dicts of {'tokens': bytes} + Returns dicts of {'tokens': ndarray:int32} To use data created by this class and written to MDS format: @@ -119,7 +134,7 @@ class ConcatTokensDataset(AbstractConcatTokensDataset): # note, you need to copy the numpy array because the original is non-writeable # and torch does not support non-writeable tensors, so you get a scary warning and # if you do try to write to the tensor you get undefined behavior - tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int64).copy()) + tokens = torch.from_numpy(np.frombuffer(ds[0]['tokens'], dtype=np.int32).copy()) print(tokenizer.decode(tokens)) ``` """ @@ -136,7 +151,7 @@ def __init__( self.hf_dataset = hf_dataset super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) - def __iter__(self) -> Iterable[Dict[str, bytes]]: + def __iter__(self) -> Iterable[Dict[str, NDArray]]: buffer = [] for sample in self.hf_dataset: encoded = self.tokenizer( @@ -150,6 +165,27 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]: concat_sample = buffer[:self.max_length] buffer = buffer[self.max_length:] if self.should_wrap else [] yield { - # convert to bytes to store in MDS binary format - 'tokens': np.asarray(concat_sample).tobytes(), + # convert to ndarray to store in MDS format + 'tokens': np.asarray(concat_sample, dtype=np.int32), } + + +def stream_remote_local_validate( + remote: Optional[str], + local: Optional[str], + split: Optional[str], +): + """Check that, if needed, the local/split directory exists. + + Args: + remote (Optional[str]): Remote path to the dataset. + local (Optional[str]): Local path to the dataset. + split (Optional[str]): Subdirectory specifying which dataset split to use, if any. + """ + if remote is None or (local == remote): + if local is not None and os.path.isdir(local): + contents = set(os.listdir(local)) + if split is not None and split not in contents: + raise ValueError( + f'Local directory {local} does not contain split {split}', + ) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index b7cce4d20a..40f178fb6e 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -59,6 +59,10 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: from streaming import Stream, StreamingDataset from transformers import PreTrainedTokenizerBase +from llmfoundry.data import ( + SUPPORTED_MDS_ENCODING_TYPES, + stream_remote_local_validate, +) from llmfoundry.data.finetuning.collator import ( _HF_IGNORE_INDEX, stitch_turns_decoder_only, @@ -494,26 +498,15 @@ def is_valid_ift_example( return True -def _stream_remote_local_validate( - remote: Optional[str], - local: Optional[str], - split: Optional[str], -): - if remote is None or (local == remote): - if local is not None and os.path.isdir(local): - contents = set(os.listdir(local)) - if split is not None and split not in contents: - raise ValueError( - f'Local directory {local} does not contain split {split}', - ) - - class StreamingFinetuningDataset(StreamingDataset): """Finetuning dataset with flexible tokenization using StreamingDataset. Args: tokenizer (Tokenizer): The name of the HuggingFace tokenizer to use to tokenize samples. + token_encoding_type (str): The encoding type of the tokenized samples. This is only used + for legacy datasets that have been written directly as 'bytes' instead of numpy + arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'. 
streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from, which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or ``remote``/``local``. Defaults to ``None``. @@ -574,6 +567,7 @@ class StreamingFinetuningDataset(StreamingDataset): def __init__( self, tokenizer: PreTrainedTokenizerBase, + token_encoding_type: str = 'int64', streams: Optional[Sequence[Stream]] = None, local: Optional[str] = None, remote: Optional[str] = None, @@ -606,11 +600,17 @@ def __init__( f'StreamingFinetuningDataset() got an unexpected keyword argument: {kwargs}', ) + if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES: + raise ValueError( + f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}', + ) + self.token_encoding_type = token_encoding_type + if streams is None: - _stream_remote_local_validate(remote, local, split) + stream_remote_local_validate(remote, local, split) else: for stream in streams: - _stream_remote_local_validate( + stream_remote_local_validate( stream.remote, stream.local, split, @@ -656,11 +656,11 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: if isinstance(sample['input_ids'], bytes): sample['input_ids'] = np.frombuffer( sample['input_ids'], - dtype=np.int64, + dtype=getattr(np, self.token_encoding_type), )[:self.max_seq_len].tolist().copy() sample['labels'] = np.frombuffer( sample['labels'], - dtype=np.int64, + dtype=getattr(np, self.token_encoding_type), )[:self.max_seq_len].tolist().copy() elif isinstance(sample['input_ids'], np.ndarray): sample['input_ids'] = sample['input_ids'][:self.max_seq_len diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 60b81cd145..86d5edbaf4 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -4,7 +4,6 @@ """Build a StreamingTextDataset dataset and dataloader for training.""" import inspect -import os from itertools import islice from typing import ( Any, @@ -25,6 +24,10 @@ from transformers import PreTrainedTokenizerBase from llmfoundry import registry +from llmfoundry.data import ( + SUPPORTED_MDS_ENCODING_TYPES, + stream_remote_local_validate, +) from llmfoundry.utils.registry_utils import construct_from_registry __all__ = [ @@ -41,6 +44,9 @@ class StreamingTextDataset(StreamingDataset): tokenizer (Tokenizer): HuggingFace tokenizer to tokenize samples. max_seq_len (int): The max sequence length of each sample. + token_encoding_type (str): The encoding type of the tokenized samples. This is only used + for legacy datasets that have been written directly as 'bytes' instead of numpy + arrays. Types are auto-inferred for numpy arrays. Defaults to 'int64'. streams (Sequence[Stream], optional): One or more Streams to stream/cache samples from, which may be upsampled or downsampled. StreamingDataset uses either ``streams`` or ``remote``/``local``. Defaults to ``None``. 
@@ -106,6 +112,7 @@ def __init__( self, tokenizer: PreTrainedTokenizerBase, max_seq_len: int, + token_encoding_type: str = 'int64', streams: Optional[Sequence[Stream]] = None, remote: Optional[str] = None, local: Optional[str] = None, @@ -137,13 +144,21 @@ def __init__( f'StreamingTextDataset() got an unexpected keyword argument: {kwargs}', ) - if local is not None and (remote is None or (local == remote)): - if os.path.isdir(local): - contents = set(os.listdir(local)) - if split not in contents: - raise ValueError( - f'local directory {local} does not contain split {split}', - ) + if token_encoding_type not in SUPPORTED_MDS_ENCODING_TYPES: + raise ValueError( + f'The token_encoding_type must be one of {SUPPORTED_MDS_ENCODING_TYPES}, but got {token_encoding_type}', + ) + self.token_encoding_type = token_encoding_type + + if streams is None: + stream_remote_local_validate(remote, local, split) + else: + for stream in streams: + stream_remote_local_validate( + stream.remote, + stream.local, + split, + ) # TODO: discover where yamls are being converted incorrect, but temporary workaround if isinstance(shuffle_block_size, float): @@ -197,10 +212,18 @@ def _read_binary_tokenized_sample( self, sample: Dict[str, Any], ) -> torch.Tensor: - return torch.from_numpy( - np.frombuffer(sample['tokens'], - dtype=np.int64)[:self.max_seq_len].copy(), - ) + # Modeling code still expects int64 tensors. + if isinstance(sample['tokens'], np.ndarray): + return torch.from_numpy( + sample['tokens'][:self.max_seq_len].copy(), + ).to(torch.int64) + else: + return torch.from_numpy( + np.frombuffer( + sample['tokens'], + dtype=getattr(np, self.token_encoding_type), + )[:self.max_seq_len].copy(), + ).to(torch.int64) # How to process a sample def __getitem__(self, diff --git a/scripts/data_prep/README.md b/scripts/data_prep/README.md index 7881298b2f..3601cc865f 100644 --- a/scripts/data_prep/README.md +++ b/scripts/data_prep/README.md @@ -35,6 +35,23 @@ python convert_dataset_json.py \ Where `--path` can be a single json file, or a folder containing json files. `--split` denotes the intended split (hf defaults to `train`). +### Raw text files + +Using the `convert_text_to_mds.py` script, we convert a [text file](https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt) containing the complete works of William Shakespeare. + + +```bash +# Convert json dataset to StreamingDataset format +mkdir shakespeare && cd shakespeare +curl -O https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt +cd .. 
+python convert_text_to_mds.py \ + --output_folder my-copy-shakespeare \ + --input_folder shakespeare \ + --concat_tokens 2048 --tokenizer EleutherAI/gpt-neox-20b \ + --compression zstd +``` + ## Converting a finetuning dataset Using the `convert_finetuning_dataset.py` script you can run a command such as: diff --git a/scripts/data_prep/convert_dataset_hf.py b/scripts/data_prep/convert_dataset_hf.py index d7aaa52193..bf7f145610 100644 --- a/scripts/data_prep/convert_dataset_hf.py +++ b/scripts/data_prep/convert_dataset_hf.py @@ -12,6 +12,8 @@ import datasets as hf_datasets import psutil +import torch +from numpy.typing import NDArray from streaming import MDSWriter from torch.utils.data import DataLoader, Dataset, IterableDataset from tqdm import tqdm @@ -338,7 +340,7 @@ def build_dataloader( def generate_samples( loader: DataLoader, truncate_num_samples: Optional[int] = None, -) -> Iterable[Dict[str, bytes]]: +) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]: """Generator over samples of a dataloader. Args: @@ -356,7 +358,11 @@ def generate_samples( if truncate_num_samples is not None and n_samples == truncate_num_samples: return n_samples += 1 - yield {k: v[idx] for k, v in batch.items()} + yield { + k: + v[idx].numpy() if isinstance(v[idx], torch.Tensor) else v[idx] + for k, v in batch.items() + } def main(args: Namespace) -> None: @@ -377,7 +383,7 @@ def main(args: Namespace) -> None: tokenizer = build_tokenizer(args.tokenizer, args.tokenizer_kwargs) # we will enforce length, so suppress warnings about sequences too long for the model tokenizer.model_max_length = int(1e30) - columns = {'tokens': 'bytes'} + columns = {'tokens': 'ndarray:int32'} else: mode = ConcatMode.NO_CONCAT tokenizer = None diff --git a/scripts/data_prep/convert_dataset_json.py b/scripts/data_prep/convert_dataset_json.py index fb117ddef3..37b0465692 100644 --- a/scripts/data_prep/convert_dataset_json.py +++ b/scripts/data_prep/convert_dataset_json.py @@ -6,11 +6,11 @@ from argparse import ArgumentParser, Namespace from enum import Enum from glob import glob -from typing import Dict, Iterable, Optional +from typing import Optional import datasets as hf_datasets from streaming import MDSWriter -from torch.utils.data import DataLoader, IterableDataset +from torch.utils.data import IterableDataset from tqdm import tqdm from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -140,30 +140,6 @@ def build_hf_dataset( return dataset -def generate_samples( - loader: DataLoader, - truncate_num_samples: Optional[int] = None, -) -> Iterable[Dict[str, bytes]]: - """Generator over samples of a dataloader. - - Args: - loader (DataLoader): A dataloader emitting batches like {key: [sample0_bytes, sample1_bytes, sample2_bytes, ...]} - truncate_num_samples (Optional[int]): An optional # of samples to stop at. - - Yields: - Sample dicts. - """ - n_samples = 0 - for batch in loader: - keys = list(batch.keys()) - current_bs = len(batch[keys[0]]) - for idx in range(current_bs): - if truncate_num_samples is not None and n_samples == truncate_num_samples: - return - n_samples += 1 - yield {k: v[idx] for k, v in batch.items()} - - def main(args: Namespace) -> None: """Main: create C4/pile streaming dataset. 
@@ -175,7 +151,7 @@ def main(args: Namespace) -> None: tokenizer = AutoTokenizer.from_pretrained(args.tokenizer) # we will enforce length, so suppress warnings about sequences too long for the model tokenizer.model_max_length = int(1e30) - columns = {'tokens': 'bytes'} + columns = {'tokens': 'ndarray:int32'} else: mode = ConcatMode.NO_CONCAT tokenizer = None diff --git a/scripts/data_prep/convert_text_to_mds.py b/scripts/data_prep/convert_text_to_mds.py index 365cc9b71d..b2f0b0e7b4 100644 --- a/scripts/data_prep/convert_text_to_mds.py +++ b/scripts/data_prep/convert_text_to_mds.py @@ -18,6 +18,7 @@ maybe_create_object_store_from_uri, parse_uri, ) +from numpy.typing import NDArray from streaming import MDSWriter from tqdm import tqdm from transformers import AutoTokenizer, PreTrainedTokenizerBase @@ -42,7 +43,7 @@ class ConcatTokensFromFilesDataset(AbstractConcatTokensDataset): """An IterableDataset that returns token samples for MDSWriter from files. - Returns dicts of {'tokens': bytes} + Returns dicts of {'tokens': ndarray:int32} Each file is considered a sequence. """ @@ -59,7 +60,7 @@ def __init__( self.files = files super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) - def __iter__(self) -> Iterable[Dict[str, bytes]]: + def __iter__(self) -> Iterable[Dict[str, NDArray]]: buffer = [] for file in self.files: @@ -87,7 +88,9 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]: concat_sample = buffer[:self.max_length] buffer = buffer[self. max_length:] if self.should_wrap else [] - yield {'tokens': np.asarray(concat_sample).tobytes()} + yield { + 'tokens': np.asarray(concat_sample, dtype=np.int32), + } first_chunk = False @@ -98,7 +101,7 @@ def __iter__(self) -> Iterable[Dict[str, bytes]]: while len(buffer) >= self.max_length: concat_sample = buffer[:self.max_length] buffer = buffer[self.max_length:] if self.should_wrap else [] - yield {'tokens': np.asarray(concat_sample).tobytes()} + yield {'tokens': np.asarray(concat_sample, dtype=np.int32)} def parse_args() -> Namespace: @@ -356,7 +359,7 @@ def download_and_convert( no_wrap=no_wrap, ) - columns = {'tokens': 'bytes'} + columns = {'tokens': 'ndarray:int32'} log.info('Converting to MDS format...') with MDSWriter( diff --git a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index df4309e13d..8dac151f55 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -9,7 +9,6 @@ from typing import Callable, Iterable, List from unittest.mock import Mock, patch -import numpy as np import pytest from streaming import StreamingDataset from transformers import AutoTokenizer @@ -194,7 +193,7 @@ def call_convert_text_to_mds() -> None: n_tokens = 0 for i in range(dataset.num_samples): sample = dataset[i] - tokens = np.frombuffer(sample['tokens'], dtype=int) + tokens = sample['tokens'] if i == 0: # For the first sample, check that the decoded sample matches the text_content decoded = tokenizer.decode(tokens) assert decoded == text_content[:len(decoded)] diff --git a/tests/data/test_data_encodings.py b/tests/data/test_data_encodings.py new file mode 100644 index 0000000000..a45bfbcb88 --- /dev/null +++ b/tests/data/test_data_encodings.py @@ -0,0 +1,205 @@ +# Copyright 2022 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 +import pathlib + +import numpy as np +import pytest +import torch +from streaming import MDSWriter + +from llmfoundry.data import SUPPORTED_MDS_ENCODING_TYPES, 
StreamingTextDataset +from llmfoundry.data.finetuning.tasks import StreamingFinetuningDataset + + +@pytest.mark.parametrize( + 'token_encoding_type', + SUPPORTED_MDS_ENCODING_TYPES + ['default'], +) +@pytest.mark.parametrize('use_bytes', [True, False]) +@pytest.mark.parametrize('samples', [10]) +@pytest.mark.parametrize('max_seq_len', [2048]) +def test_encoding_types_text( + tmp_path: pathlib.Path, + token_encoding_type: str, + use_bytes: bool, + samples: int, + max_seq_len: int, +): + dataset_local_path = str(tmp_path) + if token_encoding_type != 'default': + encoding_dtype = getattr(np, token_encoding_type) + else: + encoding_dtype = None + + if use_bytes: + columns = { + 'tokens': 'bytes', + } + else: + columns = { + 'tokens': + 'ndarray:' + token_encoding_type + if token_encoding_type != 'default' else 'ndarray', + } + + with MDSWriter(out=dataset_local_path, columns=columns) as writer: + for _ in range(samples): + if token_encoding_type != 'default': + tokens = np.random.randint( + 0, + np.iinfo(encoding_dtype).max, + max_seq_len, + dtype=encoding_dtype, + ) + else: + tokens = np.random.randint( + 0, + 200, + max_seq_len, + ) + if use_bytes: + tokens = tokens.tobytes() + writer.write({'tokens': tokens}) + + if use_bytes and token_encoding_type != 'default': + dataset = StreamingTextDataset( + tokenizer=None, + token_encoding_type=token_encoding_type, + max_seq_len=max_seq_len, + local=dataset_local_path, + batch_size=1, + ) + else: + # There should be no need to pass in the token encoding type if writing out ndarrays, + # or if using the default token encoding type. + dataset = StreamingTextDataset( + tokenizer=None, + max_seq_len=max_seq_len, + local=dataset_local_path, + batch_size=1, + ) + + for _, sample in enumerate(dataset): + # StreamingTextDataset should return an int64 torch Tensor + assert sample.dtype == torch.int64 + assert sample.shape == (max_seq_len,) + + +@pytest.mark.parametrize( + 'token_encoding_type', + SUPPORTED_MDS_ENCODING_TYPES + ['default'], +) +@pytest.mark.parametrize('use_bytes', [True, False]) +@pytest.mark.parametrize('samples', [10]) +@pytest.mark.parametrize('max_seq_len', [2048]) +def test_encoding_types_finetuning( + tmp_path: pathlib.Path, + token_encoding_type: str, + use_bytes: bool, + samples: int, + max_seq_len: int, +): + dataset_local_path = str(tmp_path) + if token_encoding_type != 'default': + encoding_dtype = getattr(np, token_encoding_type) + else: + encoding_dtype = None + + if use_bytes: + columns = { + 'input_ids': 'bytes', + 'labels': 'bytes', + } + else: + columns = { + 'input_ids': + 'ndarray:' + token_encoding_type + if token_encoding_type != 'default' else 'ndarray', + 'labels': + 'ndarray:' + token_encoding_type + if token_encoding_type != 'default' else 'ndarray', + } + + with MDSWriter(out=dataset_local_path, columns=columns) as writer: + for _ in range(samples): + if token_encoding_type != 'default': + input_ids = np.random.randint( + 0, + np.iinfo(encoding_dtype).max, + max_seq_len, + dtype=encoding_dtype, + ) + labels = np.random.randint( + 0, + np.iinfo(encoding_dtype).max, + max_seq_len, + dtype=encoding_dtype, + ) + else: + input_ids = np.random.randint( + 0, + 200, + max_seq_len, + ) + labels = np.random.randint( + 0, + 200, + max_seq_len, + ) + if use_bytes: + input_ids = input_ids.tobytes() + labels = labels.tobytes() + writer.write({'input_ids': input_ids, 'labels': labels}) + + if use_bytes and token_encoding_type != 'default': + dataset = StreamingFinetuningDataset( + tokenizer=None, + 
token_encoding_type=token_encoding_type, + local=dataset_local_path, + max_seq_len=max_seq_len, + batch_size=1, + ) + else: + # There should be no need to pass in the token encoding type if writing out ndarrays, + # or if using the default token encoding type. + dataset = StreamingFinetuningDataset( + tokenizer=None, + local=dataset_local_path, + max_seq_len=max_seq_len, + batch_size=1, + ) + + for _, sample in enumerate(dataset): + # StreamingFinetuningDataset puts samples in a list, and converts arrays to lists too. + assert isinstance(sample['turns'][0]['input_ids'][0], int) + assert len(sample['turns'][0]['input_ids']) == max_seq_len + assert isinstance(sample['turns'][0]['labels'][0], int) + assert len(sample['turns'][0]['labels']) == max_seq_len + + +@pytest.mark.parametrize( + 'token_encoding_type', + ['int17', 'float32', 'complex', 'int4'], +) +@pytest.mark.parametrize('use_finetuning', [True, False]) +def test_unsupported_encoding_type( + token_encoding_type: str, + use_finetuning: bool, +): + with pytest.raises(ValueError, match='The token_encoding_type*'): + if use_finetuning: + StreamingFinetuningDataset( + tokenizer=None, + token_encoding_type=token_encoding_type, + local='dataset/path', + max_seq_len=2048, + batch_size=1, + ) + else: + StreamingTextDataset( + tokenizer=None, + token_encoding_type=token_encoding_type, + max_seq_len=2048, + local='dataset/path', + batch_size=1, + ) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 7c8e808bab..ec27df8121 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -114,8 +114,8 @@ def build_mock_ft_streaming_dataset( columns = {'input_ids': 'bytes', 'labels': 'bytes'} else: columns = { - 'input_ids': 'ndarray:uint32', - 'labels': 'ndarray:uint32', + 'input_ids': 'ndarray:int32', + 'labels': 'ndarray:int32', } else: columns = {'prompt': 'str', 'response': 'str'} @@ -142,7 +142,7 @@ def build_mock_ft_streaming_dataset( else: sample_to_write[key] = np.asarray( sample[key], - dtype=np.uint32, + dtype=np.int32, ) output_writer.write(sample_to_write) else: From 14f296c340f85dea04970ad191ef5abd2aaf4326 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 6 Jun 2024 19:36:45 -0400 Subject: [PATCH 04/16] Bump Composer to 0.23.0 (#1259) --- scripts/train/train.py | 2 +- setup.py | 8 ++++---- tests/models/test_model.py | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index c9e2d67bf4..3cf3d9551d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -504,7 +504,7 @@ def main(cfg: DictConfig) -> Trainer: precision=train_cfg.precision, algorithms=algorithms, device_train_microbatch_size=train_cfg.device_train_microbatch_size, - fsdp_config=fsdp_config, + parallelism_config={'fsdp': fsdp_config}, save_folder=train_cfg.save_folder, save_filename=save_filename, save_latest_filename=save_latest_filename, diff --git a/setup.py b/setup.py index 78182976d4..0556050de9 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs]>=0.22.0,<0.23', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.23.0,<0.24', 'mlflow>=2.12.1,<2.13', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.40,<4.41', @@ -92,14 +92,14 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.22.0,<0.23', + 'mosaicml[databricks]>=0.23.0,<0.24', 'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', 'lz4>=4,<5', ] extra_deps['tensorboard'] = [ - 
'mosaicml[tensorboard]>=0.22.0,<0.23', + 'mosaicml[tensorboard]>=0.23.0,<0.24', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.22.0,<0.23', + 'mosaicml[peft]>=0.23.0,<0.24', ] extra_deps['openai'] = [ diff --git a/tests/models/test_model.py b/tests/models/test_model.py index a62a7dd114..2f93b1d3ce 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -13,10 +13,15 @@ import torch.nn as nn from accelerate import init_empty_weights from composer.core.precision import Precision, get_precision_context +from composer.distributed.dist_strategy import prepare_fsdp_module from composer.models.huggingface import maybe_get_underlying_model from composer.optim import DecoupledAdamW -from composer.trainer.dist_strategy import prepare_fsdp_module -from composer.utils import dist, get_device, reproducibility +from composer.utils import ( + FSDPConfig, + dist, + get_device, + reproducibility, +) from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from transformers import ( @@ -2538,7 +2543,14 @@ def test_hf_init( betas=(0.9, 0.99), ) - prepare_fsdp_module(model, optimizer, fsdp_config, precision, device, False) + prepare_fsdp_module( + model, + optimizer, + FSDPConfig(**fsdp_config), + precision, + device, + False, + ) model = HuggingFaceModelWithFSDP(model, tokenizer) From bea61fb5d979d855f7025ae218de6dbd68857cc6 Mon Sep 17 00:00:00 2001 From: Vincent Chen Date: Thu, 6 Jun 2024 20:25:18 -0400 Subject: [PATCH 05/16] Bump Version to 0.10.0.dev0 (#1255) * bump version * typo * Update config_utils.py These changes are necessary as the deprecation broke compatibility with `update_batch_size`. * Update config_utils.py fix typo * typo * typo I * update tests * typo II * typo III * bump composer version * undo composer bump for seperate pr * fix test * fix tests II * yolo * tye-o * pyrite * we resolve later * revert new . 
syntax --------- Co-authored-by: v-chen_data Co-authored-by: Milo Cress Co-authored-by: Saaketh Narayan Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/__init__.py | 2 +- llmfoundry/utils/config_utils.py | 13 ++++++++----- scripts/train/train.py | 1 - tests/a_scripts/eval/test_eval.py | 10 +++++++++- tests/a_scripts/eval/test_eval_inputs.py | 15 ++++----------- tests/a_scripts/train/test_train_inputs.py | 18 ++++++------------ 6 files changed, 28 insertions(+), 31 deletions(-) diff --git a/llmfoundry/__init__.py b/llmfoundry/__init__.py index c9666566bf..5e2795f9c9 100644 --- a/llmfoundry/__init__.py +++ b/llmfoundry/__init__.py @@ -71,4 +71,4 @@ 'utils', ] -__version__ = '0.9.0.dev0' +__version__ = '0.10.0.dev0' diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index b6a5acf6d9..5ab148bbe8 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -67,6 +67,7 @@ class EvalConfig: # Logging parameters python_log_level: Optional[str] = 'debug' loggers: Optional[Dict[str, Any]] = None + console_log_interval: Union[int, str] = '1ba' log_config: bool = True # Model/run parameters @@ -180,6 +181,11 @@ class TrainConfig: # Variables to ignore variables: Optional[Dict[str, Any]] = None + # Fields created by `update_batch_size_info` + n_gpus: int = MISSING + device_train_batch_size: int = MISSING + device_train_grad_accum: str = MISSING + TRAIN_CONFIG_KEYS = {field.name for field in fields(TrainConfig)} @@ -242,7 +248,6 @@ def make_dataclass_and_log_config( icl_tasks_required: bool = False, ) -> Tuple[Dict[str, Any], T]: """Converts a DictConfig to a dataclass and creates a logged config.""" - # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) assert isinstance(unstructured_config, dict) assert all(isinstance(k, str) for k in unstructured_config.keys()) @@ -289,11 +294,9 @@ def make_dataclass_and_log_config( unstructured_config['variables'] = {} for key in extraneous_keys: - warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. Top-level variables are deprecated and will not be supported in future releases. Please place any variables under the `variables` key.', - category=DeprecationWarning, + raise ValueError( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. 
Please place any variables under the `variables` key.', ) - unstructured_config['variables'][key] = unstructured_config.pop(key) dataclass_dict_config: DictConfig = om.structured( dataclass_constructor(**unstructured_config), diff --git a/scripts/train/train.py b/scripts/train/train.py index 3cf3d9551d..f2a70b526d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -553,6 +553,5 @@ def main(cfg: DictConfig) -> Trainer: yaml_cfg = om.load(f) cli_cfg = om.from_cli(args_list) cfg = om.merge(yaml_cfg, cli_cfg) - om.resolve(cfg) assert isinstance(cfg, DictConfig) main(cfg) diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index a56778538c..01f3760d26 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -13,7 +13,7 @@ from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_dict_container +from llmfoundry.utils.config_utils import EVAL_CONFIG_KEYS, to_dict_container from scripts.eval.eval import main # noqa: E402 from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @@ -134,6 +134,14 @@ def test_loader_eval( test_cfg.eval_interval = '1ba' test_cfg.loggers = om.DictConfig({'inmemory': om.DictConfig({})}) + # This test uses a training yaml with training-only keys present. + # We exclude these keys before calling `main` from the eval script. + allowed_keys = EVAL_CONFIG_KEYS + present_keys = set(test_cfg.keys()) + keys_to_pop = present_keys.difference(allowed_keys) + + [test_cfg.pop(key) for key in keys_to_pop] + trainers, eval_gauntlet_df = main(test_cfg) assert eval_gauntlet_df is None diff --git a/tests/a_scripts/eval/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py index 98b15743b3..0ca5765a26 100644 --- a/tests/a_scripts/eval/test_eval_inputs.py +++ b/tests/a_scripts/eval/test_eval_inputs.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import copy import os -import warnings import omegaconf import pytest @@ -42,12 +41,13 @@ def test_mispelled_mandatory_params_fail(self, cfg: DictConfig) -> None: omegaconf.errors.InterpolationKeyError, omegaconf.errors.MissingMandatoryValue, TypeError, + ValueError, )): cfg[p + '-mispelled'] = cfg.pop(p) main(cfg) cfg[p] = cfg.pop(p + '-mispelled') - def test_optional_mispelled_params_raise_warning( + def test_optional_mispelled_params_raise_error( self, cfg: DictConfig, ) -> None: @@ -67,15 +67,8 @@ def test_optional_mispelled_params_raise_warning( orig_value = cfg.pop(param, None) updated_param = param + '-mispelling' cfg[updated_param] = orig_value - with warnings.catch_warnings(record=True) as warning_list: - try: - main(cfg) - except: - pass - assert any( - f'Unused parameter {updated_param} found in cfg.' in - str(warning.message) for warning in warning_list - ) + with pytest.raises(ValueError): + main(cfg) # restore configs. 
cfg = copy.deepcopy(old_cfg) diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index 5a3b21dc3b..5901d53e94 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -3,7 +3,6 @@ import copy import json import os -import warnings import omegaconf import pytest @@ -63,7 +62,9 @@ def cfg(self, foundry_dir: str) -> DictConfig: def test_misspelled_mandatory_params_fail(self, cfg: DictConfig) -> None: """Check that mandatory misspelled inputs fail to train.""" cfg.trai_loader = cfg.pop('train_loader') - with pytest.raises((omegaconf.errors.MissingMandatoryValue, TypeError)): + with pytest.raises( + (omegaconf.errors.MissingMandatoryValue, TypeError, ValueError), + ): main(cfg) def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: @@ -89,7 +90,7 @@ def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: main(cfg) cfg[param] = orig_param - def test_optional_misspelled_params_raise_warning( + def test_optional_misspelled_params_raise_error( self, cfg: DictConfig, ) -> None: @@ -113,15 +114,8 @@ def test_optional_misspelled_params_raise_warning( orig_value = cfg.pop(param, None) updated_param = param + '-misspelling' cfg[updated_param] = orig_value - with warnings.catch_warnings(record=True) as warning_list: - try: - main(cfg) - except: - pass - assert any( - f'Unused parameter {updated_param} found in cfg.' in - str(warning.message) for warning in warning_list - ) + with pytest.raises(ValueError): + main(cfg) # restore configs. cfg = copy.deepcopy(old_cfg) From e4b8b571b82933d382aee69fe74e9d8171163d83 Mon Sep 17 00:00:00 2001 From: Xiaohan Zhang Date: Fri, 7 Jun 2024 00:08:14 -0700 Subject: [PATCH 06/16] Fix typo in setup.py (#1263) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0556050de9..3fefc0426e 100644 --- a/setup.py +++ b/setup.py @@ -74,7 +74,7 @@ 'beautifulsoup4>=4.12.2,<5', # required for model download utils 'tenacity>=8.2.3,<9', 'catalogue>=2,<3', - 'typer[all]<1', + 'typer<1', ] extra_deps = {} From db7013516133849b9e2cab3f9e66bf9ad0882a39 Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Fri, 7 Jun 2024 10:47:38 -0700 Subject: [PATCH 07/16] Update TE Dockerfile (#1265) Update Dockerfile with TE main --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 253a5b6cd8..73b6d7fb07 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install TransformerEngine -RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c +RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@0edf30b87159e82048b5f248e4b379aebb8f364a # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git From 4e53e7449471c2671a512364fd6241fad83c3cdd Mon Sep 17 00:00:00 2001 From: Charles Tang Date: Fri, 7 Jun 2024 12:15:20 -0700 Subject: [PATCH 08/16] Revert "Update TE Dockerfile (#1265)" (#1266) This reverts commit db7013516133849b9e2cab3f9e66bf9ad0882a39. 
--- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 73b6d7fb07..253a5b6cd8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install TransformerEngine -RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@0edf30b87159e82048b5f248e4b379aebb8f364a +RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git From dddb9b81cdde35f3768ff5f112150916ca1ba379 Mon Sep 17 00:00:00 2001 From: Mihir Patel Date: Fri, 7 Jun 2024 16:35:51 -0400 Subject: [PATCH 09/16] revert to nvidia code (#1267) --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 253a5b6cd8..ca684dca2a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,7 +13,7 @@ ADD https://raw.githubusercontent.com/mosaicml/llm-foundry/$BRANCH_NAME/setup.py RUN rm setup.py # Install TransformerEngine -RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/cli99/TransformerEngine.git@6b21f606f2459d49c2113d69236d68d334edeb4c +RUN NVTE_FRAMEWORK=pytorch CMAKE_BUILD_PARALLEL_LEVEL=3 MAX_JOBS=3 pip install git+https://github.com/NVIDIA/TransformerEngine.git@05eb6deb31c1b48e9f4380d18fe95f3c38e84335 # Install and uninstall foundry to cache foundry requirements RUN git clone -b $BRANCH_NAME https://github.com/mosaicml/llm-foundry.git From dd92abf78a1927ac1ec8674b670fe3744f759be2 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Fri, 7 Jun 2024 20:46:55 -0700 Subject: [PATCH 10/16] Bump composer to 0.23.2 (#1269) --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 3fefc0426e..f81b1cd0f1 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ ] install_requires = [ - 'mosaicml[libcloud,wandb,oci,gcs]>=0.23.0,<0.24', + 'mosaicml[libcloud,wandb,oci,gcs]>=0.23.2,<0.24', 'mlflow>=2.12.1,<2.13', 'accelerate>=0.25,<0.26', # for HF inference `device_map` 'transformers>=4.40,<4.41', @@ -92,14 +92,14 @@ ] extra_deps['databricks'] = [ - 'mosaicml[databricks]>=0.23.0,<0.24', + 'mosaicml[databricks]>=0.23.2,<0.24', 'databricks-sql-connector>=3,<4', 'databricks-connect==14.1.0', 'lz4>=4,<5', ] extra_deps['tensorboard'] = [ - 'mosaicml[tensorboard]>=0.23.0,<0.24', + 'mosaicml[tensorboard]>=0.23.2,<0.24', ] # Flash 2 group kept for backwards compatibility @@ -110,7 +110,7 @@ extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2']) extra_deps['peft'] = [ - 'mosaicml[peft]>=0.23.0,<0.24', + 'mosaicml[peft]>=0.23.2,<0.24', ] extra_deps['openai'] = [ From 5571101a50804406ef0fe23e7ea6795b3c4a1bcb Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 9 Jun 2024 10:50:54 -0400 Subject: [PATCH 11/16] fix linting (#1270) * fix linting * fix --- llmfoundry/data/dataloader.py | 4 ++-- llmfoundry/utils/config_utils.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 83a9a7d8ea..e7521bc343 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -3,7 +3,7 @@ """Dataloader builder 
utilities.""" -from typing import Any, Dict +from typing import Any, Dict, Union from composer import DataSpec from transformers import PreTrainedTokenizerBase @@ -19,7 +19,7 @@ def build_dataloader( cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, - device_batch_size: int, + device_batch_size: Union[int, float], ) -> DataSpec: """Builds a dataloader from a config. diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 5ab148bbe8..5c1ec9114a 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -100,7 +100,7 @@ class TrainConfig: optimizer: Dict[str, Any] = MISSING scheduler: Dict[str, Any] = MISSING train_loader: Dict[str, Any] = MISSING - device_train_batch_size: int = MISSING + device_train_batch_size: Union[int, float] = MISSING device_eval_batch_size: int = MISSING max_duration: Union[int, str] = MISSING eval_interval: Union[int, str] = MISSING @@ -183,7 +183,6 @@ class TrainConfig: # Fields created by `update_batch_size_info` n_gpus: int = MISSING - device_train_batch_size: int = MISSING device_train_grad_accum: str = MISSING From ffec54b491bd7c1bd3de236707a6e9f5aadcbb51 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 13 Jun 2024 09:59:23 -0700 Subject: [PATCH 12/16] Add torch 2.3.1 docker images (#1275) --- .github/workflows/docker.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 6ca10fcd47..89aa917809 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -23,6 +23,12 @@ jobs: - name: "2.3.0_cu121_flash2_aws" base_image: mosaicml/pytorch:2.3.0_cu121-python3.11-ubuntu20.04-aws dep_groups: "[gpu-flash2]" + - name: "2.3.1_cu121" + base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04 + dep_groups: "[gpu]" + - name: "2.3.1_cu121_aws" + base_image: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws + dep_groups: "[gpu]" steps: - name: Maximize Build Space on Worker uses: easimon/maximize-build-space@v4 From c30856f96949a298d307219c4f13e7bd6aeddbab Mon Sep 17 00:00:00 2001 From: Brian <23239305+b-chu@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:24:01 -0400 Subject: [PATCH 13/16] Make expandable segments on by default (#1278) --- llmfoundry/utils/config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 5c1ec9114a..f91ae79404 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -115,7 +115,7 @@ class TrainConfig: # Cuda allocation configuration max_split_size_mb: Optional[int] = None - expandable_segments: bool = False + expandable_segments: bool = True cuda_load_lazy: bool = False # Distributed training parameters From 630fc6879f721ead6064d501cd70b5cc69807386 Mon Sep 17 00:00:00 2001 From: Daniel King <43149077+dakinggg@users.noreply.github.com> Date: Thu, 13 Jun 2024 19:25:03 -0700 Subject: [PATCH 14/16] Add CI for torch 2.3.1 (#1281) --- .github/workflows/pr-cpu.yaml | 4 ++++ .github/workflows/pr-gpu.yaml | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 93612b7983..78faea8e44 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -23,6 +23,10 @@ jobs: container: mosaicml/pytorch:2.3.0_cpu-python3.11-ubuntu20.04 markers: "not gpu" pytest_command: "coverage run -m pytest" + - name: "cpu-2.3.1" + container: 
mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04 + markers: "not gpu" + pytest_command: "coverage run -m pytest" name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index 31af66e51f..335d049306 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -24,6 +24,11 @@ jobs: markers: "gpu" pytest_command: "coverage run -m pytest" pip_deps: "[all]" + - name: "gpu-2.3.1" + container: mosaicml/llm-foundry:2.3.1_cu121_flash2-latest + markers: "gpu" + pytest_command: "coverage run -m pytest" + pip_deps: "[all]" name: ${{ matrix.name }} if: github.repository_owner == 'mosaicml' with: From 9b9fc24b86c156e45d2e54b64f3dbc7a68235c1e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 13 Jun 2024 23:06:19 -0400 Subject: [PATCH 15/16] Update README.md to use variables (#1282) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 70436271dd..c92c252395 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ python data_prep/convert_dataset_hf.py \ # Train an MPT-125m model for 10 batches composer train/train.py \ train/yamls/pretrain/mpt-125m.yaml \ - data_local=my-copy-c4 \ + variables.data_local=my-copy-c4 \ train_loader.dataset.split=train_small \ eval_loader.dataset.split=val_small \ max_duration=10ba \ From 1a2fac0c25be354c3e1531301ed69202af66c085 Mon Sep 17 00:00:00 2001 From: sanjari-orb <137819448+sanjari-orb@users.noreply.github.com> Date: Fri, 14 Jun 2024 10:43:14 -0700 Subject: [PATCH 16/16] Add registry for ICL datasets (#1252) --- llmfoundry/eval/datasets/__init__.py | 12 + .../in_context_learning_evaluation.py | 519 ++++++++++-------- llmfoundry/registry.py | 17 + llmfoundry/utils/builders.py | 53 +- .../eval/test_in_context_learning_datasets.py | 363 ++++++------ tests/test_registry.py | 1 + 6 files changed, 537 insertions(+), 428 deletions(-) diff --git a/llmfoundry/eval/datasets/__init__.py b/llmfoundry/eval/datasets/__init__.py index 02a2b88b21..a3a36053da 100644 --- a/llmfoundry/eval/datasets/__init__.py +++ b/llmfoundry/eval/datasets/__init__.py @@ -22,6 +22,18 @@ tokenizer_needs_prefix_space, trim_context, ) +from llmfoundry.registry import icl_datasets + +icl_datasets.register( + 'multiple_choice', + func=InContextLearningMultipleChoiceTaskDataset, +) +icl_datasets.register('schema', func=InContextLearningSchemaTaskDataset) +icl_datasets.register('language_modeling', func=InContextLearningLMTaskDataset) +icl_datasets.register( + 'generation_task_with_answers', + func=InContextLearningGenerationTaskWithAnswersDataset, +) __all__ = [ 'InContextLearningDataset', diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index debb0dbc6f..c87b38b09a 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -19,6 +19,7 @@ from datasets import IterableDataset, load_dataset from torch.utils.data import DataLoader, Dataset +from llmfoundry import registry from llmfoundry.eval.datasets.utils import ( convert_tokens_to_tensors, get_continuation_span, @@ -29,6 +30,7 @@ tokenizer_needs_prefix_space, trim_context, ) +from llmfoundry.utils.registry_utils import construct_from_registry log = logging.getLogger(__name__) @@ -114,11 +116,11 @@ def __init__( max_seq_len: int, pad_tok_id: int, num_fewshot: int, - fewshot_random_seed: int, - prompt_string: str, - 
example_delimiter: str, - continuation_delimiter: str, destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', prelimiter: str = '', context_key: str = 'context', answer_key: str = 'answer', @@ -189,6 +191,20 @@ def __len__(self) -> int: def get_num_samples_in_batch(self, batch: Dict) -> int: return batch['input_ids'].shape[0] + def get_effective_batch_size(self, batch_size: int) -> int: + r"""Returns effective batch size computed for given ICL task. + + The effective batch size may not be equal to the configured evaluation + batch size because for certain ICL tasks, >1 prompts can get created + for every input query depending on the number of choices/continuations. + This requires the effective batch size to be reduced to prevent larger batches than expected during eval. For example, + check InContextLearningMultipleChoiceTaskDataset. + + Args: + batch_size (int): Original batch size configured for ICL evaluations + """ + return batch_size + def update_generation_kwargs(self, generation_kwargs: Dict) -> None: r"""Updates self.base_batch with the passed in generation_kwargs. @@ -519,46 +535,12 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def split_batch(self, batch: Any, - microbatch_size: Union[int, float]) -> Sequence[Any]: - """Handling for certain specialty columns that must be split into. - - batches in different formats. - - Args: - batch (Dict): Batch of data - microbatch_size (int | float): Size of microbatches - - Returns: - List: List of chunked batches - """ - # Don't split kwargs that don't change - # Normally split torch tensors - # List split lists of strings - if isinstance(microbatch_size, float): - raise ValueError( - 'split_batch does not support floating point microbatch_size.', - ) - chunked = {} - for k, v in batch.items(): - if k in self.static_keys: - # Defer broadcasting until we know num_chunks - pass - elif k in self.list_keys: - chunked[k] = _split_list(v, microbatch_size) - elif k in self.tensor_keys: - chunked[k] = _default_split_batch(v, microbatch_size) - else: - raise ValueError(f'Unexpected key {k} in batch splitting') - num_chunks = len(chunked['input_ids']) - for k, v in batch.items(): - if k in self.static_keys: - chunked[k] = [v] * num_chunks - - batched_list = [{k: v[idx] - for k, v in chunked.items()} - for idx in range(num_chunks)] - return batched_list + def split_batch( + self, + batch: Any, + microbatch_size: Union[int, float], + ) -> Sequence[Any]: + return _default_split_batch(batch, microbatch_size) class InContextLearningGenerationTaskWithAnswersDataset( @@ -584,13 +566,31 @@ class InContextLearningGenerationTaskWithAnswersDataset( def __init__( self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + context_key: str = 'context', + answer_key: str = 'answer', + strip_dataset: bool = True, + padding_size: Optional[int] = None, + base_batch: Optional[Dict] = None, + batch_mapping: Optional[Dict] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, cot_delimiter: str = '', early_stopping_criteria: 
Optional[List[str]] = None, do_normalization: bool = True, - *args: Any, - **kwargs: Any, ): - if kwargs['tokenizer'].eos_token_id is None: + if tokenizer.eos_token_id is None: raise ValueError( '`InContextLearningGenerationTaskWithAnswersDataset` tokenizer must have non-null `eos_token_id`', ) @@ -607,13 +607,32 @@ def __init__( tensor_keys = ['input_ids', 'attention_mask'] list_keys = ['labels'] super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + context_key=context_key, + answer_key=answer_key, + strip_dataset=strip_dataset, + padding_size=padding_size, + base_batch=base_batch, + batch_mapping=batch_mapping, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + # specific to ICL dataset padding_side='left', tokenize_labels=False, static_keys=static_keys, list_keys=list_keys, tensor_keys=tensor_keys, - *args, - **kwargs, ) # NOTE: set these after init call because they take class vars self.early_stopping_criteria = early_stopping_criteria @@ -635,8 +654,8 @@ def __init__( 'input_ids': self.context_key, 'labels': 'aliases', } - if 'generation_kwargs' in kwargs: - self.update_generation_kwargs(kwargs['generation_kwargs']) + if generation_kwargs: + self.update_generation_kwargs(generation_kwargs) def read_dataset( self, @@ -765,6 +784,45 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: batch['generation_kwargs']['stopping_criteria'] = stopping_criteria return batch + def split_batch(self, batch: Any, + microbatch_size: Union[int, float]) -> Sequence[Any]: + """Split batch handling for special columns. + + Args: + batch (Dict): Batch of data + microbatch_size (int | float): Size of microbatches + + Returns: + List: List of chunked batches + """ + # Don't split kwargs that don't change + # Normally split torch tensors + # List split lists of strings + if isinstance(microbatch_size, float): + raise ValueError( + 'split_batch does not support floating point microbatch_size.', + ) + chunked = {} + for k, v in batch.items(): + if k in self.static_keys: + # Defer broadcasting until we know num_chunks + pass + elif k in self.list_keys: + chunked[k] = _split_list(v, microbatch_size) + elif k in self.tensor_keys: + chunked[k] = _default_split_batch(v, microbatch_size) + else: + raise ValueError(f'Unexpected key {k} in batch splitting') + num_chunks = len(chunked['input_ids']) + for k, v in batch.items(): + if k in self.static_keys: + chunked[k] = [v] * num_chunks + + batched_list = [{k: v[idx] + for k, v in chunked.items()} + for idx in range(num_chunks)] + return batched_list + class InContextLearningLMTaskDataset(InContextLearningDataset): """A dataset that constructs batches for in-context learning language. @@ -779,8 +837,50 @@ class InContextLearningLMTaskDataset(InContextLearningDataset): See InContextLearningDataset for more details. 
""" - def __init__(self, *args: Any, **kwargs: Any): + def __init__( + self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + context_key: str = 'context', + strip_dataset: bool = True, + tokenize_labels: bool = True, + padding_size: Optional[int] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, + static_keys: Optional[List] = None, + list_keys: Optional[List] = None, + ): super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + context_key=context_key, + strip_dataset=strip_dataset, + tokenize_labels=tokenize_labels, + padding_size=padding_size, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + list_keys=list_keys, + # specific to ICL dataset answer_key='continuation', static_keys=['mode'], tensor_keys=[ @@ -800,8 +900,6 @@ def __init__(self, *args: Any, **kwargs: Any): 'labels': 'context', }, padding_side='right', - *args, - **kwargs, ) @@ -833,13 +931,33 @@ class InContextLearningMultipleChoiceTaskDataset(InContextLearningDataset): def __init__( self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + context_key: str = 'query', + tensor_keys: Optional[List] = None, + answer_key: str = 'answer', + strip_dataset: bool = True, + tokenize_labels: bool = True, + padding_size: Optional[int] = None, + batch_mapping: Optional[Dict] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, + list_keys: Optional[List] = None, choices_key: str = 'choices', static_keys: Optional[List] = None, list_of_tensors_keys: Optional[List] = None, list_of_tuples_keys: Optional[List] = None, list_of_primitives: Optional[List] = None, - *args: Any, - **kwargs: Any, ): self.choices_key = choices_key base_batch = { @@ -850,25 +968,42 @@ def __init__( 'gold_indices': [], 'choice_groupings': [], } - context_key = kwargs.pop('context_key', 'query') - static_keys = kwargs.pop('static_keys', ['mode', 'generation_kwargs']) - tensor_keys = kwargs.pop( - 'tensor_keys', - ['input_ids', 'labels', 'attention_mask'], - ) + if not static_keys: + static_keys = ['mode', 'generation_kwargs'] + if not tensor_keys: + tensor_keys = ['input_ids', 'labels', 'attention_mask'] self.list_of_tensors_keys = list_of_tensors_keys or [ 'continuation_indices', ] self.list_of_tuples_keys = list_of_tuples_keys or ['choice_groupings'] self.list_of_primitives = list_of_primitives or ['gold_indices'] super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + 
prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + answer_key=answer_key, + strip_dataset=strip_dataset, + tokenize_labels=tokenize_labels, + padding_size=padding_size, + batch_mapping=batch_mapping, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + list_keys=list_keys, + # specific to ICL dataset context_key=context_key, base_batch=base_batch, static_keys=static_keys, tensor_keys=tensor_keys, padding_side='right', - *args, - **kwargs, ) self.num_choices = len(self.dataset[0][self.choices_key]) self.batch_mapping_per_choice = { @@ -877,6 +1012,11 @@ def __init__( } self.batch_map_per_example = {'gold_indices': 'gold'} + def get_effective_batch_size(self, batch_size: int) -> int: + batch_size = max(self.num_choices, batch_size) + effective_batchsize = batch_size // self.num_choices + return effective_batchsize + def get_answer_from_example( self, example: Dict, @@ -1095,21 +1235,58 @@ class InContextLearningSchemaTaskDataset( def __init__( self, + dataset_uri: str, + tokenizer: transformers.PreTrainedTokenizerBase, + max_seq_len: int, + pad_tok_id: int, + num_fewshot: int, + destination_path: str, + fewshot_random_seed: int = 1234, + prompt_string: str = '', + example_delimiter: str = '\n', + continuation_delimiter: str = ' ', + prelimiter: str = '', + answer_key: str = 'answer', + strip_dataset: bool = True, + tokenize_labels: bool = True, + padding_size: Optional[int] = None, + batch_mapping: Optional[Dict] = None, + hf_loading_vars: Optional[Dict] = None, + hf_parsing_map: Optional[Dict] = None, + generation_kwargs: Optional[Dict] = None, + list_keys: Optional[List] = None, choices_key: str = 'context_options', - *args: Any, - **kwargs: Any, ): static_keys = ['mode'] tensor_keys = ['input_ids', 'labels', 'attention_mask'] list_of_tensors_keys = ['continuation_indices'] super().__init__( + dataset_uri=dataset_uri, + tokenizer=tokenizer, + max_seq_len=max_seq_len, + pad_tok_id=pad_tok_id, + num_fewshot=num_fewshot, + fewshot_random_seed=fewshot_random_seed, + prompt_string=prompt_string, + example_delimiter=example_delimiter, + continuation_delimiter=continuation_delimiter, + destination_path=destination_path, + prelimiter=prelimiter, + answer_key=answer_key, + strip_dataset=strip_dataset, + tokenize_labels=tokenize_labels, + padding_size=padding_size, + batch_mapping=batch_mapping, + hf_loading_vars=hf_loading_vars, + hf_parsing_map=hf_parsing_map, + generation_kwargs=generation_kwargs, + list_keys=list_keys, + # specific to ICL dataset choices_key=choices_key, context_key=choices_key, static_keys=static_keys, tensor_keys=tensor_keys, list_of_tensors_keys=list_of_tensors_keys, - *args, - **kwargs, ) self.base_batch = { 'input_ids': [], @@ -1120,6 +1297,11 @@ def __init__( 'choice_groupings': [], } + def get_effective_batch_size(self, batch_size: int) -> int: + batch_size = max(self.num_choices, batch_size) + effective_batchsize = batch_size // self.num_choices + return effective_batchsize + def construct_context( self, example: Dict[str, Any], @@ -1294,23 +1476,10 @@ def build_icl_dataloader( dataset_uri: str, tokenizer: transformers.PreTrainedTokenizerBase, batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str, # e.g. 
'' hf_loading_vars: Dict, hf_parsing_map: Dict, - destination_path: str, - prelimiter: str, # e.g. 'Question: ' - cot_delimiter: str, # e.g. ' ### ' - fewshot_random_seed: int, - pass_at_k: int, - generations_per_sample: int, - generation_kwargs: Dict, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True, + destination_path: str = '', + kwargs: Optional[Dict[str, Any]] = None, ) -> DataSpec: """Factory method that builds the specific dataset for the specified. @@ -1323,108 +1492,36 @@ def build_icl_dataloader( this might be different) 3. set the `split_batch` function if necessary """ - if icl_task_type == 'multiple_choice': - dataset = InContextLearningMultipleChoiceTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - batch_size = max(dataset.num_choices, batch_size) - effective_batchsize = batch_size // dataset.num_choices - elif icl_task_type == 'schema': - dataset = InContextLearningSchemaTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - batch_size = max(dataset.num_choices, batch_size) - effective_batchsize = batch_size // dataset.num_choices - elif icl_task_type == 'language_modeling': - dataset = InContextLearningLMTaskDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size - elif icl_task_type == 'generation_task_with_answers': - dataset = InContextLearningGenerationTaskWithAnswersDataset( - dataset_uri=dataset_uri, - tokenizer=tokenizer, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, - destination_path=destination_path, - prelimiter=prelimiter, - fewshot_random_seed=fewshot_random_seed, - hf_loading_vars=hf_loading_vars, - hf_parsing_map=hf_parsing_map, - cot_delimiter=cot_delimiter, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, - generation_kwargs=generation_kwargs, - ) - effective_batchsize = batch_size - else: - raise Exception(f'Unrecognized ICL task type: {icl_task_type}') - + # Add named parameters to kwargs + if kwargs is None: + kwargs = {} + kwargs.update({ + 'dataset_uri': dataset_uri, + 'tokenizer': tokenizer, + 'hf_loading_vars': hf_loading_vars, + 
'hf_parsing_map': hf_parsing_map, + 'destination_path': destination_path, + }) + dataset = construct_from_registry( + name=icl_task_type, + registry=registry.icl_datasets, + partial_function=False, + pre_validation_function=None, + post_validation_function=None, + kwargs=kwargs, + ) sampler = dist.get_sampler(dataset, drop_last=False, shuffle=False) - split_batch = None - if isinstance( - dataset, - ( - InContextLearningMultipleChoiceTaskDataset, - InContextLearningGenerationTaskWithAnswersDataset, - ), - ): - split_batch = dataset.split_batch - return DataSpec( DataLoader( dataset, - batch_size=effective_batchsize, + batch_size=dataset.get_effective_batch_size(batch_size), sampler=sampler, collate_fn=dataset.collate_fn, ), device_transforms=None, get_num_samples_in_batch=dataset.get_num_samples_in_batch, - split_batch=split_batch, + split_batch=dataset.split_batch, ) @@ -1514,24 +1611,11 @@ def get_icl_task_dataloader( tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast], batch_size: int, - max_seq_len: int, - pad_tok_id: int, - num_fewshot: int, - prompt_string: str, # e.g. 'translate english to french:' - example_delimiter: str, # e.g. '\n' - continuation_delimiter: str = '', - destination_path: str = '', - question_prelimiter: str = '', # e.g. 'Question: ' - fewshot_random_seed: int = 1234, - pass_at_k: int = 1, - generations_per_sample: int = 1, - cot_delimiter: str = '', has_categories: bool = False, hf_loading_vars: Optional[Dict] = None, hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - early_stopping_criteria: Optional[List[str]] = None, - do_normalization: bool = True, + destination_path: str = '', + kwargs: Optional[Dict[str, Any]] = None, ) -> Union[DataSpec, Dict[str, DataSpec]]: r"""Constructs a dataloader (or dataloaders if has_categories is True) @@ -1588,28 +1672,12 @@ def get_icl_task_dataloader( The default keys expected are "context" and "answer". tokenizer (transformers.PreTrainedTokenizerBase): The tokenizer used to map between strings and token ids. batch_size (int): Size of a batch used for eval - max_seq_len (int): The maximum sequence length supported by the model. - pad_tok_id (int): The special token used for padding batches. - num_fewshot (int): The number of complete fewshot examples to prepend before each test example. These are not identical across examples. - prompt_string (str, default = ''): Prompt string to put once before all fewshot examples/test examples (e.g. 'Translate english to french.'). - example_delimiter (str, default = '\\n'): Separator inserted before (context, answer) pairs (e.g. '\\n') for fewshot sampling and prompting. - continuation_delimiter: (str, default = ' '): Separator inserted between context and answer in each example (e.g. '\\nA: '). - destination_path: (str, default = ''): This is the local file where remote datasets will be saved. - question_prelimiter: (str, default = ''): Text to be prepended before each context, including few shot examples (e.g. "Question: "). - fewshot_random_seed (int, default = 1234): Random seed to use for fewshot sampling - pass_at_k (int): k for how many chances the model gets to write passing code. - generations_per_sample (int): How many outputs to generate per prompt. Passed in generation_kwargs under "num_return_sequences" and overwritten by generation_kwargs dict. - cot_delimiter (str): Delimiter to place between chain of thoughts and continuations. 
has_categories: (bool): If ``True``, we will search the dataset file for a category key, and partition the dataset into a separate dataloader for each category occurring in the data. hf_loading_vars (Dict, default = None): A dictionary containing keyword arguments to be passed into `load_dataset` if dataset is being pulled from HF. hf_parsing_map (Dict, default = None): A dictionary containing a mapping from HF columns to ICL dataset keys. The dictionary should be formatted {icl_key:[hf_key1, hf_key1]}. Column contents will be concatenated with ' ' separating them. If not included, will load the columns already present in the HF dataset. - generation_kwargs (Dict, default = None): A dictionary containing keyword arguments to be passed along to the model's generate function. Overwrites any previously specified generation - keyword args in this function (see https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig - for more details) - early_stopping (List, default = None): A list of strings that, when found in a model's output, will be treated as a stopping criteria at metric computation time. - Used in generation tasks with CoT - do_normalization (bool, default = True): Whether or not to normalize the outputs and labels in InContextLearningGenerationTaskWithAnswersDataset. Only used in generation tasks. + kwargs (Dict[str, Any], default=None): Dictionary containing a mapping + from ICL dataset constructor's parameter names and their desired values. Returns: DataLoader: A dataloader used for performing in-context learning evaluation on the dataset provided. @@ -1618,11 +1686,6 @@ def get_icl_task_dataloader( hf_loading_vars = {} if hf_parsing_map is None: hf_parsing_map = {} - if generation_kwargs is None: - generation_kwargs = {} - if early_stopping_criteria is None: - early_stopping_criteria = [] - if has_categories: result_dls = {} output_files = partition_dataset_by_category( @@ -1639,23 +1702,10 @@ def get_icl_task_dataloader( dataset_uri=partition_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, - continuation_delimiter=continuation_delimiter, destination_path=partition_uri + '_tmp', - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - generation_kwargs=generation_kwargs, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, + kwargs=kwargs, ) return result_dls else: @@ -1664,21 +1714,8 @@ def get_icl_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter=example_delimiter, hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - continuation_delimiter=continuation_delimiter, destination_path=destination_path, - prelimiter=question_prelimiter, - cot_delimiter=cot_delimiter, - fewshot_random_seed=fewshot_random_seed, - pass_at_k=pass_at_k, - generations_per_sample=generations_per_sample, - generation_kwargs=generation_kwargs, - early_stopping_criteria=early_stopping_criteria, - do_normalization=do_normalization, + kwargs=kwargs, ) diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 
0c8e64b759..f36f53fffa 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -8,6 +8,7 @@ from composer.optim import ComposerScheduler from torch.optim import Optimizer from torch.utils.data import DataLoader as TorchDataloader +from torch.utils.data import Dataset from torchmetrics import Metric from transformers import PreTrainedTokenizerBase @@ -206,6 +207,21 @@ description=_metrics_description, ) +_icl_datasets_description = ( + 'The ICL datasets registry is used to register an torch.utils.data.Dataset class which can be used for ICL tasks.' +) +icl_datasets = create_registry( + 'llmfoundry', + 'icl_datasets', + # TODO: Change type from Dataset to + # llmfoundry.eval.InContextLearningDataset. + # Using ICL dataset here introduces a circular import dependency between + # the registry and eval packages right now, thus needs some refactoring. + generic_type=Type[Dataset], + entry_points=True, + description=_icl_datasets_description, +) + __all__ = [ 'loggers', 'callbacks', @@ -228,4 +244,5 @@ 'attention_classes', 'attention_implementations', 'fcs', + 'icl_datasets', ] diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 73eb026d98..f9e84aab45 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import contextlib +import copy import functools import logging import os @@ -545,22 +546,10 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg["icl_task_type"]}.', ) - if 'prompt_string' not in icl_cfg: - icl_cfg['prompt_string'] = '' - if 'example_delimiter' not in icl_cfg: - icl_cfg['example_delimiter'] = '\n' - if 'continuation_delimiter' not in icl_cfg: - icl_cfg['continuation_delimiter'] = ' ' if 'max_seq_len' not in icl_cfg: icl_cfg['max_seq_len'] = default_max_seq_len if 'batch_size' not in icl_cfg: icl_cfg['batch_size'] = default_batch_size - if 'pass_at_k' not in icl_cfg: - icl_cfg['pass_at_k'] = 1 - if 'fewshot_random_seed' not in icl_cfg: - icl_cfg['fewshot_random_seed'] = 1234 - if 'generations_per_sample' not in icl_cfg: - icl_cfg['generations_per_sample'] = 1 if 'num_beams' in icl_cfg: raise ValueError( @@ -579,6 +568,7 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): pad_tok_id = tokenizer.eos_token_id else: pad_tok_id = tokenizer.pad_token_id + label = f'{icl_cfg["label"]}/{num_fewshot}-shot' metric_names = list(icl_cfg['metric_names']) # TODO: fix Composer bug when copying local paths and destination exists @@ -589,38 +579,37 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): hf_parsing_map = icl_cfg.get('hf_parsing_map', {}) hf_loading_vars = icl_cfg.get('hf_loading_vars', {}) - early_stopping_criteria = icl_cfg.get( 'early_stopping_criteria', - None, + [], ) + # TODO: fix manual removal of non-constructor fields + icl_constructor_kwargs = copy.deepcopy(icl_cfg) + icl_constructor_kwargs.pop('label', None) + icl_constructor_kwargs.pop('metric_names', None) + icl_constructor_kwargs.pop('icl_task_type', None) + icl_constructor_kwargs.pop('batch_size', None) + icl_constructor_kwargs.pop('has_categories', None) + + # Add custom constructor arguments + icl_constructor_kwargs['pad_tok_id'] = pad_tok_id + icl_constructor_kwargs['num_fewshot'] = num_fewshot + assert early_stopping_criteria is None or isinstance( early_stopping_criteria, list, ) + dataloaders = get_icl_task_dataloader( - icl_cfg['icl_task_type'], - icl_cfg['dataset_uri'], - tokenizer, + icl_task_type=icl_cfg['icl_task_type'], + 
dataset_uri=icl_cfg['dataset_uri'], + tokenizer=tokenizer, batch_size=icl_cfg['batch_size'], - max_seq_len=icl_cfg['max_seq_len'], - pad_tok_id=pad_tok_id, - num_fewshot=num_fewshot, - prompt_string=icl_cfg['prompt_string'], - example_delimiter=icl_cfg['example_delimiter'], hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - continuation_delimiter=icl_cfg['continuation_delimiter'], - question_prelimiter=icl_cfg.get('question_prelimiter', ''), - destination_path=destination_path, - fewshot_random_seed=icl_cfg['fewshot_random_seed'], - pass_at_k=icl_cfg['pass_at_k'], - generations_per_sample=icl_cfg['generations_per_sample'], has_categories=icl_cfg.get('has_categories', False), - cot_delimiter=icl_cfg.get('cot_delimiter', ''), - generation_kwargs=icl_cfg.get('generation_kwargs', {}), - early_stopping_criteria=early_stopping_criteria, - do_normalization=icl_cfg.get('do_normalization', True), + destination_path=destination_path, + kwargs=icl_constructor_kwargs, ) if 'has_categories' in icl_cfg and icl_cfg[ 'has_categories'] and isinstance(dataloaders, dict): diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py index a3c3e88364..b5eacdeb0f 100644 --- a/tests/eval/test_in_context_learning_datasets.py +++ b/tests/eval/test_in_context_learning_datasets.py @@ -1090,15 +1090,22 @@ def test_mc_task_dataloader_subcategories( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=2, - prompt_string= - 'The following are multiple choice questions (with answers).\n', - example_delimiter='\n', - continuation_delimiter='Answer: ', - destination_path=str(tmp_path / 'icl.jsonl'), has_categories=True, + destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'num_fewshot': + 2, + 'max_seq_len': + seqlen, + 'pad_tok_id': + tokenizer.eos_token_id, + 'prompt_string': + 'The following are multiple choice questions (with answers).\n', + 'example_delimiter': + '\n', + 'continuation_delimiter': + 'Answer: ', + }, ) assert isinstance(dls, dict) @@ -1142,13 +1149,15 @@ def test_lm_task_dataloader_extra_space( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=10, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 10, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1192,13 +1201,15 @@ def test_lm_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 0, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': '', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1241,14 +1252,16 @@ def test_schema_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - 
example_delimiter='\n', - question_prelimiter=prelimiter, - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'example_delimiter': '\n', + 'prelimiter': prelimiter, + 'continuation_delimiter': '', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) @@ -1300,13 +1313,15 @@ def test_schema_task_dataloader_sentpiece_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) @@ -1358,13 +1373,15 @@ def test_lm_task_dataloader_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': '', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1410,13 +1427,15 @@ def test_mc_task_dataloader_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1473,13 +1492,15 @@ def test_mc_split_batch( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1550,13 +1571,15 @@ def test_qa_split_batch( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=8, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 0, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dl, DataSpec) # pyright @@ -1612,14 +1635,16 @@ def test_qa_task_dataloader_w_null_eos( dataset_uri, tokenizer, batch_size, 
- max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter='\nA:', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': prompt_string, + 'example_delimiter': '\n', + 'prelimiter': 'Q: ', + 'continuation_delimiter': '\nA:', + }, ) @@ -1647,14 +1672,16 @@ def test_qa_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter='\nA:', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': prompt_string, + 'example_delimiter': '\n', + 'prelimiter': 'Q: ', + 'continuation_delimiter': '\nA:', + }, ) assert isinstance(dl, DataSpec) @@ -1714,15 +1741,17 @@ def test_qa_task_with_cot_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - question_prelimiter='Q: ', - continuation_delimiter="\nA: Let's think step by step. ", - cot_delimiter=' #### ', destination_path=str(tmp_path / f'icl_{num_fewshot}.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'prelimiter': 'Q: ', + 'continuation_delimiter': "\nA: Let's think step by step. 
", + 'cot_delimiter': ' #### ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1779,14 +1808,16 @@ def test_mc_task_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - question_prelimiter=prelimiter, - example_delimiter=example_delimiter, - continuation_delimiter='\nA: ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'prelimiter': prelimiter, + 'example_delimiter': example_delimiter, + 'continuation_delimiter': '\nA: ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -1851,13 +1882,15 @@ def test_lm_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter='', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': '', + }, ) evaluator = Evaluator( @@ -1903,13 +1936,15 @@ def test_schema_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -1968,14 +2003,16 @@ def test_mc_task_evaluation_subcategories( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=max_seq_len, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), has_categories=True, + kwargs={ + 'max_seq_len': max_seq_len, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) assert isinstance(dls, dict) @@ -2039,13 +2076,15 @@ def test_mc_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=64, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 64, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -2107,13 +2146,15 @@ def test_qa_task_evaluation_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 
num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -2168,14 +2209,16 @@ def test_qa_task_evaluation_with_cot_opt_tokenizer( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter="A: Let's think step by step. ", - cot_delimiter=' #### ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': "A: Let's think step by step. ", + 'cot_delimiter': ' #### ', + }, ) evaluator = Evaluator( @@ -2228,13 +2271,15 @@ def test_qa_task_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=': ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ': ', + }, ) evaluator = Evaluator( @@ -2288,14 +2333,16 @@ def test_qa_task_with_cot_evaluation( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=1024, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string='', - example_delimiter='\n', - continuation_delimiter="A: Let's think step by step", - cot_delimiter=' #### ', destination_path=str(Path(gathered_paths[0]) / 'icl.jsonl'), + kwargs={ + 'max_seq_len': 1024, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': "A: Let's think step by step", + 'cot_delimiter': ' #### ', + }, ) evaluator = Evaluator( @@ -2339,13 +2386,15 @@ def test_lm_spacing_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=1, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' UNIQUE ', destination_path=str(tmp_path / 'icl.jsonl'), + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 1, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' UNIQUE ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -2409,15 +2458,17 @@ def test_hf_dataloading_lm_dataloader( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - pad_tok_id=tokenizer.eos_token_id, - num_fewshot=0, - prompt_string='', - example_delimiter='\n', - continuation_delimiter=' ', destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': 0, + 'prompt_string': '', + 'example_delimiter': '\n', + 'continuation_delimiter': ' ', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright @@ -2490,16 +2541,18 @@ def test_hf_dataloading_custom_parsing( dataset_uri=dataset_uri, tokenizer=tokenizer, batch_size=batch_size, - max_seq_len=seqlen, - 
pad_tok_id=tokenizer.eos_token_id, - num_fewshot=num_fewshot, - prompt_string=prompt_string, - example_delimiter='\n', - question_prelimiter='Orbs: ', - continuation_delimiter='\nSpell:', destination_path=str(tmp_path / 'test_dataset_lm_juggernaut.jsonl'), hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, + kwargs={ + 'max_seq_len': seqlen, + 'pad_tok_id': tokenizer.eos_token_id, + 'num_fewshot': num_fewshot, + 'prompt_string': prompt_string, + 'example_delimiter': '\n', + 'prelimiter': 'Orbs: ', + 'continuation_delimiter': '\nSpell:', + }, ) assert isinstance(dl, DataSpec) assert isinstance(dl.dataloader, DataLoader) # pyright diff --git a/tests/test_registry.py b/tests/test_registry.py index 87881450d4..3bdf5a800f 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -42,6 +42,7 @@ def test_expected_registries_exist(): 'attention_classes', 'attention_implementations', 'fcs', + 'icl_datasets', } assert existing_registries == expected_registry_names
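
For readers skimming the diff, a minimal end-to-end sketch of how the refactored ICL evaluation API in this patch series is meant to be used: dataset classes are now resolved through the new `icl_datasets` registry, and all dataset-constructor arguments travel through a single `kwargs` dict instead of individual named parameters on `get_icl_task_dataloader`. This is a sketch, not part of the patch itself; the registry key `my_lm_task`, the data paths, and the exact import paths are assumptions for illustration, while the call shape mirrors the updated tests above.

# Hypothetical usage sketch; names flagged in comments are made up for illustration.
from transformers import AutoTokenizer

from llmfoundry import registry  # assumed import path for the registry module
from llmfoundry.eval.datasets import (  # assumed import path for the eval datasets
    InContextLearningLMTaskDataset,
    get_icl_task_dataloader,
)

# New in this patch: ICL dataset classes are looked up via the `icl_datasets`
# registry rather than a hard-coded if/elif chain in build_icl_dataloader.
# 'my_lm_task' is a hypothetical registry key; the built-in task types are
# assumed to be registered elsewhere in the package.
registry.icl_datasets.register('my_lm_task', func=InContextLearningLMTaskDataset)

tokenizer = AutoTokenizer.from_pretrained('gpt2')

dl = get_icl_task_dataloader(
    icl_task_type='my_lm_task',          # registry key selects the dataset class
    dataset_uri='./my_eval_data.jsonl',  # hypothetical local dataset file
    tokenizer=tokenizer,
    batch_size=8,
    destination_path='/tmp/icl.jsonl',
    # Constructor arguments for the dataset class now go in this dict; per the
    # patch, build_icl_dataloader adds dataset_uri, tokenizer, hf_loading_vars,
    # hf_parsing_map, and destination_path to it before constructing the
    # dataset from the registry.
    kwargs={
        'max_seq_len': 1024,
        'pad_tok_id': tokenizer.eos_token_id,
        'num_fewshot': 0,
        'prompt_string': '',
        'example_delimiter': '\n',
        'continuation_delimiter': ' ',
    },
)

Two related behaviors from the patch matter when registering a custom class: the builder now calls `dataset.get_effective_batch_size(batch_size)` to size the `DataLoader` (the multiple-choice and schema datasets override it to return `max(num_choices, batch_size) // num_choices`), and it always wires `dataset.split_batch` into the returned `DataSpec`, so any registered dataset is expected to provide both.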