diff --git a/.github/workflows/pr-cpu.yaml b/.github/workflows/pr-cpu.yaml index 2c85719756..13019a83d4 100644 --- a/.github/workflows/pr-cpu.yaml +++ b/.github/workflows/pr-cpu.yaml @@ -29,7 +29,7 @@ jobs: - name: Checkout code uses: actions/checkout@v2 - name: Run PR CPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.0 + uses: mosaicml/ci-testing/.github/actions/pytest-cpu@v0.1.2 with: name: ${{ matrix.name }} container: ${{ matrix.container }} diff --git a/.github/workflows/pr-gpu.yaml b/.github/workflows/pr-gpu.yaml index ba1a4f9ba4..947d78ae95 100644 --- a/.github/workflows/pr-gpu.yaml +++ b/.github/workflows/pr-gpu.yaml @@ -27,10 +27,10 @@ jobs: markers: "gpu" pip_deps: "[all]" pytest_command: "coverage run -m pytest" - ci_repo_gpu_test_ref: v0.1.0 + ci_repo_gpu_test_ref: v0.1.2 steps: - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2 with: container: ${{ matrix.container }} git_repo: mosaicml/llm-foundry @@ -56,10 +56,10 @@ jobs: markers: "gpu" pip_deps: "[all]" pytest_command: "coverage run -m pytest" - ci_repo_gpu_test_ref: v0.1.0 + ci_repo_gpu_test_ref: v0.1.2 steps: - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2 with: container: ${{ matrix.container }} git_repo: mosaicml/llm-foundry @@ -85,10 +85,10 @@ jobs: markers: "gpu" pip_deps: "[all]" pytest_command: "coverage run -m pytest" - ci_repo_gpu_test_ref: v0.1.0 + ci_repo_gpu_test_ref: v0.1.2 steps: - name: Run PR GPU Tests - uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.0 + uses: mosaicml/ci-testing/.github/actions/pytest-gpu@v0.1.2 with: container: ${{ matrix.container }} git_repo: mosaicml/llm-foundry diff --git a/llmfoundry/_version.py b/llmfoundry/_version.py index 4c11746b43..2f1f590b19 100644 --- a/llmfoundry/_version.py +++ b/llmfoundry/_version.py @@ -3,4 +3,4 @@ """The LLM Foundry Version.""" -__version__ = '0.11.0.dev' +__version__ = '0.12.0.dev0' diff --git a/llmfoundry/callbacks/__init__.py b/llmfoundry/callbacks/__init__.py index 496e905e13..8c86dda2a6 100644 --- a/llmfoundry/callbacks/__init__.py +++ b/llmfoundry/callbacks/__init__.py @@ -16,6 +16,7 @@ from llmfoundry.callbacks.async_eval_callback import AsyncEval from llmfoundry.callbacks.curriculum_learning_callback import CurriculumLearning +from llmfoundry.callbacks.env_logging_callback import EnvironmentLoggingCallback from llmfoundry.callbacks.eval_gauntlet_callback import EvalGauntlet from llmfoundry.callbacks.eval_output_logging_callback import EvalOutputLogging from llmfoundry.callbacks.fdiff_callback import FDiffMetrics @@ -55,8 +56,8 @@ callbacks.register('eval_output_logging', func=EvalOutputLogging) callbacks.register('mbmoe_tok_per_expert', func=MegaBlocksMoE_TokPerExpert) callbacks.register('run_timeout', func=RunTimeoutCallback) - callbacks.register('loss_perp_v_len', func=LossPerpVsContextLengthLogger) +callbacks.register('env_logger', func=EnvironmentLoggingCallback) callbacks_with_config.register('async_eval', func=AsyncEval) callbacks_with_config.register('curriculum_learning', func=CurriculumLearning) diff --git a/llmfoundry/callbacks/async_eval_callback.py b/llmfoundry/callbacks/async_eval_callback.py index 1b3c31e861..dcc8dcffca 100644 --- a/llmfoundry/callbacks/async_eval_callback.py +++ b/llmfoundry/callbacks/async_eval_callback.py @@ -11,7 +11,7 @@ import warnings from collections import Counter from pathlib 
import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Optional, Union from composer.callbacks import CheckpointSaver from composer.core import Event, State, Time, Timestamp, TimeUnit @@ -84,10 +84,10 @@ def get_run_name(training_run_name: str, current_interval: str) -> str: def get_eval_parameters( - parameters: Dict[str, Any], + parameters: dict[str, Any], checkpoint: str, training_run_name: str, -) -> Dict[str, Any]: +) -> dict[str, Any]: """Get the parameters needed for the eval run. Args: @@ -164,8 +164,8 @@ def validate_interval( def validate_eval_run_config( - eval_run_config: Optional[Dict[str, Any]], -) -> Dict[str, Any]: + eval_run_config: Optional[dict[str, Any]], +) -> dict[str, Any]: if not eval_run_config: return {} @@ -220,9 +220,9 @@ class AsyncEval(CallbackWithConfig): def __init__( self, - train_config: Dict[str, Any], + train_config: dict[str, Any], interval: Union[str, int, Time], - eval_run_config: Optional[Dict[str, Any]] = None, + eval_run_config: Optional[dict[str, Any]] = None, ): # Run these during init to fail fast in any of the error cases @@ -263,7 +263,7 @@ def __init__( # Keep track of checkpoints that have already been evaled # Format: {eval_timestamp: (checkpoint, run_name)} - self.checkpoints_evaled: Dict[Time, Tuple[str, str]] = {} + self.checkpoints_evaled: dict[Time, tuple[str, str]] = {} # Scheduling is based on the check interval, while _get_checkpoints_and_launch_runs # will only launch runs at the interval @@ -279,7 +279,7 @@ def __init__( f'interval {interval}, checking at {self.check_interval}', ) - def state_dict(self) -> Dict[str, Any]: + def state_dict(self) -> dict[str, Any]: checkpoints_evaled = [] for eval_ts, (checkpoint, run_name) in self.checkpoints_evaled.items(): eval_ts_dict = { @@ -292,7 +292,7 @@ def state_dict(self) -> Dict[str, Any]: 'checkpoints_evaled': checkpoints_evaled, } - def load_state_dict(self, state_dict: Dict[str, Any]): + def load_state_dict(self, state_dict: dict[str, Any]): previous_checkpoints_evaled = state_dict.get('checkpoints_evaled', []) if previous_checkpoints_evaled: for (eval_ts, checkpoint, run_name) in previous_checkpoints_evaled: @@ -305,9 +305,9 @@ def load_state_dict(self, state_dict: Dict[str, Any]): @staticmethod def _get_ready_sharded_checkpoints( - checkpointer_checkpoints: Dict[str, Timestamp], - remote_files: List[str], - ) -> Dict[str, Timestamp]: + checkpointer_checkpoints: dict[str, Timestamp], + remote_files: list[str], + ) -> dict[str, Timestamp]: """Identify checkpoints ready to be evaled based on remote files. This has special logic for sharded checkpoints to consider checkpoints composed @@ -349,9 +349,9 @@ def _get_ready_sharded_checkpoints( @staticmethod def _get_ready_single_checkpoints( - checkpointer_checkpoints: Dict[str, Timestamp], - remote_checkpoints: List[str], - ) -> Dict[str, Timestamp]: + checkpointer_checkpoints: dict[str, Timestamp], + remote_checkpoints: list[str], + ) -> dict[str, Timestamp]: """Identify checkpoints ready to be evaled based on remote checkpoints. 
This is much simpler than the sharded case, because there is only one file diff --git a/llmfoundry/callbacks/env_logging_callback.py b/llmfoundry/callbacks/env_logging_callback.py new file mode 100644 index 0000000000..a192390976 --- /dev/null +++ b/llmfoundry/callbacks/env_logging_callback.py @@ -0,0 +1,188 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import os +import platform +import socket +from typing import Any, Optional + +import git +import pkg_resources +import psutil +import torch +from composer.core import Callback, State +from composer.loggers import Logger +from composer.utils import dist + +from mcli import sdk + +__all__ = ['EnvironmentLoggingCallback'] + +_PACKAGES_TO_LOG = [ + 'llm-foundry', + 'mosaicml', + 'megablocks', + 'grouped-gemm', + 'torch', + 'flash_attn', + 'transformers', + 'datasets', + 'peft', +] + + +class EnvironmentLoggingCallback(Callback): + """A callback for logging environment information during model training. + + This callback collects various pieces of information about the training environment, + including git repository details, package versions, system information, GPU details, + distributed training setup, NVIDIA driver information, and Docker container details. + + Args: + workspace_dir (str): The directory containing the workspace. Defaults to '/workspace'. + log_git (bool): Whether to log git repository information. Defaults to True. + log_packages (bool): Whether to log package versions. Defaults to True. + log_nvidia (bool): Whether to log NVIDIA driver information. Defaults to True. + log_docker (bool): Whether to log Docker container information. Defaults to True. + log_system (bool): Whether to log system information. Defaults to False. + log_gpu (bool): Whether to log GPU information. Defaults to False. + log_distributed (bool): Whether to log distributed training information. Defaults to False. + packages_to_log (list[str]): A list of package names to log versions for. Defaults to None. + + The collected information is logged as hyperparameters at the start of model fitting. 
+ """ + + def __init__( + self, + workspace_dir: str = '/workspace', + log_git: bool = True, + log_nvidia: bool = True, + log_docker: bool = True, + log_packages: bool = True, + log_system: bool = False, + log_gpu: bool = False, + log_distributed: bool = False, + packages_to_log: Optional[list[str]] = None, + ): + self.workspace_dir = workspace_dir + self.log_git = log_git + self.log_packages = log_packages + self.log_nvidia = log_nvidia + self.log_docker = log_docker + self.log_system = log_system + self.log_gpu = log_gpu + self.log_distributed = log_distributed + self.env_data: dict[str, Any] = {} + self.packages_to_log = packages_to_log or _PACKAGES_TO_LOG + + def _get_git_info(self, repo_path: str) -> Optional[dict[str, str]]: + if not os.path.isdir(repo_path): + return None + try: + repo = git.Repo(repo_path) + return { + 'commit_hash': repo.head.commit.hexsha, + 'branch': repo.active_branch.name, + } + except (git.InvalidGitRepositoryError, git.NoSuchPathError): + return None + + def _get_package_version(self, package_name: str) -> Optional[str]: + try: + return pkg_resources.get_distribution(package_name).version + except pkg_resources.DistributionNotFound: + return None + + def _get_system_info(self) -> dict[str, Any]: + return { + 'python_version': platform.python_version(), + 'os': f'{platform.system()} {platform.release()}', + 'hostname': socket.gethostname(), + 'cpu_info': { + 'model': platform.processor(), + 'cores': psutil.cpu_count(logical=False), + 'threads': psutil.cpu_count(logical=True), + }, + 'memory': { + 'total': psutil.virtual_memory().total, + 'available': psutil.virtual_memory().available, + }, + } + + def _get_gpu_info(self) -> dict[str, Any]: + if torch.cuda.is_available(): + return { + 'model': torch.cuda.get_device_name(0), + 'count': torch.cuda.device_count(), + 'memory': { + 'total': torch.cuda.get_device_properties(0).total_memory, + 'allocated': torch.cuda.memory_allocated(0), + }, + } + return {'available': False} + + def _get_nvidia_info(self) -> dict[str, Any]: + if torch.cuda.is_available(): + nccl_version = torch.cuda.nccl.version() # type: ignore + return { + 'cuda_version': + torch.version.cuda, # type: ignore[attr-defined] + 'cudnn_version': str( + torch.backends.cudnn.version(), + ), # type: ignore[attr-defined] + 'nccl_version': '.'.join(map(str, nccl_version)), + } + return {'available': False} + + def _get_distributed_info(self) -> dict[str, Any]: + return { + 'world_size': dist.get_world_size(), + 'local_world_size': dist.get_local_world_size(), + 'rank': dist.get_global_rank(), + 'local_rank': dist.get_local_rank(), + } + + def _get_docker_info(self) -> Optional[dict[str, Any]]: + if 'RUN_NAME' not in os.environ: + return None + run = sdk.get_run(os.environ['RUN_NAME']) + image, tag = run.image.split(':') + return { + 'image': image, + 'tag': tag, + } + + def fit_start(self, state: State, logger: Logger) -> None: + # Collect environment data + if self.log_git: + self.env_data['git_info'] = {} + for folder in os.listdir(self.workspace_dir): + path = self._get_git_info( + os.path.join(self.workspace_dir, folder), + ) + if path: + self.env_data['git_info'][folder] = path + + if self.log_packages: + self.env_data['package_versions'] = { + pkg: self._get_package_version(pkg) + for pkg in self.packages_to_log + } + if self.log_nvidia: + self.env_data['nvidia'] = self._get_nvidia_info() + + if self.log_docker: + if docker_info := self._get_docker_info(): + self.env_data['docker'] = docker_info + + if self.log_system: + self.env_data['system_info'] = 
self._get_system_info() + + if self.log_gpu: + self.env_data['gpu_info'] = self._get_gpu_info() + + if self.log_distributed: + self.env_data['distributed_info'] = self._get_distributed_info() + + # Log the collected data + logger.log_hyperparameters({'environment_data': self.env_data}) diff --git a/llmfoundry/callbacks/eval_gauntlet_callback.py b/llmfoundry/callbacks/eval_gauntlet_callback.py index 4d0f685ecd..35130f137f 100644 --- a/llmfoundry/callbacks/eval_gauntlet_callback.py +++ b/llmfoundry/callbacks/eval_gauntlet_callback.py @@ -6,7 +6,7 @@ import logging import math from enum import Enum -from typing import Dict, Optional +from typing import Optional from composer.core import Callback, State from composer.loggers import Logger @@ -23,8 +23,8 @@ class Weighting(Enum): def calculate_named_averages( - average_names: Dict[str, list], - category_scores: Dict[str, float], + average_names: dict[str, list], + category_scores: dict[str, float], ): """Calculates the named averages based off the raw category scores. @@ -144,7 +144,7 @@ def __init__( f'Found average name `{avg_name}` used as category name. Average names and category names must be non-overlapping.', ) - def extract_metrics_from_state(self, state: State) -> Dict[str, float]: + def extract_metrics_from_state(self, state: State) -> dict[str, float]: results = {} for key in self.logger_keys: @@ -169,7 +169,7 @@ def extract_metrics_from_state(self, state: State) -> Dict[str, float]: return {k: sum(v) / len(v) for k, v in results.items()} - def eval_after_all(self, state: State, logger: Logger) -> Dict[str, float]: + def eval_after_all(self, state: State, logger: Logger) -> dict[str, float]: computed_metrics = self.extract_metrics_from_state(state) if len(computed_metrics) == 0: return {} diff --git a/llmfoundry/callbacks/eval_output_logging_callback.py b/llmfoundry/callbacks/eval_output_logging_callback.py index b84ea063d1..1bdc3da3cd 100644 --- a/llmfoundry/callbacks/eval_output_logging_callback.py +++ b/llmfoundry/callbacks/eval_output_logging_callback.py @@ -5,7 +5,7 @@ import warnings from copy import deepcopy -from typing import Any, Dict, List, Optional, Sequence, Union +from typing import Any, Optional, Sequence, Union import torch from composer.core import Callback, State @@ -59,7 +59,7 @@ def init(self, state: State, logger: Logger) -> None: self.log_output_text = has_output_text def eval_batch_end(self, state: State, logger: Logger) -> None: - if not isinstance(state.batch, Dict): + if not isinstance(state.batch, dict): warnings.warn( f"""EvalOutputLogging only supports batches that are dictionary. \ Found batch for type {type(state.batch)}. 
\ @@ -69,8 +69,8 @@ def eval_batch_end(self, state: State, logger: Logger) -> None: assert state.outputs is not None assert state.metric_outputs is not None - logging_dict: Dict[str, - Union[List[Any], torch.Tensor, + logging_dict: dict[str, + Union[list[Any], torch.Tensor, Sequence[torch.Tensor]], ] = deepcopy( state.metric_outputs, diff --git a/llmfoundry/callbacks/hf_checkpointer.py b/llmfoundry/callbacks/hf_checkpointer.py index 79dc73de98..ea03df8978 100644 --- a/llmfoundry/callbacks/hf_checkpointer.py +++ b/llmfoundry/callbacks/hf_checkpointer.py @@ -12,7 +12,7 @@ import time from multiprocessing.context import SpawnProcess from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from typing import Any, Optional, Sequence, Union import numpy as np import torch @@ -249,7 +249,7 @@ def __init__( self.last_checkpoint_batch: Optional[Time] = None self.mlflow_loggers = [] - self.child_processes: List[SpawnProcess] = [] + self.child_processes: list[SpawnProcess] = [] # Temporary save directory used by child_processes. self.temp_save_dir = None @@ -349,7 +349,7 @@ def transform_model_and_tokenizer( self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, - ) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]: + ) -> tuple[PreTrainedModel, PreTrainedTokenizerBase]: """Transform the model and tokenizer before saving. This allows a subclass to modify the model and tokenizer before saving. The base class implementation will @@ -457,10 +457,10 @@ def _save_checkpoint(self, state: State, logger: Logger): # Add hook to move tensors to cpu to avoid CUDA OOM def tensor_hook( module: nn.Module, - state_dict: Dict[str, Any], + state_dict: dict[str, Any], prefix: str, *args: Any, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: dtensor_fqns = [] for fqn in state_dict.keys(): tensor = state_dict[fqn] @@ -612,7 +612,7 @@ def tensor_hook( # TODO: Remove after mlflow fixes the bug that makes this necessary import mlflow mlflow.store._unity_catalog.registry.rest_store.get_feature_dependencies = lambda *args, **kwargs: '' - model_saving_kwargs: Dict[str, Any] = { + model_saving_kwargs: dict[str, Any] = { 'path': local_save_path, } if self.using_peft: diff --git a/llmfoundry/callbacks/loss_perp_v_len_callback.py b/llmfoundry/callbacks/loss_perp_v_len_callback.py index ebb9583224..b402972198 100644 --- a/llmfoundry/callbacks/loss_perp_v_len_callback.py +++ b/llmfoundry/callbacks/loss_perp_v_len_callback.py @@ -1,7 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Mapping, Optional, Tuple +from typing import Any, Mapping, Optional import torch from composer.core import Callback, State @@ -150,7 +150,7 @@ def preprocess_metric_inputs( logits: torch.Tensor, seq_parallel_world_size: int, seq_parallel_rank: int, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: del sequence_id, seq_parallel_rank if seq_parallel_world_size > 1: raise ValueError( @@ -315,7 +315,7 @@ def update( self.sum_perplexity_seq_id += torch.sum(perplexity, dim=(0, 1)) self.sum_length_seq_id += torch.sum(mask, dim=(0, 1)) - def compute(self) -> Dict[str, torch.Tensor]: + def compute(self) -> dict[str, torch.Tensor]: """Aggregate the state over all processes to compute the metric. 
Returns: diff --git a/llmfoundry/callbacks/resumption_callbacks.py b/llmfoundry/callbacks/resumption_callbacks.py index 509d1595bd..cc14d72442 100644 --- a/llmfoundry/callbacks/resumption_callbacks.py +++ b/llmfoundry/callbacks/resumption_callbacks.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import List from composer.core import Callback, State from composer.loggers import Logger @@ -69,7 +68,7 @@ class LayerFreezing(Callback): layer_names (float): Names of layers to freeze. """ - def __init__(self, layer_names: List[str]): + def __init__(self, layer_names: list[str]): self.layer_names = set(layer_names) def fit_start(self, state: State, logger: Logger) -> None: diff --git a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py index f9bbe6b0cf..0ea94ac687 100644 --- a/llmfoundry/command_utils/data_prep/convert_dataset_hf.py +++ b/llmfoundry/command_utils/data_prep/convert_dataset_hf.py @@ -7,7 +7,7 @@ import platform from dataclasses import dataclass, field from enum import Enum -from typing import Any, Dict, Iterable, Optional, Union +from typing import Any, Iterable, Optional, Union import datasets as hf_datasets import psutil @@ -39,7 +39,7 @@ class DataSplitConstants: class DatasetConstants: chars_per_sample: int chars_per_token: int - splits: Dict[str, DataSplitConstants] = field(default_factory=dict) + splits: dict[str, DataSplitConstants] = field(default_factory=dict) def __iter__(self): for v in self.splits.values(): @@ -273,7 +273,7 @@ def build_dataloader( def generate_samples( loader: DataLoader, truncate_num_samples: Optional[int] = None, -) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]: +) -> Iterable[Union[dict[str, bytes], dict[str, NDArray]]]: """Generator over samples of a dataloader. Args: diff --git a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py index 635efd54d4..50d11b1222 100644 --- a/llmfoundry/command_utils/data_prep/convert_delta_to_json.py +++ b/llmfoundry/command_utils/data_prep/convert_delta_to_json.py @@ -8,7 +8,7 @@ import urllib.parse from collections import namedtuple from concurrent.futures import ProcessPoolExecutor -from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Iterable, Optional, Union from uuid import uuid4 import google.protobuf.any_pb2 as any_pb2 @@ -70,7 +70,7 @@ def to_cf(self: 'SparkConnectClient', plan: 'pb2.Plan', - type: str = 'json') -> Tuple[List[Result], int, bool]: + type: str = 'json') -> tuple[list[Result], int, bool]: """Executes the query plans and return as presigned URLS for cloud fetch. It can handle the current output formats that are supported by the server. @@ -163,7 +163,7 @@ def to_cf(self: 'SparkConnectClient', def collect_as_cf(self: 'DataFrame', - type: str = 'json') -> Tuple[List[Result], int, bool]: + type: str = 'json') -> tuple[list[Result], int, bool]: """Collects DataFrame execution plan as presigned URLs. This method is a wrapper around the `to_cf` method of SparkConnectClient. It takes the @@ -213,7 +213,7 @@ def run_query( cursor: Optional['Cursor'] = None, spark: Optional['SparkSession'] = None, collect: bool = True, -) -> Optional[Union[List['Row'], 'DataFrame', 'SparkDataFrame']]: +) -> Optional[Union[list['Row'], 'DataFrame', 'SparkDataFrame']]: """Run SQL query via databricks-connect or databricks-sql. 
Args: @@ -240,7 +240,7 @@ def run_query( raise ValueError(f'Unrecognized method: {method}') -def get_args(signed: List, json_output_folder: str, columns: List) -> Iterable: +def get_args(signed: list, json_output_folder: str, columns: list) -> Iterable: for i, r in enumerate(signed): yield (i, r.url, json_output_folder, columns) @@ -249,7 +249,7 @@ def download( ipart: int, url: str, json_output_folder: str, - columns: Optional[List] = None, + columns: Optional[list] = None, resp_format: str = 'arrow', compressed: bool = False, ) -> None: @@ -299,7 +299,7 @@ def download( ) -def download_starargs(args: Tuple) -> None: +def download_starargs(args: tuple) -> None: return download(*args) diff --git a/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py index 94cd79815b..1cbd47fb45 100644 --- a/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py +++ b/llmfoundry/command_utils/data_prep/convert_finetuning_dataset.py @@ -5,7 +5,7 @@ import os import platform import warnings -from typing import Any, Callable, Dict, Iterable, Optional, Union +from typing import Any, Callable, Iterable, Optional, Union import datasets as hf_datasets import psutil @@ -63,7 +63,7 @@ def build_dataloader( def generate_samples( loader: DataLoader, truncate_num_samples: Optional[int] = None, -) -> Iterable[Dict[str, bytes]]: +) -> Iterable[dict[str, bytes]]: """Generator over samples of a dataloader. Args: diff --git a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py index 94bdc16526..7c40a7e698 100644 --- a/llmfoundry/command_utils/data_prep/convert_text_to_mds.py +++ b/llmfoundry/command_utils/data_prep/convert_text_to_mds.py @@ -9,7 +9,7 @@ from concurrent.futures import ProcessPoolExecutor from functools import partial from glob import glob -from typing import Dict, Iterable, List, Optional, Tuple, cast +from typing import Iterable, Optional, cast import numpy as np from composer.utils import ( @@ -29,6 +29,7 @@ merge_shard_groups, ) from llmfoundry.utils.exceptions import ( + CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, @@ -60,7 +61,7 @@ def __init__( super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) log.info(f'Initialized ConcatTokensFromFilesDataset.') - def __iter__(self) -> Iterable[Dict[str, NDArray]]: + def __iter__(self) -> Iterable[dict[str, NDArray]]: log.info( 'Starting iteration over files in ConcatTokensFromFilesDataset', ) @@ -71,31 +72,35 @@ def __iter__(self) -> Iterable[Dict[str, NDArray]]: buffer += self.bos_tokens first_chunk = True # Read the file in 1MB chunks to avoid memory issues - for chunk in iter(partial(f.read, 1000000), ''): - # Tokenize the chunk - encoded = self.tokenizer( - chunk, - truncation=False, - padding=False, - ) - iids = encoded['input_ids'] - - # If this is not the first chunk, remove the BOS token - if not first_chunk: - if iids[0] == self.tokenizer.bos_token_id: - iids = iids[1:] - - # Add the tokens to the buffer - buffer += iids - while len(buffer) >= self.max_length: - concat_sample = buffer[:self.max_length] - buffer = buffer[self. 
- max_length:] if self.should_wrap else [] - yield { - 'tokens': np.asarray(concat_sample, dtype=np.int32), - } - - first_chunk = False + try: + for chunk in iter(partial(f.read, 1000000), ''): + # Tokenize the chunk + encoded = self.tokenizer( + chunk, + truncation=False, + padding=False, + ) + iids = encoded['input_ids'] + + # If this is not the first chunk, remove the BOS token + if not first_chunk: + if iids[0] == self.tokenizer.bos_token_id: + iids = iids[1:] + + # Add the tokens to the buffer + buffer += iids + while len(buffer) >= self.max_length: + concat_sample = buffer[:self.max_length] + buffer = buffer[self.max_length: + ] if self.should_wrap else [] + yield { + 'tokens': + np.asarray(concat_sample, dtype=np.int32), + } + + first_chunk = False + except UnicodeDecodeError: + raise CannotUnicodeDecodeFile(text_file=file) # Add the EOS token to the buffer to separate files. buffer += self.eos_tokens @@ -111,7 +116,7 @@ def __iter__(self) -> Iterable[Dict[str, NDArray]]: ) -def get_object_names(input_folder: str) -> List[str]: +def get_object_names(input_folder: str) -> list[str]: """Get object names from a local or remote folder. Args: @@ -138,7 +143,7 @@ def get_object_names(input_folder: str) -> List[str]: def get_task_args( - object_names: List[str], + object_names: list[str], output_root: str, input_folder: str, n_groups: int, @@ -191,7 +196,7 @@ def get_task_args( ) -def download_and_convert_starargs(args: Tuple): +def download_and_convert_starargs(args: tuple): """Helper function to call download_and_convert with star args. This helps us use download_and_convert with multiprocessing. @@ -200,7 +205,7 @@ def download_and_convert_starargs(args: Tuple): def download_and_convert( - file_names: List[str], + file_names: list[str], output_folder: str, input_folder: str, tokenizer_name: str, @@ -282,7 +287,7 @@ def is_remote_path(path: str) -> bool: def is_already_processed( output_root: str, args_str: str, - object_names: List[str], + object_names: list[str], ) -> bool: """Determines whether a group of text files has already been processed. @@ -349,7 +354,7 @@ def is_already_processed( return True -def write_done_file(folder: str, args_str: str, object_names: List[str]): +def write_done_file(folder: str, args_str: str, object_names: list[str]): """Write a file to signify completion. 
This the done file includes the arguments to processing and diff --git a/llmfoundry/command_utils/eval.py b/llmfoundry/command_utils/eval.py index bddd592dba..f622ca182d 100644 --- a/llmfoundry/command_utils/eval.py +++ b/llmfoundry/command_utils/eval.py @@ -4,7 +4,7 @@ import logging import os import time -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Optional, Union import pandas as pd import torch @@ -41,27 +41,27 @@ def evaluate_model( - tokenizer: Dict[str, Any], + tokenizer: dict[str, Any], model_name: str, - model: Dict[str, Any], + model: dict[str, Any], dist_timeout: Union[float, int], run_name: str, seed: int, - icl_tasks: Union[str, list[Dict[str, Any]]], + icl_tasks: Union[str, list[dict[str, Any]]], max_seq_len: int, device_eval_batch_size: Union[int, float], - eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], - eval_loader_config: Optional[Union[Dict[str, Any], list[Dict[str, Any]]]], - fsdp_config: Optional[Dict[str, Any]], + eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], + eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], + fsdp_config: Optional[dict[str, Any]], loggers: list[LoggerDestination], python_log_level: Optional[str], precision: str, eval_gauntlet_df: Optional[pd.DataFrame], eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], - callback_configs: Optional[Dict[str, Any]], - metadata: Optional[Dict[str, str]], - logged_config: Dict[str, Any], + callback_configs: Optional[dict[str, Any]], + metadata: Optional[dict[str, str]], + logged_config: dict[str, Any], should_log_config: bool = True, load_path: Optional[str] = None, ): @@ -157,7 +157,7 @@ def evaluate_model( if should_log_config: log.info('Evaluation config:') - log_config(logged_config) + log_config(trainer.logger, logged_config) log.info(f'Starting eval for {model_name}...') if torch.cuda.is_available(): @@ -175,7 +175,7 @@ def evaluate_model( return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) -def allow_toplevel_keys(cfg: Dict[str, Any]) -> Dict[str, Any]: +def allow_toplevel_keys(cfg: dict[str, Any]) -> dict[str, Any]: """Transform the config to allow top-level keys for model configuration. This function allows users to use the 'train.py' syntax in 'eval.py'. @@ -223,7 +223,7 @@ def allow_toplevel_keys(cfg: Dict[str, Any]) -> Dict[str, Any]: return cfg -def evaluate(cfg: DictConfig) -> Tuple[list[Trainer], pd.DataFrame]: +def evaluate(cfg: DictConfig) -> tuple[list[Trainer], pd.DataFrame]: # Run user provided code if specified for code_path in cfg.get('code_paths', []): import_file(code_path) @@ -388,7 +388,7 @@ def evaluate(cfg: DictConfig) -> Tuple[list[Trainer], pd.DataFrame]: def calculate_markdown_results( logger_keys: list[str], trainer: Trainer, - benchmark_to_taxonomy: Dict[str, str], + benchmark_to_taxonomy: dict[str, str], model_name: str, ): results = {} @@ -483,7 +483,7 @@ def calculate_markdown_results( def eval_from_yaml( yaml_path: str, args_list: Optional[list[str]], -) -> Tuple[list[Trainer], pd.DataFrame]: +) -> tuple[list[Trainer], pd.DataFrame]: """Run the evaluation with optional overrides from CLI.""" # Load yaml and CLI arguments. 
om.clear_resolver('oc.env') diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py index c925e6e586..8fac739544 100644 --- a/llmfoundry/command_utils/train.py +++ b/llmfoundry/command_utils/train.py @@ -5,7 +5,7 @@ import os import time import warnings -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import torch import torch.distributed @@ -153,7 +153,7 @@ def validate_config(train_config: TrainConfig): ) -def _log_num_params(model: ComposerModel, logged_cfg: Dict[str, Any]): +def _log_num_params(model: ComposerModel, logged_cfg: dict[str, Any]): # Log number of parameters if hasattr(model, 'n_total_params'): n_params = model.n_total_params @@ -256,7 +256,7 @@ def train(cfg: DictConfig) -> Trainer: train_loader_config = train_cfg.train_loader # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config + fsdp_config: Optional[dict[str, Any]] = train_cfg.fsdp_config if fsdp_config is not None: if 'load_planner' in fsdp_config: @@ -368,15 +368,15 @@ def train(cfg: DictConfig) -> Trainer: profiler: Optional[Profiler] = None profiler_cfg = train_cfg.profiler if profiler_cfg: - profiler_schedule_cfg: Dict = pop_config( + profiler_schedule_cfg: dict = pop_config( profiler_cfg, 'schedule', must_exist=True, ) profiler_schedule = cyclic_schedule(**profiler_schedule_cfg) # Only support json trace handler - profiler_trace_handlers: List[TraceHandler] = [] - profiler_trace_cfg: Optional[Dict] = pop_config( + profiler_trace_handlers: list[TraceHandler] = [] + profiler_trace_cfg: Optional[dict] = pop_config( profiler_cfg, 'json_trace_handler', must_exist=False, @@ -395,7 +395,7 @@ def train(cfg: DictConfig) -> Trainer: callback_configs = train_cfg.callbacks or {} # Callbacks - callbacks: List[Callback] = [ + callbacks: list[Callback] = [ build_callback( name=str(name), kwargs=callback_cfg, @@ -573,7 +573,7 @@ def train(cfg: DictConfig) -> Trainer: if train_cfg.log_config: log.info('Logging config') - log_config(logged_cfg) + log_config(trainer.logger, logged_cfg) log_dataset_uri(logged_cfg) torch.cuda.empty_cache() gc.collect() @@ -591,7 +591,7 @@ def train(cfg: DictConfig) -> Trainer: def train_from_yaml( yaml_path: str, - args_list: Optional[List[str]] = None, + args_list: Optional[list[str]] = None, ) -> Trainer: """Run the training with optional overrides from CLI.""" # Load yaml and CLI arguments. 
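Note on the new callback wired up above: the sketch below is not part of this diff; it shows how EnvironmentLoggingCallback could be attached to a Composer Trainer directly, mirroring what build_callback does when 'env_logger' appears in a run's callbacks config. my_composer_model and my_train_dataloader are hypothetical placeholders.

from composer import Trainer
from llmfoundry.callbacks import EnvironmentLoggingCallback

# Defaults collect git, package, NVIDIA, and Docker info; system, GPU, and
# distributed details are opt-in flags.
env_logger = EnvironmentLoggingCallback(
    log_system=True,
    log_gpu=True,
    packages_to_log=['torch', 'transformers'],  # override the default package list
)

trainer = Trainer(
    model=my_composer_model,               # hypothetical ComposerModel
    train_dataloader=my_train_dataloader,  # hypothetical dataloader
    callbacks=[env_logger],
    max_duration='10ba',
)
# On fit_start the callback logs {'environment_data': {...}} via
# logger.log_hyperparameters, as implemented in env_logging_callback.py above.
trainer.fit()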
diff --git a/llmfoundry/data/data.py b/llmfoundry/data/data.py index bde68a6998..17b28e1173 100644 --- a/llmfoundry/data/data.py +++ b/llmfoundry/data/data.py @@ -5,7 +5,7 @@ import os import warnings from abc import ABC, abstractmethod -from typing import Dict, Iterable, Optional, Union +from typing import Iterable, Optional, Union import datasets as hf_datasets import numpy as np @@ -45,7 +45,7 @@ def __init__( ): self.hf_dataset = hf_dataset - def __iter__(self) -> Iterable[Dict[str, bytes]]: + def __iter__(self) -> Iterable[dict[str, bytes]]: for sample in self.hf_dataset: # convert to bytes to store in MDS binary format yield {'text': sample['text'].encode('utf-8')} @@ -112,7 +112,7 @@ def __init__( ) @abstractmethod - def __iter__(self) -> Iterable[Union[Dict[str, bytes], Dict[str, NDArray]]]: + def __iter__(self) -> Iterable[Union[dict[str, bytes], dict[str, NDArray]]]: pass @@ -151,7 +151,7 @@ def __init__( self.hf_dataset = hf_dataset super().__init__(tokenizer, max_length, bos_text, eos_text, no_wrap) - def __iter__(self) -> Iterable[Dict[str, NDArray]]: + def __iter__(self) -> Iterable[dict[str, NDArray]]: buffer = [] for sample in self.hf_dataset: encoded = self.tokenizer( diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index e7521bc343..7211b9c528 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -3,7 +3,7 @@ """Dataloader builder utilities.""" -from typing import Any, Dict, Union +from typing import Any, Union from composer import DataSpec from transformers import PreTrainedTokenizerBase @@ -17,7 +17,7 @@ def build_dataloader( - cfg: Dict[str, Any], + cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: Union[int, float], ) -> DataSpec: @@ -30,7 +30,7 @@ def build_dataloader( that the dataloader will produce. 
""" name = cfg.pop('name') - kwargs: Dict[str, Any] = { + kwargs: dict[str, Any] = { **cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size, diff --git a/llmfoundry/data/finetuning/collator.py b/llmfoundry/data/finetuning/collator.py index 68ebb9d21d..b24afd163e 100644 --- a/llmfoundry/data/finetuning/collator.py +++ b/llmfoundry/data/finetuning/collator.py @@ -3,7 +3,7 @@ import logging import warnings -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import torch from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast @@ -17,10 +17,10 @@ # HuggingFace hardcodes the ignore index to -100 _HF_IGNORE_INDEX = -100 -TokenizedExample = Dict[str, List[Dict[str, List[int]]]] +TokenizedExample = dict[str, list[dict[str, list[int]]]] -def ensure_list(x: Union[List, torch.Tensor]) -> List: +def ensure_list(x: Union[list, torch.Tensor]) -> list: if isinstance(x, torch.Tensor): x = list(x.flatten()) assert isinstance(x, list) @@ -238,7 +238,7 @@ def __init__( target_responses: str = 'last', target_prompts: str = 'none', allow_pad_trimming: bool = False, - batch_metadata: Optional[Dict[str, Any]] = None, + batch_metadata: Optional[dict[str, Any]] = None, pad_to_longest: bool = False, ): self.tokenizer = tokenizer @@ -300,7 +300,7 @@ def __init__( self._warned_target = False def __call__(self, - examples: List[TokenizedExample]) -> Dict[str, torch.Tensor]: + examples: list[TokenizedExample]) -> dict[str, torch.Tensor]: for check_key in ['input_ids', 'labels']: if check_key not in examples[0]['turns'][0]: raise KeyError( @@ -323,8 +323,8 @@ def __call__(self, def _process_and_batch_decoder_only( self, - examples: List[TokenizedExample], - ) -> Dict[str, torch.Tensor]: + examples: list[TokenizedExample], + ) -> dict[str, torch.Tensor]: # Steps explained in comments processed_examples = [] input_ids_and_labels = [ @@ -422,8 +422,8 @@ def _process_and_batch_decoder_only( def _process_and_batch_encoder_decoder( self, - examples: List[TokenizedExample], - ) -> Dict[str, torch.Tensor]: + examples: list[TokenizedExample], + ) -> dict[str, torch.Tensor]: # The encoder-decoder case is has some gotchas. # Steps are explained in comments. 
processed_examples = [] diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 6aecadb6bb..69051a2d51 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -3,7 +3,7 @@ import inspect import logging import os -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Optional, Union import torch from composer.core.data_spec import DataSpec @@ -57,7 +57,7 @@ def build_finetuning_dataloader( tokenizer: PreTrainedTokenizerBase, device_batch_size: Union[int, float], - dataset: Dict[str, Any], + dataset: dict[str, Any], num_workers: int, drop_last: bool = False, pin_memory: bool = True, @@ -369,14 +369,14 @@ def _validate_config( hf_name: Optional[str] = None, local: Optional[str] = None, remote: Optional[str] = None, - hf_kwargs: Optional[Dict[str, Any]] = None, + hf_kwargs: Optional[dict[str, Any]] = None, preprocessing_fn: Optional[str] = None, safe_load: Optional[bool] = None, - streams: Optional[Dict[str, Any]] = None, + streams: Optional[dict[str, Any]] = None, target_prompts: Optional[str] = None, target_responses: Optional[str] = None, allowed_dataset_keys: set[str] = _ALLOWED_DATASET_KEYS, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> None: """Validates the dataset configuration. @@ -617,10 +617,10 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: def build_collate_fn( - dataloader_cfg: Dict[str, Any], + dataloader_cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: int, -) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: +) -> tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: # These `.get` calls are safe because the dataset_cfg is validated for extra keys dataset_cfg = dataloader_cfg['dataset'] target_responses = dataset_cfg.get( diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index e8175b4446..7aff80e50f 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -41,13 +41,9 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: from typing import ( Any, Callable, - Dict, - List, Literal, Optional, Sequence, - Tuple, - Type, Union, cast, ) @@ -122,10 +118,10 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: DEFAULT_TARGET_PROMPTS = 'none' PromptResponseDict = Mapping[str, str] -ChatFormattedDict = Mapping[str, List[Dict[str, str]]] +ChatFormattedDict = Mapping[str, list[dict[str, str]]] Example = Union[PromptResponseDict, ChatFormattedDict] ExampleType = Literal['prompt_response', 'chat'] -TokenizedExample = Dict[str, List[Dict[str, List[int]]]] +TokenizedExample = dict[str, list[dict[str, list[int]]]] def _get_example_type(example: Example) -> ExampleType: @@ -183,7 +179,7 @@ def _validate_chat_formatted_example(example: ChatFormattedDict): if not isinstance(example, Mapping): raise InvalidExampleTypeError(str(type(example))) messages = example[_get_key(example, ALLOWED_MESSAGES_KEYS)] - if not isinstance(messages, List): + if not isinstance(messages, list): raise InvalidMessageTypeError(str(type(messages))) if len(messages) <= 1: raise NotEnoughChatDataError() @@ -218,7 +214,7 @@ def _validate_chat_formatted_example(example: ChatFormattedDict): def _slice_chat_formatted_example( example: ChatFormattedDict, tokenizer: PreTrainedTokenizerBase, -) -> List[Tuple[str, str]]: +) -> list[tuple[str, str]]: """Slices chat example into a list of templated prompt, response turns. 
Note: Assistant messages mark the end of chat turns. So there are as many turns as there are @@ -246,9 +242,9 @@ def _slice_chat_formatted_example( ) def slice_out_last_turn( - messages_through_current_turn: List[Dict[str, str]], + messages_through_current_turn: list[dict[str, str]], conversation_through_previous_turn: str, - ) -> Tuple[str, str]: + ) -> tuple[str, str]: try: full_conversation = tokenizer.apply_chat_template( messages_through_current_turn, @@ -285,7 +281,7 @@ def slice_out_last_turn( response = full_conversation[len(prompt_with_history):] return prompt, response - templated_prompt_response_turns: List[Tuple[str, str]] = [] + templated_prompt_response_turns: list[tuple[str, str]] = [] conversation_through_previous_turn = '' for idx, message in enumerate(messages): if message['role'] == 'assistant': @@ -304,7 +300,7 @@ def _tokenize_with_bos_removal( tokenizer: PreTrainedTokenizerBase, text: str, text_target: str, -) -> Dict[str, List[int]]: +) -> dict[str, list[int]]: """Tokenizes the prompt and response using the provided tokenizer. Args: @@ -647,7 +643,7 @@ def __init__( self.packing_ratio = packing_ratio # How to process a sample - def __getitem__(self, idx: int) -> Dict[str, Any]: + def __getitem__(self, idx: int) -> dict[str, Any]: sample = super().__getitem__(idx) if 'turns' in sample: # Already tokenized in latest format @@ -677,7 +673,7 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: return tokenize_formatted_example(sample, tokenizer=self.tokenizer) def state_dict(self, num_samples: int, - from_beginning: bool) -> Dict[str, Any]: + from_beginning: bool) -> dict[str, Any]: if self.packing_ratio is not None: num_samples = int(self.packing_ratio * num_samples) @@ -690,7 +686,7 @@ def state_dict(self, num_samples: int, class DatasetConstructor: def __init__(self): - self._task_preprocessing_registry: Dict[str, Callable] = {} + self._task_preprocessing_registry: dict[str, Callable] = {} def register(self, *names: str) -> Callable[[Callable], Callable]: """Decorator for registering preprocessing functions.""" @@ -716,8 +712,8 @@ def print_registered_tasks(self) -> None: def get_preprocessing_fn_from_dict( self, - mapping: Dict[str, str], - ) -> Callable[[Dict[str, Any]], Example]: + mapping: dict[str, str], + ) -> Callable[[dict[str, Any]], Example]: """Get a preprocessing function from a dictionary. The dictionary maps column names in the dataset to "prompt" and "response". @@ -739,7 +735,7 @@ def get_preprocessing_fn_from_dict( ValueError: If the mapping does not have keys "prompt" and "response". """ - def _preprocessor(example: Dict[str, Any]) -> Dict[str, str]: + def _preprocessor(example: dict[str, Any]) -> dict[str, str]: if list(mapping.keys()) != ['prompt', 'response']: raise InvalidPromptResponseKeysError(mapping, example) return { @@ -753,7 +749,7 @@ def get_preprocessing_fn_from_str( self, preprocessor: Optional[str], dataset_name: Optional[str] = None, - ) -> Optional[Callable[[Dict[str, Any]], Example]]: + ) -> Optional[Callable[[dict[str, Any]], Example]]: """Get a preprocessing function from a string. String can be either a registered function or an import path. 
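To make the dict-based preprocessing path above concrete, here is a minimal sketch (not from this diff) of the behavior get_preprocessing_fn_from_dict provides: the mapping's 'prompt'/'response' values name the dataset columns to read, and the returned function renames them into the keys the finetuning collator expects. The column names below are hypothetical.

def make_prompt_response_preprocessor(mapping: dict[str, str]):
    # Equivalent in spirit to dataset_constructor.get_preprocessing_fn_from_dict
    def _preprocessor(example: dict) -> dict[str, str]:
        return {
            'prompt': example[mapping['prompt']],
            'response': example[mapping['response']],
        }
    return _preprocessor

preprocess = make_prompt_response_preprocessor({
    'prompt': 'question',   # hypothetical dataset column
    'response': 'answer',   # hypothetical dataset column
})
print(preprocess({'question': 'What is 2 + 2?', 'answer': '4'}))
# {'prompt': 'What is 2 + 2?', 'response': '4'}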
@@ -805,12 +801,13 @@ def build_from_hf( split: str, safe_load: bool = False, max_seq_len: int = 2048, + mapping_fn: Callable = tokenize_formatted_example, preprocessing_fn: Optional[Callable[[dict[str, Any]], Example]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, target_prompts: str = DEFAULT_TARGET_PROMPTS, target_responses: str = DEFAULT_TARGET_RESPONSES, decoder_only_format: bool = True, - hf_kwargs: Optional[Dict[str, Any]] = None, + hf_kwargs: Optional[dict[str, Any]] = None, ) -> Union[hf_datasets.DatasetDict, hf_datasets.Dataset, hf_datasets.IterableDatasetDict, hf_datasets.IterableDataset]: """Load a HuggingFace Datasets, preprocess, and tokenize. @@ -828,6 +825,8 @@ def build_from_hf( max_seq_len (int): The maximum length of sequences in the batch. See :class:`Seq2SeqFinetuningCollator` docstring for details. + mapping_fn (Callable): The mapping function to use for mapping the data + examples. preprocessing_fn (Callable, optional): The preprocessing function to use for formatting the data examples. tokenizer (PreTrainedTokenizerBase): The tokenizer to be used for tokenizing @@ -932,13 +931,13 @@ def build_from_hf( **hf_kwargs, ) - def dataset_mapper(example: Dict): + def dataset_mapper(example: dict): if preprocessing_fn is not None: - return tokenize_formatted_example( + return mapping_fn( preprocessing_fn(example), tokenizer, ) - return tokenize_formatted_example(example, tokenizer) + return mapping_fn(example, tokenizer) detected_cpu_count = os.cpu_count() or 1 detected_cpus_with_margin = detected_cpu_count - 8 @@ -1006,7 +1005,7 @@ def dataset_mapper(example: Dict): return filtered_dataset @property - def streaming_dataset_class(self) -> Type[StreamingFinetuningDataset]: + def streaming_dataset_class(self) -> type[StreamingFinetuningDataset]: return StreamingFinetuningDataset def build_from_streaming( @@ -1021,7 +1020,7 @@ def build_from_streaming( @dataset_constructor.register('tatsu-lab/alpaca') -def alpaca_preprocessing_function(inp: Dict) -> PromptResponseDict: +def alpaca_preprocessing_function(inp: dict) -> PromptResponseDict: """Split out prompt/response from text.""" try: prompt, response = inp['text'].split('### Response:') @@ -1033,7 +1032,7 @@ def alpaca_preprocessing_function(inp: Dict) -> PromptResponseDict: @dataset_constructor.register('HuggingFaceH4/databricks_dolly_15k') -def dolly_preprocessing_function(inp: Dict) -> PromptResponseDict: +def dolly_preprocessing_function(inp: dict) -> PromptResponseDict: """Format the text string.""" PROMPT_FORMAT = 'Below is an instruction that describes a task. 
Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n' try: @@ -1049,7 +1048,7 @@ def dolly_preprocessing_function(inp: Dict) -> PromptResponseDict: @dataset_constructor.register('bigscience/P3') -def p3_preprocessing_function(inp: Dict) -> PromptResponseDict: +def p3_preprocessing_function(inp: dict) -> PromptResponseDict: """Format the already-split example.""" return { 'prompt': inp['inputs'] + ':', @@ -1059,7 +1058,7 @@ def p3_preprocessing_function(inp: Dict) -> PromptResponseDict: # Muennighoff's P3 and flan datasets share a similar convention @dataset_constructor.register('Muennighoff/P3', 'Muennighoff/flan') -def muennighoff_tokenize_function(inp: Dict) -> PromptResponseDict: +def muennighoff_tokenize_function(inp: dict) -> PromptResponseDict: """Format the already-split example.""" try: prompt: str = inp['inputs'] @@ -1076,7 +1075,7 @@ def muennighoff_tokenize_function(inp: Dict) -> PromptResponseDict: @dataset_constructor.register('teknium/OpenHermes-2.5') -def shareGPT_format_preprocessor(inp: Dict) -> ChatFormattedDict: +def shareGPT_format_preprocessor(inp: dict) -> ChatFormattedDict: """Convert from ShareGPT format to our chat format.""" role_map = { 'human': 'user', @@ -1084,7 +1083,7 @@ def shareGPT_format_preprocessor(inp: Dict) -> ChatFormattedDict: } try: conversation = inp['conversations'] - messages: List[Dict[str, str]] = [] + messages: list[dict[str, str]] = [] for message in conversation: role: str = role_map.get(message['from'], message['from']) content: str = message['value'] diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 77e166c474..2f18bda7fc 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -3,7 +3,7 @@ import logging import tempfile -from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple +from typing import Any, Callable, Iterable, Literal, Optional import numpy as np import torch @@ -68,7 +68,7 @@ def __init__( self.n_total_tokens = 0 self.n_packed_examples = 0 - self._leftover_bins: List[Tuple[int, Dict[str, torch.Tensor]]] = [] + self._leftover_bins: list[tuple[int, dict[str, torch.Tensor]]] = [] self._is_profiling = is_profiling @@ -84,12 +84,12 @@ def efficiency(self) -> float: def __call__( self, - examples: List[Dict[str, torch.Tensor]], - ) -> Dict[str, torch.Tensor]: + examples: list[dict[str, torch.Tensor]], + ) -> dict[str, torch.Tensor]: batch = self.base_collator(examples) return self.pack(batch) - def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + def pack(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]: if self._is_profiling: raise ValueError('Cannot pack in profiling mode.') @@ -111,9 +111,9 @@ def pack(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: def _pack_trimmed_examples( self, - trimmed_examples: List[Dict[str, torch.Tensor]], - sizes: List[int], - ) -> Optional[Dict[str, torch.Tensor]]: + trimmed_examples: list[dict[str, torch.Tensor]], + sizes: list[int], + ) -> Optional[dict[str, torch.Tensor]]: """Packs trimmed examples into fixed-size bins and repads them. 
Args: @@ -147,8 +147,8 @@ def _pack_trimmed_examples( def _convert_to_batch( self, - packed_examples: List[Dict[str, torch.Tensor]], - ) -> Dict[str, torch.Tensor]: + packed_examples: list[dict[str, torch.Tensor]], + ) -> dict[str, torch.Tensor]: pad_vals = { 'input_ids': self.pad_token_id, @@ -171,16 +171,16 @@ def _convert_to_batch( def _first_fit_bin_packing( self, - sizes: List[int], - examples: List[Dict[str, torch.Tensor]], + sizes: list[int], + examples: list[dict[str, torch.Tensor]], num_bins: int, max_bin_size: int, - existing_bins: List[Tuple[int, Dict[str, torch.Tensor]]], - ) -> Tuple[List[Dict[str, torch.Tensor]], int, int, List[Tuple[int, Dict[ + existing_bins: list[tuple[int, dict[str, torch.Tensor]]], + ) -> tuple[list[dict[str, torch.Tensor]], int, int, list[tuple[int, dict[ str, torch.Tensor]]]]: # Will contain tuples (bin_size_size, packed_example) - bins: List[Tuple[int, Dict[str, torch.Tensor]]] = existing_bins + bins: list[tuple[int, dict[str, torch.Tensor]]] = existing_bins starting_total_bin_sizes = sum([bin_size for bin_size, _ in bins]) @@ -279,8 +279,8 @@ def _first_fit_bin_packing( def _trim_batch( - batch: Dict[str, torch.Tensor], -) -> Tuple[List[int], List[Dict[str, torch.Tensor]]]: + batch: dict[str, torch.Tensor], +) -> tuple[list[int], list[dict[str, torch.Tensor]]]: """Trims padding off all examples in batch. Args: @@ -298,8 +298,8 @@ def _trim_batch( return sizes, trimmed_examples -def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor], - idx: int) -> Tuple[int, Dict[str, torch.Tensor]]: +def _extract_trim_batch_idx(batch: dict[str, torch.Tensor], + idx: int) -> tuple[int, dict[str, torch.Tensor]]: example = {k: v[idx] for k, v in batch.items()} keep = example['attention_mask'] == 1 @@ -311,9 +311,9 @@ def _extract_trim_batch_idx(batch: Dict[str, torch.Tensor], def _combine_in_place( - example: Dict[str, torch.Tensor], - add_on: Dict[str, torch.Tensor], -) -> Dict[str, torch.Tensor]: + example: dict[str, torch.Tensor], + add_on: dict[str, torch.Tensor], +) -> dict[str, torch.Tensor]: if 'labels' in add_on: # Prevents the last token in example from being trained to # predict the first token in add_on, which would make no sense. @@ -352,7 +352,7 @@ def _pad_tensor( def auto_packing_ratio( - dataloader_cfg: Dict[str, Any], + dataloader_cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: int, num_packing_ratios: int = 20, @@ -427,13 +427,13 @@ def auto_packing_ratio( def profile_packing( - dataloader_cfg: Dict[str, Any], + dataloader_cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, min_ratio: float, max_ratio: float, num_packing_ratios: int, device_batch_size: int, -) -> Iterable[Tuple[float, Optional[float], Optional[float]]]: +) -> Iterable[tuple[float, Optional[float], Optional[float]]]: """Generator function that profiles example packing across packing ratios. Args: @@ -514,7 +514,7 @@ def profile_packing( # Cut everything down to size sizes, trimmed_examples = _trim_batch(big_batch) - def profile(raw_batch_size: int) -> Tuple[Optional[float], Optional[float]]: + def profile(raw_batch_size: int) -> tuple[Optional[float], Optional[float]]: # Copy trimmed examples so that the dicts are not shared between profiling runs. 
trimmed_examples_copy = [te.copy() for te in trimmed_examples] diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 4bbfc29e7d..3ce248e69f 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -8,7 +8,6 @@ from typing import ( Any, Callable, - Dict, Mapping, Optional, Sequence, @@ -194,7 +193,7 @@ def __init__( self.max_seq_len = max_seq_len # How to tokenize a text sample to a token sample - def _tokenize(self, text_sample: Mapping) -> Dict[str, list[int]]: + def _tokenize(self, text_sample: Mapping) -> dict[str, list[int]]: if self.tokenizer._pad_token is None: # Some tokenizers (e.g. GPT2 tokenizer) have no padding token which causes bugs raise RuntimeError( @@ -210,7 +209,7 @@ def _tokenize(self, text_sample: Mapping) -> Dict[str, list[int]]: def _read_binary_tokenized_sample( self, - sample: Dict[str, Any], + sample: dict[str, Any], ) -> torch.Tensor: # Modeling code still expects int64 tensors. if isinstance(sample['tokens'], np.ndarray): @@ -227,7 +226,7 @@ def _read_binary_tokenized_sample( # How to process a sample def __getitem__(self, - idx: int) -> Union[Dict[str, list[int]], torch.Tensor]: + idx: int) -> Union[dict[str, list[int]], torch.Tensor]: sample = super().__getitem__(idx) if 'text' in sample: token_sample = self._tokenize(sample) @@ -268,14 +267,14 @@ def __init__( self.split_token_id = eos_token_id self.bos_mode = False - def __call__(self, examples: list[Any]) -> Dict[str, torch.Tensor]: + def __call__(self, examples: list[Any]) -> dict[str, torch.Tensor]: batch = self.base_collator(examples) batch['sequence_id'] = self.get_sequence_id_from_batch(batch) return batch def get_sequence_id_from_batch( self, - batch: Dict[str, torch.Tensor], + batch: dict[str, torch.Tensor], ) -> torch.Tensor: is_separator = torch.eq(batch['input_ids'], self.split_token_id) cumulative_sep = torch.cumsum(is_separator, @@ -289,7 +288,7 @@ def get_sequence_id_from_batch( return torch.cat([left_zeros, cumulative_sep[:, :-1]], dim=1) -def build_streams(streams: Optional[Dict[str, Any]] = None,): +def build_streams(streams: Optional[dict[str, Any]] = None,): streams_dict = streams # build streams streams_ret = [] @@ -301,7 +300,7 @@ def build_streams(streams: Optional[Dict[str, Any]] = None,): def build_text_dataloader( tokenizer: PreTrainedTokenizerBase, device_batch_size: Union[int, float], - dataset: Dict[str, Any], + dataset: dict[str, Any], drop_last: bool, num_workers: int, pin_memory: bool = True, diff --git a/llmfoundry/data/utils.py b/llmfoundry/data/utils.py index 206e884f70..e8ed1a947d 100644 --- a/llmfoundry/data/utils.py +++ b/llmfoundry/data/utils.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Any, Callable, Dict, Iterable, Mapping, Tuple, Union +from typing import Any, Callable, Iterable, Mapping, Union import torch import transformers @@ -20,7 +20,7 @@ def _validate_cfg( - dataset_cfg: Dict[str, Any], + dataset_cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, ): eos_token_id = dataset_cfg.get('eos_token_id', None) @@ -56,10 +56,10 @@ def _validate_cfg( def validate_ds_replication( - dataset_cfg: Dict[str, Any], + dataset_cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: Union[int, float], -) -> Tuple[int, int]: +) -> tuple[int, int]: _validate_cfg(dataset_cfg, tokenizer) if (dataset_cfg.get('seq_parallel_replication', 1) or 1) > 1: raise NotImplementedError('Sequence parallelism is not supported.') @@ -70,7 +70,7 @@ def validate_ds_replication( def 
get_data_spec( dl: Union[Iterable, TorchDataloader], - dataset_cfg: Dict[str, Any], + dataset_cfg: dict[str, Any], ) -> DataSpec: del dataset_cfg token_counting_func = get_tokens_per_batch_func() @@ -127,10 +127,10 @@ def get_num_tokens_in_batch(batch: Batch) -> int: def get_text_collator( - dataloader_cfg: Dict[str, Any], + dataloader_cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, dataset_batch_size: int, -) -> Tuple[Union[transformers.DataCollatorForLanguageModeling, +) -> tuple[Union[transformers.DataCollatorForLanguageModeling, ConcatenatedSequenceCollatorWrapper], int]: dataset_cfg = dataloader_cfg.get('dataset') assert isinstance(dataset_cfg, dict) @@ -155,8 +155,8 @@ def get_text_collator( def get_finetuning_collator( - dataloader_cfg: Dict[str, Any], + dataloader_cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, dataset_batch_size: int, -) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: +) -> tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: return build_collate_fn(dataloader_cfg, tokenizer, dataset_batch_size) diff --git a/llmfoundry/eval/datasets/in_context_learning_evaluation.py b/llmfoundry/eval/datasets/in_context_learning_evaluation.py index 4e49be3fba..168cf7a8e4 100644 --- a/llmfoundry/eval/datasets/in_context_learning_evaluation.py +++ b/llmfoundry/eval/datasets/in_context_learning_evaluation.py @@ -8,7 +8,7 @@ import logging import os import random -from typing import Any, Dict, Iterable, List, Optional, Sequence, Union +from typing import Any, Iterable, Optional, Sequence, Union import torch import transformers @@ -128,14 +128,14 @@ def __init__( padding_side: str = 'right', tokenize_labels: bool = True, padding_size: Optional[int] = None, - base_batch: Optional[Dict] = None, - batch_mapping: Optional[Dict] = None, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - static_keys: Optional[List] = None, - list_keys: Optional[List] = None, - tensor_keys: Optional[List] = None, + base_batch: Optional[dict] = None, + batch_mapping: Optional[dict] = None, + hf_loading_vars: Optional[dict] = None, + hf_parsing_map: Optional[dict] = None, + generation_kwargs: Optional[dict] = None, + static_keys: Optional[list] = None, + list_keys: Optional[list] = None, + tensor_keys: Optional[list] = None, ): self.tokenizer = tokenizer self.prefix_space = tokenizer_needs_prefix_space(self.tokenizer) @@ -189,7 +189,7 @@ def _prepare_dataset(self): ) self._prepared = True - def __getitem__(self, index: int) -> Dict: + def __getitem__(self, index: int) -> dict: if not self._prepared: self._prepare_dataset() return self.dataset[index] @@ -197,7 +197,7 @@ def __getitem__(self, index: int) -> Dict: def __len__(self) -> int: return len(self.dataset) - def get_num_samples_in_batch(self, batch: Dict) -> int: + def get_num_samples_in_batch(self, batch: dict) -> int: return batch['input_ids'].shape[0] def get_effective_batch_size(self, batch_size: int) -> int: @@ -214,7 +214,7 @@ def get_effective_batch_size(self, batch_size: int) -> int: """ return batch_size - def update_generation_kwargs(self, generation_kwargs: Dict) -> None: + def update_generation_kwargs(self, generation_kwargs: dict) -> None: r"""Updates self.base_batch with the passed in generation_kwargs. 
This must be run after self.base_batch is set (for example, if @@ -234,8 +234,8 @@ def read_dataset( self, dataset_uri: str, destination_path: str, - hf_loading_vars: Optional[Dict[str, Any]] = None, - hf_parsing_map: Optional[Dict[str, Any]] = None, + hf_loading_vars: Optional[dict[str, Any]] = None, + hf_parsing_map: Optional[dict[str, Any]] = None, ) -> 'HFDataset': """Reads a dataset and handles parsing it from HuggingFace. @@ -328,7 +328,7 @@ def _generate_few_shot_prompt( def construct_context( self, - example: Dict, + example: dict, preceding_text: str = '', add_answer: bool = False, ) -> str: @@ -357,7 +357,7 @@ def construct_context( def get_answer_from_example( self, - example: Dict[str, Any], + example: dict[str, Any], in_context: bool = False, ) -> str: """Returns the answer from the example. @@ -374,7 +374,7 @@ def get_answer_from_example( cont = f' {cont}' return cont - def _fix_eos_on_preamble(self, input_ids: List[int]) -> List[int]: + def _fix_eos_on_preamble(self, input_ids: list[int]) -> list[int]: """If the input_ids is empty then input_ids will be a 0-length List. unless the tokenizer adds special tokens to empty strings (e.g. OPT @@ -399,8 +399,8 @@ def tokenize_example( self, prompt_and_fewshot: str, ctxt: str, - example: Dict, - ) -> Dict[str, Any]: + example: dict, + ) -> dict[str, Any]: """Runs text through the tokenizer and handle special cases. Args: @@ -481,12 +481,12 @@ def tokenize_example( def _prep_example( self, - example: Dict, + example: dict, example_idx: int, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: """Prepares a single example from a HF Dataset into tokenized format. with prompt and fewshot examples. @@ -522,7 +522,7 @@ def _prep_example( ) return tokenized_example - def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + def collate_fn(self, data: list[dict[str, Any]]) -> dict[str, Any]: """The function that the dataloader uses to accumulate data into. batches. @@ -592,13 +592,13 @@ def __init__( answer_key: str = 'answer', strip_dataset: bool = True, padding_size: Optional[int] = None, - base_batch: Optional[Dict] = None, - batch_mapping: Optional[Dict] = None, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, + base_batch: Optional[dict] = None, + batch_mapping: Optional[dict] = None, + hf_loading_vars: Optional[dict] = None, + hf_parsing_map: Optional[dict] = None, + generation_kwargs: Optional[dict] = None, cot_delimiter: str = '', - early_stopping_criteria: Optional[List[str]] = None, + early_stopping_criteria: Optional[list[str]] = None, do_normalization: bool = True, ): if tokenizer.eos_token_id is None: @@ -672,8 +672,8 @@ def read_dataset( self, dataset_uri: str, destination_path: str, - hf_loading_vars: Dict, - hf_parsing_map: Dict, + hf_loading_vars: dict, + hf_parsing_map: dict, ) -> 'HFDataset': dataset = super().read_dataset( dataset_uri, @@ -705,7 +705,7 @@ def read_dataset( def get_answer_from_example( self, - example: Dict, + example: dict, in_context: bool = False, ) -> str: """Returns the answer from the example. Applies chain of thought if. @@ -728,8 +728,8 @@ def tokenize_example( self, prompt_and_fewshot: str, ctxt: str, - example: Dict, - ) -> Dict[str, Any]: + example: dict, + ) -> dict[str, Any]: """Run text through the tokenizer and handle special cases. 
Args: @@ -777,7 +777,7 @@ def _get_max_answer_length(self, dataset: Iterable[dict]) -> int: ) return max_answer_length - def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + def collate_fn(self, data: list[dict[str, Any]]) -> dict[str, Any]: batch = super().collate_fn(data) batch_size = batch['input_ids'].shape[0] stopping_criteria = None @@ -866,11 +866,11 @@ def __init__( strip_dataset: bool = True, tokenize_labels: bool = True, padding_size: Optional[int] = None, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - static_keys: Optional[List] = None, - list_keys: Optional[List] = None, + hf_loading_vars: Optional[dict] = None, + hf_parsing_map: Optional[dict] = None, + generation_kwargs: Optional[dict] = None, + static_keys: Optional[list] = None, + list_keys: Optional[list] = None, ): super().__init__( dataset_uri=dataset_uri, @@ -955,21 +955,21 @@ def __init__( continuation_delimiter: str = ' ', prelimiter: str = '', context_key: str = 'query', - tensor_keys: Optional[List] = None, + tensor_keys: Optional[list] = None, answer_key: str = 'answer', strip_dataset: bool = True, tokenize_labels: bool = True, padding_size: Optional[int] = None, - batch_mapping: Optional[Dict] = None, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - list_keys: Optional[List] = None, + batch_mapping: Optional[dict] = None, + hf_loading_vars: Optional[dict] = None, + hf_parsing_map: Optional[dict] = None, + generation_kwargs: Optional[dict] = None, + list_keys: Optional[list] = None, choices_key: str = 'choices', - static_keys: Optional[List] = None, - list_of_tensors_keys: Optional[List] = None, - list_of_tuples_keys: Optional[List] = None, - list_of_primitives: Optional[List] = None, + static_keys: Optional[list] = None, + list_of_tensors_keys: Optional[list] = None, + list_of_tuples_keys: Optional[list] = None, + list_of_primitives: Optional[list] = None, ): self.choices_key = choices_key base_batch = { @@ -1031,7 +1031,7 @@ def get_effective_batch_size(self, batch_size: int) -> int: def get_answer_from_example( self, - example: Dict, + example: dict, in_context: bool = False, ) -> str: """Returns the correct answer from the example's choices. @@ -1051,8 +1051,8 @@ def tokenize_example( self, prompt_and_fewshot: str, ctxt: str, - example: Dict, - ) -> Dict[str, Any]: + example: dict, + ) -> dict[str, Any]: """Runs text through the tokenizer and handle special cases. Args: @@ -1122,7 +1122,7 @@ def tokenize_example( tokenized_example['gold'] = example['gold'] return tokenized_example - def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: + def collate_fn(self, data: list[dict[str, Any]]) -> dict[str, Any]: """The function that the dataloader uses to accumulate data into. batches. 
We run each distinct query + answer choice through the model @@ -1159,7 +1159,7 @@ def collate_fn(self, data: List[Dict[str, Any]]) -> Dict[str, Any]: batch['attention_mask'] = ~(batch['input_ids'] == self.pad_tok_id) return batch - def get_num_samples_in_batch(self, batch: Dict[str, torch.Tensor]) -> int: + def get_num_samples_in_batch(self, batch: dict[str, torch.Tensor]) -> int: return batch['input_ids'].shape[0] // self.num_choices def split_batch(self, batch: Any, @@ -1265,11 +1265,11 @@ def __init__( strip_dataset: bool = True, tokenize_labels: bool = True, padding_size: Optional[int] = None, - batch_mapping: Optional[Dict] = None, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, - generation_kwargs: Optional[Dict] = None, - list_keys: Optional[List] = None, + batch_mapping: Optional[dict] = None, + hf_loading_vars: Optional[dict] = None, + hf_parsing_map: Optional[dict] = None, + generation_kwargs: Optional[dict] = None, + list_keys: Optional[list] = None, choices_key: str = 'context_options', ): static_keys = ['mode'] @@ -1319,7 +1319,7 @@ def get_effective_batch_size(self, batch_size: int) -> int: def construct_context( self, - example: Dict[str, Any], + example: dict[str, Any], preceding_text: str = '', add_answer: bool = False, ) -> str: @@ -1348,9 +1348,9 @@ def construct_context( def _construct_multiple_contexts( self, - example: Dict, + example: dict, preceding_text: str = '', - ) -> List[str]: + ) -> list[str]: """Takes a example and constructs all contexts. Optionally, appends this to preceding text (such as a prompt or fewshot examples). @@ -1378,12 +1378,12 @@ def _construct_multiple_contexts( def _prep_example( self, - example: Dict, + example: dict, example_idx: int, num_fewshot: int, prompt_string: str, fewshot_rng: random.Random, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: """Prepares a single example from a HF Dataset into tokenized format. with prompt and fewshot examples. @@ -1418,9 +1418,9 @@ def _prep_example( def tokenize_example( self, prompt_and_fewshot: str, - context_options: List[str], - example: Dict, - ) -> Dict[str, Any]: + context_options: list[str], + example: dict, + ) -> dict[str, Any]: """Runs text through the tokenizer and handle special cases. Args: @@ -1491,10 +1491,10 @@ def build_icl_dataloader( dataset_uri: str, tokenizer: transformers.PreTrainedTokenizerBase, batch_size: int, - hf_loading_vars: Dict, - hf_parsing_map: Dict, + hf_loading_vars: dict, + hf_parsing_map: dict, destination_path: str = '', - kwargs: Optional[Dict[str, Any]] = None, + kwargs: Optional[dict[str, Any]] = None, ) -> DataSpec: """Factory method that builds the specific dataset for the specified. @@ -1543,9 +1543,9 @@ def build_icl_dataloader( def partition_dataset_by_category( dataset_uri: str, destination_path: str, - hf_loading_vars: Dict, - hf_parsing_map: Dict, -) -> Dict[str, str]: + hf_loading_vars: dict, + hf_parsing_map: dict, +) -> dict[str, str]: """If has_categories is enabled, we partition the dataset into a separate. 
dataset for each category value in the data and write each partition to a @@ -1631,11 +1631,11 @@ def get_icl_task_dataloader( transformers.PreTrainedTokenizerFast], batch_size: int, has_categories: bool = False, - hf_loading_vars: Optional[Dict] = None, - hf_parsing_map: Optional[Dict] = None, + hf_loading_vars: Optional[dict] = None, + hf_parsing_map: Optional[dict] = None, destination_path: str = '', - kwargs: Optional[Dict[str, Any]] = None, -) -> Union[DataSpec, Dict[str, DataSpec]]: + kwargs: Optional[dict[str, Any]] = None, +) -> Union[DataSpec, dict[str, DataSpec]]: r"""Constructs a dataloader (or dataloaders if has_categories is True) capable of evaluating LLMs on in-context learning language modeling tasks, diff --git a/llmfoundry/eval/datasets/utils.py b/llmfoundry/eval/datasets/utils.py index c19ae15dd9..8049e4d57d 100644 --- a/llmfoundry/eval/datasets/utils.py +++ b/llmfoundry/eval/datasets/utils.py @@ -6,7 +6,7 @@ import logging import random -from typing import Any, Dict, List, Optional, Set +from typing import Any, Optional import torch import transformers @@ -26,7 +26,7 @@ log = logging.getLogger(__name__) -def strip_data(example: Dict) -> Dict: +def strip_data(example: dict) -> dict: """Remove white space from the begging and end of string values in a. dictionary. @@ -62,10 +62,10 @@ def tokenizer_needs_prefix_space( def trim_context( - context_enc: List, - continuation_enc: List, + context_enc: list, + continuation_enc: list, max_seq_len: int, -) -> List: +) -> list: """Trims a list of tokens down to `max_seq_len` if the length of the list. plus the continuation is more than `max_seq_len`. It will always trim tokens @@ -94,8 +94,8 @@ def trim_context( def get_continuation_span( - context_enc: List, - continuation_enc: List, + context_enc: list, + continuation_enc: list, ) -> torch.Tensor: """Gets the list of indices of the continuation tokens for language. @@ -117,8 +117,8 @@ def get_continuation_span( def make_padded_input( - context_enc: List, - continuation_enc: List, + context_enc: list, + continuation_enc: list, max_seq_len: int, pad_tok_id: int, padding_side: str = 'right', @@ -175,8 +175,8 @@ def make_padded_input( return inp -def convert_tokens_to_tensors(batch: Dict, - tokenize_labels: bool) -> Dict[str, Any]: +def convert_tokens_to_tensors(batch: dict, + tokenize_labels: bool) -> dict[str, Any]: """HF Datasets converts tensors into lists when we store them, and we don't. want to use `type='torch'` because some content in the dataset, like @@ -207,7 +207,7 @@ def get_fewshot_sample_idxs( num_fewshot: int, example_idx: int, rng: random.Random, -) -> Set[int]: +) -> set[int]: """Samples indices without replacement. If num_fewshot exceeds the number. of unique examples in the dataset, then we will have fewer than num_fewshot examples in context. 
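# Editor's note (illustrative, not part of the diff): the pervasive
# Dict/List/Tuple/Set/Type -> dict/list/tuple/set/type substitutions in this PR
# use PEP 585 builtin generics, which are subscriptable at runtime on
# Python 3.9+; behavior is unchanged, only the `typing` imports are dropped.
# `example_cfg` below is a hypothetical helper used only to show the spelling.
from typing import Any, Optional

def example_cfg(overrides: Optional[dict[str, Any]] = None) -> tuple[int, int]:
    # builtin generics: no typing.Dict / typing.Tuple needed
    cfg: dict[str, Any] = {'replication': 1, 'batch_size': 8, **(overrides or {})}
    return cfg['replication'], cfg['batch_size']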
@@ -274,7 +274,7 @@ def __call__( self, input_ids: torch.LongTensor, scores: Optional[torch.FloatTensor] = None, - **kwargs: Dict[str, Any], + **kwargs: dict[str, Any], ) -> bool: # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence lookback_ids_batch = input_ids[:, :][:, -self.stop_sequence_id_len:] @@ -294,7 +294,7 @@ def __call__( def stop_sequences_criteria( tokenizer: transformers.PreTrainedTokenizerBase, - stop_sequences: List[str], + stop_sequences: list[str], batch_size: int, ) -> transformers.StoppingCriteriaList: return transformers.StoppingCriteriaList([ diff --git a/llmfoundry/eval/metrics/nlp.py b/llmfoundry/eval/metrics/nlp.py index f0fbba3ece..71cbf0ee7a 100644 --- a/llmfoundry/eval/metrics/nlp.py +++ b/llmfoundry/eval/metrics/nlp.py @@ -8,7 +8,7 @@ import logging import re import string -from typing import Any, Callable, Dict, List +from typing import Any, Callable import torch from torch import Tensor @@ -160,9 +160,9 @@ def replace_underscore(text: str) -> str: def update( self, - batch: Dict[str, Any], - outputs: List[str], - labels: List[List[str]], + batch: dict[str, Any], + outputs: list[str], + labels: list[list[str]], ): cot_delimiter = batch.get('cot_delimiter', '') do_normalization = batch.get('do_normalization', True) diff --git a/llmfoundry/layers_registry.py b/llmfoundry/layers_registry.py index 50a4906ec1..dc75004af0 100644 --- a/llmfoundry/layers_registry.py +++ b/llmfoundry/layers_registry.py @@ -1,7 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, Type +from typing import Callable import torch @@ -23,7 +23,7 @@ norms = create_registry( 'llmfoundry', 'norms', - generic_type=Type[torch.nn.Module], + generic_type=type[torch.nn.Module], entry_points=True, description=_norms_description, ) @@ -45,7 +45,7 @@ fcs = create_registry( 'llmfoundry', 'fcs', - generic_type=Type[torch.nn.Module], + generic_type=type[torch.nn.Module], entry_points=True, description=_fcs_description, ) @@ -138,7 +138,7 @@ attention_classes = create_registry( 'llmfoundry', 'attention_classes', - generic_type=Type[torch.nn.Module], + generic_type=type[torch.nn.Module], entry_points=True, description=_attention_classes_description, ) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index f1f38e2f7d..56f2d1cc8f 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -9,10 +9,7 @@ from typing import ( TYPE_CHECKING, Any, - Dict, - List, Optional, - Tuple, Union, ) @@ -22,9 +19,11 @@ from transformers import ( AutoConfig, AutoModelForCausalLM, + GenerationConfig, PreTrainedModel, PreTrainedTokenizerBase, ) +from transformers.models.auto.auto_factory import _BaseAutoModelClass from llmfoundry.metrics import ( DEFAULT_CAUSAL_LM_EVAL_METRICS, @@ -83,11 +82,12 @@ def __init__( use_flash_attention_2: bool = False, load_in_8bit: bool = False, init_device: str = 'cpu', - config_overrides: Optional[Dict[str, Any]] = None, - peft_config: Optional[Dict[str, Any]] = None, + config_overrides: Optional[dict[str, Any]] = None, + peft_config: Optional[dict[str, Any]] = None, use_train_metrics: bool = True, - additional_train_metrics: Optional[List] = None, - additional_eval_metrics: Optional[List] = None, + allow_embedding_resizing: bool = False, + additional_train_metrics: Optional[list] = None, + additional_eval_metrics: Optional[list] = None, should_save_peft_only: bool = True, ): @@ -132,6 +132,7 @@ def 
__init__( tokenizer=tokenizer, metrics=train_metrics, eval_metrics=eval_metrics, + allow_embedding_resizing=allow_embedding_resizing, init_device=init_device, peft_config=peft_config_object, should_save_peft_only=should_save_peft_only, @@ -151,9 +152,9 @@ def transform_model(self, model: PreTrainedModel) -> PreTrainedModel: @staticmethod def build_metrics( use_train_metrics: bool, - additional_train_metrics: Optional[List[str]] = None, - additional_eval_metrics: Optional[List[str]] = None, - ) -> Tuple[List[Metric], List[Metric]]: + additional_train_metrics: Optional[list[str]] = None, + additional_eval_metrics: Optional[list[str]] = None, + ) -> tuple[list[Metric], list[Metric]]: """Builds the training and evaluation metrics for the model. Args: @@ -189,9 +190,11 @@ def build_inner_model( init_device: str, use_flash_attention_2: bool, use_auth_token: bool, - config_overrides: Dict[str, Any], + config_overrides: dict[str, Any], load_in_8bit: bool, pretrained: bool, + model_cls: Union[_BaseAutoModelClass, + PreTrainedModel] = AutoModelForCausalLM, prepare_for_fsdp: bool = False, ) -> Union[PreTrainedModel, 'PeftModel']: """Builds the inner model for the ComposerHFCausalLM. @@ -206,7 +209,8 @@ def build_inner_model( config_overrides (Dict[str, Any]): The configuration overrides. load_in_8bit (bool): Whether to load in 8-bit. pretrained (bool): Whether the model is pretrained. - prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: False. + model_cls (Union[Type, Type[PreTrainedModel]]): HF class for models. Default: ``AutoModelForCausalLM``. + prepare_for_fsdp (bool, optional): Whether to prepare the model for FSDP wrapping. Default: ``False``. Returns: Union[PreTrainedModel, 'PeftModel']: The built inner model. @@ -230,9 +234,17 @@ def build_inner_model( + 'Please `pip install llm-foundry[gpu]`.', ) + if not ( + hasattr(model_cls, 'from_pretrained') and + hasattr(model_cls, 'from_config') + ): + raise AttributeError( + f'{model_cls=} is missing `from_pretrained` and `from_config` support.', + ) + # Hugging Face copies the modules into the # transformers modules cache. On particular systems, this operation seems to cause contention between - # the different processes. To avoid this contention, we first create the config on local rank + # the different processes. To avoid this contention, we first create the config and generation config on local rank # zero. This will set up the transformers module cache and avoid the future contention. 
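# Editor's note (illustrative, not part of the diff): the new guard above only
# requires that `model_cls` duck-types a HF auto class, i.e. exposes
# `from_pretrained` and `from_config`. Any such class passes; a quick check:
from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM

for candidate in (AutoModelForCausalLM, AutoModelForSeq2SeqLM):
    assert hasattr(candidate, 'from_pretrained') and hasattr(candidate, 'from_config')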
if dist.get_local_rank() == 0: AutoConfig.from_pretrained( @@ -243,6 +255,13 @@ def build_inner_model( use_cache= False, # Necessary due to https://github.com/huggingface/transformers/issues/28056 ) + try: + GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + use_auth_token=use_auth_token, + ) + except OSError: + pass dist.barrier() @@ -272,7 +291,7 @@ def build_inner_model( with init_empty_weights(include_buffers=False): with warnings.catch_warnings(): warnings.simplefilter('ignore', UserWarning) - AutoModelForCausalLM.from_pretrained( + model_cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, @@ -282,7 +301,7 @@ def build_inner_model( ) else: with init_empty_weights(include_buffers=False): - AutoModelForCausalLM.from_config( + model_cls.from_config( config, trust_remote_code=trust_remote_code, attn_implementation=requested_attention_implementation, @@ -293,7 +312,7 @@ def build_inner_model( # initialize the model on the correct device if resolved_init_device == 'cpu': if pretrained: - model = AutoModelForCausalLM.from_pretrained( + model = model_cls.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, use_auth_token=use_auth_token, @@ -302,7 +321,7 @@ def build_inner_model( config=config, ) else: - model = AutoModelForCausalLM.from_config( + model = model_cls.from_config( config, trust_remote_code=trust_remote_code, attn_implementation=requested_attention_implementation, @@ -313,7 +332,7 @@ def build_inner_model( 'Setting cfg.pretrained=True is not supported when init_device="meta".', ) with init_empty_weights(include_buffers=False): - model = AutoModelForCausalLM.from_config( + model = model_cls.from_config( config, trust_remote_code=trust_remote_code, attn_implementation=requested_attention_implementation, @@ -337,6 +356,17 @@ def build_inner_model( if dist.get_local_rank() == 0: os.remove(signal_file_path) + # Use the pretrained generation config for the model if it exists. + try: + model.generation_config = GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + use_auth_token=use_auth_token, + ) + except OSError: + log.warning( + f'No existing generation config found for the model with name or path {pretrained_model_name_or_path}. 
Using default generation config.', + ) + # Hugging Face's weight tying does not succeed if the model is inited on meta device # so we manually apply the weight tying here if model.config.tie_word_embeddings and resolved_init_device == 'meta': @@ -358,7 +388,7 @@ def build_inner_model( return model - def get_peft_config(self, peft_config_dict: Dict[str, Any]) -> 'PeftConfig': + def get_peft_config(self, peft_config_dict: dict[str, Any]) -> 'PeftConfig': if peft_installed: from peft import LoraConfig peft_type = peft_config_dict.get('peft_type', '') diff --git a/llmfoundry/models/hf/hf_fsdp.py b/llmfoundry/models/hf/hf_fsdp.py index 00dada5532..068e726aee 100644 --- a/llmfoundry/models/hf/hf_fsdp.py +++ b/llmfoundry/models/hf/hf_fsdp.py @@ -5,7 +5,7 @@ # which is MIT licensed import functools -from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Union +from typing import TYPE_CHECKING, Any, Iterable, Optional, Union from composer.models.huggingface import maybe_get_underlying_model from transformers import PreTrainedModel @@ -40,7 +40,7 @@ def rhasattr(obj: Any, attr: str) -> bool: return hasattr(_curr_obj, _nested_attrs[-1]) -def rgetattr(obj: Any, attr: str, *args: List[Any]) -> Any: +def rgetattr(obj: Any, attr: str, *args: list[Any]) -> Any: """A chain-able attribute version of getattr. For example, to get the attribute `foo.bar.baz` from `obj`, you can use: diff --git a/llmfoundry/models/hf/hf_t5.py b/llmfoundry/models/hf/hf_t5.py index f54b7c42ec..23b826a79f 100644 --- a/llmfoundry/models/hf/hf_t5.py +++ b/llmfoundry/models/hf/hf_t5.py @@ -5,7 +5,7 @@ from __future__ import annotations -from typing import List, Mapping, Optional +from typing import Mapping, Optional from composer.utils import dist from transformers import ( @@ -54,7 +54,7 @@ def __init__( use_auth_token: bool = False, config_overrides: Optional[Mapping] = None, init_device: str = 'cpu', - additional_train_metrics: Optional[List] = None, + additional_train_metrics: Optional[list] = None, name: Optional[str] = None, ): from llmfoundry.utils.builders import build_metric diff --git a/llmfoundry/models/hf/model_wrapper.py b/llmfoundry/models/hf/model_wrapper.py index 7051986df8..64e58a4df2 100644 --- a/llmfoundry/models/hf/model_wrapper.py +++ b/llmfoundry/models/hf/model_wrapper.py @@ -6,7 +6,7 @@ from __future__ import annotations from collections import UserDict -from typing import TYPE_CHECKING, List, Mapping, Optional, Union +from typing import TYPE_CHECKING, Mapping, Optional, Union import transformers from composer.models.huggingface import HuggingFaceModel @@ -35,9 +35,10 @@ def __init__( self, model: Union[transformers.PreTrainedModel, 'PeftModel'], tokenizer: Optional[PreTrainedTokenizerBase] = None, - metrics: Optional[List[Metric]] = None, - eval_metrics: Optional[List[Metric]] = None, + metrics: Optional[list[Metric]] = None, + eval_metrics: Optional[list[Metric]] = None, shift_labels: bool = False, + allow_embedding_resizing: bool = False, init_device: Optional[str] = None, peft_config: Optional['PeftConfig'] = None, should_save_peft_only: bool = True, @@ -49,6 +50,7 @@ def __init__( metrics=metrics, eval_metrics=eval_metrics, shift_labels=shift_labels, + allow_embedding_resizing=allow_embedding_resizing, peft_config=peft_config, should_save_peft_only=should_save_peft_only, ) diff --git a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py index fb26c7990c..4f1f2e6f04 100644 --- 
a/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py +++ b/llmfoundry/models/inference_api_wrapper/openai_causal_lm.py @@ -7,7 +7,7 @@ import os import random from time import sleep -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import torch from composer.core.types import Batch @@ -102,7 +102,7 @@ def try_generate_completion(self, prompt: str, num_tokens: int): break except RateLimitError as e: if 'You exceeded your current quota' in str( - e._message, + e.message, ): # pyright: ignore raise e delay *= 2 * (1 + random.random()) @@ -143,7 +143,7 @@ def __init__( temperature=0.0, ) - def retokenize(self, tokens: List[int], cont_idxs: List[int]): + def retokenize(self, tokens: list[int], cont_idxs: list[int]): """Chat API will never respond with a word-initial space. If the continuation tokens begin with a word initial space, we need to @@ -186,7 +186,7 @@ def rebatch(self, batch: Batch): Model responses will never begin with spaces even if the continuation is expected to, so we need to retokenize the input to account for that. """ - new_batch: Dict[str, Union[List[torch.Tensor], torch.Tensor]] = { + new_batch: dict[str, Union[list[torch.Tensor], torch.Tensor]] = { 'input_ids': [], 'continuation_indices': [], 'labels': [], diff --git a/llmfoundry/models/layers/attention.py b/llmfoundry/models/layers/attention.py index 3e365edc47..acf231558e 100644 --- a/llmfoundry/models/layers/attention.py +++ b/llmfoundry/models/layers/attention.py @@ -6,7 +6,7 @@ import copy import math import warnings -from typing import Any, Dict, Optional, Tuple +from typing import Any, Optional import torch import transformers @@ -112,6 +112,7 @@ def scaled_multihead_dot_product_attention( dropout_p: float = 0.0, training: bool = False, needs_weights: bool = False, + sliding_window_size: int = -1, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor, torch.Tensor]]]: @@ -177,7 +178,7 @@ def scaled_multihead_dot_product_attention( min_val, ) - if is_causal and (not q.size(2) == 1): + if is_causal and (not s_q == 1): s = max(s_q, s_k) causal_mask = attn_weight.new_ones(s, s, dtype=torch.float32) causal_mask = causal_mask.tril() @@ -189,6 +190,31 @@ def scaled_multihead_dot_product_attention( min_val, ) + if sliding_window_size != -1: + window_mask = torch.ones((s_q, s_k), + dtype=torch.bool, + device=attn_weight.device) + if (not s_q == 1): + if s_q != s_k: + raise ValueError( + 'Number of queries should be equal to the number of keys.', + ) + window_mask = torch.tril( + window_mask, + diagonal=sliding_window_size, + ) + window_mask = torch.triu( + window_mask, + diagonal=-sliding_window_size, + ) + else: + window_mask[:, :-(sliding_window_size + 1)] = False + window_mask = ~window_mask + attn_weight = attn_weight.masked_fill( + window_mask.view(1, 1, s_q, s_k), + min_val, + ) + attn_weight = torch.softmax(attn_weight, dim=-1) if dropout_p: @@ -555,7 +581,7 @@ def forward( needs_weights: bool = False, alibi_slopes: Optional[torch.Tensor] = None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]] = None, - prev_layer_key_value: Optional[Tuple[torch.Tensor, + prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[ torch.Tensor, torch.Tensor]]]: @@ -591,6 +617,7 @@ def forward( dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, + sliding_window_size=self.sliding_window_size, 
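# Editor's note (illustrative sketch, not part of the diff): with window size w,
# the banded mask added above, combined with the existing causal mask, restricts
# query i to keys j with i - w <= j <= i in the torch attention path.
# `sliding_window_band` is a hypothetical helper, not a function in the repo.
import torch

def sliding_window_band(seq_len: int, window: int) -> torch.Tensor:
    mask = torch.ones(seq_len, seq_len, dtype=torch.bool)
    # True = attention allowed, before the causal mask is applied
    return torch.tril(mask, diagonal=window) & torch.triu(mask, diagonal=-window)

print(sliding_window_band(5, 2).int())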
**extra_attn_kwargs, ) @@ -599,9 +626,9 @@ def forward( def get_qkv( self, x: torch.Tensor, - prev_layer_key_value: Optional[Tuple[torch.Tensor, + prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Computes and returns the query, key, and value tensors. Args: @@ -673,11 +700,11 @@ def get_qkv( def _apply_rotary_embeddings( self, - rotary_emb_w_meta_info: Dict[str, Any], + rotary_emb_w_meta_info: dict[str, Any], query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: if self.reuse_kv_layer_idx is not None: orig_key, orig_value = key, value key, value = torch.empty_like(key), torch.empty_like(value) @@ -771,7 +798,6 @@ def get_implementation_specific_args( if self.attn_impl == 'flash': extra_attn_kwargs = { 'should_repeat_kv_for_gqa': not is_flash_v2_installed(), - 'sliding_window_size': self.sliding_window_size, 'alibi_slopes': alibi_slopes, 'flash_attn_padding_info': flash_attn_padding_info, 'key_padding_mask': None, diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py index 92735cc489..38a5bdec26 100644 --- a/llmfoundry/models/layers/blocks.py +++ b/llmfoundry/models/layers/blocks.py @@ -4,7 +4,7 @@ """GPT Blocks used for the GPT Model.""" import copy -from typing import Any, Dict, Optional, Set, Tuple +from typing import Any, Optional import torch import torch.nn as nn @@ -38,8 +38,8 @@ def __init__( d_model: int, n_heads: int, expansion_ratio: int, - attn_config: Optional[Dict] = None, - ffn_config: Optional[Dict] = None, + attn_config: Optional[dict] = None, + ffn_config: Optional[dict] = None, resid_pdrop: float = 0.0, norm_type: str = 'low_precision_layernorm', norm_eps: float = 1e-05, @@ -154,17 +154,17 @@ def args_to_exclude_in_attn_class(self): def forward( self, x: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attn_bias: Optional[torch.Tensor] = None, - rotary_emb_w_meta_info: Optional[Dict] = None, + rotary_emb_w_meta_info: Optional[dict] = None, attention_mask: Optional[torch.ByteTensor] = None, is_causal: bool = True, output_attentions: bool = False, alibi_slopes: Optional[torch.Tensor] = None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]] = None, - prev_layer_key_value: Optional[Tuple[torch.Tensor, + prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[ + ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[ torch.Tensor, torch.Tensor]]]: extra_kwargs = {} if prev_layer_key_value is not None: @@ -258,8 +258,8 @@ def __init__( self, d_model: int, n_heads: int, - args_to_exclude_in_attn_class: Set[str], - attn_config: Optional[Dict] = None, + args_to_exclude_in_attn_class: set[str], + attn_config: Optional[dict] = None, ffn_has_norm: bool = False, fc_type: Optional[dict[str, Any]] = None, resid_pdrop: float = 0.0, @@ -316,18 +316,18 @@ def __init__( def forward( self, x: torch.Tensor, - past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + past_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, attn_bias: Optional[torch.Tensor] = None, - rotary_emb_w_meta_info: Optional[Dict] = None, + rotary_emb_w_meta_info: Optional[dict] = None, attention_mask: 
Optional[torch.ByteTensor] = None, is_causal: bool = True, output_attentions: bool = False, alibi_slopes: Optional[torch.Tensor] = None, flash_attn_padding_info: Optional[dict[str, torch.Tensor]] = None, - prev_layer_key_value: Optional[Tuple[torch.Tensor, + prev_layer_key_value: Optional[tuple[torch.Tensor, torch.Tensor]] = None, - ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], - Optional[Tuple[torch.Tensor, torch.Tensor]]]: + ) -> tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor], + Optional[tuple[torch.Tensor, torch.Tensor]]]: a = self.norm_1(x) extra_kwargs = {} if prev_layer_key_value is not None: diff --git a/llmfoundry/models/layers/dmoe.py b/llmfoundry/models/layers/dmoe.py index 59508e0a50..1390cd90eb 100644 --- a/llmfoundry/models/layers/dmoe.py +++ b/llmfoundry/models/layers/dmoe.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import partial -from typing import Callable, Optional, Tuple, Union +from typing import Callable, Optional, Union import torch import torch.nn.functional as F @@ -72,7 +72,7 @@ def jitter(self, x: torch.Tensor) -> torch.Tensor: ) return low + noise * (high - low) - def _top_k(self, scores: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + def _top_k(self, scores: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: if self.moe_top_k == 1: values, indices = scores.max(dim=-1,) return values.unsqueeze(-1), indices.unsqueeze(-1) diff --git a/llmfoundry/models/layers/ffn.py b/llmfoundry/models/layers/ffn.py index f5d6d67040..40211fc875 100644 --- a/llmfoundry/models/layers/ffn.py +++ b/llmfoundry/models/layers/ffn.py @@ -6,7 +6,7 @@ import logging from copy import deepcopy from functools import partial -from typing import Any, Callable, List, Optional, Union +from typing import Any, Callable, Optional, Union import torch import torch.nn as nn @@ -126,7 +126,7 @@ def resolve_ffn_hidden_size( def dtensorify_param( param: nn.Parameter, mesh: DeviceMesh, - placements: List[Placement], + placements: list[Placement], ): """Construct a DTensor from an already sharded local parameter.""" param_dtensor = DTensor.from_local( @@ -437,7 +437,7 @@ def set_ffn_device_mesh( """ if moe_world_size > 1: expert_mesh = device_mesh['expert_parallel'] - expert_placements: List[Placement] = [Shard(0)] + expert_placements: list[Placement] = [Shard(0)] # Register in two loops as you cannot overwrite parameters while iterating over named_parameters() dtensorified_params = [( name, diff --git a/llmfoundry/models/layers/layer_builders.py b/llmfoundry/models/layers/layer_builders.py index d5fd1d37d4..37bcca8c2e 100644 --- a/llmfoundry/models/layers/layer_builders.py +++ b/llmfoundry/models/layers/layer_builders.py @@ -1,7 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import torch @@ -25,7 +25,7 @@ def build_norm( name: str, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: Optional[float] = 1e-5, device: Optional[str] = None, ): @@ -49,7 +49,7 @@ def build_ffn( expansion_ratio: float, device: Optional[str], bias: bool, - ffn_kwargs: Dict[str, Any], + ffn_kwargs: dict[str, Any], ): registry_to_use = ffns @@ -90,7 +90,7 @@ def _validation_function(maybe_module: Any): def build_attention_layer( name: str, - attn_kwargs: Dict[str, Any], + attn_kwargs: dict[str, Any], ): return construct_from_registry( name=name, @@ -104,7 +104,7 @@ def 
build_fc( name: str, in_features: int, out_features: int, - fc_kwargs: Dict[str, Any], + fc_kwargs: dict[str, Any], ): kwargs = { 'in_features': in_features, diff --git a/llmfoundry/models/layers/norm.py b/llmfoundry/models/layers/norm.py index c853f5fd26..5aae828188 100644 --- a/llmfoundry/models/layers/norm.py +++ b/llmfoundry/models/layers/norm.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import List, Optional, Union +from typing import Optional, Union import torch @@ -35,7 +35,7 @@ class LPLayerNorm(torch.nn.LayerNorm): def __init__( self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-05, elementwise_affine: bool = True, device: Optional[torch.device] = None, @@ -84,7 +84,7 @@ class RMSNorm(torch.nn.Module): def __init__( self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-5, weight: bool = True, dtype: Optional[torch.dtype] = None, @@ -108,7 +108,7 @@ class LPRMSNorm(RMSNorm): def __init__( self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-5, weight: bool = True, dtype: Optional[torch.dtype] = None, @@ -137,7 +137,7 @@ class TritonRMSNorm(torch.nn.Module): def __init__( self, - normalized_shape: Union[int, List[int], torch.Size], + normalized_shape: Union[int, list[int], torch.Size], eps: float = 1e-5, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, diff --git a/llmfoundry/models/mpt/configuration_mpt.py b/llmfoundry/models/mpt/configuration_mpt.py index 9671eb6ed5..759f347e89 100644 --- a/llmfoundry/models/mpt/configuration_mpt.py +++ b/llmfoundry/models/mpt/configuration_mpt.py @@ -5,7 +5,7 @@ import copy import warnings -from typing import Any, Dict, Optional, Union +from typing import Any, Optional, Union from transformers import PretrainedConfig @@ -37,8 +37,8 @@ def __init__( resid_pdrop: float = 0.0, emb_pdrop: float = 0.0, learned_pos_emb: bool = True, - attn_config: Optional[Dict] = None, - ffn_config: Optional[Dict] = None, + attn_config: Optional[dict] = None, + ffn_config: Optional[dict] = None, init_device: str = 'cpu', logit_scale: Optional[Union[float, str]] = None, no_bias: bool = False, @@ -46,11 +46,11 @@ def __init__( norm_type: str = 'low_precision_layernorm', norm_eps: float = 1e-05, use_cache: bool = False, - init_config: Optional[Dict] = None, - fc_type: Union[str, Dict] = 'torch', + init_config: Optional[dict] = None, + fc_type: Union[str, dict] = 'torch', tie_word_embeddings: bool = True, use_pad_tok_in_ffn: bool = True, - block_overrides: Optional[Dict[str, Any]] = None, + block_overrides: Optional[dict[str, Any]] = None, **kwargs: Any, ): """The MPT configuration class. 
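# Editor's note (hypothetical usage sketch, not part of the diff): attn_config
# is a plain dict merged with attn_config_defaults by the _set_config_defaults
# helper shown later in this diff, so a partial override is enough. Since this
# PR adds sliding-window support to the torch attention path, a config like the
# following should now pass validation; the import path is assumed.
from llmfoundry.models.mpt import MPTConfig

cfg = MPTConfig(attn_config={'attn_impl': 'torch', 'sliding_window_size': 128})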
@@ -210,7 +210,7 @@ def __init__( self._validate_config() - def _validate_block_overrides(self, block_overrides: Dict[str, Any]): + def _validate_block_overrides(self, block_overrides: dict[str, Any]): warnings.warn(ExperimentalWarning('block_overrides')) if 'order' not in block_overrides: raise ValueError('`order` should be defined in block_overrides',) @@ -229,9 +229,9 @@ def _validate_block_overrides(self, block_overrides: Dict[str, Any]): def _set_config_defaults( self, - config: Dict[str, Any], - config_defaults: Dict[str, Any], - ) -> Dict[str, Any]: + config: dict[str, Any], + config_defaults: dict[str, Any], + ) -> dict[str, Any]: # set config defaults for k, v in config_defaults.items(): if k not in config: @@ -329,12 +329,11 @@ def _validate_config(self) -> None: raise ImportError( 'If using the dail implementation of rope, the flash_attn library v2.0.1 or higher must be installed. Please check the instructions at https://github.com/mosaicml/llm-foundry/blob/main/TUTORIAL.md#what-kinds-of-positional-embeddings-does-llm-foundry-support', ) - if self.attn_config['sliding_window_size'] != -1 and not ( - self.attn_config['attn_impl'] == 'flash' and - is_flash_v2_installed(v2_version='v2.3.0') - ): + if self.attn_config['sliding_window_size'] != -1 and self.attn_config[ + 'attn_impl' + ] == 'flash' and not is_flash_v2_installed(v2_version='v2.3.0',): raise NotImplementedError( - 'sliding window only implemented with flash attention v2.3.0 or higher.', + 'sliding window attention only implemented for torch attention and flash attention (v2.3.0 or higher).', ) if self.embedding_fraction > 1 or self.embedding_fraction <= 0: raise ValueError( diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 6f9b6bf806..2ae733a6f1 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -14,13 +14,9 @@ from functools import cached_property from typing import ( Any, - Dict, - List, Mapping, MutableMapping, Optional, - Tuple, - Type, Union, ) @@ -348,7 +344,7 @@ def forward( self, x: torch.Tensor, position_ids: torch.Tensor, - ) -> Tuple[torch.Tensor, torch.Tensor]: + ) -> tuple[torch.Tensor, torch.Tensor]: # In this subclass, we move `inv_freq` to same device as position_ids. This operation should be a no-op during training. # This is done to fix pipeline parallel generation using hf.generate. 
Please see this comment for details: https://github.com/mosaicml/llm-foundry/pull/1334#issue-2387337525 self.inv_freq = self.inv_freq.to(position_ids.device) @@ -480,7 +476,7 @@ def __init__(self, config: MPTConfig): log.debug(f'Using {self.config.init_config["name"]} initialization.') @property - def block_class(self) -> Type[MPTBlock]: + def block_class(self) -> type[MPTBlock]: return MPTBlock def construct_blocks(self, config: MPTConfig) -> nn.ModuleList: @@ -517,8 +513,8 @@ def construct_blocks(self, config: MPTConfig) -> nn.ModuleList: def _get_override_block_args_list( self, config: MPTConfig, - block_args: Dict[str, Any], - ) -> List[Dict[str, Any]]: + block_args: dict[str, Any], + ) -> list[dict[str, Any]]: if config.block_overrides is None: raise ValueError( 'config.block_overrides should not be None when calling _get_override_block_args_list.', @@ -581,11 +577,11 @@ def _get_override_block_args_list( @staticmethod def _resolve_reuse_kv_layer_idx( - overrides_definition: Dict[str, Any], - model_modules_order_expanded: List[str], + overrides_definition: dict[str, Any], + model_modules_order_expanded: list[str], b_idx: int, - override_config: Dict[str, Any], - reuse_kv_layer_idx_dict: Dict[int, int], + override_config: dict[str, Any], + reuse_kv_layer_idx_dict: dict[int, int], ) -> int: override_attn_config = override_config['attn_config'] if override_attn_config['reuse_kv_layer_idx'] >= 0: @@ -621,7 +617,7 @@ def _resolve_reuse_kv_layer_idx( return reuse_kv_layer_idx @staticmethod - def _get_modules_order_expanded(order: List[Dict[str, Any]]) -> List[str]: + def _get_modules_order_expanded(order: list[dict[str, Any]]) -> list[str]: model_modules_order_expanded = [] for item in order: repeat = item['repeat'] if 'repeat' in item else 1 @@ -642,10 +638,10 @@ def _get_modules_order_expanded(order: List[Dict[str, Any]]) -> List[str]: @staticmethod def _override_block_args( - block_args: Dict[str, Any], - override_config: Dict[str, Any], - allowed_block_overrides: Dict[str, Any], - ) -> Dict[str, Any]: + block_args: dict[str, Any], + override_config: dict[str, Any], + allowed_block_overrides: dict[str, Any], + ) -> dict[str, Any]: unpermitted_keys = override_config.keys( ) - allowed_block_overrides.keys() if len(unpermitted_keys): @@ -668,7 +664,7 @@ def _override_block_args( new_block_args[k] = override_config[k] return new_block_args - def extract_block_args(self, block_args: Dict[str, Any]) -> Dict[str, Any]: + def extract_block_args(self, block_args: dict[str, Any]) -> dict[str, Any]: """Sets the block args.""" if block_args['ffn_config']['ffn_type'] in ffns_with_megablocks: block_args['ffn_config'] = config_moe_args( @@ -696,7 +692,7 @@ def _attn_bias( dtype: torch.dtype, attention_mask: Optional[torch.ByteTensor] = None, sequence_id: Optional[torch.LongTensor] = None, - ) -> Tuple[Optional[torch.Tensor], Optional[torch.ByteTensor]]: + ) -> tuple[Optional[torch.Tensor], Optional[torch.ByteTensor]]: if not self._attn_bias_initialized: if self.attn_bias_shape: self.attn_bias = torch.zeros( @@ -759,7 +755,7 @@ def _attn_bias( def forward( self, input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[list[tuple[torch.FloatTensor]]] = None, attention_mask: Optional[torch.ByteTensor] = None, sequence_id: Optional[torch.LongTensor] = None, return_dict: Optional[bool] = None, @@ -1074,7 +1070,7 @@ def __init__(self, config: MPTConfig): self.logit_scale = logit_scale @property - def 
backbone_model_class(self) -> Type[MPTModel]: + def backbone_model_class(self) -> type[MPTModel]: return MPTModel def get_input_embeddings(self) -> Union[SharedEmbedding, nn.Embedding]: @@ -1126,7 +1122,7 @@ def get_decoder(self) -> MPTModel: def forward( self, input_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[list[tuple[torch.FloatTensor]]] = None, attention_mask: Optional[torch.ByteTensor] = None, sequence_id: Optional[torch.LongTensor] = None, labels: Optional[torch.LongTensor] = None, @@ -1267,11 +1263,11 @@ def activation_checkpointing_fn(self, module: nn.Module) -> bool: def prepare_inputs_for_generation( self, input_ids: torch.Tensor, - past_key_values: Optional[List[Tuple[torch.Tensor, + past_key_values: Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None, inputs_embeds: Optional[torch.Tensor] = None, **kwargs: Any, - ) -> Dict[str, Any]: + ) -> dict[str, Any]: attention_mask = kwargs['attention_mask'].bool() if attention_mask[:, -1].sum() != attention_mask.shape[0]: raise NotImplementedError( @@ -1303,9 +1299,9 @@ def prepare_inputs_for_generation( @staticmethod def _reorder_cache( - past_key_values: List[Tuple[torch.Tensor, torch.Tensor]], + past_key_values: list[tuple[torch.Tensor, torch.Tensor]], beam_idx: torch.LongTensor, - ) -> List[Tuple[torch.Tensor, ...]]: + ) -> list[tuple[torch.Tensor, ...]]: """Used by HuggingFace generate when using beam search with kv-caching. See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133 @@ -1362,9 +1358,9 @@ def __init__( self, tokenizer: Optional[PreTrainedTokenizerBase] = None, use_train_metrics: Optional[bool] = True, - additional_train_metrics: Optional[List] = None, - loss_fn: Optional[Union[str, Dict]] = 'fused_crossentropy', - **kwargs: Dict[str, Any], + additional_train_metrics: Optional[list] = None, + loss_fn: Optional[Union[str, dict]] = 'fused_crossentropy', + **kwargs: dict[str, Any], ): from llmfoundry.metrics import ( DEFAULT_CAUSAL_LM_EVAL_METRICS, @@ -1425,11 +1421,11 @@ def __init__( ) @property - def model_class(self) -> Type[MPTForCausalLM]: + def model_class(self) -> type[MPTForCausalLM]: return MPTForCausalLM @property - def config_class(self) -> Type[MPTConfig]: + def config_class(self) -> type[MPTConfig]: return MPTConfig def get_targets(self, batch: Mapping) -> torch.Tensor: diff --git a/llmfoundry/models/utils/config_defaults.py b/llmfoundry/models/utils/config_defaults.py index c272a52dd4..8a15f0d81a 100644 --- a/llmfoundry/models/utils/config_defaults.py +++ b/llmfoundry/models/utils/config_defaults.py @@ -3,13 +3,11 @@ """Defaults for MPT model component configs.""" -from typing import Dict - -ffn_config_defaults: Dict = { +ffn_config_defaults: dict = { 'ffn_type': 'mptmlp', } -attn_config_defaults: Dict = { +attn_config_defaults: dict = { 'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'flash', @@ -36,7 +34,7 @@ }, } -init_config_defaults: Dict = { +init_config_defaults: dict = { 'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', @@ -47,6 +45,6 @@ 'init_gain': 0.0, } -fc_type_defaults: Dict = { +fc_type_defaults: dict = { 'name': 'torch', } diff --git a/llmfoundry/models/utils/param_init_fns.py b/llmfoundry/models/utils/param_init_fns.py index 6ff241870d..9941c2d049 100644 --- a/llmfoundry/models/utils/param_init_fns.py +++ b/llmfoundry/models/utils/param_init_fns.py @@ 
-6,7 +6,7 @@ from collections.abc import Sequence from copy import deepcopy from functools import partial -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Optional, Union import torch from torch import nn @@ -193,7 +193,7 @@ def embedding_init( module: nn.Module, init_fn_: Callable, emb_init_std: Optional[float], - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]], + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]], **kwargs: Any, ) -> bool: del kwargs # unused, just to capture any extra args @@ -408,7 +408,7 @@ def generic_param_init_fn_( d_model: Optional[int] = None, init_div_is_residual: Union[int, float, str, bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, **kwargs: Any, ) -> None: del kwargs # unused, just to capture any extra args from the config @@ -674,7 +674,7 @@ def _normal_param_init_fn_( d_model: Optional[int] = None, init_div_is_residual: Union[int, float, str, bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, **kwargs: Any, ) -> None: del kwargs # unused, just to capture any extra args from the config @@ -698,7 +698,7 @@ def baseline_param_init_fn_( d_model: Optional[int] = None, init_div_is_residual: Union[int, float, str, bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, **kwargs: Any, ) -> None: del kwargs # unused, just to capture any extra args from the config @@ -723,7 +723,7 @@ def small_param_init_fn_( d_model: int, init_div_is_residual: Union[int, float, str, bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, **kwargs: Any, ) -> None: del kwargs # unused, just to capture any extra args from the config @@ -746,7 +746,7 @@ def neox_param_init_fn_( n_layers: int, d_model: int, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, **kwargs: Any, ) -> None: """From section 2.3.1 of GPT-NeoX-20B: @@ -774,7 +774,7 @@ def kaiming_uniform_param_init_fn_( d_model: Optional[int] = None, init_div_is_residual: Union[int, float, str, bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, init_gain: float = 0, fan_mode: str = 'fan_in', init_nonlinearity: str = 'leaky_relu', @@ -806,7 +806,7 @@ def kaiming_normal_param_init_fn_( d_model: Optional[int] = None, init_div_is_residual: Union[int, float, str, bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, init_gain: float = 0, fan_mode: str = 'fan_in', init_nonlinearity: str = 'leaky_relu', @@ -838,7 +838,7 @@ def xavier_uniform_param_init_fn_( d_model: Optional[int] = None, init_div_is_residual: Union[int, float, str, 
bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, init_gain: float = 0, **kwargs: Any, ) -> None: @@ -862,7 +862,7 @@ def xavier_normal_param_init_fn_( d_model: Optional[int] = None, init_div_is_residual: Union[int, float, str, bool] = True, emb_init_std: Optional[float] = None, - emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None, + emb_init_uniform_lim: Optional[Union[tuple[float, float], float]] = None, init_gain: float = 0, **kwargs: Any, ) -> None: diff --git a/llmfoundry/optim/adaptive_lion.py b/llmfoundry/optim/adaptive_lion.py index cb4ce59cd0..bf4a686b22 100644 --- a/llmfoundry/optim/adaptive_lion.py +++ b/llmfoundry/optim/adaptive_lion.py @@ -3,7 +3,7 @@ import logging import math -from typing import Callable, Dict, Iterable, Optional, Tuple, Union +from typing import Callable, Iterable, Optional, Union import torch from composer.utils import dist @@ -59,7 +59,7 @@ def __init__( self, params: Union[Iterable[torch.Tensor], Iterable[dict]], lr: float = 1e-4, - betas: Tuple[float, float] = (0.9, 0.99), + betas: tuple[float, float] = (0.9, 0.99), weight_decay: float = 0.0, outlier_threshold: float = 10.0, timeout: int = 100, @@ -198,7 +198,7 @@ def step(self, closure: Optional[Callable] = None): return loss - def dist_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): + def dist_reduce_metrics(self, optimizer_metrics: dict[str, torch.Tensor]): for metric in optimizer_metrics: if metric.startswith('l2_norm'): reduced = optimizer_metrics[metric] @@ -229,7 +229,7 @@ def dist_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): return optimizer_metrics - def pre_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): + def pre_reduce_metrics(self, optimizer_metrics: dict[str, torch.Tensor]): """Preprocess metrics to reduce across ranks correctly.""" # Only L2 norm metric keys are present, can skip sorting at this stage for metric in optimizer_metrics: @@ -310,7 +310,7 @@ def __init__( self, params: Union[Iterable[torch.Tensor], Iterable[dict]], lr: float = 1e-4, - betas: Tuple[float, float] = (0.9, 0.99), + betas: tuple[float, float] = (0.9, 0.99), weight_decay: float = 0.0, outlier_threshold: float = 5.0, ): @@ -404,7 +404,7 @@ def step(self, closure: Optional[Callable] = None): return loss - def dist_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): + def dist_reduce_metrics(self, optimizer_metrics: dict[str, torch.Tensor]): local_keys = list(optimizer_metrics.keys()) all_gathered_keys = dist.all_gather_object(local_keys) all_keys = set() @@ -431,7 +431,7 @@ def dist_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): return optimizer_metrics - def pre_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): + def pre_reduce_metrics(self, optimizer_metrics: dict[str, torch.Tensor]): """Preprocess metrics to reduce across ranks correctly.""" # Sort L2 norms first so they are squared before other metrics, which depend on squared values metrics = optimizer_metrics.keys() diff --git a/llmfoundry/optim/lion.py b/llmfoundry/optim/lion.py index 667c3f55b1..8b5bbdf7f6 100644 --- a/llmfoundry/optim/lion.py +++ b/llmfoundry/optim/lion.py @@ -3,7 +3,7 @@ import logging import math -from typing import Callable, Dict, Iterable, Optional, Tuple, Union +from typing import Callable, Iterable, Optional, Union import torch from composer.utils import dist 
@@ -36,7 +36,7 @@ def __init__( self, params: Union[Iterable[torch.Tensor], Iterable[dict]], lr: float = 1e-4, - betas: Tuple[float, float] = (0.9, 0.99), + betas: tuple[float, float] = (0.9, 0.99), weight_decay: float = 0.0, ): if lr <= 0.: @@ -111,7 +111,7 @@ def step(self, closure: Optional[Callable] = None): return loss - def dist_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): + def dist_reduce_metrics(self, optimizer_metrics: dict[str, torch.Tensor]): local_keys = list(optimizer_metrics.keys()) all_gathered_keys = dist.all_gather_object(local_keys) all_keys = set() @@ -136,7 +136,7 @@ def dist_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): return optimizer_metrics - def pre_reduce_metrics(self, optimizer_metrics: Dict[str, torch.Tensor]): + def pre_reduce_metrics(self, optimizer_metrics: dict[str, torch.Tensor]): """Preprocess metrics to reduce across ranks correctly.""" # Only L2 norm metric keys are present, can skip sorting at this stage for metric in optimizer_metrics: diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 3f0163ff01..cb2455a760 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -1,6 +1,6 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Dict, Iterable, Tuple, Type, Union +from typing import Any, Callable, Iterable, Union from composer.core import Algorithm, Callback, DataSpec from composer.loggers import LoggerDestination @@ -43,7 +43,7 @@ loggers = create_registry( 'llmfoundry', 'loggers', - generic_type=Type[LoggerDestination], + generic_type=type[LoggerDestination], entry_points=True, description=_loggers_description, ) @@ -64,7 +64,7 @@ callbacks = create_registry( 'llmfoundry', 'callbacks', - generic_type=Type[Callback], + generic_type=type[Callback], entry_points=True, description=_callbacks_description, ) @@ -86,7 +86,7 @@ callbacks_with_config = create_registry( 'llmfoundry', 'callbacks_with_config', - generic_type=Type[CallbackWithConfig], + generic_type=type[CallbackWithConfig], entry_points=True, description=_callbacks_with_config_description, ) @@ -108,7 +108,7 @@ optimizers = create_registry( 'llmfoundry', 'optimizers', - generic_type=Type[Optimizer], + generic_type=type[Optimizer], entry_points=True, description=_optimizers_description, ) @@ -129,7 +129,7 @@ algorithms = create_registry( 'llmfoundry', 'algorithms', - generic_type=Type[Algorithm], + generic_type=type[Algorithm], entry_points=True, description=_algorithms_description, ) @@ -150,7 +150,7 @@ schedulers = create_registry( 'llmfoundry', 'schedulers', - generic_type=Type[ComposerScheduler], + generic_type=type[ComposerScheduler], entry_points=True, description=_schedulers_description, ) @@ -163,7 +163,7 @@ tokenizers = create_registry( 'llmfoundry', 'tokenizers', - generic_type=Type[PreTrainedTokenizerBase], + generic_type=type[PreTrainedTokenizerBase], entry_points=True, description=_tokenizers_description, ) @@ -185,7 +185,7 @@ models = create_registry( 'llmfoundry', 'models', - generic_type=Type[ComposerModel], + generic_type=type[ComposerModel], entry_points=True, description=_models_description, ) @@ -234,8 +234,8 @@ 'llmfoundry', 'dataset_replication_validators', generic_type=Callable[ - [Dict[str, Any], PreTrainedTokenizerBase, Union[int, float]], - Tuple[int, int]], + [dict[str, Any], PreTrainedTokenizerBase, Union[int, float]], + tuple[int, int]], entry_points=True, description=_dataset_replication_validators_description, ) @@ -258,8 +258,8 @@ 
collators = create_registry( 'llmfoundry', 'collators', - generic_type=Callable[[Dict[str, Any], PreTrainedTokenizerBase, int], - Tuple[Any, int]], + generic_type=Callable[[dict[str, Any], PreTrainedTokenizerBase, int], + tuple[Any, int]], entry_points=True, description=_collators_description, ) @@ -280,7 +280,7 @@ data_specs = create_registry( 'llmfoundry', 'data_specs', - generic_type=Callable[[Union[Iterable, TorchDataloader], Dict[str, Any]], + generic_type=Callable[[Union[Iterable, TorchDataloader], dict[str, Any]], DataSpec], entry_points=True, description=_data_specs_description, @@ -302,7 +302,7 @@ metrics = create_registry( 'llmfoundry', 'metrics', - generic_type=Type[Metric], + generic_type=type[Metric], entry_points=True, description=_metrics_description, ) @@ -327,7 +327,7 @@ # llmfoundry.eval.InContextLearningDataset. # Using ICL dataset here introduces a circular import dependency between # the registry and eval packages right now, thus needs some refactoring. - generic_type=Type[Dataset], + generic_type=type[Dataset], entry_points=True, description=_icl_datasets_description, ) @@ -348,7 +348,7 @@ config_transforms = create_registry( 'llmfoundry', 'config_transforms', - generic_type=Callable[[Dict[str, Any]], Dict[str, Any]], + generic_type=Callable[[dict[str, Any]], dict[str, Any]], entry_points=True, description=_config_transforms_description, ) @@ -366,7 +366,7 @@ load_planners = create_registry( 'llmfoundry', 'load_planners', - generic_type=Type[LoadPlanner], + generic_type=type[LoadPlanner], entry_points=True, description=_load_planners_description, ) @@ -384,7 +384,7 @@ save_planners = create_registry( 'llmfoundry', 'save_planners', - generic_type=Type[SavePlanner], + generic_type=type[SavePlanner], entry_points=True, description=_save_planners_description, ) diff --git a/llmfoundry/tokenizers/tiktoken.py b/llmfoundry/tokenizers/tiktoken.py index fd0fc5948a..6458ad3ba4 100644 --- a/llmfoundry/tokenizers/tiktoken.py +++ b/llmfoundry/tokenizers/tiktoken.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from functools import lru_cache -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Optional from transformers import PreTrainedTokenizer @@ -148,7 +148,7 @@ def pickle_Encoding(enc: Encoding): self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} self.errors = errors - self.decoder: Dict[int, str] = {} + self.decoder: dict[int, str] = {} for i in range(self.encoding.n_vocab): try: self.encoding.decode_single_token_bytes(i) @@ -162,7 +162,7 @@ def pickle_Encoding(enc: Encoding): ]) self.decoder[i] = decoding - self.encoder: Dict[str, int] = {} + self.encoder: dict[str, int] = {} for i in range(self.encoding.n_vocab): if i in self.decoder: self.encoder[self.decoder[i]] = i @@ -231,7 +231,7 @@ def default_chat_template(self): ) return template - def get_vocab(self) -> Dict[str, int]: + def get_vocab(self) -> dict[str, int]: """Returns vocab as a dict.""" # As far as I can tell, we don't require get_vocab to completely work, # but when using additional_special_tokens, Hugging Face determines the next @@ -255,7 +255,7 @@ def get_vocab(self) -> Dict[str, int]: return dict(vocab_clone, **self.added_tokens_encoder) - def _tokenize(self, text: str) -> List[str]: + def _tokenize(self, text: str) -> list[str]: """Returns a tokenized string.""" if not isinstance(text, str): raise ValueError( @@ -280,7 +280,7 @@ def _convert_id_to_token(self, index: int) -> Optional[str]: # but not slow tokenizers. 
return self.decoder.get(index, '') - def convert_tokens_to_string(self, tokens: List[str]) -> str: + def convert_tokens_to_string(self, tokens: list[str]) -> str: """Converts a sequence of tokens (string) in a single string.""" text = ''.join(tokens) text = bytearray([self.byte_decoder[c] for c in text @@ -289,9 +289,9 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str: def build_inputs_with_special_tokens( self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - ) -> List[int]: + token_ids_0: list[int], + token_ids_1: Optional[list[int]] = None, + ) -> list[int]: bos_token_id = [self.bos_token_id] if self.add_bos_token else [] eos_token_id = [self.eos_token_id] if self.add_eos_token else [] @@ -304,10 +304,10 @@ def build_inputs_with_special_tokens( def get_special_tokens_mask( self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, + token_ids_0: list[int], + token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False, - ) -> List[int]: + ) -> list[int]: """Retrieves sequence ids from a token list that has no special tokens. Function copied from @@ -346,9 +346,9 @@ def get_special_tokens_mask( def create_token_type_ids_from_sequences( self, - token_ids_0: List[int], - token_ids_1: Optional[List[int]] = None, - ) -> List[int]: + token_ids_0: list[int], + token_ids_1: Optional[list[int]] = None, + ) -> list[int]: sep = [self.sep_token_id] if token_ids_1 is None: @@ -359,7 +359,7 @@ def save_vocabulary( self, save_directory: str, filename_prefix: Optional[str] = None, - ) -> Tuple[str]: + ) -> tuple[str]: # ignore the below type to keep the original signature # we are knowingly breaking the signature here, although not 100% certain diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a1d84601b3..f2d5cfc0f7 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -11,11 +11,8 @@ from typing import ( Any, ContextManager, - Dict, Iterable, - List, Optional, - Tuple, Union, ) @@ -59,15 +56,15 @@ def build_evaluators( - eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], - icl_tasks_config: Optional[Union[str, List[Dict[str, Any]]]], - eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], + eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], + icl_tasks_config: Optional[Union[str, list[dict[str, Any]]]], + eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], *, tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: Union[int, float], icl_seq_len: int, icl_subset_num_batches: Optional[int], -) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: +) -> tuple[list[Evaluator], list[str], Optional[EvalGauntlet]]: evaluators = [] if eval_loader_config is not None: @@ -98,11 +95,11 @@ def build_evaluators( def build_eval_loaders( - eval_loader_config: Union[Dict[str, Any], List[Dict[str, Any]]], + eval_loader_config: Union[dict[str, Any], list[dict[str, Any]]], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: Union[int, float], -) -> List[Evaluator]: - evaluators: List[Evaluator] = [] +) -> list[Evaluator]: + evaluators: list[Evaluator] = [] if isinstance(eval_loader_config, list): eval_configs = eval_loader_config is_multi_eval = True @@ -134,9 +131,9 @@ def build_eval_loaders( def add_metrics_to_eval_loaders( - evaluators: List[Evaluator], - metric_names: List[str], -) -> List[Evaluator]: + evaluators: list[Evaluator], + metric_names: list[str], +) -> list[Evaluator]: eval_loaders, 
other_evaluators = [], [] for evaluator in evaluators: if evaluator.metric_names == []: @@ -150,13 +147,13 @@ def add_metrics_to_eval_loaders( def build_icl_data_and_gauntlet( - icl_tasks_config: Union[str, List[Dict[str, Any]]], - eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], + icl_tasks_config: Union[str, list[dict[str, Any]]], + eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, icl_subset_num_batches: Optional[int] = None, -) -> Tuple[List[Evaluator], List[str], Optional[EvalGauntlet]]: +) -> tuple[list[Evaluator], list[str], Optional[EvalGauntlet]]: icl_evaluators, logger_keys = build_icl_evaluators( icl_tasks_config, tokenizer, @@ -229,7 +226,7 @@ def build_save_planner(name: str, **kwargs: Any) -> SavePlanner: def build_composer_model( name: str, - cfg: Dict[str, Any], + cfg: dict[str, Any], tokenizer: PreTrainedTokenizerBase, init_context: Optional[ContextManager] = None, master_weights_dtype: Optional[str] = None, @@ -282,7 +279,7 @@ def build_composer_model( def build_callback( name: str, - kwargs: Optional[Dict[str, Any]] = None, + kwargs: Optional[dict[str, Any]] = None, train_config: Any = None, ) -> Callback: """Builds a callback from the registry.""" @@ -309,7 +306,7 @@ def build_callback( def build_logger( name: str, - kwargs: Optional[Dict[str, Any]] = None, + kwargs: Optional[dict[str, Any]] = None, ) -> LoggerDestination: """Builds a logger from the registry.""" return construct_from_registry( @@ -324,7 +321,7 @@ def build_logger( def build_algorithm( name: str, - kwargs: Optional[Dict[str, Any]] = None, + kwargs: Optional[dict[str, Any]] = None, ) -> Algorithm: """Builds an algorithm from the registry.""" return construct_from_registry( @@ -337,7 +334,7 @@ def build_algorithm( ) -def build_metric(name: str, kwargs: Optional[Dict[str, Any]] = None) -> Metric: +def build_metric(name: str, kwargs: Optional[dict[str, Any]] = None) -> Metric: """Builds a metric from the registry.""" return construct_from_registry( name=name, @@ -351,8 +348,8 @@ def build_metric(name: str, kwargs: Optional[Dict[str, Any]] = None) -> Metric: def _extract_param_groups( model: torch.nn.Module, - optimizer_config: Optional[Dict[str, Any]] = None, -) -> Union[Iterable[torch.Tensor], Iterable[Dict[str, Any]]]: + optimizer_config: Optional[dict[str, Any]] = None, +) -> Union[Iterable[torch.Tensor], Iterable[dict[str, Any]]]: """Extracts parameter groups defined in the optimizer config. The optimizer_config defines the optimizer args. 
It can additionally have key @@ -455,7 +452,7 @@ def _extract_param_groups( def build_optimizer( model: torch.nn.Module, name: str, - optimizer_config: Dict[str, Any], + optimizer_config: dict[str, Any], ) -> Optimizer: params = _extract_param_groups(model, optimizer_config) @@ -480,7 +477,7 @@ def build_optimizer( def build_scheduler( name: str, - scheduler_config: Optional[Dict[str, Any]] = None, + scheduler_config: Optional[dict[str, Any]] = None, ) -> ComposerScheduler: return construct_from_registry( name=name, @@ -494,7 +491,7 @@ def build_scheduler( def build_tokenizer( tokenizer_name: str, - tokenizer_kwargs: Dict[str, Any], + tokenizer_kwargs: dict[str, Any], ) -> PreTrainedTokenizerBase: os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1' os.environ['TOKENIZERS_PARALLELISM'] = 'false' @@ -550,13 +547,13 @@ def build_tokenizer( def build_icl_evaluators( - icl_tasks: Union[str, List[Dict[str, Any]]], + icl_tasks: Union[str, list[dict[str, Any]]], tokenizer: PreTrainedTokenizerBase, default_max_seq_len: int, default_batch_size: int, destination_dir: Optional[str] = None, icl_subset_num_batches: Optional[int] = None, -) -> Tuple[List[Evaluator], List[str]]: +) -> tuple[list[Evaluator], list[str]]: if destination_dir is None: destination_dir = os.getcwd() @@ -572,7 +569,7 @@ def build_icl_evaluators( else: icl_tasks_list = icl_tasks - def _validate_cfg(icl_cfg: Dict[str, Any]): + def _validate_cfg(icl_cfg: dict[str, Any]): assert 'label' in icl_cfg assert 'dataset_uri' in icl_cfg and icl_cfg['dataset_uri'] is not None assert 'icl_task_type' in icl_cfg diff --git a/llmfoundry/utils/checkpoint_conversion_helpers.py b/llmfoundry/utils/checkpoint_conversion_helpers.py index 5c65a7475e..39e79c3271 100644 --- a/llmfoundry/utils/checkpoint_conversion_helpers.py +++ b/llmfoundry/utils/checkpoint_conversion_helpers.py @@ -15,7 +15,7 @@ import random import string from pathlib import Path -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Optional, Union import numpy as np from transformers import ( @@ -44,7 +44,7 @@ def _get_weight_data_type(data_type: str): # TODO: move this functionality to composer once the bug fixes are upstreamed def get_hf_tokenizer_from_composer_state_dict( - state_dict: Dict[str, Any], + state_dict: dict[str, Any], trust_remote_code: bool, tokenizer_save_dir: Optional[str] = None, ) -> Optional[PreTrainedTokenizer]: @@ -138,7 +138,7 @@ def load_tokenizer( def _write_zero_bias( weight_name: str, weight_file_path: str, - bias_shape: Union[Tuple[int, ...], int], + bias_shape: Union[tuple[int, ...], int], np_data_type: np.dtype, ) -> None: """Write zeros for bias when converting MPT to FasterTransformer weights. 
@@ -165,7 +165,7 @@ def _convert_weight_to_ft_each( save_dir: str, infer_gpu_num: int, tensor_name: str, - config: Dict[str, Any], + config: dict[str, Any], data: np.ndarray, np_weight_data_type: np.dtype, ) -> None: diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index ece3cc2bad..1d6303a2cf 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -11,18 +11,15 @@ from typing import ( Any, Callable, - Dict, - List, Literal, Mapping, Optional, - Set, - Tuple, TypeVar, Union, ) import mlflow +from composer.loggers import Logger from composer.utils import dist, parse_uri from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om @@ -47,27 +44,27 @@ @dataclass class EvalConfig: # Eval Config required parameters: - models: List[Dict[str, Any]] = MISSING + models: list[dict[str, Any]] = MISSING max_seq_len: int = MISSING device_eval_batch_size: Union[int, float] = MISSING # Eval Config optional parameters: - code_paths: Optional[List[str]] = None + code_paths: Optional[list[str]] = None # Eval hyperparameters - eval_gauntlet: Optional[Dict[str, Any]] = None + eval_gauntlet: Optional[dict[str, Any]] = None eval_gauntlet_str: Optional[str] = None - eval_loader: Optional[Dict[str, Any]] = None - eval_loaders: Optional[List[Dict[str, Any]]] = None + eval_loader: Optional[dict[str, Any]] = None + eval_loaders: Optional[list[dict[str, Any]]] = None eval_subset_num_batches: int = -1 icl_subset_num_batches: Optional[int] = None # One of icl_tasks or icl_tasks_str must be specified - icl_tasks: Optional[List[Dict[str, Any]]] = None + icl_tasks: Optional[list[dict[str, Any]]] = None icl_tasks_str: Optional[str] = None # Logging parameters python_log_level: Optional[str] = 'debug' - loggers: Optional[Dict[str, Any]] = None + loggers: Optional[dict[str, Any]] = None console_log_interval: Union[int, str] = '1ba' log_config: bool = True @@ -75,17 +72,17 @@ class EvalConfig: seed: int = 17 precision: str = 'amp_bf16' run_name: Optional[str] = None - metadata: Optional[Dict[str, str]] = None + metadata: Optional[dict[str, str]] = None # Distributed parameters dist_timeout: Union[float, int] = 600.0 - fsdp_config: Optional[Dict[str, Any]] = None + fsdp_config: Optional[dict[str, Any]] = None # Callback parameters - callbacks: Optional[Dict[str, Any]] = None + callbacks: Optional[dict[str, Any]] = None # Variables to ignore - variables: Optional[Dict[str, Any]] = None + variables: Optional[dict[str, Any]] = None EVAL_CONFIG_KEYS = {field.name for field in fields(EvalConfig)} @@ -96,11 +93,11 @@ class TrainConfig: """Dataclass for training configuration.""" # Mandatory model training parameters - model: Dict[str, Any] = MISSING - tokenizer: Dict[str, Any] = MISSING - optimizer: Dict[str, Any] = MISSING - scheduler: Dict[str, Any] = MISSING - train_loader: Dict[str, Any] = MISSING + model: dict[str, Any] = MISSING + tokenizer: dict[str, Any] = MISSING + optimizer: dict[str, Any] = MISSING + scheduler: dict[str, Any] = MISSING + train_loader: dict[str, Any] = MISSING device_train_batch_size: Union[int, float] = MISSING device_eval_batch_size: Union[int, float] = MISSING max_duration: Union[int, str] = MISSING @@ -113,7 +110,7 @@ class TrainConfig: precision: str = 'amp_bf16' # Code paths to import - code_paths: Optional[List[str]] = None + code_paths: Optional[list[str]] = None # Cuda allocation configuration max_split_size_mb: Optional[int] = None @@ -122,22 +119,22 @@ class TrainConfig: # 
Distributed training parameters dist_timeout: Union[int, float] = 600.0 - fsdp_config: Optional[Dict[str, Any]] = None + fsdp_config: Optional[dict[str, Any]] = None # Evaluation parameters eval_interval: Union[int, str] = 1 - eval_loader: Optional[Dict[str, Any]] = None - eval_loaders: Optional[List[Dict[str, Any]] + eval_loader: Optional[dict[str, Any]] = None + eval_loaders: Optional[list[dict[str, Any]] ] = None # should not be set by the user - icl_tasks: Optional[List[Dict[str, Any]]] = None + icl_tasks: Optional[list[dict[str, Any]]] = None icl_tasks_str: Optional[str] = None # should not be set by the user - eval_gauntlet: Optional[Dict[str, Any]] = None + eval_gauntlet: Optional[dict[str, Any]] = None eval_gauntlet_str: Optional[str] = None # should not be set by the user icl_subset_num_batches: Optional[int] = None icl_seq_len: Optional[int] = None # Logging - loggers: Optional[Dict[str, Any]] = None + loggers: Optional[dict[str, Any]] = None progress_bar: bool = False log_to_console: bool = True python_log_level: Optional[str] = 'debug' @@ -145,8 +142,8 @@ class TrainConfig: log_config: bool = True # Callbacks - callbacks: Optional[Dict[str, Any]] = None - algorithms: Optional[Dict[str, Any]] = None + callbacks: Optional[dict[str, Any]] = None + algorithms: Optional[dict[str, Any]] = None # Checkpoints save_folder: Optional[str] = None @@ -159,8 +156,8 @@ class TrainConfig: load_path: Optional[str] = None load_weights_only: bool = False load_strict_model_weights: bool = True - load_ignore_keys: Optional[List[str]] = None - save_ignore_keys: Optional[List[str]] = None + load_ignore_keys: Optional[list[str]] = None + save_ignore_keys: Optional[list[str]] = None only_hf_checkpoint: bool = False only_composer_checkpoint: bool = False @@ -172,10 +169,10 @@ class TrainConfig: # Eval dataloader eval_subset_num_batches: int = -1 eval_first: bool = False - compile_config: Optional[Dict[str, Any]] = None + compile_config: Optional[dict[str, Any]] = None # Metadata - metadata: Optional[Dict[str, Any]] = None + metadata: Optional[dict[str, Any]] = None flatten_metadata: bool = True run_name: Optional[str] = None @@ -183,10 +180,10 @@ class TrainConfig: autoresume: bool = False # Profiling - profiler: Optional[Dict[str, Any]] = None + profiler: Optional[dict[str, Any]] = None # Variables to ignore - variables: Optional[Dict[str, Any]] = None + variables: Optional[dict[str, Any]] = None # Fields created by `update_batch_size_info` n_gpus: int = MISSING @@ -196,14 +193,14 @@ class TrainConfig: TRAIN_CONFIG_KEYS = {field.name for field in fields(TrainConfig)} -def forbid_config_key(cfg_dict: Dict[str, Any], key: str): +def forbid_config_key(cfg_dict: dict[str, Any], key: str): if key in cfg_dict: raise ValueError( f'Config key `{key}` should not be set. 
Please remove it from the config.', ) -def to_dict_container(cfg: Union[DictConfig, Dict[str, Any]]) -> Dict[str, Any]: +def to_dict_container(cfg: Union[DictConfig, dict[str, Any]]) -> dict[str, Any]: maybe_dict = to_container(cfg) if isinstance(maybe_dict, dict): return maybe_dict @@ -212,8 +209,8 @@ def to_dict_container(cfg: Union[DictConfig, Dict[str, Any]]) -> Dict[str, Any]: def to_list_container( - cfg: Union[ListConfig, List[Dict[str, Any]]], -) -> List[Dict[str, Any]]: + cfg: Union[ListConfig, list[dict[str, Any]]], +) -> list[dict[str, Any]]: maybe_list = to_container(cfg) if isinstance(maybe_list, list): return maybe_list @@ -222,9 +219,9 @@ def to_list_container( def to_container( - cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], - List[Dict[str, Any]]]], -) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + cfg: Optional[Union[DictConfig, ListConfig, dict[str, Any], + list[dict[str, Any]]]], +) -> Union[dict[str, Any], list[dict[str, Any]]]: """Converts a DictConfig or ListConfig to a dict or list. `omegaconf.to_container` does not handle nested DictConfig or ListConfig @@ -246,10 +243,10 @@ def to_container( def apply_transforms_to_config( - cfg: Dict[str, Any], - transforms: Optional[Union[List[Callable[[Dict[str, Any]], Dict[str, Any]]], - List[str], str]], -) -> Dict[str, Any]: + cfg: dict[str, Any], + transforms: Optional[Union[list[Callable[[dict[str, Any]], dict[str, Any]]], + list[str], str]], +) -> dict[str, Any]: """Applies a list of transforms to a config. Args: @@ -296,11 +293,11 @@ def apply_transforms_to_config( def make_dataclass_and_log_config( cfg: DictConfig, dataclass_constructor: Callable[..., T], - dataclass_fields: Set[str], - transforms: Optional[Union[List[Callable[[Dict[str, Any]], Dict[str, Any]]], - List[str], str]] = None, + dataclass_fields: set[str], + transforms: Optional[Union[list[Callable[[dict[str, Any]], dict[str, Any]]], + list[str], str]] = None, icl_tasks_required: bool = False, -) -> Tuple[Dict[str, Any], T]: +) -> tuple[dict[str, Any], T]: """Converts a DictConfig to a dataclass and creates a logged config.""" unstructured_config = om.to_container(cfg, resolve=True) assert isinstance(unstructured_config, dict) @@ -339,7 +336,7 @@ def make_dataclass_and_log_config( ) # Create copy of config for logging - logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) + logged_cfg: dict[str, Any] = copy.deepcopy(unstructured_config) arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, dataclass_fields) @@ -370,7 +367,7 @@ def make_dataclass_and_log_config( def pop_config( - cfg: Union[Dict[str, Any], DictConfig], + cfg: Union[dict[str, Any], DictConfig], key: str, must_exist: bool = True, default_value: Any = None, @@ -420,7 +417,7 @@ def calculate_batch_size_info( global_batch_size: int, device_microbatch_size: Union[int, float, Literal['auto']], data_replication_degree: int = 1, -) -> Tuple[Union[int, float], Union[int, float, Literal['auto']], Union[ +) -> tuple[Union[int, float], Union[int, float, Literal['auto']], Union[ int, Literal['auto']]]: world_size = dist.get_world_size() @@ -457,11 +454,11 @@ def calculate_batch_size_info( def update_config_with_batch_size_info( - cfg: Dict[str, Any], + cfg: dict[str, Any], device_train_batch_size: Union[int, float], device_train_microbatch_size: Union[int, float, Literal['auto']], device_train_grad_accum: Union[int, Literal['auto']], -) -> Dict[str, Any]: +) -> dict[str, Any]: """Update the config with batch size information. 
Args: @@ -487,7 +484,7 @@ def update_config_with_batch_size_info( return cfg -def update_batch_size_info(cfg: Dict[str, Any]) -> Dict[str, Any]: +def update_batch_size_info(cfg: dict[str, Any]) -> dict[str, Any]: data_replication_degree = 1 device_train_batch_size, device_train_microbatch_size, device_train_grad_accum = calculate_batch_size_info( cfg['global_train_batch_size'], @@ -503,7 +500,7 @@ def update_batch_size_info(cfg: Dict[str, Any]) -> Dict[str, Any]: return cfg -def process_init_device(model_cfg: Dict[str, Any], fsdp_config: Optional[Dict]): +def process_init_device(model_cfg: dict[str, Any], fsdp_config: Optional[dict]): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors # when multiple GPUs are available. @@ -575,24 +572,17 @@ def process_init_device(model_cfg: Dict[str, Any], fsdp_config: Optional[Dict]): return init_context -def log_config(cfg: Dict[str, Any]) -> None: +def log_config(logger: Logger, cfg: dict[str, Any]) -> None: """Logs the current config and updates the wandb and mlflow configs. This function can be called multiple times to update the wandb and MLflow config with different variables. """ print(om.to_yaml(cfg)) - loggers = cfg.get('loggers', None) or {} - if 'wandb' in loggers: - import wandb - if wandb.run: - wandb.config.update(cfg) - - if 'mlflow' in loggers and mlflow.active_run(): - mlflow.log_params(params=cfg) + logger.log_hyperparameters(cfg) -def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: +def _parse_source_dataset(cfg: dict[str, Any]) -> list[tuple[str, str, str]]: """Parse a run config for dataset information. Given a config dictionary, parse through it to determine what the datasource @@ -608,7 +598,7 @@ def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: data_paths = [] # Handle train loader if it exists - train_dataset: Dict = cfg.get('train_loader', {}).get('dataset', {}) + train_dataset: dict = cfg.get('train_loader', {}).get('dataset', {}) train_split = train_dataset.get('split', None) train_source_path = cfg.get('source_dataset_train', None) _process_data_source( @@ -628,7 +618,7 @@ def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: for eval_data_loader in eval_data_loaders: assert isinstance(eval_data_loader, dict) # pyright type check - eval_dataset: Dict = eval_data_loader.get('dataset', {}) + eval_dataset: dict = eval_data_loader.get('dataset', {}) eval_split = eval_dataset.get('split', None) eval_source_path = cfg.get('source_dataset_eval', None) _process_data_source( @@ -644,10 +634,10 @@ def _parse_source_dataset(cfg: Dict[str, Any]) -> List[Tuple[str, str, str]]: def _process_data_source( source_dataset_path: Optional[str], - dataset: Dict[str, str], + dataset: dict[str, str], cfg_split: Optional[str], true_split: str, - data_paths: List[Tuple[str, str, str]], + data_paths: list[tuple[str, str, str]], ): """Add a data source by mutating data_paths. @@ -720,7 +710,7 @@ def _process_data_source( log.warning('DataSource Not Found.') -def log_dataset_uri(cfg: Dict[str, Any]) -> None: +def log_dataset_uri(cfg: dict[str, Any]) -> None: """Logs dataset tracking information to MLflow. 
Args: @@ -817,7 +807,7 @@ def _verify_uc_path(path: str) -> bool: def set_config_overrides( config: PretrainedConfig, - config_overrides: Dict[str, Any], + config_overrides: dict[str, Any], ): # set config overrides for k, v in config_overrides.items(): diff --git a/llmfoundry/utils/data_prep_utils.py b/llmfoundry/utils/data_prep_utils.py index 1662ab74c2..df67f3223a 100644 --- a/llmfoundry/utils/data_prep_utils.py +++ b/llmfoundry/utils/data_prep_utils.py @@ -4,7 +4,7 @@ import json import os from glob import glob -from typing import List, Optional +from typing import Optional from composer.utils import ObjectStore from composer.utils.object_store import ObjectStoreTransientError @@ -104,7 +104,7 @@ class DownloadingIterable: def __init__( self, - object_names: List[str], + object_names: list[str], output_folder: str, object_store: Optional[ObjectStore], ): diff --git a/llmfoundry/utils/exceptions.py b/llmfoundry/utils/exceptions.py index c6a667697d..206095f28b 100644 --- a/llmfoundry/utils/exceptions.py +++ b/llmfoundry/utils/exceptions.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 """Custom exceptions for the LLMFoundry.""" -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Literal, Optional, Union __all__ = [ 'ALLOWED_RESPONSE_KEYS', @@ -212,7 +212,7 @@ class ChatTemplateError(UserError): def __init__( self, template: str, - sample: List[Dict[str, Any]], + sample: list[dict[str, Any]], inner_message: str, ) -> None: message = f'Failed to process sample {sample} with template {template}. {inner_message}' @@ -239,7 +239,7 @@ def __init__(self, last_role: str, expected_roles: set[str]) -> None: class IncorrectMessageKeyQuantityError(UserError): """Error thrown when a message has an incorrect number of keys.""" - def __init__(self, keys: List[str]) -> None: + def __init__(self, keys: list[str]) -> None: message = f'Expected 2 keys in message, but found {len(keys)}' super().__init__(message, keys=keys) @@ -279,7 +279,7 @@ def __init__(self, response_type: type) -> None: class InvalidPromptResponseKeysError(UserError): """Error thrown when missing expected prompt and response keys.""" - def __init__(self, mapping: Dict[str, str], example: Dict[str, Any]): + def __init__(self, mapping: dict[str, str], example: dict[str, Any]): message = f'Expected {mapping=} to have keys "prompt" and "response".' super().__init__(message, mapping=mapping, example=example) @@ -287,7 +287,7 @@ def __init__(self, mapping: Dict[str, str], example: Dict[str, Any]): class InvalidFileExtensionError(UserError): """Error thrown when a file extension is not a safe extension.""" - def __init__(self, dataset_name: str, valid_extensions: List[str]) -> None: + def __init__(self, dataset_name: str, valid_extensions: list[str]) -> None: message = ( f'safe_load is set to True. No data files with safe extensions {valid_extensions} ' + f'found for dataset at local path {dataset_name}.' 
@@ -304,7 +304,7 @@ class UnableToProcessPromptResponseError( ): """Error thrown when a prompt and response cannot be processed.""" - def __init__(self, input: Dict) -> None: + def __init__(self, input: dict) -> None: message = f'Unable to extract prompt/response from {input}' super().__init__(message, input=input) @@ -348,6 +348,14 @@ def __init__(self, input_folder: str) -> None: super().__init__(message, input_folder=input_folder) +class CannotUnicodeDecodeFile(UserError): + """Error thrown when a text file cannot be decoded as UTF-8.""" + + def __init__(self, text_file: str) -> None: + message = f'Text file {text_file} contains chars that cannot be utf-8 decoded. Please remove or replace these chars.' + super().__init__(message, text_file=text_file) + + class OutputFolderNotEmptyError(UserError): """Error thrown when the output folder is not empty.""" diff --git a/llmfoundry/utils/mosaicml_logger_utils.py b/llmfoundry/utils/mosaicml_logger_utils.py index b01170ff0f..b2372b611b 100644 --- a/llmfoundry/utils/mosaicml_logger_utils.py +++ b/llmfoundry/utils/mosaicml_logger_utils.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import json import os -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union from composer.loggers import MosaicMLLogger from composer.loggers.logger_destination import LoggerDestination @@ -38,7 +38,7 @@ def maybe_create_mosaicml_logger() -> Optional[MosaicMLLogger]: def find_mosaicml_logger( - loggers: List[LoggerDestination], + loggers: list[LoggerDestination], ) -> Optional[MosaicMLLogger]: """Returns the first MosaicMLLogger from a list, and None otherwise.""" return next( @@ -49,12 +49,12 @@ def find_mosaicml_logger( def log_eval_analytics( mosaicml_logger: MosaicMLLogger, - model_configs: List[Dict[str, Any]], - icl_tasks: Union[str, List[Dict[str, Any]]], - eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], + model_configs: list[dict[str, Any]], + icl_tasks: Union[str, list[dict[str, Any]]], + eval_gauntlet_config: Optional[Union[str, dict[str, Any]]], ): """Logs analytics for runs using the `eval.py` script.""" - metrics: Dict[str, Any] = { + metrics: dict[str, Any] = { 'llmfoundry/script': 'eval', } @@ -83,18 +83,18 @@ def log_eval_analytics( def log_train_analytics( mosaicml_logger: MosaicMLLogger, - model_config: Dict[str, Any], - train_loader_config: Dict[str, Any], - eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], - callback_configs: Optional[Dict[str, Any]], + model_config: dict[str, Any], + train_loader_config: dict[str, Any], + eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]], + callback_configs: Optional[dict[str, Any]], tokenizer_name: str, load_path: Optional[str], - icl_tasks_config: Optional[Union[List[Dict[str, Any]], str]], - eval_gauntlet: Optional[Union[Dict[str, Any], str]], + icl_tasks_config: Optional[Union[list[dict[str, Any]], str]], + eval_gauntlet: Optional[Union[dict[str, Any], str]], ): """Logs analytics for runs using the `train.py` script.""" train_loader_dataset = train_loader_config.get('dataset', {}) - metrics: Dict[str, Any] = { + metrics: dict[str, Any] = { 'llmfoundry/tokenizer_name': tokenizer_name, 'llmfoundry/script': 'train', 'llmfoundry/train_loader_name': train_loader_config.get('name'), diff --git a/llmfoundry/utils/prompt_files.py b/llmfoundry/utils/prompt_files.py index 64e5de70a1..6e9c6ae2e9 100644 --- a/llmfoundry/utils/prompt_files.py +++ b/llmfoundry/utils/prompt_files.py @@ -2,7 +2,7 @@ #
SPDX-License-Identifier: Apache-2.0 import os -from typing import List, Optional +from typing import Optional PROMPTFILE_PREFIX = 'file::' @@ -12,8 +12,8 @@ ] -def load_prompts(prompts: List[str], - prompt_delimiter: Optional[str] = None) -> List[str]: +def load_prompts(prompts: list[str], + prompt_delimiter: Optional[str] = None) -> list[str]: """Loads a set of prompts, both free text and from file. Args: @@ -37,7 +37,7 @@ def load_prompts(prompts: List[str], def load_prompts_from_file( prompt_path: str, prompt_delimiter: Optional[str] = None, -) -> List[str]: +) -> list[str]: """Load a set of prompts from a text fie. Args: diff --git a/llmfoundry/utils/registry_utils.py b/llmfoundry/utils/registry_utils.py index f96e72b3a2..74ba0996ef 100644 --- a/llmfoundry/utils/registry_utils.py +++ b/llmfoundry/utils/registry_utils.py @@ -11,11 +11,9 @@ from typing import ( Any, Callable, - Dict, Generic, Optional, Sequence, - Type, TypeVar, Union, ) @@ -31,7 +29,7 @@ ] T = TypeVar('T') -TypeBoundT = TypeVar('TypeBoundT', bound=Type) +TypeBoundT = TypeVar('TypeBoundT', bound=type) CallableBoundT = TypeVar('CallableBoundT', bound=Callable[..., Any]) @@ -68,13 +66,13 @@ def register_class( def get(self, name: str) -> T: return super().get(name) - def get_all(self) -> Dict[str, T]: + def get_all(self) -> dict[str, T]: return super().get_all() def get_entry_point(self, name: str, default: Optional[T] = None) -> T: return super().get_entry_point(name, default=default) - def get_entry_points(self) -> Dict[str, T]: + def get_entry_points(self) -> dict[str, T]: return super().get_entry_points() @@ -83,7 +81,7 @@ def get_entry_points(self) -> Dict[str, T]: def create_registry( *namespace: str, - generic_type: Type[S], + generic_type: type[S], entry_points: bool = False, description: str = '', ) -> 'TypedRegistry[S]': @@ -115,7 +113,7 @@ def construct_from_registry( pre_validation_function: Optional[Union[Callable[[Any], None], type]] = None, post_validation_function: Optional[Callable[[Any], None]] = None, - kwargs: Optional[Dict[str, Any]] = None, + kwargs: Optional[dict[str, Any]] = None, ) -> Any: """Helper function to build an item from the registry. 
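# Not part of the patch: a minimal, self-contained sketch of the annotation style
# the registry changes above migrate to (PEP 585 builtin generics, usable as
# runtime annotations from Python 3.9, enforced via ruff's UP006 rule configured
# later in this diff). The `lookup` helper and its parameters are illustrative
# only, not llm-foundry APIs.
from typing import Any, Optional


def lookup(
    name: str,
    registry: dict[str, type[Any]],            # previously Dict[str, Type[Any]]
    kwargs: Optional[dict[str, Any]] = None,   # previously Optional[Dict[str, Any]]
) -> tuple[type[Any], dict[str, Any]]:         # previously Tuple[Type[Any], Dict[str, Any]]
    """Return the class registered under ``name`` and the kwargs to build it with."""
    return registry[name], dict(kwargs or {})


# Example usage: cls, init_kwargs = lookup('my_callback', {'my_callback': object})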
diff --git a/llmfoundry/utils/warnings.py b/llmfoundry/utils/warnings.py index 83b2d1a32a..6da0d5e605 100644 --- a/llmfoundry/utils/warnings.py +++ b/llmfoundry/utils/warnings.py @@ -79,10 +79,12 @@ def wrapper(*args: Any, **kwargs: Any): return decorator -def experimental_class(feature_name: str) -> Callable[[Type], Type]: +def experimental_class( + feature_name: str, +) -> Callable[[Type], Type]: # noqa: UP006 """Class decorator to mark a class as experimental.""" - def class_decorator(cls: Type): + def class_decorator(cls: Type): # noqa: UP006 original_init = cls.__init__ def new_init(self: Any, *args: Any, **kwargs: Any): diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml index 4bfa301f8e..4fcf8b3cb9 100644 --- a/mcli/mcli-1b-eval.yaml +++ b/mcli/mcli-1b-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml index 2dc83d36a9..fb96c576e0 100644 --- a/mcli/mcli-1b-max-seq-len-8k.yaml +++ b/mcli/mcli-1b-max-seq-len-8k.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml index 69b2295011..26255977f4 100644 --- a/mcli/mcli-1b.yaml +++ b/mcli/mcli-1b.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml index 7a3ea2cbe9..3995598fd3 100644 --- a/mcli/mcli-benchmark-mpt.yaml +++ b/mcli/mcli-benchmark-mpt.yaml @@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.3.1_cu121-latest integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu] diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml index fefaf8e1a3..7b715f6792 100644 --- a/mcli/mcli-convert-composer-to-hf.yaml +++ b/mcli/mcli-convert-composer-to-hf.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: . 
ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml index e58d42483a..27f5938d67 100644 --- a/mcli/mcli-hf-eval.yaml +++ b/mcli/mcli-hf-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml index 02c49d84c3..cb3040e4ee 100644 --- a/mcli/mcli-hf-generate.yaml +++ b/mcli/mcli-hf-generate.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml index 47c163faf8..7134e6204c 100644 --- a/mcli/mcli-llama2-finetune.yaml +++ b/mcli/mcli-llama2-finetune.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml index c372014165..cd04d89f4e 100644 --- a/mcli/mcli-openai-eval.yaml +++ b/mcli/mcli-openai-eval.yaml @@ -1,7 +1,7 @@ integrations: - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: .[gpu,openai] ssh_clone: false # Should be true if using a private repo diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml index a4496503cd..5425ce9897 100644 --- a/mcli/mcli-pretokenize-oci-upload.yaml +++ b/mcli/mcli-pretokenize-oci-upload.yaml @@ -14,7 +14,7 @@ integrations: - oci-cli==3.23.2 - integration_type: git_repo git_repo: mosaicml/llm-foundry - git_branch: v0.10.0 + git_branch: v0.11.0 # git_commit: # OR use your commit hash pip_install: . 
ssh_clone: false # Should be true if using a private repo diff --git a/pyproject.toml b/pyproject.toml index fdbabfff96..2208fdac2e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ split_on_trailing_comma = true # Ruff global [tool.ruff] +target-version = "py39" exclude = [ "build/**", "docs/**", @@ -28,10 +29,12 @@ select = [ "PLE", "COM812", "D", # pydocstyle + "UP006" +] +extend-safe-fixes = [ + "UP006", ] - extend-select = ["D404"] # pydocstyle - ignore = [ "D100", "D101", diff --git a/scripts/inference/convert_composer_mpt_to_ft.py b/scripts/inference/convert_composer_mpt_to_ft.py index bea5b6715e..16cfabf125 100644 --- a/scripts/inference/convert_composer_mpt_to_ft.py +++ b/scripts/inference/convert_composer_mpt_to_ft.py @@ -8,7 +8,7 @@ import tempfile from argparse import ArgumentParser, Namespace from pathlib import Path -from typing import Any, Dict, Optional, Union +from typing import Any, Optional, Union import torch from composer.utils import get_file, safe_torch_load @@ -21,7 +21,7 @@ def save_ft_config( - composer_config: Dict[str, Any], + composer_config: dict[str, Any], tokenizer: PreTrainedTokenizer, save_dir: str, infer_gpu_num: int = 1, diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py index 4d4019208c..dc7314f3e9 100644 --- a/scripts/inference/convert_composer_to_hf.py +++ b/scripts/inference/convert_composer_to_hf.py @@ -5,7 +5,7 @@ import tempfile from argparse import ArgumentParser, Namespace from pathlib import Path -from typing import Optional, Tuple, Union +from typing import Optional, Union import torch import transformers @@ -31,7 +31,7 @@ def write_huggingface_pretrained_from_composer_checkpoint( trust_remote_code: bool, output_precision: str = 'fp32', local_checkpoint_save_location: Optional[Union[Path, str]] = None, -) -> Tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]: +) -> tuple[PretrainedConfig, Optional[PreTrainedTokenizerBase]]: """Convert a Composer checkpoint to a pretrained HF checkpoint folder. 
Write a ``config.json`` and ``pytorch_model.bin``, like diff --git a/scripts/inference/convert_hf_to_onnx.py b/scripts/inference/convert_hf_to_onnx.py index f230e56bad..0f62917ef8 100644 --- a/scripts/inference/convert_hf_to_onnx.py +++ b/scripts/inference/convert_hf_to_onnx.py @@ -30,7 +30,7 @@ import os from argparse import ArgumentTypeError from pathlib import Path -from typing import Any, Dict, Optional, Union +from typing import Any, Optional, Union import torch from composer.utils import ( @@ -85,7 +85,7 @@ def export_to_onnx( export_batch_size: int, max_seq_len: Optional[int], verify_export: bool, - from_pretrained_kwargs: Dict[str, Any], + from_pretrained_kwargs: dict[str, Any], ): reproducibility.seed_all(42) save_object_store = maybe_create_object_store_from_uri(output_folder) diff --git a/scripts/inference/hf_chat.py b/scripts/inference/hf_chat.py index 7fb3d2af46..89c73e5afc 100644 --- a/scripts/inference/hf_chat.py +++ b/scripts/inference/hf_chat.py @@ -5,7 +5,7 @@ import warnings from argparse import ArgumentParser, ArgumentTypeError, Namespace from contextlib import nullcontext -from typing import Any, Dict, List, Optional, Union +from typing import Any, Optional, Union import torch from transformers import ( @@ -35,7 +35,7 @@ def __init__(self, role: str, content: str) -> None: self.role = role self.content = content - def to_dict(self,) -> Dict[str, str]: + def to_dict(self,) -> dict[str, str]: return {'role': self.role, 'content': self.content} def __repr__(self) -> str: @@ -67,9 +67,9 @@ def __init__( self, model: PreTrainedModel, tokenizer: PreTrainedTokenizerBase, - generate_kwargs: Dict[str, Any], + generate_kwargs: dict[str, Any], system_prompt: str, - stop_tokens: Optional[List[str]] = None, + stop_tokens: Optional[list[str]] = None, ) -> None: if stop_tokens is None: stop_tokens = ['<|endoftext|>', '<|im_end|>'] @@ -121,7 +121,7 @@ def __call__( "- Type 'history_fmt' to see the conversation\n- Type 'quit' to end\n- Type 'system' to change the system prompt\n" ) - def _history_to_chat_conversation(self) -> List[Dict[str, str]]: + def _history_to_chat_conversation(self) -> list[dict[str, str]]: msg_history = [chat_msg.to_dict() for chat_msg in self.history] return msg_history diff --git a/scripts/inference/hf_generate.py b/scripts/inference/hf_generate.py index b2e758b4ce..7ac3cd5d72 100644 --- a/scripts/inference/hf_generate.py +++ b/scripts/inference/hf_generate.py @@ -6,7 +6,7 @@ import warnings from argparse import ArgumentParser, ArgumentTypeError, Namespace from contextlib import nullcontext -from typing import Dict, Union +from typing import Union import numpy as np import torch @@ -280,7 +280,7 @@ def main(args: Namespace) -> None: print(f'\nGenerate kwargs:\n{generate_kwargs}') # Generate function with correct context managers - def _generate(encoded_inp: Dict[str, torch.Tensor]): + def _generate(encoded_inp: dict[str, torch.Tensor]): with torch.no_grad(): with autocast_context: return model.generate( diff --git a/scripts/misc/convert_examples_ckpt.py b/scripts/misc/convert_examples_ckpt.py index 437bcb6fa7..9ed96984f6 100644 --- a/scripts/misc/convert_examples_ckpt.py +++ b/scripts/misc/convert_examples_ckpt.py @@ -7,7 +7,7 @@ from collections import OrderedDict from copy import deepcopy from pathlib import Path -from typing import Any, Dict, Optional, Union +from typing import Any, Optional, Union import torch from composer.utils import ( @@ -33,9 +33,9 @@ def convert_examples_ckpt_state_dict( - state_dict: Dict[str, Any], - conversion_dict: Dict[str, 
str], -) -> Dict[str, Any]: + state_dict: dict[str, Any], + conversion_dict: dict[str, str], +) -> dict[str, Any]: # map old keys to new keys key_mappings = OrderedDict() for k in state_dict.keys(): @@ -59,7 +59,7 @@ def convert_examples_ckpt_state_dict( def convert_examples_ckpt( checkpoint_path: Union[Path, str], output_path: Union[Path, str], - conversion_dict: Dict[str, str], + conversion_dict: dict[str, str], local_ckpt_path: Optional[Union[Path, str]] = None, ) -> None: """Convert a ckpt created in examples repo to an llmfoundry compat ckpt. diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index 6bd048fd97..c6dba3a49f 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -3,7 +3,6 @@ """Script to profile example packing.""" import os -from typing import Dict from llmfoundry.data.packing import profile_packing @@ -87,7 +86,7 @@ def parse_args() -> Namespace: raise ValueError('config must define tokenizer') resolved_tokenizer_cfg = om.to_container(cfg.tokenizer, resolve=True) - if not isinstance(resolved_tokenizer_cfg, Dict): + if not isinstance(resolved_tokenizer_cfg, dict): raise ValueError( 'tokenizer config needs to be resolved by omegaconf into a Dict.', ) diff --git a/scripts/misc/update_hub_code.py b/scripts/misc/update_hub_code.py index 20bb92fd04..af4ad374a8 100644 --- a/scripts/misc/update_hub_code.py +++ b/scripts/misc/update_hub_code.py @@ -5,7 +5,6 @@ import os import tempfile from datetime import datetime -from typing import List import torch import transformers @@ -28,7 +27,7 @@ ] -def main(hf_repos_for_upload: List[str]): +def main(hf_repos_for_upload: list[str]): if len(hf_repos_for_upload) == 1 and hf_repos_for_upload[0] == 'all': hf_repos_for_upload = _ALL_MODELS diff --git a/scripts/train/benchmarking/collect_results.py b/scripts/train/benchmarking/collect_results.py index 26788788a2..ef9d6ea534 100644 --- a/scripts/train/benchmarking/collect_results.py +++ b/scripts/train/benchmarking/collect_results.py @@ -4,7 +4,7 @@ import argparse import csv import math -from typing import Any, Dict, List, Union +from typing import Any, Union from composer.callbacks.speed_monitor import \ GPU_AVAILABLE_FLOPS as GPU_FLOP_DICT @@ -91,7 +91,7 @@ def sort_key(r: msdk.Run): return runs -def filter_runs(runs: List[msdk.Run]): +def filter_runs(runs: list[msdk.Run]): pop_runs = [] for run in runs: if run.status == msdk.RunStatus('FAILED'): @@ -114,7 +114,7 @@ def filter_runs(runs: List[msdk.Run]): return runs -def parse_run(run: msdk.Run) -> Dict[str, Any]: +def parse_run(run: msdk.Run) -> dict[str, Any]: n_params = micro_batchsize = throughput = -1 model_name = run.name.split('-')[2] diff --git a/scripts/train/benchmarking/submit_benchmarks.py b/scripts/train/benchmarking/submit_benchmarks.py index 7e5bae7afc..fd7be1fc6d 100644 --- a/scripts/train/benchmarking/submit_benchmarks.py +++ b/scripts/train/benchmarking/submit_benchmarks.py @@ -3,7 +3,7 @@ import argparse import math import os -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Optional, Union import requests import yaml @@ -243,7 +243,7 @@ def parse_args(): return parser.parse_args() -def get_max_seq_lens(pows: Optional[List[int]] = None): +def get_max_seq_lens(pows: Optional[list[int]] = None): if pows is None: pows = [9, 14] return [2**n for n in range(pows[0], pows[1] + 1)] @@ -251,8 +251,8 @@ def get_max_seq_lens(pows: Optional[List[int]] = None): def get_global_train_batch_sizes( max_seq_len: int, - pows: List[int], - 
batch_sizes: Optional[List[int]] = None, + pows: list[int], + batch_sizes: Optional[list[int]] = None, ): if batch_sizes is None: batch_sizes = [] @@ -284,7 +284,7 @@ def get_cluster_gpu_types(cluster: str): return [gpu_info[0] for gpu_info in CLUSTER_INFO[cluster]] -def get_gpu_types(clusters: List[str]): +def get_gpu_types(clusters: list[str]): gpu_types = set() for c in clusters: for g in get_cluster_gpu_types(c): @@ -292,7 +292,7 @@ def get_gpu_types(clusters: List[str]): return gpu_types -def get_gpu_nums(clusters: List[str], gpu_types: List[str]): +def get_gpu_nums(clusters: list[str], gpu_types: list[str]): max_gpus_per_run = 1 for c in clusters: for gpu_info in CLUSTER_INFO[c]: @@ -314,7 +314,7 @@ def get_valid_gpu_lim(cluster: str, gpu_type: str): def mod_parameters( - parameters: Dict[str, Any], + parameters: dict[str, Any], max_seq_len: int, global_train_batch_size: int, precision: str, @@ -449,7 +449,7 @@ def get_integrations( def run_config( - config: Tuple[str, int, int, str, str, int, str], + config: tuple[str, int, int, str, str, int, str], args: argparse.Namespace, ): model_yaml, max_seq_len, global_train_batch_size, cluster, gpu_type, gpu_num, precision = config @@ -625,7 +625,7 @@ def run_check_dtms(num_gpus: int, dtms: int, batch_size: int): ) if run: - config: Tuple[str, int, int, str, str, int, + config: tuple[str, int, int, str, str, int, str] = ( model_yaml, max_seq_len, diff --git a/scripts/train/finetune_example/preprocessing.py b/scripts/train/finetune_example/preprocessing.py index 5f0639d22b..ad0d1a6a56 100644 --- a/scripts/train/finetune_example/preprocessing.py +++ b/scripts/train/finetune_example/preprocessing.py @@ -31,15 +31,15 @@ } """ -from typing import Dict, List, Union +from typing import Union def multiple_choice( - inp: Dict[str, Union[str, List[str], int]], -) -> Dict[str, str]: + inp: dict[str, Union[str, list[str], int]], +) -> dict[str, str]: PROMPT_FORMAT = '{query}\nOptions:{options}\nAnswer: ' options = '' - assert isinstance(inp['choices'], List) + assert isinstance(inp['choices'], list) for option in inp['choices']: options += f'\n - {option}' query = inp['query'] diff --git a/setup.py b/setup.py index 04c28d8f70..229efe5b5b 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import copy import os -from typing import Any, Dict, Mapping +from typing import Any, Mapping import setuptools from setuptools import setup @@ -19,7 +19,7 @@ # We can't use `.__version__` from the library since it's not installed yet version_path = os.path.join(_PACKAGE_REAL_PATH, '_version.py') with open(version_path, encoding='utf-8') as f: - version_globals: Dict[str, Any] = {} + version_globals: dict[str, Any] = {} version_locals: Mapping[str, object] = {} content = f.read() exec(content, version_globals, version_locals) @@ -66,26 +66,27 @@ 'slack-sdk<4', 'mosaicml-cli>=0.6.10,<1', 'onnx==1.16.2', - 'onnxruntime==1.18.1', + 'onnxruntime==1.19.0', 'boto3>=1.21.45,<2', 'huggingface-hub>=0.19.0,<0.25', 'beautifulsoup4>=4.12.2,<5', # required for model download utils - 'tenacity>=8.2.3,<9', + 'tenacity>=8.2.3,<10', 'catalogue>=2,<3', 'typer<1', + 'GitPython==3.1.43', ] extra_deps = {} extra_deps['dev'] = [ - 'coverage[toml]==7.4.4', + 'coverage[toml]==7.6.1', 'pre-commit>=3.4.0,<4', - 'pytest>=7.2.1,<8', - 'pytest_codeblocks>=0.16.1,<0.17', + 'pytest>=7.2.1,<9', + 'pytest_codeblocks>=0.16.1,<0.18', 'pytest-cov>=4,<6', 'pyright==1.1.256', 'toml>=0.10.2,<0.11', - 'packaging>=21,<23', + 'packaging>=21,<25', 'hf_transfer==0.1.3', ] diff --git 
a/tests/a_scripts/data_prep/test_convert_text_to_mds.py b/tests/a_scripts/data_prep/test_convert_text_to_mds.py index 6ba14d62e4..d604565e59 100644 --- a/tests/a_scripts/data_prep/test_convert_text_to_mds.py +++ b/tests/a_scripts/data_prep/test_convert_text_to_mds.py @@ -6,7 +6,7 @@ import shutil from concurrent.futures import ProcessPoolExecutor from glob import glob -from typing import Callable, Iterable, List +from typing import Callable, Iterable from unittest.mock import Mock, patch import pytest @@ -22,6 +22,7 @@ write_done_file, ) from llmfoundry.utils.exceptions import ( + CannotUnicodeDecodeFile, DatasetTooSmallError, InputFolderMissingDataError, OutputFolderNotEmptyError, @@ -59,7 +60,7 @@ def download_object( ) as remote_file, open(filename, 'wb') as local_file: local_file.write(remote_file.read()) - def list_objects(self, prefix: str) -> List[str]: + def list_objects(self, prefix: str) -> list[str]: return glob(os.path.join(self.remote_folder, '*.txt')) def upload_object(self, object_name: str, filename: str): @@ -76,7 +77,7 @@ def _mock_map(func: Callable, args: Iterable) -> Iterable: yield func(arg) -def _assert_files_exist(prefix: str, files: List[str]): +def _assert_files_exist(prefix: str, files: list[str]): for file in files: assert os.path.exists(os.path.join(prefix, file)) @@ -290,6 +291,28 @@ def test_dataset_too_small(tmp_path: pathlib.Path): ) +def test_decode_invalid_unicode(tmp_path: pathlib.Path): + input_folder = tmp_path / 'input' + os.makedirs(input_folder, exist_ok=True) + with open(input_folder / 'test.txt', 'w', encoding='utf-16') as f: + f.write('HELLO WORLD') + with pytest.raises(CannotUnicodeDecodeFile): + convert_text_to_mds( + tokenizer_name='mosaicml/mpt-7b', + output_folder=str(tmp_path / 'output'), + input_folder=str(input_folder), + concat_tokens=1, + eos_text='', + bos_text='', + no_wrap=False, + compression='zstd', + processes=1, + args_str='Namespace()', + reprocess=False, + trust_remote_code=False, + ) + + def test_is_already_processed(tmp_path: pathlib.Path): tmp_path_str = str(tmp_path) args_str = 'Namespace(x = 5)' diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index cd47b2df7c..28046fdce5 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -8,7 +8,7 @@ import pathlib import shutil from argparse import Namespace -from typing import Any, Callable, Dict, Optional, cast +from typing import Any, Callable, Optional, cast from unittest.mock import ANY, MagicMock, patch import catalogue @@ -314,7 +314,7 @@ class MockSpawnProcess: multiprocessing, so we need to patch SpawnProcess for tests. 
""" - def __init__(self, target: Callable, kwargs: Dict[str, Any]): + def __init__(self, target: Callable, kwargs: dict[str, Any]): self.target = target self.kwargs = kwargs diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index 9c80127af5..c754066937 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict, List, Optional +from typing import Optional import omegaconf as om import pytest @@ -29,7 +29,7 @@ def set_correct_cwd(): class MockState(State): - def __init__(self, logger_keys: List[str], accuracy: float = 0.25) -> None: + def __init__(self, logger_keys: list[str], accuracy: float = 0.25) -> None: self.eval_metrics = {} self.timestamp = 0 for key in logger_keys: @@ -49,7 +49,7 @@ def __init__(self, state: MockState): self.inmemorylogger = InMemoryLogger() self.inmemorylogger.state = state - def log_metrics(self, metrics: Dict[str, float]) -> None: + def log_metrics(self, metrics: dict[str, float]) -> None: self.inmemorylogger.log_metrics(metrics) @@ -74,7 +74,7 @@ def test_gauntlet_callback(averages: Optional[dict]): icl_task_type: language_modeling """, ) - icl_task_config_list: List[om.DictConfig + icl_task_config_list: list[om.DictConfig ] = list(icl_task_config) # type: ignore assert all(isinstance(c, om.DictConfig) for c in icl_task_config_list) diff --git a/tests/conftest.py b/tests/conftest.py index b099a88cd1..5c5802efd2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import List, Optional +from typing import Optional import pytest from composer.utils import reproducibility @@ -28,7 +28,7 @@ def _add_option( parser: pytest.Parser, name: str, help: str, - choices: Optional[List[str]] = None, + choices: Optional[list[str]] = None, ): parser.addoption( f'--{name}', @@ -63,7 +63,7 @@ def _get_world_size(item: pytest.Item): def pytest_collection_modifyitems( config: pytest.Config, - items: List[pytest.Item], + items: list[pytest.Item], ) -> None: """Filter tests by world_size (for multi-GPU tests)""" world_size = int(os.environ.get('WORLD_SIZE', '1')) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 1a43e12536..d215d93542 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -7,7 +7,7 @@ import shutil from contextlib import nullcontext as does_not_raise from pathlib import Path -from typing import Any, Callable, ContextManager, Dict, Literal, Optional, Union +from typing import Any, Callable, ContextManager, Literal, Optional, Union from unittest.mock import MagicMock, patch import catalogue @@ -1231,7 +1231,7 @@ def build_from_hf( target_prompts: str = 'last', target_responses: str = 'none', decoder_only_format: bool = True, - hf_kwargs: Optional[Dict[str, Any]] = None, + hf_kwargs: Optional[dict[str, Any]] = None, ): return [] diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index d181dbde0b..0fad6c0d53 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 from pathlib import Path -from typing import Any, Dict, List +from typing import Any from unittest.mock import Mock, patch import pytest @@ -19,8 +19,8 @@ from llmfoundry.utils.builders import build_tokenizer -def _data_to_batch(data: List[List[int]], max_seq_len: int, - pad_token_id: int) -> 
Dict[str, torch.Tensor]: +def _data_to_batch(data: list[list[int]], max_seq_len: int, + pad_token_id: int) -> dict[str, torch.Tensor]: """Helper function to create a proper batch of data.""" input_ids = torch.stack([ torch.tensor(d + [pad_token_id] * (max_seq_len - len(d))) for d in data diff --git a/tests/data_utils.py b/tests/data_utils.py index ea64943735..117310b0cf 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -5,7 +5,7 @@ import os import shutil from pathlib import Path -from typing import Dict, List, Optional +from typing import Optional from omegaconf import DictConfig from omegaconf import OmegaConf as om @@ -199,7 +199,7 @@ def make_tiny_conversation_ft_dataset( }], }) - def messages_to_conversation(sample: Dict): + def messages_to_conversation(sample: dict): assert 'messages' in sample messages = sample['messages'] @@ -207,7 +207,7 @@ def messages_to_conversation(sample: Dict): 'user': 'human', 'assistant': 'gpt', } - conversations: List[Dict[str, str]] = [] + conversations: list[dict[str, str]] = [] for message in messages: role: str = role_map.get(message['role'], message['role']) content: str = message['content'] diff --git a/tests/eval/test_in_context_learning_datasets.py b/tests/eval/test_in_context_learning_datasets.py index 81769a18e6..5fe9643cde 100644 --- a/tests/eval/test_in_context_learning_datasets.py +++ b/tests/eval/test_in_context_learning_datasets.py @@ -5,7 +5,7 @@ import os import random from pathlib import Path -from typing import Dict, List, Optional +from typing import Optional import pytest import torch @@ -2447,8 +2447,8 @@ def test_hf_dataloading_lm_dataloader( tmp_path: Path, num_fewshot: int, prompt_string: str, - hf_loading_vars: Dict[str, str], - hf_parsing_map: Optional[Dict[str, List[str]]], + hf_loading_vars: dict[str, str], + hf_parsing_map: Optional[dict[str, list[str]]], ): tokenizer = tiny_gpt2_tokenizer @@ -2526,8 +2526,8 @@ def test_hf_dataloading_custom_parsing( tmp_path: Path, num_fewshot: int, prompt_string: str, - hf_loading_vars: Dict[str, str], - hf_parsing_map: Dict[str, List[str]], + hf_loading_vars: dict[str, str], + hf_parsing_map: dict[str, list[str]], ): tokenizer = tiny_gpt2_tokenizer diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py index 83b0924a5d..2908a8a270 100644 --- a/tests/fixtures/models.py +++ b/tests/fixtures/models.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import copy -from typing import Any, Callable, Dict +from typing import Any, Callable import pytest from pytest import fixture @@ -13,7 +13,7 @@ from llmfoundry.utils.builders import build_composer_model, build_tokenizer -def _build_model(config: Dict[str, Any], tokenizer: PreTrainedTokenizerBase): +def _build_model(config: dict[str, Any], tokenizer: PreTrainedTokenizerBase): name = config.pop('name') model = build_composer_model( name=name, diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index 844ccd7fe5..4cdabf1a13 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -3,19 +3,28 @@ import os from copy import deepcopy -from typing import Any, Dict, Mapping +from pathlib import Path +from typing import Any, Mapping from unittest.mock import Mock, patch import pytest import torch from omegaconf import OmegaConf as om -from transformers import PretrainedConfig +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + PretrainedConfig, + PreTrainedModel, +) from llmfoundry.models.hf.hf_fsdp import rgetattr from llmfoundry.models.mpt import 
MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_dict_container +from llmfoundry.utils.config_utils import ( + set_config_overrides, + to_dict_container, +) def test_remote_code_false_mpt( @@ -36,7 +45,7 @@ def test_remote_code_false_mpt( test_cfg.device = device test_cfg.precision = 'fp16' - tokenizer_cfg: Dict[str, Any] = om.to_container( + tokenizer_cfg: dict[str, Any] = om.to_container( test_cfg.tokenizer, resolve=True, ) # type: ignore @@ -127,13 +136,13 @@ def test_tie_weights(tie_word_embeddings: bool): new=Mock(return_value=True), ) def test_hf_config_override( - model_cfg_overrides: Dict[str, Any], + model_cfg_overrides: dict[str, Any], conf_path: str = 'scripts/train/yamls/pretrain/testing.yaml', ): with open(conf_path) as f: test_cfg = om.load(f) - tokenizer_cfg: Dict[str, Any] = om.to_container( + tokenizer_cfg: dict[str, Any] = om.to_container( test_cfg.tokenizer, resolve=True, ) # type: ignore @@ -279,3 +288,57 @@ def test_use_flash(): # Make sure that HF has not cast the parameters to bf16 assert next(model.parameters()).dtype == torch.float32 + + +def test_generation_config(tmp_path: Path): + # Create a small llama model to edit and save. + config = AutoConfig.from_pretrained('codellama/CodeLlama-7b-hf') + set_config_overrides( + config, + config_overrides={ + 'num_hidden_layers': 2, + 'hidden_size': 32, + 'intermediate_size': 64, + }, + ) + model = AutoModelForCausalLM.from_config(config) + + assert isinstance(model, PreTrainedModel) + assert model.generation_config is not None + + new_bos_token_id = 100 + + # Set the bos_token_id to something else + model.generation_config.bos_token_id = new_bos_token_id + + # Generation config and model config no longer match + assert model.generation_config.bos_token_id != model.config.bos_token_id + + save_dir = tmp_path / 'model' + + # Save the model. + model.save_pretrained(save_dir) + + # Now load the model from the save directory and check that the bos_token_id is the same as what we set. + model_cfg = { + 'name': 'hf_causal_lm', + 'pretrained_model_name_or_path': str(save_dir), + 'use_auth_token': True, + 'pretrained': False, + 'init_device': 'cpu', + } + + name = model_cfg.pop('name') + model = build_composer_model( + name=name, + cfg=model_cfg, + tokenizer=None, # type: ignore + ) + + inner_model = model.model + + assert isinstance(inner_model, PreTrainedModel) + assert inner_model.generation_config is not None + + # save_pretrained and reloading with hf_causal_lm should use the bos_token_id we set from earlier. 
+ assert inner_model.generation_config.bos_token_id == new_bos_token_id diff --git a/tests/models/hf/test_hf_transform.py b/tests/models/hf/test_hf_transform.py index f479b50f73..ebc26ef131 100644 --- a/tests/models/hf/test_hf_transform.py +++ b/tests/models/hf/test_hf_transform.py @@ -1,7 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Optional +from typing import Any, Optional import pytest from composer.models.huggingface import maybe_get_underlying_model @@ -55,7 +55,7 @@ def transform_model(self, model: PreTrainedModel) -> PreTrainedModel: def get_peft_config( self, - peft_config_dict: Dict[str, Any], + peft_config_dict: dict[str, Any], ) -> PeftConfig: peft_config_dict['target_modules'] = ['o_proj'] return super().get_peft_config(peft_config_dict) diff --git a/tests/models/inference_api_wrapper/test_fmapi.py b/tests/models/inference_api_wrapper/test_fmapi.py index af26823aae..c1a38f49b9 100644 --- a/tests/models/inference_api_wrapper/test_fmapi.py +++ b/tests/models/inference_api_wrapper/test_fmapi.py @@ -1,7 +1,6 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Dict from unittest.mock import patch import pytest @@ -74,7 +73,7 @@ def __init__(self, expected_token: str) -> None: setattr(self, 'choices', [MockMessage(expected_token)]) -def mock_create(**kwargs: Dict[str, str]): +def mock_create(**kwargs: dict[str, str]): prompt = kwargs['prompt'] if prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was interrupted with this one word\nAnswer:': # pyright: ignore[reportUnnecessaryComparison] return MockCompletion(' Tre') diff --git a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py index f35e5cd750..7eca339354 100644 --- a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py +++ b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import os -from typing import Dict from unittest.mock import patch import pytest @@ -80,7 +79,7 @@ def __init__(self, expected_token: str) -> None: setattr(self, 'choices', [MockMessage(expected_token)]) -def mock_create(**kwargs: Dict[str, str]): +def mock_create(**kwargs: dict[str, str]): prompt = kwargs['prompt'] if prompt == 'AMERICAN HISTORY: On May 29, 1765 Patrick Henrys Stamp Act protest was interrupted with this one word\nAnswer:': # pyright: ignore[reportUnnecessaryComparison] return MockCompletion(' Tre') diff --git a/tests/models/layers/test_attention.py b/tests/models/layers/test_attention.py index bdffe2b49f..c51a532092 100644 --- a/tests/models/layers/test_attention.py +++ b/tests/models/layers/test_attention.py @@ -1,10 +1,17 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +import math + import pytest import torch +from llmfoundry.models.layers.attention import ( + attention_implementations, + scaled_multihead_dot_product_attention, +) from llmfoundry.models.layers.layer_builders import build_attention_layer +from llmfoundry.models.mpt.modeling_mpt import gen_flash_attn_padding_info @pytest.mark.parametrize( @@ -158,3 +165,109 @@ def test_unfused_wqkv(attn_name: str, dim: int): assert isinstance(attn_fused.Wqkv.weight.grad, torch.Tensor) assert isinstance(combined_grad, torch.Tensor) assert torch.allclose(attn_fused.Wqkv.weight.grad, combined_grad) + + +@pytest.mark.gpu 
+@pytest.mark.parametrize('sliding_window_size', [1, 4, 8]) +@pytest.mark.parametrize('attn_impl', ['flash', 'torch']) +def test_sliding_window(sliding_window_size: int, attn_impl: str): + # Test that sliding window attention works as expected. + dtype = torch.bfloat16 + device = 'cuda' + d = 128 + n_heads = 8 + seqlen_1 = 8 + bsz = 2 + + query_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + query_1.requires_grad = True + key_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + key_1.requires_grad = True + value_1 = torch.randn(bsz, seqlen_1, + n_heads * d).to(dtype=dtype, device=device) + value_1.requires_grad = True + + attn_extra_kwargs = {} + if attn_impl == 'flash': + attn_extra_kwargs = { + 'flash_attn_padding_info': + gen_flash_attn_padding_info( + bsz, + seqlen_1, + 0, + query_1.device, + None, + None, + ), + 'should_repeat_kv_for_gqa': + True, + } + + output_1, _, _ = attention_implementations.get(attn_impl)( + query=query_1, + key=key_1, + value=value_1, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=None, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + sliding_window_size=sliding_window_size, + **attn_extra_kwargs, + ) + + output_1.sum().backward() + + query_2 = query_1.detach().clone() + query_2.requires_grad = True + key_2 = key_1.detach().clone() + key_2.requires_grad = True + value_2 = value_1.detach().clone() + value_2.requires_grad = True + + attn_bias_2 = torch.zeros(1, 1, seqlen_1, + seqlen_1).to(dtype=dtype, device=device) + + window_mask_2 = torch.tril( + torch.ones(seqlen_1, seqlen_1), + diagonal=-(sliding_window_size + 1), + ).to(dtype=dtype, device=device) * torch.finfo(attn_bias_2.dtype).min + attn_bias_2 = attn_bias_2 + window_mask_2 + output_2, _, _ = scaled_multihead_dot_product_attention( + query=query_2, + key=key_2, + value=value_2, + n_heads=n_heads, + kv_n_heads=n_heads, + past_key_value=None, + softmax_scale=1 / math.sqrt(d), + attn_bias=attn_bias_2, + key_padding_mask=None, + is_causal=True, + dropout_p=0.0, + training=False, + needs_weights=False, + ) + + output_2.sum().backward() + + print(torch.max(output_1 - output_2)) + + _assert_approx_equal(output_1, output_2) + assert (query_2.grad is not None) and (query_1.grad is not None) + _assert_approx_equal(query_1.grad, query_2.grad) + assert (key_2.grad is not None) and (key_1.grad is not None) + _assert_approx_equal(key_1.grad, key_2.grad) + assert (value_2.grad is not None) and (value_1.grad is not None) + _assert_approx_equal(value_1.grad, value_2.grad) + + +def _assert_approx_equal(value1: torch.Tensor, value2: torch.Tensor): + assert torch.norm(value2 - value1) <= 1e-2 + 1e-2 * torch.norm(value2) diff --git a/tests/models/layers/test_dmoe.py b/tests/models/layers/test_dmoe.py index a7393674dc..039e6527a4 100644 --- a/tests/models/layers/test_dmoe.py +++ b/tests/models/layers/test_dmoe.py @@ -4,7 +4,7 @@ import copy from contextlib import nullcontext from functools import partial -from typing import List, Optional, Union +from typing import Optional, Union import pytest import torch @@ -33,7 +33,7 @@ def _get_all_inputs( - input_shape: List[int], + input_shape: list[int], dtype: Optional[torch.dtype], ): world_size: int = dist.get_world_size() @@ -144,7 +144,7 @@ def test_dmoe( if moe_world_size > 1: assert device_mesh is not None - two_d_placements: List[Placement] = [Replicate(), Shard(0)] + two_d_placements: list[Placement] = 
[Replicate(), Shard(0)] dtensorified_params = [( name, dtensorify_param( diff --git a/tests/models/layers/test_flash_attn.py b/tests/models/layers/test_flash_attn.py index dcce0fe118..987ea7160a 100644 --- a/tests/models/layers/test_flash_attn.py +++ b/tests/models/layers/test_flash_attn.py @@ -218,104 +218,6 @@ def test_seq_id_masking_FA_v2(): ) -@pytest.mark.gpu -@pytest.mark.skipif( - not is_flash_v2_installed(v2_version='v2.3.0'), - reason= - 'Sliding window attention only supported by Flash Attention after v2.3.0.', -) -@pytest.mark.parametrize('sliding_window_size', [1, 4, 8]) -def test_sliding_window(sliding_window_size: int): - # Test that sliding window attention works as expected. - dtype = torch.bfloat16 - device = 'cuda' - d = 128 - n_heads = 8 - seqlen_1 = 8 - bsz = 2 - - query_1 = torch.randn(bsz, seqlen_1, - n_heads * d).to(dtype=dtype, device=device) - query_1.requires_grad = True - key_1 = torch.randn(bsz, seqlen_1, - n_heads * d).to(dtype=dtype, device=device) - key_1.requires_grad = True - value_1 = torch.randn(bsz, seqlen_1, - n_heads * d).to(dtype=dtype, device=device) - value_1.requires_grad = True - - output_1, _, _ = flash_attn_fn( - query=query_1, - key=key_1, - value=value_1, - n_heads=n_heads, - kv_n_heads=n_heads, - past_key_value=None, - softmax_scale=1 / math.sqrt(d), - attn_bias=None, - key_padding_mask=None, - is_causal=True, - dropout_p=0.0, - training=False, - needs_weights=False, - flash_attn_padding_info=gen_flash_attn_padding_info( - bsz, - seqlen_1, - 0, - query_1.device, - None, - None, - ), - should_repeat_kv_for_gqa=True, - sliding_window_size=sliding_window_size, - ) - - output_1.sum().backward() - - query_2 = query_1.detach().clone() - query_2.requires_grad = True - key_2 = key_1.detach().clone() - key_2.requires_grad = True - value_2 = value_1.detach().clone() - value_2.requires_grad = True - - attn_bias_2 = torch.zeros(1, 1, seqlen_1, - seqlen_1).to(dtype=dtype, device=device) - - window_mask_2 = torch.tril( - torch.ones(seqlen_1, seqlen_1), - diagonal=-(sliding_window_size + 1), - ).to(dtype=dtype, device=device) * torch.finfo(attn_bias_2.dtype).min - attn_bias_2 = attn_bias_2 + window_mask_2 - output_2, _, _ = scaled_multihead_dot_product_attention( - query=query_2, - key=key_2, - value=value_2, - n_heads=n_heads, - kv_n_heads=n_heads, - past_key_value=None, - softmax_scale=1 / math.sqrt(d), - attn_bias=attn_bias_2, - key_padding_mask=None, - is_causal=True, - dropout_p=0.0, - training=False, - needs_weights=False, - ) - - output_2.sum().backward() - - print(torch.max(output_1 - output_2)) - - _assert_approx_equal(output_1, output_2) - assert (query_2.grad is not None) and (query_1.grad is not None) - _assert_approx_equal(query_1.grad, query_2.grad) - assert (key_2.grad is not None) and (key_1.grad is not None) - _assert_approx_equal(key_1.grad, key_2.grad) - assert (value_2.grad is not None) and (value_1.grad is not None) - _assert_approx_equal(value_1.grad, value_2.grad) - - @pytest.mark.gpu @pytest.mark.skipif( not check_alibi_support('flash'), diff --git a/tests/models/layers/test_flash_torch.py b/tests/models/layers/test_flash_torch.py index 4bfdfb84dc..01a6a7576d 100644 --- a/tests/models/layers/test_flash_torch.py +++ b/tests/models/layers/test_flash_torch.py @@ -77,6 +77,7 @@ def allclose_helper( ) @pytest.mark.parametrize('attn_uses_sequence_id', [True, False]) @pytest.mark.parametrize('pad_attention_mask', [True, False]) +@pytest.mark.parametrize('sliding_window_size', [-1, 2]) def test_attn_impl( attn_impl_0: str, attn_impl_1: str, 
@@ -87,6 +88,7 @@ def test_attn_impl( attn_type: str, attn_uses_sequence_id: bool, pad_attention_mask: bool, + sliding_window_size: int, device: str = 'cuda', ): """Compare all attn impl with each other. @@ -122,6 +124,7 @@ def test_attn_impl( 'clip_qkv': clip_qkv, 'qk_ln': qk_ln, 'qk_gn': qk_gn, + 'sliding_window_size': sliding_window_size, }) n, s, f = 2, 4, cfg.d_model diff --git a/tests/models/test_model.py b/tests/models/test_model.py index ed40e7a88a..ac1bdacf4e 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -6,7 +6,7 @@ import pathlib import warnings from functools import partial -from typing import Any, Dict, List, Optional, Union, cast +from typing import Any, Optional, Union, cast from unittest import mock import pytest @@ -63,7 +63,7 @@ def get_config( return cast(DictConfig, test_cfg) -def _load_tokenizer_cfg(cfg: Union[Dict[str, Any], DictConfig]) -> Dict: +def _load_tokenizer_cfg(cfg: Union[dict[str, Any], DictConfig]) -> dict: if isinstance(cfg, DictConfig): config = to_dict_container(cfg) else: @@ -75,7 +75,7 @@ def _load_tokenizer_cfg(cfg: Union[Dict[str, Any], DictConfig]) -> Dict: def _get_objs( request: pytest.FixtureRequest, conf_path: str = 'scripts/train/yamls/pretrain/testing.yaml', - model_config_overrides: Optional[Dict] = None, + model_config_overrides: Optional[dict] = None, attn_impl: str = 'torch', ): warnings.filterwarnings( @@ -114,7 +114,7 @@ def _get_objs( test_cfg.device_eval_batch_size = 2 test_cfg.device_train_microbatch_size = 2 - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) tokenizer = build_tokenizer( test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -143,7 +143,7 @@ def _get_objs( def gen_random_batch( batch_size: int, test_cfg: Union[DictConfig, ListConfig], - inputs: Optional[List[str]] = None, + inputs: Optional[list[str]] = None, ): # inputs can be [], ['input_ids'], ['input_ids', 'inputs_embeds'], and ['inputs_embeds'] # default to only input ids @@ -260,7 +260,7 @@ def test_full_forward_and_backward_with_inputs_embeds( @pytest.mark.parametrize('inputs', [[], ['input_ids', 'inputs_embeds']]) def test_invalid_inputs_embeds_input_ids_combinations( request: pytest.FixtureRequest, - inputs: List[str], + inputs: list[str], ): test_cfg, model, _ = _get_objs( request=request, @@ -366,7 +366,7 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): neo_cfg.max_seq_len = 256 neo_cfg.model.name = 'hf_causal_lm' - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(neo_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(neo_cfg.tokenizer) tokenizer = build_tokenizer( neo_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -425,7 +425,7 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): t5_cfg.device = device t5_cfg.max_seq_len = 16 - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(t5_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(t5_cfg.tokenizer) tokenizer = build_tokenizer( t5_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -525,7 +525,7 @@ def test_determinism( test_cfg.model.init_device = 'cuda:0' test_cfg.device = 'cuda:0' - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) tokenizer = build_tokenizer( test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -605,7 +605,7 @@ def test_loss_fn(): 'init_std': 0.02, } - 
tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) tokenizer = build_tokenizer( test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -709,7 +709,7 @@ def test_loss_reduction(loss_fn_config: str): 'init_std': 0.02, } - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) tokenizer = build_tokenizer( test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -822,7 +822,7 @@ def test_opt_wrapping(peft_config: Optional[dict[str, str]]): if peft_config is not None: conf['model']['peft_config'] = peft_config - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(conf['tokenizer']) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(conf['tokenizer']) tokenizer = build_tokenizer( conf['tokenizer']['name'], tokenizer_cfg.get('kwargs', {}), @@ -856,7 +856,7 @@ def test_lora_id(): config = DictConfig(conf) - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(config.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(config.tokenizer) tokenizer = build_tokenizer( config.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -2199,7 +2199,7 @@ def test_generate_with_past_kv( @pytest.mark.parametrize('tie_word_embeddings', [True, False]) def test_generation_kwargs_dont_crash( attn_impl: str, - generation_kwargs: Dict[str, Any], + generation_kwargs: dict[str, Any], pos_emb_config: dict, tie_word_embeddings: bool, ): @@ -2539,7 +2539,7 @@ def test_hf_init( trust_remote_code=True, ) - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) tokenizer = build_tokenizer( test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), @@ -2601,7 +2601,7 @@ def test_head_dim_8_flash_mqa_attn(batch_size: int = 2): ) test_cfg.device = torch.cuda.current_device() - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) + tokenizer_cfg: dict[str, Any] = _load_tokenizer_cfg(test_cfg.tokenizer) tokenizer = build_tokenizer( test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {}), diff --git a/tests/models/test_mpt_gen.py b/tests/models/test_mpt_gen.py index 1c9b5ef9d4..379f4b34bd 100644 --- a/tests/models/test_mpt_gen.py +++ b/tests/models/test_mpt_gen.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Callable, List, Optional, Tuple +from typing import Callable, Optional from unittest.mock import Mock, patch import pytest @@ -28,7 +28,7 @@ class MockMPTForCausalLM(MPTForCausalLM): def forward( self, input_ids: torch.LongTensor, - past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None, + past_key_values: Optional[list[tuple[torch.FloatTensor]]] = None, attention_mask: Optional[torch.ByteTensor] = None, sequence_id: Optional[torch.LongTensor] = None, labels: Optional[torch.LongTensor] = None, diff --git a/tests/models/test_rmsnorm_triton_vs_eager.py b/tests/models/test_rmsnorm_triton_vs_eager.py index c8f0a2e07f..e60f9afde3 100644 --- a/tests/models/test_rmsnorm_triton_vs_eager.py +++ b/tests/models/test_rmsnorm_triton_vs_eager.py @@ -1,7 +1,7 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import List, Union +from typing import Union import pytest import torch @@ -14,7 +14,7 @@ @pytest.mark.gpu @pytest.mark.parametrize('normalized_shape', [32, 128, 4096]) def 
test_rmsnorm_triton_vs_eager( - normalized_shape: Union[int, List[int]], + normalized_shape: Union[int, list[int]], device: str = 'cuda', ): # Compare Triton and PyTorch Eager implementations of RMSNorm diff --git a/tests/models/utils/test_param_init_fns.py b/tests/models/utils/test_param_init_fns.py index de818304a6..0eaf60c869 100644 --- a/tests/models/utils/test_param_init_fns.py +++ b/tests/models/utils/test_param_init_fns.py @@ -4,7 +4,7 @@ from collections import OrderedDict from collections.abc import Sequence from functools import partial -from typing import Dict, List, Optional, Tuple, Union +from typing import Optional, Union import pytest import torch @@ -146,8 +146,8 @@ def max_fill_init_(weight: torch.Tensor): ('emb_init_uniform_lim', [1, 1]), ], ) -def test_emb_init(emb_init_cfg: Optional[Tuple[str, Union[int, List[int]]]]): - cfg: Dict[str, Union[int, List[int]]] = { +def test_emb_init(emb_init_cfg: Optional[tuple[str, Union[int, list[int]]]]): + cfg: dict[str, Union[int, list[int]]] = { 'vocab_size': 64, 'in_features': 16, 'out_features': 32, diff --git a/tests/optim/test_scheduler.py b/tests/optim/test_scheduler.py index 602a4ef8c7..294dbaffd4 100644 --- a/tests/optim/test_scheduler.py +++ b/tests/optim/test_scheduler.py @@ -1,8 +1,6 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import List - import pytest import torch from composer.core import State, Time, TimeUnit @@ -66,8 +64,8 @@ def dummy_schedulers_state(request: pytest.FixtureRequest): def test_scheduler_init( scheduler: ComposerScheduler, ssr: float, - test_times: List[str], - expected_lrs: List[float], + test_times: list[str], + expected_lrs: list[float], dummy_schedulers_state: State, ): diff --git a/tests/test_registry.py b/tests/test_registry.py index c4d1a1bcd5..5108a7d46c 100644 --- a/tests/test_registry.py +++ b/tests/test_registry.py @@ -5,7 +5,7 @@ import os import pathlib from importlib.metadata import EntryPoint -from typing import Any, Callable, Type, Union +from typing import Any, Callable, Union import catalogue import pytest @@ -171,7 +171,7 @@ def test_registry_builder(monkeypatch: pytest.MonkeyPatch): 'llmfoundry', 'test_registry', entry_points=False, - generic_type=Union[Type[LoggerDestination], + generic_type=Union[type[LoggerDestination], Callable[..., LoggerDestination]], ) diff --git a/tests/test_utils.py b/tests/test_utils.py index dc9bcd9baf..05c0881b9f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,8 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, List +import copy +from typing import Any import catalogue import pytest @@ -15,7 +16,7 @@ ) -def generate_exclusive_test_params(param_names: List[str]): +def generate_exclusive_test_params(param_names: list[str]): """Generates pytest.param objects with one true parameter for testing. Creates pytest.param objects for each parameter name given. 
For each @@ -52,7 +53,7 @@ def test_config_transforms(): 'variables': {}, },) - def dummy_transform(config: Dict[str, Any]) -> Dict[str, Any]: + def dummy_transform(config: dict[str, Any]) -> dict[str, Any]: config['variables']['fake_key'] = 'fake_value' return config @@ -65,8 +66,39 @@ def dummy_transform(config: Dict[str, Any]) -> Dict[str, Any]: transforms='all', ) - assert isinstance(parsed_config.variables, Dict) + assert isinstance(parsed_config.variables, dict) assert parsed_config.variables['fake_key'] == 'fake_value' del catalogue.REGISTRY[ ('llmfoundry', 'config_transforms', 'dummy_transform')] + + +def test_logged_cfg(): + config = DictConfig({ + 'global_train_batch_size': 1, + 'device_train_microbatch_size': 1, + 'model': {}, + 'scheduler': {}, + 'max_seq_len': 128, + 'train_loader': {}, + 'max_duration': 1, + 'tokenizer': {}, + 'eval_interval': 1, + 'seed': 1, + 'optimizer': {}, + 'variables': {}, + },) + logged_config, _ = make_dataclass_and_log_config( + config, + TrainConfig, + TRAIN_CONFIG_KEYS, + transforms='all', + ) + expected_config = copy.deepcopy(config) + expected_config.update({ + 'n_gpus': 1, + 'device_train_batch_size': 1, + 'device_train_grad_accum': 1, + 'device_eval_batch_size': 1, + }) + assert expected_config == logged_config diff --git a/tests/tokenizers/test_registry.py b/tests/tokenizers/test_registry.py index 920c207a64..fd70639b4e 100644 --- a/tests/tokenizers/test_registry.py +++ b/tests/tokenizers/test_registry.py @@ -1,7 +1,7 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Dict, Optional +from typing import Any, Optional from transformers import PreTrainedTokenizer @@ -15,7 +15,7 @@ class DummyTokenizer(PreTrainedTokenizer): def __init__( self, model_name: Optional[str] = 'dummy', - **kwargs: Optional[Dict[str, Any]], + **kwargs: Optional[dict[str, Any]], ): """Dummy constructor that has no real purpose.""" super().__init__( @@ -25,7 +25,7 @@ def __init__( **kwargs, ) - def get_vocab(self) -> Dict[str, int]: + def get_vocab(self) -> dict[str, int]: return {} diff --git a/tests/tokenizers/test_tiktoken.py b/tests/tokenizers/test_tiktoken.py index af18c73927..8a61c6124f 100644 --- a/tests/tokenizers/test_tiktoken.py +++ b/tests/tokenizers/test_tiktoken.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import pathlib -from typing import TYPE_CHECKING, List, Optional, Tuple +from typing import TYPE_CHECKING, Optional import pytest import transformers @@ -147,8 +147,8 @@ def get_tokenizers_for_testing( use_default_system_prompt: bool = False, add_bos_token: bool = False, add_eos_token: bool = False, - additional_special_tokens: Optional[List[str]] = None, -) -> Tuple[TiktokenTokenizerWrapper, TiktokenTokenizerWrapper, 'Encoding']: + additional_special_tokens: Optional[list[str]] = None, +) -> tuple[TiktokenTokenizerWrapper, TiktokenTokenizerWrapper, 'Encoding']: tiktoken = pytest.importorskip('tiktoken') # Construction diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py index fb6cb0c5df..72ca540311 100644 --- a/tests/utils/test_builders.py +++ b/tests/utils/test_builders.py @@ -4,7 +4,7 @@ import re import unittest.mock as mock from copy import deepcopy -from typing import Any, Dict, Union +from typing import Any, Union from unittest.mock import MagicMock import pytest @@ -240,8 +240,8 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: # type:ignore ) def test_build_optimizer( name: str, - optimizer_config: Dict[str, Any], - opt_additional_config: Dict[str, 
Any], + optimizer_config: dict[str, Any], + opt_additional_config: dict[str, Any], ): model = _DummyModule() optimizer_config = deepcopy(optimizer_config) diff --git a/tests/utils/test_exceptions.py b/tests/utils/test_exceptions.py index 90841c5222..097bdf77fb 100644 --- a/tests/utils/test_exceptions.py +++ b/tests/utils/test_exceptions.py @@ -4,7 +4,7 @@ import contextlib import inspect import pickle -from typing import Any, Dict, List, Optional, Type +from typing import Any, Optional import pytest @@ -12,7 +12,7 @@ def create_exception_object( - exception_class: Type[foundry_exceptions.BaseContextualError], + exception_class: type[foundry_exceptions.BaseContextualError], ): # get required arg types of exception class by inspecting its __init__ method @@ -27,9 +27,9 @@ def create_exception_object( required_args.pop('kwargs', None) def get_default_value(arg_type: Optional[type] = None): - if arg_type == Dict[str, - str] or arg_type == Dict[str, - Any] or arg_type == Dict: + if arg_type == dict[str, + str] or arg_type == dict[str, + Any] or arg_type == dict: return {'key': 'value'} elif arg_type == str: return 'string' @@ -37,13 +37,13 @@ def get_default_value(arg_type: Optional[type] = None): return 1 elif arg_type == set[str]: return {'set'} - elif arg_type == List[str]: + elif arg_type == list[str]: return ['list'] elif arg_type == None: return None elif arg_type == type: return bool - elif arg_type == List[Dict[str, Any]]: + elif arg_type == list[dict[str, Any]]: return [{'key': 'value'}] raise ValueError(f'Unsupported arg type: {arg_type}') @@ -56,7 +56,7 @@ def get_default_value(arg_type: Optional[type] = None): return exception_class(**kwargs) # type: ignore -def filter_exceptions(possible_exceptions: List[str]): +def filter_exceptions(possible_exceptions: list[str]): attrs = [ getattr(foundry_exceptions, exception) for exception in possible_exceptions @@ -74,7 +74,7 @@ def filter_exceptions(possible_exceptions: List[str]): filter_exceptions(dir(foundry_exceptions)), ) def test_exception_serialization( - exception_class: Type[foundry_exceptions.BaseContextualError], + exception_class: type[foundry_exceptions.BaseContextualError], ): excluded_base_classes = [ foundry_exceptions.InternalError, diff --git a/tests/utils/test_model_download_utils.py b/tests/utils/test_model_download_utils.py index 8519277e74..ea42c0ff64 100644 --- a/tests/utils/test_model_download_utils.py +++ b/tests/utils/test_model_download_utils.py @@ -4,7 +4,7 @@ import os import unittest.mock as mock from http import HTTPStatus -from typing import Any, Dict, List +from typing import Any from unittest.mock import MagicMock from urllib.parse import urljoin @@ -103,8 +103,8 @@ def test_download_from_hf_hub_weights_pref( mock_list_repo_files: MagicMock, mock_snapshot_download: MagicMock, prefer_safetensors: bool, - repo_files: List[str], - expected_ignore_patterns: List[str], + repo_files: list[str], + expected_ignore_patterns: list[str], ): test_repo_id = 'test_repo_id' save_dir = 'save_dir' @@ -204,7 +204,7 @@ def test_download_from_http_fileserver( mock_open.return_value = MagicMock() - def _server_response(url: str, **kwargs: Dict[str, Any]): + def _server_response(url: str, **kwargs: dict[str, Any]): if url == model_url: return MagicMock(status_code=HTTPStatus.OK, content=ROOT_HTML) if url == urljoin(model_url, 'file1'):