From 7dc3fbc798ea17683b41035ba8449fc5151f66c9 Mon Sep 17 00:00:00 2001
From: Milo Cress
Date: Fri, 22 Mar 2024 15:15:31 +0000
Subject: [PATCH 001/201] first commit for structuredconfig for train.py

---
 scripts/train/train.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/train/train.py b/scripts/train/train.py
index 478b484fb9..215ca97f9d 100644
--- a/scripts/train/train.py
+++ b/scripts/train/train.py
@@ -9,6 +9,7 @@
 import warnings
 from typing import Any, Dict, List, Optional, Union
 
+import attr
 import torch
 from composer import Trainer
 from composer.core.callback import Callback
@@ -43,6 +44,11 @@
 log = logging.getLogger(__name__)
 
 
+@attr.s(auto_attribs=True)
+class TrainConfig:
+    eval_loader: Optional[Union[DictConfig, ListConfig]] = None
+
+
 def validate_config(cfg: DictConfig):
     """Validates compatible model and dataloader selection."""
     loaders = [cfg.train_loader]

From 6f7c519819a4db6da2fdca2f25cad64a614b878e Mon Sep 17 00:00:00 2001
From: Milo Cress
Date: Thu, 4 Apr 2024 17:31:15 +0000
Subject: [PATCH 002/201] revamp configs

---
 scripts/train/train.py | 67 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 6 deletions(-)

diff --git a/scripts/train/train.py b/scripts/train/train.py
index c5e35d13cc..9941b74893 100644
--- a/scripts/train/train.py
+++ b/scripts/train/train.py
@@ -18,6 +18,7 @@
                                cyclic_schedule)
 from composer.utils import dist, get_device, reproducibility
 from omegaconf import DictConfig, ListConfig
+from omegaconf import OmegaConf
 from omegaconf import OmegaConf as om
 from rich.traceback import install
 
@@ -43,6 +44,63 @@
 @attr.s(auto_attribs=True)
 class TrainConfig:
     eval_loader: Optional[Union[DictConfig, ListConfig]] = None
+    icl_tasks: Optional[Union[ListConfig, str]] = None
+    code_paths: List[str] = []
+    seed: int = 42
+    dist_timeout: Union[int, float] = 600.0
+    model: DictConfig
+    tokenizer: Dict[str, Any]
+    optimizer: Dict[str, Any]
+    scheduler: Dict[str, Any]
+    train_loader: DictConfig
+    fsdp_config: Optional[Dict[str, Any]] = None
+    eval_loader: Optional[Union[DictConfig, ListConfig]] = None
+    icl_tasks: Optional[Union[ListConfig, str]] = None
+    eval_gauntlet: Optional[Union[DictConfig, str]] = None
+    icl_subset_num_batches: Optional[int] = None
+    icl_seq_len: Optional[int] = None
+    loggers: Optional[DictConfig] = None
+    callbacks: Optional[DictConfig] = None
+    algorithms: Optional[DictConfig] = None
+    device_train_batch_size: int
+    device_eval_batch_size: int
+    max_duration: Union[int, str]
+    eval_interval: Union[int, str]
+    precision: str
+    max_seq_len: int
+    run_name: Optional[str] = None
+    save_folder: Optional[str] = None
+    save_latest_filename: Optional[str] = None
+    save_overwrite: bool = False
+    save_weights_only: bool = False
+    save_filename: Optional[str] = None
+    save_interval: Union[str, int] = '1000ba'
+
+    save_num_checkpoints_to_keep: int = -1
+    progress_bar: bool = False
+    log_to_console: bool = True
+    python_log_level: Optional[str] = 'debug'
+    console_log_interval: Union[int, str] = '1ba'
+    device_train_microbatch_size: Union[str, int] = 'auto'
+    eval_subset_num_batches: int = -1
+    eval_first: bool = False
+    load_path: Optional[str] = None
+    load_weights_only: bool = False
+    load_strict_model_weights: bool = True
+    load_ignore_keys: Optional[List[str]] = None
+    compile_config: Optional[Dict[str, Any]] = None
+    metadata: Optional[Dict[str, str]] = None
+    log_config: bool = True
+    autoresume: bool = False
+
+    data_local: Optional[Dict[str, Any]] = None
+    data_remote: Optional[Dict[str, Any]] = None
+
global_seed: Optional[int] = None + global_train_batch_size: Optional[int] = None + n_gpus: Optional[int] = None + device_train_grad_accum: Optional[int] = None + + profiler: Optional[Dict[str, Any]] = None def validate_config(cfg: DictConfig): @@ -109,12 +167,9 @@ def validate_config(cfg: DictConfig): def main(cfg: DictConfig) -> Trainer: - # Run user provided code if specified - code_paths = pop_config(cfg, - 'code_paths', - must_exist=False, - default_value=[], - convert=True) + scfg: TrainConfig = OmegaConf.structured(TrainConfig(**cfg)) + + code_paths = scfg.code_paths # Import any user provided code for code_path in code_paths: import_file(code_path) From ea77a278733c9087375b85367aa506248c1e30de Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 16:45:52 +0000 Subject: [PATCH 003/201] wip latest issue --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 9941b74893..3244a7ecb9 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -103,7 +103,7 @@ class TrainConfig: profiler: Optional[Dict[str, Any]] = None -def validate_config(cfg: DictConfig): +def validate_config(cfg: TrainConfig): """Validates compatible model and dataloader selection.""" loaders = [cfg.train_loader] if 'eval_loader' in cfg: From a74aa9e892e0d7b44caf4ab3ac5d008fa7241a2c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 17:13:03 +0000 Subject: [PATCH 004/201] reorder so mandatory attributes come first --- scripts/train/train.py | 49 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index b98dd2680c..76a081e192 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -26,6 +26,8 @@ maybe_create_mosaicml_logger) install() +from omegaconf import MISSING + from llmfoundry.callbacks import AsyncEval from llmfoundry.data.dataloader import build_dataloader from llmfoundry.utils.builders import (add_metrics_to_eval_loaders, @@ -43,16 +45,21 @@ @attr.s(auto_attribs=True) class TrainConfig: + model: DictConfig = MISSING + tokenizer: Dict[str, Any] = MISSING + optimizer: Dict[str, Any] = MISSING + scheduler: Dict[str, Any] = MISSING + train_loader: DictConfig = MISSING + device_train_batch_size: int = MISSING + device_eval_batch_size: int = MISSING + max_duration: Union[int, str] = MISSING + eval_interval: Union[int, str] = MISSING + precision: str = MISSING + max_seq_len: int = MISSING + + code_paths: List[str] = [] eval_loader: Optional[Union[DictConfig, ListConfig]] = None icl_tasks: Optional[Union[ListConfig, str]] = None - code_paths: List[str] = [] - seed: int = 42 - dist_timeout: Union[int, float] = 600.0 - model: DictConfig - tokenizer: Dict[str, Any] - optimizer: Dict[str, Any] - scheduler: Dict[str, Any] - train_loader: DictConfig fsdp_config: Optional[Dict[str, Any]] = None eval_loader: Optional[Union[DictConfig, ListConfig]] = None icl_tasks: Optional[Union[ListConfig, str]] = None @@ -62,12 +69,6 @@ class TrainConfig: loggers: Optional[DictConfig] = None callbacks: Optional[DictConfig] = None algorithms: Optional[DictConfig] = None - device_train_batch_size: int - device_eval_batch_size: int - max_duration: Union[int, str] - eval_interval: Union[int, str] - precision: str - max_seq_len: int run_name: Optional[str] = None save_folder: Optional[str] = None save_latest_filename: Optional[str] = None @@ -75,7 +76,6 @@ class TrainConfig: save_weights_only: bool = False 
save_filename: Optional[str] = None save_interval: Union[str, int] = '1000ba' - save_num_checkpoints_to_keep: int = -1 progress_bar: bool = False log_to_console: bool = True @@ -92,21 +92,19 @@ class TrainConfig: metadata: Optional[Dict[str, str]] = None log_config: bool = True autoresume: bool = False - data_local: Optional[Dict[str, Any]] = None data_remote: Optional[Dict[str, Any]] = None global_seed: Optional[int] = None global_train_batch_size: Optional[int] = None n_gpus: Optional[int] = None device_train_grad_accum: Optional[int] = None - profiler: Optional[Dict[str, Any]] = None def validate_config(cfg: TrainConfig): """Validates compatible model and dataloader selection.""" loaders = [cfg.train_loader] - if 'eval_loader' in cfg: + if cfg.eval_loader is not None: eval_loader = cfg.eval_loader if isinstance(eval_loader, ListConfig): for loader in eval_loader: @@ -124,7 +122,7 @@ def validate_config(cfg: TrainConfig): f'Model type "{cfg.model.name}" is not supported when using the "text " ' +\ f'dataloader. Only finetuning is supported.') - if 'icl_tasks' in cfg: + if cfg.icl_tasks is not None: if cfg.model.name == 'hf_t5': raise ValueError( 'ICL evaluation does not currently support Encoder-Decoder models, such as "hf_t5".' @@ -141,17 +139,18 @@ def validate_config(cfg: TrainConfig): if (cfg.model.get('fc_type', 'torch') == 'te' or 'te' in cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp')): - fsdp_config = cfg.get('fsdp_config', None) - act_ckpt = fsdp_config.get('activation_checkpointing', False) + fsdp_config = cfg.fsdp_config + act_ckpt = fsdp_config.get('activation_checkpointing', + False) if fsdp_config else False act_ckpt_reentrant = fsdp_config.get( - 'activation_checkpointing_reentrant', True) - if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == False: + 'activation_checkpointing_reentrant', True) if fsdp_config else True + if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == False and cfg.fsdp_config is not None: warnings.warn( '`te.Linear` layers do not support activation_checkpointing with ' + '`activation_checkpointing_reentrant = False`. ' + 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=True.' 
) - cfg.fsdp_config.activation_checkpointing_reentrant = True + cfg.fsdp_config['activation_checkpointing_reentrant'] = True if 'te' in cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp'): warnings.warn( @@ -167,7 +166,7 @@ def validate_config(cfg: TrainConfig): def main(cfg: DictConfig) -> Trainer: - scfg: TrainConfig = OmegaConf.structured(TrainConfig(**cfg)) + scfg: TrainConfig = OmegaConf.structured(TrainConfig(**cfg)) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) code_paths = scfg.code_paths # Import any user provided code From f76758675e32a6368be02aa0e31201d0fba0d969 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 17:14:06 +0000 Subject: [PATCH 005/201] fix --- scripts/train/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 76a081e192..1b475601d2 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -166,7 +166,9 @@ def validate_config(cfg: TrainConfig): def main(cfg: DictConfig) -> Trainer: - scfg: TrainConfig = OmegaConf.structured(TrainConfig(**cfg)) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) + scfg: TrainConfig = OmegaConf.structured( + TrainConfig(**cfg) + ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) code_paths = scfg.code_paths # Import any user provided code From e3134e36572b29debf361e31a975a46d32c94ba0 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 17:53:10 +0000 Subject: [PATCH 006/201] fix --- llmfoundry/utils/config_utils.py | 14 +- scripts/train/train.py | 248 ++++++++----------------------- 2 files changed, 78 insertions(+), 184 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 0edbae80a5..a67f1483f9 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -5,7 +5,7 @@ import logging import math import warnings -from typing import Any, Dict, Literal, Mapping, Optional, Tuple, Union +from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union from composer.utils import dist from omegaconf import DictConfig, ListConfig @@ -24,6 +24,18 @@ ] +def convert_to_dict( + value: Optional[Union[ListConfig, DictConfig]] +) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + if value is None: + return None + if not isinstance(value, DictConfig) and not isinstance(value, ListConfig): + raise ValueError( + f'The value {value} is of type {type(value)} that cannot be \ + converted to a dict or list. 
Please check your yaml.') + return om.to_container(value) + + def pop_config(cfg: DictConfig, key: str, must_exist: bool = True, diff --git a/scripts/train/train.py b/scripts/train/train.py index 1b475601d2..1201cb621e 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -7,9 +7,9 @@ import sys import time import warnings +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union -import attr import torch from composer import Trainer from composer.core.callback import Callback @@ -35,20 +35,20 @@ build_composer_model, build_evaluators, build_logger, build_optimizer, build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, pop_config, - process_init_device, +from llmfoundry.utils.config_utils import (convert_to_dict, log_config, + pop_config, process_init_device, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file log = logging.getLogger(__name__) -@attr.s(auto_attribs=True) +@dataclass class TrainConfig: model: DictConfig = MISSING - tokenizer: Dict[str, Any] = MISSING - optimizer: Dict[str, Any] = MISSING - scheduler: Dict[str, Any] = MISSING + tokenizer: DictConfig = MISSING + optimizer: DictConfig = MISSING + scheduler: DictConfig = MISSING train_loader: DictConfig = MISSING device_train_batch_size: int = MISSING device_eval_batch_size: int = MISSING @@ -56,8 +56,13 @@ class TrainConfig: eval_interval: Union[int, str] = MISSING precision: str = MISSING max_seq_len: int = MISSING + seed: int = MISSING code_paths: List[str] = [] + max_split_size_mb: Optional[int] = None + expandable_segments: bool = False + cuda_load_lazy: bool = False + dist_timeout: Union[int, float] = 600.0 eval_loader: Optional[Union[DictConfig, ListConfig]] = None icl_tasks: Optional[Union[ListConfig, str]] = None fsdp_config: Optional[Dict[str, Any]] = None @@ -98,7 +103,7 @@ class TrainConfig: global_train_batch_size: Optional[int] = None n_gpus: Optional[int] = None device_train_grad_accum: Optional[int] = None - profiler: Optional[Dict[str, Any]] = None + profiler: Optional[DictConfig] = None def validate_config(cfg: TrainConfig): @@ -184,7 +189,7 @@ def main(cfg: DictConfig) -> Trainer: ) # Check for incompatibilities between the model and data loaders - validate_config(cfg) + validate_config(scfg) # Resolve all interpolation variables as early as possible om.resolve(cfg) @@ -194,12 +199,12 @@ def main(cfg: DictConfig) -> Trainer: cuda_alloc_conf = [] # Get max split size mb - max_split_size_mb: Optional[int] = cfg.pop('max_split_size_mb', None) + max_split_size_mb: Optional[int] = scfg.max_split_size_mb if max_split_size_mb is not None: cuda_alloc_conf.append(f'max_split_size_mb:{max_split_size_mb}') # Expandable segments - if cfg.pop('expandable_segments', False): + if scfg.expandable_segments: cuda_alloc_conf.append('expandable_segments:True') if len(cuda_alloc_conf) > 0: @@ -207,19 +212,16 @@ def main(cfg: DictConfig) -> Trainer: # Set CUDA lazy loading # This can save a bit of memory if not all modules are needed - cuda_load_lazy: bool = cfg.pop('cuda_load_lazy', False) + cuda_load_lazy: bool = scfg.cuda_load_lazy if cuda_load_lazy: os.environ['CUDA_MODULE_LOADING'] = 'LAZY' # Set seed first - seed: int = pop_config(cfg, 'seed', must_exist=True) + seed: int = scfg.seed reproducibility.seed_all(seed) # Initialize pytorch distributed training process groups - dist_timeout: Union[int, float] = pop_config(cfg, - 'dist_timeout', - must_exist=False, - default_value=600.0) + dist_timeout: Union[int, float] = 
scfg.dist_timeout dist.initialize_dist(get_device(None), timeout=dist_timeout) # Get global and device batch size information from distributed/single node setting @@ -227,177 +229,61 @@ def main(cfg: DictConfig) -> Trainer: logged_cfg.update(cfg, merge=True) # Mandatory model training configs - model_config: DictConfig = pop_config(cfg, 'model', must_exist=True) - tokenizer_config: Dict[str, Any] = pop_config(cfg, - 'tokenizer', - must_exist=True, - convert=True) - optimizer_config: Dict[str, Any] = pop_config(cfg, - 'optimizer', - must_exist=True, - convert=True) - scheduler_config: Dict[str, Any] = pop_config(cfg, - 'scheduler', - must_exist=True, - convert=True) - train_loader_config: DictConfig = pop_config(cfg, - 'train_loader', - must_exist=True) + model_config: DictConfig = scfg.model + tokenizer_config: Dict[str, Any] = convert_to_dict(scfg.tokenizer) + optimizer_config: Dict[str, Any] = convert_to_dict(scfg.optimizer) + scheduler_config: Dict[str, Any] = convert_to_dict(scfg.scheduler) + train_loader_config: DictConfig = scfg.train_loader # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'fsdp_config', - must_exist=False, - default_value=None, - convert=True) - eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( - cfg, 'eval_loader', must_exist=False, default_value=None) - icl_tasks_config: Optional[Union[ListConfig, - str]] = pop_config(cfg, - 'icl_tasks', - must_exist=False, - default_value=None) - eval_gauntlet_config: Optional[Union[DictConfig, - str]] = pop_config(cfg, - 'eval_gauntlet', - must_exist=False, - default_value=None) - icl_subset_num_batches: Optional[int] = pop_config(cfg, - 'icl_subset_num_batches', - must_exist=False, - default_value=None) - icl_seq_len: Optional[int] = pop_config(cfg, - 'icl_seq_len', - must_exist=False, - default_value=None) + fsdp_config: Optional[Dict[str, Any]] = convert_to_dict(scfg.fsdp_config) + + eval_loader_config: Optional[Union[DictConfig, + ListConfig]] = scfg.eval_loader + icl_tasks_config: Optional[Union[ListConfig, str]] = scfg.icl_tasks + eval_gauntlet_config: Optional[Union[DictConfig, str]] = scfg.eval_gauntlet + icl_subset_num_batches: Optional[int] = scfg.icl_subset_num_batches + icl_seq_len: Optional[int] = scfg.icl_seq_len # Optional logging, evaluation and callback configs - logger_configs: Optional[DictConfig] = pop_config(cfg, - 'loggers', - must_exist=False, - default_value=None, - convert=True) - callback_configs: Optional[DictConfig] = pop_config(cfg, - 'callbacks', - must_exist=False, - default_value=None, - convert=True) - algorithm_configs: Optional[DictConfig] = pop_config(cfg, - 'algorithms', - must_exist=False, - default_value=None) + logger_configs: Optional[DictConfig] = scfg.loggers + callback_configs: Optional[DictConfig] = scfg.callbacks + algorithm_configs: Optional[DictConfig] = scfg.algorithms # Mandatory hyperparameters for training - device_train_batch_size: int = pop_config(cfg, - 'device_train_batch_size', - must_exist=True) - device_eval_batch_size: int = pop_config(cfg, - 'device_eval_batch_size', - must_exist=True) - max_duration: Union[int, str] = pop_config(cfg, - 'max_duration', - must_exist=True) - eval_interval: Union[int, str] = pop_config(cfg, - 'eval_interval', - default_value=1, - must_exist=False) - precision: str = pop_config(cfg, 'precision', must_exist=True) - max_seq_len: int = pop_config(cfg, 'max_seq_len', must_exist=True) + device_train_batch_size: int = scfg.device_train_batch_size + 
device_eval_batch_size: int = scfg.device_eval_batch_size + max_duration: Union[int, str] = scfg.max_duration + eval_interval: Union[int, str] = scfg.eval_interval + precision: str = scfg.precision + max_seq_len: int = scfg.max_seq_len # Optional parameters will be set to default values if not specified. default_run_name: str = os.environ.get('RUN_NAME', 'llm') - run_name: str = pop_config(cfg, - 'run_name', - must_exist=False, - default_value=default_run_name) - save_folder: Optional[str] = pop_config(cfg, - 'save_folder', - must_exist=False, - default_value=None) + run_name: str = scfg.run_name if scfg.run_name else default_run_name + save_folder: Optional[str] = scfg.save_folder is_state_dict_sharded: bool = (fsdp_config.get('state_dict_type', 'full') == 'sharded') if fsdp_config else False - save_latest_filename: str = pop_config( - cfg, - 'save_latest_filename', - must_exist=False, - default_value='latest-sharded-rank{rank}' - if is_state_dict_sharded else 'latest-rank{rank}.pt') - save_overwrite: bool = pop_config(cfg, - 'save_overwrite', - must_exist=False, - default_value=False) - save_weights_only: bool = pop_config(cfg, - 'save_weights_only', - must_exist=False, - default_value=False) - save_filename: str = pop_config( - cfg, - 'save_filename', - must_exist=False, - default_value='ep{epoch}-ba{batch}-rank{rank}.pt') - save_interval: Union[str, int] = pop_config(cfg, - 'save_interval', - must_exist=False, - default_value='1000ba') - save_num_checkpoints_to_keep: int = pop_config( - cfg, 'save_num_checkpoints_to_keep', must_exist=False, default_value=-1) - progress_bar = pop_config(cfg, - 'progress_bar', - must_exist=False, - default_value=False) - log_to_console: bool = pop_config(cfg, - 'log_to_console', - must_exist=False, - default_value=True) - python_log_level: Optional[str] = pop_config(cfg, - 'python_log_level', - must_exist=False, - default_value='debug') - console_log_interval: Union[int, str] = pop_config(cfg, - 'console_log_interval', - must_exist=False, - default_value='1ba') - device_train_microbatch_size: Union[str, int] = pop_config( - cfg, - 'device_train_microbatch_size', - must_exist=False, - default_value='auto') - eval_subset_num_batches: int = pop_config(cfg, - 'eval_subset_num_batches', - must_exist=False, - default_value=-1) - eval_first: bool = pop_config(cfg, - 'eval_first', - must_exist=False, - default_value=False) - load_path: str = pop_config(cfg, - 'load_path', - must_exist=False, - default_value=None) - load_weights_only: bool = pop_config(cfg, - 'load_weights_only', - must_exist=False, - default_value=False) - load_strict_model_weights: bool = pop_config(cfg, - 'load_strict_model_weights', - must_exist=False, - default_value=True) - load_ignore_keys: Optional[List[str]] = pop_config(cfg, - 'load_ignore_keys', - must_exist=False, - default_value=None) - compile_config: Optional[Dict[str, Any]] = pop_config(cfg, - 'compile_config', - must_exist=False, - default_value=None) - metadata: Optional[Dict[str, str]] = pop_config(cfg, - 'metadata', - must_exist=False, - default_value=None, - convert=True) - should_log_config: bool = pop_config(cfg, - 'log_config', - must_exist=False, - default_value=True) + save_latest_filename: str = scfg.save_latest_filename if scfg.save_latest_filename else 'latest-sharded-rank{rank}' if is_state_dict_sharded else 'latest-rank{rank}.pt' + save_overwrite: bool = scfg.save_overwrite + save_weights_only: bool = scfg.save_weights_only + save_filename: str = scfg.save_filename if scfg.save_filename else 
'ep{epoch}-ba{batch}-rank{rank}.pt' + save_interval: Union[str, int] = scfg.save_interval + save_num_checkpoints_to_keep: int = scfg.save_num_checkpoints_to_keep + progress_bar = scfg.progress_bar + log_to_console: bool = scfg.log_to_console + python_log_level: Optional[str] = scfg.python_log_level + console_log_interval: Union[int, str] = scfg.console_log_interval + device_train_microbatch_size: Union[str, + int] = scfg.device_train_microbatch_size + eval_subset_num_batches: int = scfg.eval_subset_num_batches + eval_first: bool = scfg.eval_first + load_path: str = scfg.load_path + load_weights_only: bool = scfg.load_weights_only + load_strict_model_weights: bool = scfg.load_strict_model_weights + load_ignore_keys: Optional[List[str]] = scfg.load_ignore_keys + compile_config: Optional[Dict[str, Any]] = scfg.compile_config + metadata: Optional[Dict[str, str]] = convert_to_dict(scfg.metadata) + should_log_config: bool = scfg.log_config # Enable autoresume from model checkpoints if possible autoresume_default: bool = False @@ -487,11 +373,7 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg: Optional[DictConfig] = pop_config(cfg, - 'profiler', - must_exist=False, - convert=False, - default_value=None) + profiler_cfg: Optional[DictConfig] = scfg.profiler if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', From 686fc66d440a277ae80c0c9dcdb7d6a0e96f280b Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 18:07:42 +0000 Subject: [PATCH 007/201] fix fix --- scripts/train/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 1201cb621e..f70a4d0569 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -58,7 +58,7 @@ class TrainConfig: max_seq_len: int = MISSING seed: int = MISSING - code_paths: List[str] = [] + code_paths: Optional[List[str]] = None max_split_size_mb: Optional[int] = None expandable_segments: bool = False cuda_load_lazy: bool = False @@ -175,7 +175,7 @@ def main(cfg: DictConfig) -> Trainer: TrainConfig(**cfg) ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) - code_paths = scfg.code_paths + code_paths = scfg.code_paths if scfg.code_paths else [] # Import any user provided code for code_path in code_paths: import_file(code_path) From cf1e42eeabb2dd09a964c86ede13c4bce2b7566b Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 18:11:36 +0000 Subject: [PATCH 008/201] fix types --- llmfoundry/utils/config_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index a67f1483f9..2aa3f25780 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -5,7 +5,7 @@ import logging import math import warnings -from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union +from typing import Any, Dict, Literal, Mapping, Optional, Tuple, Union from composer.utils import dist from omegaconf import DictConfig, ListConfig @@ -24,9 +24,7 @@ ] -def convert_to_dict( - value: Optional[Union[ListConfig, DictConfig]] -) -> Union[Dict[str, Any], List[Dict[str, Any]]]: +def convert_to_dict(value: Optional[Union[ListConfig, DictConfig]]) -> Any: if value is None: return None if not isinstance(value, DictConfig) and not isinstance(value, ListConfig): From 4cf99fe76c2c8228dedd66a1dc951f1c0180ea1f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: 
Tue, 9 Apr 2024 18:31:00 +0000 Subject: [PATCH 009/201] fix dictconfig --- scripts/train/train.py | 24 +++--------------------- 1 file changed, 3 insertions(+), 21 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index f70a4d0569..2946f0a694 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -65,7 +65,7 @@ class TrainConfig: dist_timeout: Union[int, float] = 600.0 eval_loader: Optional[Union[DictConfig, ListConfig]] = None icl_tasks: Optional[Union[ListConfig, str]] = None - fsdp_config: Optional[Dict[str, Any]] = None + fsdp_config: Optional[DictConfig] = None eval_loader: Optional[Union[DictConfig, ListConfig]] = None icl_tasks: Optional[Union[ListConfig, str]] = None eval_gauntlet: Optional[Union[DictConfig, str]] = None @@ -293,29 +293,11 @@ def main(cfg: DictConfig) -> Trainer: and not save_weights_only: autoresume_default = True - if cfg.get('autoresume') is None and autoresume_default: + if not scfg.autoresume and autoresume_default: log.info('As run_name, save_folder, and save_latest_filename are set, \ changing autoresume default to True...') - autoresume: bool = pop_config(cfg, - 'autoresume', - must_exist=False, - default_value=autoresume_default) - - # Pop known unused parameters that are used as interpolation variables or - # created by update_batch_size_info. - pop_config(cfg, 'data_local', must_exist=False) - pop_config(cfg, 'data_remote', must_exist=False) - pop_config(cfg, 'global_seed', must_exist=False) - pop_config(cfg, 'global_train_batch_size', must_exist=False) - pop_config(cfg, 'n_gpus', must_exist=False) - pop_config(cfg, 'device_train_grad_accum', must_exist=False) - - # Warn users for unused parameters - for key in cfg: - warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary.' 
- ) + autoresume: bool = scfg.autoresume # Warn if fsdp is enabled but user only has 1 GPU if dist.get_world_size() == 1 and fsdp_config is not None: From 839c61c84975b0231136895c3b1426629442d594 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 18:58:47 +0000 Subject: [PATCH 010/201] fix union of list|dict configs --- scripts/train/train.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 2946f0a694..70bded697b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -66,8 +66,6 @@ class TrainConfig: eval_loader: Optional[Union[DictConfig, ListConfig]] = None icl_tasks: Optional[Union[ListConfig, str]] = None fsdp_config: Optional[DictConfig] = None - eval_loader: Optional[Union[DictConfig, ListConfig]] = None - icl_tasks: Optional[Union[ListConfig, str]] = None eval_gauntlet: Optional[Union[DictConfig, str]] = None icl_subset_num_batches: Optional[int] = None icl_seq_len: Optional[int] = None @@ -277,7 +275,7 @@ def main(cfg: DictConfig) -> Trainer: int] = scfg.device_train_microbatch_size eval_subset_num_batches: int = scfg.eval_subset_num_batches eval_first: bool = scfg.eval_first - load_path: str = scfg.load_path + load_path: Optional[str] = scfg.load_path load_weights_only: bool = scfg.load_weights_only load_strict_model_weights: bool = scfg.load_strict_model_weights load_ignore_keys: Optional[List[str]] = scfg.load_ignore_keys From 710c9b0a511e7fb5f51263a9f40d0a07924f558e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 19:07:34 +0000 Subject: [PATCH 011/201] fix type annotation --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 70bded697b..464f6a2a55 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -280,7 +280,7 @@ def main(cfg: DictConfig) -> Trainer: load_strict_model_weights: bool = scfg.load_strict_model_weights load_ignore_keys: Optional[List[str]] = scfg.load_ignore_keys compile_config: Optional[Dict[str, Any]] = scfg.compile_config - metadata: Optional[Dict[str, str]] = convert_to_dict(scfg.metadata) + metadata: Optional[Dict[str, Any]] = convert_to_dict(scfg.metadata) should_log_config: bool = scfg.log_config # Enable autoresume from model checkpoints if possible From 142518a516f89b23c7bf2b309e40c5b0d5f0639a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 19:08:51 +0000 Subject: [PATCH 012/201] oops --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 464f6a2a55..bd1fdcdd6c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -92,7 +92,7 @@ class TrainConfig: load_strict_model_weights: bool = True load_ignore_keys: Optional[List[str]] = None compile_config: Optional[Dict[str, Any]] = None - metadata: Optional[Dict[str, str]] = None + metadata: Optional[DictConfig] = None log_config: bool = True autoresume: bool = False data_local: Optional[Dict[str, Any]] = None From 740840381fac198e5313e5ce640e969de72a7393 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 9 Apr 2024 21:33:36 +0000 Subject: [PATCH 013/201] fixed configs --- scripts/train/train.py | 86 ++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 32 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index bd1fdcdd6c..75b69b3e08 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -35,8 +35,8 @@ 
build_composer_model, build_evaluators, build_logger, build_optimizer, build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (convert_to_dict, log_config, - pop_config, process_init_device, +from llmfoundry.utils.config_utils import (log_config, pop_config, + process_init_device, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -45,11 +45,11 @@ @dataclass class TrainConfig: - model: DictConfig = MISSING - tokenizer: DictConfig = MISSING - optimizer: DictConfig = MISSING - scheduler: DictConfig = MISSING - train_loader: DictConfig = MISSING + model: Dict[str, Any] = MISSING + tokenizer: Dict[str, Any] = MISSING + optimizer: Dict[str, Any] = MISSING + scheduler: Dict[str, Any] = MISSING + train_loader: Dict[str, Any] = MISSING device_train_batch_size: int = MISSING device_eval_batch_size: int = MISSING max_duration: Union[int, str] = MISSING @@ -63,15 +63,18 @@ class TrainConfig: expandable_segments: bool = False cuda_load_lazy: bool = False dist_timeout: Union[int, float] = 600.0 - eval_loader: Optional[Union[DictConfig, ListConfig]] = None - icl_tasks: Optional[Union[ListConfig, str]] = None - fsdp_config: Optional[DictConfig] = None - eval_gauntlet: Optional[Union[DictConfig, str]] = None + eval_loader: Optional[Dict[str, Any]] = None + eval_loaders: Optional[List[Dict[str, Any]]] = None + icl_tasks: Optional[List[Dict[str, Any]]] = None + icl_tasks_str: Optional[str] = None + fsdp_config: Optional[Dict[str, Any]] = None + eval_gauntlet: Optional[Dict[str, Any]] = None + eval_gauntlet_str: Optional[str] = None icl_subset_num_batches: Optional[int] = None icl_seq_len: Optional[int] = None - loggers: Optional[DictConfig] = None - callbacks: Optional[DictConfig] = None - algorithms: Optional[DictConfig] = None + loggers: Optional[Dict[str, Any]] = None + callbacks: Optional[Dict[str, Any]] = None + algorithms: Optional[Dict[str, Any]] = None run_name: Optional[str] = None save_folder: Optional[str] = None save_latest_filename: Optional[str] = None @@ -92,25 +95,25 @@ class TrainConfig: load_strict_model_weights: bool = True load_ignore_keys: Optional[List[str]] = None compile_config: Optional[Dict[str, Any]] = None - metadata: Optional[DictConfig] = None + metadata: Optional[Dict[str, Any]] = None log_config: bool = True autoresume: bool = False - data_local: Optional[Dict[str, Any]] = None - data_remote: Optional[Dict[str, Any]] = None + data_local: Optional[str] = None + data_remote: Optional[str] = None global_seed: Optional[int] = None global_train_batch_size: Optional[int] = None n_gpus: Optional[int] = None device_train_grad_accum: Optional[int] = None - profiler: Optional[DictConfig] = None + profiler: Optional[Dict[str, Any]] = None def validate_config(cfg: TrainConfig): """Validates compatible model and dataloader selection.""" loaders = [cfg.train_loader] - if cfg.eval_loader is not None: + if cfg.eval_loader is not None or cfg.eval_loaders is not None: eval_loader = cfg.eval_loader - if isinstance(eval_loader, ListConfig): - for loader in eval_loader: + if isinstance(cfg.eval_loaders, ListConfig): + for loader in cfg.eval_loaders: if loader.label is None: raise ValueError( 'When specifying multiple evaluation datasets, each one must include the \ @@ -125,7 +128,7 @@ def validate_config(cfg: TrainConfig): f'Model type "{cfg.model.name}" is not supported when using the "text " ' +\ f'dataloader. 
Only finetuning is supported.') - if cfg.icl_tasks is not None: + if cfg.icl_tasks is not None or cfg.icl_tasks_str is not None: if cfg.model.name == 'hf_t5': raise ValueError( 'ICL evaluation does not currently support Encoder-Decoder models, such as "hf_t5".' @@ -169,6 +172,24 @@ def validate_config(cfg: TrainConfig): def main(cfg: DictConfig) -> Trainer: + # Resolve all interpolation variables as early as possible + om.resolve(cfg) + + if (loader := cfg.get('eval_loader', None)) is not None: + # structured config does not support unions of containers + if isinstance(loader, ListConfig): + loaders: Optional[ListConfig] = loader + cfg['eval_loaders'] = loaders + cfg.pop('eval_loader') + if (tasks := cfg.get('icl_tasks', None)) is not None: + if isinstance(tasks, str): + cfg['icl_tasks_str'] = tasks + cfg.pop('icl_tasks') + if (gauntlet := cfg.get('eval_gauntlet', None)) is not None: + if isinstance(gauntlet, str): + cfg['eval_gauntlet_str'] = gauntlet + cfg.pop('eval_gauntlet') + scfg: TrainConfig = OmegaConf.structured( TrainConfig(**cfg) ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) @@ -189,9 +210,6 @@ def main(cfg: DictConfig) -> Trainer: # Check for incompatibilities between the model and data loaders validate_config(scfg) - # Resolve all interpolation variables as early as possible - om.resolve(cfg) - # Create copy of config for logging logged_cfg: DictConfig = copy.deepcopy(cfg) @@ -228,16 +246,20 @@ def main(cfg: DictConfig) -> Trainer: # Mandatory model training configs model_config: DictConfig = scfg.model - tokenizer_config: Dict[str, Any] = convert_to_dict(scfg.tokenizer) - optimizer_config: Dict[str, Any] = convert_to_dict(scfg.optimizer) - scheduler_config: Dict[str, Any] = convert_to_dict(scfg.scheduler) + tokenizer_config: Dict[str, Any] = scfg.tokenizer + optimizer_config: Dict[str, Any] = scfg.optimizer + scheduler_config: Dict[str, Any] = scfg.scheduler train_loader_config: DictConfig = scfg.train_loader # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = convert_to_dict(scfg.fsdp_config) + fsdp_config: Optional[Dict[str, Any]] = scfg.fsdp_config - eval_loader_config: Optional[Union[DictConfig, - ListConfig]] = scfg.eval_loader + if scfg.eval_loader is not None and scfg.eval_loaders is not None: + raise ValueError( + 'Only one of `eval_loader` or `eval_loaders` should be provided.') + eval_loader_config: Optional[Union[ + DictConfig, + ListConfig]] = scfg.eval_loader if scfg.eval_loader is not None else scfg.eval_loaders icl_tasks_config: Optional[Union[ListConfig, str]] = scfg.icl_tasks eval_gauntlet_config: Optional[Union[DictConfig, str]] = scfg.eval_gauntlet icl_subset_num_batches: Optional[int] = scfg.icl_subset_num_batches @@ -280,7 +302,7 @@ def main(cfg: DictConfig) -> Trainer: load_strict_model_weights: bool = scfg.load_strict_model_weights load_ignore_keys: Optional[List[str]] = scfg.load_ignore_keys compile_config: Optional[Dict[str, Any]] = scfg.compile_config - metadata: Optional[Dict[str, Any]] = convert_to_dict(scfg.metadata) + metadata: Optional[Dict[str, Any]] = scfg.metadata should_log_config: bool = scfg.log_config # Enable autoresume from model checkpoints if possible From a1bf2b82df8002285670ae02747a84cf42ef8a05 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 00:38:24 +0000 Subject: [PATCH 014/201] add save ignore keys --- scripts/train/train.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/train/train.py b/scripts/train/train.py index 
75b69b3e08..2e1b0bd580 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -105,6 +105,7 @@ class TrainConfig: n_gpus: Optional[int] = None device_train_grad_accum: Optional[int] = None profiler: Optional[Dict[str, Any]] = None + save_ignore_keys: Optional[List[str]] = None def validate_config(cfg: TrainConfig): @@ -301,6 +302,7 @@ def main(cfg: DictConfig) -> Trainer: load_weights_only: bool = scfg.load_weights_only load_strict_model_weights: bool = scfg.load_strict_model_weights load_ignore_keys: Optional[List[str]] = scfg.load_ignore_keys + save_ignore_keys: Optional[List[str]] = scfg.save_ignore_keys compile_config: Optional[Dict[str, Any]] = scfg.compile_config metadata: Optional[Dict[str, Any]] = scfg.metadata should_log_config: bool = scfg.log_config From dcf4142351cd3866e617b375d1cb5d08d12d3474 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 01:31:10 +0000 Subject: [PATCH 015/201] fix batch size kerfuffle --- scripts/train/train.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 2289e9ffe4..5104143ad6 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -187,8 +187,15 @@ def main(cfg: DictConfig) -> Trainer: # Resolve all interpolation variables as early as possible om.resolve(cfg) + # Create copy of config for logging + logged_cfg: DictConfig = copy.deepcopy(cfg) + + # Get global and device batch size information from distributed/single node setting + cfg = update_batch_size_info(cfg) + logged_cfg.update(cfg, merge=True) + + # structured config does not support unions of containers, so separate single and plural containers if (loader := cfg.get('eval_loader', None)) is not None: - # structured config does not support unions of containers if isinstance(loader, ListConfig): loaders: Optional[ListConfig] = loader cfg['eval_loaders'] = loaders @@ -222,9 +229,6 @@ def main(cfg: DictConfig) -> Trainer: # Check for incompatibilities between the model and data loaders validate_config(scfg) - # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(cfg) - cuda_alloc_conf = [] # Get max split size mb max_split_size_mb: Optional[int] = scfg.max_split_size_mb @@ -252,10 +256,6 @@ def main(cfg: DictConfig) -> Trainer: dist_timeout: Union[int, float] = scfg.dist_timeout dist.initialize_dist(get_device(None), timeout=dist_timeout) - # Get global and device batch size information from distributed/single node setting - cfg = update_batch_size_info(cfg) - logged_cfg.update(cfg, merge=True) - # Mandatory model training configs model_config: DictConfig = scfg.model tokenizer_config: Dict[str, Any] = scfg.tokenizer From 8f1177b6b12ffcd0b834633865d92ff864646515 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 02:00:57 +0000 Subject: [PATCH 016/201] fix dictconfig stuff --- scripts/train/train.py | 36 ++++++++++++++++------------- tests/a_scripts/train/test_train.py | 6 +++-- 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 5104143ad6..92f7ae2714 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -115,7 +115,7 @@ def validate_config(cfg: TrainConfig): eval_loader = cfg.eval_loader if isinstance(cfg.eval_loaders, ListConfig): for loader in cfg.eval_loaders: - if loader.label is None: + if 'label' not in loader: raise ValueError( 'When specifying multiple evaluation datasets, each one must include the \ `label` attribute.') @@ -123,14 +123,14 @@ def 
validate_config(cfg: TrainConfig): else: loaders.append(eval_loader) for loader in loaders: - if loader.name == 'text': - if cfg.model.name == 'hf_t5': + if loader['name'] == 'text': + if cfg.model['name'] == 'hf_t5': raise ValueError( - f'Model type "{cfg.model.name}" is not supported when using the "text " ' +\ + f'Model type "{cfg.model["name"]}" is not supported when using the "text " ' +\ f'dataloader. Only finetuning is supported.') if cfg.icl_tasks is not None or cfg.icl_tasks_str is not None: - if cfg.model.name == 'hf_t5': + if cfg.model['name'] == 'hf_t5': raise ValueError( 'ICL evaluation does not currently support Encoder-Decoder models, such as "hf_t5".' ) @@ -175,8 +175,8 @@ def validate_config(cfg: TrainConfig): 'mptmlp') in ('mb_moe', 'mb_dmoe'): moe_world_size = cfg.model.get('ffn_config', {}).get('moe_world_size', 1) - use_orig_params = cfg.get('fsdp_config', - {}).get('use_orig_params', True) + use_orig_params = cfg.fsdp_config.get( + 'use_orig_params', True) if cfg.fsdp_config is not None else True if moe_world_size > 1 and not use_orig_params: raise ValueError( f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.' @@ -257,11 +257,11 @@ def main(cfg: DictConfig) -> Trainer: dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - model_config: DictConfig = scfg.model + model_config: Dict[str, Any] = scfg.model tokenizer_config: Dict[str, Any] = scfg.tokenizer optimizer_config: Dict[str, Any] = scfg.optimizer scheduler_config: Dict[str, Any] = scfg.scheduler - train_loader_config: DictConfig = scfg.train_loader + train_loader_config: Dict[str, Any] = scfg.train_loader # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[Dict[str, Any]] = scfg.fsdp_config @@ -269,17 +269,21 @@ def main(cfg: DictConfig) -> Trainer: if scfg.eval_loader is not None and scfg.eval_loaders is not None: raise ValueError( 'Only one of `eval_loader` or `eval_loaders` should be provided.') - eval_loader_config: Optional[Union[ + eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[ + str, + Any]]]] = scfg.eval_loader if scfg.eval_loader is not None else scfg.eval_loaders + icl_tasks_config: Optional[Union[ + List[Dict[str, Any]], + str]] = scfg.icl_tasks if scfg.icl_tasks is not None else scfg.icl_tasks_str + eval_gauntlet_config: Optional[Union[ DictConfig, - ListConfig]] = scfg.eval_loader if scfg.eval_loader is not None else scfg.eval_loaders - icl_tasks_config: Optional[Union[ListConfig, str]] = scfg.icl_tasks - eval_gauntlet_config: Optional[Union[DictConfig, str]] = scfg.eval_gauntlet + str]] = scfg.eval_gauntlet if scfg.eval_gauntlet is not None else scfg.eval_gauntlet_str icl_subset_num_batches: Optional[int] = scfg.icl_subset_num_batches icl_seq_len: Optional[int] = scfg.icl_seq_len # Optional logging, evaluation and callback configs - logger_configs: Optional[DictConfig] = scfg.loggers - callback_configs: Optional[DictConfig] = scfg.callbacks - algorithm_configs: Optional[DictConfig] = scfg.algorithms + logger_configs: Optional[Dict[str, Any]] = scfg.loggers + callback_configs: Optional[Dict[str, Any]] = scfg.callbacks + algorithm_configs: Optional[Dict[str, Any]] = scfg.algorithms # Mandatory hyperparameters for training device_train_batch_size: int = scfg.device_train_batch_size diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index ff885ac735..3878b22704 100644 --- a/tests/a_scripts/train/test_train.py +++ 
b/tests/a_scripts/train/test_train.py @@ -11,7 +11,8 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om -from scripts.train.train import main, validate_config # noqa: E402 +from llmfoundry.utils.config_utils import update_batch_size_info +from scripts.train.train import TrainConfig, main, validate_config # noqa: E402 from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, gpt_tiny_cfg) from tests.fixtures.autouse import REPO_DIR @@ -156,12 +157,13 @@ def test_validate_config(): test_cfg: DictConfig = om.load(f) # type: ignore test_cfg.model.ffn_config.moe_world_size = 4 test_cfg.fsdp_config.use_orig_params = False + test_cfg = update_batch_size_info(test_cfg) with pytest.raises( ValueError, match= 'MoEs with expert parallelism (.*) require `use_orig_params=True`.' ): - validate_config(test_cfg) + validate_config(om.structured(TrainConfig(**test_cfg))) def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path): From 4965cd2e405c14e563676434aef9b3793061de17 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 02:42:38 +0000 Subject: [PATCH 017/201] fix dictconfig stuff again --- scripts/train/train.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 92f7ae2714..a1d6741d7c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -121,7 +121,8 @@ def validate_config(cfg: TrainConfig): `label` attribute.') loaders.append(loader) else: - loaders.append(eval_loader) + if eval_loader is not None: + loaders.append(eval_loader) for loader in loaders: if loader['name'] == 'text': if cfg.model['name'] == 'hf_t5': @@ -257,11 +258,11 @@ def main(cfg: DictConfig) -> Trainer: dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - model_config: Dict[str, Any] = scfg.model + model_config: DictConfig = DictConfig(scfg.model) tokenizer_config: Dict[str, Any] = scfg.tokenizer optimizer_config: Dict[str, Any] = scfg.optimizer scheduler_config: Dict[str, Any] = scfg.scheduler - train_loader_config: Dict[str, Any] = scfg.train_loader + train_loader_config: DictConfig = DictConfig(scfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[Dict[str, Any]] = scfg.fsdp_config @@ -269,20 +270,20 @@ def main(cfg: DictConfig) -> Trainer: if scfg.eval_loader is not None and scfg.eval_loaders is not None: raise ValueError( 'Only one of `eval_loader` or `eval_loaders` should be provided.') - eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[ - str, - Any]]]] = scfg.eval_loader if scfg.eval_loader is not None else scfg.eval_loaders - icl_tasks_config: Optional[Union[ - List[Dict[str, Any]], - str]] = scfg.icl_tasks if scfg.icl_tasks is not None else scfg.icl_tasks_str - eval_gauntlet_config: Optional[Union[ - DictConfig, - str]] = scfg.eval_gauntlet if scfg.eval_gauntlet is not None else scfg.eval_gauntlet_str + eval_loader_config: Optional[Union[DictConfig, ListConfig]] = DictConfig( + scfg.eval_loader) if scfg.eval_loader is not None else ListConfig( + scfg.eval_loaders) if scfg.eval_loaders is not None else None + icl_tasks_config: Optional[Union[ListConfig, str]] = ListConfig( + scfg.icl_tasks) if scfg.icl_tasks is not None else scfg.icl_tasks_str + eval_gauntlet_config: Optional[Union[DictConfig, str]] = DictConfig( + scfg.eval_gauntlet + ) if scfg.eval_gauntlet is not None else scfg.eval_gauntlet_str icl_subset_num_batches: Optional[int] = 
scfg.icl_subset_num_batches icl_seq_len: Optional[int] = scfg.icl_seq_len # Optional logging, evaluation and callback configs logger_configs: Optional[Dict[str, Any]] = scfg.loggers - callback_configs: Optional[Dict[str, Any]] = scfg.callbacks + callback_configs: Optional[DictConfig] = DictConfig( + scfg.callbacks) if scfg.callbacks else None algorithm_configs: Optional[Dict[str, Any]] = scfg.algorithms # Mandatory hyperparameters for training @@ -392,7 +393,8 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg: Optional[DictConfig] = scfg.profiler + profiler_cfg: Optional[DictConfig] = DictConfig( + scfg.profiler) if scfg.profiler is not None else None if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', From 53889fd1a4ee63d53cc8dbd4e453a0399f7b9bd4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 03:10:27 +0000 Subject: [PATCH 018/201] fix --- llmfoundry/utils/builders.py | 6 ++++-- scripts/train/train.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a8c660df70..900f5ed384 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -388,7 +388,7 @@ def build_optimizer( 'optimizer config. Please remove it from the optimizer config kwargs.' ) - kwargs['params'] = params + kwargs['params'] = list(params) return construct_from_registry(name=name, registry=registry.optimizers, partial_function=True, @@ -438,7 +438,9 @@ def build_tokenizer( int(1e30), ) - if not hasattr(tokenizer, 'eos_token') or tokenizer.eos_token is None: + if not hasattr( + tokenizer, 'eos_token' + ) or tokenizer.eos_token is None: # type: ignore (sometime's it's not none but that's ok too) raise ValueError( f'The tokenizer {tokenizer_name} must have an eos_token.') diff --git a/scripts/train/train.py b/scripts/train/train.py index a1d6741d7c..4255bd3276 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -113,7 +113,7 @@ def validate_config(cfg: TrainConfig): loaders = [cfg.train_loader] if cfg.eval_loader is not None or cfg.eval_loaders is not None: eval_loader = cfg.eval_loader - if isinstance(cfg.eval_loaders, ListConfig): + if isinstance(cfg.eval_loaders, list): for loader in cfg.eval_loaders: if 'label' not in loader: raise ValueError( From 76699541e796d15831415b4134e7996fc0eb1107 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 03:32:37 +0000 Subject: [PATCH 019/201] fix --- llmfoundry/utils/builders.py | 2 +- scripts/train/train.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 900f5ed384..5cdbb4ee62 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -388,7 +388,7 @@ def build_optimizer( 'optimizer config. Please remove it from the optimizer config kwargs.' 
) - kwargs['params'] = list(params) + kwargs['params'] = params return construct_from_registry(name=name, registry=registry.optimizers, partial_function=True, diff --git a/scripts/train/train.py b/scripts/train/train.py index 4255bd3276..860c0f2854 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -259,9 +259,9 @@ def main(cfg: DictConfig) -> Trainer: # Mandatory model training configs model_config: DictConfig = DictConfig(scfg.model) - tokenizer_config: Dict[str, Any] = scfg.tokenizer - optimizer_config: Dict[str, Any] = scfg.optimizer - scheduler_config: Dict[str, Any] = scfg.scheduler + tokenizer_config: Dict[str, Any] = {**scfg.tokenizer} + optimizer_config: Dict[str, Any] = {**scfg.optimizer} + scheduler_config: Dict[str, Any] = {**scfg.scheduler} train_loader_config: DictConfig = DictConfig(scfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs From ba0783df80fba1872dc301b96b4d751068bce7cd Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 04:30:55 +0000 Subject: [PATCH 020/201] updated unit tests for variables --- scripts/train/train.py | 36 ++++++++++++++++------ tests/a_scripts/train/test_train_inputs.py | 4 +-- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 860c0f2854..e7c7d61841 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -7,7 +7,7 @@ import sys import time import warnings -from dataclasses import dataclass +from dataclasses import dataclass, fields from typing import Any, Dict, List, Optional, Union import torch @@ -106,6 +106,10 @@ class TrainConfig: device_train_grad_accum: Optional[int] = None profiler: Optional[Dict[str, Any]] = None save_ignore_keys: Optional[List[str]] = None + variables: Optional[Dict[str, Any]] = None + + +TRAIN_CONFIG_KEYS = set(field.name for field in fields(TrainConfig)) def validate_config(cfg: TrainConfig): @@ -113,7 +117,8 @@ def validate_config(cfg: TrainConfig): loaders = [cfg.train_loader] if cfg.eval_loader is not None or cfg.eval_loaders is not None: eval_loader = cfg.eval_loader - if isinstance(cfg.eval_loaders, list): + if isinstance(cfg.eval_loaders, list) or isinstance( + cfg.eval_loaders, ListConfig): for loader in cfg.eval_loaders: if 'label' not in loader: raise ValueError( @@ -188,13 +193,6 @@ def main(cfg: DictConfig) -> Trainer: # Resolve all interpolation variables as early as possible om.resolve(cfg) - # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(cfg) - - # Get global and device batch size information from distributed/single node setting - cfg = update_batch_size_info(cfg) - logged_cfg.update(cfg, merge=True) - # structured config does not support unions of containers, so separate single and plural containers if (loader := cfg.get('eval_loader', None)) is not None: if isinstance(loader, ListConfig): @@ -210,6 +208,26 @@ def main(cfg: DictConfig) -> Trainer: cfg['eval_gauntlet_str'] = gauntlet cfg.pop('eval_gauntlet') + arg_config_keys = set(cfg.keys()) + extraneous_keys = set.difference(arg_config_keys, TRAIN_CONFIG_KEYS) + + if 'variables' not in cfg: + cfg['variables'] = {} + + for key in extraneous_keys: + warnings.warn( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes.' + ) + # TODO (milo): delete the below line once we deprecate variables at the top level. 
+ cfg['variables'][key] = cfg.pop(key) + + # Create copy of config for logging + logged_cfg: DictConfig = copy.deepcopy(cfg) + + # Get global and device batch size information from distributed/single node setting + cfg = update_batch_size_info(cfg) + logged_cfg.update(cfg, merge=True) + scfg: TrainConfig = OmegaConf.structured( TrainConfig(**cfg) ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index 5eb24e05c8..eaf7a72589 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -61,7 +61,7 @@ def cfg(self, foundry_dir: str) -> DictConfig: def test_misspelled_mandatory_params_fail(self, cfg: DictConfig) -> None: """Check that mandatory misspelled inputs fail to train.""" cfg.trai_loader = cfg.pop('train_loader') - with pytest.raises(omegaconf.errors.ConfigAttributeError): + with pytest.raises(TypeError): main(cfg) def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: @@ -80,7 +80,7 @@ def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: for param in mandatory_params: orig_param = cfg.pop(param) with pytest.raises( - (omegaconf.errors.ConfigAttributeError, NameError)): + (omegaconf.errors.MissingMandatoryValue, NameError)): main(cfg) cfg[param] = orig_param From 23493908e0bcb1508d8336ae66fbbf5315fe14ef Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 05:00:14 +0000 Subject: [PATCH 021/201] last fix? --- scripts/train/train.py | 10 +++------- tests/a_scripts/train/test_train_inputs.py | 2 +- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index e7c7d61841..615ee7819f 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -196,17 +196,13 @@ def main(cfg: DictConfig) -> Trainer: # structured config does not support unions of containers, so separate single and plural containers if (loader := cfg.get('eval_loader', None)) is not None: if isinstance(loader, ListConfig): - loaders: Optional[ListConfig] = loader - cfg['eval_loaders'] = loaders - cfg.pop('eval_loader') + cfg['eval_loaders'] = list(cfg.pop('eval_loader')) if (tasks := cfg.get('icl_tasks', None)) is not None: if isinstance(tasks, str): - cfg['icl_tasks_str'] = tasks - cfg.pop('icl_tasks') + cfg['icl_tasks_str'] = cfg.pop('icl_tasks') if (gauntlet := cfg.get('eval_gauntlet', None)) is not None: if isinstance(gauntlet, str): - cfg['eval_gauntlet_str'] = gauntlet - cfg.pop('eval_gauntlet') + cfg['eval_gauntlet_str'] = cfg.pop('eval_gauntlet') arg_config_keys = set(cfg.keys()) extraneous_keys = set.difference(arg_config_keys, TRAIN_CONFIG_KEYS) diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index eaf7a72589..f69d66e847 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -61,7 +61,7 @@ def cfg(self, foundry_dir: str) -> DictConfig: def test_misspelled_mandatory_params_fail(self, cfg: DictConfig) -> None: """Check that mandatory misspelled inputs fail to train.""" cfg.trai_loader = cfg.pop('train_loader') - with pytest.raises(TypeError): + with pytest.raises((omegaconf.errors.MissingMandatoryValue, TypeError)): main(cfg) def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: From 0acd8c77db932709d8c961a05865d9ae5e62d50d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 05:22:26 
+0000 Subject: [PATCH 022/201] if this test case does not pass I will venmo Mihir 0 --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 615ee7819f..1f40114f2d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -120,7 +120,7 @@ def validate_config(cfg: TrainConfig): if isinstance(cfg.eval_loaders, list) or isinstance( cfg.eval_loaders, ListConfig): for loader in cfg.eval_loaders: - if 'label' not in loader: + if 'label' not in loader or loader['label'] is not None: raise ValueError( 'When specifying multiple evaluation datasets, each one must include the \ `label` attribute.') From 6a3d43ab2595d1099bcf0c0d58bcb7431cffbcfa Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 10 Apr 2024 05:42:05 +0000 Subject: [PATCH 023/201] remove a 'not' -- eg. 'I am not going crazy' --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 1f40114f2d..d256796a3d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -120,7 +120,7 @@ def validate_config(cfg: TrainConfig): if isinstance(cfg.eval_loaders, list) or isinstance( cfg.eval_loaders, ListConfig): for loader in cfg.eval_loaders: - if 'label' not in loader or loader['label'] is not None: + if 'label' not in loader or loader['label'] is None: raise ValueError( 'When specifying multiple evaluation datasets, each one must include the \ `label` attribute.') From 704195e0b68d9403db8171f53d3e28e3fd516f0f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 16 Apr 2024 11:57:34 -0400 Subject: [PATCH 024/201] Update scripts/train/train.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 0fcf77f78b..c6b85851d1 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -194,7 +194,7 @@ def main(cfg: DictConfig) -> Trainer: # Resolve all interpolation variables as early as possible om.resolve(cfg) - # structured config does not support unions of containers, so separate single and plural containers + # Structured config does not support unions of containers, so separate single and plural containers if (loader := cfg.get('eval_loader', None)) is not None: if isinstance(loader, ListConfig): cfg['eval_loaders'] = list(cfg.pop('eval_loader')) From ef0554c6990bde48c3a9a91dafc22d4526c74091 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 03:33:32 +0000 Subject: [PATCH 025/201] set amp bf16 as default precision, etc --- scripts/eval/eval.py | 148 +++++++++++++++++++---------------------- scripts/train/train.py | 2 +- 2 files changed, 70 insertions(+), 80 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 22108d4c75..6a165dc874 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -7,6 +7,7 @@ import sys import time import warnings +from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import pandas as pd @@ -15,7 +16,7 @@ from composer.loggers.logger_destination import LoggerDestination from composer.trainer import Trainer from composer.utils import dist, get_device, reproducibility -from omegaconf import DictConfig, ListConfig +from omegaconf import MISSING, DictConfig, ListConfig from omegaconf import OmegaConf as om from rich.traceback import install @@ -27,8 +28,7 @@ build_callback, 
build_composer_model, build_evaluators, build_logger, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, pop_config, - process_init_device) +from llmfoundry.utils.config_utils import log_config, process_init_device from llmfoundry.utils.registry_utils import import_file log = logging.getLogger(__name__) @@ -164,92 +164,81 @@ def evaluate_model( return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) -def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: - # Run user provided code if specified - code_paths = pop_config(cfg, - 'code_paths', - must_exist=False, - default_value=[], - convert=True) - for code_path in code_paths: - import_file(code_path) +@dataclass +class EvalConfig: + models: List[Dict[str, Any]] = MISSING + + code_paths: Optional[List[str]] = None + eval_gauntlet: Optional[Dict[str, Any]] = None + eval_gauntlet_str: Optional[str] = None + fsdp_config: Optional[Dict[str, Any]] = None + icl_tasks: Union[str, List[str]] = MISSING + max_seq_len: int = MISSING + device_eval_batch_size: int = MISSING + precision: str = 'amp_bf16' + python_log_level: Optional[str] = None + eval_loader: Optional[Dict[str, Any]] = None + eval_loaders: Optional[List[Dict[str, Any]]] = None + + seed: int = 17 + dist_timeout: Union[float, int] = 600.0 + run_name: Optional[str] = None + loggers: Optional[Dict[str, Any]] = None + eval_subset_num_batches: int = -1 + icl_subset_num_batches: Optional[int] = None + metadata: Optional[Dict[str, str]] = None + log_config: bool = True + model_name_or_path: Optional[str] = None + callbacks: Optional[Dict[str, Any]] = None + +def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: om.resolve(cfg) + # flatten union types before creating structured config: + if 'eval_gauntlet' in cfg: + if isinstance(cfg.eval_gauntlet, str): + cfg.eval_gauntlet_str = cfg.pop('eval_gauntlet') + if 'eval_loader' in cfg: + if isinstance(cfg.eval_loader, ListConfig): + cfg.eval_loaders = cfg.pop('eval_loader') + + scfg: EvalConfig = om.structured(EvalConfig(**cfg)) # Create copy of config for logging logged_cfg: DictConfig = copy.deepcopy(cfg) - model_configs: ListConfig = pop_config(cfg, 'models', must_exist=True) - eval_gauntlet_config: Optional[Union[str, DictConfig]] = pop_config( - cfg, 'eval_gauntlet', must_exist=False, default_value=None) - - fsdp_dict_cfg: Optional[DictConfig] = pop_config(cfg, - 'fsdp_config', - must_exist=False, - default_value=None) - fsdp_config: Optional[Dict] = om.to_container( - fsdp_dict_cfg, - resolve=True) if fsdp_dict_cfg is not None else None # type: ignore + # Run user provided code if specified + code_paths = scfg.code_paths + for code_path in (code_paths or []): + import_file(code_path) + + model_configs = scfg.models + eval_gauntlet_config = scfg.eval_gauntlet if scfg.eval_gauntlet else scfg.eval_gauntlet_str + + fsdp_config = scfg.fsdp_config + assert isinstance(fsdp_config, Dict) or fsdp_config is None # Mandatory Evaluation Parameters - icl_tasks: Union[str, ListConfig] = pop_config(cfg, - 'icl_tasks', - must_exist=True) - max_seq_len: int = pop_config(cfg, 'max_seq_len', must_exist=True) - device_eval_batch_size: int = pop_config(cfg, - 'device_eval_batch_size', - must_exist=True) - precision: str = pop_config(cfg, - 'precision', - must_exist=False, - default_value=None) - python_log_level: Optional[str] = pop_config(cfg, - 'python_log_level', - must_exist=False, - default_value='debug') + icl_tasks = scfg.icl_tasks + max_seq_len = scfg.max_seq_len + device_eval_batch_size = 
scfg.device_eval_batch_size + precision = scfg.precision + python_log_level: Optional[str] = scfg.python_log_level # Optional Evaluation Parameters with default values - eval_loader_config: Optional[Union[DictConfig, ListConfig]] = pop_config( - cfg, 'eval_loader', must_exist=False, default_value=None) - seed: int = pop_config(cfg, 'seed', must_exist=False, default_value=17) - dist_timeout: Union[float, int] = pop_config(cfg, - 'dist_timeout', - must_exist=False, - default_value=600.0) + eval_loader_config = scfg.eval_loader if scfg.eval_loader else scfg.eval_loaders + seed = scfg.seed + dist_timeout = scfg.dist_timeout default_run_name: str = os.environ.get('RUN_NAME', 'llm') - run_name: str = pop_config(cfg, - 'run_name', - must_exist=False, - default_value=default_run_name) - loggers_cfg: Dict[str, Any] = pop_config(cfg, - 'loggers', - must_exist=False, - default_value={}) - eval_subset_num_batches: int = pop_config(cfg, - 'eval_subset_num_batches', - must_exist=False, - default_value=-1) - icl_subset_num_batches: Optional[int] = pop_config(cfg, - 'icl_subset_num_batches', - must_exist=False, - default_value=None) - metadata: Optional[Dict[str, str]] = pop_config(cfg, - 'metadata', - must_exist=False, - default_value=None, - convert=True) - should_log_config: bool = pop_config(cfg, - 'log_config', - must_exist=False, - default_value=True) - - # Pop out interpolation variables. - pop_config(cfg, 'model_name_or_path', must_exist=False, default_value=None) - callback_configs: Optional[DictConfig] = pop_config(cfg, - 'callbacks', - must_exist=False, - default_value=None) + run_name = scfg.run_name if scfg.run_name else default_run_name + loggers_cfg = scfg.loggers + eval_subset_num_batches = scfg.eval_subset_num_batches + icl_subset_num_batches = scfg.icl_subset_num_batches + metadata = scfg.metadata + should_log_config = scfg.log_config + + callback_configs = scfg.callbacks # Warn for unused parameters for key in cfg: @@ -276,7 +265,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: loggers: List[LoggerDestination] = [ build_logger(name, logger_cfg) - for name, logger_cfg in loggers_cfg.items() + for name, logger_cfg in (loggers_cfg or {}).items() ] mosaicml_logger = find_mosaicml_logger(loggers) @@ -326,9 +315,10 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for b in t.benchmarks: benchmark_to_taxonomy[b.name] = t.name + assert 'model_name' in model_cfg, 'model_name must be specified in model config' model_results = calculate_markdown_results(logger_keys, trainer, benchmark_to_taxonomy, - model_cfg.model_name) + model_cfg['model_name']) if models_df is None: models_df = model_results diff --git a/scripts/train/train.py b/scripts/train/train.py index d256796a3d..279edd15dd 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -54,7 +54,7 @@ class TrainConfig: device_eval_batch_size: int = MISSING max_duration: Union[int, str] = MISSING eval_interval: Union[int, str] = MISSING - precision: str = MISSING + precision: str = 'amp_bf16' max_seq_len: int = MISSING seed: int = MISSING From cb0ad66df54cb320902ad5bc256b698edd39c7f4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 17:16:18 +0000 Subject: [PATCH 026/201] temporarily wrap with dictconfig before ** migration --- scripts/eval/eval.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 6a165dc874..55f60ad5eb 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -172,7 
+172,8 @@ class EvalConfig: eval_gauntlet: Optional[Dict[str, Any]] = None eval_gauntlet_str: Optional[str] = None fsdp_config: Optional[Dict[str, Any]] = None - icl_tasks: Union[str, List[str]] = MISSING + icl_tasks: Optional[List[str]] = MISSING + icl_tasks_str: Optional[str] = None max_seq_len: int = MISSING device_eval_batch_size: int = MISSING precision: str = 'amp_bf16' @@ -202,6 +203,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: if 'eval_loader' in cfg: if isinstance(cfg.eval_loader, ListConfig): cfg.eval_loaders = cfg.pop('eval_loader') + if 'icl_tasks' in cfg: + if isinstance(cfg.icl_tasks, str): + cfg.icl_tasks_str = cfg.pop('icl_tasks') scfg: EvalConfig = om.structured(EvalConfig(**cfg)) # Create copy of config for logging @@ -212,22 +216,26 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for code_path in (code_paths or []): import_file(code_path) - model_configs = scfg.models - eval_gauntlet_config = scfg.eval_gauntlet if scfg.eval_gauntlet else scfg.eval_gauntlet_str + model_configs = ListConfig(scfg.models) + eval_gauntlet_config = DictConfig( + scfg.eval_gauntlet) if scfg.eval_gauntlet else scfg.eval_gauntlet_str fsdp_config = scfg.fsdp_config assert isinstance(fsdp_config, Dict) or fsdp_config is None # Mandatory Evaluation Parameters - icl_tasks = scfg.icl_tasks + icl_tasks = ListConfig( + scfg.icl_tasks) if scfg.icl_tasks else scfg.icl_tasks_str max_seq_len = scfg.max_seq_len device_eval_batch_size = scfg.device_eval_batch_size precision = scfg.precision python_log_level: Optional[str] = scfg.python_log_level # Optional Evaluation Parameters with default values - eval_loader_config = scfg.eval_loader if scfg.eval_loader else scfg.eval_loaders + eval_loader_config = DictConfig( + scfg.eval_loader) if scfg.eval_loader else ListConfig( + scfg.eval_loaders) if scfg.eval_loaders else None seed = scfg.seed dist_timeout = scfg.dist_timeout default_run_name: str = os.environ.get('RUN_NAME', 'llm') @@ -238,7 +246,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: metadata = scfg.metadata should_log_config = scfg.log_config - callback_configs = scfg.callbacks + callback_configs = DictConfig(scfg.callbacks) # Warn for unused parameters for key in cfg: From 560e574d36fbb1f8bcd6c6d3a142e09297e3852c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 17:26:11 +0000 Subject: [PATCH 027/201] fix icl tasks --- scripts/eval/eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 55f60ad5eb..4157ed9260 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -225,8 +225,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: assert isinstance(fsdp_config, Dict) or fsdp_config is None # Mandatory Evaluation Parameters - icl_tasks = ListConfig( + icl_tasks: Union[ListConfig, str, None] = ListConfig( scfg.icl_tasks) if scfg.icl_tasks else scfg.icl_tasks_str + assert icl_tasks is not None, 'icl_tasks must be specified in the config' max_seq_len = scfg.max_seq_len device_eval_batch_size = scfg.device_eval_batch_size precision = scfg.precision From 346a87557174eb1a1a17f92751185c41fa34c7bb Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 17:45:04 +0000 Subject: [PATCH 028/201] fix --- scripts/train/train.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index c3f8b60a6a..c4e0eaa54b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py 
@@ -153,18 +153,19 @@ def validate_config(cfg: TrainConfig): if (cfg.model.get('fc_type', 'torch') == 'te' or 'te' in cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp')): - fsdp_config = cfg.fsdp_config + fsdp_config = cfg.fsdp_config or DictConfig({}) act_ckpt = fsdp_config.get('activation_checkpointing', False) if fsdp_config else False act_ckpt_reentrant = fsdp_config.get( 'activation_checkpointing_reentrant', False) - if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == True: + if act_ckpt == True and act_ckpt_reentrant == True: warnings.warn( '`te.Linear` layers do not support activation_checkpointing with ' + '`activation_checkpointing_reentrant = True`. ' + 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' ) - cfg.fsdp_config.activation_checkpointing_reentrant = False + if cfg.fsdp_config is not None: + cfg.fsdp_config.activation_checkpointing_reentrant = False if cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp') == 'te_ln_mlp': warnings.warn( From f57a83ad3aa9201f5718dbdab309bc36a60ec2aa Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 17:51:24 +0000 Subject: [PATCH 029/201] fix activation checkpointing reentrant --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index c4e0eaa54b..bcbf1fb374 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -165,7 +165,7 @@ def validate_config(cfg: TrainConfig): 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' ) if cfg.fsdp_config is not None: - cfg.fsdp_config.activation_checkpointing_reentrant = False + cfg.fsdp_config['activation_checkpointing_reentrant'] = False if cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp') == 'te_ln_mlp': warnings.warn( From fff864a9b18b722b403f9c7a69fbdedff02861d7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 18:12:30 +0000 Subject: [PATCH 030/201] fix extraneous keys --- scripts/eval/eval.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 4157ed9260..2ea740ed2b 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -7,7 +7,7 @@ import sys import time import warnings -from dataclasses import dataclass +from dataclasses import dataclass, fields from typing import Any, Dict, List, Optional, Tuple, Union import pandas as pd @@ -193,6 +193,9 @@ class EvalConfig: callbacks: Optional[Dict[str, Any]] = None +EVAL_CONFIG_KEYS = set(field.name for field in fields(EvalConfig)) + + def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: om.resolve(cfg) @@ -207,6 +210,19 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: if isinstance(cfg.icl_tasks, str): cfg.icl_tasks_str = cfg.pop('icl_tasks') + arg_config_keys = set(cfg.keys()) + extraneous_keys = set.difference(arg_config_keys, EVAL_CONFIG_KEYS) + + if 'variables' not in cfg: + cfg['variables'] = {} + + for key in extraneous_keys: + warnings.warn( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes.' + ) + # TODO (milo): delete the below line once we deprecate variables at the top level. 
+ cfg['variables'][key] = cfg.pop(key) + scfg: EvalConfig = om.structured(EvalConfig(**cfg)) # Create copy of config for logging logged_cfg: DictConfig = copy.deepcopy(cfg) From ed8a94d539fa6f00bf62df1f80ee828dd323c0e4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 18:31:01 +0000 Subject: [PATCH 031/201] first round ** --- scripts/eval/eval.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 2ea740ed2b..3af44a4488 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -35,7 +35,9 @@ def evaluate_model( - model_cfg: DictConfig, + tokenizer: Dict[str, Any], + model_name: str, + model: Dict[str, Any], dist_timeout: Union[float, int], run_name: str, seed: int, @@ -55,13 +57,12 @@ def evaluate_model( metadata: Optional[Dict[str, str]], logged_config: DictConfig, should_log_config: bool = True, + load_path: Optional[str] = None, ): - log.info(f'Evaluating model: {model_cfg.model_name}') + log.info(f'Evaluating model: {model_name}') # Build tokenizer and model - tokenizer_cfg: Dict[str, - Any] = om.to_container(model_cfg.tokenizer, - resolve=True) # type: ignore + tokenizer_cfg = tokenizer tokenizer_name = tokenizer_cfg['name'] tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) @@ -93,16 +94,16 @@ def evaluate_model( mosaicml_logger.log_metrics(metadata) mosaicml_logger._flush_metadata(force_flush=True) - if fsdp_config and model_cfg.model.get('load_in_8bit', False): + if fsdp_config and model.get('load_in_8bit', False): raise ValueError( 'The FSDP config block is not supported when loading ' + 'Hugging Face models in 8bit.') - init_context = process_init_device(model_cfg.model, fsdp_config) + init_context = process_init_device(model, fsdp_config) composer_model = build_composer_model( - name=model_cfg.model.name, - cfg=model_cfg.model, + name=model['name'], + cfg=model, tokenizer=tokenizer, init_context=init_context, ) @@ -119,8 +120,7 @@ def evaluate_model( [avg for avg in eval_gauntlet_callback.averages] + [t.name for t in eval_gauntlet_callback.categories]) - load_path = model_cfg.get('load_path', None) - if model_cfg.model.name == 'mpt_causal_lm' and load_path is None: + if model['name'] == 'mpt_causal_lm' and load_path is None: raise ValueError( 'MPT causal LMs require a load_path to the checkpoint for model evaluation.' 
+ @@ -129,7 +129,7 @@ def evaluate_model( assert composer_model is not None - log.info(f'Building trainer for {model_cfg.model_name}...') + log.info(f'Building trainer for {model_name}...') trainer = Trainer( run_name=run_name, seed=seed, @@ -150,7 +150,7 @@ def evaluate_model( log.info('Evaluation config:') log_config(logged_config) - log.info(f'Starting eval for {model_cfg.model_name}...') + log.info(f'Starting eval for {model_name}...') if torch.cuda.is_available(): torch.cuda.synchronize() a = time.time() @@ -160,7 +160,7 @@ def evaluate_model( torch.cuda.synchronize() b = time.time() - log.info(f'Ran {model_cfg.model_name} eval in: {b-a} seconds') + log.info(f'Ran {model_name} eval in: {b-a} seconds') return (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) @@ -192,6 +192,8 @@ class EvalConfig: model_name_or_path: Optional[str] = None callbacks: Optional[Dict[str, Any]] = None + variables: Optional[Dict[str, Any]] = None # variables to ignore + EVAL_CONFIG_KEYS = set(field.name for field in fields(EvalConfig)) @@ -308,7 +310,6 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for model_cfg in model_configs: (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) = evaluate_model( - model_cfg=model_cfg, dist_timeout=dist_timeout, run_name=run_name, seed=seed, @@ -327,7 +328,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: icl_subset_num_batches=icl_subset_num_batches, metadata=metadata, logged_config=logged_cfg, - should_log_config=should_log_config) + should_log_config=should_log_config, + **model_cfg) trainers.append(trainer) if eval_gauntlet_callback is not None: From 42939aaaa489df7ff8b76f5d69a64084a8182ef6 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 18:39:54 +0000 Subject: [PATCH 032/201] fix? 
--- scripts/eval/eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 3af44a4488..0ef2bf607c 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -99,11 +99,11 @@ def evaluate_model( 'The FSDP config block is not supported when loading ' + 'Hugging Face models in 8bit.') - init_context = process_init_device(model, fsdp_config) + init_context = process_init_device(DictConfig(model), fsdp_config) composer_model = build_composer_model( name=model['name'], - cfg=model, + cfg=DictConfig(model), tokenizer=tokenizer, init_context=init_context, ) From 8e981aa983e3118021ea1b4fb7d19f7342420d2c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 18:51:28 +0000 Subject: [PATCH 033/201] quick fsdp config fix --- scripts/eval/eval.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 0ef2bf607c..7f4b84474b 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -238,9 +238,11 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: eval_gauntlet_config = DictConfig( scfg.eval_gauntlet) if scfg.eval_gauntlet else scfg.eval_gauntlet_str - fsdp_config = scfg.fsdp_config + fsdp_config = {**scfg.fsdp_config} if scfg.fsdp_config else None - assert isinstance(fsdp_config, Dict) or fsdp_config is None + assert isinstance( + fsdp_config, Dict + ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}' # Mandatory Evaluation Parameters icl_tasks: Union[ListConfig, str, None] = ListConfig( From 767f097bceacd3ccc8bfdab107726be1a70b27d6 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 19:33:08 +0000 Subject: [PATCH 034/201] updated yamls to make variables explicit --- scripts/train/finetune_example/gpt2-arc-easy--cpu.yaml | 4 +++- scripts/train/yamls/finetune/1b_local_data_sft.yaml | 4 +++- scripts/train/yamls/finetune/7b_dolly_sft.yaml | 4 +++- scripts/train/yamls/finetune/dbrx-full-ft.yaml | 7 ++++--- scripts/train/yamls/finetune/dbrx-lora-ft.yaml | 6 ++++-- .../finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml | 4 +++- scripts/train/yamls/finetune/mpt-30b-instruct.yaml | 6 ++++-- scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml | 4 +++- scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml | 4 +++- scripts/train/yamls/finetune/t5-small_dolly_sft.yaml | 6 ++++-- 10 files changed, 34 insertions(+), 15 deletions(-) diff --git a/scripts/train/finetune_example/gpt2-arc-easy--cpu.yaml b/scripts/train/finetune_example/gpt2-arc-easy--cpu.yaml index 2b1821c92c..635313d4bc 100644 --- a/scripts/train/finetune_example/gpt2-arc-easy--cpu.yaml +++ b/scripts/train/finetune_example/gpt2-arc-easy--cpu.yaml @@ -1,5 +1,7 @@ +variables: + global_seed: 17 + max_seq_len: 512 -global_seed: 17 # Run Name run_name: # If left blank, will be read from env var $RUN_NAME diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml index d7b9db10d4..cec9febf68 100644 --- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml +++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml @@ -4,8 +4,10 @@ # This is not the right YAML if you are trying to finetune a HuggingFace pretrained model. 
# ############################################################################################ +variables: + global_seed: 17 + max_seq_len: 2048 -global_seed: 17 # Run Name run_name: # If left blank, will be read from env var $RUN_NAME diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml index 024362299a..d46393bd8a 100644 --- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml @@ -4,8 +4,10 @@ # This is not the right YAML if you are trying to finetune a HuggingFace pretrained model. # ############################################################################################ +variables: + global_seed: 17 + max_seq_len: 2048 -global_seed: 17 # Run Name run_name: # If left blank, will be read from env var $RUN_NAME diff --git a/scripts/train/yamls/finetune/dbrx-full-ft.yaml b/scripts/train/yamls/finetune/dbrx-full-ft.yaml index 9cb53e40fd..c48f269788 100644 --- a/scripts/train/yamls/finetune/dbrx-full-ft.yaml +++ b/scripts/train/yamls/finetune/dbrx-full-ft.yaml @@ -1,10 +1,11 @@ +variables: + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + # Note: This requires ~64x80GB GPUs max_seq_len: 4096 icl_seq_len: 1024 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME - # Model model: name: hf_causal_lm diff --git a/scripts/train/yamls/finetune/dbrx-lora-ft.yaml b/scripts/train/yamls/finetune/dbrx-lora-ft.yaml index 06e8f1d6f0..dacb2c8563 100644 --- a/scripts/train/yamls/finetune/dbrx-lora-ft.yaml +++ b/scripts/train/yamls/finetune/dbrx-lora-ft.yaml @@ -1,6 +1,8 @@ -# Note: This requires ~16x80GB GPUs +variables: + # Note: This requires ~16x80GB GPUs + icl_seq_len: 1024 + max_seq_len: 4096 -icl_seq_len: 1024 # Run Name run_name: # If left blank, will be read from env var $RUN_NAME diff --git a/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml b/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml index 4047256614..2de4e29795 100644 --- a/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml +++ b/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml @@ -1,5 +1,7 @@ +variables: + global_seed: 17 + max_seq_len: 512 -global_seed: 17 data_local: ./my_data data_remote: # If blank, files must be present in data_local diff --git a/scripts/train/yamls/finetune/mpt-30b-instruct.yaml b/scripts/train/yamls/finetune/mpt-30b-instruct.yaml index 226f96230e..373f3b379b 100644 --- a/scripts/train/yamls/finetune/mpt-30b-instruct.yaml +++ b/scripts/train/yamls/finetune/mpt-30b-instruct.yaml @@ -1,6 +1,8 @@ -tokenizer_name: mosaicml/mpt-30b +variables: + tokenizer_name: mosaicml/mpt-30b + global_seed: 17 + max_seq_len: 8192 -global_seed: 17 # Run Name run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME diff --git a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml index a789c4b491..bd100dd01c 100644 --- a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml @@ -1,5 +1,7 @@ +variables: + global_seed: 17 + max_seq_len: 2048 -global_seed: 17 # Run Name run_name: # If left blank, will be read from env var $RUN_NAME diff --git a/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml b/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml index 49a70e97f2..3dcdb95e7a 100644 --- a/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml +++ 
b/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml @@ -1,7 +1,9 @@ +variables: + global_seed: 17 + data_local: ./my-adaptation-data data_remote: # If blank, files must be present in data_local max_seq_len: 4096 -global_seed: 17 # Run Name run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME diff --git a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml index 2264e359e0..a035a909cd 100644 --- a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml @@ -1,6 +1,8 @@ +variables: + global_seed: 17 + model_name: t5-small + max_seq_len: 1024 -global_seed: 17 -model_name: t5-small # Run Name run_name: # If left blank, will be read from env var $RUN_NAME From af250fecc29096d61ba384a277145592a8d2c962 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 19:40:42 +0000 Subject: [PATCH 035/201] remove precision from mandatory params list --- tests/a_scripts/train/test_train_inputs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index f69d66e847..24cad29a6b 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -74,7 +74,6 @@ def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: 'scheduler', 'max_duration', 'eval_interval', - 'precision', 'max_seq_len', ] for param in mandatory_params: From 1c14ea5677f3ce6b49cc5747678f31a483b341f0 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 20:38:41 +0000 Subject: [PATCH 036/201] I expect many of these to fail in interesting ways --- llmfoundry/models/hf/hf_t5.py | 38 +++++++++++-------- llmfoundry/utils/builders.py | 9 ++--- llmfoundry/utils/mosaicml_logger_utils.py | 2 +- scripts/eval/eval.py | 17 +++++---- scripts/inference/benchmarking/benchmark.py | 4 +- scripts/train/train.py | 7 ++-- .../yamls/finetune/t5-small_dolly_sft.yaml | 4 +- tests/a_scripts/eval/test_eval.py | 8 ++-- .../inference/test_convert_composer_to_hf.py | 16 ++++---- tests/fixtures/models.py | 4 +- tests/models/hf/test_fsdp_weight_tying.py | 4 +- tests/models/hf/test_hf_config.py | 20 +++++----- tests/models/hf/test_hf_peft_wrapping.py | 4 +- tests/models/hf/test_hf_v_mpt.py | 8 ++-- tests/models/layers/test_huggingface_flash.py | 4 +- tests/models/test_model.py | 28 +++++++------- 16 files changed, 91 insertions(+), 86 deletions(-) diff --git a/llmfoundry/models/hf/hf_t5.py b/llmfoundry/models/hf/hf_t5.py index b9c1df64cf..bb014d1798 100644 --- a/llmfoundry/models/hf/hf_t5.py +++ b/llmfoundry/models/hf/hf_t5.py @@ -5,10 +5,9 @@ from __future__ import annotations -from typing import Mapping +from typing import List, Mapping, Optional from composer.utils import dist -from omegaconf import DictConfig from transformers import (AutoConfig, PreTrainedTokenizerBase, T5ForConditionalGeneration) @@ -44,18 +43,28 @@ class ComposerHFT5(HuggingFaceModelWithFSDP): tokenizer (PreTrainedTokenizer): The tokenizer that the model will use. 
""" - def __init__(self, om_model_config: DictConfig, - tokenizer: PreTrainedTokenizerBase): + def __init__( + self, + tokenizer: PreTrainedTokenizerBase, + pretrained_model_name_or_path: str, + pretrained: bool, + trust_remote_code: bool = True, + use_auth_token: bool = False, + config_overrides: Optional[Mapping] = None, + init_device: str = 'cpu', + additional_train_metrics: Optional[List] = None, + name: Optional[str] = None, + ): from llmfoundry.utils.builders import build_metric config = AutoConfig.from_pretrained( - om_model_config.pretrained_model_name_or_path, - trust_remote_code=om_model_config.get('trust_remote_code', True), - use_auth_token=om_model_config.get('use_auth_token', False), + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + use_auth_token=use_auth_token, ) # set config overrides - for k, v in om_model_config.get('config_overrides', {}).items(): + for k, v in (config_overrides or {}): if not hasattr(config, k): raise ValueError( f'config does not have attribute "{k}" to override ({k}: {v}).' @@ -77,7 +86,7 @@ def __init__(self, om_model_config: DictConfig, raise ValueError(f'Model type "hf_t5" currently only supports T5 models ' +\ f'using configs where `is_encoder_decoder` is ``True``.') - init_device = om_model_config.get('init_device', 'cpu') + init_device = init_device # Get the device we want to initialize, and use the # resolved version to initialize the HF model @@ -86,17 +95,16 @@ def __init__(self, om_model_config: DictConfig, # We need to have all non-zero local ranks be not-pretrained # Rank 0 will still be pretrained, and distribute the weights appropriately if dist.get_local_rank() != 0 and init_device == 'mixed': - om_model_config.pretrained = False + pretrained = False if resolved_init_device == 'cpu': - if om_model_config.pretrained: + if pretrained: model = T5ForConditionalGeneration.from_pretrained( - om_model_config.pretrained_model_name_or_path, - config=config) + pretrained_model_name_or_path, config=config) else: model = T5ForConditionalGeneration(config) elif resolved_init_device == 'meta': - if om_model_config.pretrained: + if pretrained: raise ValueError( 'Setting cfg.pretrained=True is not supported when init_device="meta".' ) @@ -108,7 +116,7 @@ def __init__(self, om_model_config: DictConfig, metrics = [ build_metric(metric, {}) for metric in DEFAULT_ENC_DEC_METRICS + - om_model_config.get('additional_train_metrics', []) + (additional_train_metrics or []) ] composer_model = super().__init__(model=model, diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 2c9c3d6ac2..577343bbdd 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -163,11 +163,11 @@ def build_icl_data_and_gauntlet( def build_composer_model( - name: str, - cfg: DictConfig, + composer_model_name: str, tokenizer: PreTrainedTokenizerBase, init_context: Optional[ContextManager] = None, master_weights_dtype: Optional[str] = None, + **cfg: Dict[str, Any], ) -> ComposerModel: """Builds a ComposerModel from the registry. 
@@ -186,13 +186,12 @@ def build_composer_model( with init_context: model = construct_from_registry( - name=name, + name=composer_model_name, registry=registry.models, pre_validation_function=ComposerModel, post_validation_function=None, kwargs={ - 'om_model_config': cfg, - 'tokenizer': tokenizer + **cfg, 'tokenizer': tokenizer }, ) diff --git a/llmfoundry/utils/mosaicml_logger_utils.py b/llmfoundry/utils/mosaicml_logger_utils.py index e54f11ce32..d365e8fed1 100644 --- a/llmfoundry/utils/mosaicml_logger_utils.py +++ b/llmfoundry/utils/mosaicml_logger_utils.py @@ -71,7 +71,7 @@ def log_train_analytics(mosaicml_logger: MosaicMLLogger, train_loader_config: DictConfig, eval_loader_config: Optional[Union[DictConfig, ListConfig]], - callback_configs: Optional[DictConfig], + callback_configs: Optional[Dict[str, Any]], tokenizer_name: str, load_path: Optional[str], icl_tasks_config: Optional[Union[ListConfig, str]], eval_gauntlet: Optional[Union[DictConfig, str]]): diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 7f4b84474b..f2904d7f3f 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -53,12 +53,15 @@ def evaluate_model( eval_gauntlet_df: Optional[pd.DataFrame], eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], - callback_configs: Optional[DictConfig], + callback_configs: Optional[Dict[str, Any]], metadata: Optional[Dict[str, str]], logged_config: DictConfig, should_log_config: bool = True, load_path: Optional[str] = None, + **kwargs: Dict[str, Any], ): + model_extra_params = kwargs + warnings.warn(f'Extra parameters: {model_extra_params}') log.info(f'Evaluating model: {model_name}') # Build tokenizer and model @@ -101,12 +104,10 @@ def evaluate_model( init_context = process_init_device(DictConfig(model), fsdp_config) - composer_model = build_composer_model( - name=model['name'], - cfg=DictConfig(model), - tokenizer=tokenizer, - init_context=init_context, - ) + composer_model = build_composer_model(composer_model_name=model['name'], + tokenizer=tokenizer, + init_context=init_context, + **model) # Now add the eval metrics if eval_loader_config is not None: @@ -267,7 +268,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: metadata = scfg.metadata should_log_config = scfg.log_config - callback_configs = DictConfig(scfg.callbacks) + callback_configs = {**scfg.callbacks} # Warn for unused parameters for key in cfg: diff --git a/scripts/inference/benchmarking/benchmark.py b/scripts/inference/benchmarking/benchmark.py index 00daf6b559..947f50dc46 100644 --- a/scripts/inference/benchmarking/benchmark.py +++ b/scripts/inference/benchmarking/benchmark.py @@ -65,9 +65,9 @@ def main(config: DictConfig): tokenizer_kwargs=tokenizer_kwargs, ) composer_model = build_composer_model( - name=config.model.name, - cfg=config.model, + composer_model_name=config.model.name, tokenizer=tokenizer, + **config.model, ) model = composer_model.model model.eval() diff --git a/scripts/train/train.py b/scripts/train/train.py index bcbf1fb374..ddd73b5ef4 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -298,8 +298,7 @@ def main(cfg: DictConfig) -> Trainer: icl_seq_len: Optional[int] = scfg.icl_seq_len # Optional logging, evaluation and callback configs logger_configs: Optional[Dict[str, Any]] = scfg.loggers - callback_configs: Optional[DictConfig] = DictConfig( - scfg.callbacks) if scfg.callbacks else None + callback_configs: Optional[Dict[str, Any]] = scfg.callbacks algorithm_configs: Optional[Dict[str, Any]] = scfg.algorithms # Mandatory 
hyperparameters for training @@ -493,11 +492,11 @@ def main(cfg: DictConfig) -> Trainer: # Build Model log.info('Initializing model...') model = build_composer_model( - name=model_config.name, - cfg=model_config, + composer_model_name=model_config.name, tokenizer=tokenizer, init_context=init_context, master_weights_dtype=model_config.get('master_weights_dtype', None), + **model_config, ) # Log number of parameters diff --git a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml index a035a909cd..257c088c9e 100644 --- a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml @@ -10,12 +10,12 @@ run_name: # If left blank, will be read from env var $RUN_NAME # Model model: name: hf_t5 - pretrained_model_name_or_path: ${model_name} + pretrained_model_name_or_path: ${variables.model_name} pretrained: true # Tokenizer tokenizer: - name: ${model_name} + name: ${variables.model_name} # Dataloaders train_loader: diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index 63c4ea8261..509f01a9f5 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -47,11 +47,9 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): tokenizer = build_tokenizer(model_cfg.tokenizer.name, model_cfg.tokenizer.get('kwargs', {})) # build model - model = build_composer_model( - name=model_cfg.model.name, - cfg=model_cfg.model, - tokenizer=tokenizer, - ) + model = build_composer_model(composer_model_name=model_cfg.model.name, + tokenizer=tokenizer, + **model_cfg.model) # create mocked save checkpoint trainer = Trainer(model=model, device=device) diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 061227d8a4..4e8fc86517 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -774,8 +774,8 @@ def test_huggingface_conversion_callback( device_batch_size, ) - original_model = build_composer_model(model_cfg['name'], model_cfg, - tokenizer) + original_model = build_composer_model(model_cfg['name'], tokenizer, + **model_cfg) optimizer_name = optimizer_config.pop('name') optimizer = build_optimizer(original_model, optimizer_name, optimizer_config) @@ -871,9 +871,9 @@ def test_convert_and_generate(model: str, tie_word_embeddings: bool, tokenizer = transformers.AutoTokenizer.from_pretrained( om_cfg.tokenizer.name, use_auth_token=model == 'llama2') original_model = build_composer_model( - name=om_cfg['model'].name, - cfg=om_cfg['model'], + composer_model_name=om_cfg['model'].name, tokenizer=tokenizer, + **om_cfg['model'], ) trainer = Trainer(model=original_model, device='cpu' if not model == 'mptmoe' else 'gpu') @@ -943,9 +943,9 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, tokenizer = transformers.AutoTokenizer.from_pretrained( om_cfg.tokenizer.name) original_model = build_composer_model( - name=om_cfg['model'].name, - cfg=om_cfg['model'], + composer_model_name=om_cfg['model'].name, tokenizer=tokenizer, + **om_cfg['model'], ) trainer = Trainer(model=original_model, device='cpu' if not 'moe' in conf_path else 'gpu') @@ -1153,10 +1153,10 @@ def test_mptmoe_huggingface_conversion_callback( init_context = process_init_device(model_cfg, fsdp_config) original_model = build_composer_model( - name=model_cfg.name, - cfg=model_cfg, + composer_model_name=model_cfg.name, 
tokenizer=tokenizer, init_context=init_context, + **model_cfg, ) optimizer = build_optimizer(original_model, optimizer_name, diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py index 616d66085c..ced0e5690a 100644 --- a/tests/fixtures/models.py +++ b/tests/fixtures/models.py @@ -16,9 +16,9 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase): model = build_composer_model( - name=config.name, - cfg=config, + composer_model_name=config.name, tokenizer=tokenizer, + **config, ) return model diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 6e7838e7ba..75df260d44 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -68,9 +68,9 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, ) original_model = build_composer_model( - name=model_cfg['name'], - cfg=model_cfg, + composer_model_name=model_cfg['name'], tokenizer=tokenizer, + **model_cfg, ) underlying_model = maybe_get_underlying_model(original_model.model) diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index e79756aba3..7359e8c6e0 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -47,9 +47,9 @@ def test_remote_code_false_mpt( ValueError, match='trust_remote_code must be set to True for MPT models.'): _ = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + composer_model_name=test_cfg.model.name, tokenizer=tokenizer, + **test_cfg.model, ) @@ -139,9 +139,9 @@ def test_hf_config_override( tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) model = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + composer_model_name=test_cfg.model.name, tokenizer=tokenizer, + **test_cfg.model, ) # save model @@ -163,9 +163,9 @@ def test_hf_config_override( hf_model_config.model = model_cfg hf_model = build_composer_model( - name=hf_model_config.model.name, - cfg=hf_model_config.model, + composer_model_name=hf_model_config.model.name, tokenizer=tokenizer, + **hf_model_config.model, ) for k, v in hf_model_config.model.config_overrides.items(): @@ -198,9 +198,9 @@ def test_rope_scaling_override(): model_cfg = om.create(model_cfg) model = build_composer_model( - name=model_cfg.name, - cfg=model_cfg, + composer_model_name=model_cfg.name, tokenizer=None, # type: ignore + **model_cfg, ) # This would error if the config isn't parsed into a proper dictionary model.get_metadata() @@ -225,9 +225,9 @@ def test_nested_override(): model_cfg = om.create(model_cfg) model = build_composer_model( - name=model_cfg.name, - cfg=model_cfg, + composer_model_name=model_cfg.name, tokenizer=None, # type: ignore + **model_cfg, ) # The value we changed diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index d8bea33dd4..a75803d845 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -84,9 +84,9 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, ) original_model = build_composer_model( - name=model_cfg['name'], - cfg=model_cfg, + composer_model_name=model_cfg['name'], tokenizer=tokenizer, + **model_cfg, ) trainer = Trainer( diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index 82b64ce80c..50fe3857c8 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ 
-59,9 +59,9 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, tokenizer_kwargs=tokenizer_kwargs, ) hf_model = build_composer_model( - name=hf_cfg.model.name, - cfg=hf_cfg.model, + composer_model_name=hf_cfg.model.name, tokenizer=tokenizer, + **hf_cfg.model, ).to(device) hf_n_params = sum(p.numel() for p in hf_model.parameters()) @@ -111,9 +111,9 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, print(model_cfg) model = build_composer_model( - name=model_cfg.name, - cfg=model_cfg, + composer_model_name=model_cfg.name, tokenizer=tokenizer, + **model_cfg, ).to(device) n_params = sum(p.numel() for p in model.parameters()) diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py index 1e8ec2383d..8652f24739 100644 --- a/tests/models/layers/test_huggingface_flash.py +++ b/tests/models/layers/test_huggingface_flash.py @@ -83,9 +83,9 @@ def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str): with error_context: model = build_composer_model( - name=model_cfg['name'], - cfg=model_cfg, + composer_model_name=model_cfg['name'], tokenizer=tokenizer, + **model_cfg, ) # check that it actually used flash attention 2 diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 402698cb27..3f24598a48 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -90,9 +90,9 @@ def _get_objs(request: pytest.FixtureRequest, tokenizer_cfg.get('kwargs', {})) model = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + composer_model_name=test_cfg.model.name, tokenizer=tokenizer, + **test_cfg.model, ) # Optimizer @@ -292,9 +292,9 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): tokenizer_cfg.get('kwargs', {})) model = build_composer_model( - name=neo_cfg.model.name, - cfg=neo_cfg.model, + composer_model_name=neo_cfg.model.name, tokenizer=tokenizer, + **neo_cfg.model, ).to(device) assert isinstance(model.tokenizer, @@ -341,9 +341,9 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): tokenizer_cfg.get('kwargs', {})) model = build_composer_model( - name=t5_cfg.model.name, - cfg=t5_cfg.model, + composer_model_name=t5_cfg.model.name, tokenizer=tokenizer, + **t5_cfg.model, ).to(device) assert isinstance(model.tokenizer, @@ -418,9 +418,9 @@ def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str, tokenizer_cfg.get('kwargs', {})) model_1 = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + composer_model_name=test_cfg.model.name, tokenizer=tokenizer, + **test_cfg.model, ) model_2 = copy.deepcopy(model_1) @@ -488,9 +488,9 @@ def test_loss_fn(): tokenizer_cfg.get('kwargs', {})) model_1 = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + composer_model_name=test_cfg.model.name, tokenizer=tokenizer, + **test_cfg.model, ) model_2 = copy.deepcopy(model_1) @@ -574,9 +574,9 @@ def test_loss_reduction(loss_fn_config: str): tokenizer_cfg.get('kwargs', {})) model_1 = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + composer_model_name=test_cfg.model.name, tokenizer=tokenizer, + **test_cfg.model, ) model_2 = copy.deepcopy(model_1) @@ -766,7 +766,7 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool, assert mpt.config.d_model == 128 assert mpt.config.n_heads == 4 assert mpt.config.n_layers == 2 - if ffn_hidden_size is None: + if ffn_hidden_size is None: # type: ignore (sometimes it may not be none) assert 
mpt.config.expansion_ratio == expansion_ratio else: assert mpt.config.ffn_config['ffn_hidden_size'] == ffn_hidden_size @@ -783,7 +783,7 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool, assert len(mpt.transformer.blocks) == 2 d_model = hf_config.d_model - if ffn_hidden_size is None: + if ffn_hidden_size is None: # type: ignore (sometimes it may not be none) ffn_hidden_size = int(hf_config.d_model * hf_config.expansion_ratio) for block in mpt.transformer.blocks: assert isinstance(block, MPTBlock) From 0b5721ee57fdc756a5321b2f712dc0404a0d8e2d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 21:02:48 +0000 Subject: [PATCH 037/201] fix test_model test cases with ** --- llmfoundry/models/hf/hf_causal_lm.py | 66 +++++++++++++-------------- llmfoundry/models/mpt/modeling_mpt.py | 25 +++++----- 2 files changed, 44 insertions(+), 47 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 5bca5cb21a..43778ef04c 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -6,11 +6,10 @@ import logging import os import warnings -from typing import TYPE_CHECKING, Any, Dict, Mapping +from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional from composer.models.huggingface import peft_installed from composer.utils import dist -from omegaconf import DictConfig from transformers import (AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase) @@ -20,7 +19,6 @@ from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP from llmfoundry.models.layers.attention import is_flash_v2_installed from llmfoundry.models.utils import init_empty_weights -from llmfoundry.utils.config_utils import pop_config if TYPE_CHECKING: from peft import PeftConfig @@ -59,32 +57,40 @@ class ComposerHFCausalLM(HuggingFaceModelWithFSDP): tokenizer (PreTrainedTokenizer): The tokenizer that the model will use. """ - def __init__(self, om_model_config: DictConfig, - tokenizer: PreTrainedTokenizerBase): + def __init__( + self, + tokenizer: PreTrainedTokenizerBase, + pretrained_model_name_or_path: str, + pretrained: bool, + pretrained_lora_id_or_path: Optional[str] = None, + trust_remote_code: bool = True, + use_auth_token: bool = False, + use_flash_attention_2: bool = False, + load_in_8bit: bool = False, + init_device: str = 'cpu', + config_overrides: Optional[Dict[str, Any]] = None, + peft_config: Optional[Dict[str, Any]] = None, + use_train_metrics: bool = True, + additional_train_metrics: Optional[List] = None, + name: Optional[str] = None, + ): + from llmfoundry.utils.builders import build_metric - pretrained_model_name_or_path = om_model_config.pretrained_model_name_or_path - pretrained_lora_id_or_path = om_model_config.get( - 'pretrained_lora_id_or_path', None) + config_overrides = config_overrides or {} + additional_train_metrics = additional_train_metrics or [] + + pretrained_model_name_or_path = pretrained_model_name_or_path + pretrained_lora_id_or_path = pretrained_lora_id_or_path - if not om_model_config.get( - 'trust_remote_code', True - ) and pretrained_model_name_or_path.startswith('mosaicml/mpt'): + if not trust_remote_code and pretrained_model_name_or_path.startswith( + 'mosaicml/mpt'): raise ValueError( 'trust_remote_code must be set to True for MPT models. 
Without this, the MPT model code will come from the transformers library, ' + 'which is significantly slower and not compatible with the LLM foundry training code, rather than the code release by MosaicML.' ) - # Set up Hugging Face args - trust_remote_code = om_model_config.get('trust_remote_code', True) - use_auth_token = om_model_config.get('use_auth_token', False) - use_flash_attention_2 = om_model_config.get('use_flash_attention_2', - False) - load_in_8bit = om_model_config.get('load_in_8bit', False) - - # Set up config args for the model construction and base classes - init_device = om_model_config.get('init_device', 'cpu') # Resolve "mixed" init device to either "cpu" or "meta" resolved_init_device = hf_get_init_device(init_device) requested_attention_implementation = 'flash_attention_2' if use_flash_attention_2 else 'eager' @@ -94,23 +100,17 @@ def __init__(self, om_model_config: DictConfig, 'use_flash_attention_2 is set to True, but flash-attention 2 is not installed. ' + 'Please `pip install llm-foundry[gpu]`.') - peft_config_dict = pop_config(om_model_config, - 'peft_config', - must_exist=False, - convert=True) + peft_config_dict = peft_config if peft_config_dict is not None and not peft_installed: raise ValueError( 'PEFT is not installed, but peft_config was passed. Please install LLM Foundry with the peft extra to use peft_config.' ) - use_train_metrics = om_model_config.get('use_train_metrics', True) - train_metric_names = DEFAULT_CAUSAL_LM_TRAIN_METRICS + om_model_config.get( - 'additional_train_metrics', []) + train_metric_names = DEFAULT_CAUSAL_LM_TRAIN_METRICS + additional_train_metrics train_metrics = [ build_metric(metric, {}) for metric in train_metric_names ] if use_train_metrics else [] - eval_metric_names = DEFAULT_CAUSAL_LM_EVAL_METRICS + om_model_config.get( - 'additional_eval_metrics', []) + eval_metric_names = DEFAULT_CAUSAL_LM_EVAL_METRICS + additional_train_metrics eval_metrics = [ build_metric(metric, {}) for metric in eval_metric_names ] @@ -141,7 +141,7 @@ def _autoset_attn_implementation_monkeypatch( _autoset_attn_implementation_monkeypatch) # set config overrides - for k, v in om_model_config.get('config_overrides', {}).items(): + for k, v in config_overrides.items(): if not hasattr(config, k): raise ValueError( f'config does not have attribute "{k}" to override ({k}: {v}).' @@ -179,7 +179,7 @@ def _autoset_attn_implementation_monkeypatch( # We need to have all non-zero local ranks be not-pretrained # Rank 0 will still be pretrained, and distribute the weights appropriately if dist.get_local_rank() != 0 and init_device == 'mixed': - om_model_config.pretrained = False + pretrained = False # If the HuggingFace model is coming from a local folder, Hugging Face copies the modules into the # transformers modules cache. On particular systems, this operation seems to cause contention between @@ -201,7 +201,7 @@ def _autoset_attn_implementation_monkeypatch( # initialize the model on the correct device if resolved_init_device == 'cpu': - if om_model_config.pretrained: + if pretrained: model = AutoModelForCausalLM.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, @@ -215,7 +215,7 @@ def _autoset_attn_implementation_monkeypatch( trust_remote_code=trust_remote_code, ) elif resolved_init_device == 'meta': - if om_model_config.pretrained: + if pretrained: raise ValueError( 'Setting cfg.pretrained=True is not supported when init_device="meta".' 
) diff --git a/llmfoundry/models/mpt/modeling_mpt.py b/llmfoundry/models/mpt/modeling_mpt.py index 1ef62a3b19..892dd4e07c 100644 --- a/llmfoundry/models/mpt/modeling_mpt.py +++ b/llmfoundry/models/mpt/modeling_mpt.py @@ -31,8 +31,6 @@ except Exception as e: raise e -from omegaconf import DictConfig -from omegaconf import OmegaConf as om from transformers import PreTrainedModel, PreTrainedTokenizerBase from transformers.modeling_outputs import (BaseModelOutputWithPast, CausalLMOutputWithPast) @@ -961,28 +959,27 @@ class ComposerMPTCausalLM(HuggingFaceModel): def __init__( self, - om_model_config: DictConfig, tokenizer: Optional[PreTrainedTokenizerBase] = None, + use_train_metrics: Optional[bool] = True, + additional_train_metrics: Optional[List] = None, + loss_fn: Optional[Union[str, Dict]] = 'fused_crossentropy', + **kwargs: Dict[str, Any], ): from llmfoundry.metrics import (DEFAULT_CAUSAL_LM_EVAL_METRICS, DEFAULT_CAUSAL_LM_TRAIN_METRICS) from llmfoundry.utils.builders import build_metric - resolved_om_model_config = om.to_container(om_model_config, - resolve=True) - assert isinstance(resolved_om_model_config, dict) + additional_train_metrics = additional_train_metrics or [] - hf_config = MPTConfig.from_dict(resolved_om_model_config) - model = MPTForCausalLM(hf_config) + model = MPTForCausalLM( + MPTConfig(use_train_metrics=use_train_metrics, **kwargs)) - use_train_metrics = om_model_config.get('use_train_metrics', True) - train_metric_names = DEFAULT_CAUSAL_LM_TRAIN_METRICS + resolved_om_model_config.get( - 'additional_train_metrics', []) + use_train_metrics = use_train_metrics + train_metric_names = DEFAULT_CAUSAL_LM_TRAIN_METRICS + additional_train_metrics train_metrics = [ build_metric(metric, {}) for metric in train_metric_names ] if use_train_metrics else [] - eval_metric_names = DEFAULT_CAUSAL_LM_EVAL_METRICS + resolved_om_model_config.get( - 'additional_eval_metrics', []) + eval_metric_names = DEFAULT_CAUSAL_LM_EVAL_METRICS + additional_train_metrics eval_metrics = [ build_metric(metric, {}) for metric in eval_metric_names ] @@ -997,7 +994,7 @@ def __init__( allow_embedding_resizing=True, ) - loss_fn_config = om_model_config.get('loss_fn', 'fused_crossentropy') + loss_fn_config = loss_fn if loss_fn_config == 'fused_crossentropy': try: from flash_attn.losses.cross_entropy import \ From f770f60e0c600fceaa99b3ef2c1a397b657a1b1c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 21:27:11 +0000 Subject: [PATCH 038/201] fix many more test cases --- llmfoundry/models/hf/hf_causal_lm.py | 6 ++++-- llmfoundry/models/hf/hf_t5.py | 7 +++++-- tests/models/hf/test_hf_fsdp.py | 2 +- tests/models/hf/test_hf_t5.py | 2 +- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 43778ef04c..e83fa6b8c3 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -10,6 +10,7 @@ from composer.models.huggingface import peft_installed from composer.utils import dist +from omegaconf import OmegaConf as om from transformers import (AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase) @@ -61,7 +62,7 @@ def __init__( self, tokenizer: PreTrainedTokenizerBase, pretrained_model_name_or_path: str, - pretrained: bool, + pretrained: Optional[bool] = True, pretrained_lora_id_or_path: Optional[str] = None, trust_remote_code: bool = True, use_auth_token: bool = False, @@ -77,7 +78,8 @@ def __init__( from llmfoundry.utils.builders import 
build_metric - config_overrides = config_overrides or {} + config_overrides = om.to_container( + config_overrides, resolve=True) if config_overrides else {} additional_train_metrics = additional_train_metrics or [] pretrained_model_name_or_path = pretrained_model_name_or_path diff --git a/llmfoundry/models/hf/hf_t5.py b/llmfoundry/models/hf/hf_t5.py index bb014d1798..a4ff90360d 100644 --- a/llmfoundry/models/hf/hf_t5.py +++ b/llmfoundry/models/hf/hf_t5.py @@ -8,6 +8,7 @@ from typing import List, Mapping, Optional from composer.utils import dist +from omegaconf import OmegaConf as om from transformers import (AutoConfig, PreTrainedTokenizerBase, T5ForConditionalGeneration) @@ -47,7 +48,7 @@ def __init__( self, tokenizer: PreTrainedTokenizerBase, pretrained_model_name_or_path: str, - pretrained: bool, + pretrained: Optional[bool] = True, trust_remote_code: bool = True, use_auth_token: bool = False, config_overrides: Optional[Mapping] = None, @@ -57,6 +58,8 @@ def __init__( ): from llmfoundry.utils.builders import build_metric + config_overrides = om.to_container(config_overrides or {}, resolve=True) + config = AutoConfig.from_pretrained( pretrained_model_name_or_path, trust_remote_code=trust_remote_code, @@ -64,7 +67,7 @@ def __init__( ) # set config overrides - for k, v in (config_overrides or {}): + for k, v in (config_overrides or {}).items(): if not hasattr(config, k): raise ValueError( f'config does not have attribute "{k}" to override ({k}: {v}).' diff --git a/tests/models/hf/test_hf_fsdp.py b/tests/models/hf/test_hf_fsdp.py index 0f49a4d43b..5405d30697 100644 --- a/tests/models/hf/test_hf_fsdp.py +++ b/tests/models/hf/test_hf_fsdp.py @@ -21,7 +21,7 @@ def test_olmo_wraps(): config = DictConfig(conf) - model = ComposerHFCausalLM(config.model, None) + model = ComposerHFCausalLM(**config.model, tokenizer=None) # check that all the modules we except are blocked from FSDP wrapping underlying_model = maybe_get_underlying_model(model.model) diff --git a/tests/models/hf/test_hf_t5.py b/tests/models/hf/test_hf_t5.py index fb8689e665..47443f2410 100644 --- a/tests/models/hf/test_hf_t5.py +++ b/tests/models/hf/test_hf_t5.py @@ -23,4 +23,4 @@ def test_experimental_hf_t5(): tokenizer = transformers.T5Tokenizer.from_pretrained('t5-base') with pytest.warns(ExperimentalWarning): - _ = ComposerHFT5(cfg, tokenizer) + _ = ComposerHFT5(**cfg, tokenizer=tokenizer) From 9dfa01b69dac96068b6d7883184f72f6fe4b0512 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 21:44:57 +0000 Subject: [PATCH 039/201] fix dictconfig objectification --- llmfoundry/models/hf/hf_causal_lm.py | 8 ++++---- llmfoundry/utils/builders.py | 6 +++++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index e83fa6b8c3..a5aeb60837 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -10,7 +10,6 @@ from composer.models.huggingface import peft_installed from composer.utils import dist -from omegaconf import OmegaConf as om from transformers import (AutoConfig, AutoModelForCausalLM, PretrainedConfig, PreTrainedModel, PreTrainedTokenizerBase) @@ -73,14 +72,15 @@ def __init__( peft_config: Optional[Dict[str, Any]] = None, use_train_metrics: bool = True, additional_train_metrics: Optional[List] = None, + additional_eval_metrics: Optional[List] = None, name: Optional[str] = None, ): from llmfoundry.utils.builders import build_metric - config_overrides = om.to_container( - config_overrides, 
resolve=True) if config_overrides else {} + config_overrides = config_overrides or {} additional_train_metrics = additional_train_metrics or [] + additional_eval_metrics = additional_eval_metrics or [] pretrained_model_name_or_path = pretrained_model_name_or_path pretrained_lora_id_or_path = pretrained_lora_id_or_path @@ -112,7 +112,7 @@ def __init__( train_metrics = [ build_metric(metric, {}) for metric in train_metric_names ] if use_train_metrics else [] - eval_metric_names = DEFAULT_CAUSAL_LM_EVAL_METRICS + additional_train_metrics + eval_metric_names = DEFAULT_CAUSAL_LM_EVAL_METRICS + additional_eval_metrics eval_metrics = [ build_metric(metric, {}) for metric in eval_metric_names ] diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 577343bbdd..7f9acd4a2e 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -167,7 +167,7 @@ def build_composer_model( tokenizer: PreTrainedTokenizerBase, init_context: Optional[ContextManager] = None, master_weights_dtype: Optional[str] = None, - **cfg: Dict[str, Any], + **cfg: Union[Dict, DictConfig], ) -> ComposerModel: """Builds a ComposerModel from the registry. @@ -184,6 +184,10 @@ def build_composer_model( if init_context is None: init_context = contextlib.nullcontext() + for k, v in cfg.items(): + if isinstance(v, DictConfig): + cfg[k] = om.to_container(v, resolve=True) + with init_context: model = construct_from_registry( name=composer_model_name, From fc4a86ad658edd3d72ada27f570a5f9c328b7793 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 22:25:57 +0000 Subject: [PATCH 040/201] fix remaining test cases --- llmfoundry/models/hf/hf_t5.py | 3 +-- scripts/eval/eval.py | 6 ++++-- tests/models/test_model.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llmfoundry/models/hf/hf_t5.py b/llmfoundry/models/hf/hf_t5.py index a4ff90360d..cf6f6d0ece 100644 --- a/llmfoundry/models/hf/hf_t5.py +++ b/llmfoundry/models/hf/hf_t5.py @@ -8,7 +8,6 @@ from typing import List, Mapping, Optional from composer.utils import dist -from omegaconf import OmegaConf as om from transformers import (AutoConfig, PreTrainedTokenizerBase, T5ForConditionalGeneration) @@ -58,7 +57,7 @@ def __init__( ): from llmfoundry.utils.builders import build_metric - config_overrides = om.to_container(config_overrides or {}, resolve=True) + config_overrides = config_overrides or {} config = AutoConfig.from_pretrained( pretrained_model_name_or_path, diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index f2904d7f3f..49cf610e05 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -239,7 +239,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: eval_gauntlet_config = DictConfig( scfg.eval_gauntlet) if scfg.eval_gauntlet else scfg.eval_gauntlet_str - fsdp_config = {**scfg.fsdp_config} if scfg.fsdp_config else None + fsdp_config = om.to_container( + scfg.fsdp_config) if scfg.fsdp_config else None assert isinstance( fsdp_config, Dict @@ -268,7 +269,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: metadata = scfg.metadata should_log_config = scfg.log_config - callback_configs = {**scfg.callbacks} + callback_configs = om.to_container( + scfg.callbacks) if scfg.callbacks else None # Warn for unused parameters for key in cfg: diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 3f24598a48..3d25c755a4 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -664,7 +664,7 @@ def test_opt_wrapping(peft_config: 
Optional[dict[str, str]]): tokenizer = build_tokenizer(config.tokenizer.name, tokenizer_cfg.get('kwargs', {})) - model = ComposerHFCausalLM(config.model, tokenizer) + model = ComposerHFCausalLM(**config.model, tokenizer=tokenizer) # check that all the modules we except are blocked from FSDP wrapping underlying_model = maybe_get_underlying_model(model.model) From c7bb8664551f8a1107a75e4c5542d5f69d618194 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 22:49:29 +0000 Subject: [PATCH 041/201] remove unneeded ** --- llmfoundry/utils/builders.py | 18 +++++++++--------- scripts/eval/eval.py | 2 +- tests/models/test_model.py | 12 ++++++------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 7f9acd4a2e..7971d7b03a 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -163,11 +163,11 @@ def build_icl_data_and_gauntlet( def build_composer_model( - composer_model_name: str, + name: str, + cfg: Union[Dict, DictConfig], tokenizer: PreTrainedTokenizerBase, init_context: Optional[ContextManager] = None, master_weights_dtype: Optional[str] = None, - **cfg: Union[Dict, DictConfig], ) -> ComposerModel: """Builds a ComposerModel from the registry. @@ -190,7 +190,7 @@ def build_composer_model( with init_context: model = construct_from_registry( - name=composer_model_name, + name=name, registry=registry.models, pre_validation_function=ComposerModel, post_validation_function=None, @@ -377,16 +377,16 @@ def _extract_param_groups( return model.parameters() -def build_optimizer( - model: torch.nn.Module, - name: str, - optimizer_config: Optional[Dict[str, Any]] = None) -> Optimizer: +def build_optimizer(model: torch.nn.Module, name: str, + **optimizer_config: Dict[str, Any]) -> Optimizer: + + for k, v in optimizer_config.items(): + if isinstance(v, DictConfig): + optimizer_config[k] = om.to_container(v, resolve=True) params = _extract_param_groups(model, optimizer_config) kwargs = optimizer_config - if kwargs is None: - kwargs = {} if 'params' in kwargs: raise ValueError( 'The `params` will be automatically extracted from the model and ' + diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 49cf610e05..fef174c314 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -107,7 +107,7 @@ def evaluate_model( composer_model = build_composer_model(composer_model_name=model['name'], tokenizer=tokenizer, init_context=init_context, - **model) + cfg=model) # Now add the eval metrics if eval_loader_config is not None: diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 3d25c755a4..1e7d7e0089 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -92,7 +92,7 @@ def _get_objs(request: pytest.FixtureRequest, model = build_composer_model( composer_model_name=test_cfg.model.name, tokenizer=tokenizer, - **test_cfg.model, + cfg=test_cfg.model, ) # Optimizer @@ -294,7 +294,7 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): model = build_composer_model( composer_model_name=neo_cfg.model.name, tokenizer=tokenizer, - **neo_cfg.model, + cfg=neo_cfg.model, ).to(device) assert isinstance(model.tokenizer, @@ -343,7 +343,7 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): model = build_composer_model( composer_model_name=t5_cfg.model.name, tokenizer=tokenizer, - **t5_cfg.model, + cfg=t5_cfg.model, ).to(device) assert isinstance(model.tokenizer, @@ -420,7 +420,7 @@ def test_determinism(attn_impl: str, precision: 
torch.dtype, ffn_type: str, model_1 = build_composer_model( composer_model_name=test_cfg.model.name, tokenizer=tokenizer, - **test_cfg.model, + cfg=test_cfg.model, ) model_2 = copy.deepcopy(model_1) @@ -490,7 +490,7 @@ def test_loss_fn(): model_1 = build_composer_model( composer_model_name=test_cfg.model.name, tokenizer=tokenizer, - **test_cfg.model, + cfg=test_cfg.model, ) model_2 = copy.deepcopy(model_1) @@ -576,7 +576,7 @@ def test_loss_reduction(loss_fn_config: str): model_1 = build_composer_model( composer_model_name=test_cfg.model.name, tokenizer=tokenizer, - **test_cfg.model, + cfg=test_cfg.model, ) model_2 = copy.deepcopy(model_1) From 6997d142e6455d47fa49a5f36035b22f8b4ef095 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 17 Apr 2024 22:51:48 +0000 Subject: [PATCH 042/201] fix test case --- scripts/inference/benchmarking/benchmark.py | 2 +- scripts/train/train.py | 2 +- tests/a_scripts/eval/test_eval.py | 2 +- .../inference/test_convert_composer_to_hf.py | 11 ++++++----- tests/fixtures/models.py | 2 +- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_config.py | 10 +++++----- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/hf/test_hf_v_mpt.py | 4 ++-- tests/models/layers/test_huggingface_flash.py | 2 +- tests/models/test_model.py | 2 +- 11 files changed, 21 insertions(+), 20 deletions(-) diff --git a/scripts/inference/benchmarking/benchmark.py b/scripts/inference/benchmarking/benchmark.py index 947f50dc46..ac1a55cb61 100644 --- a/scripts/inference/benchmarking/benchmark.py +++ b/scripts/inference/benchmarking/benchmark.py @@ -67,7 +67,7 @@ def main(config: DictConfig): composer_model = build_composer_model( composer_model_name=config.model.name, tokenizer=tokenizer, - **config.model, + cfg=config.model, ) model = composer_model.model model.eval() diff --git a/scripts/train/train.py b/scripts/train/train.py index ddd73b5ef4..a49f1f8715 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -496,7 +496,7 @@ def main(cfg: DictConfig) -> Trainer: tokenizer=tokenizer, init_context=init_context, master_weights_dtype=model_config.get('master_weights_dtype', None), - **model_config, + cfg=model_config, ) # Log number of parameters diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index 509f01a9f5..d8970302eb 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -49,7 +49,7 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): # build model model = build_composer_model(composer_model_name=model_cfg.model.name, tokenizer=tokenizer, - **model_cfg.model) + cfg=model_cfg.model) # create mocked save checkpoint trainer = Trainer(model=model, device=device) diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 4e8fc86517..96d2e4d66a 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -774,8 +774,9 @@ def test_huggingface_conversion_callback( device_batch_size, ) - original_model = build_composer_model(model_cfg['name'], tokenizer, - **model_cfg) + original_model = build_composer_model(model_cfg['name'], + tokenizer=tokenizer, + cfg=model_cfg) optimizer_name = optimizer_config.pop('name') optimizer = build_optimizer(original_model, optimizer_name, optimizer_config) @@ -873,7 +874,7 @@ def test_convert_and_generate(model: str, tie_word_embeddings: bool, original_model = build_composer_model( 
composer_model_name=om_cfg['model'].name, tokenizer=tokenizer, - **om_cfg['model'], + cfg=om_cfg['model'], ) trainer = Trainer(model=original_model, device='cpu' if not model == 'mptmoe' else 'gpu') @@ -945,7 +946,7 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, original_model = build_composer_model( composer_model_name=om_cfg['model'].name, tokenizer=tokenizer, - **om_cfg['model'], + cfg=om_cfg['model'], ) trainer = Trainer(model=original_model, device='cpu' if not 'moe' in conf_path else 'gpu') @@ -1156,7 +1157,7 @@ def test_mptmoe_huggingface_conversion_callback( composer_model_name=model_cfg.name, tokenizer=tokenizer, init_context=init_context, - **model_cfg, + cfg=model_cfg, ) optimizer = build_optimizer(original_model, optimizer_name, diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py index ced0e5690a..be1f4318b8 100644 --- a/tests/fixtures/models.py +++ b/tests/fixtures/models.py @@ -18,7 +18,7 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase): model = build_composer_model( composer_model_name=config.name, tokenizer=tokenizer, - **config, + cfg=config, ) return model diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 75df260d44..f195c101cd 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -70,7 +70,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, original_model = build_composer_model( composer_model_name=model_cfg['name'], tokenizer=tokenizer, - **model_cfg, + cfg=model_cfg, ) underlying_model = maybe_get_underlying_model(original_model.model) diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index 7359e8c6e0..bc770e82a2 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -49,7 +49,7 @@ def test_remote_code_false_mpt( _ = build_composer_model( composer_model_name=test_cfg.model.name, tokenizer=tokenizer, - **test_cfg.model, + cfg=test_cfg.model, ) @@ -141,7 +141,7 @@ def test_hf_config_override( model = build_composer_model( composer_model_name=test_cfg.model.name, tokenizer=tokenizer, - **test_cfg.model, + cfg=test_cfg.model, ) # save model @@ -165,7 +165,7 @@ def test_hf_config_override( hf_model = build_composer_model( composer_model_name=hf_model_config.model.name, tokenizer=tokenizer, - **hf_model_config.model, + cfg=hf_model_config.model, ) for k, v in hf_model_config.model.config_overrides.items(): @@ -200,7 +200,7 @@ def test_rope_scaling_override(): model = build_composer_model( composer_model_name=model_cfg.name, tokenizer=None, # type: ignore - **model_cfg, + cfg=model_cfg, ) # This would error if the config isn't parsed into a proper dictionary model.get_metadata() @@ -227,7 +227,7 @@ def test_nested_override(): model = build_composer_model( composer_model_name=model_cfg.name, tokenizer=None, # type: ignore - **model_cfg, + cfg=model_cfg, ) # The value we changed diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index a75803d845..7f5a7a55c0 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -86,7 +86,7 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, original_model = build_composer_model( composer_model_name=model_cfg['name'], tokenizer=tokenizer, - **model_cfg, + cfg=model_cfg, ) trainer = Trainer( diff --git a/tests/models/hf/test_hf_v_mpt.py 
b/tests/models/hf/test_hf_v_mpt.py index 50fe3857c8..99fd29a037 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -61,7 +61,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, hf_model = build_composer_model( composer_model_name=hf_cfg.model.name, tokenizer=tokenizer, - **hf_cfg.model, + cfg=hf_cfg.model, ).to(device) hf_n_params = sum(p.numel() for p in hf_model.parameters()) @@ -113,7 +113,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, model = build_composer_model( composer_model_name=model_cfg.name, tokenizer=tokenizer, - **model_cfg, + cfg=model_cfg, ).to(device) n_params = sum(p.numel() for p in model.parameters()) diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py index 8652f24739..1cefddf834 100644 --- a/tests/models/layers/test_huggingface_flash.py +++ b/tests/models/layers/test_huggingface_flash.py @@ -85,7 +85,7 @@ def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str): model = build_composer_model( composer_model_name=model_cfg['name'], tokenizer=tokenizer, - **model_cfg, + cfg=model_cfg, ) # check that it actually used flash attention 2 diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 1e7d7e0089..deb719e957 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -695,7 +695,7 @@ def test_lora_id(): tokenizer = build_tokenizer(config.tokenizer.name, tokenizer_cfg.get('kwargs', {})) - model = ComposerHFCausalLM(config.model, tokenizer) + model = ComposerHFCausalLM(**config.model, tokenizer=tokenizer) assert isinstance(model.model, peft.PeftModelForCausalLM) From c55bafa96f5ef2ce884935a0ced51848945cab3a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 00:19:16 +0000 Subject: [PATCH 043/201] changed back argument name --- scripts/eval/eval.py | 2 +- scripts/inference/benchmarking/benchmark.py | 2 +- scripts/train/train.py | 2 +- tests/a_scripts/eval/test_eval.py | 2 +- .../inference/test_convert_composer_to_hf.py | 6 +++--- tests/fixtures/models.py | 2 +- tests/models/hf/test_fsdp_weight_tying.py | 2 +- tests/models/hf/test_hf_config.py | 10 +++++----- tests/models/hf/test_hf_peft_wrapping.py | 2 +- tests/models/hf/test_hf_v_mpt.py | 4 ++-- tests/models/layers/test_huggingface_flash.py | 2 +- tests/models/test_model.py | 12 ++++++------ 12 files changed, 24 insertions(+), 24 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index fef174c314..646ab700a9 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -104,7 +104,7 @@ def evaluate_model( init_context = process_init_device(DictConfig(model), fsdp_config) - composer_model = build_composer_model(composer_model_name=model['name'], + composer_model = build_composer_model(name=model['name'], tokenizer=tokenizer, init_context=init_context, cfg=model) diff --git a/scripts/inference/benchmarking/benchmark.py b/scripts/inference/benchmarking/benchmark.py index ac1a55cb61..3cbc70974e 100644 --- a/scripts/inference/benchmarking/benchmark.py +++ b/scripts/inference/benchmarking/benchmark.py @@ -65,7 +65,7 @@ def main(config: DictConfig): tokenizer_kwargs=tokenizer_kwargs, ) composer_model = build_composer_model( - composer_model_name=config.model.name, + name=config.model.name, tokenizer=tokenizer, cfg=config.model, ) diff --git a/scripts/train/train.py b/scripts/train/train.py index a49f1f8715..e2f8c74dcb 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -492,7 
+492,7 @@ def main(cfg: DictConfig) -> Trainer: # Build Model log.info('Initializing model...') model = build_composer_model( - composer_model_name=model_config.name, + name=model_config.name, tokenizer=tokenizer, init_context=init_context, master_weights_dtype=model_config.get('master_weights_dtype', None), diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index d8970302eb..95979fd986 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -47,7 +47,7 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): tokenizer = build_tokenizer(model_cfg.tokenizer.name, model_cfg.tokenizer.get('kwargs', {})) # build model - model = build_composer_model(composer_model_name=model_cfg.model.name, + model = build_composer_model(name=model_cfg.model.name, tokenizer=tokenizer, cfg=model_cfg.model) diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 96d2e4d66a..01603a63de 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -872,7 +872,7 @@ def test_convert_and_generate(model: str, tie_word_embeddings: bool, tokenizer = transformers.AutoTokenizer.from_pretrained( om_cfg.tokenizer.name, use_auth_token=model == 'llama2') original_model = build_composer_model( - composer_model_name=om_cfg['model'].name, + name=om_cfg['model'].name, tokenizer=tokenizer, cfg=om_cfg['model'], ) @@ -944,7 +944,7 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, tokenizer = transformers.AutoTokenizer.from_pretrained( om_cfg.tokenizer.name) original_model = build_composer_model( - composer_model_name=om_cfg['model'].name, + name=om_cfg['model'].name, tokenizer=tokenizer, cfg=om_cfg['model'], ) @@ -1154,7 +1154,7 @@ def test_mptmoe_huggingface_conversion_callback( init_context = process_init_device(model_cfg, fsdp_config) original_model = build_composer_model( - composer_model_name=model_cfg.name, + name=model_cfg.name, tokenizer=tokenizer, init_context=init_context, cfg=model_cfg, diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py index be1f4318b8..9012380c68 100644 --- a/tests/fixtures/models.py +++ b/tests/fixtures/models.py @@ -16,7 +16,7 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase): model = build_composer_model( - composer_model_name=config.name, + name=config.name, tokenizer=tokenizer, cfg=config, ) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index f195c101cd..1b2e84daaf 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -68,7 +68,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, ) original_model = build_composer_model( - composer_model_name=model_cfg['name'], + name=model_cfg['name'], tokenizer=tokenizer, cfg=model_cfg, ) diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index bc770e82a2..9be4467a4f 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -47,7 +47,7 @@ def test_remote_code_false_mpt( ValueError, match='trust_remote_code must be set to True for MPT models.'): _ = build_composer_model( - composer_model_name=test_cfg.model.name, + name=test_cfg.model.name, tokenizer=tokenizer, cfg=test_cfg.model, ) @@ -139,7 +139,7 @@ def test_hf_config_override( tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) 
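# Editor's note: illustrative sketch, not part of the patch. It shows the call
# convention these commits settle on for build_composer_model (a `name` registry
# key plus a `cfg` mapping); the YAML path below is hypothetical.
from omegaconf import OmegaConf as om
from llmfoundry.utils.builders import build_composer_model, build_tokenizer

cfg = om.load('yamls/pretrain/mpt-125m.yaml')  # hypothetical config path
tokenizer = build_tokenizer(cfg.tokenizer.name, cfg.tokenizer.get('kwargs', {}))
model = build_composer_model(
    name=cfg.model.name,  # registry key of the ComposerModel class
    cfg=cfg.model,        # DictConfig; converted to a plain dict inside the builder
    tokenizer=tokenizer,
)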
tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) model = build_composer_model( - composer_model_name=test_cfg.model.name, + name=test_cfg.model.name, tokenizer=tokenizer, cfg=test_cfg.model, ) @@ -163,7 +163,7 @@ def test_hf_config_override( hf_model_config.model = model_cfg hf_model = build_composer_model( - composer_model_name=hf_model_config.model.name, + name=hf_model_config.model.name, tokenizer=tokenizer, cfg=hf_model_config.model, ) @@ -198,7 +198,7 @@ def test_rope_scaling_override(): model_cfg = om.create(model_cfg) model = build_composer_model( - composer_model_name=model_cfg.name, + name=model_cfg.name, tokenizer=None, # type: ignore cfg=model_cfg, ) @@ -225,7 +225,7 @@ def test_nested_override(): model_cfg = om.create(model_cfg) model = build_composer_model( - composer_model_name=model_cfg.name, + name=model_cfg.name, tokenizer=None, # type: ignore cfg=model_cfg, ) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 7f5a7a55c0..8ae8e93c47 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -84,7 +84,7 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, ) original_model = build_composer_model( - composer_model_name=model_cfg['name'], + name=model_cfg['name'], tokenizer=tokenizer, cfg=model_cfg, ) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index 99fd29a037..3729e66cbf 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -59,7 +59,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, tokenizer_kwargs=tokenizer_kwargs, ) hf_model = build_composer_model( - composer_model_name=hf_cfg.model.name, + name=hf_cfg.model.name, tokenizer=tokenizer, cfg=hf_cfg.model, ).to(device) @@ -111,7 +111,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, print(model_cfg) model = build_composer_model( - composer_model_name=model_cfg.name, + name=model_cfg.name, tokenizer=tokenizer, cfg=model_cfg, ).to(device) diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py index 1cefddf834..cdd9fde50a 100644 --- a/tests/models/layers/test_huggingface_flash.py +++ b/tests/models/layers/test_huggingface_flash.py @@ -83,7 +83,7 @@ def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str): with error_context: model = build_composer_model( - composer_model_name=model_cfg['name'], + name=model_cfg['name'], tokenizer=tokenizer, cfg=model_cfg, ) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index deb719e957..20a6b935b5 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -90,7 +90,7 @@ def _get_objs(request: pytest.FixtureRequest, tokenizer_cfg.get('kwargs', {})) model = build_composer_model( - composer_model_name=test_cfg.model.name, + name=test_cfg.model.name, tokenizer=tokenizer, cfg=test_cfg.model, ) @@ -292,7 +292,7 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): tokenizer_cfg.get('kwargs', {})) model = build_composer_model( - composer_model_name=neo_cfg.model.name, + name=neo_cfg.model.name, tokenizer=tokenizer, cfg=neo_cfg.model, ).to(device) @@ -341,7 +341,7 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): tokenizer_cfg.get('kwargs', {})) model = build_composer_model( - composer_model_name=t5_cfg.model.name, + name=t5_cfg.model.name, tokenizer=tokenizer, cfg=t5_cfg.model, ).to(device) @@ 
-418,7 +418,7 @@ def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str, tokenizer_cfg.get('kwargs', {})) model_1 = build_composer_model( - composer_model_name=test_cfg.model.name, + name=test_cfg.model.name, tokenizer=tokenizer, cfg=test_cfg.model, ) @@ -488,7 +488,7 @@ def test_loss_fn(): tokenizer_cfg.get('kwargs', {})) model_1 = build_composer_model( - composer_model_name=test_cfg.model.name, + name=test_cfg.model.name, tokenizer=tokenizer, cfg=test_cfg.model, ) @@ -574,7 +574,7 @@ def test_loss_reduction(loss_fn_config: str): tokenizer_cfg.get('kwargs', {})) model_1 = build_composer_model( - composer_model_name=test_cfg.model.name, + name=test_cfg.model.name, tokenizer=tokenizer, cfg=test_cfg.model, ) From db8d207761cc9dfce56dfc61e61335bfc264048b Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 00:32:47 +0000 Subject: [PATCH 044/201] fix --- llmfoundry/utils/builders.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 7971d7b03a..5e69fda568 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -184,9 +184,8 @@ def build_composer_model( if init_context is None: init_context = contextlib.nullcontext() - for k, v in cfg.items(): - if isinstance(v, DictConfig): - cfg[k] = om.to_container(v, resolve=True) + if isinstance(cfg, DictConfig): + cfg = om.to_container(cfg, resolve=True) with init_context: model = construct_from_registry( From cd5460ee33338d1536c2ae528225e4418a2ab2f7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 01:02:08 +0000 Subject: [PATCH 045/201] ** for finetuning dataloader --- llmfoundry/data/dataloader.py | 3 +- llmfoundry/data/finetuning/dataloader.py | 54 ++++++++++++++---------- llmfoundry/data/packing.py | 27 +++++++----- llmfoundry/utils/builders.py | 2 +- tests/data/test_packing.py | 8 +--- 5 files changed, 52 insertions(+), 42 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index a98526001a..2e14dde27c 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -22,8 +22,7 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, that the dataloader will produce. """ kwargs = { - 'cfg': cfg, - 'tokenizer': tokenizer, + **cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size } diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 1d8711d280..3b1fee13db 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging import os -from typing import Tuple, Union +from typing import Optional, Tuple, Union import torch from composer.core.data_spec import DataSpec @@ -31,9 +31,11 @@ _DEFAULT_TARGET_PROMPTS = 'none' -def build_finetuning_dataloader(cfg: DictConfig, - tokenizer: PreTrainedTokenizerBase, - device_batch_size: int) -> DataSpec: +def build_finetuning_dataloader( + tokenizer: PreTrainedTokenizerBase, + device_batch_size: int, + dataset: DictConfig, +) -> DataSpec: """Builds a finetuning dataloader for training or evaluating. The underlying dataset can be built through one of two code paths: @@ -131,14 +133,16 @@ def build_finetuning_dataloader(cfg: DictConfig, padding/waste rates for different `cfg.dataset.packing_ratio` choices, given a starting workload YAML. 
""" - _validate_config(cfg.dataset) + _validate_config(dataset) # Use EOS as the pad token if none exists - if tokenizer.pad_token is None: + if tokenizer.pad_token is None: # type: ignore (sometimes it's none and that's ok) tokenizer.pad_token = tokenizer.eos_token collate_fn, dataloader_batch_size = _build_collate_fn( - cfg, tokenizer, device_batch_size) + dataset=dataset, + tokenizer=tokenizer, + device_batch_size=device_batch_size) dataset = None # for pyright sampler = None @@ -422,25 +426,30 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: def _build_collate_fn( - dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, - device_batch_size: int + tokenizer: PreTrainedTokenizerBase, + device_batch_size: int, + max_seq_len: int, + decoder_only_format: bool, + dataset: DictConfig, + max_leftover_bins_to_keep: Optional[int] = None, + packing_ratio: Optional[Union[float, str]] = None, + target_responses: str = _DEFAULT_TARGET_RESPONSES, + target_prompts: str = _DEFAULT_TARGET_PROMPTS, + allow_pad_trimming: bool = False, ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: - dataset_cfg = dataloader_cfg.dataset - max_seq_len = dataset_cfg.max_seq_len + max_seq_len = max_seq_len collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, max_seq_len=max_seq_len, - decoder_only_format=dataset_cfg.decoder_only_format, - target_responses=dataset_cfg.get('target_responses', - _DEFAULT_TARGET_RESPONSES), - target_prompts=dataset_cfg.get('target_prompts', - _DEFAULT_TARGET_PROMPTS), - allow_pad_trimming=dataset_cfg.get('allow_pad_trimming', False), + decoder_only_format=decoder_only_format, + target_responses=target_responses, + target_prompts=target_prompts, + allow_pad_trimming=allow_pad_trimming, ) - packing_ratio = dataset_cfg.get('packing_ratio') - max_leftover_bins_to_keep = dataset_cfg.get('max_leftover_bins_to_keep') + packing_ratio = packing_ratio + max_leftover_bins_to_keep = max_leftover_bins_to_keep if packing_ratio is None: if max_leftover_bins_to_keep is not None: raise ValueError( @@ -450,8 +459,9 @@ def _build_collate_fn( return collate_fn, device_batch_size if packing_ratio == 'auto': - packing_ratio = auto_packing_ratio(dataloader_cfg, tokenizer, - device_batch_size) + packing_ratio = auto_packing_ratio(dataset_config=dataset, + tokenizer=tokenizer, + device_batch_size=device_batch_size) if isinstance(packing_ratio, str): raise ValueError( @@ -465,7 +475,7 @@ def _build_collate_fn( elif packing_ratio < 1.0: raise ValueError('packing_ratio must be >= 1, if supplied') - if not dataset_cfg.decoder_only_format: + if not decoder_only_format: raise NotImplementedError( 'On-the-fly packing is currently only supported for decoder-only formats.' ) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 9696f967ca..3a399ac345 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -290,8 +290,9 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): return batch -def auto_packing_ratio(dataloader_cfg: DictConfig, +def auto_packing_ratio(dataset_config: DictConfig, tokenizer: PreTrainedTokenizerBase, + max_seq_len: int, device_batch_size: int, num_packing_ratios: int = 20) -> float: """Find a packing ratio that minimizes padding with zero waste. @@ -323,16 +324,18 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, # Set the seed so that auto packing is deterministic. 
reproducibility.seed_all(0) - max_seq_len = dataloader_cfg.dataset.max_seq_len # If max_seq_len is very small, skip profiling and select packing ratio of 1. if max_seq_len <= 100: return 1 min_ratio = 1 max_ratio = max_seq_len / 100 - profiling_results = profile_packing(dataloader_cfg, tokenizer, min_ratio, - max_ratio, num_packing_ratios, - device_batch_size) + profiling_results = profile_packing(dataset_config=dataset_config, + tokenizer=tokenizer, + min_ratio=min_ratio, + max_ratio=max_ratio, + num_packing_ratios=num_packing_ratios, + device_batch_size=device_batch_size) # Obtain the maximum packing_ratio/minimum padding that has no waste. # profiling_results are sorted from smallest to largest packing_ratio. @@ -357,7 +360,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, def profile_packing( - dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, + dataset_config: DictConfig, tokenizer: PreTrainedTokenizerBase, min_ratio: float, max_ratio: float, num_packing_ratios: int, device_batch_size: int ) -> Iterable[Tuple[float, Optional[float], Optional[float]]]: @@ -383,12 +386,14 @@ def profile_packing( None) # Turn off packing for the dataloader (we want raw, pre-packed examples) - dataloader_cfg = copy.deepcopy(dataloader_cfg) + dataloader_cfg = DictConfig({ + 'dataset': copy.deepcopy(dataset_config), + 'drop_last': False, + 'num_workers': 0, + 'prefetch_factor': None, + 'persistent_workers': False, + }) dataloader_cfg.dataset.packing_ratio = 1.0 - dataloader_cfg.drop_last = False - dataloader_cfg.num_workers = 0 - dataloader_cfg.prefetch_factor = None - dataloader_cfg.persistent_workers = False # If streaming dataset, use a temporary local folder for profiling local_rank_zero = dist.get_global_rank() - dist.get_local_rank() diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 5e69fda568..bef2c73be4 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -377,7 +377,7 @@ def _extract_param_groups( def build_optimizer(model: torch.nn.Module, name: str, - **optimizer_config: Dict[str, Any]) -> Optimizer: + optimizer_config: Dict[str, Any]) -> Optimizer: for k, v in optimizer_config.items(): if isinstance(v, DictConfig): diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index 963f8e56b6..434705df51 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -107,9 +107,7 @@ def test_auto_packing(profile_packing: Mock): profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': { - 'max_seq_len': 2048 - }}), + dataset_config=DictConfig({'max_seq_len': 2048}), tokenizer=None, device_batch_size=1, ) # Dummy values, profiling results are already set. @@ -134,9 +132,7 @@ def test_dist_auto_packing(profile_packing: Mock): (3, .7, .5)] # should pick 2 packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': { - 'max_seq_len': 2048 - }}), + dataloader_cfg=DictConfig({'max_seq_len': 2048}), tokenizer=None, device_batch_size=1, ) # Dummy values, profiling results are already set. From bbd04d1456a49f9cd60562793c4f251b1a28ded6 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 01:16:40 +0000 Subject: [PATCH 046/201] fix? 
--- llmfoundry/data/packing.py | 2 +- scripts/misc/profile_packing.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 3a399ac345..4c6b465895 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -292,7 +292,6 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): def auto_packing_ratio(dataset_config: DictConfig, tokenizer: PreTrainedTokenizerBase, - max_seq_len: int, device_batch_size: int, num_packing_ratios: int = 20) -> float: """Find a packing ratio that minimizes padding with zero waste. @@ -325,6 +324,7 @@ def auto_packing_ratio(dataset_config: DictConfig, reproducibility.seed_all(0) # If max_seq_len is very small, skip profiling and select packing ratio of 1. + max_seq_len = dataset_config.get('max_seq_len') if max_seq_len <= 100: return 1 diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index fff10d158b..8351069535 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -88,8 +88,9 @@ def parse_args() -> Namespace: tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, - args.num_packing_ratios, device_batch_size) + results = profile_packing(dataloader_cfg.dataset, tokenizer, args.min, + args.max, args.num_packing_ratios, + device_batch_size) header = '\n\n\n packing_ratio | % PADDING | % WASTE' fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' From fc6fb1b9ca7e2a4a29db79fee1ea91fa86860a86 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 01:33:12 +0000 Subject: [PATCH 047/201] fix dataloader --- llmfoundry/data/finetuning/dataloader.py | 111 +++++++++++++---------- llmfoundry/data/text_data.py | 9 +- tests/data/test_packing.py | 3 +- 3 files changed, 68 insertions(+), 55 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 3b1fee13db..60f093de96 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -35,6 +35,13 @@ def build_finetuning_dataloader( tokenizer: PreTrainedTokenizerBase, device_batch_size: int, dataset: DictConfig, + num_workers: int, + drop_last: bool = False, + pin_memory: bool = True, + prefetch_factor: int = 2, + persistent_workers: bool = True, + name: Optional[str] = None, + timeout: int = 0, ) -> DataSpec: """Builds a finetuning dataloader for training or evaluating. @@ -133,55 +140,61 @@ def build_finetuning_dataloader( padding/waste rates for different `cfg.dataset.packing_ratio` choices, given a starting workload YAML. 
""" - _validate_config(dataset) + dataset_cfg = dataset + _validate_config(dataset_cfg) # Use EOS as the pad token if none exists if tokenizer.pad_token is None: # type: ignore (sometimes it's none and that's ok) tokenizer.pad_token = tokenizer.eos_token collate_fn, dataloader_batch_size = _build_collate_fn( - dataset=dataset, + dataset_cfg=dataset_cfg, tokenizer=tokenizer, - device_batch_size=device_batch_size) + device_batch_size=device_batch_size, + max_seq_len=dataset_cfg.max_seq_len, + decoder_only_format=dataset_cfg.decoder_only_format, + ) dataset = None # for pyright sampler = None - if cfg.dataset.get('remote') is not None or cfg.dataset.get( + if dataset_cfg.get('remote') is not None or cfg.dataset.get( 'streams') is not None: # Build streaming dataloader - streams = build_streams(cfg.dataset) + streams = build_streams(**dataset_cfg) + + # note: we don't need to use ** here because we're setting default values for almost all arguments dataset = dataset_constructor.build_from_streaming( tokenizer=tokenizer, streams=streams, - local=cfg.dataset.get('local', None), - remote=cfg.dataset.get('remote', None), - split=cfg.dataset.get('split', None), - download_retry=cfg.dataset.get('download_retry', 2), - download_timeout=cfg.dataset.get('download_timeout', 60), - validate_hash=cfg.dataset.get('validate_hash', None), - keep_zip=cfg.dataset.get('keep_zip', False), - epoch_size=cfg.dataset.get('epoch_size', None), - predownload=cfg.dataset.get('predownload', None), - cache_limit=cfg.dataset.get('cache_limit', None), - partition_algo=cfg.dataset.get('partition_algo', 'relaxed'), - num_canonical_nodes=cfg.dataset.get('num_canonical_nodes', None), + local=dataset_cfg.get('local', None), + remote=dataset_cfg.get('remote', None), + split=dataset_cfg.get('split', None), + download_retry=dataset_cfg.get('download_retry', 2), + download_timeout=dataset_cfg.get('download_timeout', 60), + validate_hash=dataset_cfg.get('validate_hash', None), + keep_zip=dataset_cfg.get('keep_zip', False), + epoch_size=dataset_cfg.get('epoch_size', None), + predownload=dataset_cfg.get('predownload', None), + cache_limit=dataset_cfg.get('cache_limit', None), + partition_algo=dataset_cfg.get('partition_algo', 'relaxed'), + num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None), batch_size=device_batch_size, - shuffle=cfg.dataset.get('shuffle', False), - shuffle_algo=cfg.dataset.get('shuffle_algo', 'py1e'), - shuffle_seed=cfg.dataset.get('shuffle_seed', 9176), - shuffle_block_size=cfg.dataset.get('shuffle_block_size', None), - sampling_method=cfg.dataset.get('sampling_method', 'balanced'), - sampling_granularity=cfg.dataset.get('sampling_granularity', 1), - batching_method=cfg.dataset.get('batching_method', 'random'), - max_seq_len=cfg.dataset.max_seq_len, - allow_unsafe_types=cfg.dataset.get('allow_unsafe_types', False), - replication=cfg.dataset.get('replication', None), + shuffle=dataset_cfg.get('shuffle', False), + shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'), + shuffle_seed=dataset_cfg.get('shuffle_seed', 9176), + shuffle_block_size=dataset_cfg.get('shuffle_block_size', None), + sampling_method=dataset_cfg.get('sampling_method', 'balanced'), + sampling_granularity=dataset_cfg.get('sampling_granularity', 1), + batching_method=dataset_cfg.get('batching_method', 'random'), + max_seq_len=dataset_cfg.max_seq_len, + allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False), + replication=dataset_cfg.get('replication', None), ) else: # Build HF dataloader - dataset_name_or_path = 
cfg.dataset.hf_name - split = cfg.dataset.get('split') + dataset_name_or_path = dataset_cfg.hf_name + split = dataset_cfg.get('split') if split is None: raise MissingHuggingFaceURLSplitError() @@ -193,7 +206,7 @@ def build_finetuning_dataloader( split = split.replace('-', '_') # Get the preprocessing function. - proto_preprocessing_fn = cfg.dataset.get('preprocessing_fn') + proto_preprocessing_fn = dataset_cfg.get('preprocessing_fn') if isinstance(proto_preprocessing_fn, (dict, DictConfig)): preprocessing_fn = dataset_constructor.get_preprocessing_fn_from_dict( dict(proto_preprocessing_fn)) @@ -205,26 +218,26 @@ def build_finetuning_dataloader( dataset = dataset_constructor.build_from_hf( dataset_name=dataset_name_or_path, split=split, - safe_load=cfg.dataset.get('safe_load', False), - max_seq_len=cfg.dataset.max_seq_len, + safe_load=dataset_cfg.get('safe_load', False), + max_seq_len=dataset_cfg.max_seq_len, preprocessing_fn=preprocessing_fn, tokenizer=tokenizer, - target_prompts=cfg.dataset.get('target_prompts', + target_prompts=dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS), - target_responses=cfg.dataset.get('target_responses', + target_responses=dataset_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES), - decoder_only_format=cfg.dataset.decoder_only_format, - hf_kwargs=cfg.dataset.get('hf_kwargs', {})) + decoder_only_format=dataset_cfg.decoder_only_format, + hf_kwargs=dataset_cfg.get('hf_kwargs', {})) # Ensure dataset is large enough. - if cfg.drop_last: + if drop_last: world_size = dist.get_world_size() minimum_dataset_size = world_size * dataloader_batch_size if hasattr(dataset, '__len__'): full_dataset_size = len(dataset) if full_dataset_size < minimum_dataset_size: raise NotEnoughDatasetSamplesError( - dataset_name=cfg.dataset.hf_name, + dataset_name=dataset_cfg.hf_name, split=split, dataloader_batch_size=dataloader_batch_size, world_size=world_size, @@ -232,21 +245,21 @@ def build_finetuning_dataloader( minimum_dataset_size=minimum_dataset_size) # Initialize sampler. 
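# Editor's note: illustrative arithmetic, not part of the patch, for the
# drop_last size check just above; all numbers are made up.
world_size = 8
dataloader_batch_size = 6                                    # per-device batch handed to the DataLoader
minimum_dataset_size = world_size * dataloader_batch_size    # 48 samples needed across ranks
full_dataset_size = 40                                       # hypothetical tiny finetuning set
if full_dataset_size < minimum_dataset_size:
    print('build_finetuning_dataloader would raise NotEnoughDatasetSamplesError')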
sampler = dist.get_sampler(dataset, - drop_last=cfg.drop_last, - shuffle=cfg.dataset.shuffle) + drop_last=drop_last, + shuffle=dataset_cfg.shuffle) assert dataset is not None # for pyright dl = DataLoader( dataset, collate_fn=collate_fn, batch_size=dataloader_batch_size, - drop_last=cfg.drop_last, + drop_last=drop_last, sampler=sampler, - num_workers=cfg.num_workers, - pin_memory=cfg.get('pin_memory', True), - prefetch_factor=cfg.get('prefetch_factor', 2), - persistent_workers=cfg.get('persistent_workers', True), - timeout=cfg.get('timeout', 0), + num_workers=num_workers, + pin_memory=pin_memory, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers, + timeout=timeout, ) token_counting_func = get_tokens_per_batch_func() @@ -430,15 +443,13 @@ def _build_collate_fn( device_batch_size: int, max_seq_len: int, decoder_only_format: bool, - dataset: DictConfig, + dataset_cfg: DictConfig, max_leftover_bins_to_keep: Optional[int] = None, packing_ratio: Optional[Union[float, str]] = None, target_responses: str = _DEFAULT_TARGET_RESPONSES, target_prompts: str = _DEFAULT_TARGET_PROMPTS, allow_pad_trimming: bool = False, ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: - max_seq_len = max_seq_len - collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, max_seq_len=max_seq_len, @@ -459,7 +470,7 @@ def _build_collate_fn( return collate_fn, device_batch_size if packing_ratio == 'auto': - packing_ratio = auto_packing_ratio(dataset_config=dataset, + packing_ratio = auto_packing_ratio(dataset_config=dataset_cfg, tokenizer=tokenizer, device_batch_size=device_batch_size) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index fc31b890b0..a89312b500 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -245,12 +245,13 @@ def get_sequence_id_from_batch( return torch.cat([left_zeros, cumulative_sep[:, :-1]], dim=1) -def build_streams(dataset_cfg: DictConfig): - streams_dict = dataset_cfg.pop('streams', None) +def build_streams(streams: Optional[Dict[str, Any]] = None, + **dataset_cfg: DictConfig): + streams_dict = streams # build streams streams = None if streams_dict is not None: - streams = [] + streams: List = [] for _, stream in streams_dict.items(): # stream is the streams kwargs # fwd all kwargs with **stream allows streaming to check args @@ -298,7 +299,7 @@ def build_text_dataloader( ' To override this error, set the override_bos_token_id_mismatch_error flag to True in the dataset config section of the YAML.' 
) - streams = build_streams(cfg.dataset) + streams = build_streams(**cfg.dataset) # build dataset potentially with streams dataset = StreamingTextDataset( diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index 434705df51..38ae882faa 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -175,7 +175,8 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path): 'timeout': 0, }) - loader = build_finetuning_dataloader(cfg, tokenizer, + loader = build_finetuning_dataloader(**cfg, + tokenizer=tokenizer, device_batch_size=6).dataloader batch_ix = 0 From 1887ed0693cae0519a37170e4e9ecd869c27afcd Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 03:46:25 +0000 Subject: [PATCH 048/201] fix --- llmfoundry/data/finetuning/dataloader.py | 5 +- .../inference/test_convert_composer_to_hf.py | 12 ++--- tests/data/test_dataloader.py | 53 +++++++++++++------ tests/data/test_packing.py | 3 +- tests/fixtures/data.py | 6 +-- 5 files changed, 50 insertions(+), 29 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 60f093de96..b107ef7f76 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -546,8 +546,9 @@ def _build_collate_fn( tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) device_batch_size = 1 - dataloader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader + dataloader = build_finetuning_dataloader( + **cfg, tokenizer=tokenizer, + device_batch_size=device_batch_size).dataloader packing = cfg.dataset.get('packing_ratio') is not None diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 01603a63de..e2cab79c34 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -769,9 +769,9 @@ def test_huggingface_conversion_callback( ) train_dataloader = build_finetuning_dataloader( - dataloader_cfg, - tokenizer, - device_batch_size, + tokenizer=tokenizer, + device_batch_size=device_batch_size, + **dataloader_cfg, ) original_model = build_composer_model(model_cfg['name'], @@ -1138,9 +1138,9 @@ def test_mptmoe_huggingface_conversion_callback( ) train_dataloader = build_finetuning_dataloader( - dataloader_cfg, - tokenizer, - device_batch_size, + **dataloader_cfg, + tokenizer=tokenizer, + device_batch_size=device_batch_size, ) optimizer_config = { diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index c99ae6baf2..8ae460cfce 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -322,8 +322,9 @@ def test_finetuning_dataloader(use_chat_formatting: bool, if not decoder_only_format: expected_keys += ['decoder_attention_mask', 'decoder_input_ids'] - loader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader + loader = build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=device_batch_size, + **cfg).dataloader batch_ix = 0 for batch in loader: for k in expected_keys: @@ -368,7 +369,9 @@ def test_finetuning_dataloader_safe_load(hf_name: str, tokenizer = build_tokenizer('gpt2', {}) with expectation: - _ = build_finetuning_dataloader(cfg, tokenizer, 1) + _ = build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=1, + **cfg) # If no raised errors, we should expect downloaded files with only safe file types. 
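# Editor's note: illustrative sketch, not part of the patch. It mirrors how the
# updated tests below call the builder: the YAML dataloader block is unpacked as
# keyword arguments, with the tokenizer and device batch size passed explicitly.
# The dataset values and tokenizer name are hypothetical.
from omegaconf import DictConfig
from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from llmfoundry.utils.builders import build_tokenizer

cfg = DictConfig({
    'name': 'finetuning',
    'dataset': {
        'hf_name': 'tatsu-lab/alpaca',  # hypothetical example dataset
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'shuffle': True,
    },
    'drop_last': False,
    'num_workers': 0,
})
tokenizer = build_tokenizer('gpt2', {})  # hypothetical tokenizer choice
loader = build_finetuning_dataloader(
    tokenizer=tokenizer,
    device_batch_size=2,
    **cfg,
).dataloader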
if expectation == does_not_raise(): @@ -432,7 +435,9 @@ def test_finetuning_dataloader_small_data(dataset_size: int, match='Your dataset') with error_context: - _ = build_finetuning_dataloader(cfg, tokenizer, device_batch_size) + _ = build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=device_batch_size, + **cfg) if dist.get_global_rank() == 0: shutil.rmtree(tiny_dataset_folder_path) @@ -474,7 +479,9 @@ def test_finetuning_dataloader_custom_split(tmp_path: pathlib.Path, split: str): tokenizer_kwargs={'model_max_length': max_seq_len}, ) - _ = build_finetuning_dataloader(cfg, tokenizer, 4) + _ = build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=4, + **cfg) def mock_get_file(path: str, destination: str, overwrite: bool = False): @@ -519,7 +526,9 @@ def test_finetuning_dataloader_custom_split_remote(split: str): # Mock get_file to avoid downloading the file with patch('llmfoundry.data.finetuning.dataloader.get_file', wraps=mock_get_file) as f: - _ = build_finetuning_dataloader(cfg, tokenizer, 4) + _ = build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=4, + **cfg) for call in f.call_args_list: path_arg = call.kwargs['path'] dest_arg = call.kwargs['destination'] @@ -587,7 +596,9 @@ def test_finetuning_dataloader_streaming(pretokenize: bool, cfg = om.create(cfg) - dataloader = build_finetuning_dataloader(cfg, tokenizer, 2).dataloader + dataloader = build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=2, + **cfg).dataloader expected_keys = ['input_ids', 'labels'] for batch in dataloader: @@ -754,8 +765,9 @@ def test_malformed_data( match='Please specify exactly one.') with error_context: - dl = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader + dl = build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=device_batch_size, + **cfg).dataloader if not any(invalid_prompt_response_params): # +5 because we added samples with just bos/eos in each of prompt/response @@ -854,8 +866,9 @@ def test_malformed_conversation_data(tmp_path: pathlib.Path, match='Conversation roles must alternate') with error_context: - build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader + build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=device_batch_size, + **cfg).dataloader def test_finetune_dataloader_pure_pad_responses(): @@ -905,8 +918,9 @@ def pad_preprocessing_function( # type: ignore assert tokenizer('|PAD|').input_ids[0] == tokenizer.pad_token_id device_batch_size = 1 - dataloader = build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader + dataloader = build_finetuning_dataloader( + tokenizer=tokenizer, device_batch_size=device_batch_size, + **cfg).dataloader # We should be able to iterate through this dataset without crashing for i, batch in enumerate(dataloader): @@ -1037,7 +1051,9 @@ def test_token_counting_func_dataloader_setting( monkeypatch.setattr( 'llmfoundry.data.finetuning.tasks.DatasetConstructor.build_from_hf', lambda *args, **kwargs: []) - dl = build_finetuning_dataloader(cfg, gptt, batch_size) + dl = build_finetuning_dataloader(tokenizer=gptt, + device_batch_size=batch_size, + **cfg) elif dataloader_type == 'finetuning-streaming': cfg = DictConfig({ 'name': 'finetuning', @@ -1056,7 +1072,9 @@ def test_token_counting_func_dataloader_setting( monkeypatch.setattr( 'llmfoundry.data.finetuning.tasks.DatasetConstructor.build_from_streaming', lambda *args, **kwargs: []) - dl = build_finetuning_dataloader(cfg, gptt, batch_size) + dl = 
build_finetuning_dataloader(tokenizer=gptt, + device_batch_size=batch_size, + **cfg) elif dataloader_type == 'text': cfg = DictConfig({ 'name': 'text', @@ -1175,5 +1193,6 @@ def test_sharegpt_format(tmp_path: pathlib.Path, match='Conversation roles must alternate') with error_context: - build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader + build_finetuning_dataloader(tokenizer=tokenizer, + device_batch_size=device_batch_size, + **cfg).dataloader diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index 38ae882faa..3fe1cfa1b6 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -214,7 +214,8 @@ def test_packing_with_dataloader(packing_ratio: Any): 'timeout': 0, }) - loader = build_finetuning_dataloader(cfg, tokenizer, + loader = build_finetuning_dataloader(**cfg, + tokenizer=tokenizer, device_batch_size=6).dataloader assert isinstance(loader, DataLoader) diff --git a/tests/fixtures/data.py b/tests/fixtures/data.py index 9ba053ffe8..390be0c6c5 100644 --- a/tests/fixtures/data.py +++ b/tests/fixtures/data.py @@ -51,9 +51,9 @@ def tiny_ft_dataloader(tiny_ft_dataset_path: Path, }) dataloader = build_finetuning_dataloader( - dataloader_cfg, - mpt_tokenizer, - device_batch_size, + **dataloader_cfg, + tokenizer=mpt_tokenizer, + device_batch_size=device_batch_size, ).dataloader assert isinstance(dataloader, DataLoader) From 9fd912f615df94b6988bdbff200b0d8373309446 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 03:49:11 +0000 Subject: [PATCH 049/201] fix finetuning dataloader --- llmfoundry/data/finetuning/dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index b107ef7f76..b98dacf792 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -157,7 +157,7 @@ def build_finetuning_dataloader( dataset = None # for pyright sampler = None - if dataset_cfg.get('remote') is not None or cfg.dataset.get( + if dataset_cfg.get('remote') is not None or dataset_cfg.get( 'streams') is not None: # Build streaming dataloader streams = build_streams(**dataset_cfg) From 8cd9e653b21b0aa966a9535498535b99187867a8 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 14:30:26 +0000 Subject: [PATCH 050/201] fix build_text_dataloader --- llmfoundry/data/text_data.py | 39 ++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index a89312b500..496e1ea4c2 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -260,16 +260,25 @@ def build_streams(streams: Optional[Dict[str, Any]] = None, def build_text_dataloader( - cfg: DictConfig, + name: str, tokenizer: PreTrainedTokenizerBase, device_batch_size: int, + dataset: DictConfig, + drop_last: bool, + num_workers: int, + pin_memory: bool = True, + prefetch_factor: int = 2, + persistent_workers: bool = True, + timeout: int = 0, ) -> DataSpec: - assert cfg.name == 'text', f'Tried to build text dataloader with cfg.name={cfg.name}' + dataset_cfg = dataset + dataset = None + assert name == 'text', f'Tried to build text dataloader with cfg.name={name}' # get kwargs - mlm_probability = cfg.dataset.pop('mlm_probability', None) - eos_token_id = cfg.dataset.pop('eos_token_id', None) - bos_token_id = cfg.dataset.pop('bos_token_id', None) + mlm_probability = dataset_cfg.pop('mlm_probability', None) + eos_token_id = 
dataset_cfg.pop('eos_token_id', None) + bos_token_id = dataset_cfg.dataset.pop('bos_token_id', None) if eos_token_id is None and bos_token_id is None and (hasattr( tokenizer, 'eos_token_id') or hasattr(tokenizer, 'bos_token_id')): @@ -280,7 +289,7 @@ def build_text_dataloader( tokenizer_eos_token_id = getattr(tokenizer, 'eos_token_id', None) if eos_token_id is not None and eos_token_id != tokenizer_eos_token_id: eos_mismatch_str = f'Provided {eos_token_id=} does not match the eos_token_id of the tokenizer={tokenizer_eos_token_id}.' - if cfg.dataset.pop('override_eos_token_id_mismatch_error', False): + if dataset_cfg.pop('override_eos_token_id_mismatch_error', False): log.warning(eos_mismatch_str) else: raise ValueError( @@ -291,7 +300,7 @@ def build_text_dataloader( tokenizer_bos_token_id = getattr(tokenizer, 'bos_token_id', None) if bos_token_id is not None and bos_token_id != tokenizer_bos_token_id: bos_mismatch_str = f'Provided {bos_token_id=} does not match the bos_token_id of the tokenizer={tokenizer_bos_token_id}.' - if cfg.dataset.pop('override_bos_token_id_mismatch_error', False): + if dataset_cfg.pop('override_bos_token_id_mismatch_error', False): log.warning(bos_mismatch_str) else: raise ValueError( @@ -299,14 +308,14 @@ def build_text_dataloader( ' To override this error, set the override_bos_token_id_mismatch_error flag to True in the dataset config section of the YAML.' ) - streams = build_streams(**cfg.dataset) + streams = build_streams(**dataset_cfg) # build dataset potentially with streams dataset = StreamingTextDataset( tokenizer=tokenizer, streams=streams, batch_size=device_batch_size, - **cfg.dataset, + **dataset_cfg, ) collate_fn = transformers.DataCollatorForLanguageModeling( @@ -325,12 +334,12 @@ def build_text_dataloader( dataset, collate_fn=collate_fn, batch_size=device_batch_size, - drop_last=cfg.drop_last, - num_workers=cfg.num_workers, - pin_memory=cfg.get('pin_memory', True), - prefetch_factor=cfg.get('prefetch_factor', 2), - persistent_workers=cfg.get('persistent_workers', True), - timeout=cfg.get('timeout', 0), + drop_last=drop_last, + num_workers=num_workers, + pin_memory=pin_memory, + prefetch_factor=prefetch_factor, + persistent_workers=persistent_workers, + timeout=timeout, ) # If we pretokenized, we may not have padding, in which case the From 10485809752d50501a50082de3669d2662536476 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 14:40:42 +0000 Subject: [PATCH 051/201] left to my own devices --- llmfoundry/models/hf/hf_causal_lm.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index a5aeb60837..29ec6439e6 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -73,9 +73,18 @@ def __init__( use_train_metrics: bool = True, additional_train_metrics: Optional[List] = None, additional_eval_metrics: Optional[List] = None, + # ignored args name: Optional[str] = None, + device: Optional[Any] = None, ): + if device is not None: + warnings.warn( + 'device is deprecated and will be removed in a future release. 
' + + 'Please use init_device instead.', + DeprecationWarning, + ) + from llmfoundry.utils.builders import build_metric config_overrides = config_overrides or {} From de2f893c248ee286cb7f9d9b1e0477c8bfaa4b2d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 15:25:05 +0000 Subject: [PATCH 052/201] fix packing --- llmfoundry/data/finetuning/dataloader.py | 39 +++++++++++++++--------- llmfoundry/data/packing.py | 24 +++++++++------ scripts/misc/profile_packing.py | 5 ++- tests/data/test_packing.py | 8 +++-- 4 files changed, 47 insertions(+), 29 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index b98dacf792..e77ad9ae2f 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -147,12 +147,21 @@ def build_finetuning_dataloader( if tokenizer.pad_token is None: # type: ignore (sometimes it's none and that's ok) tokenizer.pad_token = tokenizer.eos_token + # this full config is necessary for properly profiling the packing ratio + dataloader_cfg = DictConfig({ + 'name': name, + 'dataset': dataset_cfg, + 'drop_last': drop_last, + 'num_workers': num_workers, + 'pin_memory': pin_memory, + 'prefetch_factor': prefetch_factor, + 'persistent_workers': persistent_workers, + 'timeout': timeout, + }) collate_fn, dataloader_batch_size = _build_collate_fn( - dataset_cfg=dataset_cfg, + dataloader_cfg=dataloader_cfg, tokenizer=tokenizer, device_batch_size=device_batch_size, - max_seq_len=dataset_cfg.max_seq_len, - decoder_only_format=dataset_cfg.decoder_only_format, ) dataset = None # for pyright @@ -439,17 +448,19 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: def _build_collate_fn( + dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int, - max_seq_len: int, - decoder_only_format: bool, - dataset_cfg: DictConfig, - max_leftover_bins_to_keep: Optional[int] = None, - packing_ratio: Optional[Union[float, str]] = None, - target_responses: str = _DEFAULT_TARGET_RESPONSES, - target_prompts: str = _DEFAULT_TARGET_PROMPTS, - allow_pad_trimming: bool = False, ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: + # these `.get` calls are safe because the dataset_cfg is validated for extra keys + dataset_cfg = dataloader_cfg.dataset + target_responses = dataset_cfg.get('target_responses', + _DEFAULT_TARGET_RESPONSES) + target_prompts = dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS) + max_seq_len = dataset_cfg.max_seq_len + decoder_only_format = dataset_cfg.decoder_only_format + allow_pad_trimming = dataset_cfg.get('allow_pad_trimming', False) + collate_fn = Seq2SeqFinetuningCollator( tokenizer=tokenizer, max_seq_len=max_seq_len, @@ -459,8 +470,8 @@ def _build_collate_fn( allow_pad_trimming=allow_pad_trimming, ) - packing_ratio = packing_ratio - max_leftover_bins_to_keep = max_leftover_bins_to_keep + packing_ratio = dataset_cfg.get('packing_ratio') + max_leftover_bins_to_keep = dataset_cfg.get('max_leftover_bins_to_keep') if packing_ratio is None: if max_leftover_bins_to_keep is not None: raise ValueError( @@ -470,7 +481,7 @@ def _build_collate_fn( return collate_fn, device_batch_size if packing_ratio == 'auto': - packing_ratio = auto_packing_ratio(dataset_config=dataset_cfg, + packing_ratio = auto_packing_ratio(dataloader_cfg=dataloader_cfg, tokenizer=tokenizer, device_batch_size=device_batch_size) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 4c6b465895..9f4908d709 100644 --- 
a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -290,7 +290,7 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): return batch -def auto_packing_ratio(dataset_config: DictConfig, +def auto_packing_ratio(dataloader_cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size: int, num_packing_ratios: int = 20) -> float: @@ -324,13 +324,14 @@ def auto_packing_ratio(dataset_config: DictConfig, reproducibility.seed_all(0) # If max_seq_len is very small, skip profiling and select packing ratio of 1. + dataset_config = dataloader_cfg.dataset max_seq_len = dataset_config.get('max_seq_len') if max_seq_len <= 100: return 1 min_ratio = 1 max_ratio = max_seq_len / 100 - profiling_results = profile_packing(dataset_config=dataset_config, + profiling_results = profile_packing(dataloader_cfg=dataloader_cfg, tokenizer=tokenizer, min_ratio=min_ratio, max_ratio=max_ratio, @@ -360,9 +361,12 @@ def auto_packing_ratio(dataset_config: DictConfig, def profile_packing( - dataset_config: DictConfig, tokenizer: PreTrainedTokenizerBase, - min_ratio: float, max_ratio: float, num_packing_ratios: int, - device_batch_size: int + dataloader_cfg: DictConfig, + tokenizer: PreTrainedTokenizerBase, + min_ratio: float, + max_ratio: float, + num_packing_ratios: int, + device_batch_size: int, ) -> Iterable[Tuple[float, Optional[float], Optional[float]]]: """Generator function that profiles example packing across packing ratios. @@ -381,13 +385,13 @@ def profile_packing( from llmfoundry.data.dataloader import build_dataloader - max_seq_len = dataloader_cfg.dataset.get('max_seq_len') - max_leftovers_to_keep = dataloader_cfg.dataset.get('max_leftovers_to_keep', - None) + dataset_cfg = dataloader_cfg.dataset + max_seq_len = dataset_cfg.get('max_seq_len') + max_leftovers_to_keep = dataset_cfg.get('max_leftovers_to_keep', None) # Turn off packing for the dataloader (we want raw, pre-packed examples) - dataloader_cfg = DictConfig({ - 'dataset': copy.deepcopy(dataset_config), + dataloader_cfg = copy.deepcopy(dataloader_cfg) + dataloader_cfg.update({ 'drop_last': False, 'num_workers': 0, 'prefetch_factor': None, diff --git a/scripts/misc/profile_packing.py b/scripts/misc/profile_packing.py index 8351069535..fff10d158b 100644 --- a/scripts/misc/profile_packing.py +++ b/scripts/misc/profile_packing.py @@ -88,9 +88,8 @@ def parse_args() -> Namespace: tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - results = profile_packing(dataloader_cfg.dataset, tokenizer, args.min, - args.max, args.num_packing_ratios, - device_batch_size) + results = profile_packing(dataloader_cfg, tokenizer, args.min, args.max, + args.num_packing_ratios, device_batch_size) header = '\n\n\n packing_ratio | % PADDING | % WASTE' fstr = ' {:5.1f} | {:5.2f}% | {:6.2f}%' diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index 3fe1cfa1b6..bcf5835ca5 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -107,7 +107,9 @@ def test_auto_packing(profile_packing: Mock): profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] packing_ratio = auto_packing_ratio( - dataset_config=DictConfig({'max_seq_len': 2048}), + dataset_config=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), tokenizer=None, device_batch_size=1, ) # Dummy values, profiling results are already set. 
@@ -132,7 +134,9 @@ def test_dist_auto_packing(profile_packing: Mock): (3, .7, .5)] # should pick 2 packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'max_seq_len': 2048}), + dataloader_cfg=DictConfig({'dataset': { + 'max_seq_len': 2048 + }}), tokenizer=None, device_batch_size=1, ) # Dummy values, profiling results are already set. From 98690578b6f1472ee896e33da0241464db6d6fdc Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 15:33:27 +0000 Subject: [PATCH 053/201] fix typo --- llmfoundry/data/text_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 496e1ea4c2..a68d6c345f 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -278,7 +278,7 @@ def build_text_dataloader( # get kwargs mlm_probability = dataset_cfg.pop('mlm_probability', None) eos_token_id = dataset_cfg.pop('eos_token_id', None) - bos_token_id = dataset_cfg.dataset.pop('bos_token_id', None) + bos_token_id = dataset_cfg.pop('bos_token_id', None) if eos_token_id is None and bos_token_id is None and (hasattr( tokenizer, 'eos_token_id') or hasattr(tokenizer, 'bos_token_id')): From e2fdf06da16818717e24681173bfaca1a6f60237 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 15:38:18 +0000 Subject: [PATCH 054/201] fix padding test cases --- llmfoundry/data/text_data.py | 4 +++- tests/data/test_dataloader.py | 10 ++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index a68d6c345f..4c11632530 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -455,7 +455,9 @@ def get_num_samples_in_batch(batch: Batch) -> int: tokenizer_kwargs = {'model_max_length': args.max_seq_len} tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) - loader = build_text_dataloader(cfg, tokenizer, device_batch_size).dataloader + loader = build_text_dataloader( + **cfg, tokenizer=tokenizer, + device_batch_size=device_batch_size).dataloader assert isinstance(loader, DataLoader) assert isinstance(loader.dataset, StreamingTextDataset) tokenizer = loader.dataset.tokenizer diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 8ae460cfce..aad588f1a9 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -227,9 +227,9 @@ def test_correct_padding(tokenizer_name: str, # Dataloaders eval_loader = build_text_dataloader( - test_cfg.eval_loader, - tokenizer, - batch_size, + **test_cfg.eval_loader, + tokenizer=tokenizer, + device_batch_size=batch_size, ).dataloader batch = next(iter(eval_loader)) @@ -1092,7 +1092,9 @@ def test_token_counting_func_dataloader_setting( ds_mock.tokenizer = gptt monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', lambda *args, **kwargs: ds_mock) - dl = build_text_dataloader(cfg, gptt, batch_size) + dl = build_text_dataloader(**cfg, + tokenizer=gptt, + device_batch_size=batch_size) else: raise NotImplementedError() From 4ee17f0f011f0ad958cad41ab81e508c70cff02a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 15:53:43 +0000 Subject: [PATCH 055/201] ignore extra parameters and warn --- llmfoundry/data/text_data.py | 6 ++++++ scripts/train/train.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 4c11632530..a4067ec553 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -5,6 +5,7 @@ import 
logging import os +import warnings from itertools import islice from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, Union, cast) @@ -270,7 +271,12 @@ def build_text_dataloader( prefetch_factor: int = 2, persistent_workers: bool = True, timeout: int = 0, + **kwargs: Dict[str, Any], ) -> DataSpec: + for kwarg in kwargs.keys(): + warnings.warn( + f'Unused parameter `{kwarg}` passed to build_text_dataloader. This parameter is ignored. In future releases, this will raise an error.', + DeprecationWarning) dataset_cfg = dataset dataset = None assert name == 'text', f'Tried to build text dataloader with cfg.name={name}' diff --git a/scripts/train/train.py b/scripts/train/train.py index e2f8c74dcb..aaeb38617a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -271,7 +271,7 @@ def main(cfg: DictConfig) -> Trainer: # Initialize pytorch distributed training process groups dist_timeout: Union[int, float] = scfg.dist_timeout - dist.initialize_dist(get_device(None), timeout=dist_timeout) + # dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs model_config: DictConfig = DictConfig(scfg.model) From d06e357a57dec964ba7a4c816e6fc681fdd9aac3 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 16:17:05 +0000 Subject: [PATCH 056/201] fix style --- llmfoundry/data/text_data.py | 13 +++++++------ llmfoundry/registry.py | 6 +++--- scripts/train/train.py | 4 ++-- tests/data/test_packing.py | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index a4067ec553..da0948213c 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -246,18 +246,19 @@ def get_sequence_id_from_batch( return torch.cat([left_zeros, cumulative_sep[:, :-1]], dim=1) -def build_streams(streams: Optional[Dict[str, Any]] = None, - **dataset_cfg: DictConfig): +def build_streams( + streams: Optional[Dict[str, Any]] = None, + **dataset_cfg_rest: DictConfig # unused +): streams_dict = streams # build streams - streams = None if streams_dict is not None: - streams: List = [] + streams_ret: List = [] for _, stream in streams_dict.items(): # stream is the streams kwargs # fwd all kwargs with **stream allows streaming to check args - streams.append(Stream(**stream)) - return streams + streams_ret.append(Stream(**stream)) + return streams_ret def build_text_dataloader( diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 6e1824ea08..5aa0b93208 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -6,10 +6,8 @@ from composer.loggers import LoggerDestination from composer.models import ComposerModel from composer.optim import ComposerScheduler -from omegaconf import DictConfig from torch.optim import Optimizer from torchmetrics import Metric -from transformers import PreTrainedTokenizerBase from llmfoundry.interfaces import CallbackWithConfig from llmfoundry.layers_registry import (attention_classes, @@ -109,7 +107,9 @@ dataloaders = create_registry( 'llmfoundry', 'dataloaders', - generic_type=Callable[[DictConfig, PreTrainedTokenizerBase, int], DataSpec], + generic_type=Callable[ + ..., + DataSpec], # the arguments to the dataloader may vary depending on the contents of the config. 
entry_points=True, description=_dataloaders_description) diff --git a/scripts/train/train.py b/scripts/train/train.py index aaeb38617a..fb14d6d358 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -120,7 +120,7 @@ def validate_config(cfg: TrainConfig): eval_loader = cfg.eval_loader if isinstance(cfg.eval_loaders, list) or isinstance( cfg.eval_loaders, ListConfig): - for loader in cfg.eval_loaders: + for loader in (cfg.eval_loaders or []): # pyright if 'label' not in loader or loader['label'] is None: raise ValueError( 'When specifying multiple evaluation datasets, each one must include the \ @@ -271,7 +271,7 @@ def main(cfg: DictConfig) -> Trainer: # Initialize pytorch distributed training process groups dist_timeout: Union[int, float] = scfg.dist_timeout - # dist.initialize_dist(get_device(None), timeout=dist_timeout) + dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs model_config: DictConfig = DictConfig(scfg.model) diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index bcf5835ca5..7e4c04586a 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -107,7 +107,7 @@ def test_auto_packing(profile_packing: Mock): profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] packing_ratio = auto_packing_ratio( - dataset_config=DictConfig({'dataset': { + dataloader_cfg=DictConfig({'dataset': { 'max_seq_len': 2048 }}), tokenizer=None, From b8fd65d0576a49ae6816a47eccf879b61354716e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 16:36:46 +0000 Subject: [PATCH 057/201] fix quality checks --- llmfoundry/data/dataloader.py | 4 +++- llmfoundry/data/text_data.py | 9 ++++----- llmfoundry/utils/builders.py | 2 +- scripts/eval/eval.py | 4 ++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 2e14dde27c..b4355eafc0 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -3,6 +3,8 @@ """Dataloader builder utilities.""" +from typing import Any, Dict + from composer import DataSpec from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase @@ -21,7 +23,7 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, device_batch_size (int): The size of the batches (number of examples) that the dataloader will produce. """ - kwargs = { + kwargs: Dict[str, Any] = { **cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size } diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index da0948213c..0ff9100f5e 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -252,8 +252,8 @@ def build_streams( ): streams_dict = streams # build streams + streams_ret: List = [] if streams_dict is not None: - streams_ret: List = [] for _, stream in streams_dict.items(): # stream is the streams kwargs # fwd all kwargs with **stream allows streaming to check args @@ -279,7 +279,6 @@ def build_text_dataloader( f'Unused parameter `{kwarg}` passed to build_text_dataloader. This parameter is ignored. 
In future releases, this will raise an error.', DeprecationWarning) dataset_cfg = dataset - dataset = None assert name == 'text', f'Tried to build text dataloader with cfg.name={name}' # get kwargs @@ -318,7 +317,7 @@ def build_text_dataloader( streams = build_streams(**dataset_cfg) # build dataset potentially with streams - dataset = StreamingTextDataset( + text_dataset = StreamingTextDataset( tokenizer=tokenizer, streams=streams, batch_size=device_batch_size, @@ -326,7 +325,7 @@ def build_text_dataloader( ) collate_fn = transformers.DataCollatorForLanguageModeling( - tokenizer=dataset.tokenizer, + tokenizer=text_dataset.tokenizer, mlm=mlm_probability is not None, mlm_probability=mlm_probability) @@ -338,7 +337,7 @@ def build_text_dataloader( bos_token_id=bos_token_id) dl = DataLoader( - dataset, + text_dataset, collate_fn=collate_fn, batch_size=device_batch_size, drop_last=drop_last, diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index bef2c73be4..483c466ddc 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -185,7 +185,7 @@ def build_composer_model( init_context = contextlib.nullcontext() if isinstance(cfg, DictConfig): - cfg = om.to_container(cfg, resolve=True) + cfg: Dict[str, Any] = om.to_container(cfg, resolve=True) with init_context: model = construct_from_registry( diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 646ab700a9..120119100d 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -53,7 +53,7 @@ def evaluate_model( eval_gauntlet_df: Optional[pd.DataFrame], eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], - callback_configs: Optional[Dict[str, Any]], + callback_configs: Optional[Union[Dict[str, Any], List]], metadata: Optional[Dict[str, str]], logged_config: DictConfig, should_log_config: bool = True, @@ -269,7 +269,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: metadata = scfg.metadata should_log_config = scfg.log_config - callback_configs = om.to_container( + callback_configs: Dict[str, Any] = om.to_container( scfg.callbacks) if scfg.callbacks else None # Warn for unused parameters From 01b7419f7ccd7e6628437228844e3927ec97ba61 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 17:04:13 +0000 Subject: [PATCH 058/201] fix code quality --- llmfoundry/data/dataloader.py | 3 ++- llmfoundry/data/finetuning/dataloader.py | 14 +++++++------- llmfoundry/utils/builders.py | 12 +++++++++--- scripts/eval/eval.py | 9 ++++++--- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index b4355eafc0..4f82861f80 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -24,7 +24,8 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, that the dataloader will produce. 
""" kwargs: Dict[str, Any] = { - **cfg, 'tokenizer': tokenizer, + **{str(k): v for k, v in cfg.items()}, # pyright + 'tokenizer': tokenizer, 'device_batch_size': device_batch_size } diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index e77ad9ae2f..88827c7c2d 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -164,7 +164,7 @@ def build_finetuning_dataloader( device_batch_size=device_batch_size, ) - dataset = None # for pyright + hf_dataset = None # for pyright sampler = None if dataset_cfg.get('remote') is not None or dataset_cfg.get( 'streams') is not None: @@ -224,7 +224,7 @@ def build_finetuning_dataloader( proto_preprocessing_fn, dataset_name_or_path) # Build dataset from HF. - dataset = dataset_constructor.build_from_hf( + hf_dataset = dataset_constructor.build_from_hf( dataset_name=dataset_name_or_path, split=split, safe_load=dataset_cfg.get('safe_load', False), @@ -242,8 +242,8 @@ def build_finetuning_dataloader( if drop_last: world_size = dist.get_world_size() minimum_dataset_size = world_size * dataloader_batch_size - if hasattr(dataset, '__len__'): - full_dataset_size = len(dataset) + if hasattr(hf_dataset, '__len__'): + full_dataset_size = len(hf_dataset) if full_dataset_size < minimum_dataset_size: raise NotEnoughDatasetSamplesError( dataset_name=dataset_cfg.hf_name, @@ -253,13 +253,13 @@ def build_finetuning_dataloader( full_dataset_size=full_dataset_size, minimum_dataset_size=minimum_dataset_size) # Initialize sampler. - sampler = dist.get_sampler(dataset, + sampler = dist.get_sampler(hf_dataset, drop_last=drop_last, shuffle=dataset_cfg.shuffle) - assert dataset is not None # for pyright + assert hf_dataset is not None # for pyright dl = DataLoader( - dataset, + hf_dataset, collate_fn=collate_fn, batch_size=dataloader_batch_size, drop_last=drop_last, diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 483c466ddc..193fd010e7 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -164,7 +164,7 @@ def build_icl_data_and_gauntlet( def build_composer_model( name: str, - cfg: Union[Dict, DictConfig], + cfg: Union[Dict[str, Any], DictConfig], tokenizer: PreTrainedTokenizerBase, init_context: Optional[ContextManager] = None, master_weights_dtype: Optional[str] = None, @@ -185,7 +185,13 @@ def build_composer_model( init_context = contextlib.nullcontext() if isinstance(cfg, DictConfig): - cfg: Dict[str, Any] = om.to_container(cfg, resolve=True) + model_cfg = om.to_container(cfg, resolve=True) + assert isinstance(model_cfg, Dict[str, Any]) + elif isinstance(cfg, Dict[str, Any]): + model_cfg = cfg + else: + raise ValueError( + f'Invalid type for cfg: {type(cfg)}. 
Must be DictConfig or Dict.') with init_context: model = construct_from_registry( @@ -194,7 +200,7 @@ def build_composer_model( pre_validation_function=ComposerModel, post_validation_function=None, kwargs={ - **cfg, 'tokenizer': tokenizer + **model_cfg, 'tokenizer': tokenizer }, ) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 120119100d..3a6e7cd449 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -53,7 +53,7 @@ def evaluate_model( eval_gauntlet_df: Optional[pd.DataFrame], eval_subset_num_batches: int, icl_subset_num_batches: Optional[int], - callback_configs: Optional[Union[Dict[str, Any], List]], + callback_configs: Optional[Dict[str, Any]], metadata: Optional[Dict[str, str]], logged_config: DictConfig, should_log_config: bool = True, @@ -269,8 +269,11 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: metadata = scfg.metadata should_log_config = scfg.log_config - callback_configs: Dict[str, Any] = om.to_container( - scfg.callbacks) if scfg.callbacks else None + callback_configs = om.to_container(scfg.callbacks) if scfg.callbacks else [] + + if callback_configs is not None: + assert isinstance(callback_configs, + Dict[str, Any]), 'callbacks must be a Dict' # pyright # Warn for unused parameters for key in cfg: From d986503815f41111fe778952e051caec10ecd3bb Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 17:27:02 +0000 Subject: [PATCH 059/201] pyright-fu --- llmfoundry/data/finetuning/dataloader.py | 16 ++++++++-------- llmfoundry/utils/builders.py | 17 +++++++++++++---- scripts/eval/eval.py | 4 ++-- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 88827c7c2d..f2610920e2 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -164,7 +164,7 @@ def build_finetuning_dataloader( device_batch_size=device_batch_size, ) - hf_dataset = None # for pyright + streaming_dataset = None # for pyright sampler = None if dataset_cfg.get('remote') is not None or dataset_cfg.get( 'streams') is not None: @@ -172,7 +172,7 @@ def build_finetuning_dataloader( streams = build_streams(**dataset_cfg) # note: we don't need to use ** here because we're setting default values for almost all arguments - dataset = dataset_constructor.build_from_streaming( + streaming_dataset = dataset_constructor.build_from_streaming( tokenizer=tokenizer, streams=streams, local=dataset_cfg.get('local', None), @@ -224,7 +224,7 @@ def build_finetuning_dataloader( proto_preprocessing_fn, dataset_name_or_path) # Build dataset from HF. - hf_dataset = dataset_constructor.build_from_hf( + streaming_dataset = dataset_constructor.build_from_hf( dataset_name=dataset_name_or_path, split=split, safe_load=dataset_cfg.get('safe_load', False), @@ -242,8 +242,8 @@ def build_finetuning_dataloader( if drop_last: world_size = dist.get_world_size() minimum_dataset_size = world_size * dataloader_batch_size - if hasattr(hf_dataset, '__len__'): - full_dataset_size = len(hf_dataset) + if hasattr(streaming_dataset, '__len__'): + full_dataset_size = len(streaming_dataset) if full_dataset_size < minimum_dataset_size: raise NotEnoughDatasetSamplesError( dataset_name=dataset_cfg.hf_name, @@ -253,13 +253,13 @@ def build_finetuning_dataloader( full_dataset_size=full_dataset_size, minimum_dataset_size=minimum_dataset_size) # Initialize sampler. 
- sampler = dist.get_sampler(hf_dataset, + sampler = dist.get_sampler(streaming_dataset, drop_last=drop_last, shuffle=dataset_cfg.shuffle) - assert hf_dataset is not None # for pyright + assert streaming_dataset is not None # for pyright dl = DataLoader( - hf_dataset, + streaming_dataset, collate_fn=collate_fn, batch_size=dataloader_batch_size, drop_last=drop_last, diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 193fd010e7..151014d9f1 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -162,6 +162,15 @@ def build_icl_data_and_gauntlet( return icl_evaluators, logger_keys, eval_gauntlet_cb +def _is_string_keyed_dict(d: dict) -> bool: + return isinstance(d, dict) and all(isinstance(k, str) for k in d.keys()) + + +def _string_keyed_dict(d: dict) -> Dict[str, Any]: + assert all(isinstance(k, str) for k in d.keys()) + return {str(k): v for k, v in d.items()} + + def build_composer_model( name: str, cfg: Union[Dict[str, Any], DictConfig], @@ -185,10 +194,10 @@ def build_composer_model( init_context = contextlib.nullcontext() if isinstance(cfg, DictConfig): - model_cfg = om.to_container(cfg, resolve=True) - assert isinstance(model_cfg, Dict[str, Any]) - elif isinstance(cfg, Dict[str, Any]): - model_cfg = cfg + cfg = om.to_container(cfg, resolve=True) + model_cfg = _string_keyed_dict(model_cfg) # pyright + elif _is_string_keyed_dict(cfg): + model_cfg = _string_keyed_dict(cfg) # pyright else: raise ValueError( f'Invalid type for cfg: {type(cfg)}. Must be DictConfig or Dict.') diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 3a6e7cd449..7a3a56b36f 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -272,8 +272,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: callback_configs = om.to_container(scfg.callbacks) if scfg.callbacks else [] if callback_configs is not None: - assert isinstance(callback_configs, - Dict[str, Any]), 'callbacks must be a Dict' # pyright + assert isinstance(callback_configs, dict) + callback_configs = {str(k): v for k, v in callback_configs.items()} # Warn for unused parameters for key in cfg: From 41fbe285f0befbde72cfcd07ba3ddf7ba0a42b10 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 17:34:53 +0000 Subject: [PATCH 060/201] fix --- llmfoundry/utils/builders.py | 4 ++-- llmfoundry/utils/config_utils.py | 10 ---------- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 151014d9f1..3cd2e2934d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -194,8 +194,8 @@ def build_composer_model( init_context = contextlib.nullcontext() if isinstance(cfg, DictConfig): - cfg = om.to_container(cfg, resolve=True) - model_cfg = _string_keyed_dict(model_cfg) # pyright + container_cfg = om.to_container(cfg, resolve=True) + model_cfg = _string_keyed_dict(container_cfg) # pyright elif _is_string_keyed_dict(cfg): model_cfg = _string_keyed_dict(cfg) # pyright else: diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 16ae1aafee..a4fd005c3a 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -25,16 +25,6 @@ ] -def convert_to_dict(value: Optional[Union[ListConfig, DictConfig]]) -> Any: - if value is None: - return None - if not isinstance(value, DictConfig) and not isinstance(value, ListConfig): - raise ValueError( - f'The value {value} is of type {type(value)} that cannot be \ - converted to a dict or list. 
Please check your yaml.') - return om.to_container(value) - - def pop_config(cfg: DictConfig, key: str, must_exist: bool = True, From d7302707d101e591937799d939398f429cb60dc9 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 17:44:45 +0000 Subject: [PATCH 061/201] just one more type constraint bro --- llmfoundry/utils/builders.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 3cd2e2934d..a66c331703 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -195,6 +195,7 @@ def build_composer_model( if isinstance(cfg, DictConfig): container_cfg = om.to_container(cfg, resolve=True) + assert isinstance(container_cfg, dict) # pyright model_cfg = _string_keyed_dict(container_cfg) # pyright elif _is_string_keyed_dict(cfg): model_cfg = _string_keyed_dict(cfg) # pyright From 0fbb3c63bb5d3a177d56a8a13839e0b601792720 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 17:56:05 +0000 Subject: [PATCH 062/201] OmegaConf -> om --- scripts/train/train.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index fb14d6d358..fe9704578d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -17,7 +17,6 @@ cyclic_schedule) from composer.utils import dist, get_device, reproducibility from omegaconf import DictConfig, ListConfig -from omegaconf import OmegaConf from omegaconf import OmegaConf as om from rich.traceback import install @@ -226,7 +225,7 @@ def main(cfg: DictConfig) -> Trainer: cfg = update_batch_size_info(cfg) logged_cfg.update(cfg, merge=True) - scfg: TrainConfig = OmegaConf.structured( + scfg: TrainConfig = om.structured( TrainConfig(**cfg) ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) From da962d3f16eee449e74dbad8fd441ea3d1ff194a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 17:59:49 +0000 Subject: [PATCH 063/201] rename variables for clarity --- scripts/train/train.py | 211 ++++++++++++++++++++++------------------- 1 file changed, 111 insertions(+), 100 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index fe9704578d..5315db7233 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -112,14 +112,14 @@ class TrainConfig: TRAIN_CONFIG_KEYS = set(field.name for field in fields(TrainConfig)) -def validate_config(cfg: TrainConfig): +def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" - loaders = [cfg.train_loader] - if cfg.eval_loader is not None or cfg.eval_loaders is not None: - eval_loader = cfg.eval_loader - if isinstance(cfg.eval_loaders, list) or isinstance( - cfg.eval_loaders, ListConfig): - for loader in (cfg.eval_loaders or []): # pyright + loaders = [train_config.train_loader] + if train_config.eval_loader is not None or train_config.eval_loaders is not None: + eval_loader = train_config.eval_loader + if isinstance(train_config.eval_loaders, list) or isinstance( + train_config.eval_loaders, ListConfig): + for loader in (train_config.eval_loaders or []): # pyright if 'label' not in loader or loader['label'] is None: raise ValueError( 'When specifying multiple evaluation datasets, each one must include the \ @@ -130,29 +130,30 @@ def validate_config(cfg: TrainConfig): loaders.append(eval_loader) for loader in loaders: if loader['name'] == 'text': - if cfg.model['name'] == 'hf_t5': + if train_config.model['name'] == 'hf_t5': raise ValueError( - 
f'Model type "{cfg.model["name"]}" is not supported when using the "text " ' +\ + f'Model type "{train_config.model["name"]}" is not supported when using the "text " ' +\ f'dataloader. Only finetuning is supported.') - if cfg.icl_tasks is not None or cfg.icl_tasks_str is not None: - if cfg.model['name'] == 'hf_t5': + if train_config.icl_tasks is not None or train_config.icl_tasks_str is not None: + if train_config.model['name'] == 'hf_t5': raise ValueError( 'ICL evaluation does not currently support Encoder-Decoder models, such as "hf_t5".' ) - if (cfg.model.get('fc_type', 'torch') != 'te' and 'te' not in cfg.model.get( - 'ffn_config', {}).get('ffn_type', 'mptmlp') and - 'fp8' in cfg.precision): + if (train_config.model.get('fc_type', 'torch') != 'te' and + 'te' not in train_config.model.get('ffn_config', {}).get( + 'ffn_type', 'mptmlp') and 'fp8' in train_config.precision): warnings.warn( "fp8 only supported for te.Linear layers. Either set `cfg.model.fc_typ='te'` or " + "`cfg.model.ffn_config.ffn_type='te_ln_mlp'` to enable layers using fp8 precision." ) - if (cfg.model.get('fc_type', 'torch') == 'te' or - 'te' in cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp')): - fsdp_config = cfg.fsdp_config or DictConfig({}) + if (train_config.model.get('fc_type', 'torch') == 'te' or + 'te' in train_config.model.get('ffn_config', {}).get( + 'ffn_type', 'mptmlp')): + fsdp_config = train_config.fsdp_config or DictConfig({}) act_ckpt = fsdp_config.get('activation_checkpointing', False) if fsdp_config else False act_ckpt_reentrant = fsdp_config.get( @@ -163,27 +164,31 @@ def validate_config(cfg: TrainConfig): + '`activation_checkpointing_reentrant = True`. ' + 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' ) - if cfg.fsdp_config is not None: - cfg.fsdp_config['activation_checkpointing_reentrant'] = False + if train_config.fsdp_config is not None: + train_config.fsdp_config[ + 'activation_checkpointing_reentrant'] = False - if cfg.model.get('ffn_config', {}).get('ffn_type', 'mptmlp') == 'te_ln_mlp': + if train_config.model.get('ffn_config', {}).get('ffn_type', + 'mptmlp') == 'te_ln_mlp': warnings.warn( '`te.LayerNormMLP` requires has issues with torch._dynamo. ' + 'Setting `torch._dynamo.config.suppress_errors = True` and falling back to eager.' ) torch._dynamo.config.suppress_errors = True # type: ignore (third-party) - if cfg.model.get('load_in_8bit', False): + if train_config.model.get('load_in_8bit', False): raise ValueError( '`load_in_8bit` is only supported for evaluation rather than training.' ) - if cfg.model.get('ffn_config', {}).get('ffn_type', - 'mptmlp') in ffns_with_megablocks: - moe_world_size = cfg.model.get('ffn_config', - {}).get('moe_world_size', 1) - use_orig_params = cfg.fsdp_config.get( - 'use_orig_params', True) if cfg.fsdp_config is not None else True + if train_config.model.get('ffn_config', + {}).get('ffn_type', + 'mptmlp') in ffns_with_megablocks: + moe_world_size = train_config.model.get('ffn_config', + {}).get('moe_world_size', 1) + use_orig_params = train_config.fsdp_config.get( + 'use_orig_params', + True) if train_config.fsdp_config is not None else True if moe_world_size > 1 and not use_orig_params: raise ValueError( f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.' 
@@ -191,45 +196,49 @@ def validate_config(cfg: TrainConfig): def main(cfg: DictConfig) -> Trainer: + unstructured_config = cfg # Resolve all interpolation variables as early as possible - om.resolve(cfg) + om.resolve(unstructured_config) # Structured config does not support unions of containers, so separate single and plural containers - if (loader := cfg.get('eval_loader', None)) is not None: + if (loader := unstructured_config.get('eval_loader', None)) is not None: if isinstance(loader, ListConfig): - cfg['eval_loaders'] = list(cfg.pop('eval_loader')) - if (tasks := cfg.get('icl_tasks', None)) is not None: + unstructured_config['eval_loaders'] = list( + unstructured_config.pop('eval_loader')) + if (tasks := unstructured_config.get('icl_tasks', None)) is not None: if isinstance(tasks, str): - cfg['icl_tasks_str'] = cfg.pop('icl_tasks') - if (gauntlet := cfg.get('eval_gauntlet', None)) is not None: + unstructured_config['icl_tasks_str'] = unstructured_config.pop( + 'icl_tasks') + if (gauntlet := unstructured_config.get('eval_gauntlet', None)) is not None: if isinstance(gauntlet, str): - cfg['eval_gauntlet_str'] = cfg.pop('eval_gauntlet') + unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( + 'eval_gauntlet') - arg_config_keys = set(cfg.keys()) + arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, TRAIN_CONFIG_KEYS) - if 'variables' not in cfg: - cfg['variables'] = {} + if 'variables' not in unstructured_config: + unstructured_config['variables'] = {} for key in extraneous_keys: warnings.warn( f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes.' ) # TODO (milo): delete the below line once we deprecate variables at the top level. 
- cfg['variables'][key] = cfg.pop(key) + unstructured_config['variables'][key] = unstructured_config.pop(key) # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(cfg) + logged_cfg: DictConfig = copy.deepcopy(unstructured_config) # Get global and device batch size information from distributed/single node setting - cfg = update_batch_size_info(cfg) - logged_cfg.update(cfg, merge=True) + unstructured_config = update_batch_size_info(unstructured_config) + logged_cfg.update(unstructured_config, merge=True) - scfg: TrainConfig = om.structured( - TrainConfig(**cfg) + train_cfg: TrainConfig = om.structured( + TrainConfig(**unstructured_config) ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) - code_paths = scfg.code_paths if scfg.code_paths else [] + code_paths = train_cfg.code_paths if train_cfg.code_paths else [] # Import any user provided code for code_path in code_paths: import_file(code_path) @@ -243,16 +252,16 @@ def main(cfg: DictConfig) -> Trainer: ) # Check for incompatibilities between the model and data loaders - validate_config(scfg) + validate_config(train_cfg) cuda_alloc_conf = [] # Get max split size mb - max_split_size_mb: Optional[int] = scfg.max_split_size_mb + max_split_size_mb: Optional[int] = train_cfg.max_split_size_mb if max_split_size_mb is not None: cuda_alloc_conf.append(f'max_split_size_mb:{max_split_size_mb}') # Expandable segments - if scfg.expandable_segments: + if train_cfg.expandable_segments: cuda_alloc_conf.append('expandable_segments:True') if len(cuda_alloc_conf) > 0: @@ -260,82 +269,84 @@ def main(cfg: DictConfig) -> Trainer: # Set CUDA lazy loading # This can save a bit of memory if not all modules are needed - cuda_load_lazy: bool = scfg.cuda_load_lazy + cuda_load_lazy: bool = train_cfg.cuda_load_lazy if cuda_load_lazy: os.environ['CUDA_MODULE_LOADING'] = 'LAZY' # Set seed first - seed: int = scfg.seed + seed: int = train_cfg.seed reproducibility.seed_all(seed) # Initialize pytorch distributed training process groups - dist_timeout: Union[int, float] = scfg.dist_timeout + dist_timeout: Union[int, float] = train_cfg.dist_timeout dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - model_config: DictConfig = DictConfig(scfg.model) - tokenizer_config: Dict[str, Any] = {**scfg.tokenizer} - optimizer_config: Dict[str, Any] = {**scfg.optimizer} - scheduler_config: Dict[str, Any] = {**scfg.scheduler} - train_loader_config: DictConfig = DictConfig(scfg.train_loader) + model_config: DictConfig = DictConfig(train_cfg.model) + tokenizer_config: Dict[str, Any] = {**train_cfg.tokenizer} + optimizer_config: Dict[str, Any] = {**train_cfg.optimizer} + scheduler_config: Dict[str, Any] = {**train_cfg.scheduler} + train_loader_config: DictConfig = DictConfig(train_cfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = scfg.fsdp_config + fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config - if scfg.eval_loader is not None and scfg.eval_loaders is not None: + if train_cfg.eval_loader is not None and train_cfg.eval_loaders is not None: raise ValueError( 'Only one of `eval_loader` or `eval_loaders` should be provided.') eval_loader_config: Optional[Union[DictConfig, ListConfig]] = DictConfig( - scfg.eval_loader) if scfg.eval_loader is not None else ListConfig( - scfg.eval_loaders) if scfg.eval_loaders is not None else None + train_cfg.eval_loader + ) if train_cfg.eval_loader is not None else ListConfig( + 
train_cfg.eval_loaders) if train_cfg.eval_loaders is not None else None icl_tasks_config: Optional[Union[ListConfig, str]] = ListConfig( - scfg.icl_tasks) if scfg.icl_tasks is not None else scfg.icl_tasks_str + train_cfg.icl_tasks + ) if train_cfg.icl_tasks is not None else train_cfg.icl_tasks_str eval_gauntlet_config: Optional[Union[DictConfig, str]] = DictConfig( - scfg.eval_gauntlet - ) if scfg.eval_gauntlet is not None else scfg.eval_gauntlet_str - icl_subset_num_batches: Optional[int] = scfg.icl_subset_num_batches - icl_seq_len: Optional[int] = scfg.icl_seq_len + train_cfg.eval_gauntlet + ) if train_cfg.eval_gauntlet is not None else train_cfg.eval_gauntlet_str + icl_subset_num_batches: Optional[int] = train_cfg.icl_subset_num_batches + icl_seq_len: Optional[int] = train_cfg.icl_seq_len # Optional logging, evaluation and callback configs - logger_configs: Optional[Dict[str, Any]] = scfg.loggers - callback_configs: Optional[Dict[str, Any]] = scfg.callbacks - algorithm_configs: Optional[Dict[str, Any]] = scfg.algorithms + logger_configs: Optional[Dict[str, Any]] = train_cfg.loggers + callback_configs: Optional[Dict[str, Any]] = train_cfg.callbacks + algorithm_configs: Optional[Dict[str, Any]] = train_cfg.algorithms # Mandatory hyperparameters for training - device_train_batch_size: int = scfg.device_train_batch_size - device_eval_batch_size: int = scfg.device_eval_batch_size - max_duration: Union[int, str] = scfg.max_duration - eval_interval: Union[int, str] = scfg.eval_interval - precision: str = scfg.precision - max_seq_len: int = scfg.max_seq_len + device_train_batch_size: int = train_cfg.device_train_batch_size + device_eval_batch_size: int = train_cfg.device_eval_batch_size + max_duration: Union[int, str] = train_cfg.max_duration + eval_interval: Union[int, str] = train_cfg.eval_interval + precision: str = train_cfg.precision + max_seq_len: int = train_cfg.max_seq_len # Optional parameters will be set to default values if not specified. 
default_run_name: str = os.environ.get('RUN_NAME', 'llm') - run_name: str = scfg.run_name if scfg.run_name else default_run_name - save_folder: Optional[str] = scfg.save_folder + run_name: str = train_cfg.run_name if train_cfg.run_name else default_run_name + save_folder: Optional[str] = train_cfg.save_folder is_state_dict_sharded: bool = (fsdp_config.get('state_dict_type', 'full') == 'sharded') if fsdp_config else False - save_latest_filename: str = scfg.save_latest_filename if scfg.save_latest_filename else 'latest-sharded-rank{rank}' if is_state_dict_sharded else 'latest-rank{rank}.pt' - save_overwrite: bool = scfg.save_overwrite - save_weights_only: bool = scfg.save_weights_only - save_filename: str = scfg.save_filename if scfg.save_filename else 'ep{epoch}-ba{batch}-rank{rank}.pt' - save_interval: Union[str, int] = scfg.save_interval - save_num_checkpoints_to_keep: int = scfg.save_num_checkpoints_to_keep - progress_bar = scfg.progress_bar - log_to_console: bool = scfg.log_to_console - python_log_level: Optional[str] = scfg.python_log_level - console_log_interval: Union[int, str] = scfg.console_log_interval - device_train_microbatch_size: Union[str, - int] = scfg.device_train_microbatch_size - eval_subset_num_batches: int = scfg.eval_subset_num_batches - eval_first: bool = scfg.eval_first - load_path: Optional[str] = scfg.load_path - load_weights_only: bool = scfg.load_weights_only - load_strict_model_weights: bool = scfg.load_strict_model_weights - load_ignore_keys: Optional[List[str]] = scfg.load_ignore_keys - save_ignore_keys: Optional[List[str]] = scfg.save_ignore_keys - compile_config: Optional[Dict[str, Any]] = scfg.compile_config - metadata: Optional[Dict[str, Any]] = scfg.metadata - should_log_config: bool = scfg.log_config + save_latest_filename: str = train_cfg.save_latest_filename if train_cfg.save_latest_filename else 'latest-sharded-rank{rank}' if is_state_dict_sharded else 'latest-rank{rank}.pt' + save_overwrite: bool = train_cfg.save_overwrite + save_weights_only: bool = train_cfg.save_weights_only + save_filename: str = train_cfg.save_filename if train_cfg.save_filename else 'ep{epoch}-ba{batch}-rank{rank}.pt' + save_interval: Union[str, int] = train_cfg.save_interval + save_num_checkpoints_to_keep: int = train_cfg.save_num_checkpoints_to_keep + progress_bar = train_cfg.progress_bar + log_to_console: bool = train_cfg.log_to_console + python_log_level: Optional[str] = train_cfg.python_log_level + console_log_interval: Union[int, str] = train_cfg.console_log_interval + device_train_microbatch_size: Union[ + str, int] = train_cfg.device_train_microbatch_size + eval_subset_num_batches: int = train_cfg.eval_subset_num_batches + eval_first: bool = train_cfg.eval_first + load_path: Optional[str] = train_cfg.load_path + load_weights_only: bool = train_cfg.load_weights_only + load_strict_model_weights: bool = train_cfg.load_strict_model_weights + load_ignore_keys: Optional[List[str]] = train_cfg.load_ignore_keys + save_ignore_keys: Optional[List[str]] = train_cfg.save_ignore_keys + compile_config: Optional[Dict[str, Any]] = train_cfg.compile_config + metadata: Optional[Dict[str, Any]] = train_cfg.metadata + should_log_config: bool = train_cfg.log_config # Enable autoresume from model checkpoints if possible autoresume_default: bool = False @@ -345,11 +356,11 @@ def main(cfg: DictConfig) -> Trainer: and not save_weights_only: autoresume_default = True - if not scfg.autoresume and autoresume_default: + if not train_cfg.autoresume and autoresume_default: log.info('As run_name, 
save_folder, and save_latest_filename are set, \ changing autoresume default to True...') - autoresume: bool = scfg.autoresume + autoresume: bool = train_cfg.autoresume # Warn if fsdp is enabled but user only has 1 GPU if dist.get_world_size() == 1 and fsdp_config is not None: @@ -408,7 +419,7 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None profiler_cfg: Optional[DictConfig] = DictConfig( - scfg.profiler) if scfg.profiler is not None else None + train_cfg.profiler) if train_cfg.profiler is not None else None if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', From f838b74a3dae280e0d873a670af8a3d444d63912 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:05:32 +0000 Subject: [PATCH 064/201] revert file --- tests/fixtures/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py index 9012380c68..616d66085c 100644 --- a/tests/fixtures/models.py +++ b/tests/fixtures/models.py @@ -17,8 +17,8 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase): model = build_composer_model( name=config.name, - tokenizer=tokenizer, cfg=config, + tokenizer=tokenizer, ) return model From 4c31b6f87db3c608a7e2262bd345d349f247991e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:06:12 +0000 Subject: [PATCH 065/201] revert file II --- tests/models/hf/test_fsdp_weight_tying.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 1b2e84daaf..6e7838e7ba 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -69,8 +69,8 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, original_model = build_composer_model( name=model_cfg['name'], - tokenizer=tokenizer, cfg=model_cfg, + tokenizer=tokenizer, ) underlying_model = maybe_get_underlying_model(original_model.model) From ee469187b49fa544cbe971c6261420b37e99abfd Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:07:22 +0000 Subject: [PATCH 066/201] revert file III: revert of the sith --- tests/models/hf/test_hf_config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index 9be4467a4f..e79756aba3 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -48,8 +48,8 @@ def test_remote_code_false_mpt( match='trust_remote_code must be set to True for MPT models.'): _ = build_composer_model( name=test_cfg.model.name, - tokenizer=tokenizer, cfg=test_cfg.model, + tokenizer=tokenizer, ) @@ -140,8 +140,8 @@ def test_hf_config_override( tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) model = build_composer_model( name=test_cfg.model.name, - tokenizer=tokenizer, cfg=test_cfg.model, + tokenizer=tokenizer, ) # save model @@ -164,8 +164,8 @@ def test_hf_config_override( hf_model = build_composer_model( name=hf_model_config.model.name, - tokenizer=tokenizer, cfg=hf_model_config.model, + tokenizer=tokenizer, ) for k, v in hf_model_config.model.config_overrides.items(): @@ -199,8 +199,8 @@ def test_rope_scaling_override(): model = build_composer_model( name=model_cfg.name, - tokenizer=None, # type: ignore cfg=model_cfg, + tokenizer=None, # type: ignore ) # This would error if the config isn't parsed into a proper dictionary model.get_metadata() @@ 
-226,8 +226,8 @@ def test_nested_override(): model = build_composer_model( name=model_cfg.name, - tokenizer=None, # type: ignore cfg=model_cfg, + tokenizer=None, # type: ignore ) # The value we changed From ff108c83b8726dfc30b9d8b96a58231d7efd23b0 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:08:17 +0000 Subject: [PATCH 067/201] peft revert file --- tests/models/hf/test_hf_peft_wrapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 8ae8e93c47..d8bea33dd4 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -85,8 +85,8 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, original_model = build_composer_model( name=model_cfg['name'], - tokenizer=tokenizer, cfg=model_cfg, + tokenizer=tokenizer, ) trainer = Trainer( From e6edad1dd475135b23ab7c2991506b5af1f0b4d9 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:09:44 +0000 Subject: [PATCH 068/201] revert v_mpt --- tests/models/hf/test_hf_v_mpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index 3729e66cbf..8d7f024565 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -60,8 +60,8 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, ) hf_model = build_composer_model( name=hf_cfg.model.name, - tokenizer=tokenizer, cfg=hf_cfg.model, + tokenizer=tokenizer, ).to(device) hf_n_params = sum(p.numel() for p in hf_model.parameters()) From a59299c0da1fa15a741e6ea2b8152138a377e3b0 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:12:12 +0000 Subject: [PATCH 069/201] last revert --- tests/models/hf/test_hf_v_mpt.py | 2 +- tests/models/layers/test_huggingface_flash.py | 2 +- tests/models/test_model.py | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index 8d7f024565..82b64ce80c 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -112,8 +112,8 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, print(model_cfg) model = build_composer_model( name=model_cfg.name, - tokenizer=tokenizer, cfg=model_cfg, + tokenizer=tokenizer, ).to(device) n_params = sum(p.numel() for p in model.parameters()) diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py index cdd9fde50a..1e8ec2383d 100644 --- a/tests/models/layers/test_huggingface_flash.py +++ b/tests/models/layers/test_huggingface_flash.py @@ -84,8 +84,8 @@ def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str): with error_context: model = build_composer_model( name=model_cfg['name'], - tokenizer=tokenizer, cfg=model_cfg, + tokenizer=tokenizer, ) # check that it actually used flash attention 2 diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 20a6b935b5..f74fe16f93 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -91,8 +91,8 @@ def _get_objs(request: pytest.FixtureRequest, model = build_composer_model( name=test_cfg.model.name, - tokenizer=tokenizer, cfg=test_cfg.model, + tokenizer=tokenizer, ) # Optimizer @@ -293,8 +293,8 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): model = build_composer_model( name=neo_cfg.model.name, - 
tokenizer=tokenizer, cfg=neo_cfg.model, + tokenizer=tokenizer, ).to(device) assert isinstance(model.tokenizer, @@ -342,8 +342,8 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): model = build_composer_model( name=t5_cfg.model.name, - tokenizer=tokenizer, cfg=t5_cfg.model, + tokenizer=tokenizer, ).to(device) assert isinstance(model.tokenizer, @@ -419,8 +419,8 @@ def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str, model_1 = build_composer_model( name=test_cfg.model.name, - tokenizer=tokenizer, cfg=test_cfg.model, + tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -489,8 +489,8 @@ def test_loss_fn(): model_1 = build_composer_model( name=test_cfg.model.name, - tokenizer=tokenizer, cfg=test_cfg.model, + tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -575,8 +575,8 @@ def test_loss_reduction(loss_fn_config: str): model_1 = build_composer_model( name=test_cfg.model.name, - tokenizer=tokenizer, cfg=test_cfg.model, + tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) From 702910ff58ecc45839f85a5c094f3b75bbc9d9d4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:22:11 +0000 Subject: [PATCH 070/201] remove redundant checks --- scripts/train/train.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 5315db7233..9538b48aa5 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -116,9 +116,7 @@ def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" loaders = [train_config.train_loader] if train_config.eval_loader is not None or train_config.eval_loaders is not None: - eval_loader = train_config.eval_loader - if isinstance(train_config.eval_loaders, list) or isinstance( - train_config.eval_loaders, ListConfig): + if isinstance(train_config.eval_loaders, list): for loader in (train_config.eval_loaders or []): # pyright if 'label' not in loader or loader['label'] is None: raise ValueError( @@ -126,8 +124,7 @@ def validate_config(train_config: TrainConfig): `label` attribute.') loaders.append(loader) else: - if eval_loader is not None: - loaders.append(eval_loader) + loaders.append(train_config.eval_loader) for loader in loaders: if loader['name'] == 'text': if train_config.model['name'] == 'hf_t5': From 902254ca74d9a66e9491c144d9a305813dc18b95 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:23:41 +0000 Subject: [PATCH 071/201] deprecate --- scripts/train/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 9538b48aa5..a052fe6929 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -219,8 +219,8 @@ def main(cfg: DictConfig) -> Trainer: for key in extraneous_keys: warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes.' - ) + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. This behavior is deprecated.', + DeprecationWarning) # TODO (milo): delete the below line once we deprecate variables at the top level. 
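A minimal sketch of the deprecation path introduced by this hunk. The `notes` key and the hand-rolled `known_keys` set below are invented for illustration; the real script derives the known keys from the TrainConfig fields:

    import warnings

    unstructured_config = {'seed': 17, 'notes': 'cosine schedule run'}
    known_keys = {'seed'}  # stand-in for the TrainConfig field names

    unstructured_config.setdefault('variables', {})
    for key in set(unstructured_config) - known_keys - {'variables'}:
        warnings.warn(
            f'Unused parameter {key} found in cfg. Interpreting {key} as a variable '
            'for logging purposes. This behavior is deprecated.', DeprecationWarning)
        unstructured_config['variables'][key] = unstructured_config.pop(key)

    print(unstructured_config)  # {'seed': 17, 'variables': {'notes': 'cosine schedule run'}}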
unstructured_config['variables'][key] = unstructured_config.pop(key) From 20a7703ddd517ad3897935ffcea9957866c9a2f5 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:31:37 +0000 Subject: [PATCH 072/201] make cleaner --- llmfoundry/utils/builders.py | 18 ++++-------------- llmfoundry/utils/config_utils.py | 7 +++++++ scripts/train/train.py | 8 ++++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a66c331703..7b93a4235d 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -29,6 +29,7 @@ from llmfoundry.eval.datasets.in_context_learning_evaluation import \ get_icl_task_dataloader from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper +from llmfoundry.utils.config_utils import to_str_dict from llmfoundry.utils.registry_utils import construct_from_registry from llmfoundry.utils.warnings import VersionedDeprecationWarning @@ -162,15 +163,6 @@ def build_icl_data_and_gauntlet( return icl_evaluators, logger_keys, eval_gauntlet_cb -def _is_string_keyed_dict(d: dict) -> bool: - return isinstance(d, dict) and all(isinstance(k, str) for k in d.keys()) - - -def _string_keyed_dict(d: dict) -> Dict[str, Any]: - assert all(isinstance(k, str) for k in d.keys()) - return {str(k): v for k, v in d.items()} - - def build_composer_model( name: str, cfg: Union[Dict[str, Any], DictConfig], @@ -194,11 +186,9 @@ def build_composer_model( init_context = contextlib.nullcontext() if isinstance(cfg, DictConfig): - container_cfg = om.to_container(cfg, resolve=True) - assert isinstance(container_cfg, dict) # pyright - model_cfg = _string_keyed_dict(container_cfg) # pyright - elif _is_string_keyed_dict(cfg): - model_cfg = _string_keyed_dict(cfg) # pyright + model_cfg = to_str_dict(cfg) # pyright + elif isinstance(cfg, dict): + model_cfg = {str(k): v for k, v in cfg.items()} else: raise ValueError( f'Invalid type for cfg: {type(cfg)}. 
Must be DictConfig or Dict.') diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index a4fd005c3a..7901d2b44d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -25,6 +25,13 @@ ] +def to_str_dict(cfg: DictConfig) -> Dict[str, Any]: + """Converts a DictConfig to a dictionary with string keys.""" + cfg_dict = om.to_container(cfg, resolve=True) + assert isinstance(cfg_dict, dict) + return {str(k): v for k, v in cfg_dict.items()} + + def pop_config(cfg: DictConfig, key: str, must_exist: bool = True, diff --git a/scripts/train/train.py b/scripts/train/train.py index a052fe6929..4c4d7f7f2b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -36,7 +36,7 @@ build_logger, build_optimizer, build_scheduler, build_tokenizer) from llmfoundry.utils.config_utils import (log_config, pop_config, - process_init_device, + process_init_device, to_str_dict, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -280,9 +280,9 @@ def main(cfg: DictConfig) -> Trainer: # Mandatory model training configs model_config: DictConfig = DictConfig(train_cfg.model) - tokenizer_config: Dict[str, Any] = {**train_cfg.tokenizer} - optimizer_config: Dict[str, Any] = {**train_cfg.optimizer} - scheduler_config: Dict[str, Any] = {**train_cfg.scheduler} + tokenizer_config: Dict[str, Any] = to_str_dict(train_cfg.tokenizer) + optimizer_config: Dict[str, Any] = to_str_dict(train_cfg.optimizer) + scheduler_config: Dict[str, Any] = to_str_dict(train_cfg.scheduler) train_loader_config: DictConfig = DictConfig(train_cfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs From b7db045503cd98986fd8d99654ed79af0b737894 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:34:08 +0000 Subject: [PATCH 073/201] pyright is bullying me again --- scripts/train/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/train/train.py b/scripts/train/train.py index 4c4d7f7f2b..7313716fb1 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -124,6 +124,7 @@ def validate_config(train_config: TrainConfig): `label` attribute.') loaders.append(loader) else: + assert train_config.eval_loader is not None # pyright being pyright loaders.append(train_config.eval_loader) for loader in loaders: if loader['name'] == 'text': From 40324c89c4dc6ac89e6fad76035fc1821e473776 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 18:37:25 +0000 Subject: [PATCH 074/201] further clean config_utils --- scripts/train/train.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 7313716fb1..30baaac260 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -36,7 +36,7 @@ build_logger, build_optimizer, build_scheduler, build_tokenizer) from llmfoundry.utils.config_utils import (log_config, pop_config, - process_init_device, to_str_dict, + process_init_device, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -194,7 +194,8 @@ def validate_config(train_config: TrainConfig): def main(cfg: DictConfig) -> Trainer: - unstructured_config = cfg + unstructured_config = om.to_container(cfg, resolve=True) + assert isinstance(unstructured_config, dict) # Resolve all interpolation variables as early as possible om.resolve(unstructured_config) @@ -226,7 +227,7 @@ def main(cfg: DictConfig) -> Trainer: unstructured_config['variables'][key] = unstructured_config.pop(key) # Create copy of config 
for logging - logged_cfg: DictConfig = copy.deepcopy(unstructured_config) + logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) # Get global and device batch size information from distributed/single node setting unstructured_config = update_batch_size_info(unstructured_config) @@ -281,9 +282,9 @@ def main(cfg: DictConfig) -> Trainer: # Mandatory model training configs model_config: DictConfig = DictConfig(train_cfg.model) - tokenizer_config: Dict[str, Any] = to_str_dict(train_cfg.tokenizer) - optimizer_config: Dict[str, Any] = to_str_dict(train_cfg.optimizer) - scheduler_config: Dict[str, Any] = to_str_dict(train_cfg.scheduler) + tokenizer_config: Dict[str, Any] = train_cfg.tokenizer + optimizer_config: Dict[str, Any] = train_cfg.optimizer + scheduler_config: Dict[str, Any] = train_cfg.scheduler train_loader_config: DictConfig = DictConfig(train_cfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs From e7a2bfc324e409088050100900c1223369069580 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 19:18:20 +0000 Subject: [PATCH 075/201] polish train --- llmfoundry/utils/builders.py | 2 +- llmfoundry/utils/config_utils.py | 18 +++++++------- scripts/train/train.py | 28 ++++++++++------------ tests/a_scripts/train/test_train.py | 5 ++-- tests/a_scripts/train/test_train_inputs.py | 3 ++- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 7b93a4235d..615b0b09d0 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -390,7 +390,7 @@ def build_optimizer(model: torch.nn.Module, name: str, optimizer_config[k] = om.to_container(v, resolve=True) params = _extract_param_groups(model, optimizer_config) - kwargs = optimizer_config + kwargs = {**optimizer_config} if 'params' in kwargs: raise ValueError( diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 7901d2b44d..35ab679b4a 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -90,19 +90,19 @@ def calculate_batch_size_info( # Coming soon: this conversion math will be done inside Composer Trainer -def update_batch_size_info(cfg: DictConfig) -> DictConfig: +def update_batch_size_info(cfg: Dict[str, Any]) -> DictConfig: device_train_batch_size, device_train_microbatch_size, device_train_grad_accum = calculate_batch_size_info( - cfg.global_train_batch_size, cfg.device_train_microbatch_size) - cfg.n_gpus = dist.get_world_size() - cfg.device_train_batch_size = device_train_batch_size - cfg.device_train_microbatch_size = device_train_microbatch_size - cfg.device_train_grad_accum = device_train_grad_accum + cfg['global_train_batch_size'], cfg['device_train_microbatch_size']) + cfg['n_gpus'] = dist.get_world_size() + cfg['device_train_batch_size'] = device_train_batch_size + cfg['device_train_microbatch_size'] = device_train_microbatch_size + cfg['device_train_grad_accum'] = device_train_grad_accum # Safely set `device_eval_batch_size` if not provided by user if 'device_eval_batch_size' not in cfg: - if cfg.device_train_microbatch_size == 'auto': - cfg.device_eval_batch_size = 1 # TODO debug auto eval microbatching + if cfg['device_train_microbatch_size'] == 'auto': + cfg['device_eval_batch_size'] = 1 # TODO debug auto eval microbatching else: - cfg.device_eval_batch_size = cfg.device_train_microbatch_size + cfg['device_eval_batch_size'] = cfg.device_train_microbatch_size return cfg diff --git a/scripts/train/train.py b/scripts/train/train.py index 
30baaac260..576ac383d1 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -115,17 +115,16 @@ class TrainConfig: def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" loaders = [train_config.train_loader] - if train_config.eval_loader is not None or train_config.eval_loaders is not None: - if isinstance(train_config.eval_loaders, list): - for loader in (train_config.eval_loaders or []): # pyright - if 'label' not in loader or loader['label'] is None: - raise ValueError( - 'When specifying multiple evaluation datasets, each one must include the \ + if train_config.eval_loaders is not None: + for loader in (train_config.eval_loaders or []): # pyright + if 'label' not in loader or loader['label'] is None: + raise ValueError( + 'When specifying multiple evaluation datasets, each one must include the \ `label` attribute.') - loaders.append(loader) - else: - assert train_config.eval_loader is not None # pyright being pyright - loaders.append(train_config.eval_loader) + loaders.append(loader) + if train_config.eval_loader is not None: + assert train_config.eval_loaders is None, 'Only one of `eval_loader` or `eval_loaders` should be provided.' + loaders.append(train_config.eval_loader) for loader in loaders: if loader['name'] == 'text': if train_config.model['name'] == 'hf_t5': @@ -194,14 +193,13 @@ def validate_config(train_config: TrainConfig): def main(cfg: DictConfig) -> Trainer: + # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) assert isinstance(unstructured_config, dict) - # Resolve all interpolation variables as early as possible - om.resolve(unstructured_config) # Structured config does not support unions of containers, so separate single and plural containers if (loader := unstructured_config.get('eval_loader', None)) is not None: - if isinstance(loader, ListConfig): + if isinstance(loader, list) or isinstance(loader, ListConfig): unstructured_config['eval_loaders'] = list( unstructured_config.pop('eval_loader')) if (tasks := unstructured_config.get('icl_tasks', None)) is not None: @@ -441,7 +439,7 @@ def main(cfg: DictConfig) -> Trainer: # Callbacks callbacks: List[Callback] = [ - build_callback(str(name), callback_cfg, om.to_container(logged_cfg)) + build_callback(str(name), callback_cfg, logged_cfg) for name, callback_cfg in callback_configs.items() ] if callback_configs else [] @@ -588,7 +586,7 @@ def main(cfg: DictConfig) -> Trainer: if should_log_config: log.info('Logging config') - log_config(logged_cfg) + log_config(DictConfig(logged_cfg)) torch.cuda.empty_cache() gc.collect() diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 7899eeda0a..9efc04755d 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -11,7 +11,7 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om -from llmfoundry.utils.config_utils import update_batch_size_info +from llmfoundry.utils.config_utils import to_str_dict, update_batch_size_info from scripts.train.train import TrainConfig, main, validate_config # noqa: E402 from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, gpt_tiny_cfg) @@ -158,7 +158,8 @@ def test_validate_config(): test_cfg: DictConfig = om.load(f) # type: ignore test_cfg.model.ffn_config.moe_world_size = 4 test_cfg.fsdp_config.use_orig_params = False - test_cfg = update_batch_size_info(test_cfg) + test_cfg_dict = 
to_str_dict(test_cfg) + test_cfg_dict = update_batch_size_info(test_cfg_dict) with pytest.raises( ValueError, match= diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index 24cad29a6b..c2dd5b3d27 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -79,7 +79,8 @@ def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: for param in mandatory_params: orig_param = cfg.pop(param) with pytest.raises( - (omegaconf.errors.MissingMandatoryValue, NameError)): + (omegaconf.errors.MissingMandatoryValue, NameError, + omegaconf.errors.InterpolationKeyError)): main(cfg) cfg[param] = orig_param From 4c403fd2e56402628d2974f9e795fabdf3f9c355 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 19:45:01 +0000 Subject: [PATCH 076/201] polish train and eval --- scripts/eval/eval.py | 96 ++++++++++++++++++++++-------------------- scripts/train/train.py | 8 ++-- 2 files changed, 55 insertions(+), 49 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 7a3a56b36f..4c50aefb80 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -173,8 +173,11 @@ class EvalConfig: eval_gauntlet: Optional[Dict[str, Any]] = None eval_gauntlet_str: Optional[str] = None fsdp_config: Optional[Dict[str, Any]] = None - icl_tasks: Optional[List[str]] = MISSING + + # one of icl_tasks or icl_tasks_str must be specified + icl_tasks: Optional[List[Dict[str, Any]]] = None icl_tasks_str: Optional[str] = None + max_seq_len: int = MISSING device_eval_batch_size: int = MISSING precision: str = 'amp_bf16' @@ -200,47 +203,53 @@ class EvalConfig: def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: - om.resolve(cfg) + unstructured_config = om.to_container(cfg, resolve=True) # flatten union types before creating structured config: - if 'eval_gauntlet' in cfg: - if isinstance(cfg.eval_gauntlet, str): - cfg.eval_gauntlet_str = cfg.pop('eval_gauntlet') - if 'eval_loader' in cfg: - if isinstance(cfg.eval_loader, ListConfig): - cfg.eval_loaders = cfg.pop('eval_loader') - if 'icl_tasks' in cfg: - if isinstance(cfg.icl_tasks, str): - cfg.icl_tasks_str = cfg.pop('icl_tasks') - - arg_config_keys = set(cfg.keys()) + if 'eval_gauntlet' in unstructured_config: + if isinstance(unstructured_config['eval_gauntlet'], str): + unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( + 'eval_gauntlet') + if (loader := unstructured_config.get('eval_loader', None)) is not None: + if isinstance(loader, list): + unstructured_config['eval_loaders'] = unstructured_config.pop( + 'eval_loader') + if 'icl_tasks' in unstructured_config: + if isinstance(unstructured_config['icl_tasks'], str): + unstructured_config['icl_tasks_str'] = unstructured_config.pop( + 'icl_tasks') + else: + raise ValueError('icl_tasks must be specified in the config') + + arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, EVAL_CONFIG_KEYS) - if 'variables' not in cfg: - cfg['variables'] = {} + if 'variables' not in unstructured_config: + unstructured_config['variables'] = {} for key in extraneous_keys: warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes.' - ) + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. 
Top-level variables are deprecated and will not be supported in future releases.', + DeprecationWarning) # TODO (milo): delete the below line once we deprecate variables at the top level. - cfg['variables'][key] = cfg.pop(key) + unstructured_config['variables'][key] = unstructured_config.pop(key) - scfg: EvalConfig = om.structured(EvalConfig(**cfg)) + eval_config: EvalConfig = om.structured(EvalConfig(**unstructured_config)) # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(cfg) + logged_cfg: DictConfig = copy.deepcopy(unstructured_config) # Run user provided code if specified - code_paths = scfg.code_paths + code_paths = eval_config.code_paths for code_path in (code_paths or []): import_file(code_path) - model_configs = ListConfig(scfg.models) + model_configs = ListConfig(eval_config.models) eval_gauntlet_config = DictConfig( - scfg.eval_gauntlet) if scfg.eval_gauntlet else scfg.eval_gauntlet_str + eval_config.eval_gauntlet + ) if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str fsdp_config = om.to_container( - scfg.fsdp_config) if scfg.fsdp_config else None + eval_config.fsdp_config) if eval_config.fsdp_config else None assert isinstance( fsdp_config, Dict @@ -248,35 +257,32 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: # Mandatory Evaluation Parameters icl_tasks: Union[ListConfig, str, None] = ListConfig( - scfg.icl_tasks) if scfg.icl_tasks else scfg.icl_tasks_str + eval_config.icl_tasks + ) if eval_config.icl_tasks else eval_config.icl_tasks_str assert icl_tasks is not None, 'icl_tasks must be specified in the config' - max_seq_len = scfg.max_seq_len - device_eval_batch_size = scfg.device_eval_batch_size - precision = scfg.precision - python_log_level: Optional[str] = scfg.python_log_level + max_seq_len = eval_config.max_seq_len + device_eval_batch_size = eval_config.device_eval_batch_size + precision = eval_config.precision + python_log_level: Optional[str] = eval_config.python_log_level # Optional Evaluation Parameters with default values eval_loader_config = DictConfig( - scfg.eval_loader) if scfg.eval_loader else ListConfig( - scfg.eval_loaders) if scfg.eval_loaders else None - seed = scfg.seed - dist_timeout = scfg.dist_timeout + eval_config.eval_loader) if eval_config.eval_loader else ListConfig( + eval_config.eval_loaders) if eval_config.eval_loaders else None + seed = eval_config.seed + dist_timeout = eval_config.dist_timeout default_run_name: str = os.environ.get('RUN_NAME', 'llm') - run_name = scfg.run_name if scfg.run_name else default_run_name - loggers_cfg = scfg.loggers - eval_subset_num_batches = scfg.eval_subset_num_batches - icl_subset_num_batches = scfg.icl_subset_num_batches - metadata = scfg.metadata - should_log_config = scfg.log_config - - callback_configs = om.to_container(scfg.callbacks) if scfg.callbacks else [] + run_name = eval_config.run_name if eval_config.run_name else default_run_name + loggers_cfg = eval_config.loggers + eval_subset_num_batches = eval_config.eval_subset_num_batches + icl_subset_num_batches = eval_config.icl_subset_num_batches + metadata = eval_config.metadata + should_log_config = eval_config.log_config - if callback_configs is not None: - assert isinstance(callback_configs, dict) - callback_configs = {str(k): v for k, v in callback_configs.items()} + callback_configs = eval_config.callbacks # Warn for unused parameters - for key in cfg: + for key in unstructured_config: warnings.warn( f'Unused parameter {key} found in cfg. 
Please check your yaml to ensure this parameter is necessary.' ) diff --git a/scripts/train/train.py b/scripts/train/train.py index 576ac383d1..45ae255850 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -199,9 +199,9 @@ def main(cfg: DictConfig) -> Trainer: # Structured config does not support unions of containers, so separate single and plural containers if (loader := unstructured_config.get('eval_loader', None)) is not None: - if isinstance(loader, list) or isinstance(loader, ListConfig): - unstructured_config['eval_loaders'] = list( - unstructured_config.pop('eval_loader')) + if isinstance(loader, list): + unstructured_config['eval_loaders'] = unstructured_config.pop( + 'eval_loader') if (tasks := unstructured_config.get('icl_tasks', None)) is not None: if isinstance(tasks, str): unstructured_config['icl_tasks_str'] = unstructured_config.pop( @@ -219,7 +219,7 @@ def main(cfg: DictConfig) -> Trainer: for key in extraneous_keys: warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. This behavior is deprecated.', + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. Top-level variables are deprecated and will not be supported in future releases.', DeprecationWarning) # TODO (milo): delete the below line once we deprecate variables at the top level. unstructured_config['variables'][key] = unstructured_config.pop(key) From f7166420bdd4e8c75db202bd0c2b860297ba994e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 19:49:29 +0000 Subject: [PATCH 077/201] fix dist --- llmfoundry/utils/config_utils.py | 4 ++-- scripts/eval/eval.py | 4 ++++ scripts/train/train.py | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 35ab679b4a..026169df11 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -90,7 +90,7 @@ def calculate_batch_size_info( # Coming soon: this conversion math will be done inside Composer Trainer -def update_batch_size_info(cfg: Dict[str, Any]) -> DictConfig: +def update_batch_size_info(cfg: Dict[str, Any]) -> Dict[str, Any]: device_train_batch_size, device_train_microbatch_size, device_train_grad_accum = calculate_batch_size_info( cfg['global_train_batch_size'], cfg['device_train_microbatch_size']) cfg['n_gpus'] = dist.get_world_size() @@ -102,7 +102,7 @@ def update_batch_size_info(cfg: Dict[str, Any]) -> DictConfig: if cfg['device_train_microbatch_size'] == 'auto': cfg['device_eval_batch_size'] = 1 # TODO debug auto eval microbatching else: - cfg['device_eval_batch_size'] = cfg.device_train_microbatch_size + cfg['device_eval_batch_size'] = cfg['device_train_microbatch_size'] return cfg diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 4c50aefb80..1e5e382a53 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -203,7 +203,11 @@ class EvalConfig: def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: + # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) + assert isinstance(unstructured_config, dict) + assert all(isinstance(k, str) for k in unstructured_config.keys()) + unstructured_config = {str(k): v for k, v in unstructured_config.items()} # flatten union types before creating structured config: if 'eval_gauntlet' 
in unstructured_config: diff --git a/scripts/train/train.py b/scripts/train/train.py index 45ae255850..812f8a7921 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -196,6 +196,8 @@ def main(cfg: DictConfig) -> Trainer: # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) assert isinstance(unstructured_config, dict) + assert all(isinstance(k, str) for k in unstructured_config.keys()) + unstructured_config = {str(k): v for k, v in unstructured_config.items()} # Structured config does not support unions of containers, so separate single and plural containers if (loader := unstructured_config.get('eval_loader', None)) is not None: From 0baae32bc6c060bb9dfd3db8df0d4300343643db Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 19:59:34 +0000 Subject: [PATCH 078/201] fix style --- scripts/eval/eval.py | 2 +- scripts/train/train.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 1e5e382a53..914799053b 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -240,7 +240,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: eval_config: EvalConfig = om.structured(EvalConfig(**unstructured_config)) # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(unstructured_config) + logged_cfg: DictConfig = copy.deepcopy(DictConfig(unstructured_config)) # Run user provided code if specified code_paths = eval_config.code_paths diff --git a/scripts/train/train.py b/scripts/train/train.py index 812f8a7921..0fe4a1ad16 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -227,7 +227,7 @@ def main(cfg: DictConfig) -> Trainer: unstructured_config['variables'][key] = unstructured_config.pop(key) # Create copy of config for logging - logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) + logged_cfg: DictConfig = copy.deepcopy(DictConfig(unstructured_config)) # Get global and device batch size information from distributed/single node setting unstructured_config = update_batch_size_info(unstructured_config) @@ -588,7 +588,7 @@ def main(cfg: DictConfig) -> Trainer: if should_log_config: log.info('Logging config') - log_config(DictConfig(logged_cfg)) + log_config(logged_cfg) torch.cuda.empty_cache() gc.collect() From e4ee9fc20845315dbb5d779292a98e7f918b5633 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 20:44:09 +0000 Subject: [PATCH 079/201] organize eval and train --- scripts/eval/eval.py | 110 +++++++++++----------- scripts/train/train.py | 204 +++++++++++++++++++++-------------------- 2 files changed, 156 insertions(+), 158 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 914799053b..b8450c4cd8 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -167,42 +167,52 @@ def evaluate_model( @dataclass class EvalConfig: + # Eval Config required parameters: models: List[Dict[str, Any]] = MISSING + max_seq_len: int = MISSING + device_eval_batch_size: int = MISSING + # Eval Config optional parameters: code_paths: Optional[List[str]] = None + + # eval hyperparameters eval_gauntlet: Optional[Dict[str, Any]] = None eval_gauntlet_str: Optional[str] = None - fsdp_config: Optional[Dict[str, Any]] = None - + eval_loader: Optional[Dict[str, Any]] = None + eval_loaders: Optional[List[Dict[str, Any]]] = None + eval_subset_num_batches: int = -1 + icl_subset_num_batches: Optional[int] = None # one of icl_tasks or icl_tasks_str must be specified 
icl_tasks: Optional[List[Dict[str, Any]]] = None icl_tasks_str: Optional[str] = None - max_seq_len: int = MISSING - device_eval_batch_size: int = MISSING - precision: str = 'amp_bf16' + # logging parameters python_log_level: Optional[str] = None - eval_loader: Optional[Dict[str, Any]] = None - eval_loaders: Optional[List[Dict[str, Any]]] = None + loggers: Optional[Dict[str, Any]] = None + log_config: bool = True + # model/run parameters seed: int = 17 - dist_timeout: Union[float, int] = 600.0 + precision: str = 'amp_bf16' run_name: Optional[str] = None - loggers: Optional[Dict[str, Any]] = None - eval_subset_num_batches: int = -1 - icl_subset_num_batches: Optional[int] = None - metadata: Optional[Dict[str, str]] = None - log_config: bool = True model_name_or_path: Optional[str] = None + metadata: Optional[Dict[str, str]] = None + + # distributed parameters + dist_timeout: Union[float, int] = 600.0 + fsdp_config: Optional[Dict[str, Any]] = None + + # callback parameters callbacks: Optional[Dict[str, Any]] = None + # variables to ignore variables: Optional[Dict[str, Any]] = None # variables to ignore EVAL_CONFIG_KEYS = set(field.name for field in fields(EvalConfig)) -def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: +def _make_eval_and_log_config(cfg: DictConfig) -> Tuple[DictConfig, EvalConfig]: # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) assert isinstance(unstructured_config, dict) @@ -242,9 +252,14 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: # Create copy of config for logging logged_cfg: DictConfig = copy.deepcopy(DictConfig(unstructured_config)) + return logged_cfg, eval_config + + +def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: + logged_cfg, eval_config = _make_eval_and_log_config(cfg) + # Run user provided code if specified - code_paths = eval_config.code_paths - for code_path in (code_paths or []): + for code_path in (eval_config.code_paths or []): import_file(code_path) model_configs = ListConfig(eval_config.models) @@ -252,65 +267,46 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: eval_config.eval_gauntlet ) if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str - fsdp_config = om.to_container( - eval_config.fsdp_config) if eval_config.fsdp_config else None assert isinstance( - fsdp_config, Dict + eval_config.fsdp_config, Dict ) or eval_config.fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(eval_config.fsdp_config)}' # Mandatory Evaluation Parameters icl_tasks: Union[ListConfig, str, None] = ListConfig( eval_config.icl_tasks ) if eval_config.icl_tasks else eval_config.icl_tasks_str assert icl_tasks is not None, 'icl_tasks must be specified in the config' - max_seq_len = eval_config.max_seq_len - device_eval_batch_size = eval_config.device_eval_batch_size - precision = eval_config.precision - python_log_level: Optional[str] = eval_config.python_log_level # Optional Evaluation Parameters with default values eval_loader_config = DictConfig( eval_config.eval_loader) if eval_config.eval_loader else ListConfig( eval_config.eval_loaders) if eval_config.eval_loaders else None - seed = eval_config.seed - dist_timeout = eval_config.dist_timeout default_run_name: str = os.environ.get('RUN_NAME', 'llm') run_name = eval_config.run_name if eval_config.run_name else default_run_name - loggers_cfg = eval_config.loggers -
eval_subset_num_batches = eval_config.eval_subset_num_batches - icl_subset_num_batches = eval_config.icl_subset_num_batches - metadata = eval_config.metadata - should_log_config = eval_config.log_config - - callback_configs = eval_config.callbacks - - # Warn for unused parameters - for key in unstructured_config: - warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary.' - ) - reproducibility.seed_all(seed) - dist.initialize_dist(get_device(None), timeout=dist_timeout) + reproducibility.seed_all(eval_config.seed) + dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) - if python_log_level is not None: + if eval_config.python_log_level is not None: logging.basicConfig( # Example of format string # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here format= f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' ) - logging.getLogger('llmfoundry').setLevel(python_log_level.upper()) + logging.getLogger('llmfoundry').setLevel( + eval_config.python_log_level.upper()) + # default argument values for evaluate_model eval_gauntlet_df = None models_df = None composite_scores = None trainers = [] + # build loggers loggers: List[LoggerDestination] = [ build_logger(name, logger_cfg) - for name, logger_cfg in (loggers_cfg or {}).items() + for name, logger_cfg in (eval_config.loggers or {}).items() ] mosaicml_logger = find_mosaicml_logger(loggers) @@ -328,25 +324,25 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for model_cfg in model_configs: (trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) = evaluate_model( - dist_timeout=dist_timeout, + dist_timeout=eval_config.dist_timeout, run_name=run_name, - seed=seed, + seed=eval_config.seed, icl_tasks=icl_tasks, - max_seq_len=max_seq_len, - device_eval_batch_size=device_eval_batch_size, + max_seq_len=eval_config.max_seq_len, + device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - fsdp_config=fsdp_config, + fsdp_config=eval_config.fsdp_config, loggers=loggers, - python_log_level=python_log_level, - precision=precision, + python_log_level=eval_config.python_log_level, + precision=eval_config.precision, eval_gauntlet_df=eval_gauntlet_df, - callback_configs=callback_configs, - eval_subset_num_batches=eval_subset_num_batches, - icl_subset_num_batches=icl_subset_num_batches, - metadata=metadata, + callback_configs=eval_config.callbacks, + eval_subset_num_batches=eval_config.eval_subset_num_batches, + icl_subset_num_batches=eval_config.icl_subset_num_batches, + metadata=eval_config.metadata, logged_config=logged_cfg, - should_log_config=should_log_config, + should_log_config=eval_config.should_log_config, **model_cfg) trainers.append(trainer) diff --git a/scripts/train/train.py b/scripts/train/train.py index 0fe4a1ad16..f33d4ed91a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -8,7 +8,7 @@ import time import warnings from dataclasses import dataclass, fields -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Tuple, Union import torch from composer import Trainer @@ -45,6 +45,9 @@ @dataclass class TrainConfig: + """Dataclass for training configuration.""" + + # Mandatory model training parameters model: Dict[str, Any] = MISSING tokenizer: Dict[str, Any] = MISSING optimizer: Dict[str, Any] = MISSING @@ -58,24 +61,43 @@ class 
TrainConfig: max_seq_len: int = MISSING seed: int = MISSING + # Optional model training parameters + + # Code paths to import code_paths: Optional[List[str]] = None + + # Cuda allocation configuration max_split_size_mb: Optional[int] = None expandable_segments: bool = False cuda_load_lazy: bool = False + + # distributed training parameters dist_timeout: Union[int, float] = 600.0 + fsdp_config: Optional[Dict[str, Any]] = None + + # evaluation parameters eval_loader: Optional[Dict[str, Any]] = None eval_loaders: Optional[List[Dict[str, Any]]] = None icl_tasks: Optional[List[Dict[str, Any]]] = None icl_tasks_str: Optional[str] = None - fsdp_config: Optional[Dict[str, Any]] = None eval_gauntlet: Optional[Dict[str, Any]] = None eval_gauntlet_str: Optional[str] = None icl_subset_num_batches: Optional[int] = None icl_seq_len: Optional[int] = None + + # logging loggers: Optional[Dict[str, Any]] = None + progress_bar: bool = False + log_to_console: bool = True + python_log_level: Optional[str] = 'debug' + console_log_interval: Union[int, str] = '1ba' + log_config: bool = True + + # callbacks callbacks: Optional[Dict[str, Any]] = None algorithms: Optional[Dict[str, Any]] = None - run_name: Optional[str] = None + + # checkpoints save_folder: Optional[str] = None save_latest_filename: Optional[str] = None save_overwrite: bool = False @@ -83,29 +105,39 @@ class TrainConfig: save_filename: Optional[str] = None save_interval: Union[str, int] = '1000ba' save_num_checkpoints_to_keep: int = -1 - progress_bar: bool = False - log_to_console: bool = True - python_log_level: Optional[str] = 'debug' - console_log_interval: Union[int, str] = '1ba' - device_train_microbatch_size: Union[str, int] = 'auto' - eval_subset_num_batches: int = -1 - eval_first: bool = False load_path: Optional[str] = None load_weights_only: bool = False load_strict_model_weights: bool = True load_ignore_keys: Optional[List[str]] = None + save_ignore_keys: Optional[List[str]] = None + + # dataloader + device_train_microbatch_size: Union[str, int] = 'auto' + data_local: Optional[str] = None + data_remote: Optional[str] = None + + # eval dataloader + eval_subset_num_batches: int = -1 + eval_first: bool = False compile_config: Optional[Dict[str, Any]] = None + + # metadata metadata: Optional[Dict[str, Any]] = None - log_config: bool = True + run_name: Optional[str] = None + + # resumption autoresume: bool = False - data_local: Optional[str] = None - data_remote: Optional[str] = None + + # gradient accumulation + device_train_grad_accum: Optional[int] = None + + # profiling + profiler: Optional[Dict[str, Any]] = None + + # ignore keys global_seed: Optional[int] = None global_train_batch_size: Optional[int] = None n_gpus: Optional[int] = None - device_train_grad_accum: Optional[int] = None - profiler: Optional[Dict[str, Any]] = None - save_ignore_keys: Optional[List[str]] = None variables: Optional[Dict[str, Any]] = None @@ -192,7 +224,8 @@ def validate_config(train_config: TrainConfig): ) -def main(cfg: DictConfig) -> Trainer: +def _make_train_and_log_config( + cfg: DictConfig) -> Tuple[DictConfig, TrainConfig]: # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) assert isinstance(unstructured_config, dict) @@ -236,6 +269,11 @@ def main(cfg: DictConfig) -> Trainer: train_cfg: TrainConfig = om.structured( TrainConfig(**unstructured_config) ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) + return logged_cfg, train_cfg + + +def main(cfg: 
DictConfig) -> Trainer: + logged_cfg, train_cfg = _make_train_and_log_config(cfg) code_paths = train_cfg.code_paths if train_cfg.code_paths else [] # Import any user provided code @@ -282,9 +320,6 @@ def main(cfg: DictConfig) -> Trainer: # Mandatory model training configs model_config: DictConfig = DictConfig(train_cfg.model) - tokenizer_config: Dict[str, Any] = train_cfg.tokenizer - optimizer_config: Dict[str, Any] = train_cfg.optimizer - scheduler_config: Dict[str, Any] = train_cfg.scheduler train_loader_config: DictConfig = DictConfig(train_cfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs @@ -305,54 +340,21 @@ def main(cfg: DictConfig) -> Trainer: ) if train_cfg.eval_gauntlet is not None else train_cfg.eval_gauntlet_str icl_subset_num_batches: Optional[int] = train_cfg.icl_subset_num_batches icl_seq_len: Optional[int] = train_cfg.icl_seq_len - # Optional logging, evaluation and callback configs - logger_configs: Optional[Dict[str, Any]] = train_cfg.loggers - callback_configs: Optional[Dict[str, Any]] = train_cfg.callbacks - algorithm_configs: Optional[Dict[str, Any]] = train_cfg.algorithms - - # Mandatory hyperparameters for training - device_train_batch_size: int = train_cfg.device_train_batch_size - device_eval_batch_size: int = train_cfg.device_eval_batch_size - max_duration: Union[int, str] = train_cfg.max_duration - eval_interval: Union[int, str] = train_cfg.eval_interval - precision: str = train_cfg.precision - max_seq_len: int = train_cfg.max_seq_len # Optional parameters will be set to default values if not specified. default_run_name: str = os.environ.get('RUN_NAME', 'llm') run_name: str = train_cfg.run_name if train_cfg.run_name else default_run_name - save_folder: Optional[str] = train_cfg.save_folder is_state_dict_sharded: bool = (fsdp_config.get('state_dict_type', 'full') == 'sharded') if fsdp_config else False save_latest_filename: str = train_cfg.save_latest_filename if train_cfg.save_latest_filename else 'latest-sharded-rank{rank}' if is_state_dict_sharded else 'latest-rank{rank}.pt' - save_overwrite: bool = train_cfg.save_overwrite - save_weights_only: bool = train_cfg.save_weights_only save_filename: str = train_cfg.save_filename if train_cfg.save_filename else 'ep{epoch}-ba{batch}-rank{rank}.pt' - save_interval: Union[str, int] = train_cfg.save_interval - save_num_checkpoints_to_keep: int = train_cfg.save_num_checkpoints_to_keep - progress_bar = train_cfg.progress_bar - log_to_console: bool = train_cfg.log_to_console - python_log_level: Optional[str] = train_cfg.python_log_level - console_log_interval: Union[int, str] = train_cfg.console_log_interval - device_train_microbatch_size: Union[ - str, int] = train_cfg.device_train_microbatch_size - eval_subset_num_batches: int = train_cfg.eval_subset_num_batches - eval_first: bool = train_cfg.eval_first - load_path: Optional[str] = train_cfg.load_path - load_weights_only: bool = train_cfg.load_weights_only - load_strict_model_weights: bool = train_cfg.load_strict_model_weights - load_ignore_keys: Optional[List[str]] = train_cfg.load_ignore_keys - save_ignore_keys: Optional[List[str]] = train_cfg.save_ignore_keys - compile_config: Optional[Dict[str, Any]] = train_cfg.compile_config - metadata: Optional[Dict[str, Any]] = train_cfg.metadata - should_log_config: bool = train_cfg.log_config # Enable autoresume from model checkpoints if possible autoresume_default: bool = False if logged_cfg.get('run_name', None) is not None \ - and save_folder is not None \ - and not save_overwrite \ - and not 
save_weights_only: + and train_cfg.save_folder is not None \ + and not train_cfg.save_overwrite \ + and not train_cfg.save_weights_only: autoresume_default = True if not train_cfg.autoresume and autoresume_default: @@ -368,7 +370,7 @@ def main(cfg: DictConfig) -> Trainer: fsdp_config = None # set logging level - if python_log_level is not None: + if train_cfg.python_log_level is not None: logging.basicConfig( # Example of format string # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here @@ -376,9 +378,9 @@ def main(cfg: DictConfig) -> Trainer: f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' ) logging.getLogger('llmfoundry').setLevel( - python_log_level.upper()) # Foundry module + train_cfg.python_log_level.upper()) # Foundry module logging.getLogger(__name__).setLevel( - python_log_level.upper()) # Train script + train_cfg.python_log_level.upper()) # Train script # Initialize context init_context = process_init_device(model_config, fsdp_config) @@ -386,19 +388,19 @@ def main(cfg: DictConfig) -> Trainer: # Build tokenizer log.info('Building tokenizer...') - tokenizer_name = tokenizer_config['name'] - tokenizer_kwargs = tokenizer_config.get('kwargs', {}) + tokenizer_name = train_cfg.tokenizer['name'] + tokenizer_kwargs = train_cfg.tokenizer.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) # Scheduler - scheduler_name: str = scheduler_config.pop('name') - scheduler = build_scheduler(scheduler_name, scheduler_config) + scheduler_name: str = train_cfg.scheduler.pop('name') + scheduler = build_scheduler(scheduler_name, train_cfg.scheduler) # Loggers loggers = [ build_logger(str(name), logger_cfg) - for name, logger_cfg in logger_configs.items() - ] if logger_configs else [] + for name, logger_cfg in train_cfg.loggers.items() + ] if train_cfg.loggers else [] mosaicml_logger = find_mosaicml_logger(loggers) if mosaicml_logger is None: @@ -407,12 +409,12 @@ def main(cfg: DictConfig) -> Trainer: # mosaicml_logger will be None if run isn't on MosaicML platform loggers.append(mosaicml_logger) - if metadata is not None: + if train_cfg.metadata is not None: # Flatten the metadata for logging logged_cfg.pop('metadata', None) - logged_cfg.update(metadata, merge=True) + logged_cfg.update(train_cfg.metadata, merge=True) if mosaicml_logger is not None: - mosaicml_logger.log_metrics(metadata) + mosaicml_logger.log_metrics(train_cfg.metadata) mosaicml_logger._flush_metadata(force_flush=True) # Profiling @@ -442,16 +444,16 @@ def main(cfg: DictConfig) -> Trainer: # Callbacks callbacks: List[Callback] = [ build_callback(str(name), callback_cfg, logged_cfg) - for name, callback_cfg in callback_configs.items() - ] if callback_configs else [] + for name, callback_cfg in train_cfg.callbacks.items() + ] if train_cfg.callbacks else [] use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) # Algorithms algorithms = [ build_algorithm(str(name), algorithm_cfg) - for name, algorithm_cfg in algorithm_configs.items() - ] if algorithm_configs else None + for name, algorithm_cfg in train_cfg.algorithms.items() + ] if train_cfg.algorithms else None # Dataloaders log.info('Building train loader...') @@ -459,7 +461,7 @@ def main(cfg: DictConfig) -> Trainer: train_loader = build_dataloader( train_loader_config, tokenizer, - device_train_batch_size, + train_cfg.device_train_batch_size, ) except Exception as e: if mosaicml_logger is not None: @@ -480,13 +482,13 @@ def main(cfg: DictConfig) -> Trainer: else: 
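The attribute accesses above (train_cfg.python_log_level, train_cfg.tokenizer['name'], and so on) all read from the structured config built by _make_train_and_log_config. A toy sketch of that pattern, assuming only OmegaConf and using invented field names rather than the full TrainConfig:

    from dataclasses import dataclass
    from typing import Any, Dict, Optional

    from omegaconf import MISSING, OmegaConf


    @dataclass
    class ToyTrainConfig:
        model: Dict[str, Any] = MISSING
        max_duration: str = MISSING
        python_log_level: Optional[str] = 'debug'


    raw = {'model': {'name': 'mpt_causal_lm'}, 'max_duration': '10ba'}
    train_cfg = OmegaConf.structured(ToyTrainConfig(**raw))

    print(train_cfg.max_duration)      # 10ba
    print(train_cfg.python_log_level)  # debug (the default survives)

Leaving a MISSING field unset and then reading it raises omegaconf.errors.MissingMandatoryValue, which is what the mandatory/optional split in the dataclass relies on (and what the train-input tests expect).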
log.info('Building eval loader...') - eval_icl_seq_len: int = icl_seq_len if icl_seq_len else max_seq_len + eval_icl_seq_len: int = icl_seq_len if icl_seq_len else train_cfg.max_seq_len evaluators, _, eval_gauntlet_callback = build_evaluators( eval_loader_config, icl_tasks_config, eval_gauntlet_config, tokenizer=tokenizer, - device_eval_batch_size=device_eval_batch_size, + device_eval_batch_size=train_cfg.device_eval_batch_size, icl_seq_len=eval_icl_seq_len, icl_subset_num_batches=icl_subset_num_batches, ) @@ -495,9 +497,9 @@ def main(cfg: DictConfig) -> Trainer: if mosaicml_logger is not None: log_train_analytics(mosaicml_logger, model_config, train_loader_config, - eval_loader_config, callback_configs, - tokenizer_name, load_path, icl_tasks_config, - eval_gauntlet_config) + eval_loader_config, train_cfg.callback_configs, + tokenizer_name, train_cfg.load_path, + icl_tasks_config, eval_gauntlet_config) # Build Model log.info('Initializing model...') model = build_composer_model( @@ -527,8 +529,8 @@ def main(cfg: DictConfig) -> Trainer: }) # Optimizer - optimizer_name: str = optimizer_config.pop('name') - optimizer = build_optimizer(model, optimizer_name, optimizer_config) + optimizer_name: str = train_cfg.optimizer.pop('name') + optimizer = build_optimizer(model, optimizer_name, train_cfg.optimizer) # Now add the eval metrics try: @@ -555,38 +557,38 @@ def main(cfg: DictConfig) -> Trainer: eval_dataloader=evaluators, optimizers=optimizer, schedulers=scheduler, - max_duration=max_duration, - eval_interval=eval_interval, - eval_subset_num_batches=eval_subset_num_batches, - progress_bar=progress_bar, - log_to_console=log_to_console, - console_log_interval=console_log_interval, + max_duration=train_cfg.max_duration, + eval_interval=train_cfg.eval_interval, + eval_subset_num_batches=train_cfg.eval_subset_num_batches, + progress_bar=train_cfg.progress_bar, + log_to_console=train_cfg.log_to_console, + console_log_interval=train_cfg.console_log_interval, loggers=loggers, callbacks=callbacks, - precision=precision, + precision=train_cfg.precision, algorithms=algorithms, - device_train_microbatch_size=device_train_microbatch_size, + device_train_microbatch_size=train_cfg.device_train_microbatch_size, fsdp_config=fsdp_config, - save_folder=save_folder, + save_folder=train_cfg.save_folder, save_filename=save_filename, save_latest_filename=save_latest_filename, - save_interval=save_interval, - save_num_checkpoints_to_keep=save_num_checkpoints_to_keep, - save_overwrite=save_overwrite, - save_weights_only=save_weights_only, - load_path=load_path, - load_weights_only=load_weights_only, - load_strict_model_weights=load_strict_model_weights, - load_ignore_keys=load_ignore_keys, - save_ignore_keys=save_ignore_keys, + save_interval=train_cfg.save_interval, + save_num_checkpoints_to_keep=train_cfg.save_num_checkpoints_to_keep, + save_overwrite=train_cfg.save_overwrite, + save_weights_only=train_cfg.save_weights_only, + load_path=train_cfg.load_path, + load_weights_only=train_cfg.load_weights_only, + load_strict_model_weights=train_cfg.load_strict_model_weights, + load_ignore_keys=train_cfg.load_ignore_keys, + save_ignore_keys=train_cfg.save_ignore_keys, autoresume=autoresume, - python_log_level=python_log_level, + python_log_level=train_cfg.python_log_level, dist_timeout=dist_timeout, profiler=profiler, - compile_config=compile_config, + compile_config=train_cfg.compile_config, ) - if should_log_config: + if train_cfg.log_config: log.info('Logging config') log_config(logged_cfg) torch.cuda.empty_cache() From 
5ff18164d60cd9ac543861e3832dd454b053987a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 20:53:34 +0000 Subject: [PATCH 080/201] fix --- scripts/eval/eval.py | 2 +- scripts/train/train.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index b8450c4cd8..21bbe6d8e5 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -342,7 +342,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: icl_subset_num_batches=eval_config.icl_subset_num_batches, metadata=eval_config.metadata, logged_config=logged_cfg, - should_log_config=eval_config.should_log_config, + should_log_config=eval_config.log_config, **model_cfg) trainers.append(trainer) diff --git a/scripts/train/train.py b/scripts/train/train.py index f33d4ed91a..f95ba4434b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -474,11 +474,11 @@ def main(cfg: DictConfig) -> Trainer: ## Evaluation if use_async_eval: evaluators = [] - if eval_first: + if train_cfg.eval_first: warnings.warn( 'AsyncEval callback does not support eval_first=True. Ignoring.' ) - eval_first = False + train_cfg.eval_first = False else: log.info('Building eval loader...') @@ -497,7 +497,7 @@ def main(cfg: DictConfig) -> Trainer: if mosaicml_logger is not None: log_train_analytics(mosaicml_logger, model_config, train_loader_config, - eval_loader_config, train_cfg.callback_configs, + eval_loader_config, train_cfg.callbacks, tokenizer_name, train_cfg.load_path, icl_tasks_config, eval_gauntlet_config) # Build Model @@ -595,7 +595,7 @@ def main(cfg: DictConfig) -> Trainer: gc.collect() # Eval first if requested - if eval_first and trainer.state.timestamp.batch.value == 0: + if train_cfg.eval_first and trainer.state.timestamp.batch.value == 0: trainer.eval() log.info('Starting training...') From 8e5bc1dd1ae49423104daff60dc107a9ac7dbdc4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 21:04:00 +0000 Subject: [PATCH 081/201] used helper function to make main cleaner --- scripts/train/train.py | 56 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index f95ba4434b..a3d11d88b3 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -11,7 +11,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch -from composer import Trainer +from composer import ComposerModel, Trainer from composer.core.callback import Callback from composer.profiler import (JSONTraceHandler, Profiler, TraceHandler, cyclic_schedule) @@ -272,6 +272,26 @@ def _make_train_and_log_config( return logged_cfg, train_cfg +def _log_num_params(model: ComposerModel, logged_cfg: DictConfig): + # Log number of parameters + if hasattr(model, 'n_total_params'): + n_params = model.n_total_params + n_trainable_params = n_params # TODO: we currently assume all parameters are trainable. 
+ else: + n_params = sum(p.numel() for p in model.parameters()) + n_trainable_params = sum( + p.numel() for p in model.parameters() if p.requires_grad) + if hasattr(model, 'n_active_params'): + n_active_params = model.n_active_params + else: + n_active_params = n_params + logged_cfg.update({ + 'n_params': n_params, + 'n_active_params': n_active_params, + 'n_trainable_params': n_trainable_params, + }) + + def main(cfg: DictConfig) -> Trainer: logged_cfg, train_cfg = _make_train_and_log_config(cfg) @@ -328,18 +348,16 @@ def main(cfg: DictConfig) -> Trainer: if train_cfg.eval_loader is not None and train_cfg.eval_loaders is not None: raise ValueError( 'Only one of `eval_loader` or `eval_loaders` should be provided.') - eval_loader_config: Optional[Union[DictConfig, ListConfig]] = DictConfig( + eval_loader_config = DictConfig( train_cfg.eval_loader ) if train_cfg.eval_loader is not None else ListConfig( train_cfg.eval_loaders) if train_cfg.eval_loaders is not None else None - icl_tasks_config: Optional[Union[ListConfig, str]] = ListConfig( + icl_tasks_config = ListConfig( train_cfg.icl_tasks ) if train_cfg.icl_tasks is not None else train_cfg.icl_tasks_str - eval_gauntlet_config: Optional[Union[DictConfig, str]] = DictConfig( + eval_gauntlet_config = DictConfig( train_cfg.eval_gauntlet ) if train_cfg.eval_gauntlet is not None else train_cfg.eval_gauntlet_str - icl_subset_num_batches: Optional[int] = train_cfg.icl_subset_num_batches - icl_seq_len: Optional[int] = train_cfg.icl_seq_len # Optional parameters will be set to default values if not specified. default_run_name: str = os.environ.get('RUN_NAME', 'llm') @@ -361,8 +379,6 @@ def main(cfg: DictConfig) -> Trainer: log.info('As run_name, save_folder, and save_latest_filename are set, \ changing autoresume default to True...') - autoresume: bool = train_cfg.autoresume - # Warn if fsdp is enabled but user only has 1 GPU if dist.get_world_size() == 1 and fsdp_config is not None: warnings.warn( @@ -482,7 +498,7 @@ def main(cfg: DictConfig) -> Trainer: else: log.info('Building eval loader...') - eval_icl_seq_len: int = icl_seq_len if icl_seq_len else train_cfg.max_seq_len + eval_icl_seq_len: int = train_cfg.icl_seq_len if train_cfg.icl_seq_len else train_cfg.max_seq_len evaluators, _, eval_gauntlet_callback = build_evaluators( eval_loader_config, icl_tasks_config, @@ -490,7 +506,7 @@ def main(cfg: DictConfig) -> Trainer: tokenizer=tokenizer, device_eval_batch_size=train_cfg.device_eval_batch_size, icl_seq_len=eval_icl_seq_len, - icl_subset_num_batches=icl_subset_num_batches, + icl_subset_num_batches=train_cfg.icl_subset_num_batches, ) if eval_gauntlet_callback is not None: callbacks.append(eval_gauntlet_callback) @@ -510,23 +526,7 @@ def main(cfg: DictConfig) -> Trainer: cfg=model_config, ) - # Log number of parameters - if hasattr(model, 'n_total_params'): - n_params = model.n_total_params - n_trainable_params = n_params # TODO: we currently assume all parameters are trainable. 
- else: - n_params = sum(p.numel() for p in model.parameters()) - n_trainable_params = sum( - p.numel() for p in model.parameters() if p.requires_grad) - if hasattr(model, 'n_active_params'): - n_active_params = model.n_active_params - else: - n_active_params = n_params - logged_cfg.update({ - 'n_params': n_params, - 'n_active_params': n_active_params, - 'n_trainable_params': n_trainable_params, - }) + _log_num_params(model, logged_cfg) # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') @@ -581,7 +581,7 @@ def main(cfg: DictConfig) -> Trainer: load_strict_model_weights=train_cfg.load_strict_model_weights, load_ignore_keys=train_cfg.load_ignore_keys, save_ignore_keys=train_cfg.save_ignore_keys, - autoresume=autoresume, + autoresume=train_cfg.autoresume, python_log_level=train_cfg.python_log_level, dist_timeout=dist_timeout, profiler=profiler, From f916a15bd4029077a3868ad7ad0cbf2b4729456f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 21:21:55 +0000 Subject: [PATCH 082/201] fix stuff --- scripts/eval/eval.py | 9 +++++---- scripts/train/train.py | 5 +++++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 21bbe6d8e5..71afeeeb74 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -46,7 +46,7 @@ def evaluate_model( device_eval_batch_size: int, eval_gauntlet_config: Optional[Union[str, DictConfig]], eval_loader_config: Optional[Union[DictConfig, ListConfig]], - fsdp_config: Optional[Dict], + fsdp_config: Optional[Dict[str, Any]], loggers: List[LoggerDestination], python_log_level: Optional[str], precision: str, @@ -267,9 +267,10 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: eval_config.eval_gauntlet ) if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str + fsdp_config = om.to_container(eval_config.fsdp_config) assert isinstance( - eval_config.fsdp_config, Dict - ) or eval_config.fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(eval_config.fsdp_config)}' + fsdp_config, Dict + ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}' # Mandatory Evaluation Parameters icl_tasks: Union[ListConfig, str, None] = ListConfig( @@ -332,7 +333,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: device_eval_batch_size=eval_config.device_eval_batch_size, eval_gauntlet_config=eval_gauntlet_config, eval_loader_config=eval_loader_config, - fsdp_config=eval_config.fsdp_config, + fsdp_config=fsdp_config, loggers=loggers, python_log_level=eval_config.python_log_level, precision=eval_config.precision, diff --git a/scripts/train/train.py b/scripts/train/train.py index a3d11d88b3..00e09aecde 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -146,6 +146,11 @@ class TrainConfig: def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" + # check for missing mandatory fields + for field in TRAIN_CONFIG_KEYS: + _ = getattr(train_config, field) + + # validate the rest of the config loaders = [train_config.train_loader] if train_config.eval_loaders is not None: for loader in (train_config.eval_loaders or []): # pyright From 0baca2e86eff2ae5ad6ca0b67b04c9d7d4e0bcf5 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 21:33:32 +0000 Subject: [PATCH 083/201] fix pyright --- scripts/eval/eval.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 71afeeeb74..6243838f57 100644 --- 
a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -271,6 +271,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: assert isinstance( fsdp_config, Dict ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}' + assert all(isinstance(k, str) + for k in fsdp_config.keys()), 'fsdp_config keys must be strings' + fsdp_config = {str(k): v for k, v in fsdp_config.items()} # pyright fix # Mandatory Evaluation Parameters icl_tasks: Union[ListConfig, str, None] = ListConfig( From 05342b2af479da4af0879f9339e9f9043e4f01b1 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 21:41:59 +0000 Subject: [PATCH 084/201] added fix and explanation --- scripts/eval/eval.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 6243838f57..c11a62cc76 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -267,13 +267,17 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: eval_config.eval_gauntlet ) if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str - fsdp_config = om.to_container(eval_config.fsdp_config) + # the below line fixes a strange issue where the fsdp_config is a DictConfig rather than a Dict, + # despite the type hint being Dict[str, Any] and the `cfg` object being sent to `to_container`. + # I think it might be rewrapped in DictConfig during the `structured` call in `_make_eval_and_log_config`. + # this redundant check is necessary to avoid a pyright error. + fsdp_config = om.to_container( + eval_config.fsdp_config) if eval_config.fsdp_config else None assert isinstance( fsdp_config, Dict ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}' - assert all(isinstance(k, str) - for k in fsdp_config.keys()), 'fsdp_config keys must be strings' - fsdp_config = {str(k): v for k, v in fsdp_config.items()} # pyright fix + fsdp_config = {str(k): v for k, v in fsdp_config.items() + } if fsdp_config else None # pyright fix # Mandatory Evaluation Parameters icl_tasks: Union[ListConfig, str, None] = ListConfig( From 41d9255a21ce9c46166f1e4d065fb9417260263f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 22:01:43 +0000 Subject: [PATCH 085/201] fix typo in unit test update smh --- tests/a_scripts/train/test_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 9efc04755d..5f624906c8 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -165,7 +165,7 @@ def test_validate_config(): match= 'MoEs with expert parallelism (.*) require `use_orig_params=True`.' 
): - validate_config(om.structured(TrainConfig(**test_cfg))) + validate_config(om.structured(TrainConfig(**test_cfg_dict))) def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path): From 0f8b26bf78562219df56e986fc81bf14cf57c2bc Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 20:50:32 -0400 Subject: [PATCH 086/201] Update llmfoundry/registry.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index 5aa0b93208..8d35968d7a 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -109,7 +109,7 @@ 'dataloaders', generic_type=Callable[ ..., - DataSpec], # the arguments to the dataloader may vary depending on the contents of the config. + DataSpec], # The arguments to the dataloader may vary depending on the contents of the config. entry_points=True, description=_dataloaders_description) From 5d805c3d1670e93275a130c0ec08967b0324054c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 20:51:03 -0400 Subject: [PATCH 087/201] Update scripts/train/train.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 00e09aecde..af2423fe76 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -97,7 +97,7 @@ class TrainConfig: callbacks: Optional[Dict[str, Any]] = None algorithms: Optional[Dict[str, Any]] = None - # checkpoints + # Checkpoints save_folder: Optional[str] = None save_latest_filename: Optional[str] = None save_overwrite: bool = False From cab542ff84303d641112fcc6ba874da0da148ede Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 20:56:52 -0400 Subject: [PATCH 088/201] Update scripts/train/train.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index af2423fe76..676457638b 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -71,7 +71,7 @@ class TrainConfig: expandable_segments: bool = False cuda_load_lazy: bool = False - # distributed training parameters + # Distributed training parameters dist_timeout: Union[int, float] = 600.0 fsdp_config: Optional[Dict[str, Any]] = None From ebcafc4953a6431617cdf4f34aae2537284fbd9c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 20:57:26 -0400 Subject: [PATCH 089/201] Update scripts/train/train.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 676457638b..1a4f06cf6d 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -75,7 +75,7 @@ class TrainConfig: dist_timeout: Union[int, float] = 600.0 fsdp_config: Optional[Dict[str, Any]] = None - # evaluation parameters + # Evaluation parameters eval_loader: Optional[Dict[str, Any]] = None eval_loaders: Optional[List[Dict[str, Any]]] = None icl_tasks: Optional[List[Dict[str, Any]]] = None From f744bd466c5b1782c0f5bdc069b090ca5dc573d3 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 18 Apr 2024 21:07:29 -0400 Subject: [PATCH 090/201] Apply suggestions from code review Co-authored-by: Daniel King 
<43149077+dakinggg@users.noreply.github.com> --- llmfoundry/data/finetuning/dataloader.py | 2 +- scripts/eval/eval.py | 22 +++++++++++----------- scripts/train/train.py | 22 +++++++++++----------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index f2610920e2..b8778a37f6 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -452,7 +452,7 @@ def _build_collate_fn( tokenizer: PreTrainedTokenizerBase, device_batch_size: int, ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: - # these `.get` calls are safe because the dataset_cfg is validated for extra keys + # These `.get` calls are safe because the dataset_cfg is validated for extra keys dataset_cfg = dataloader_cfg.dataset target_responses = dataset_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index c11a62cc76..8f6302c884 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -175,38 +175,38 @@ class EvalConfig: # Eval Config optional parameters: code_paths: Optional[List[str]] = None - # eval hyperparameters + # Eval hyperparameters eval_gauntlet: Optional[Dict[str, Any]] = None eval_gauntlet_str: Optional[str] = None eval_loader: Optional[Dict[str, Any]] = None eval_loaders: Optional[List[Dict[str, Any]]] = None eval_subset_num_batches: int = -1 icl_subset_num_batches: Optional[int] = None - # one of icl_tasks or icl_tasks_str must be specified + # One of icl_tasks or icl_tasks_str must be specified icl_tasks: Optional[List[Dict[str, Any]]] = None icl_tasks_str: Optional[str] = None - # loggirg parameters + # Logging parameters python_log_level: Optional[str] = None loggers: Optional[Dict[str, Any]] = None log_config: bool = True - # model/run parameters + # Model/run parameters seed: int = 17 precision: str = 'amp_bf16' run_name: Optional[str] = None model_name_or_path: Optional[str] = None metadata: Optional[Dict[str, str]] = None - # distributed parameters + # Distributed parameters dist_timeout: Union[float, int] = 600.0 fsdp_config: Optional[Dict[str, Any]] = None - # callback parameters + # Callback parameters callbacks: Optional[Dict[str, Any]] = None - # variables to ignore - variables: Optional[Dict[str, Any]] = None # variables to ignore + # Variables to ignore + variables: Optional[Dict[str, Any]] = None EVAL_CONFIG_KEYS = set(field.name for field in fields(EvalConfig)) @@ -219,7 +219,7 @@ def _make_eval_and_log_config(cfg: DictConfig) -> Tuple[DictConfig, EvalConfig]: assert all(isinstance(k, str) for k in unstructured_config.keys()) unstructured_config = {str(k): v for k, v in unstructured_config.items()} - # flatten union types before creating structured config: + # Flatten union types before creating structured config: if 'eval_gauntlet' in unstructured_config: if isinstance(unstructured_config['eval_gauntlet'], str): unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( @@ -305,13 +305,13 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: logging.getLogger('llmfoundry').setLevel( eval_config.python_log_level.upper()) - # default argument values for evaluate_model + # Default argument values for evaluate_model eval_gauntlet_df = None models_df = None composite_scores = None trainers = [] - # build loggers + # Build loggers loggers: List[LoggerDestination] = [ build_logger(name, logger_cfg) for name, logger_cfg in (eval_config.loggers or {}).items() 
diff --git a/scripts/train/train.py b/scripts/train/train.py index 1a4f06cf6d..18dc3e2dc4 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -85,7 +85,7 @@ class TrainConfig: icl_subset_num_batches: Optional[int] = None icl_seq_len: Optional[int] = None - # logging + # Logging loggers: Optional[Dict[str, Any]] = None progress_bar: bool = False log_to_console: bool = True @@ -93,7 +93,7 @@ class TrainConfig: console_log_interval: Union[int, str] = '1ba' log_config: bool = True - # callbacks + # Callbacks callbacks: Optional[Dict[str, Any]] = None algorithms: Optional[Dict[str, Any]] = None @@ -111,30 +111,30 @@ class TrainConfig: load_ignore_keys: Optional[List[str]] = None save_ignore_keys: Optional[List[str]] = None - # dataloader + # Dataloader device_train_microbatch_size: Union[str, int] = 'auto' data_local: Optional[str] = None data_remote: Optional[str] = None - # eval dataloader + # Eval dataloader eval_subset_num_batches: int = -1 eval_first: bool = False compile_config: Optional[Dict[str, Any]] = None - # metadata + # Metadata metadata: Optional[Dict[str, Any]] = None run_name: Optional[str] = None - # resumption + # Resumption autoresume: bool = False - # gradient accumulation + # Gradient accumulation device_train_grad_accum: Optional[int] = None - # profiling + # Profiling profiler: Optional[Dict[str, Any]] = None - # ignore keys + # Ignore keys global_seed: Optional[int] = None global_train_batch_size: Optional[int] = None n_gpus: Optional[int] = None @@ -146,11 +146,11 @@ class TrainConfig: def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" - # check for missing mandatory fields + # Check for missing mandatory fields for field in TRAIN_CONFIG_KEYS: _ = getattr(train_config, field) - # validate the rest of the config + # Validate the rest of the config loaders = [train_config.train_loader] if train_config.eval_loaders is not None: for loader in (train_config.eval_loaders or []): # pyright From fba1f63e62f21a50e713c041ad5e6bf3a0a9e6d3 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 18:07:16 +0000 Subject: [PATCH 091/201] see if this fails --- tests/models/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index f74fe16f93..046d40a4ff 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -766,7 +766,7 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool, assert mpt.config.d_model == 128 assert mpt.config.n_heads == 4 assert mpt.config.n_layers == 2 - if ffn_hidden_size is None: # type: ignore (sometimes it may not be none) + if ffn_hidden_size is None: assert mpt.config.expansion_ratio == expansion_ratio else: assert mpt.config.ffn_config['ffn_hidden_size'] == ffn_hidden_size From a2de27ac09789409f52b07eb3d021c42a4c0ca6e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 18:23:14 +0000 Subject: [PATCH 092/201] reject name and device rather than ignoring --- llmfoundry/models/hf/hf_causal_lm.py | 11 ----------- tests/fixtures/models.py | 3 ++- tests/models/hf/test_hf_config.py | 12 ++++++++---- tests/models/hf/test_hf_fsdp.py | 1 + 4 files changed, 11 insertions(+), 16 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 29ec6439e6..0672cfa638 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -73,18 +73,7 @@ def __init__( use_train_metrics: bool 
= True, additional_train_metrics: Optional[List] = None, additional_eval_metrics: Optional[List] = None, - # ignored args - name: Optional[str] = None, - device: Optional[Any] = None, ): - - if device is not None: - warnings.warn( - 'device is deprecated and will be removed in a future release. ' - + 'Please use init_device instead.', - DeprecationWarning, - ) - from llmfoundry.utils.builders import build_metric config_overrides = config_overrides or {} diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py index 616d66085c..3fd004735a 100644 --- a/tests/fixtures/models.py +++ b/tests/fixtures/models.py @@ -15,8 +15,9 @@ def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase): + name = config.pop('name') model = build_composer_model( - name=config.name, + name=name, cfg=config, tokenizer=tokenizer, ) diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index e79756aba3..62f0b7b0b3 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -46,8 +46,9 @@ def test_remote_code_false_mpt( with pytest.raises( ValueError, match='trust_remote_code must be set to True for MPT models.'): + name = test_cfg.model.pop('name') _ = build_composer_model( - name=test_cfg.model.name, + name=name, cfg=test_cfg.model, tokenizer=tokenizer, ) @@ -162,8 +163,9 @@ def test_hf_config_override( }) hf_model_config.model = model_cfg + name = hf_model_config.model.pop('name') hf_model = build_composer_model( - name=hf_model_config.model.name, + name=name, cfg=hf_model_config.model, tokenizer=tokenizer, ) @@ -197,8 +199,9 @@ def test_rope_scaling_override(): } model_cfg = om.create(model_cfg) + name = model_cfg.pop('name') model = build_composer_model( - name=model_cfg.name, + name=name, cfg=model_cfg, tokenizer=None, # type: ignore ) @@ -224,8 +227,9 @@ def test_nested_override(): } model_cfg = om.create(model_cfg) + name = model_cfg.pop('name') model = build_composer_model( - name=model_cfg.name, + name=name, cfg=model_cfg, tokenizer=None, # type: ignore ) diff --git a/tests/models/hf/test_hf_fsdp.py b/tests/models/hf/test_hf_fsdp.py index 5405d30697..274ebeabcd 100644 --- a/tests/models/hf/test_hf_fsdp.py +++ b/tests/models/hf/test_hf_fsdp.py @@ -21,6 +21,7 @@ def test_olmo_wraps(): config = DictConfig(conf) + config.model.pop('name') model = ComposerHFCausalLM(**config.model, tokenizer=None) # check that all the modules we except are blocked from FSDP wrapping From b6ebcccddd0126e9defa2f5cd07e3d362a9685d9 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 18:24:12 +0000 Subject: [PATCH 093/201] pretrained is not a bool --- llmfoundry/models/hf/hf_causal_lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 0672cfa638..63be7277ab 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -61,7 +61,7 @@ def __init__( self, tokenizer: PreTrainedTokenizerBase, pretrained_model_name_or_path: str, - pretrained: Optional[bool] = True, + pretrained: bool = True, pretrained_lora_id_or_path: Optional[str] = None, trust_remote_code: bool = True, use_auth_token: bool = False, From 604f25494e989654b2a057fe37b6fd086b421534 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 18:27:00 +0000 Subject: [PATCH 094/201] add validation to make sure the user doesn't set both --- scripts/eval/eval.py | 12 ++++++++++++ scripts/train/train.py | 12 ++++++++++++ 2 files changed, 24 
insertions(+) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 8f6302c884..5f29e04264 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -222,14 +222,26 @@ def _make_eval_and_log_config(cfg: DictConfig) -> Tuple[DictConfig, EvalConfig]: # Flatten union types before creating structured config: if 'eval_gauntlet' in unstructured_config: if isinstance(unstructured_config['eval_gauntlet'], str): + if 'eval_gauntlet_str' in unstructured_config: + raise ValueError( + 'Cannot specify both eval_gauntlet and eval_gauntlet_str in the config' + ) unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( 'eval_gauntlet') if (loader := unstructured_config.get('eval_loader', None)) is not None: if isinstance(loader, list): + if 'eval_loaders' in unstructured_config: + raise ValueError( + 'Cannot specify both eval_loader and eval_loaders in the config' + ) unstructured_config['eval_loaders'] = unstructured_config.pop( 'eval_loader') if 'icl_tasks' in unstructured_config: if isinstance(unstructured_config['icl_tasks'], str): + if 'icl_tasks_str' in unstructured_config: + raise ValueError( + 'Cannot specify both icl_tasks and icl_tasks_str in the config' + ) unstructured_config['icl_tasks_str'] = unstructured_config.pop( 'icl_tasks') else: diff --git a/scripts/train/train.py b/scripts/train/train.py index 18dc3e2dc4..892a19d704 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -240,14 +240,26 @@ def _make_train_and_log_config( # Structured config does not support unions of containers, so separate single and plural containers if (loader := unstructured_config.get('eval_loader', None)) is not None: if isinstance(loader, list): + if 'eval_loaders' in unstructured_config: + raise ValueError( + 'Only one of `eval_loader` or `eval_loaders` should be provided.' + ) unstructured_config['eval_loaders'] = unstructured_config.pop( 'eval_loader') if (tasks := unstructured_config.get('icl_tasks', None)) is not None: if isinstance(tasks, str): + if 'icl_tasks_str' in unstructured_config: + raise ValueError( + 'Only one of `icl_tasks` or `icl_tasks_str` should be provided.' + ) unstructured_config['icl_tasks_str'] = unstructured_config.pop( 'icl_tasks') if (gauntlet := unstructured_config.get('eval_gauntlet', None)) is not None: if isinstance(gauntlet, str): + if 'eval_gauntlet_str' in unstructured_config: + raise ValueError( + 'Only one of `eval_gauntlet` or `eval_gauntlet_str` should be provided.' + ) unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( 'eval_gauntlet') From ba9391be45e58d686efb766276b4bbe526c8b391 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 18:43:32 +0000 Subject: [PATCH 095/201] forbid config keys --- llmfoundry/utils/config_utils.py | 7 +++++++ scripts/eval/eval.py | 18 +++++------------- scripts/train/train.py | 28 ++++++++++++++-------------- 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 026169df11..5ca1b422a6 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -32,6 +32,13 @@ def to_str_dict(cfg: DictConfig) -> Dict[str, Any]: return {str(k): v for k, v in cfg_dict.items()} +def forbid_config_key(cfg_dict: Dict[str, Any], key: str): + if key in cfg_dict: + raise ValueError( + f'Config key `{key}` should not be set. Please remove it from the config.' 
+ ) + + def pop_config(cfg: DictConfig, key: str, must_exist: bool = True, diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 5f29e04264..81e600e0c7 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -28,7 +28,8 @@ build_callback, build_composer_model, build_evaluators, build_logger, build_tokenizer) -from llmfoundry.utils.config_utils import log_config, process_init_device +from llmfoundry.utils.config_utils import (forbid_config_key, log_config, + process_init_device) from llmfoundry.utils.registry_utils import import_file log = logging.getLogger(__name__) @@ -221,27 +222,18 @@ def _make_eval_and_log_config(cfg: DictConfig) -> Tuple[DictConfig, EvalConfig]: # Flatten union types before creating structured config: if 'eval_gauntlet' in unstructured_config: + forbid_config_key(unstructured_config, 'eval_gauntlet_str') if isinstance(unstructured_config['eval_gauntlet'], str): - if 'eval_gauntlet_str' in unstructured_config: - raise ValueError( - 'Cannot specify both eval_gauntlet and eval_gauntlet_str in the config' - ) unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( 'eval_gauntlet') if (loader := unstructured_config.get('eval_loader', None)) is not None: + forbid_config_key(unstructured_config, 'eval_loaders') if isinstance(loader, list): - if 'eval_loaders' in unstructured_config: - raise ValueError( - 'Cannot specify both eval_loader and eval_loaders in the config' - ) unstructured_config['eval_loaders'] = unstructured_config.pop( 'eval_loader') if 'icl_tasks' in unstructured_config: + forbid_config_key(unstructured_config, 'icl_tasks_str') if isinstance(unstructured_config['icl_tasks'], str): - if 'icl_tasks_str' in unstructured_config: - raise ValueError( - 'Cannot specify both icl_tasks and icl_tasks_str in the config' - ) unstructured_config['icl_tasks_str'] = unstructured_config.pop( 'icl_tasks') else: diff --git a/scripts/train/train.py b/scripts/train/train.py index 892a19d704..b2cc459a0f 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -35,8 +35,8 @@ build_composer_model, build_evaluators, build_logger, build_optimizer, build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, pop_config, - process_init_device, +from llmfoundry.utils.config_utils import (forbid_config_key, log_config, + pop_config, process_init_device, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -77,11 +77,12 @@ class TrainConfig: # Evaluation parameters eval_loader: Optional[Dict[str, Any]] = None - eval_loaders: Optional[List[Dict[str, Any]]] = None + eval_loaders: Optional[List[Dict[ + str, Any]]] = None # should not be set by the user icl_tasks: Optional[List[Dict[str, Any]]] = None - icl_tasks_str: Optional[str] = None + icl_tasks_str: Optional[str] = None # should not be set by the user eval_gauntlet: Optional[Dict[str, Any]] = None - eval_gauntlet_str: Optional[str] = None + eval_gauntlet_str: Optional[str] = None # should not be set by the user icl_subset_num_batches: Optional[int] = None icl_seq_len: Optional[int] = None @@ -160,7 +161,6 @@ def validate_config(train_config: TrainConfig): `label` attribute.') loaders.append(loader) if train_config.eval_loader is not None: - assert train_config.eval_loaders is None, 'Only one of `eval_loader` or `eval_loaders` should be provided.' 
loaders.append(train_config.eval_loader) for loader in loaders: if loader['name'] == 'text': @@ -187,7 +187,9 @@ def validate_config(train_config: TrainConfig): if (train_config.model.get('fc_type', 'torch') == 'te' or 'te' in train_config.model.get('ffn_config', {}).get( 'ffn_type', 'mptmlp')): - fsdp_config = train_config.fsdp_config or DictConfig({}) + if train_config.fsdp_config is None: + train_config.fsdp_config = {} + fsdp_config = train_config.fsdp_config act_ckpt = fsdp_config.get('activation_checkpointing', False) if fsdp_config else False act_ckpt_reentrant = fsdp_config.get( @@ -198,9 +200,8 @@ def validate_config(train_config: TrainConfig): + '`activation_checkpointing_reentrant = True`. ' + 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' ) - if train_config.fsdp_config is not None: - train_config.fsdp_config[ - 'activation_checkpointing_reentrant'] = False + train_config.fsdp_config[ + 'activation_checkpointing_reentrant'] = False if train_config.model.get('ffn_config', {}).get('ffn_type', 'mptmlp') == 'te_ln_mlp': @@ -239,14 +240,12 @@ def _make_train_and_log_config( # Structured config does not support unions of containers, so separate single and plural containers if (loader := unstructured_config.get('eval_loader', None)) is not None: + forbid_config_key(unstructured_config, 'eval_loaders') if isinstance(loader, list): - if 'eval_loaders' in unstructured_config: - raise ValueError( - 'Only one of `eval_loader` or `eval_loaders` should be provided.' - ) unstructured_config['eval_loaders'] = unstructured_config.pop( 'eval_loader') if (tasks := unstructured_config.get('icl_tasks', None)) is not None: + forbid_config_key(unstructured_config, 'icl_tasks_str') if isinstance(tasks, str): if 'icl_tasks_str' in unstructured_config: raise ValueError( @@ -255,6 +254,7 @@ def _make_train_and_log_config( unstructured_config['icl_tasks_str'] = unstructured_config.pop( 'icl_tasks') if (gauntlet := unstructured_config.get('eval_gauntlet', None)) is not None: + forbid_config_key(unstructured_config, 'eval_gauntlet_str') if isinstance(gauntlet, str): if 'eval_gauntlet_str' in unstructured_config: raise ValueError( From fe7d7b928751441d961a22e0c4cbf6026f98bafc Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 18:46:04 +0000 Subject: [PATCH 096/201] oops forgot eval --- scripts/eval/eval.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 81e600e0c7..6195c49ac8 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -249,7 +249,6 @@ def _make_eval_and_log_config(cfg: DictConfig) -> Tuple[DictConfig, EvalConfig]: warnings.warn( f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. Top-level variables are deprecated and will not be supported in future releases.', DeprecationWarning) - # TODO (milo): delete the below line once we deprecate variables at the top level. 
unstructured_config['variables'][key] = unstructured_config.pop(key) eval_config: EvalConfig = om.structured(EvalConfig(**unstructured_config)) From bcedcfd7c468f225a23cb388e3e0a5bfe3265055 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 19:02:58 +0000 Subject: [PATCH 097/201] address coomments --- llmfoundry/utils/builders.py | 13 ++----------- llmfoundry/utils/config_utils.py | 14 +++++++------- scripts/train/train.py | 7 ++++--- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 615b0b09d0..1f57a28b1a 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -29,7 +29,6 @@ from llmfoundry.eval.datasets.in_context_learning_evaluation import \ get_icl_task_dataloader from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.config_utils import to_str_dict from llmfoundry.utils.registry_utils import construct_from_registry from llmfoundry.utils.warnings import VersionedDeprecationWarning @@ -165,7 +164,7 @@ def build_icl_data_and_gauntlet( def build_composer_model( name: str, - cfg: Union[Dict[str, Any], DictConfig], + cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, init_context: Optional[ContextManager] = None, master_weights_dtype: Optional[str] = None, @@ -185,14 +184,6 @@ def build_composer_model( if init_context is None: init_context = contextlib.nullcontext() - if isinstance(cfg, DictConfig): - model_cfg = to_str_dict(cfg) # pyright - elif isinstance(cfg, dict): - model_cfg = {str(k): v for k, v in cfg.items()} - else: - raise ValueError( - f'Invalid type for cfg: {type(cfg)}. Must be DictConfig or Dict.') - with init_context: model = construct_from_registry( name=name, @@ -200,7 +191,7 @@ def build_composer_model( pre_validation_function=ComposerModel, post_validation_function=None, kwargs={ - **model_cfg, 'tokenizer': tokenizer + **cfg, 'tokenizer': tokenizer }, ) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 5ca1b422a6..17edc46380 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -113,22 +113,22 @@ def update_batch_size_info(cfg: Dict[str, Any]) -> Dict[str, Any]: return cfg -def process_init_device(model_cfg: DictConfig, fsdp_config: Optional[Dict]): +def process_init_device(model_cfg: Dict[str, Any], fsdp_config: Optional[Dict]): # Restrict model init_device to 'meta' and 'cpu', # using 'cuda' vs. 'cuda:id' is tricky and can lead to common user errors # when multiple GPUs are available. # Also 'meta' is only valid when using FSDP init_context = contextlib.nullcontext() if 'init_device' in model_cfg: - assert model_cfg.init_device in ['meta', 'cpu', 'mixed'] - if fsdp_config is None and model_cfg.init_device == 'meta': + assert model_cfg['init_device'] in ['meta', 'cpu', 'mixed'] + if fsdp_config is None and model_cfg['init_device'] == 'meta': warnings.warn( "Using `cfg.model.init_device='meta'` is only valid when using FSDP! " +\ "Reverting to `cfg.model.init_device='cpu'`.") - model_cfg.init_device = 'cpu' - if model_cfg.init_device == 'meta': + model_cfg['init_device'] = 'cpu' + if model_cfg['init_device'] == 'meta': init_context = init_empty_weights() - if model_cfg.init_device == 'mixed': + if model_cfg['init_device'] == 'mixed': if fsdp_config is None: raise NotImplementedError( 'Using init_device `mixed` is only supported with FSDP. 
' + @@ -153,7 +153,7 @@ def process_init_device(model_cfg: DictConfig, fsdp_config: Optional[Dict]): raise ValueError( 'device_mesh must be specified in fsdp_config when using MoE with moe_world_size > 1.' ) - model_cfg.ffn_config.device_mesh = fsdp_config['device_mesh'] + model_cfg['ffn_config']['device_mesh'] = fsdp_config['device_mesh'] # No mixed precision needed for weights when they're already 16 bits master_dtype = model_cfg.get('master_weights_dtype') diff --git a/scripts/train/train.py b/scripts/train/train.py index b2cc459a0f..a5a155faef 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -356,8 +356,8 @@ def main(cfg: DictConfig) -> Trainer: dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - model_config: DictConfig = DictConfig(train_cfg.model) - train_loader_config: DictConfig = DictConfig(train_cfg.train_loader) + model_config = train_cfg.model + train_loader_config = train_cfg.train_loader # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config @@ -535,8 +535,9 @@ def main(cfg: DictConfig) -> Trainer: icl_tasks_config, eval_gauntlet_config) # Build Model log.info('Initializing model...') + name = model_config.pop('name') model = build_composer_model( - name=model_config.name, + name=name, tokenizer=tokenizer, init_context=init_context, master_weights_dtype=model_config.get('master_weights_dtype', None), From 8a2bb1995dfe7260644ca4d3f8e1bf5f54c3dd6c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 19:06:12 +0000 Subject: [PATCH 098/201] removed redundant check --- scripts/train/train.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index a5a155faef..1f5b845f68 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -362,9 +362,6 @@ def main(cfg: DictConfig) -> Trainer: # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config - if train_cfg.eval_loader is not None and train_cfg.eval_loaders is not None: - raise ValueError( - 'Only one of `eval_loader` or `eval_loaders` should be provided.') eval_loader_config = DictConfig( train_cfg.eval_loader ) if train_cfg.eval_loader is not None else ListConfig( From 2fdbbc550fdd49b15a33a7c893693d1d26eca71f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 19:53:07 +0000 Subject: [PATCH 099/201] updated callsites not to use name --- scripts/eval/eval.py | 3 +- scripts/inference/benchmarking/benchmark.py | 3 +- tests/a_scripts/eval/test_eval.py | 6 ++-- .../inference/test_convert_composer_to_hf.py | 20 +++++++----- tests/models/hf/test_fsdp_weight_tying.py | 6 ++-- tests/models/hf/test_hf_config.py | 15 +++++---- tests/models/hf/test_hf_peft_wrapping.py | 5 ++- tests/models/hf/test_hf_v_mpt.py | 11 ++++--- tests/models/layers/test_huggingface_flash.py | 6 ++-- tests/models/test_model.py | 31 ++++++++++++------- 10 files changed, 61 insertions(+), 45 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 6195c49ac8..ed12c33cf3 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -105,7 +105,8 @@ def evaluate_model( init_context = process_init_device(DictConfig(model), fsdp_config) - composer_model = build_composer_model(name=model['name'], + name = model.pop('name') + composer_model = build_composer_model(name=name, tokenizer=tokenizer, init_context=init_context, cfg=model) diff --git a/scripts/inference/benchmarking/benchmark.py 
b/scripts/inference/benchmarking/benchmark.py index 3cbc70974e..e50a8b7d1f 100644 --- a/scripts/inference/benchmarking/benchmark.py +++ b/scripts/inference/benchmarking/benchmark.py @@ -64,8 +64,9 @@ def main(config: DictConfig): tokenizer_name=tokenizer_name, tokenizer_kwargs=tokenizer_kwargs, ) + name = config.model.pop('name') composer_model = build_composer_model( - name=config.model.name, + name=name, tokenizer=tokenizer, cfg=config.model, ) diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index 95979fd986..adb11acff7 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -13,6 +13,7 @@ from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model +from llmfoundry.utils.config_utils import to_str_dict from scripts.eval.eval import main # noqa: E402 from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, gpt_tiny_cfg) @@ -47,9 +48,10 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): tokenizer = build_tokenizer(model_cfg.tokenizer.name, model_cfg.tokenizer.get('kwargs', {})) # build model - model = build_composer_model(name=model_cfg.model.name, + name = model_cfg.model.pop('name') + model = build_composer_model(name=name, tokenizer=tokenizer, - cfg=model_cfg.model) + cfg=to_str_dict(model_cfg.model)) # create mocked save checkpoint trainer = Trainer(model=model, device=device) diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index e2cab79c34..216b9a4694 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -28,7 +28,7 @@ from llmfoundry.models.mpt import MPTConfig from llmfoundry.utils.builders import (build_composer_model, build_optimizer, build_tokenizer) -from llmfoundry.utils.config_utils import process_init_device +from llmfoundry.utils.config_utils import process_init_device, to_str_dict from scripts.inference.convert_composer_to_hf import convert_composer_to_hf from tests.data_utils import make_tiny_ft_dataset @@ -774,7 +774,8 @@ def test_huggingface_conversion_callback( **dataloader_cfg, ) - original_model = build_composer_model(model_cfg['name'], + name = model_cfg.pop('name') + original_model = build_composer_model(name, tokenizer=tokenizer, cfg=model_cfg) optimizer_name = optimizer_config.pop('name') @@ -871,10 +872,11 @@ def test_convert_and_generate(model: str, tie_word_embeddings: bool, om_cfg['model']['init_device'] = 'cpu' tokenizer = transformers.AutoTokenizer.from_pretrained( om_cfg.tokenizer.name, use_auth_token=model == 'llama2') + name = om_cfg.model.pop('name') original_model = build_composer_model( - name=om_cfg['model'].name, + name=name, tokenizer=tokenizer, - cfg=om_cfg['model'], + cfg=to_str_dict(om_cfg['model']), ) trainer = Trainer(model=original_model, device='cpu' if not model == 'mptmoe' else 'gpu') @@ -943,10 +945,11 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, om_cfg['tie_word_embeddings'] = tie_word_embeddings tokenizer = transformers.AutoTokenizer.from_pretrained( om_cfg.tokenizer.name) + name = om_cfg.model.pop('name') original_model = build_composer_model( - name=om_cfg['model'].name, + name=name, tokenizer=tokenizer, - cfg=om_cfg['model'], + cfg=to_str_dict(om_cfg['model']), ) trainer = Trainer(model=original_model, device='cpu' if not 'moe' in conf_path else 'gpu') @@ -1153,11 +1156,12 @@ def 
test_mptmoe_huggingface_conversion_callback( optimizer_name = optimizer_config.pop('name') init_context = process_init_device(model_cfg, fsdp_config) + name = model_cfg.pop('name') original_model = build_composer_model( - name=model_cfg.name, + name=name, tokenizer=tokenizer, init_context=init_context, - cfg=model_cfg, + cfg=to_str_dict(model_cfg), ) optimizer = build_optimizer(original_model, optimizer_name, diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 712e515653..6ffe144fa5 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -10,6 +10,7 @@ from omegaconf import OmegaConf as om from llmfoundry.utils.builders import build_composer_model, build_tokenizer +from llmfoundry.utils.config_utils import to_str_dict @pytest.mark.world_size(2) @@ -67,9 +68,10 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, tokenizer_kwargs={'model_max_length': 32}, ) + name = model_cfg.pop('name') original_model = build_composer_model( - name=model_cfg['name'], - cfg=model_cfg, + name=name, + cfg=to_str_dict(model_cfg), tokenizer=tokenizer, ) diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index 62f0b7b0b3..207ad4068c 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -10,13 +10,13 @@ import pytest import torch -from omegaconf import DictConfig from omegaconf import OmegaConf as om from transformers import AutoModelForCausalLM, PretrainedConfig from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model +from llmfoundry.utils.config_utils import to_str_dict def test_remote_code_false_mpt( @@ -49,7 +49,7 @@ def test_remote_code_false_mpt( name = test_cfg.model.pop('name') _ = build_composer_model( name=name, - cfg=test_cfg.model, + cfg=to_str_dict(test_cfg.model), tokenizer=tokenizer, ) @@ -139,9 +139,10 @@ def test_hf_config_override( tokenizer_name = tokenizer_cfg['name'] tokenizer_kwargs = tokenizer_cfg.get('kwargs', {}) tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs) + name = test_cfg.model.pop('name') model = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + name=name, + cfg=to_str_dict(test_cfg.model), tokenizer=tokenizer, ) @@ -155,12 +156,12 @@ def test_hf_config_override( # load hf causal lm model with config_overrides hf_model_config = deepcopy(test_cfg) - model_cfg = DictConfig({ + model_cfg = { 'name': 'hf_causal_lm', 'pretrained_model_name_or_path': save_path, 'pretrained': False, 'config_overrides': model_cfg_overrides, - }) + } hf_model_config.model = model_cfg name = hf_model_config.model.pop('name') @@ -197,7 +198,6 @@ def test_rope_scaling_override(): 'pretrained': False, 'init_device': 'cpu', } - model_cfg = om.create(model_cfg) name = model_cfg.pop('name') model = build_composer_model( @@ -225,7 +225,6 @@ def test_nested_override(): 'pretrained': False, 'init_device': 'meta', } - model_cfg = om.create(model_cfg) name = model_cfg.pop('name') model = build_composer_model( diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 7fe886ffe3..052704e785 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -9,7 +9,6 @@ import torch import transformers from composer import Trainer -from omegaconf import OmegaConf as om from peft import LoraConfig, 
get_peft_model from llmfoundry.models.hf.hf_fsdp import prepare_hf_model_for_fsdp @@ -66,7 +65,6 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, assert model_cfg is not None assert tokenizer_name is not None - model_cfg = om.create(model_cfg) model_cfg['peft_config'] = peft_config fsdp_config = { @@ -85,8 +83,9 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, tokenizer_kwargs={'model_max_length': 32}, ) + name = model_cfg.pop('name') original_model = build_composer_model( - name=model_cfg['name'], + name=name, cfg=model_cfg, tokenizer=tokenizer, ) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index 82b64ce80c..5bb38097fc 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -9,6 +9,7 @@ from omegaconf import OmegaConf as om from llmfoundry.utils.builders import build_composer_model, build_tokenizer +from llmfoundry.utils.config_utils import to_str_dict @pytest.mark.gpu @@ -58,9 +59,10 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, tokenizer_name=tokenizer_name, tokenizer_kwargs=tokenizer_kwargs, ) + name = hf_cfg.model.pop('name') hf_model = build_composer_model( - name=hf_cfg.model.name, - cfg=hf_cfg.model, + name=name, + cfg=to_str_dict(hf_cfg.model), tokenizer=tokenizer, ).to(device) hf_n_params = sum(p.numel() for p in hf_model.parameters()) @@ -110,9 +112,10 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, print('Initializing model...') print(model_cfg) + name = model_cfg.pop('name') model = build_composer_model( - name=model_cfg.name, - cfg=model_cfg, + name=name, + cfg=to_str_dict(model_cfg), tokenizer=tokenizer, ).to(device) n_params = sum(p.numel() for p in model.parameters()) diff --git a/tests/models/layers/test_huggingface_flash.py b/tests/models/layers/test_huggingface_flash.py index 08891d5199..779f102187 100644 --- a/tests/models/layers/test_huggingface_flash.py +++ b/tests/models/layers/test_huggingface_flash.py @@ -5,7 +5,6 @@ import pytest from composer.core.precision import get_precision_context -from omegaconf import OmegaConf as om from llmfoundry.models.hf.hf_fsdp import rgetattr from llmfoundry.models.layers.attention import is_flash_v2_installed @@ -43,8 +42,6 @@ def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str): if use_flash_attention_2: model_cfg['use_flash_attention_2'] = True - model_cfg = om.create(model_cfg) - tokenizer = build_tokenizer( tokenizer_name=tokenizer_name, tokenizer_kwargs={'model_max_length': 10}, @@ -57,8 +54,9 @@ def test_flash2(model_name: str, use_flash_attention_2: bool, init_device: str): ) and use_flash_attention_2 else contextlib.nullcontext() with error_context: + name = model_cfg.pop('name') model = build_composer_model( - name=model_cfg['name'], + name=name, cfg=model_cfg, tokenizer=tokenizer, ) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 046d40a4ff..f705783a4b 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -35,6 +35,7 @@ from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model +from llmfoundry.utils.config_utils import to_str_dict def get_config( @@ -89,9 +90,10 @@ def _get_objs(request: pytest.FixtureRequest, tokenizer = build_tokenizer(test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {})) + name = test_cfg.model.pop('name') model = build_composer_model( 
- name=test_cfg.model.name, - cfg=test_cfg.model, + name=name, + cfg=to_str_dict(test_cfg.model), tokenizer=tokenizer, ) @@ -291,9 +293,10 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): tokenizer = build_tokenizer(neo_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {})) + name = neo_cfg.model.pop('name') model = build_composer_model( - name=neo_cfg.model.name, - cfg=neo_cfg.model, + name=name, + cfg=to_str_dict(neo_cfg.model), tokenizer=tokenizer, ).to(device) @@ -340,9 +343,10 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): tokenizer = build_tokenizer(t5_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {})) + name = t5_cfg.model.pop('name') model = build_composer_model( - name=t5_cfg.model.name, - cfg=t5_cfg.model, + name=name, + cfg=to_str_dict(t5_cfg.model), tokenizer=tokenizer, ).to(device) @@ -417,9 +421,10 @@ def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str, tokenizer = build_tokenizer(test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {})) + name = test_cfg.model.pop('name') model_1 = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + name=name, + cfg=to_str_dict(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -487,9 +492,10 @@ def test_loss_fn(): tokenizer = build_tokenizer(test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {})) + name = test_cfg.model.pop('name') model_1 = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + name=name, + cfg=to_str_dict(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -573,9 +579,10 @@ def test_loss_reduction(loss_fn_config: str): tokenizer = build_tokenizer(test_cfg.tokenizer.name, tokenizer_cfg.get('kwargs', {})) + name = test_cfg.model.pop('name') model_1 = build_composer_model( - name=test_cfg.model.name, - cfg=test_cfg.model, + name=name, + cfg=to_str_dict(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) From acb9e1d1c82c5a98984eb4901bbdfe14f1731157 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 20:17:17 +0000 Subject: [PATCH 100/201] fix --- llmfoundry/data/dataloader.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 4f82861f80..1f28d0ee2c 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -6,14 +6,13 @@ from typing import Any, Dict from composer import DataSpec -from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase from llmfoundry import registry from llmfoundry.utils.registry_utils import construct_from_registry -def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, +def build_dataloader(cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: int) -> DataSpec: """Builds a dataloader from a config. @@ -24,8 +23,7 @@ def build_dataloader(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase, that the dataloader will produce. 
""" kwargs: Dict[str, Any] = { - **{str(k): v for k, v in cfg.items()}, # pyright - 'tokenizer': tokenizer, + **cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size } From f831c61365ed817dcb454a5fda2898b601f575b3 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 20:44:58 +0000 Subject: [PATCH 101/201] validate extraneous keys in dataloader --- llmfoundry/data/finetuning/dataloader.py | 133 ++++++++++++++++++----- 1 file changed, 106 insertions(+), 27 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index b8778a37f6..e5ee68e3cb 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import logging import os -from typing import Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union import torch from composer.core.data_spec import DataSpec @@ -34,7 +34,7 @@ def build_finetuning_dataloader( tokenizer: PreTrainedTokenizerBase, device_batch_size: int, - dataset: DictConfig, + dataset: Dict[str, Any], num_workers: int, drop_last: bool = False, pin_memory: bool = True, @@ -276,7 +276,45 @@ def build_finetuning_dataloader( return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func) -def _validate_config(dataset_cfg: DictConfig) -> None: +# local=dataset_cfg.get('local', None), +# remote=dataset_cfg.get('remote', None), +# split=dataset_cfg.get('split', None), +# download_retry=dataset_cfg.get('download_retry', 2), +# download_timeout=dataset_cfg.get('download_timeout', 60), +# validate_hash=dataset_cfg.get('validate_hash', None), +# keep_zip=dataset_cfg.get('keep_zip', False), +# epoch_size=dataset_cfg.get('epoch_size', None), +# predownload=dataset_cfg.get('predownload', None), +# cache_limit=dataset_cfg.get('cache_limit', None), +# partition_algo=dataset_cfg.get('partition_algo', 'relaxed'), +# num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None), +# batch_size=device_batch_size, +# shuffle=dataset_cfg.get('shuffle', False), +# shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'), +# shuffle_seed=dataset_cfg.get('shuffle_seed', 9176), +# shuffle_block_size=dataset_cfg.get('shuffle_block_size', None), +# sampling_method=dataset_cfg.get('sampling_method', 'balanced'), +# sampling_granularity=dataset_cfg.get('sampling_granularity', 1), +# batching_method=dataset_cfg.get('batching_method', 'random'), +# max_seq_len=dataset_cfg.max_seq_len, +# allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False), +# replication=dataset_cfg.get('replication', None), + + +def _validate_config( + max_seq_len: int, + decoder_only_format: bool, + hf_name: Optional[str] = None, + local: Optional[str] = None, + remote: Optional[str] = None, + hf_kwargs: Optional[Dict[str, Any]] = None, + preprocessing_fn: Optional[str] = None, + safe_load: Optional[bool] = False, + streams: Optional[Dict[str, Any]] = None, + target_prompts: Optional[str] = None, + target_responses: Optional[str] = None, + **kwargs: Dict[str, Any], +) -> None: """Validates the dataset configuration. Makes sure that the dataset is properly configured for either @@ -289,13 +327,46 @@ def _validate_config(dataset_cfg: DictConfig) -> None: Raises: ValueError: If the dataset configuration does not meet the requirements. 
""" - if dataset_cfg.get('hf_name') is not None: + # Check for extraneous keys in the dataset config + allowed_additional_kwargs = { + 'local', + 'remote', + 'split', + 'download_retry', + 'download_timeout', + 'validate_hash', + 'keep_zip', + 'epoch_size', + 'predownload', + 'cache_limit', + 'partition_algo', + 'num_canonical_nodes', + 'batch_size', + 'shuffle', + 'shuffle_algo', + 'shuffle_seed', + 'shuffle_block_size', + 'sampling_method', + 'sampling_granularity', + 'batching_method', + 'max_seq_len', + 'allow_unsafe_types', + 'replication', + } + if not set(kwargs.keys()).issubset(allowed_additional_kwargs): + raise ValueError( + 'The dataset config contains the following extraneous keys: ' +\ + ', '.join(set(kwargs.keys()) - allowed_additional_kwargs) + ) + + if hf_name is not None: # Using the HuggingFace dataset codepath illegal_keys = ['local', 'remote'] discovered_illegal_keys = [] - for key in illegal_keys: - if dataset_cfg.get(key) is not None: - discovered_illegal_keys.append('`' + key + '`') + if local is not None: + discovered_illegal_keys.append('`local`') + if remote is not None: + discovered_illegal_keys.append('`remote`') if discovered_illegal_keys: raise ValueError( 'The dataset config sets a value for `hf_name` as well as the ' +\ @@ -303,12 +374,17 @@ def _validate_config(dataset_cfg: DictConfig) -> None: 'Those keys are used when building from a streaming dataset, but ' +\ 'setting `hf_name` instructs the dataset to build from a HuggingFace dataset.' ) - elif dataset_cfg.get('remote') is not None: + elif remote is not None: # Using the streaming dataset codepath - illegal_keys = ['hf_name', 'hf_kwargs', 'preprocessing_fn', 'safe_load'] + illegal_keys = { + 'hf_name': hf_name, + 'hf_kwargs': hf_kwargs, + 'preprocessing_fn': preprocessing_fn, + 'safe_load': safe_load + } discovered_illegal_keys = [] - for key in illegal_keys: - if dataset_cfg.get(key) is not None: + for key, value in illegal_keys.items(): + if value is not None: discovered_illegal_keys.append('`' + key + '`') if discovered_illegal_keys: raise ValueError( @@ -317,17 +393,22 @@ def _validate_config(dataset_cfg: DictConfig) -> None: 'Those keys are used when building from a HuggingFace dataset, but ' +\ 'setting `remote` instructs the dataset to build from a streaming dataset.' ) - if dataset_cfg.get('local') is None: + if local is None: raise ValueError( 'Using a streaming dataset requires setting both `remote` and `local`, ' +\ 'but dataset.local is None.' ) - elif dataset_cfg.get('streams') is not None: + elif streams is not None: # Using the streaming dataset codepath - illegal_keys = ['hf_name', 'hf_kwargs', 'preprocessing_fn', 'safe_load'] + illegal_keys = { + 'hf_name': hf_name, + 'hf_kwargs': hf_kwargs, + 'preprocessing_fn': preprocessing_fn, + 'safe_load': safe_load + } discovered_illegal_keys = [] - for key in illegal_keys: - if dataset_cfg.get(key) is not None: + for key, value in illegal_keys.items(): + if value is not None: discovered_illegal_keys.append('`' + key + '`') if discovered_illegal_keys: raise ValueError( @@ -336,10 +417,10 @@ def _validate_config(dataset_cfg: DictConfig) -> None: 'Those keys are used when building from a HuggingFace dataset, but ' +\ 'setting `streams` instructs the dataset to build from a streaming dataset.' 
) - illegal_keys = ['remote', 'local'] + illegal_keys = {'remote': remote, 'local': local} discovered_illegal_keys = [] - for key in illegal_keys: - if dataset_cfg.get(key) is not None: + for key, value in illegal_keys.items(): + if value is not None: discovered_illegal_keys.append('`' + key + '`') if discovered_illegal_keys: raise ValueError( @@ -355,17 +436,15 @@ def _validate_config(dataset_cfg: DictConfig) -> None: 'dataset, or set `remote` to use a streaming dataset, or set ' +\ '`streams` to use multiple streaming datasets, but all were None.' ) - if dataset_cfg.get('max_seq_len') is None: - raise ValueError( - 'In the dataset config, you must set the `max_seq_len`') # Raise an error if the target_prompts + target_responses + decoder_only_format settings # are invalid - target_responses = str( - dataset_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES)).lower() - target_prompts = str( - dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS)).lower() - decoder_only_format = dataset_cfg.decoder_only_format + if target_prompts is None: + target_prompts = _DEFAULT_TARGET_PROMPTS + if target_responses is None: + target_responses = _DEFAULT_TARGET_RESPONSES + target_prompts, target_responses = target_prompts.lower( + ), target_responses.lower() validate_target_settings(target_prompts, target_responses, decoder_only_format) From 1c16decdf5e4b45bf3930aa0ce31ba26f59a6eed Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 20:50:08 +0000 Subject: [PATCH 102/201] fix --- llmfoundry/data/text_data.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 0ff9100f5e..1fa884ca6b 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -5,7 +5,6 @@ import logging import os -import warnings from itertools import islice from typing import (Any, Callable, Dict, List, Mapping, Optional, Sequence, Union, cast) @@ -246,10 +245,7 @@ def get_sequence_id_from_batch( return torch.cat([left_zeros, cumulative_sep[:, :-1]], dim=1) -def build_streams( - streams: Optional[Dict[str, Any]] = None, - **dataset_cfg_rest: DictConfig # unused -): +def build_streams(streams: Optional[Dict[str, Any]] = None,): streams_dict = streams # build streams streams_ret: List = [] @@ -262,7 +258,6 @@ def build_streams( def build_text_dataloader( - name: str, tokenizer: PreTrainedTokenizerBase, device_batch_size: int, dataset: DictConfig, @@ -272,14 +267,8 @@ def build_text_dataloader( prefetch_factor: int = 2, persistent_workers: bool = True, timeout: int = 0, - **kwargs: Dict[str, Any], ) -> DataSpec: - for kwarg in kwargs.keys(): - warnings.warn( - f'Unused parameter `{kwarg}` passed to build_text_dataloader. This parameter is ignored. In future releases, this will raise an error.', - DeprecationWarning) dataset_cfg = dataset - assert name == 'text', f'Tried to build text dataloader with cfg.name={name}' # get kwargs mlm_probability = dataset_cfg.pop('mlm_probability', None) @@ -314,7 +303,7 @@ def build_text_dataloader( ' To override this error, set the override_bos_token_id_mismatch_error flag to True in the dataset config section of the YAML.' 
) - streams = build_streams(**dataset_cfg) + streams = build_streams(streams=dataset_cfg.streams) # build dataset potentially with streams text_dataset = StreamingTextDataset( From 83cad9c722dd7e5a590a8cbdebb40684df0cf3d4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 20:58:27 +0000 Subject: [PATCH 103/201] fix more --- llmfoundry/models/hf/hf_causal_lm.py | 7 +++--- llmfoundry/utils/builders.py | 4 ---- llmfoundry/utils/mosaicml_logger_utils.py | 29 +++++++++++++---------- scripts/eval/eval.py | 25 ++++++++----------- 4 files changed, 29 insertions(+), 36 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 63be7277ab..ddac76e98b 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -100,8 +100,7 @@ def __init__( 'use_flash_attention_2 is set to True, but flash-attention 2 is not installed. ' + 'Please `pip install llm-foundry[gpu]`.') - peft_config_dict = peft_config - if peft_config_dict is not None and not peft_installed: + if peft_config is not None and not peft_installed: raise ValueError( 'PEFT is not installed, but peft_config was passed. Please install LLM Foundry with the peft extra to use peft_config.' ) @@ -248,8 +247,8 @@ def _autoset_attn_implementation_monkeypatch( model.tie_weights() peft_config = None - if peft_config_dict is not None: - peft_config = self._get_peft_config(peft_config_dict) + if peft_config is not None: + peft_config = self._get_peft_config(peft_config) if pretrained_lora_id_or_path is not None: if not peft_installed: diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 1f57a28b1a..f9052f8e08 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -376,10 +376,6 @@ def _extract_param_groups( def build_optimizer(model: torch.nn.Module, name: str, optimizer_config: Dict[str, Any]) -> Optimizer: - for k, v in optimizer_config.items(): - if isinstance(v, DictConfig): - optimizer_config[k] = om.to_container(v, resolve=True) - params = _extract_param_groups(model, optimizer_config) kwargs = {**optimizer_config} diff --git a/llmfoundry/utils/mosaicml_logger_utils.py b/llmfoundry/utils/mosaicml_logger_utils.py index d365e8fed1..cd290a8421 100644 --- a/llmfoundry/utils/mosaicml_logger_utils.py +++ b/llmfoundry/utils/mosaicml_logger_utils.py @@ -8,7 +8,6 @@ from composer.loggers.logger_destination import LoggerDestination from composer.loggers.mosaicml_logger import (MOSAICML_ACCESS_TOKEN_ENV_VAR, MOSAICML_PLATFORM_ENV_VAR) -from omegaconf import DictConfig, ListConfig _MODEL_KEYS_TO_LOG = [ 'pretrained_model_name_or_path', @@ -38,9 +37,10 @@ def find_mosaicml_logger( def log_eval_analytics(mosaicml_logger: MosaicMLLogger, - model_configs: ListConfig, icl_tasks: Union[str, - ListConfig], - eval_gauntlet_config: Optional[Union[str, DictConfig]]): + model_configs: List[Dict[str, Any]], + icl_tasks: Union[str, List[Dict[str, Any]]], + eval_gauntlet_config: Optional[Union[str, Dict[str, + Any]]]): """Logs analytics for runs using the `eval.py` script.""" metrics: Dict[str, Any] = { 'llmfoundry/script': 'eval', @@ -67,14 +67,17 @@ def log_eval_analytics(mosaicml_logger: MosaicMLLogger, def log_train_analytics(mosaicml_logger: MosaicMLLogger, - model_config: DictConfig, - train_loader_config: DictConfig, - eval_loader_config: Optional[Union[DictConfig, - ListConfig]], + model_config: Dict[str, + Any], train_loader_config: Dict[str, + Any], + eval_loader_config: Optional[Union[Dict[str, Any], + List[Dict[str, + 
Any]]]], callback_configs: Optional[Dict[str, Any]], tokenizer_name: str, load_path: Optional[str], - icl_tasks_config: Optional[Union[ListConfig, str]], - eval_gauntlet: Optional[Union[DictConfig, str]]): + icl_tasks_config: Optional[Union[List[Dict[str, Any]], + str]], + eval_gauntlet: Optional[Union[Dict[str, Any], str]]): """Logs analytics for runs using the `train.py` script.""" train_loader_dataset = train_loader_config.get('dataset', {}) metrics: Dict[str, Any] = { @@ -106,10 +109,10 @@ def log_train_analytics(mosaicml_logger: MosaicMLLogger, if eval_loader_config is not None: metrics['llmfoundry/eval_loaders'] = [] - if isinstance(eval_loader_config, ListConfig): - eval_loader_configs: ListConfig = eval_loader_config + if isinstance(eval_loader_config, list): + eval_loader_configs: list = eval_loader_config else: - eval_loader_configs = ListConfig([eval_loader_config]) + eval_loader_configs = [eval_loader_config] for loader_config in eval_loader_configs: eval_loader_info = {} diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index ed12c33cf3..80e1c9aedf 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -59,11 +59,7 @@ def evaluate_model( logged_config: DictConfig, should_log_config: bool = True, load_path: Optional[str] = None, - **kwargs: Dict[str, Any], ): - model_extra_params = kwargs - warnings.warn(f'Extra parameters: {model_extra_params}') - log.info(f'Evaluating model: {model_name}') # Build tokenizer and model tokenizer_cfg = tokenizer @@ -103,7 +99,7 @@ def evaluate_model( 'The FSDP config block is not supported when loading ' + 'Hugging Face models in 8bit.') - init_context = process_init_device(DictConfig(model), fsdp_config) + init_context = process_init_device(model, fsdp_config) name = model.pop('name') composer_model = build_composer_model(name=name, @@ -189,7 +185,7 @@ class EvalConfig: icl_tasks_str: Optional[str] = None # Logging parameters - python_log_level: Optional[str] = None + python_log_level: str = 'debug' loggers: Optional[Dict[str, Any]] = None log_config: bool = True @@ -299,15 +295,14 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: reproducibility.seed_all(eval_config.seed) dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) - if eval_config.python_log_level is not None: - logging.basicConfig( - # Example of format string - # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here - format= - f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' - ) - logging.getLogger('llmfoundry').setLevel( - eval_config.python_log_level.upper()) + logging.basicConfig( + # Example of format string + # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here + format= + f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' + ) + logging.getLogger('llmfoundry').setLevel( + eval_config.python_log_level.upper()) # Default argument values for evaluate_model eval_gauntlet_df = None From d1b26f3e98c90ce1f743a625dbdbc053cd0f1430 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 21:01:46 +0000 Subject: [PATCH 104/201] fix III: revenge of the fix --- llmfoundry/utils/config_utils.py | 2 +- scripts/train/train.py | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 17edc46380..b0a2d2594e 100644 --- a/llmfoundry/utils/config_utils.py +++ 
b/llmfoundry/utils/config_utils.py @@ -39,7 +39,7 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str): ) -def pop_config(cfg: DictConfig, +def pop_config(cfg: Union[Dict[str, Any], DictConfig], key: str, must_exist: bool = True, default_value: Any = None, diff --git a/scripts/train/train.py b/scripts/train/train.py index 1f5b845f68..0ebbe3386f 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -449,21 +449,18 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg: Optional[DictConfig] = DictConfig( - train_cfg.profiler) if train_cfg.profiler is not None else None + profiler_cfg = train_cfg.profiler if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', - must_exist=True, - convert=True) + must_exist=True) profiler_schedule = cyclic_schedule(**profiler_schedule_cfg) # Only support json trace handler profiler_trace_handlers: List[TraceHandler] = [] profiler_trace_cfg: Optional[Dict] = pop_config(profiler_cfg, 'json_trace_handler', must_exist=False, - default_value=None, - convert=True) + default_value=None) if profiler_trace_cfg: profiler_trace_handlers.append( JSONTraceHandler(**profiler_trace_cfg)) From bc2d5d37750209771f5c0a0f7d987c0cfb489c7a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 21:18:11 +0000 Subject: [PATCH 105/201] fix IV: a new hope --- scripts/train/train.py | 15 ++++----------- .../inference/test_convert_composer_to_hf.py | 4 +--- tests/data/test_dataloader.py | 11 ++++++----- tests/fixtures/models.py | 13 ++++++------- tests/models/hf/test_hf_config.py | 6 +++--- 5 files changed, 20 insertions(+), 29 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 0ebbe3386f..8a9955e23c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -16,7 +16,7 @@ from composer.profiler import (JSONTraceHandler, Profiler, TraceHandler, cyclic_schedule) from composer.utils import dist, get_device, reproducibility -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig from omegaconf import OmegaConf as om from rich.traceback import install @@ -362,16 +362,9 @@ def main(cfg: DictConfig) -> Trainer: # Optional fsdp data, fine-tuning, and eval configs fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config - eval_loader_config = DictConfig( - train_cfg.eval_loader - ) if train_cfg.eval_loader is not None else ListConfig( - train_cfg.eval_loaders) if train_cfg.eval_loaders is not None else None - icl_tasks_config = ListConfig( - train_cfg.icl_tasks - ) if train_cfg.icl_tasks is not None else train_cfg.icl_tasks_str - eval_gauntlet_config = DictConfig( - train_cfg.eval_gauntlet - ) if train_cfg.eval_gauntlet is not None else train_cfg.eval_gauntlet_str + eval_loader_config = train_cfg.eval_loader if train_cfg.eval_loader is not None else train_cfg.eval_loaders + icl_tasks_config = train_cfg.icl_tasks + eval_gauntlet_config = train_cfg.eval_gauntlet # Optional parameters will be set to default values if not specified. 
default_run_name: str = os.environ.get('RUN_NAME', 'llm') diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 216b9a4694..fc1662627e 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -747,7 +747,6 @@ def test_huggingface_conversion_callback( model, max_seq_len, tie_word_embeddings) assert model_cfg is not None assert tokenizer_name is not None - model_cfg = om.create(model_cfg) if peft_config is not None: model_cfg['peft_config'] = peft_config @@ -1094,7 +1093,6 @@ def test_mptmoe_huggingface_conversion_callback( tokenizer_name = 'EleutherAI/gpt-neox-20b' assert model_cfg is not None assert tokenizer_name is not None - model_cfg = om.create(model_cfg) fsdp_config = { 'sharding_strategy': sharding_strategy, @@ -1161,7 +1159,7 @@ def test_mptmoe_huggingface_conversion_callback( name=name, tokenizer=tokenizer, init_context=init_context, - cfg=to_str_dict(model_cfg), + cfg=model_cfg, ) optimizer = build_optimizer(original_model, optimizer_name, diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index a9457006df..a2fcc24b5a 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -306,8 +306,9 @@ def test_invalid_jsonl_data(): expected_keys += ['decoder_attention_mask', 'decoder_input_ids'] with pytest.raises(MisconfiguredHfDatasetError): - build_finetuning_dataloader(cfg, tokenizer, - device_batch_size).dataloader + build_finetuning_dataloader( + **cfg, tokenizer=tokenizer, + device_batch_size=device_batch_size).dataloader @pytest.mark.parametrize('use_chat_formatting', [True, False]) @@ -1149,12 +1150,12 @@ def test_token_counting_func_dataloader_setting( def test_build_unknown_dataloader(): - cfg = DictConfig({ + cfg = { 'name': 'unknown', - }) + } tokenizer = MagicMock() with pytest.raises(catalogue.RegistryError): - _ = build_dataloader(cfg, tokenizer, 2) + _ = build_dataloader(**cfg, tokenizer=tokenizer, device_batch_size=2) invalid_conversation_params_sharegpt = [ diff --git a/tests/fixtures/models.py b/tests/fixtures/models.py index 3fd004735a..e680ccb28f 100644 --- a/tests/fixtures/models.py +++ b/tests/fixtures/models.py @@ -2,10 +2,9 @@ # SPDX-License-Identifier: Apache-2.0 import copy -from typing import Any, Callable +from typing import Any, Callable, Dict import pytest -from omegaconf import DictConfig from pytest import fixture from transformers import PreTrainedTokenizerBase @@ -14,7 +13,7 @@ from llmfoundry.utils.builders import build_composer_model, build_tokenizer -def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase): +def _build_model(config: Dict[str, Any], tokenizer: PreTrainedTokenizerBase): name = config.pop('name') model = build_composer_model( name=name, @@ -35,13 +34,13 @@ def build_tiny_mpt( ) -> Callable[..., ComposerMPTCausalLM]: def build(**kwargs: Any) -> ComposerMPTCausalLM: - config = DictConfig({ + config = { 'name': 'mpt_causal_lm', 'd_model': 128, 'n_heads': 4, 'n_layers': 2, 'expansion_ratio': 2, - }) + } config.update(kwargs) model = _build_model(config, mpt_tokenizer) assert isinstance(model, ComposerMPTCausalLM) @@ -63,12 +62,12 @@ def build(**kwargs: Any) -> ComposerHFCausalLM: 'expansion_ratio': 2, } config_overrides.update(kwargs) - config = DictConfig({ + config = { 'name': 'hf_causal_lm', 'pretrained_model_name_or_path': 'mosaicml/mpt-7b', 'pretrained': False, 'config_overrides': config_overrides, - }) + } model = 
_build_model(config, mpt_tokenizer) assert isinstance(model, ComposerHFCausalLM) return model diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index 207ad4068c..16f0d43a31 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -156,18 +156,18 @@ def test_hf_config_override( # load hf causal lm model with config_overrides hf_model_config = deepcopy(test_cfg) - model_cfg = { + model_cfg = om.create({ 'name': 'hf_causal_lm', 'pretrained_model_name_or_path': save_path, 'pretrained': False, 'config_overrides': model_cfg_overrides, - } + }) hf_model_config.model = model_cfg name = hf_model_config.model.pop('name') hf_model = build_composer_model( name=name, - cfg=hf_model_config.model, + cfg=to_str_dict(hf_model_config.model), tokenizer=tokenizer, ) From cd73f60bc6f3680839b0b352a60c77e2cadc62a4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 21:45:33 +0000 Subject: [PATCH 106/201] fix V: the empire fixes back --- llmfoundry/data/dataloader.py | 3 ++- llmfoundry/data/finetuning/dataloader.py | 12 +++++----- llmfoundry/data/packing.py | 18 +++++++-------- llmfoundry/utils/builders.py | 29 ++++++++++++------------ scripts/eval/eval.py | 12 ++++------ 5 files changed, 37 insertions(+), 37 deletions(-) diff --git a/llmfoundry/data/dataloader.py b/llmfoundry/data/dataloader.py index 1f28d0ee2c..c65eceb772 100644 --- a/llmfoundry/data/dataloader.py +++ b/llmfoundry/data/dataloader.py @@ -22,13 +22,14 @@ def build_dataloader(cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size (int): The size of the batches (number of examples) that the dataloader will produce. """ + name = cfg.pop('name') kwargs: Dict[str, Any] = { **cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size } return construct_from_registry( - name=cfg.name, + name=name, registry=registry.dataloaders, partial_function=False, pre_validation_function=None, diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index e5ee68e3cb..374a4d6f64 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -195,14 +195,14 @@ def build_finetuning_dataloader( sampling_method=dataset_cfg.get('sampling_method', 'balanced'), sampling_granularity=dataset_cfg.get('sampling_granularity', 1), batching_method=dataset_cfg.get('batching_method', 'random'), - max_seq_len=dataset_cfg.max_seq_len, + max_seq_len=dataset_cfg['max_seq_len'], allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False), replication=dataset_cfg.get('replication', None), ) else: # Build HF dataloader - dataset_name_or_path = dataset_cfg.hf_name + dataset_name_or_path = dataset_cfg['hf_name'] split = dataset_cfg.get('split') if split is None: raise MissingHuggingFaceURLSplitError() @@ -228,14 +228,14 @@ def build_finetuning_dataloader( dataset_name=dataset_name_or_path, split=split, safe_load=dataset_cfg.get('safe_load', False), - max_seq_len=dataset_cfg.max_seq_len, + max_seq_len=dataset_cfg['max_seq_len'], preprocessing_fn=preprocessing_fn, tokenizer=tokenizer, target_prompts=dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS), target_responses=dataset_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES), - decoder_only_format=dataset_cfg.decoder_only_format, + decoder_only_format=dataset_cfg['decoder_only_format'], hf_kwargs=dataset_cfg.get('hf_kwargs', {})) # Ensure dataset is large enough. 
@@ -246,7 +246,7 @@ def build_finetuning_dataloader( full_dataset_size = len(streaming_dataset) if full_dataset_size < minimum_dataset_size: raise NotEnoughDatasetSamplesError( - dataset_name=dataset_cfg.hf_name, + dataset_name=dataset_cfg['hf_name'], split=split, dataloader_batch_size=dataloader_batch_size, world_size=world_size, @@ -255,7 +255,7 @@ def build_finetuning_dataloader( # Initialize sampler. sampler = dist.get_sampler(streaming_dataset, drop_last=drop_last, - shuffle=dataset_cfg.shuffle) + shuffle=dataset_cfg['shuffle']) assert streaming_dataset is not None # for pyright dl = DataLoader( diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 9f4908d709..ea9c0d9a5e 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -3,7 +3,7 @@ import logging import tempfile -from typing import Callable, Dict, Iterable, List, Literal, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, List, Literal, Optional, Tuple import numpy as np import torch @@ -361,7 +361,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, def profile_packing( - dataloader_cfg: DictConfig, + dataloader_cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, min_ratio: float, max_ratio: float, @@ -385,7 +385,7 @@ def profile_packing( from llmfoundry.data.dataloader import build_dataloader - dataset_cfg = dataloader_cfg.dataset + dataset_cfg = dataloader_cfg['dataset'] max_seq_len = dataset_cfg.get('max_seq_len') max_leftovers_to_keep = dataset_cfg.get('max_leftovers_to_keep', None) @@ -397,22 +397,22 @@ def profile_packing( 'prefetch_factor': None, 'persistent_workers': False, }) - dataloader_cfg.dataset.packing_ratio = 1.0 + dataloader_cfg['dataset_cfg']['packing_ratio'] = 1.0 # If streaming dataset, use a temporary local folder for profiling local_rank_zero = dist.get_global_rank() - dist.get_local_rank() - if dataloader_cfg.dataset.get('remote') is not None: + if dataloader_cfg['dataset'].get('remote') is not None: tmp_path_to_broadcast = tempfile.TemporaryDirectory().name gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) tmp_path = gathered_paths[local_rank_zero] - dataloader_cfg.dataset.local = tmp_path + dataloader_cfg['dataset']['local'] = tmp_path - if dataloader_cfg.dataset.get('streams') is not None: - for stream_config in dataloader_cfg.dataset.streams.values(): + if dataloader_cfg['dataset'].get('streams') is not None: + for stream_config in dataloader_cfg['dataset']['streams'].values(): tmp_path_to_broadcast = tempfile.TemporaryDirectory().name gathered_paths = dist.all_gather_object(tmp_path_to_broadcast) tmp_path = gathered_paths[local_rank_zero] - stream_config.local = tmp_path + stream_config['local'] = tmp_path # Determine the packing_ratio values we'll try packing_ratios, raw_batch_sizes = [], [] diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index f9052f8e08..57dca3006e 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -29,6 +29,7 @@ from llmfoundry.eval.datasets.in_context_learning_evaluation import \ get_icl_task_dataloader from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper +from llmfoundry.utils.config_utils import to_str_dict from llmfoundry.utils.registry_utils import construct_from_registry from llmfoundry.utils.warnings import VersionedDeprecationWarning @@ -50,9 +51,9 @@ def build_evaluators( - eval_loader_config: Optional[Union[DictConfig, ListConfig]], - icl_tasks_config: Optional[Union[str, ListConfig]], - eval_gauntlet_config: 
Optional[Union[str, DictConfig]], + eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], + icl_tasks_config: Optional[Union[str, List[Dict[str, Any]]]], + eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], *, tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, @@ -85,23 +86,23 @@ def build_evaluators( def build_eval_loaders( - eval_loader_config: Union[DictConfig, ListConfig], + eval_loader_config: Union[Dict[str, Any], List[Dict[str, Any]]], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, ) -> List[Evaluator]: evaluators: List[Evaluator] = [] - if isinstance(eval_loader_config, ListConfig): - eval_configs: ListConfig = eval_loader_config + if isinstance(eval_loader_config, list): + eval_configs = eval_loader_config is_multi_eval = True else: - eval_configs = ListConfig([eval_loader_config]) + eval_configs = [eval_loader_config] is_multi_eval = False for eval_config in eval_configs: eval_dataloader = build_dataloader(eval_config, tokenizer, device_eval_batch_size) eval_loader: Evaluator = Evaluator( - label=f'eval/{eval_config.label}' if is_multi_eval else 'eval', + label=f"eval/{eval_config['label']}" if is_multi_eval else 'eval', dataloader=eval_dataloader, # Load the eval data to fail fast. metrics will get added # later in add_metrics_to_eval_loaders, after the model is loaded @@ -129,8 +130,8 @@ def add_metrics_to_eval_loaders( def build_icl_data_and_gauntlet( - icl_tasks_config: Union[str, ListConfig], - eval_gauntlet_config: Optional[Union[str, DictConfig]], + icl_tasks_config: Union[str, List[Dict[str, Any]]], + eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], tokenizer: PreTrainedTokenizerBase, device_eval_batch_size: int, icl_seq_len: int, @@ -147,15 +148,15 @@ def build_icl_data_and_gauntlet( if isinstance(eval_gauntlet_config, str): with open(eval_gauntlet_config, 'r') as icl_f: eval_gauntlet_cfg = om.load(icl_f) - eval_gauntlet = eval_gauntlet_cfg.eval_gauntlet - elif isinstance(eval_gauntlet_config, DictConfig): # pyright: ignore + eval_gauntlet = to_str_dict(eval_gauntlet_cfg['eval_gauntlet']) + elif isinstance(eval_gauntlet_config, dict): # pyright: ignore eval_gauntlet = eval_gauntlet_config else: raise ValueError( f'Got invalid type for eval_gauntlet_config: {type(eval_gauntlet_config)}' ) - eval_gauntlet.logger_keys = logger_keys - eval_gauntlet.benchmark_sizes = { + eval_gauntlet['logger_keys'] = logger_keys + eval_gauntlet['benchmark_sizes'] = { e.label: e.dataloader.num_samples for e in icl_evaluators } eval_gauntlet_cb = EvalGauntlet(**eval_gauntlet) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 80e1c9aedf..81b74730fa 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -262,10 +262,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for code_path in (eval_config.code_paths or []): import_file(code_path) - model_configs = ListConfig(eval_config.models) - eval_gauntlet_config = DictConfig( - eval_config.eval_gauntlet - ) if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str + model_configs = eval_config.models + eval_gauntlet_config = eval_config.eval_gauntlet if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str # the below line fixes a strange issue where the fsdp_config is a DictConfig rather than a Dict, # despite the type hint being Dict[str, Any] and the `cfg` object being sent to `to_container`. 
@@ -280,9 +278,9 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: } if fsdp_config else None # pyright fix # Mandatory Evaluation Parameters - icl_tasks: Union[ListConfig, str, None] = ListConfig( - eval_config.icl_tasks - ) if eval_config.icl_tasks else eval_config.icl_tasks_str + icl_tasks: Union[ + ListConfig, str, + None] = eval_config.icl_tasks if eval_config.icl_tasks else eval_config.icl_tasks_str assert icl_tasks is not None, 'icl_tasks must be specified in the config' # Optional Evaluation Parameters with default values From a59e09cf70d38106ee2470ece05a49e0bf4588ba Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 21:54:06 +0000 Subject: [PATCH 107/201] fixed some more types --- llmfoundry/data/finetuning/dataloader.py | 6 +++--- llmfoundry/data/packing.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 374a4d6f64..f3f7a5f04e 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -148,7 +148,7 @@ def build_finetuning_dataloader( tokenizer.pad_token = tokenizer.eos_token # this full config is necessary for properly profiling the packing ratio - dataloader_cfg = DictConfig({ + dataloader_cfg = { 'name': name, 'dataset': dataset_cfg, 'drop_last': drop_last, @@ -157,7 +157,7 @@ def build_finetuning_dataloader( 'prefetch_factor': prefetch_factor, 'persistent_workers': persistent_workers, 'timeout': timeout, - }) + } collate_fn, dataloader_batch_size = _build_collate_fn( dataloader_cfg=dataloader_cfg, tokenizer=tokenizer, @@ -527,7 +527,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: def _build_collate_fn( - dataloader_cfg: DictConfig, + dataloader_cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: int, ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index ea9c0d9a5e..21a9064c32 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -290,7 +290,7 @@ def pad_tensor(tensor: torch.Tensor, pad_value: int): return batch -def auto_packing_ratio(dataloader_cfg: DictConfig, +def auto_packing_ratio(dataloader_cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: int, num_packing_ratios: int = 20) -> float: @@ -324,7 +324,7 @@ def auto_packing_ratio(dataloader_cfg: DictConfig, reproducibility.seed_all(0) # If max_seq_len is very small, skip profiling and select packing ratio of 1. 
- dataset_config = dataloader_cfg.dataset + dataset_config = dataloader_cfg['dataset'] max_seq_len = dataset_config.get('max_seq_len') if max_seq_len <= 100: return 1 From b7fb56a4b4d82042f0a3d0a33af18673c0da3c48 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 21:55:16 +0000 Subject: [PATCH 108/201] fix VI: return of the fix --- llmfoundry/data/packing.py | 1 - tests/data/test_packing.py | 8 ++++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 21a9064c32..765da5f220 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -8,7 +8,6 @@ import numpy as np import torch from composer.utils import dist -from omegaconf import DictConfig from transformers import PreTrainedTokenizerBase log = logging.getLogger(__name__) diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index 7e4c04586a..61469b4f00 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -107,9 +107,9 @@ def test_auto_packing(profile_packing: Mock): profile_packing.return_value = [(1, .9, 0), (2, .8, 0), (3, .7, .5)] packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': { + dataloader_cfg={'dataset': { 'max_seq_len': 2048 - }}), + }}, tokenizer=None, device_batch_size=1, ) # Dummy values, profiling results are already set. @@ -134,9 +134,9 @@ def test_dist_auto_packing(profile_packing: Mock): (3, .7, .5)] # should pick 2 packing_ratio = auto_packing_ratio( - dataloader_cfg=DictConfig({'dataset': { + dataloader_cfg={'dataset': { 'max_seq_len': 2048 - }}), + }}, tokenizer=None, device_batch_size=1, ) # Dummy values, profiling results are already set. From 1eb809e225fd439a545f47d48092ddc26434d732 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 22:05:46 +0000 Subject: [PATCH 109/201] fix VII: the fix awakens --- llmfoundry/data/finetuning/dataloader.py | 2 +- llmfoundry/utils/builders.py | 88 +++++++++++------------- 2 files changed, 43 insertions(+), 47 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index f3f7a5f04e..851925863c 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -141,7 +141,7 @@ def build_finetuning_dataloader( given a starting workload YAML. 
""" dataset_cfg = dataset - _validate_config(dataset_cfg) + _validate_config(**dataset_cfg) # Use EOS as the pad token if none exists if tokenizer.pad_token is None: # type: ignore (sometimes it's none and that's ok) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 57dca3006e..a87161ac62 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -17,7 +17,6 @@ from composer.models import ComposerModel from composer.optim.scheduler import ComposerScheduler from composer.utils import dist -from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om from torch.optim.optimizer import Optimizer from torchmetrics import Metric @@ -457,7 +456,7 @@ def build_tokenizer( def build_icl_evaluators( - icl_tasks: Union[str, ListConfig], + icl_tasks: Union[str, List[Dict[str, Any]]], tokenizer: PreTrainedTokenizerBase, default_max_seq_len: int, default_batch_size: int, @@ -475,59 +474,60 @@ def build_icl_evaluators( log.info(f'Extracting ICL task config from path: {icl_tasks}') with open(icl_tasks, 'r') as icl_f: icl_task_cfg = om.load(icl_f) - icl_tasks_list = icl_task_cfg.icl_tasks + icl_tasks_list = to_str_dict(icl_task_cfg.icl_tasks) else: icl_tasks_list = icl_tasks - def _validate_cfg(icl_cfg: DictConfig): + def _validate_cfg(icl_cfg: Dict[str, Any]): assert 'label' in icl_cfg - assert 'dataset_uri' in icl_cfg and icl_cfg.dataset_uri is not None + assert 'dataset_uri' in icl_cfg and icl_cfg['dataset_uri'] is not None assert 'icl_task_type' in icl_cfg assert 'num_fewshot' in icl_cfg if 'metric_names' not in icl_cfg: - if icl_cfg.icl_task_type == 'language_modeling': - icl_cfg.metric_names = ['InContextLearningLMAccuracy'] - elif icl_cfg.icl_task_type == 'multiple_choice': - icl_cfg.metric_names = [ + if icl_cfg['icl_task_type'] == 'language_modeling': + icl_cfg['metric_names'] = ['InContextLearningLMAccuracy'] + elif icl_cfg['icl_task_type'] == 'multiple_choice': + icl_cfg['metric_names'] = [ 'InContextLearningMultipleChoiceAccuracy' ] - elif icl_cfg.icl_task_type == 'schema': - icl_cfg.metric_names = [ + elif icl_cfg['icl_task_type'] == 'schema': + icl_cfg['metric_names'] = [ 'InContextLearningMultipleChoiceAccuracy' ] - elif icl_cfg.icl_task_type == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering': - if icl_cfg.icl_task_type == 'question_answering': + elif icl_cfg[ + 'icl_task_type'] == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering': + if icl_cfg['icl_task_type'] == 'question_answering': warnings.warn( VersionedDeprecationWarning( "ICL task type 'question_answering' is now deprecated. Use identifier 'generation_task_with_answers'", 'v0.9.0')) - icl_cfg.metric_names = [ + icl_cfg['metric_names'] = [ 'InContextLearningGenerationExactMatchAccuracy' ] - elif icl_cfg.icl_task_type == 'code_evaluation': - icl_cfg.metric_names = ['InContextLearningCodeEvalAccuracy'] + elif icl_cfg['icl_task_type'] == 'code_evaluation': + icl_cfg['metric_names'] = ['InContextLearningCodeEvalAccuracy'] else: raise ValueError( f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.' 
) if 'prompt_string' not in icl_cfg: - icl_cfg.prompt_string = '' + icl_cfg['prompt_string'] = '' if 'example_delimiter' not in icl_cfg: - icl_cfg.example_delimiter = '\n' + icl_cfg['example_delimiter'] = '\n' if 'continuation_delimiter' not in icl_cfg: - icl_cfg.continuation_delimiter = ' ' + icl_cfg['continuation_delimiter'] = ' ' if 'max_seq_len' not in icl_cfg: - icl_cfg.max_seq_len = default_max_seq_len + icl_cfg['max_seq_len'] = default_max_seq_len if 'batch_size' not in icl_cfg: - icl_cfg.batch_size = default_batch_size + icl_cfg['batch_size'] = default_batch_size if 'pass_at_k' not in icl_cfg: - icl_cfg.pass_at_k = 1 + icl_cfg['pass_at_k'] = 1 if 'fewshot_random_seed' not in icl_cfg: - icl_cfg.fewshot_random_seed = 1234 + icl_cfg['fewshot_random_seed'] = 1234 if 'generations_per_sample' not in icl_cfg: - icl_cfg.generations_per_sample = 1 + icl_cfg['generations_per_sample'] = 1 if 'num_beams' in icl_cfg: raise ValueError( @@ -535,18 +535,18 @@ def _validate_cfg(icl_cfg: DictConfig): 'Please use generation_kwargs.num_beams instead.') for icl_cfg in icl_tasks_list: - assert isinstance(icl_cfg, DictConfig) + assert isinstance(icl_cfg, dict) _validate_cfg(icl_cfg) - for num_fewshot in list(icl_cfg.num_fewshot): + for num_fewshot in list(icl_cfg['num_fewshot']): if tokenizer.pad_token_id is None: # Current workaround to support GPT2 tokenizer with `pad_token_id = None` pad_tok_id = tokenizer.eos_token_id else: pad_tok_id = tokenizer.pad_token_id - label = f'{icl_cfg.label}/{num_fewshot}-shot' - metric_names = list(icl_cfg.metric_names) + label = f'{icl_cfg["label"]}/{num_fewshot}-shot' + metric_names = list(icl_cfg['metric_names']) # TODO: fix Composer bug when copying local paths and destination exists - destination_path = f'{destination_dir}/{icl_cfg.label}-{num_fewshot}.jsonl' + destination_path = f'{destination_dir}/{icl_cfg["label"]}-{num_fewshot}.jsonl' if dist.get_local_rank() == 0 and os.path.exists(destination_path): os.remove(destination_path) dist.barrier() @@ -556,38 +556,34 @@ def _validate_cfg(icl_cfg: DictConfig): early_stopping_criteria = icl_cfg.get('early_stopping_criteria', None) - if isinstance(early_stopping_criteria, ListConfig): - early_stopping_criteria = om.to_container( - early_stopping_criteria) assert early_stopping_criteria is None or isinstance( early_stopping_criteria, list) dataloaders = get_icl_task_dataloader( - icl_cfg.icl_task_type, - icl_cfg.dataset_uri, + icl_cfg['icl_task_type'], + icl_cfg['dataset_uri'], tokenizer, - batch_size=icl_cfg.batch_size, - max_seq_len=icl_cfg.max_seq_len, + batch_size=icl_cfg['batch_size'], + max_seq_len=icl_cfg['max_seq_len'], pad_tok_id=pad_tok_id, num_fewshot=num_fewshot, - prompt_string=icl_cfg.prompt_string, - example_delimiter=icl_cfg.example_delimiter, + prompt_string=icl_cfg['prompt_string'], + example_delimiter=icl_cfg['example_delimiter'], hf_loading_vars=hf_loading_vars, hf_parsing_map=hf_parsing_map, - continuation_delimiter=icl_cfg.continuation_delimiter, + continuation_delimiter=icl_cfg['continuation_delimiter'], question_prelimiter=icl_cfg.get('question_prelimiter', ''), destination_path=destination_path, - fewshot_random_seed=icl_cfg.fewshot_random_seed, - pass_at_k=icl_cfg.pass_at_k, - generations_per_sample=icl_cfg.generations_per_sample, + fewshot_random_seed=icl_cfg['fewshot_random_seed'], + pass_at_k=icl_cfg['pass_at_k'], + generations_per_sample=icl_cfg['generations_per_sample'], has_categories=icl_cfg.get('has_categories', False), cot_delimiter=icl_cfg.get('cot_delimiter', ''), 
generation_kwargs=icl_cfg.get('generation_kwargs', {}), early_stopping_criteria=early_stopping_criteria, do_normalization=icl_cfg.get('do_normalization', True)) - if hasattr( - icl_cfg, - 'has_categories') and icl_cfg.has_categories and isinstance( - dataloaders, dict): + if hasattr(icl_cfg, 'has_categories' + ) and icl_cfg['has_categories'] and isinstance( + dataloaders, dict): for category in dataloaders.keys(): logger_keys.extend([ f'metrics/{label}/{category}/{m}' for m in metric_names From c4922a587d160c4e065f95d78617a2469b1c4321 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 22:09:20 +0000 Subject: [PATCH 110/201] fix VIII: the last bug --- scripts/eval/eval.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 81b74730fa..2cd2e4a93a 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -42,11 +42,11 @@ def evaluate_model( dist_timeout: Union[float, int], run_name: str, seed: int, - icl_tasks: Union[str, ListConfig], + icl_tasks: Union[str, List[Dict[str, Any]]], max_seq_len: int, device_eval_batch_size: int, - eval_gauntlet_config: Optional[Union[str, DictConfig]], - eval_loader_config: Optional[Union[DictConfig, ListConfig]], + eval_gauntlet_config: Optional[Union[str, Dict[str, Any]]], + eval_loader_config: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]], fsdp_config: Optional[Dict[str, Any]], loggers: List[LoggerDestination], python_log_level: Optional[str], From 3fc2d82dd7897dc578511b8a85b5a8f753bf1278 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 22:39:25 +0000 Subject: [PATCH 111/201] fix --- llmfoundry/data/finetuning/dataloader.py | 38 +++++++----------------- llmfoundry/data/packing.py | 2 +- llmfoundry/data/text_data.py | 3 +- llmfoundry/utils/builders.py | 5 +++- tests/data/test_packing.py | 4 +-- tests/models/hf/test_hf_fsdp.py | 6 ++-- tests/models/test_model.py | 9 +++--- tests/utils/test_builders.py | 9 +++--- 8 files changed, 31 insertions(+), 45 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 851925863c..60e89e2665 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -303,7 +303,7 @@ def build_finetuning_dataloader( def _validate_config( max_seq_len: int, - decoder_only_format: bool, + decoder_only_format: bool = False, hf_name: Optional[str] = None, local: Optional[str] = None, remote: Optional[str] = None, @@ -329,29 +329,13 @@ def _validate_config( """ # Check for extraneous keys in the dataset config allowed_additional_kwargs = { - 'local', - 'remote', - 'split', - 'download_retry', - 'download_timeout', - 'validate_hash', - 'keep_zip', - 'epoch_size', - 'predownload', - 'cache_limit', - 'partition_algo', - 'num_canonical_nodes', - 'batch_size', - 'shuffle', - 'shuffle_algo', - 'shuffle_seed', - 'shuffle_block_size', - 'sampling_method', - 'sampling_granularity', - 'batching_method', - 'max_seq_len', - 'allow_unsafe_types', - 'replication', + 'local', 'remote', 'split', 'download_retry', 'download_timeout', + 'validate_hash', 'keep_zip', 'epoch_size', 'predownload', 'cache_limit', + 'partition_algo', 'num_canonical_nodes', 'batch_size', 'shuffle', + 'shuffle_algo', 'shuffle_seed', 'shuffle_block_size', 'sampling_method', + 'sampling_granularity', 'batching_method', 'max_seq_len', + 'allow_unsafe_types', 'replication', 'packing_ratio', + 'allow_pad_trimming' } if not set(kwargs.keys()).issubset(allowed_additional_kwargs): 
raise ValueError( @@ -532,12 +516,12 @@ def _build_collate_fn( device_batch_size: int, ) -> Tuple[Union[Seq2SeqFinetuningCollator, BinPackCollator], int]: # These `.get` calls are safe because the dataset_cfg is validated for extra keys - dataset_cfg = dataloader_cfg.dataset + dataset_cfg = dataloader_cfg['dataset'] target_responses = dataset_cfg.get('target_responses', _DEFAULT_TARGET_RESPONSES) target_prompts = dataset_cfg.get('target_prompts', _DEFAULT_TARGET_PROMPTS) - max_seq_len = dataset_cfg.max_seq_len - decoder_only_format = dataset_cfg.decoder_only_format + max_seq_len = dataset_cfg['max_seq_len'] + decoder_only_format = dataset_cfg['decoder_only_format'] allow_pad_trimming = dataset_cfg.get('allow_pad_trimming', False) collate_fn = Seq2SeqFinetuningCollator( diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py index 765da5f220..f2d61e711e 100644 --- a/llmfoundry/data/packing.py +++ b/llmfoundry/data/packing.py @@ -396,7 +396,7 @@ def profile_packing( 'prefetch_factor': None, 'persistent_workers': False, }) - dataloader_cfg['dataset_cfg']['packing_ratio'] = 1.0 + dataloader_cfg['dataset']['packing_ratio'] = 1.0 # If streaming dataset, use a temporary local folder for profiling local_rank_zero = dist.get_global_rank() - dist.get_local_rank() diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 1fa884ca6b..c15ba11183 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -303,7 +303,8 @@ def build_text_dataloader( ' To override this error, set the override_bos_token_id_mismatch_error flag to True in the dataset config section of the YAML.' ) - streams = build_streams(streams=dataset_cfg.streams) + streams = build_streams(streams=dataset_cfg.pop('streams') if 'streams' in + dataset_cfg else None) # build dataset potentially with streams text_dataset = StreamingTextDataset( diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a87161ac62..d98c8126e0 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -98,10 +98,13 @@ def build_eval_loaders( is_multi_eval = False for eval_config in eval_configs: + label = None + if 'label' in eval_config: + label = eval_config.pop('label') eval_dataloader = build_dataloader(eval_config, tokenizer, device_eval_batch_size) eval_loader: Evaluator = Evaluator( - label=f"eval/{eval_config['label']}" if is_multi_eval else 'eval', + label=f"eval/{label}" if is_multi_eval else 'eval', dataloader=eval_dataloader, # Load the eval data to fail fast. 
metrics will get added # later in add_metrics_to_eval_loaders, after the model is loaded diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py index 61469b4f00..ed0629e24f 100644 --- a/tests/data/test_packing.py +++ b/tests/data/test_packing.py @@ -197,7 +197,7 @@ def test_packing_with_dataloader(packing_ratio: Any): """Tests that packing works with a dataloader.""" reproducibility.seed_all(17) tokenizer = build_tokenizer('gpt2', {}) - cfg = DictConfig({ + cfg = { 'name': 'finetuning', 'dataset': { 'hf_name': 'tatsu-lab/alpaca', @@ -216,7 +216,7 @@ def test_packing_with_dataloader(packing_ratio: Any): 'prefetch_factor': None, 'persistent_workers': False, 'timeout': 0, - }) + } loader = build_finetuning_dataloader(**cfg, tokenizer=tokenizer, diff --git a/tests/models/hf/test_hf_fsdp.py b/tests/models/hf/test_hf_fsdp.py index 274ebeabcd..cfc995817b 100644 --- a/tests/models/hf/test_hf_fsdp.py +++ b/tests/models/hf/test_hf_fsdp.py @@ -19,10 +19,8 @@ def test_olmo_wraps(): }, } - config = DictConfig(conf) - - config.model.pop('name') - model = ComposerHFCausalLM(**config.model, tokenizer=None) + conf['model'].pop('name') + model = ComposerHFCausalLM(**conf['model'], tokenizer=None) # check that all the modules we except are blocked from FSDP wrapping underlying_model = maybe_get_underlying_model(model.model) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index f705783a4b..3a62be8a3b 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -665,13 +665,12 @@ def test_opt_wrapping(peft_config: Optional[dict[str, str]]): if peft_config is not None: conf['model']['peft_config'] = peft_config - config = DictConfig(conf) - - tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(config.tokenizer) - tokenizer = build_tokenizer(config.tokenizer.name, + tokenizer_cfg: Dict[str, Any] = _load_tokenizer_cfg(conf['tokenizer']) + tokenizer = build_tokenizer(conf['tokenizer']['name'], tokenizer_cfg.get('kwargs', {})) - model = ComposerHFCausalLM(**config.model, tokenizer=tokenizer) + conf['model'].pop('name') + model = ComposerHFCausalLM(**conf['model'], tokenizer=tokenizer) # check that all the modules we except are blocked from FSDP wrapping underlying_model = maybe_get_underlying_model(model.model) diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py index 21ff72fd99..4ee58adc57 100644 --- a/tests/utils/test_builders.py +++ b/tests/utils/test_builders.py @@ -259,14 +259,15 @@ def test_build_evaluators_empty(): def test_build_eval_loaders(monkeypatch: pytest.MonkeyPatch): tokenizer = TiktokenTokenizerWrapper(model_name='gpt-4') - eval_loader_cfg = DictConfig({ + eval_loader_cfg = { 'name': 'text', 'dataset': { + 'streams': None # mocked, not needed }, 'drop_last': False, 'num_workers': 8, - }) + } monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', lambda *args, **kwargs: MagicMock()) eval_loaders = build_eval_loaders(eval_loader_cfg, tokenizer, 2) @@ -277,7 +278,7 @@ def test_build_eval_loaders(monkeypatch: pytest.MonkeyPatch): assert eval_loaders[0].dataloader is not None assert eval_loaders[0].metric_names == [] - multi_eval_loader_cfg = ListConfig([ + multi_eval_loader_cfg = [ { 'name': 'text', 'label': 'test1', @@ -296,7 +297,7 @@ def test_build_eval_loaders(monkeypatch: pytest.MonkeyPatch): 'drop_last': False, 'num_workers': 8, } - ]) + ] monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', lambda *args, **kwargs: MagicMock()) eval_loaders2 = build_eval_loaders(multi_eval_loader_cfg, tokenizer, 
2) From b9db81fd5c4e936f6f4021390a0c7dbb6937bfce Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 19 Apr 2024 22:51:54 +0000 Subject: [PATCH 112/201] final fix I think --- llmfoundry/utils/builders.py | 3 ++- scripts/eval/eval.py | 4 +--- tests/callbacks/test_eval_gauntlet_callback.py | 4 +++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index d98c8126e0..bad2c2998b 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -104,7 +104,7 @@ def build_eval_loaders( eval_dataloader = build_dataloader(eval_config, tokenizer, device_eval_batch_size) eval_loader: Evaluator = Evaluator( - label=f"eval/{label}" if is_multi_eval else 'eval', + label=f'eval/{label}' if is_multi_eval else 'eval', dataloader=eval_dataloader, # Load the eval data to fail fast. metrics will get added # later in add_metrics_to_eval_loaders, after the model is loaded @@ -150,6 +150,7 @@ def build_icl_data_and_gauntlet( if isinstance(eval_gauntlet_config, str): with open(eval_gauntlet_config, 'r') as icl_f: eval_gauntlet_cfg = om.load(icl_f) + assert isinstance(eval_gauntlet_cfg, dict) eval_gauntlet = to_str_dict(eval_gauntlet_cfg['eval_gauntlet']) elif isinstance(eval_gauntlet_config, dict): # pyright: ignore eval_gauntlet = eval_gauntlet_config diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 2cd2e4a93a..a715245055 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -284,9 +284,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: assert icl_tasks is not None, 'icl_tasks must be specified in the config' # Optional Evaluation Parameters with default values - eval_loader_config = DictConfig( - eval_config.eval_loader) if eval_config.eval_loader else ListConfig( - eval_config.eval_loaders) if eval_config.eval_loaders else None + eval_loader_config = eval_config.eval_loader if eval_config.eval_loader else eval_config.eval_loaders default_run_name: str = os.environ.get('RUN_NAME', 'llm') run_name = eval_config.run_name if eval_config.run_name else default_run_name diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index 8d9938e3a1..3b9298be00 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -13,6 +13,7 @@ from llmfoundry.eval.metrics.nlp import InContextLearningLMAccuracy from llmfoundry.utils.builders import build_icl_data_and_gauntlet +from llmfoundry.utils.config_utils import to_str_dict @pytest.fixture(autouse=True) @@ -97,7 +98,8 @@ def test_gauntlet_callback(averages: Optional[dict]): # test loading functionality _, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( - icl_task_config, eval_gauntlet_config, tokenizer, 4, 1024, 1) + [to_str_dict(c) for c in icl_task_config], + to_str_dict(eval_gauntlet_config), tokenizer, 4, 1024, 1) assert eval_gauntlet_callback is not None state = MockState(eval_gauntlet_callback.logger_keys) logger = MockLogger(state) From e87b9b2f0ce4dbab3d350244cdabf857d0e1ff0d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sat, 20 Apr 2024 03:55:01 +0000 Subject: [PATCH 113/201] fixed --- llmfoundry/utils/builders.py | 13 ++++++----- llmfoundry/utils/config_utils.py | 17 +++++++++++++- scripts/eval/eval.py | 39 ++++++++++++++++++++----------- tests/a_scripts/eval/test_eval.py | 7 +++--- 4 files changed, 51 insertions(+), 25 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 
bad2c2998b..0aa965386e 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -500,7 +500,8 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): 'InContextLearningMultipleChoiceAccuracy' ] elif icl_cfg[ - 'icl_task_type'] == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering': + 'icl_task_type'] == 'generation_task_with_answers' or icl_cfg[ + 'icl_task_type'] == 'question_answering': if icl_cfg['icl_task_type'] == 'question_answering': warnings.warn( VersionedDeprecationWarning( @@ -513,7 +514,7 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): icl_cfg['metric_names'] = ['InContextLearningCodeEvalAccuracy'] else: raise ValueError( - f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.' + f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg["icl_task_type"]}.' ) if 'prompt_string' not in icl_cfg: @@ -539,7 +540,8 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): 'Please use generation_kwargs.num_beams instead.') for icl_cfg in icl_tasks_list: - assert isinstance(icl_cfg, dict) + assert isinstance( + icl_cfg, dict), f'Expected dict, got {type(icl_cfg)}, {icl_cfg=}' _validate_cfg(icl_cfg) for num_fewshot in list(icl_cfg['num_fewshot']): if tokenizer.pad_token_id is None: @@ -585,9 +587,8 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): generation_kwargs=icl_cfg.get('generation_kwargs', {}), early_stopping_criteria=early_stopping_criteria, do_normalization=icl_cfg.get('do_normalization', True)) - if hasattr(icl_cfg, 'has_categories' - ) and icl_cfg['has_categories'] and isinstance( - dataloaders, dict): + if 'has_categories' in icl_cfg and icl_cfg[ + 'has_categories'] and isinstance(dataloaders, dict): for category in dataloaders.keys(): logger_keys.extend([ f'metrics/{label}/{category}/{m}' for m in metric_names diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index b0a2d2594e..7fabfab3bd 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -5,7 +5,7 @@ import logging import math import warnings -from typing import Any, Dict, Literal, Mapping, Optional, Tuple, Union +from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union from composer.utils import dist from omegaconf import DictConfig, ListConfig @@ -39,6 +39,21 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str): ) +def to_container_recursive( + cfg: Union[DictConfig, ListConfig] +) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + + def rh(x: Any) -> Any: # recursive helper + if isinstance(x, DictConfig): + return {k: rh(v) for k, v in x.items()} + elif isinstance(x, ListConfig): + return [rh(v) for v in x] + else: + return x + + return rh(cfg) + + def pop_config(cfg: Union[Dict[str, Any], DictConfig], key: str, must_exist: bool = True, diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index a715245055..67c5d25658 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -16,7 +16,7 @@ from composer.loggers.logger_destination import LoggerDestination from composer.trainer import Trainer from composer.utils import dist, get_device, reproducibility -from omegaconf import MISSING, DictConfig, ListConfig +from omegaconf import MISSING, DictConfig from omegaconf import OmegaConf as om from rich.traceback import install @@ -29,7 +29,8 @@ build_evaluators, build_logger, build_tokenizer) from llmfoundry.utils.config_utils import (forbid_config_key, log_config, - process_init_device) + process_init_device, + 
to_container_recursive) from llmfoundry.utils.registry_utils import import_file log = logging.getLogger(__name__) @@ -117,9 +118,9 @@ def evaluate_model( eval_gauntlet_df = pd.DataFrame( columns=['model_name'] + [avg for avg in eval_gauntlet_callback.averages] + - [t.name for t in eval_gauntlet_callback.categories]) + [t['name'] for t in eval_gauntlet_callback.categories]) - if model['name'] == 'mpt_causal_lm' and load_path is None: + if name == 'mpt_causal_lm' and load_path is None: raise ValueError( 'MPT causal LMs require a load_path to the checkpoint for model evaluation.' + @@ -263,14 +264,19 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: import_file(code_path) model_configs = eval_config.models - eval_gauntlet_config = eval_config.eval_gauntlet if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str + eval_gauntlet_config = to_container_recursive( + eval_config.eval_gauntlet) or eval_config.eval_gauntlet_str + assert eval_gauntlet_config is None or isinstance( + eval_gauntlet_config, dict + ) or isinstance( + eval_gauntlet_config, str + ), f'eval_gauntlet_config must be a dict or a string but is {type(eval_gauntlet_config)}, {eval_gauntlet_config=}' # the below line fixes a strange issue where the fsdp_config is a DictConfig rather than a Dict, # despite the type hint being Dict[str, Any] and the `cfg` object being sent to `to_container`. # I think it might be rewrapped in DictConfig during the `structured` call in `_make_eval_and_log_config`. # this redundant check is necessary to avoid a pyright error. - fsdp_config = om.to_container( - eval_config.fsdp_config) if eval_config.fsdp_config else None + fsdp_config = to_container_recursive(eval_config.fsdp_config) assert isinstance( fsdp_config, Dict ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}' @@ -278,18 +284,23 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: } if fsdp_config else None # pyright fix # Mandatory Evaluation Parameters - icl_tasks: Union[ - ListConfig, str, - None] = eval_config.icl_tasks if eval_config.icl_tasks else eval_config.icl_tasks_str + icl_tasks = to_container_recursive( + eval_config.icl_tasks) or eval_config.icl_tasks_str + assert isinstance(icl_tasks, list) or isinstance( + icl_tasks, str + ), f'icl_tasks must be a list or a string but is {type(icl_tasks)}, {icl_tasks=}' assert icl_tasks is not None, 'icl_tasks must be specified in the config' # Optional Evaluation Parameters with default values - eval_loader_config = eval_config.eval_loader if eval_config.eval_loader else eval_config.eval_loaders + eval_loader_config = to_container_recursive( + eval_config.eval_loader + ) if eval_config.eval_loader else to_container_recursive( + eval_config.eval_loaders) default_run_name: str = os.environ.get('RUN_NAME', 'llm') run_name = eval_config.run_name if eval_config.run_name else default_run_name reproducibility.seed_all(eval_config.seed) - dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) + # dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) logging.basicConfig( # Example of format string @@ -356,8 +367,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: benchmark_to_taxonomy = {} if eval_gauntlet_callback is not None: for t in eval_gauntlet_callback.categories: - for b in t.benchmarks: - benchmark_to_taxonomy[b.name] = t.name + for b in t['benchmarks']: + benchmark_to_taxonomy[b['name']] = t['name'] assert 'model_name' in model_cfg, 'model_name must be 
specified in model config' model_results = calculate_markdown_results(logger_keys, trainer, diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index adb11acff7..71d2ea5e58 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -15,8 +15,7 @@ from llmfoundry.utils.builders import build_composer_model from llmfoundry.utils.config_utils import to_str_dict from scripts.eval.eval import main # noqa: E402 -from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, - gpt_tiny_cfg) +from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @pytest.fixture(autouse=True) @@ -40,6 +39,7 @@ def eval_cfg(foundry_dir: str) -> Union[om.ListConfig, om.DictConfig]: @pytest.fixture() def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): + eval_cfg = copy.deepcopy(eval_cfg) # copy config before modifying model_cfg = eval_cfg.models[0] # set device to cpu device = 'cpu' @@ -65,6 +65,7 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any, mock_saved_model_path: Any): + eval_cfg = copy.deepcopy(eval_cfg) eval_cfg.models[0].load_path = mock_saved_model_path assert isinstance(eval_cfg, om.DictConfig) main(eval_cfg) @@ -113,8 +114,6 @@ def test_loader_eval(capfd: Any, mock_saved_model_path: Any, first_eval_loader.label = 'c4' # Create second eval dataloader using the arxiv dataset. second_eval_loader = copy.deepcopy(first_eval_loader) - arxiv_dataset_name = create_arxiv_dataset(tmp_path) - second_eval_loader.data_local = arxiv_dataset_name second_eval_loader.label = 'arxiv' test_cfg.eval_loader = om.OmegaConf.create( [first_eval_loader, second_eval_loader]) From 48fa58e42f5b6435aa6409f7edc15b47432fd8df Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 19:42:53 +0000 Subject: [PATCH 114/201] fix style --- llmfoundry/utils/builders.py | 3 ++- llmfoundry/utils/config_utils.py | 2 +- scripts/eval/eval.py | 2 +- .../callbacks/test_eval_gauntlet_callback.py | 4 ++-- tests/data/test_dataloader.py | 2 +- tests/models/hf/test_hf_fsdp.py | 19 ++++++++++++------- tests/models/test_model.py | 9 +++++---- tests/utils/test_builders.py | 1 - 8 files changed, 24 insertions(+), 18 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 0aa965386e..f5b546e58e 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -17,6 +17,7 @@ from composer.models import ComposerModel from composer.optim.scheduler import ComposerScheduler from composer.utils import dist +from omegaconf import DictConfig from omegaconf import OmegaConf as om from torch.optim.optimizer import Optimizer from torchmetrics import Metric @@ -150,7 +151,7 @@ def build_icl_data_and_gauntlet( if isinstance(eval_gauntlet_config, str): with open(eval_gauntlet_config, 'r') as icl_f: eval_gauntlet_cfg = om.load(icl_f) - assert isinstance(eval_gauntlet_cfg, dict) + assert isinstance(eval_gauntlet_cfg, DictConfig) eval_gauntlet = to_str_dict(eval_gauntlet_cfg['eval_gauntlet']) elif isinstance(eval_gauntlet_config, dict): # pyright: ignore eval_gauntlet = eval_gauntlet_config diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 7fabfab3bd..5d6c5bfeb8 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -40,7 +40,7 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str): def to_container_recursive( - cfg: 
Union[DictConfig, ListConfig] + cfg: Union[DictConfig, ListConfig, Dict[str, Any], List[Dict[str, Any]]] ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: def rh(x: Any) -> Any: # recursive helper diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 67c5d25658..67287d53b2 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -300,7 +300,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: run_name = eval_config.run_name if eval_config.run_name else default_run_name reproducibility.seed_all(eval_config.seed) - # dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) + dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) logging.basicConfig( # Example of format string diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index 3b9298be00..7d5f38b9d2 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -70,8 +70,8 @@ def test_gauntlet_callback(averages: Optional[dict]): num_fewshot: [0] icl_task_type: language_modeling """) - assert isinstance(icl_task_config, om.ListConfig) or isinstance( - icl_task_config, str) + icl_task_config = list(icl_task_config) + assert isinstance(icl_task_config, list(om.DictConfig)) eval_gauntlet_config = om.OmegaConf.create(""" weighting: EQUAL diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index a2fcc24b5a..f073a2ca5b 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -1155,7 +1155,7 @@ def test_build_unknown_dataloader(): } tokenizer = MagicMock() with pytest.raises(catalogue.RegistryError): - _ = build_dataloader(**cfg, tokenizer=tokenizer, device_batch_size=2) + _ = build_dataloader(cfg=cfg, tokenizer=tokenizer, device_batch_size=2) invalid_conversation_params_sharegpt = [ diff --git a/tests/models/hf/test_hf_fsdp.py b/tests/models/hf/test_hf_fsdp.py index cfc995817b..69d4b67207 100644 --- a/tests/models/hf/test_hf_fsdp.py +++ b/tests/models/hf/test_hf_fsdp.py @@ -1,14 +1,15 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 +from typing import Any, Dict + from composer.models.huggingface import maybe_get_underlying_model -from omegaconf import DictConfig from llmfoundry.models.hf import ComposerHFCausalLM def test_olmo_wraps(): - conf: dict = { + conf: Dict[str, Any] = { 'model': { 'name': 'hf_causal_lm', 'pretrained_model_name_or_path': 'allenai/OLMo-7B', @@ -20,11 +21,15 @@ def test_olmo_wraps(): } conf['model'].pop('name') - model = ComposerHFCausalLM(**conf['model'], tokenizer=None) + model = ComposerHFCausalLM(tokenizer=None, **conf['model']) # type: ignore # check that all the modules we except are blocked from FSDP wrapping underlying_model = maybe_get_underlying_model(model.model) - assert not underlying_model.model._fsdp_wrap - assert not underlying_model.model.transformer._fsdp_wrap - assert not underlying_model.model.transformer.wte._fsdp_wrap - assert not underlying_model.model.transformer.ff_out._fsdp_wrap + assert (not hasattr(underlying_model.model, + 'fsdp_wrap')) or (not underlying_model.model._fsdp_wrap) + assert (not hasattr(underlying_model.model.transformer, 'fsdp_wrap')) or ( + not underlying_model.model.transformer._fsdp_wrap) + assert (not hasattr(underlying_model.model.transformer.wte, 'fsdp_wrap') + ) or (not underlying_model.model.transformer.wte._fsdp_wrap) + assert (not hasattr(underlying_model.model.transformer.ff_out, 'fsdp_wrap') + ) or (not 
underlying_model.model.transformer.ff_out._fsdp_wrap) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 3a62be8a3b..4f22fb90bf 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -48,10 +48,11 @@ def get_config( return cast(DictConfig, test_cfg) -def _load_tokenizer_cfg(cfg: DictConfig) -> Dict: - config = om.to_container(cfg, resolve=True) - assert isinstance(config, Dict) - return config +def _load_tokenizer_cfg(cfg: Union[Dict[str, Any], DictConfig]) -> Dict: + if isinstance(cfg, DictConfig): + cfg = om.to_container(cfg, resolve=True) + assert isinstance(cfg, Dict) + return cfg def _get_objs(request: pytest.FixtureRequest, diff --git a/tests/utils/test_builders.py b/tests/utils/test_builders.py index 4ee58adc57..a227083d45 100644 --- a/tests/utils/test_builders.py +++ b/tests/utils/test_builders.py @@ -13,7 +13,6 @@ from composer.callbacks import Generate from composer.core import Evaluator from composer.loggers import WandBLogger -from omegaconf import DictConfig, ListConfig from transformers import PreTrainedTokenizerBase from llmfoundry.callbacks import HuggingFaceCheckpointer From 3e77198c079366c4f87628fb2e2502f720666485 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 20:08:00 +0000 Subject: [PATCH 115/201] fix --- llmfoundry/utils/config_utils.py | 3 ++- tests/callbacks/test_eval_gauntlet_callback.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 5d6c5bfeb8..cef01ff18a 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -40,7 +40,8 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str): def to_container_recursive( - cfg: Union[DictConfig, ListConfig, Dict[str, Any], List[Dict[str, Any]]] + cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], + List[Dict[str, Any]]]] ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: def rh(x: Any) -> Any: # recursive helper diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index 7d5f38b9d2..e4e66692ca 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -71,7 +71,7 @@ def test_gauntlet_callback(averages: Optional[dict]): icl_task_type: language_modeling """) icl_task_config = list(icl_task_config) - assert isinstance(icl_task_config, list(om.DictConfig)) + assert isinstance(icl_task_config, list[om.DictConfig]) eval_gauntlet_config = om.OmegaConf.create(""" weighting: EQUAL From 961b034e0dd99535b7ef2e16ccdadbf4938bc3eb Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 20:16:41 +0000 Subject: [PATCH 116/201] fix fix --- tests/models/test_model.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 4f22fb90bf..3768840a9f 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -50,9 +50,11 @@ def get_config( def _load_tokenizer_cfg(cfg: Union[Dict[str, Any], DictConfig]) -> Dict: if isinstance(cfg, DictConfig): - cfg = om.to_container(cfg, resolve=True) - assert isinstance(cfg, Dict) - return cfg + config = to_str_dict(cfg) + else: + assert isinstance(cfg, dict) + config = cfg + return config def _get_objs(request: pytest.FixtureRequest, From d245cd160384934d4b56f8b0dc42d09b701bb40a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 20:18:34 +0000 Subject: [PATCH 117/201] fix fix style --- 
tests/callbacks/test_eval_gauntlet_callback.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index e4e66692ca..21c7c7dbfe 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -71,7 +71,7 @@ def test_gauntlet_callback(averages: Optional[dict]): icl_task_type: language_modeling """) icl_task_config = list(icl_task_config) - assert isinstance(icl_task_config, list[om.DictConfig]) + assert isinstance(icl_task_config, List[om.DictConfig]) eval_gauntlet_config = om.OmegaConf.create(""" weighting: EQUAL @@ -89,8 +89,7 @@ def test_gauntlet_callback(averages: Optional[dict]): num_fewshot: 0 random_baseline: 0.0 """) - assert isinstance(eval_gauntlet_config, om.DictConfig) or isinstance( - eval_gauntlet_config, str) + assert isinstance(eval_gauntlet_config, om.DictConfig) if averages is not None: eval_gauntlet_config.averages = averages From aec718b796971c68953e47bdb8d24c9f0b0073a6 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 20:26:56 +0000 Subject: [PATCH 118/201] icl task config --- tests/callbacks/test_eval_gauntlet_callback.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index 21c7c7dbfe..e64a8ff327 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -70,8 +70,8 @@ def test_gauntlet_callback(averages: Optional[dict]): num_fewshot: [0] icl_task_type: language_modeling """) - icl_task_config = list(icl_task_config) - assert isinstance(icl_task_config, List[om.DictConfig]) + icl_task_config: List[om.DictConfig] = list(icl_task_config) # type: ignore + assert all(isinstance(c, om.DictConfig) for c in icl_task_config) eval_gauntlet_config = om.OmegaConf.create(""" weighting: EQUAL From d9e6f131c0f724d6a8a1efefb4efe2353afb4b8d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 20:55:14 +0000 Subject: [PATCH 119/201] fix train --- llmfoundry/utils/builders.py | 6 +++++- scripts/train/train.py | 19 ++++++++++++------- tests/a_scripts/train/test_train.py | 2 -- tests/data/test_dataloader.py | 1 + 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index f5b546e58e..e9e496bb85 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -94,9 +94,13 @@ def build_eval_loaders( if isinstance(eval_loader_config, list): eval_configs = eval_loader_config is_multi_eval = True - else: + elif isinstance(eval_loader_config, dict): eval_configs = [eval_loader_config] is_multi_eval = False + else: + raise ValueError( + f'Got invalid type for eval_loader_config: {type(eval_loader_config)}, {eval_loader_config=}' + ) for eval_config in eval_configs: label = None diff --git a/scripts/train/train.py b/scripts/train/train.py index 8a9955e23c..8ea59f8b45 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -37,6 +37,7 @@ build_scheduler, build_tokenizer) from llmfoundry.utils.config_utils import (forbid_config_key, log_config, pop_config, process_init_device, + to_container_recursive, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -356,15 +357,19 @@ def main(cfg: DictConfig) -> Trainer: dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - 
model_config = train_cfg.model - train_loader_config = train_cfg.train_loader + model_config = to_container_recursive(train_cfg.model) + train_loader_config = to_container_recursive(train_cfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config + fsdp_config: Optional[Dict[str, Any]] = to_container_recursive( + train_cfg.fsdp_config) - eval_loader_config = train_cfg.eval_loader if train_cfg.eval_loader is not None else train_cfg.eval_loaders - icl_tasks_config = train_cfg.icl_tasks - eval_gauntlet_config = train_cfg.eval_gauntlet + eval_loader_config: Optional[Dict[str, Any]] = to_container_recursive( + train_cfg.eval_loader + ) if train_cfg.eval_loader is not None else to_container_recursive( + train_cfg.eval_loaders) + icl_tasks_config = to_container_recursive(train_cfg.icl_tasks) + eval_gauntlet_config = to_container_recursive(train_cfg.eval_gauntlet) # Optional parameters will be set to default values if not specified. default_run_name: str = os.environ.get('RUN_NAME', 'llm') @@ -442,7 +447,7 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg = train_cfg.profiler + profiler_cfg = to_container_recursive(train_cfg.profiler) if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 5f624906c8..1d9c3656ce 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -109,8 +109,6 @@ def test_train_multi_eval(tmp_path: pathlib.Path): first_eval_loader.label = 'c4' # Create second eval dataloader using the arxiv dataset. second_eval_loader = copy.deepcopy(first_eval_loader) - arxiv_dataset_name = create_arxiv_dataset(tmp_path) - second_eval_loader.data_local = arxiv_dataset_name second_eval_loader.label = 'arxiv' test_cfg.eval_loader = om.create([first_eval_loader, second_eval_loader]) test_cfg.eval_subset_num_batches = 1 # -1 to evaluate on all batches diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index f073a2ca5b..6be18b8d1c 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -227,6 +227,7 @@ def test_correct_padding(tokenizer_name: str, ) # Dataloaders + test_cfg.eval_loader.pop('name') eval_loader = build_text_dataloader( **test_cfg.eval_loader, tokenizer=tokenizer, From 4609950292586c1bd5ab79229d9a7a42fd5e40db Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 21:15:28 +0000 Subject: [PATCH 120/201] fix finetuning dataloader --- llmfoundry/data/finetuning/dataloader.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 60e89e2665..d3b7bdc10e 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -18,6 +18,7 @@ dataset_constructor) from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio from llmfoundry.data.text_data import build_streams, get_tokens_per_batch_func +from llmfoundry.utils.config_utils import to_str_dict from llmfoundry.utils.exceptions import (MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError) @@ -169,7 +170,11 @@ def build_finetuning_dataloader( if dataset_cfg.get('remote') is not None or dataset_cfg.get( 'streams') is not None: # Build streaming dataloader - streams = build_streams(**dataset_cfg) + streams_cfg = 
dataset_cfg.get('streams', None) + streams_cfg = to_str_dict( + streams_cfg) if streams_cfg is not None else None + streams = build_streams( + streams_cfg) if streams_cfg is not None else None # note: we don't need to use ** here because we're setting default values for almost all arguments streaming_dataset = dataset_constructor.build_from_streaming( @@ -309,7 +314,7 @@ def _validate_config( remote: Optional[str] = None, hf_kwargs: Optional[Dict[str, Any]] = None, preprocessing_fn: Optional[str] = None, - safe_load: Optional[bool] = False, + safe_load: Optional[bool] = None, streams: Optional[Dict[str, Any]] = None, target_prompts: Optional[str] = None, target_responses: Optional[str] = None, From 722aeb1e0088123c32a95706c71b32bd62669fc8 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 21:31:09 +0000 Subject: [PATCH 121/201] fix train types --- llmfoundry/utils/config_utils.py | 22 ++++++++++++++++++++++ scripts/train/train.py | 21 +++++++++++---------- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index cef01ff18a..f62fd16805 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -39,6 +39,28 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str): ) +def to_dict_recursive( + cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], + List[Dict[str, Any]]]] +) -> Dict[str, Any]: + maybe_dict = to_container_recursive(cfg) + if isinstance(maybe_dict, dict): + return maybe_dict + else: + raise ValueError(f'Expected a dict-like type, got {type(maybe_dict)}') + + +def to_list_recursive( + cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], + List[Dict[str, Any]]]] +) -> List[Dict[str, Any]]: + maybe_list = to_container_recursive(cfg) + if isinstance(maybe_list, list): + return maybe_list + else: + raise ValueError(f'Expected a list-like type, got {type(maybe_list)}') + + def to_container_recursive( cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], List[Dict[str, Any]]]] diff --git a/scripts/train/train.py b/scripts/train/train.py index 8ea59f8b45..e54481762f 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -37,7 +37,7 @@ build_scheduler, build_tokenizer) from llmfoundry.utils.config_utils import (forbid_config_key, log_config, pop_config, process_init_device, - to_container_recursive, + to_dict_recursive, to_list_recursive, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -357,19 +357,19 @@ def main(cfg: DictConfig) -> Trainer: dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - model_config = to_container_recursive(train_cfg.model) - train_loader_config = to_container_recursive(train_cfg.train_loader) + model_config = to_dict_recursive(train_cfg.model) + train_loader_config = to_dict_recursive(train_cfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = to_container_recursive( - train_cfg.fsdp_config) + fsdp_config: Optional[Dict[str, Any]] = to_dict_recursive( + train_cfg.fsdp_config) if train_cfg.fsdp_config is not None else None - eval_loader_config: Optional[Dict[str, Any]] = to_container_recursive( + eval_loader_config: Optional[Dict[str, Any]] = to_dict_recursive( train_cfg.eval_loader - ) if train_cfg.eval_loader is not None else to_container_recursive( + ) if train_cfg.eval_loader is not None else to_list_recursive( train_cfg.eval_loaders) - icl_tasks_config = 
to_container_recursive(train_cfg.icl_tasks) - eval_gauntlet_config = to_container_recursive(train_cfg.eval_gauntlet) + icl_tasks_config = to_list_recursive(train_cfg.icl_tasks) + eval_gauntlet_config = to_dict_recursive(train_cfg.eval_gauntlet) # Optional parameters will be set to default values if not specified. default_run_name: str = os.environ.get('RUN_NAME', 'llm') @@ -447,7 +447,7 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg = to_container_recursive(train_cfg.profiler) + profiler_cfg = to_dict_recursive(train_cfg.profiler) if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', @@ -528,6 +528,7 @@ def main(cfg: DictConfig) -> Trainer: # Build Model log.info('Initializing model...') name = model_config.pop('name') + assert isinstance(name, str) model = build_composer_model( name=name, tokenizer=tokenizer, From 8d08b176414f24a36901afba0b831c9eaacf3218 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 21:37:56 +0000 Subject: [PATCH 122/201] fix token counting --- tests/data/test_dataloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 6be18b8d1c..5db45e3afe 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -1136,6 +1136,7 @@ def test_token_counting_func_dataloader_setting( ds_mock.tokenizer = gptt monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', lambda *args, **kwargs: ds_mock) + cfg.pop('name') dl = build_text_dataloader(**cfg, tokenizer=gptt, device_batch_size=batch_size) From 87d7cdf5218c71d88ade0f5cc072912c823758b8 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 21:45:20 +0000 Subject: [PATCH 123/201] fix train types --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index e54481762f..bcf1e61f44 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -364,7 +364,7 @@ def main(cfg: DictConfig) -> Trainer: fsdp_config: Optional[Dict[str, Any]] = to_dict_recursive( train_cfg.fsdp_config) if train_cfg.fsdp_config is not None else None - eval_loader_config: Optional[Dict[str, Any]] = to_dict_recursive( + eval_loader_config = to_dict_recursive( train_cfg.eval_loader ) if train_cfg.eval_loader is not None else to_list_recursive( train_cfg.eval_loaders) From d0c2b4fe45aecf4a2e06eaeae999ba8940cf65d7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 21:48:04 +0000 Subject: [PATCH 124/201] oopsie --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index bcf1e61f44..7a15f2a1fc 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -367,7 +367,7 @@ def main(cfg: DictConfig) -> Trainer: eval_loader_config = to_dict_recursive( train_cfg.eval_loader ) if train_cfg.eval_loader is not None else to_list_recursive( - train_cfg.eval_loaders) + train_cfg.eval_loaders) if train_cfg.eval_loaders is not None else None icl_tasks_config = to_list_recursive(train_cfg.icl_tasks) eval_gauntlet_config = to_dict_recursive(train_cfg.eval_gauntlet) From fa639c64986600f039f40ef9f415a211852ef900 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 21:57:00 +0000 Subject: [PATCH 125/201] fix straggler issues --- tests/a_scripts/train/test_train.py | 3 +-- tests/callbacks/test_eval_gauntlet_callback.py | 7 ++++--- 2 files changed, 5 insertions(+), 5 
deletions(-) diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 1d9c3656ce..63b4324a0c 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -13,8 +13,7 @@ from llmfoundry.utils.config_utils import to_str_dict, update_batch_size_info from scripts.train.train import TrainConfig, main, validate_config # noqa: E402 -from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall, - gpt_tiny_cfg) +from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg from tests.fixtures.autouse import REPO_DIR diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index e64a8ff327..7b6c5fea54 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -70,8 +70,9 @@ def test_gauntlet_callback(averages: Optional[dict]): num_fewshot: [0] icl_task_type: language_modeling """) - icl_task_config: List[om.DictConfig] = list(icl_task_config) # type: ignore - assert all(isinstance(c, om.DictConfig) for c in icl_task_config) + icl_task_config_list: List[om.DictConfig] = list( + icl_task_config) # type: ignore + assert all(isinstance(c, om.DictConfig) for c in icl_task_config_list) eval_gauntlet_config = om.OmegaConf.create(""" weighting: EQUAL @@ -97,7 +98,7 @@ def test_gauntlet_callback(averages: Optional[dict]): # test loading functionality _, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( - [to_str_dict(c) for c in icl_task_config], + [to_str_dict(c) for c in icl_task_config_list], to_str_dict(eval_gauntlet_config), tokenizer, 4, 1024, 1) assert eval_gauntlet_callback is not None state = MockState(eval_gauntlet_callback.logger_keys) From f71396d15c32a7b2fa999b5b62492654244a242d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 22:16:32 +0000 Subject: [PATCH 126/201] fix tests --- llmfoundry/utils/config_utils.py | 9 ++----- scripts/train/train.py | 10 +++++--- tests/data/test_icl_datasets.py | 13 ++++++---- .../inference_api_wrapper/test_fmapi.py | 25 +++++++++++-------- .../test_inference_api_eval_wrapper.py | 25 +++++++++++-------- tests/models/test_model.py | 1 + 6 files changed, 48 insertions(+), 35 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index f62fd16805..7db338737e 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -39,10 +39,7 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str): ) -def to_dict_recursive( - cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], - List[Dict[str, Any]]]] -) -> Dict[str, Any]: +def to_dict_recursive(cfg: Union[DictConfig, Dict[str, Any]]) -> Dict[str, Any]: maybe_dict = to_container_recursive(cfg) if isinstance(maybe_dict, dict): return maybe_dict @@ -51,9 +48,7 @@ def to_dict_recursive( def to_list_recursive( - cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], - List[Dict[str, Any]]]] -) -> List[Dict[str, Any]]: + cfg: Union[ListConfig, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: maybe_list = to_container_recursive(cfg) if isinstance(maybe_list, list): return maybe_list diff --git a/scripts/train/train.py b/scripts/train/train.py index 7a15f2a1fc..bb6fc16738 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -368,8 +368,11 @@ def main(cfg: DictConfig) -> Trainer: train_cfg.eval_loader ) if train_cfg.eval_loader is not None else to_list_recursive( train_cfg.eval_loaders) if train_cfg.eval_loaders is not 
None else None - icl_tasks_config = to_list_recursive(train_cfg.icl_tasks) - eval_gauntlet_config = to_dict_recursive(train_cfg.eval_gauntlet) + icl_tasks_config = to_list_recursive( + train_cfg.icl_tasks) if train_cfg.icl_tasks is not None else None + eval_gauntlet_config = to_dict_recursive( + train_cfg.eval_gauntlet + ) if train_cfg.eval_gauntlet is not None else None # Optional parameters will be set to default values if not specified. default_run_name: str = os.environ.get('RUN_NAME', 'llm') @@ -447,7 +450,8 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg = to_dict_recursive(train_cfg.profiler) + profiler_cfg = to_dict_recursive( + train_cfg.profiler) if train_cfg.profiler is not None else None if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', diff --git a/tests/data/test_icl_datasets.py b/tests/data/test_icl_datasets.py index 3a730fdf19..307dd12408 100644 --- a/tests/data/test_icl_datasets.py +++ b/tests/data/test_icl_datasets.py @@ -8,6 +8,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase from llmfoundry.utils.builders import build_icl_evaluators +from llmfoundry.utils.config_utils import to_str_dict def load_icl_config(conf_path: str = 'tests/data/test_tasks.yaml'): @@ -20,11 +21,13 @@ def run_test(dir: pathlib.Path, tokenizer: PreTrainedTokenizerBase, bos_tok: str = ''): task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, - tokenizer, - 1024, - 8, - destination_dir=str(dir)) + evaluators, _ = build_icl_evaluators( + to_str_dict(task_cfg.icl_tasks) + if isinstance(task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), + tokenizer, + 1024, + 8, + destination_dir=str(dir)) for e in evaluators: batch = next(e.dataloader.dataloader.__iter__()) diff --git a/tests/models/inference_api_wrapper/test_fmapi.py b/tests/models/inference_api_wrapper/test_fmapi.py index bde2c90d36..a5227a5a03 100644 --- a/tests/models/inference_api_wrapper/test_fmapi.py +++ b/tests/models/inference_api_wrapper/test_fmapi.py @@ -12,6 +12,7 @@ FMAPIChatAPIEvalWrapper) from llmfoundry.models.inference_api_wrapper.fmapi import FMAPIEvalInterface from llmfoundry.utils.builders import build_icl_evaluators +from llmfoundry.utils.config_utils import to_str_dict def load_icl_config(): @@ -104,11 +105,13 @@ def test_casual_fmapi_wrapper(tmp_path: str): mock.completions.create = mock_create task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators( + to_str_dict(task_cfg.icl_tasks) if isinstance( + task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = model.eval_forward(batch) @@ -140,11 +143,13 @@ def test_chat_fmapi_wrapper(tmp_path: str): 'Treason!') task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators( + to_str_dict(task_cfg.icl_tasks) if isinstance( + task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = chatmodel.eval_forward(batch) diff --git a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py 
b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py index 7ecb61aa43..854f7c1599 100644 --- a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py +++ b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py @@ -12,6 +12,7 @@ OpenAIChatAPIEvalWrapper) from llmfoundry.tokenizers import TiktokenTokenizerWrapper from llmfoundry.utils.builders import build_icl_evaluators +from llmfoundry.utils.config_utils import to_str_dict @pytest.fixture(scope='module') @@ -106,11 +107,13 @@ def test_openai_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): mock.completions.create = mock_create task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators( + to_str_dict(task_cfg.icl_tasks) if isinstance( + task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = model.eval_forward(batch) @@ -138,11 +141,13 @@ def test_chat_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): 'Treason!') task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators( + to_str_dict(task_cfg.icl_tasks) if isinstance( + task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = chatmodel.eval_forward(batch) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 3768840a9f..44b464a4c2 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -704,6 +704,7 @@ def test_lora_id(): tokenizer = build_tokenizer(config.tokenizer.name, tokenizer_cfg.get('kwargs', {})) + config.model.pop('name') model = ComposerHFCausalLM(**config.model, tokenizer=tokenizer) assert isinstance(model.model, peft.PeftModelForCausalLM) From 02a50de56b5a528d6a84e9df929d05de91d8de36 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 22:31:35 +0000 Subject: [PATCH 127/201] fix??? 
--- tests/data/test_icl_datasets.py | 14 ++++------ .../inference_api_wrapper/test_fmapi.py | 27 ++++++++---------- .../test_inference_api_eval_wrapper.py | 28 +++++++++---------- 3 files changed, 31 insertions(+), 38 deletions(-) diff --git a/tests/data/test_icl_datasets.py b/tests/data/test_icl_datasets.py index 307dd12408..ae679a191f 100644 --- a/tests/data/test_icl_datasets.py +++ b/tests/data/test_icl_datasets.py @@ -8,7 +8,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase from llmfoundry.utils.builders import build_icl_evaluators -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_list_recursive def load_icl_config(conf_path: str = 'tests/data/test_tasks.yaml'): @@ -21,13 +21,11 @@ def run_test(dir: pathlib.Path, tokenizer: PreTrainedTokenizerBase, bos_tok: str = ''): task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators( - to_str_dict(task_cfg.icl_tasks) - if isinstance(task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), - tokenizer, - 1024, - 8, - destination_dir=str(dir)) + evaluators, _ = build_icl_evaluators(to_list_recursive(task_cfg.icl_tasks), + tokenizer, + 1024, + 8, + destination_dir=str(dir)) for e in evaluators: batch = next(e.dataloader.dataloader.__iter__()) diff --git a/tests/models/inference_api_wrapper/test_fmapi.py b/tests/models/inference_api_wrapper/test_fmapi.py index a5227a5a03..794290ba18 100644 --- a/tests/models/inference_api_wrapper/test_fmapi.py +++ b/tests/models/inference_api_wrapper/test_fmapi.py @@ -12,7 +12,7 @@ FMAPIChatAPIEvalWrapper) from llmfoundry.models.inference_api_wrapper.fmapi import FMAPIEvalInterface from llmfoundry.utils.builders import build_icl_evaluators -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_list_recursive def load_icl_config(): @@ -105,13 +105,11 @@ def test_casual_fmapi_wrapper(tmp_path: str): mock.completions.create = mock_create task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators( - to_str_dict(task_cfg.icl_tasks) if isinstance( - task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = model.eval_forward(batch) @@ -143,13 +141,12 @@ def test_chat_fmapi_wrapper(tmp_path: str): 'Treason!') task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators( - to_str_dict(task_cfg.icl_tasks) if isinstance( - task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators(to_list_recursive( + task_cfg.icl_tasks), + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = chatmodel.eval_forward(batch) diff --git a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py index 854f7c1599..e05593cfbc 100644 --- a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py +++ b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py @@ -12,7 +12,7 @@ OpenAIChatAPIEvalWrapper) from llmfoundry.tokenizers import TiktokenTokenizerWrapper from llmfoundry.utils.builders import build_icl_evaluators -from llmfoundry.utils.config_utils import to_str_dict +from 
llmfoundry.utils.config_utils import to_list_recursive @pytest.fixture(scope='module') @@ -107,13 +107,12 @@ def test_openai_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): mock.completions.create = mock_create task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators( - to_str_dict(task_cfg.icl_tasks) if isinstance( - task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators(to_list_recursive( + task_cfg.icl_tasks), + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = model.eval_forward(batch) @@ -141,13 +140,12 @@ def test_chat_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): 'Treason!') task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators( - to_str_dict(task_cfg.icl_tasks) if isinstance( - task_cfg.icl_tasks, dict) else str(task_cfg.icl_tasks), - tokenizer, - 1024, - 2, - destination_dir=str(tmp_path)) + evaluators, _ = build_icl_evaluators(to_list_recursive( + task_cfg.icl_tasks), + tokenizer, + 1024, + 2, + destination_dir=str(tmp_path)) batch = next(evaluators[0].dataloader.dataloader.__iter__()) result = chatmodel.eval_forward(batch) From 76c413b2a12c8547ce40bacaddf68a0b0742a2d7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 22:55:22 +0000 Subject: [PATCH 128/201] fix hf v mpt gpu test and fmapi test --- tests/models/hf/test_hf_v_mpt.py | 1 + tests/models/inference_api_wrapper/test_fmapi.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index 5bb38097fc..6d1c6d8321 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -113,6 +113,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, print(model_cfg) name = model_cfg.pop('name') + model_cfg.pop('device') model = build_composer_model( name=name, cfg=to_str_dict(model_cfg), diff --git a/tests/models/inference_api_wrapper/test_fmapi.py b/tests/models/inference_api_wrapper/test_fmapi.py index 794290ba18..9654de8f04 100644 --- a/tests/models/inference_api_wrapper/test_fmapi.py +++ b/tests/models/inference_api_wrapper/test_fmapi.py @@ -88,7 +88,7 @@ def mock_create(**kwargs: Dict[str, str]): return MockCompletion(' ') -def test_casual_fmapi_wrapper(tmp_path: str): +def test_causal_fmapi_wrapper(tmp_path: str): # patch block_until_ready with patch.object(FMAPIEvalInterface, 'block_until_ready') as mock: @@ -105,7 +105,8 @@ def test_casual_fmapi_wrapper(tmp_path: str): mock.completions.create = mock_create task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(task_cfg.icl_tasks, + evaluators, _ = build_icl_evaluators(to_list_recursive( + task_cfg.icl_tasks), tokenizer, 1024, 2, From 66e86dc56c7894b5a42a0f2c4c507df3e019ae1e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 23:06:26 +0000 Subject: [PATCH 129/201] pop device --- tests/models/hf/test_hf_v_mpt.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index 6d1c6d8321..c91cbf6528 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -60,6 +60,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, tokenizer_kwargs=tokenizer_kwargs, ) name = hf_cfg.model.pop('name') + hf_cfg.model.pop('device') hf_model = build_composer_model( name=name, 
cfg=to_str_dict(hf_cfg.model), From b57a10714f95014f3615dcbe2afe39c9ea3b9590 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 23:14:01 +0000 Subject: [PATCH 130/201] to_str_dict -> to_dict_recursive --- llmfoundry/data/finetuning/dataloader.py | 4 ++-- llmfoundry/utils/builders.py | 7 ++++--- llmfoundry/utils/config_utils.py | 2 +- tests/a_scripts/eval/test_eval.py | 4 ++-- .../inference/test_convert_composer_to_hf.py | 6 +++--- tests/a_scripts/train/test_train.py | 5 +++-- tests/callbacks/test_eval_gauntlet_callback.py | 6 +++--- tests/models/hf/test_fsdp_weight_tying.py | 4 ++-- tests/models/hf/test_hf_config.py | 8 ++++---- tests/models/hf/test_hf_v_mpt.py | 6 +++--- tests/models/test_model.py | 16 ++++++++-------- 11 files changed, 35 insertions(+), 33 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index d3b7bdc10e..0a7b4cb819 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -18,7 +18,7 @@ dataset_constructor) from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio from llmfoundry.data.text_data import build_streams, get_tokens_per_batch_func -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive from llmfoundry.utils.exceptions import (MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError) @@ -171,7 +171,7 @@ def build_finetuning_dataloader( 'streams') is not None: # Build streaming dataloader streams_cfg = dataset_cfg.get('streams', None) - streams_cfg = to_str_dict( + streams_cfg = to_dict_recursive( streams_cfg) if streams_cfg is not None else None streams = build_streams( streams_cfg) if streams_cfg is not None else None diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index e9e496bb85..943fcdf7dc 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -29,7 +29,7 @@ from llmfoundry.eval.datasets.in_context_learning_evaluation import \ get_icl_task_dataloader from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive from llmfoundry.utils.registry_utils import construct_from_registry from llmfoundry.utils.warnings import VersionedDeprecationWarning @@ -156,7 +156,8 @@ def build_icl_data_and_gauntlet( with open(eval_gauntlet_config, 'r') as icl_f: eval_gauntlet_cfg = om.load(icl_f) assert isinstance(eval_gauntlet_cfg, DictConfig) - eval_gauntlet = to_str_dict(eval_gauntlet_cfg['eval_gauntlet']) + eval_gauntlet = to_dict_recursive( + eval_gauntlet_cfg['eval_gauntlet']) elif isinstance(eval_gauntlet_config, dict): # pyright: ignore eval_gauntlet = eval_gauntlet_config else: @@ -483,7 +484,7 @@ def build_icl_evaluators( log.info(f'Extracting ICL task config from path: {icl_tasks}') with open(icl_tasks, 'r') as icl_f: icl_task_cfg = om.load(icl_f) - icl_tasks_list = to_str_dict(icl_task_cfg.icl_tasks) + icl_tasks_list = to_dict_recursive(icl_task_cfg.icl_tasks) else: icl_tasks_list = icl_tasks diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 7db338737e..698bac61ef 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -25,7 +25,7 @@ ] -def to_str_dict(cfg: DictConfig) -> Dict[str, Any]: +def to_dict_recursive(cfg: DictConfig) -> Dict[str, Any]: """Converts a DictConfig to a dictionary with string keys.""" cfg_dict = 
om.to_container(cfg, resolve=True) assert isinstance(cfg_dict, dict) diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index 71d2ea5e58..52db5e7937 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -13,7 +13,7 @@ from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive from scripts.eval.eval import main # noqa: E402 from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @@ -51,7 +51,7 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): name = model_cfg.model.pop('name') model = build_composer_model(name=name, tokenizer=tokenizer, - cfg=to_str_dict(model_cfg.model)) + cfg=to_dict_recursive(model_cfg.model)) # create mocked save checkpoint trainer = Trainer(model=model, device=device) diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index fc1662627e..8d326d89dd 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -28,7 +28,7 @@ from llmfoundry.models.mpt import MPTConfig from llmfoundry.utils.builders import (build_composer_model, build_optimizer, build_tokenizer) -from llmfoundry.utils.config_utils import process_init_device, to_str_dict +from llmfoundry.utils.config_utils import process_init_device, to_dict_recursive from scripts.inference.convert_composer_to_hf import convert_composer_to_hf from tests.data_utils import make_tiny_ft_dataset @@ -875,7 +875,7 @@ def test_convert_and_generate(model: str, tie_word_embeddings: bool, original_model = build_composer_model( name=name, tokenizer=tokenizer, - cfg=to_str_dict(om_cfg['model']), + cfg=to_dict_recursive(om_cfg['model']), ) trainer = Trainer(model=original_model, device='cpu' if not model == 'mptmoe' else 'gpu') @@ -948,7 +948,7 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, original_model = build_composer_model( name=name, tokenizer=tokenizer, - cfg=to_str_dict(om_cfg['model']), + cfg=to_dict_recursive(om_cfg['model']), ) trainer = Trainer(model=original_model, device='cpu' if not 'moe' in conf_path else 'gpu') diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 63b4324a0c..26dcfb082e 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -11,7 +11,8 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om -from llmfoundry.utils.config_utils import to_str_dict, update_batch_size_info +from llmfoundry.utils.config_utils import (to_dict_recursive, + update_batch_size_info) from scripts.train.train import TrainConfig, main, validate_config # noqa: E402 from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg from tests.fixtures.autouse import REPO_DIR @@ -155,7 +156,7 @@ def test_validate_config(): test_cfg: DictConfig = om.load(f) # type: ignore test_cfg.model.ffn_config.moe_world_size = 4 test_cfg.fsdp_config.use_orig_params = False - test_cfg_dict = to_str_dict(test_cfg) + test_cfg_dict = to_dict_recursive(test_cfg) test_cfg_dict = update_batch_size_info(test_cfg_dict) with pytest.raises( ValueError, diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index 7b6c5fea54..acf6950518 100644 --- 
a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -13,7 +13,7 @@ from llmfoundry.eval.metrics.nlp import InContextLearningLMAccuracy from llmfoundry.utils.builders import build_icl_data_and_gauntlet -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive @pytest.fixture(autouse=True) @@ -98,8 +98,8 @@ def test_gauntlet_callback(averages: Optional[dict]): # test loading functionality _, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( - [to_str_dict(c) for c in icl_task_config_list], - to_str_dict(eval_gauntlet_config), tokenizer, 4, 1024, 1) + [to_dict_recursive(c) for c in icl_task_config_list], + to_dict_recursive(eval_gauntlet_config), tokenizer, 4, 1024, 1) assert eval_gauntlet_callback is not None state = MockState(eval_gauntlet_callback.logger_keys) logger = MockLogger(state) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 6ffe144fa5..487000193e 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -10,7 +10,7 @@ from omegaconf import OmegaConf as om from llmfoundry.utils.builders import build_composer_model, build_tokenizer -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive @pytest.mark.world_size(2) @@ -71,7 +71,7 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, name = model_cfg.pop('name') original_model = build_composer_model( name=name, - cfg=to_str_dict(model_cfg), + cfg=to_dict_recursive(model_cfg), tokenizer=tokenizer, ) diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index 16f0d43a31..2cdc37b797 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -16,7 +16,7 @@ from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive def test_remote_code_false_mpt( @@ -49,7 +49,7 @@ def test_remote_code_false_mpt( name = test_cfg.model.pop('name') _ = build_composer_model( name=name, - cfg=to_str_dict(test_cfg.model), + cfg=to_dict_recursive(test_cfg.model), tokenizer=tokenizer, ) @@ -142,7 +142,7 @@ def test_hf_config_override( name = test_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_str_dict(test_cfg.model), + cfg=to_dict_recursive(test_cfg.model), tokenizer=tokenizer, ) @@ -167,7 +167,7 @@ def test_hf_config_override( name = hf_model_config.model.pop('name') hf_model = build_composer_model( name=name, - cfg=to_str_dict(hf_model_config.model), + cfg=to_dict_recursive(hf_model_config.model), tokenizer=tokenizer, ) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index c91cbf6528..a0be5aa773 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -9,7 +9,7 @@ from omegaconf import OmegaConf as om from llmfoundry.utils.builders import build_composer_model, build_tokenizer -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive @pytest.mark.gpu @@ -63,7 +63,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, hf_cfg.model.pop('device') hf_model = build_composer_model( name=name, - 
cfg=to_str_dict(hf_cfg.model), + cfg=to_dict_recursive(hf_cfg.model), tokenizer=tokenizer, ).to(device) hf_n_params = sum(p.numel() for p in hf_model.parameters()) @@ -117,7 +117,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, model_cfg.pop('device') model = build_composer_model( name=name, - cfg=to_str_dict(model_cfg), + cfg=to_dict_recursive(model_cfg), tokenizer=tokenizer, ).to(device) n_params = sum(p.numel() for p in model.parameters()) diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 44b464a4c2..3a1ab1ce7b 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -35,7 +35,7 @@ from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_str_dict +from llmfoundry.utils.config_utils import to_dict_recursive def get_config( @@ -50,7 +50,7 @@ def get_config( def _load_tokenizer_cfg(cfg: Union[Dict[str, Any], DictConfig]) -> Dict: if isinstance(cfg, DictConfig): - config = to_str_dict(cfg) + config = to_dict_recursive(cfg) else: assert isinstance(cfg, dict) config = cfg @@ -96,7 +96,7 @@ def _get_objs(request: pytest.FixtureRequest, name = test_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_str_dict(test_cfg.model), + cfg=to_dict_recursive(test_cfg.model), tokenizer=tokenizer, ) @@ -299,7 +299,7 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): name = neo_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_str_dict(neo_cfg.model), + cfg=to_dict_recursive(neo_cfg.model), tokenizer=tokenizer, ).to(device) @@ -349,7 +349,7 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): name = t5_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_str_dict(t5_cfg.model), + cfg=to_dict_recursive(t5_cfg.model), tokenizer=tokenizer, ).to(device) @@ -427,7 +427,7 @@ def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str, name = test_cfg.model.pop('name') model_1 = build_composer_model( name=name, - cfg=to_str_dict(test_cfg.model), + cfg=to_dict_recursive(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -498,7 +498,7 @@ def test_loss_fn(): name = test_cfg.model.pop('name') model_1 = build_composer_model( name=name, - cfg=to_str_dict(test_cfg.model), + cfg=to_dict_recursive(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -585,7 +585,7 @@ def test_loss_reduction(loss_fn_config: str): name = test_cfg.model.pop('name') model_1 = build_composer_model( name=name, - cfg=to_str_dict(test_cfg.model), + cfg=to_dict_recursive(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) From f680ea28f5565bfc4b4f136130f911f0aa24aa18 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 23:17:35 +0000 Subject: [PATCH 131/201] fix this darn unit test one more time --- llmfoundry/utils/config_utils.py | 7 ------- tests/models/hf/test_hf_v_mpt.py | 6 ++++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 698bac61ef..3a4aa22768 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -25,13 +25,6 @@ ] -def to_dict_recursive(cfg: DictConfig) -> Dict[str, Any]: - """Converts a DictConfig to a dictionary with string keys.""" - cfg_dict = om.to_container(cfg, resolve=True) - assert 
isinstance(cfg_dict, dict) - return {str(k): v for k, v in cfg_dict.items()} - - def forbid_config_key(cfg_dict: Dict[str, Any], key: str): if key in cfg_dict: raise ValueError( diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index a0be5aa773..b25f2d4476 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -113,8 +113,10 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, print('Initializing model...') print(model_cfg) - name = model_cfg.pop('name') - model_cfg.pop('device') + if 'name' in model_cfg: + name = model_cfg.pop('name') + if 'device' in model_cfg: + model_cfg.pop('device') model = build_composer_model( name=name, cfg=to_dict_recursive(model_cfg), From 7650b502bd1bec2f850ed53738b4aac17c61c88e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 01:25:19 +0000 Subject: [PATCH 132/201] fix ComposerMPTCausalLM constructor invocation --- tests/models/test_fsdp_act_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index 97063b25c4..592bb39473 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -48,7 +48,7 @@ def test_fsdp_act_checkpoint(activation_checkpointing: bool, 'activation_cpu_offload': False, } - model = ComposerMPTCausalLM(model_cfg) + model = ComposerMPTCausalLM(**model_cfg) model = device.module_to_device(model) trainer = Trainer( From 70af2887d2110d8fa628f57c49555bb9a619d656 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Sun, 21 Apr 2024 21:35:35 -0400 Subject: [PATCH 133/201] Delete tests/models/hf/test_hf_fsdp.py --- tests/models/hf/test_hf_fsdp.py | 35 --------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 tests/models/hf/test_hf_fsdp.py diff --git a/tests/models/hf/test_hf_fsdp.py b/tests/models/hf/test_hf_fsdp.py deleted file mode 100644 index 69d4b67207..0000000000 --- a/tests/models/hf/test_hf_fsdp.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright 2024 MosaicML LLM Foundry authors -# SPDX-License-Identifier: Apache-2.0 - -from typing import Any, Dict - -from composer.models.huggingface import maybe_get_underlying_model - -from llmfoundry.models.hf import ComposerHFCausalLM - - -def test_olmo_wraps(): - conf: Dict[str, Any] = { - 'model': { - 'name': 'hf_causal_lm', - 'pretrained_model_name_or_path': 'allenai/OLMo-7B', - 'pretrained': False, - 'config_overrides': { - 'n_layers': 2, - } - }, - } - - conf['model'].pop('name') - model = ComposerHFCausalLM(tokenizer=None, **conf['model']) # type: ignore - - # check that all the modules we except are blocked from FSDP wrapping - underlying_model = maybe_get_underlying_model(model.model) - assert (not hasattr(underlying_model.model, - 'fsdp_wrap')) or (not underlying_model.model._fsdp_wrap) - assert (not hasattr(underlying_model.model.transformer, 'fsdp_wrap')) or ( - not underlying_model.model.transformer._fsdp_wrap) - assert (not hasattr(underlying_model.model.transformer.wte, 'fsdp_wrap') - ) or (not underlying_model.model.transformer.wte._fsdp_wrap) - assert (not hasattr(underlying_model.model.transformer.ff_out, 'fsdp_wrap') - ) or (not underlying_model.model.transformer.ff_out._fsdp_wrap) From bc6c54532142a14d7c83a4d0b80bc8aad2772850 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 02:00:53 +0000 Subject: [PATCH 134/201] unwrap model in unit tests --- tests/models/hf/test_fsdp_weight_tying.py | 4 ++-- 
tests/models/hf/test_hf_peft_wrapping.py | 6 +++--- tests/models/test_fsdp_act_checkpoint.py | 2 -- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index 487000193e..f7bf64119b 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -93,8 +93,8 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, ) model = trainer.state.model - lm_head = model.model.lm_head if peft_config is None else model.model.base_model.model.lm_head - embedding_layer = model.model.model.embed_tokens if peft_config is None else model.model.base_model.model.model.embed_tokens + lm_head = model.lm_head if peft_config is None else model.base_model.lm_head + embedding_layer = model.embed_tokens if peft_config is None else model.base_model.model.embed_tokens lm_head_id = id(lm_head.weight) embedding_layer_id = id(embedding_layer.weight) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 052704e785..b4a8224caa 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -99,9 +99,9 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, ) model = trainer.state.model - underlying_model = model.model.base_model.model - lora_A = underlying_model.model.layers[0].self_attn.q_proj.lora_A['default'] - lora_B = underlying_model.model.layers[0].self_attn.q_proj.lora_B['default'] + underlying_model = model.base_model + lora_A = underlying_model.layers[0].self_attn.q_proj.lora_A['default'] + lora_B = underlying_model.layers[0].self_attn.q_proj.lora_B['default'] assert (lora_A.weight == 1).all() assert (lora_B.weight == 0).all() diff --git a/tests/models/test_fsdp_act_checkpoint.py b/tests/models/test_fsdp_act_checkpoint.py index 592bb39473..63a366817b 100644 --- a/tests/models/test_fsdp_act_checkpoint.py +++ b/tests/models/test_fsdp_act_checkpoint.py @@ -6,7 +6,6 @@ import pytest from composer import Trainer from composer.utils import get_device -from omegaconf import OmegaConf as om from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import \ CheckpointWrapper @@ -40,7 +39,6 @@ def test_fsdp_act_checkpoint(activation_checkpointing: bool, }, 'activation_checkpointing_target': activation_checkpointing_target } - model_cfg = om.create(model_cfg) fsdp_config = { 'activation_checkpointing': activation_checkpointing, From be5105da43a7b8207f12a30c71b3536e8fa8149d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 03:14:36 +0000 Subject: [PATCH 135/201] model.model.model.model.model --- llmfoundry/models/hf/hf_causal_lm.py | 6 +++--- tests/models/hf/test_fsdp_weight_tying.py | 11 ++++------- tests/models/hf/test_hf_peft_wrapping.py | 3 ++- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index ddac76e98b..9034e96920 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -246,9 +246,9 @@ def _autoset_attn_implementation_monkeypatch( if model.config.tie_word_embeddings and resolved_init_device == 'meta': model.tie_weights() - peft_config = None + peft_config_object = None if peft_config is not None: - peft_config = self._get_peft_config(peft_config) + peft_config_object = self._get_peft_config(peft_config) if pretrained_lora_id_or_path is not None: if not peft_installed: @@ -266,7 +266,7 @@ def 
_autoset_attn_implementation_monkeypatch( metrics=train_metrics, eval_metrics=eval_metrics, init_device=init_device, - peft_config=peft_config, + peft_config=peft_config_object, ) @staticmethod diff --git a/tests/models/hf/test_fsdp_weight_tying.py b/tests/models/hf/test_fsdp_weight_tying.py index f7bf64119b..319e2de055 100644 --- a/tests/models/hf/test_fsdp_weight_tying.py +++ b/tests/models/hf/test_fsdp_weight_tying.py @@ -7,10 +7,8 @@ import pytest from composer import Trainer from composer.models.huggingface import maybe_get_underlying_model -from omegaconf import OmegaConf as om from llmfoundry.utils.builders import build_composer_model, build_tokenizer -from llmfoundry.utils.config_utils import to_dict_recursive @pytest.mark.world_size(2) @@ -48,7 +46,6 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, assert model_cfg is not None assert tokenizer_name is not None - model_cfg = om.create(model_cfg) if peft_config is not None: model_cfg['peft_config'] = peft_config @@ -71,12 +68,12 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, name = model_cfg.pop('name') original_model = build_composer_model( name=name, - cfg=to_dict_recursive(model_cfg), + cfg=model_cfg, tokenizer=tokenizer, ) underlying_model = maybe_get_underlying_model(original_model.model) - lm_head = underlying_model.lm_head if peft_config is None else underlying_model.lm_head + lm_head = underlying_model.lm_head embedding_layer = underlying_model.model.embed_tokens if peft_config is None else underlying_model.model.embed_tokens lm_head_id = id(lm_head.weight) @@ -93,8 +90,8 @@ def test_fsdp_weight_tying(peft_config: Optional[dict], tmp_path: pathlib.Path, ) model = trainer.state.model - lm_head = model.lm_head if peft_config is None else model.base_model.lm_head - embedding_layer = model.embed_tokens if peft_config is None else model.base_model.model.embed_tokens + lm_head = model.model.lm_head if peft_config is None else model.model.base_model.model.lm_head + embedding_layer = model.model.model.embed_tokens if peft_config is None else model.model.base_model.model.model.embed_tokens lm_head_id = id(lm_head.weight) embedding_layer_id = id(embedding_layer.weight) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index b4a8224caa..b31eaa12eb 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -99,7 +99,8 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, ) model = trainer.state.model - underlying_model = model.base_model + underlying_model = model.model.base_model.model.model + # assert False, f"underlying_model: {underlying_model}" lora_A = underlying_model.layers[0].self_attn.q_proj.lora_A['default'] lora_B = underlying_model.layers[0].self_attn.q_proj.lora_B['default'] From 6206db659826316d29b23f362fb84b9aa02f32a7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 03:55:58 +0000 Subject: [PATCH 136/201] abstract away dataclass construction --- llmfoundry/utils/config_utils.py | 62 +++++++++++++++++++++++++++- scripts/eval/eval.py | 54 +++--------------------- scripts/train/train.py | 70 ++++---------------------------- 3 files changed, 74 insertions(+), 112 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 3a4aa22768..f8a13f5481 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -2,10 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 
import contextlib +import copy import logging import math import warnings -from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union +from typing import (Any, Callable, Dict, List, Literal, Mapping, Optional, Set, + Tuple, Union) from composer.utils import dist from omegaconf import DictConfig, ListConfig @@ -65,6 +67,64 @@ def rh(x: Any) -> Any: # recursive helper return rh(cfg) +def make_dataclass_and_log_config( + cfg: DictConfig, dataclass_constructor: Callable[..., Any], + dataclass_fields: Set[str], + transforms: Optional[List[Callable[[Dict[str, Any]], Dict[str, Any]]]] +) -> Tuple[DictConfig, Any]: + """Converts a DictConfig to a dataclass and creates a logged config.""" + # Resolve all interpolation variables as early as possible + unstructured_config = om.to_container(cfg, resolve=True) + assert isinstance(unstructured_config, dict) + assert all(isinstance(k, str) for k in unstructured_config.keys()) + unstructured_config = {str(k): v for k, v in unstructured_config.items()} + + # Flatten union types before creating structured config: + if 'eval_gauntlet' in unstructured_config: + forbid_config_key(unstructured_config, 'eval_gauntlet_str') + if isinstance(unstructured_config['eval_gauntlet'], str): + unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( + 'eval_gauntlet') + if (loader := unstructured_config.get('eval_loader', None)) is not None: + forbid_config_key(unstructured_config, 'eval_loaders') + if isinstance(loader, list): + unstructured_config['eval_loaders'] = unstructured_config.pop( + 'eval_loader') + if 'icl_tasks' in unstructured_config: + forbid_config_key(unstructured_config, 'icl_tasks_str') + if isinstance(unstructured_config['icl_tasks'], str): + unstructured_config['icl_tasks_str'] = unstructured_config.pop( + 'icl_tasks') + else: + raise ValueError('icl_tasks must be specified in the config') + + arg_config_keys = set(unstructured_config.keys()) + extraneous_keys = set.difference(arg_config_keys, dataclass_fields) + + if 'variables' not in unstructured_config: + unstructured_config['variables'] = {} + + for key in extraneous_keys: + warnings.warn( + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. 
Top-level variables are deprecated and will not be supported in future releases.', + DeprecationWarning) + unstructured_config['variables'][key] = unstructured_config.pop(key) + + # Create copy of config for logging + logged_cfg: DictConfig = copy.deepcopy(DictConfig(unstructured_config)) + + # apply transforms to the unstructured config before constructing dataclass + for transform in transforms or []: + unstructured_config = transform(unstructured_config) + + logged_cfg.update(unstructured_config, merge=True) + + eval_config: DictConfig = om.structured( + dataclass_constructor(**unstructured_config)) + + return logged_cfg, eval_config + + def pop_config(cfg: Union[Dict[str, Any], DictConfig], key: str, must_exist: bool = True, diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 67287d53b2..fde75db318 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -1,12 +1,10 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import copy import logging import os import sys import time -import warnings from dataclasses import dataclass, fields from typing import Any, Dict, List, Optional, Tuple, Union @@ -28,7 +26,8 @@ build_callback, build_composer_model, build_evaluators, build_logger, build_tokenizer) -from llmfoundry.utils.config_utils import (forbid_config_key, log_config, +from llmfoundry.utils.config_utils import (log_config, + make_dataclass_and_log_config, process_init_device, to_container_recursive) from llmfoundry.utils.registry_utils import import_file @@ -211,53 +210,10 @@ class EvalConfig: EVAL_CONFIG_KEYS = set(field.name for field in fields(EvalConfig)) -def _make_eval_and_log_config(cfg: DictConfig) -> Tuple[DictConfig, EvalConfig]: - # Resolve all interpolation variables as early as possible - unstructured_config = om.to_container(cfg, resolve=True) - assert isinstance(unstructured_config, dict) - assert all(isinstance(k, str) for k in unstructured_config.keys()) - unstructured_config = {str(k): v for k, v in unstructured_config.items()} - - # Flatten union types before creating structured config: - if 'eval_gauntlet' in unstructured_config: - forbid_config_key(unstructured_config, 'eval_gauntlet_str') - if isinstance(unstructured_config['eval_gauntlet'], str): - unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( - 'eval_gauntlet') - if (loader := unstructured_config.get('eval_loader', None)) is not None: - forbid_config_key(unstructured_config, 'eval_loaders') - if isinstance(loader, list): - unstructured_config['eval_loaders'] = unstructured_config.pop( - 'eval_loader') - if 'icl_tasks' in unstructured_config: - forbid_config_key(unstructured_config, 'icl_tasks_str') - if isinstance(unstructured_config['icl_tasks'], str): - unstructured_config['icl_tasks_str'] = unstructured_config.pop( - 'icl_tasks') - else: - raise ValueError('icl_tasks must be specified in the config') - - arg_config_keys = set(unstructured_config.keys()) - extraneous_keys = set.difference(arg_config_keys, EVAL_CONFIG_KEYS) - - if 'variables' not in unstructured_config: - unstructured_config['variables'] = {} - - for key in extraneous_keys: - warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. 
Top-level variables are deprecated and will not be supported in future releases.', - DeprecationWarning) - unstructured_config['variables'][key] = unstructured_config.pop(key) - - eval_config: EvalConfig = om.structured(EvalConfig(**unstructured_config)) - # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(DictConfig(unstructured_config)) - - return logged_cfg, eval_config - - def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: - logged_cfg, eval_config = _make_eval_and_log_config(cfg) + cfgs: Tuple[DictConfig, EvalConfig] = make_dataclass_and_log_config( + cfg, EvalConfig, EVAL_CONFIG_KEYS) + logged_cfg, eval_config = cfgs # Run user provided code if specified for code_path in (eval_config.code_paths or []): diff --git a/scripts/train/train.py b/scripts/train/train.py index bb6fc16738..0b704ec561 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -1,6 +1,5 @@ # Copyright 2022 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -import copy import gc import logging import os @@ -35,7 +34,8 @@ build_composer_model, build_evaluators, build_logger, build_optimizer, build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (forbid_config_key, log_config, +from llmfoundry.utils.config_utils import (log_config, + make_dataclass_and_log_config, pop_config, process_init_device, to_dict_recursive, to_list_recursive, update_batch_size_info) @@ -231,65 +231,6 @@ def validate_config(train_config: TrainConfig): ) -def _make_train_and_log_config( - cfg: DictConfig) -> Tuple[DictConfig, TrainConfig]: - # Resolve all interpolation variables as early as possible - unstructured_config = om.to_container(cfg, resolve=True) - assert isinstance(unstructured_config, dict) - assert all(isinstance(k, str) for k in unstructured_config.keys()) - unstructured_config = {str(k): v for k, v in unstructured_config.items()} - - # Structured config does not support unions of containers, so separate single and plural containers - if (loader := unstructured_config.get('eval_loader', None)) is not None: - forbid_config_key(unstructured_config, 'eval_loaders') - if isinstance(loader, list): - unstructured_config['eval_loaders'] = unstructured_config.pop( - 'eval_loader') - if (tasks := unstructured_config.get('icl_tasks', None)) is not None: - forbid_config_key(unstructured_config, 'icl_tasks_str') - if isinstance(tasks, str): - if 'icl_tasks_str' in unstructured_config: - raise ValueError( - 'Only one of `icl_tasks` or `icl_tasks_str` should be provided.' - ) - unstructured_config['icl_tasks_str'] = unstructured_config.pop( - 'icl_tasks') - if (gauntlet := unstructured_config.get('eval_gauntlet', None)) is not None: - forbid_config_key(unstructured_config, 'eval_gauntlet_str') - if isinstance(gauntlet, str): - if 'eval_gauntlet_str' in unstructured_config: - raise ValueError( - 'Only one of `eval_gauntlet` or `eval_gauntlet_str` should be provided.' - ) - unstructured_config['eval_gauntlet_str'] = unstructured_config.pop( - 'eval_gauntlet') - - arg_config_keys = set(unstructured_config.keys()) - extraneous_keys = set.difference(arg_config_keys, TRAIN_CONFIG_KEYS) - - if 'variables' not in unstructured_config: - unstructured_config['variables'] = {} - - for key in extraneous_keys: - warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. 
Top-level variables are deprecated and will not be supported in future releases.', - DeprecationWarning) - # TODO (milo): delete the below line once we deprecate variables at the top level. - unstructured_config['variables'][key] = unstructured_config.pop(key) - - # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(DictConfig(unstructured_config)) - - # Get global and device batch size information from distributed/single node setting - unstructured_config = update_batch_size_info(unstructured_config) - logged_cfg.update(unstructured_config, merge=True) - - train_cfg: TrainConfig = om.structured( - TrainConfig(**unstructured_config) - ) # type: ignore (TrainConfig does expect arguments, the type checker is wrong here) - return logged_cfg, train_cfg - - def _log_num_params(model: ComposerModel, logged_cfg: DictConfig): # Log number of parameters if hasattr(model, 'n_total_params'): @@ -311,7 +252,12 @@ def _log_num_params(model: ComposerModel, logged_cfg: DictConfig): def main(cfg: DictConfig) -> Trainer: - logged_cfg, train_cfg = _make_train_and_log_config(cfg) + cfgs: Tuple[DictConfig, TrainConfig] = make_dataclass_and_log_config( + cfg, + TrainConfig, + TRAIN_CONFIG_KEYS, + transforms=[update_batch_size_info]) + logged_cfg, train_cfg = cfgs code_paths = train_cfg.code_paths if train_cfg.code_paths else [] # Import any user provided code From de6726641032da8de6fb5040fdae61c270fb569c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 04:26:46 +0000 Subject: [PATCH 137/201] updated docstrings and removed dictconfig from logging logic --- llmfoundry/data/finetuning/dataloader.py | 37 ++++++++++---------- llmfoundry/models/hf/hf_causal_lm.py | 43 ++++++++++++------------ llmfoundry/models/hf/hf_t5.py | 23 ++++++------- llmfoundry/utils/config_utils.py | 30 +++++++++++------ scripts/eval/eval.py | 4 +-- scripts/train/train.py | 4 +-- 6 files changed, 74 insertions(+), 67 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 0a7b4cb819..7e9704020d 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -53,18 +53,17 @@ def build_finetuning_dataloader( on which you intend to use, as explained below. Args: - cfg (DictConfig): An omegaconf dictionary used to configure the loader: - cfg.name (str): The type of dataloader to build. Must = "finetuning". - --- - *** HuggingFace dataset config fields *** - cfg.dataset.hf_name (str, optional): The name of the HuggingFace dataset + name (str): The type of dataloader to build. Must = "finetuning". + --- + *** HuggingFace dataset config fields *** + dataset.hf_name (str, optional): The name of the HuggingFace dataset to use. Can also be a remote http(s) directory or object store bucket containing the file {split}.jsonl in the format (prompt, response), in which case the builder will create a HuggingFace dataset. - cfg.dataset.hf_kwargs (DictConfig, optional): Additional kwargs to + dataset.hf_kwargs (DictConfig, optional): Additional kwargs to pass to `datasets.load_dataset`, which can be used to load a dataset from local files. - cfg.dataset.preprocessing_fn (str, optional): The name/import path of + dataset.preprocessing_fn (str, optional): The name/import path of the preprocessing function to use for formatting the data examples. 
If ``None`` (default), the builder will use the preprocessing function registered under `hf_name` (see `tasks.py`), if one exists, @@ -76,30 +75,30 @@ def build_finetuning_dataloader( `from import.path import function_name` and use the imported function as the preprocessing function. *** Streaming dataset config fields *** - cfg.dataset.remote (str, optional): Location of a MDS-formatted + dataset.remote (str, optional): Location of a MDS-formatted streaming dataset to use. Setting this will tell the builder to create a streaming dataset rather than a HuggingFace dataset. - cfg.dataset.local (str, optional): Local path where remote data + dataset.local (str, optional): Local path where remote data will be streamed to. Only valid if `cfg.dataset.remote` has also been set. *** Shared dataset configs fields *** - cfg.dataset.max_seq_len (int): The maximum length of sequences + dataset.max_seq_len (int): The maximum length of sequences in the batch. See :class:`Seq2SeqFinetuningCollator` docstring for details. - cfg.dataset.decoder_only_format (bool): Whether to format the + dataset.decoder_only_format (bool): Whether to format the examples for a decoder-only model. See :class:`Seq2SeqFinetuningCollator` docstring for details. - cfg.dataset.target_responses (str): Which responses are used as training targets. + dataset.target_responses (str): Which responses are used as training targets. Defaults to "last", meaning only the final response in multi-turn examples will serve as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for details. - cfg.dataset.target_prompts (str): Which prompts are used as training targets. + dataset.target_prompts (str): Which prompts are used as training targets. Defaults to "none", meaning prompts are never used as training targets. See :class:`Seq2SeqFinetuningCollator` docstring for details. - cfg.dataset.allow_pad_trimming (bool, optional): Whether to allow + dataset.allow_pad_trimming (bool, optional): Whether to allow the collator to trim padding. See :class:`Seq2SeqFinetuningCollator` docstring for details. Default: ``False``. - cfg.dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes + dataset.packing_ratio (Optional[float, Literal['auto']]): If provided, this invokes a collator wrapper that packs device_batch_size*packing_ratio raw examples into device_batch_size packed examples. This helps minimize padding while preserving sequence integrity. @@ -119,19 +118,19 @@ def build_finetuning_dataloader( statistics, max_seq_len, and tolerance for discarding samples! The script `scripts/misc/profile_packing.py` can help you choose the best packing_ratio. - cfg.dataset.shuffle (bool): Whether to shuffle the dataset. + dataset.shuffle (bool): Whether to shuffle the dataset. ___ See :class:`StreamingFinetuningDataset` for info on other standard config - options within `cfg.dataset` that will be passed as kwargs if + options within `dataset` that will be passed as kwargs if using the streaming codepath. --- - See :class:`DataLoader` for standard argument options to the pytorch - dataloader, such as `cfg.drop_last`, `cfg.num_workers`, etc. tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to prepare the data from raw text. Any missing sentinel tokens will be added by the collator. device_batch_size (int): The size of the batches (number of examples) that the dataloader will produce. + See :class:`DataLoader` for standard argument options to the pytorch + dataloader, such as `drop_last`, `num_workers`, etc. 
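For illustration only, the dataset options documented above could be assembled as a plain Python mapping along the following lines. This is a hedged sketch rather than content from the diff itself: the HF dataset name, split, and packing settings are placeholder values, and the mapping simply mirrors the fields named in this docstring (it would be combined with a tokenizer and device_batch_size when actually building the loader).

# Sketch of the option shape described in the docstring above; values are placeholders.
finetuning_loader_kwargs = {
    'name': 'finetuning',  # dataloader type; must be "finetuning"
    'dataset': {
        'hf_name': 'mosaicml/dolly_hhrlhf',  # any prompt/response dataset
        'split': 'train',
        'max_seq_len': 2048,
        'decoder_only_format': True,
        'target_responses': 'last',   # default: only the final response is a training target
        'target_prompts': 'none',     # default: prompts are never training targets
        'packing_ratio': 'auto',      # profile and select a packing ratio automatically
        'shuffle': True,
    },
    # standard torch DataLoader options
    'drop_last': True,
    'num_workers': 8,
}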
Returns: A pytorch dataloader diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 9034e96920..1f8881b942 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -32,28 +32,27 @@ class ComposerHFCausalLM(HuggingFaceModelWithFSDP): """Configures a :class:`.HuggingFaceModel` around a Causal LM. Args: - om_model_config (DictConfig): An OmegaConf DictConfig specifying the configuration options - cfg.pretrained_model_name_or_path (str): The name of or local path to - the HF Causal LM (e.g., `gpt2` to instantiate a GPT2LMHeadModel). - cfg.config_overrides (dict, optional): An optional dictionary of keyword - arguments that override the default configuration associated with - cfg.pretrained_model_name_or_path. - cfg.pretrained (bool): Whether to instantiate the model with pre-trained - weights coming from cfg.pretrained_model_name_or_path. If ``True``, - cfg.config_overrides must be compatible with the pre-trained weights. - cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to - initialize the model on. Currently, `meta` is only supported when - cfg.pretrained is ``False``. Default: ``'cpu'``. - cfg.peft_config (dict, optional): An optional dictionary of keyword arguments to be - passed to the PeftConfig constructor. If provided, the model will be wrapped in a PeftModel. - cfg.trust_remote_code (bool, optional): Whether to trust remote code when loading from Hugging Face - Hub. Default: ``True``. - cfg.use_auth_token (bool, optional): Whether to use the Hugging Face authentication token when - loading from Hugging Face Hub. Default: ``False``. - cfg.use_train_metrics (bool, optional): Whether to use training metrics. Default: ``True``. - cfg.load_in_8bit (bool, optional): Whether to load the model in 8-bit mode. Default: ``False``. - cfg.init_device (str, optional): Which device to initialize the model on. Default: ``'cpu'``. - cfg.use_flash_attention_2 (bool, optional): Whether to use flash-attention 2. Default: ``False``. + pretrained_model_name_or_path (str): The name of or local path to + the HF Causal LM (e.g., `gpt2` to instantiate a GPT2LMHeadModel). + config_overrides (dict, optional): An optional dictionary of keyword + arguments that override the default configuration associated with + cfg.pretrained_model_name_or_path. + pretrained (bool): Whether to instantiate the model with pre-trained + weights coming from cfg.pretrained_model_name_or_path. If ``True``, + cfg.config_overrides must be compatible with the pre-trained weights. + init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to + initialize the model on. Currently, `meta` is only supported when + cfg.pretrained is ``False``. Default: ``'cpu'``. + peft_config (dict, optional): An optional dictionary of keyword arguments to be + passed to the PeftConfig constructor. If provided, the model will be wrapped in a PeftModel. + trust_remote_code (bool, optional): Whether to trust remote code when loading from Hugging Face + Hub. Default: ``True``. + use_auth_token (bool, optional): Whether to use the Hugging Face authentication token when + loading from Hugging Face Hub. Default: ``False``. + use_train_metrics (bool, optional): Whether to use training metrics. Default: ``True``. + load_in_8bit (bool, optional): Whether to load the model in 8-bit mode. Default: ``False``. + init_device (str, optional): Which device to initialize the model on. Default: ``'cpu'``. 
+ use_flash_attention_2 (bool, optional): Whether to use flash-attention 2. Default: ``False``. tokenizer (PreTrainedTokenizer): The tokenizer that the model will use. """ diff --git a/llmfoundry/models/hf/hf_t5.py b/llmfoundry/models/hf/hf_t5.py index cf6f6d0ece..409093f271 100644 --- a/llmfoundry/models/hf/hf_t5.py +++ b/llmfoundry/models/hf/hf_t5.py @@ -28,18 +28,17 @@ class ComposerHFT5(HuggingFaceModelWithFSDP): will expand support to more general classes of HF Encoder-Decoder models. Args: - cfg (DictConfig): An omegaconf dictionary used to configure the model: - cfg.pretrained_model_name_or_path (str): The name of or local path to - the HF model (e.g., `t5-base` to instantiate a T5 using the base config). - cfg.config_overrides (dict, optional): An optional dictionary of keyword - arguments that override the default configuration associated with - cfg.pretrained_model_name_or_path. Default: ``{}``. - cfg.pretrained (bool): Whether to instantiate the model with pre-trained - weights coming from cfg.pretrained_model_name_or_path. If ``True``, - cfg.config_overrides must be compatible with the pre-trained weights. - cfg.init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to - initialize the model on. Currently, `meta` is only supported when - cfg.pretrained is ``False``. Default: ``'cpu'``. + pretrained_model_name_or_path (str): The name of or local path to + the HF model (e.g., `t5-base` to instantiate a T5 using the base config). + config_overrides (dict, optional): An optional dictionary of keyword + arguments that override the default configuration associated with + cfg.pretrained_model_name_or_path. Default: ``{}``. + pretrained (bool): Whether to instantiate the model with pre-trained + weights coming from cfg.pretrained_model_name_or_path. If ``True``, + cfg.config_overrides must be compatible with the pre-trained weights. + init_device ('cpu' | 'meta'): Which device, 'cpu' or 'meta', to + initialize the model on. Currently, `meta` is only supported when + cfg.pretrained is ``False``. Default: ``'cpu'``. tokenizer (PreTrainedTokenizer): The tokenizer that the model will use. """ diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index f8a13f5481..65f0751a87 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -7,7 +7,7 @@ import math import warnings from typing import (Any, Callable, Dict, List, Literal, Mapping, Optional, Set, - Tuple, Union) + Tuple, TypeVar, Union) from composer.utils import dist from omegaconf import DictConfig, ListConfig @@ -55,6 +55,11 @@ def to_container_recursive( cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], List[Dict[str, Any]]]] ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: + """Converts a DictConfig or ListConfig to a dict or list recursively. + + `omegaconf.to_container` does not handle nested DictConfig or ListConfig + objects, so this function is used to convert them to dicts or lists. 
+ """ def rh(x: Any) -> Any: # recursive helper if isinstance(x, DictConfig): @@ -67,11 +72,16 @@ def rh(x: Any) -> Any: # recursive helper return rh(cfg) +T = TypeVar('T') + + def make_dataclass_and_log_config( - cfg: DictConfig, dataclass_constructor: Callable[..., Any], + cfg: DictConfig, + dataclass_constructor: Callable[..., T], dataclass_fields: Set[str], - transforms: Optional[List[Callable[[Dict[str, Any]], Dict[str, Any]]]] -) -> Tuple[DictConfig, Any]: + transforms: Optional[List[Callable[[Dict[str, Any]], Dict[str, + Any]]]] = None +) -> Tuple[Dict[str, Any], T]: """Converts a DictConfig to a dataclass and creates a logged config.""" # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) @@ -111,7 +121,7 @@ def make_dataclass_and_log_config( unstructured_config['variables'][key] = unstructured_config.pop(key) # Create copy of config for logging - logged_cfg: DictConfig = copy.deepcopy(DictConfig(unstructured_config)) + logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) # apply transforms to the unstructured config before constructing dataclass for transform in transforms or []: @@ -119,10 +129,10 @@ def make_dataclass_and_log_config( logged_cfg.update(unstructured_config, merge=True) - eval_config: DictConfig = om.structured( + dataclass_config: T = om.structured( dataclass_constructor(**unstructured_config)) - return logged_cfg, eval_config + return logged_cfg, dataclass_config def pop_config(cfg: Union[Dict[str, Any], DictConfig], @@ -262,7 +272,7 @@ def process_init_device(model_cfg: Dict[str, Any], fsdp_config: Optional[Dict]): return init_context -def log_config(cfg: DictConfig) -> None: +def log_config(cfg: Dict[str, Any]) -> None: """Logs the current config and updates the wandb and mlflow configs. 
This function can be called multiple times to update the wandb and MLflow @@ -275,7 +285,7 @@ def log_config(cfg: DictConfig) -> None: except ImportError as e: raise e if wandb.run: - wandb.config.update(om.to_container(cfg, resolve=True)) + wandb.config.update(cfg) if 'mlflow' in cfg.get('loggers', {}): try: @@ -283,4 +293,4 @@ def log_config(cfg: DictConfig) -> None: except ImportError as e: raise e if mlflow.active_run(): - mlflow.log_params(params=om.to_container(cfg, resolve=True)) + mlflow.log_params(params=cfg) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index fde75db318..f03c1f0f80 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -56,7 +56,7 @@ def evaluate_model( icl_subset_num_batches: Optional[int], callback_configs: Optional[Dict[str, Any]], metadata: Optional[Dict[str, str]], - logged_config: DictConfig, + logged_config: Dict[str, Any], should_log_config: bool = True, load_path: Optional[str] = None, ): @@ -211,7 +211,7 @@ class EvalConfig: def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: - cfgs: Tuple[DictConfig, EvalConfig] = make_dataclass_and_log_config( + cfgs: Tuple[Dict[str, Any], EvalConfig] = make_dataclass_and_log_config( cfg, EvalConfig, EVAL_CONFIG_KEYS) logged_cfg, eval_config = cfgs diff --git a/scripts/train/train.py b/scripts/train/train.py index 0b704ec561..e22728740a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -231,7 +231,7 @@ def validate_config(train_config: TrainConfig): ) -def _log_num_params(model: ComposerModel, logged_cfg: DictConfig): +def _log_num_params(model: ComposerModel, logged_cfg: Dict[str, Any]): # Log number of parameters if hasattr(model, 'n_total_params'): n_params = model.n_total_params @@ -252,7 +252,7 @@ def _log_num_params(model: ComposerModel, logged_cfg: DictConfig): def main(cfg: DictConfig) -> Trainer: - cfgs: Tuple[DictConfig, TrainConfig] = make_dataclass_and_log_config( + cfgs: Tuple[Dict[str, Any], TrainConfig] = make_dataclass_and_log_config( cfg, TrainConfig, TRAIN_CONFIG_KEYS, From 31111a795200a37df90f6cddfaf3053ee5249730 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 04:52:05 +0000 Subject: [PATCH 138/201] flag icl tasks required or not --- llmfoundry/utils/config_utils.py | 15 ++++++++------- scripts/eval/eval.py | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 65f0751a87..db29bcf5c5 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -76,12 +76,12 @@ def rh(x: Any) -> Any: # recursive helper def make_dataclass_and_log_config( - cfg: DictConfig, - dataclass_constructor: Callable[..., T], - dataclass_fields: Set[str], - transforms: Optional[List[Callable[[Dict[str, Any]], Dict[str, - Any]]]] = None -) -> Tuple[Dict[str, Any], T]: + cfg: DictConfig, + dataclass_constructor: Callable[..., T], + dataclass_fields: Set[str], + transforms: Optional[List[Callable[[Dict[str, Any]], + Dict[str, Any]]]] = None, + icl_tasks_required: bool = False) -> Tuple[Dict[str, Any], T]: """Converts a DictConfig to a dataclass and creates a logged config.""" # Resolve all interpolation variables as early as possible unstructured_config = om.to_container(cfg, resolve=True) @@ -106,7 +106,8 @@ def make_dataclass_and_log_config( unstructured_config['icl_tasks_str'] = unstructured_config.pop( 'icl_tasks') else: - raise ValueError('icl_tasks must be specified in the config') + if icl_tasks_required: + raise ValueError('icl_tasks must be 
specified in the config') arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, dataclass_fields) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index f03c1f0f80..34e0cba7a3 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -212,7 +212,7 @@ class EvalConfig: def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: cfgs: Tuple[Dict[str, Any], EvalConfig] = make_dataclass_and_log_config( - cfg, EvalConfig, EVAL_CONFIG_KEYS) + cfg, EvalConfig, EVAL_CONFIG_KEYS, icl_tasks_required=True) logged_cfg, eval_config = cfgs # Run user provided code if specified From 8c4aaa47858b1814aa479ceec17edde1846bf66b Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 05:40:10 +0000 Subject: [PATCH 139/201] updated a couple yamls --- scripts/eval/yamls/hf_8bit_eval.yaml | 15 ++++--- scripts/eval/yamls/hf_eval.yaml | 24 +++++------ scripts/eval/yamls/hf_lora_eval.yml | 23 +++++----- scripts/eval/yamls/mpt_eval.yaml | 13 +++--- .../yamls/finetune/1b_local_data_sft.yaml | 11 ++--- .../train/yamls/finetune/7b_dolly_sft.yaml | 14 +++--- .../train/yamls/finetune/dbrx-full-ft.yaml | 16 ++++--- .../train/yamls/finetune/dbrx-lora-ft.yaml | 16 ++++--- .../gpt2-arc-easy-cpu-streaming-dataset.yaml | 23 +++++----- .../yamls/finetune/mpt-30b-instruct.yaml | 31 +++++++------ .../yamls/finetune/mpt-7b_dolly_sft.yaml | 19 ++++---- .../yamls/finetune/mpt-7b_domain_adapt.yaml | 31 +++++++------ .../yamls/finetune/t5-small_dolly_sft.yaml | 13 +++--- .../train/yamls/pretrain/gpt-neo-125m.yaml | 43 +++++++++++-------- .../yamls/pretrain/gpt-neo-125m_eval.yaml | 43 +++++++++++-------- scripts/train/yamls/pretrain/gpt2-small.yaml | 43 +++++++++++-------- scripts/train/yamls/pretrain/mpt-125m.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-13b.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-1b.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-30b.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-350m.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-3b.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-70b.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-760m.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/mpt-7b.yaml | 38 ++++++++-------- .../train/yamls/pretrain/mpt-small-cpu.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/opt-3b.yaml | 40 +++++++++-------- scripts/train/yamls/pretrain/testing-moe.yaml | 38 ++++++++-------- scripts/train/yamls/pretrain/testing.yaml | 38 ++++++++-------- 29 files changed, 487 insertions(+), 387 deletions(-) diff --git a/scripts/eval/yamls/hf_8bit_eval.yaml b/scripts/eval/yamls/hf_8bit_eval.yaml index 3bf3c23414..482c6d7da7 100644 --- a/scripts/eval/yamls/hf_8bit_eval.yaml +++ b/scripts/eval/yamls/hf_8bit_eval.yaml @@ -1,22 +1,23 @@ -max_seq_len: 1024 +variables: + model_name_or_path: bigscience/bloom-1b7 + seed: 1 precision: amp_fp16 - -model_name_or_path: bigscience/bloom-1b7 +max_seq_len: 1024 models: - - model_name: ${model_name_or_path} + model_name: ${variables.model_name_or_path} model: name: hf_causal_lm - pretrained_model_name_or_path: ${model_name_or_path} + pretrained_model_name_or_path: ${variables.model_name_or_path} init_device: mixed pretrained: true load_in_8bit: true tokenizer: - name: ${model_name_or_path} + name: ${variables.model_name_or_path} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} device_eval_batch_size: 4 diff --git a/scripts/eval/yamls/hf_eval.yaml 
b/scripts/eval/yamls/hf_eval.yaml index 9eb0245f9a..15e53edcaa 100644 --- a/scripts/eval/yamls/hf_eval.yaml +++ b/scripts/eval/yamls/hf_eval.yaml @@ -1,23 +1,23 @@ -max_seq_len: 1024 -seed: 1 -precision: fp32 +variables: + # If you are using one model, put it here: + model_name_or_path: EleutherAI/gpt-neo-125m + # otherwise, write a block for each model you want to test in the `models` section -# If you are using one model, put it here: -model_name_or_path: EleutherAI/gpt-neo-125m -# otherwise, write a block for each model you want to test in the `models` section +precision: fp32 +max_seq_len: 1024 models: - - model_name: ${model_name_or_path} + model_name: ${variables.model_name_or_path} model: name: hf_causal_lm - pretrained_model_name_or_path: ${model_name_or_path} + pretrained_model_name_or_path: ${variables.model_name_or_path} init_device: mixed pretrained: true tokenizer: - name: ${model_name_or_path} + name: ${variables.model_name_or_path} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # # if you are evaluating more than one model, list them all as YAML blocks without variable interpolation # - # model_name: mosaicml/mpt-7b @@ -27,11 +27,11 @@ models: # init_device: cpu # pretrained: true # config_overrides: -# max_seq_len: ${max_seq_len} +# max_seq_len: ${variables.max_seq_len} # tokenizer: # name: mosaicml/mpt-7b # kwargs: -# model_max_length: ${max_seq_len} +# model_max_length: ${variables.max_seq_len} device_eval_batch_size: 4 diff --git a/scripts/eval/yamls/hf_lora_eval.yml b/scripts/eval/yamls/hf_lora_eval.yml index 08861b8569..f2bc637cac 100644 --- a/scripts/eval/yamls/hf_lora_eval.yml +++ b/scripts/eval/yamls/hf_lora_eval.yml @@ -1,25 +1,26 @@ -max_seq_len: 2048 +variables: + model_name_or_path: facebook/opt-350m + # If you are using a seperated lora weight, put it here: + # lora weights must be compatible with the specified model + lora_id_or_path: ybelkada/opt-350m-lora # Example lora weights for opt-350m + seed: 1 precision: amp_fp16 - -model_name_or_path: facebook/opt-350m -# If you are using a seperated lora weight, put it here: -# lora weights must be compatible with the specified model -lora_id_or_path: ybelkada/opt-350m-lora # Example lora weights for opt-350m +max_seq_len: 2048 models: - - model_name: ${model_name_or_path} + model_name: ${variables.model_name_or_path} model: name: hf_causal_lm - pretrained_model_name_or_path: ${model_name_or_path} + pretrained_model_name_or_path: ${variables.model_name_or_path} init_device: mixed pretrained: true - pretrained_lora_id_or_path: ${lora_id_or_path} + pretrained_lora_id_or_path: ${variables.lora_id_or_path} tokenizer: - name: ${model_name_or_path} + name: ${variables.model_name_or_path} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} device_eval_batch_size: 4 diff --git a/scripts/eval/yamls/mpt_eval.yaml b/scripts/eval/yamls/mpt_eval.yaml index f59a73f15b..5274bd0b9d 100644 --- a/scripts/eval/yamls/mpt_eval.yaml +++ b/scripts/eval/yamls/mpt_eval.yaml @@ -1,16 +1,19 @@ -max_seq_len: 1024 -tokenizer_name: EleutherAI/gpt-neox-20b +variables: + tokenizer_name: EleutherAI/gpt-neox-20b + max_seq_len: 1024 + seed: 1 precision: amp_fp16 +max_seq_len: ${variables.max_seq_len} models: - model_name: mpt_test # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} model: name: mpt_causal_lm init_device: mixed @@ -19,7 +22,7 @@ models: 
n_heads: 12 n_layers: 12 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash diff --git a/scripts/train/yamls/finetune/1b_local_data_sft.yaml b/scripts/train/yamls/finetune/1b_local_data_sft.yaml index cec9febf68..08f25cce21 100644 --- a/scripts/train/yamls/finetune/1b_local_data_sft.yaml +++ b/scripts/train/yamls/finetune/1b_local_data_sft.yaml @@ -6,8 +6,9 @@ variables: global_seed: 17 + max_seq_len: 2048 -max_seq_len: 2048 +max_seq_len: ${variables.max_seq_len} # Run Name run_name: # If left blank, will be read from env var $RUN_NAME @@ -21,7 +22,7 @@ model: n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention n_layers: 24 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -32,7 +33,7 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Local data to load into huggingface datasets dataset: &hf_dataset @@ -47,7 +48,7 @@ train_loader: &train_loader dataset: <<: *hf_dataset split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true shuffle: true @@ -99,7 +100,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 128 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/finetune/7b_dolly_sft.yaml b/scripts/train/yamls/finetune/7b_dolly_sft.yaml index d46393bd8a..f9edba3716 100644 --- a/scripts/train/yamls/finetune/7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/7b_dolly_sft.yaml @@ -6,11 +6,13 @@ variables: global_seed: 17 + max_seq_len: 2048 + run_name: # If left blank, will be read from env var $RUN_NAME -max_seq_len: 2048 +max_seq_len: ${variables.max_seq_len} # Run Name -run_name: # If left blank, will be read from env var $RUN_NAME +run_name: ${variables.run_name} # Model model: @@ -20,7 +22,7 @@ model: n_heads: 32 n_layers: 32 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -31,7 +33,7 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: @@ -39,7 +41,7 @@ train_loader: dataset: hf_name: HuggingFaceH4/databricks_dolly_15k split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true shuffle: true @@ -87,7 +89,7 @@ eval_interval: 1 # this is the only allowed value for no eval global_train_batch_size: 64 # assuming 8 gpus # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/finetune/dbrx-full-ft.yaml b/scripts/train/yamls/finetune/dbrx-full-ft.yaml index c48f269788..24fd4cb126 100644 --- a/scripts/train/yamls/finetune/dbrx-full-ft.yaml +++ b/scripts/train/yamls/finetune/dbrx-full-ft.yaml @@ -2,9 +2,13 @@ variables: # Run Name run_name: # If left blank, will be read from env var $RUN_NAME -# Note: This requires ~64x80GB GPUs -max_seq_len: 4096 -icl_seq_len: 1024 + # Note: This requires ~64x80GB GPUs + max_seq_len: 4096 + icl_seq_len: 1024 + +run_name: ${variables.run_name} 
+max_seq_len: ${variables.max_seq_len} +icl_seq_len: ${variables.icl_seq_len} # Model model: @@ -20,7 +24,7 @@ model: tokenizer: name: databricks/dbrx-instruct kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} trust_remote_code: true # Dataloaders @@ -30,7 +34,7 @@ train_loader: split: train hf_name: mosaicml/dolly_hhrlhf shuffle: true - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} eos_token_id: 0 packing_ratio: auto allow_pad_trimming: false @@ -47,7 +51,7 @@ eval_loader: split: test hf_name: mosaicml/dolly_hhrlhf shuffle: false - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} packing_ratio: null allow_pad_trimming: false decoder_only_format: true diff --git a/scripts/train/yamls/finetune/dbrx-lora-ft.yaml b/scripts/train/yamls/finetune/dbrx-lora-ft.yaml index dacb2c8563..87950134bd 100644 --- a/scripts/train/yamls/finetune/dbrx-lora-ft.yaml +++ b/scripts/train/yamls/finetune/dbrx-lora-ft.yaml @@ -2,10 +2,14 @@ variables: # Note: This requires ~16x80GB GPUs icl_seq_len: 1024 -max_seq_len: 4096 + max_seq_len: 4096 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +icl_seq_len: ${variables.icl_seq_len} +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -29,7 +33,7 @@ model: tokenizer: name: databricks/dbrx-instruct kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} trust_remote_code: true # Dataloaders @@ -39,7 +43,7 @@ train_loader: split: train hf_name: mosaicml/dolly_hhrlhf shuffle: true - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} eos_token_id: 0 packing_ratio: auto allow_pad_trimming: false @@ -56,7 +60,7 @@ eval_loader: split: test hf_name: mosaicml/dolly_hhrlhf shuffle: false - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} packing_ratio: null allow_pad_trimming: false decoder_only_format: true diff --git a/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml b/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml index 2de4e29795..95a70acfd7 100644 --- a/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml +++ b/scripts/train/yamls/finetune/gpt2-arc-easy-cpu-streaming-dataset.yaml @@ -1,13 +1,16 @@ variables: global_seed: 17 -max_seq_len: 512 + max_seq_len: 512 -data_local: ./my_data -data_remote: # If blank, files must be present in data_local + data_local: ./my_data + data_remote: # If blank, files must be present in data_local -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -19,7 +22,7 @@ model: tokenizer: name: gpt2 kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: @@ -28,12 +31,12 @@ train_loader: ############ streams: my_data: - remote: ${data_remote} - local: ${data_local} + remote: ${variables.data_remote} + local: ${variables.data_local} split: train ############ shuffle: true - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} decoder_only_format: true drop_last: true num_workers: 8 @@ -65,7 +68,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 8 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 
device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/finetune/mpt-30b-instruct.yaml b/scripts/train/yamls/finetune/mpt-30b-instruct.yaml index 373f3b379b..3ef41e0aa1 100644 --- a/scripts/train/yamls/finetune/mpt-30b-instruct.yaml +++ b/scripts/train/yamls/finetune/mpt-30b-instruct.yaml @@ -2,10 +2,15 @@ variables: tokenizer_name: mosaicml/mpt-30b global_seed: 17 -max_seq_len: 8192 + max_seq_len: 8192 -# Run Name -run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME + + icl_max_seq_len: 2048 + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +19,7 @@ model: pretrained_model_name_or_path: mosaicml/mpt-30b init_device: mixed config_overrides: - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} attn_config: attn_impl: flash # Note: we still use packing, but turn this off for memory. @@ -23,9 +28,9 @@ model: # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: @@ -33,7 +38,7 @@ train_loader: dataset: hf_name: mosaicml/instruct-v3 split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true packing_ratio: 9 @@ -50,7 +55,7 @@ eval_loader: dataset: hf_name: mosaicml/instruct-v3 split: test - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true packing_ratio: 9 @@ -89,7 +94,7 @@ eval_first: true global_train_batch_size: 72 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 1 precision: amp_bf16 @@ -122,7 +127,7 @@ callbacks: # save_interval: 3ep # save_num_checkpoints_to_keep: 1 -icl_max_seq_len: 2048 +icl_max_seq_len: ${variables.icl_max_seq_len} # YOU MUST ADD YOUR OWN DATASET URIs # this section can be removed if you do not want to track these metrics @@ -133,7 +138,7 @@ icl_tasks: num_fewshot: - 0 batch_size: 4 - max_seq_len: ${icl_max_seq_len} + max_seq_len: ${variables.icl_max_seq_len} icl_task_type: multiple_choice metric_names: - InContextLearningMultipleChoiceAccuracy @@ -146,7 +151,7 @@ icl_tasks: num_fewshot: - 0 batch_size: 4 - max_seq_len: ${icl_max_seq_len} + max_seq_len: ${variables.icl_max_seq_len} icl_task_type: multiple_choice metric_names: - InContextLearningMultipleChoiceAccuracy @@ -159,7 +164,7 @@ icl_tasks: num_fewshot: - 0 batch_size: 4 - max_seq_len: ${icl_max_seq_len} + max_seq_len: ${variables.icl_max_seq_len} icl_task_type: multiple_choice metric_names: - InContextLearningMultipleChoiceAccuracy diff --git a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml index bd100dd01c..3634fd259e 100644 --- a/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_dolly_sft.yaml @@ -1,17 +1,20 @@ variables: global_seed: 17 -max_seq_len: 2048 + max_seq_len: 2048 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} model: name: hf_causal_lm pretrained: true pretrained_model_name_or_path: mosaicml/mpt-7b config_overrides: - max_seq_len: ${max_seq_len} + 
max_seq_len: ${variables.max_seq_len} attn_config: attn_impl: flash # Set this to `true` if using `train_loader.dataset.packing_ratio` below @@ -21,7 +24,7 @@ model: tokenizer: name: mosaicml/mpt-7b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders @@ -30,7 +33,7 @@ train_loader: dataset: hf_name: mosaicml/dolly_hhrlhf split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with @@ -53,7 +56,7 @@ eval_loader: dataset: hf_name: mosaicml/dolly_hhrlhf split: test - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: true # packing_ratio: @@ -93,7 +96,7 @@ eval_first: true global_train_batch_size: 48 # somewhere in the 6-8 * numgpus range seems good # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml b/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml index 3dcdb95e7a..9357ef7771 100644 --- a/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml +++ b/scripts/train/yamls/finetune/mpt-7b_domain_adapt.yaml @@ -1,9 +1,12 @@ variables: global_seed: 17 -data_local: ./my-adaptation-data -data_remote: # If blank, files must be present in data_local -max_seq_len: 4096 + data_local: ./my-adaptation-data + data_remote: # If blank, files must be present in data_local + + max_seq_len: 4096 + +max_seq_len: ${variables.max_seq_len} # Run Name run_name: # If left blank, will be read from env var $COMPOSER_RUN_NAME @@ -14,7 +17,7 @@ model: pretrained: true pretrained_model_name_or_path: mosaicml/mpt-7b config_overrides: - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} attn_config: attn_impl: flash attn_uses_sequence_id: false @@ -23,31 +26,31 @@ model: tokenizer: name: mosaicml/mpt-7b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train_small shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val_small shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -78,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 1024 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml index 257c088c9e..d394018cfc 100644 --- a/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml +++ b/scripts/train/yamls/finetune/t5-small_dolly_sft.yaml @@ -2,10 +2,13 @@ variables: global_seed: 17 model_name: t5-small -max_seq_len: 1024 + max_seq_len: 1024 -# Run Name -run_name: # If left blank, 
will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -23,7 +26,7 @@ train_loader: dataset: hf_name: HuggingFaceH4/databricks_dolly_15k split: train - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} allow_pad_trimming: false decoder_only_format: false shuffle: true @@ -64,7 +67,7 @@ eval_interval: 1 # this is the only allowed value for no eval global_train_batch_size: 64 # assuming 8 gpus # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml index 2791acc935..5f02ba47e6 100644 --- a/scripts/train/yamls/pretrain/gpt-neo-125m.yaml +++ b/scripts/train/yamls/pretrain/gpt-neo-125m.yaml @@ -1,14 +1,19 @@ # Pretrain a gpt-neo-125m style model # this is NOT a finetuning run -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -tokenizer_name: gpt2 -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + tokenizer_name: gpt2 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + max_seq_len: 2048 + + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -17,37 +22,37 @@ model: config_overrides: # WARNING: if setting `pretrained: true`, `max_position_embeddings` must match the # `max_position_embeddings` used during pre-training - max_position_embeddings: ${max_seq_len} + max_position_embeddings: ${variables.max_seq_len} pretrained: false # false: only use the architecture; true: initialize with pretrained weights # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -78,7 +83,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml index b2d71ad762..fe9828b50a 100644 --- a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml +++ b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml @@ -1,14 +1,19 @@ # Pretrain a gpt-neo-125m style model # this is NOT a finetuning run -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local 
-tokenizer_name: EleutherAI/gpt-neo-125M -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + tokenizer_name: EleutherAI/gpt-neo-125M + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + max_seq_len: 2048 + + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -17,37 +22,37 @@ model: config_overrides: # WARNING: if setting `pretrained: true`, `max_position_embeddings` must match the # `max_position_embeddings` used during pre-training - max_position_embeddings: ${max_seq_len} + max_position_embeddings: ${variables.max_seq_len} pretrained: false # false: only use the architecture; true: initialize with pretrained weights # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -78,7 +83,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/gpt2-small.yaml b/scripts/train/yamls/pretrain/gpt2-small.yaml index 52d0f8cb73..458f6869da 100644 --- a/scripts/train/yamls/pretrain/gpt2-small.yaml +++ b/scripts/train/yamls/pretrain/gpt2-small.yaml @@ -1,14 +1,19 @@ # Pretrain a gpt2 style model # this is NOT a finetuning run -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -tokenizer_name: gpt2 -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + tokenizer_name: gpt2 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + max_seq_len: 2048 + + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -17,37 +22,37 @@ model: config_overrides: # WARNING: if setting `pretrained: true`, `max_position_embeddings` must match the # `max_position_embeddings` used during pre-training - n_positions: ${max_seq_len} + n_positions: ${variables.max_seq_len} pretrained: false # false: only use the architecture; true: initialize with pretrained weights # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} 
split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -78,7 +83,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-125m.yaml b/scripts/train/yamls/pretrain/mpt-125m.yaml index 78dc789e7d..644dfc26c1 100644 --- a/scripts/train/yamls/pretrain/mpt-125m.yaml +++ b/scripts/train/yamls/pretrain/mpt-125m.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 12 n_layers: 12 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 16 device_train_microbatch_size: 16 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-13b.yaml b/scripts/train/yamls/pretrain/mpt-13b.yaml index 782c01f1f0..41002bb45d 100644 --- a/scripts/train/yamls/pretrain/mpt-13b.yaml +++ b/scripts/train/yamls/pretrain/mpt-13b.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: 
${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 40 n_layers: 40 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 1024 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-1b.yaml b/scripts/train/yamls/pretrain/mpt-1b.yaml index 3744a455a8..39b18a09e4 100644 --- a/scripts/train/yamls/pretrain/mpt-1b.yaml +++ b/scripts/train/yamls/pretrain/mpt-1b.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 16 # Modified 24->16 so that d_head == 128 to statisfy FlashAttention n_layers: 24 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 512 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-30b.yaml b/scripts/train/yamls/pretrain/mpt-30b.yaml index 
6b82407c63..3627c36dd0 100644 --- a/scripts/train/yamls/pretrain/mpt-30b.yaml +++ b/scripts/train/yamls/pretrain/mpt-30b.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 56 n_layers: 48 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 2048 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-350m.yaml b/scripts/train/yamls/pretrain/mpt-350m.yaml index 63bc6169a1..ebe8da715f 100644 --- a/scripts/train/yamls/pretrain/mpt-350m.yaml +++ b/scripts/train/yamls/pretrain/mpt-350m.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 16 n_layers: 24 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: 
${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-3b.yaml b/scripts/train/yamls/pretrain/mpt-3b.yaml index 74d422398d..615f59ee3f 100644 --- a/scripts/train/yamls/pretrain/mpt-3b.yaml +++ b/scripts/train/yamls/pretrain/mpt-3b.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 20 # Modified 32->20 so that d_head == 128 to statisfy FlashAttention n_layers: 32 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 512 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-70b.yaml b/scripts/train/yamls/pretrain/mpt-70b.yaml index 8e6856ceb8..55450a8bfc 100644 --- a/scripts/train/yamls/pretrain/mpt-70b.yaml +++ b/scripts/train/yamls/pretrain/mpt-70b.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 64 n_layers: 80 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: 
EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 2048 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-760m.yaml b/scripts/train/yamls/pretrain/mpt-760m.yaml index f11f199036..5c1f0bdbdc 100644 --- a/scripts/train/yamls/pretrain/mpt-760m.yaml +++ b/scripts/train/yamls/pretrain/mpt-760m.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 12 # Modified 16->12 so that d_head == 128 to statisfy FlashAttention n_layers: 24 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-7b.yaml b/scripts/train/yamls/pretrain/mpt-7b.yaml index 831383168f..b97f3f2c9e 100644 --- a/scripts/train/yamls/pretrain/mpt-7b.yaml +++ b/scripts/train/yamls/pretrain/mpt-7b.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 2048 -global_seed: 17 
+variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 2048 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 32 n_layers: 32 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: flash @@ -23,30 +27,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -77,7 +81,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 1024 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 8 device_train_microbatch_size: 8 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/mpt-small-cpu.yaml b/scripts/train/yamls/pretrain/mpt-small-cpu.yaml index 1f50c68a74..b579723002 100644 --- a/scripts/train/yamls/pretrain/mpt-small-cpu.yaml +++ b/scripts/train/yamls/pretrain/mpt-small-cpu.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 128 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 128 + global_seed: 17 -# Run Name -run_name: mpt_causal_lm_cpu # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: mpt_causal_lm_cpu # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 4 n_layers: 4 expansion_ratio: 5 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: torch @@ -24,30 +28,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 2 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 2 @@ -79,7 
+83,7 @@ global_train_batch_size: 256 autoresume: false # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 16 device_train_microbatch_size: 16 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/opt-3b.yaml b/scripts/train/yamls/pretrain/opt-3b.yaml index 65b73257c2..31b7bf255b 100644 --- a/scripts/train/yamls/pretrain/opt-3b.yaml +++ b/scripts/train/yamls/pretrain/opt-3b.yaml @@ -1,11 +1,15 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -tokenizer_name: facebook/opt-2.7b -max_seq_len: 256 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + tokenizer_name: facebook/opt-2.7b + max_seq_len: 256 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -15,32 +19,32 @@ model: # Tokenizer tokenizer: - name: ${tokenizer_name} + name: ${variables.tokenizer_name} kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -71,7 +75,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 4 device_train_microbatch_size: 4 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/testing-moe.yaml b/scripts/train/yamls/pretrain/testing-moe.yaml index eea2b999b7..e61e3e451e 100644 --- a/scripts/train/yamls/pretrain/testing-moe.yaml +++ b/scripts/train/yamls/pretrain/testing-moe.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 128 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 128 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -24,7 +28,7 @@ model: n_heads: 2 n_layers: 2 expansion_ratio: 1 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: torch @@ -34,30 +38,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + 
max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -88,7 +92,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 16 device_train_microbatch_size: 16 # device_train_microbatch_size: auto diff --git a/scripts/train/yamls/pretrain/testing.yaml b/scripts/train/yamls/pretrain/testing.yaml index 01ebecafe2..2271be5d6d 100644 --- a/scripts/train/yamls/pretrain/testing.yaml +++ b/scripts/train/yamls/pretrain/testing.yaml @@ -1,10 +1,14 @@ -data_local: ./my-copy-c4 -data_remote: # If blank, files must be present in data_local -max_seq_len: 128 -global_seed: 17 +variables: + data_local: ./my-copy-c4 + data_remote: # If blank, files must be present in data_local + max_seq_len: 128 + global_seed: 17 -# Run Name -run_name: # If left blank, will be read from env var $RUN_NAME + # Run Name + run_name: # If left blank, will be read from env var $RUN_NAME + +max_seq_len: ${variables.max_seq_len} +run_name: ${variables.run_name} # Model model: @@ -14,7 +18,7 @@ model: n_heads: 2 n_layers: 2 expansion_ratio: 4 - max_seq_len: ${max_seq_len} + max_seq_len: ${variables.max_seq_len} vocab_size: 50368 attn_config: attn_impl: torch @@ -24,30 +28,30 @@ model: tokenizer: name: EleutherAI/gpt-neox-20b kwargs: - model_max_length: ${max_seq_len} + model_max_length: ${variables.max_seq_len} # Dataloaders train_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: train shuffle: true - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: true num_workers: 8 eval_loader: name: text dataset: - local: ${data_local} - remote: ${data_remote} + local: ${variables.data_local} + remote: ${variables.data_remote} split: val shuffle: false - max_seq_len: ${max_seq_len} - shuffle_seed: ${global_seed} + max_seq_len: ${variables.max_seq_len} + shuffle_seed: ${variables.global_seed} drop_last: false num_workers: 8 @@ -78,7 +82,7 @@ eval_subset_num_batches: -1 global_train_batch_size: 256 # System -seed: ${global_seed} +seed: ${variables.global_seed} device_eval_batch_size: 16 device_train_microbatch_size: 16 # device_train_microbatch_size: auto From 4758af34e38a2b78910ad31d77c20c86020df2e0 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 05:42:57 +0000 Subject: [PATCH 140/201] updated train and eval scripts --- scripts/eval/eval.py | 1 - scripts/train/train.py | 7 +------ 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 34e0cba7a3..3c4f760cf2 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -193,7 +193,6 @@ class EvalConfig: seed: int = 17 precision: str = 'amp_bf16' run_name: Optional[str] = None - model_name_or_path: Optional[str] = None metadata: Optional[Dict[str, str]] = None # Distributed parameters diff --git a/scripts/train/train.py b/scripts/train/train.py index e22728740a..44ced8b486 100644 --- a/scripts/train/train.py +++ 
b/scripts/train/train.py @@ -115,8 +115,6 @@ class TrainConfig: # Dataloader device_train_microbatch_size: Union[str, int] = 'auto' - data_local: Optional[str] = None - data_remote: Optional[str] = None # Eval dataloader eval_subset_num_batches: int = -1 @@ -136,10 +134,7 @@ class TrainConfig: # Profiling profiler: Optional[Dict[str, Any]] = None - # Ignore keys - global_seed: Optional[int] = None - global_train_batch_size: Optional[int] = None - n_gpus: Optional[int] = None + # Variables to ignore variables: Optional[Dict[str, Any]] = None From 169f1a3b06c7e8e981b61a1869163156e914170a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 06:00:18 +0000 Subject: [PATCH 141/201] un-delete global train batch size --- scripts/train/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/train/train.py b/scripts/train/train.py index 44ced8b486..11feb19f9c 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -115,6 +115,7 @@ class TrainConfig: # Dataloader device_train_microbatch_size: Union[str, int] = 'auto' + global_train_batch_size: Optional[int] = None # Eval dataloader eval_subset_num_batches: int = -1 From df955a9991add23962b2de6d120213a5319e60b5 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 06:48:01 +0000 Subject: [PATCH 142/201] fix --- llmfoundry/utils/builders.py | 4 ++-- llmfoundry/utils/config_utils.py | 2 +- scripts/eval/eval.py | 2 +- scripts/eval/yamls/hf_eval.yaml | 7 +++++-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 943fcdf7dc..16881b860a 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -29,7 +29,7 @@ from llmfoundry.eval.datasets.in_context_learning_evaluation import \ get_icl_task_dataloader from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.config_utils import to_dict_recursive +from llmfoundry.utils.config_utils import to_dict_recursive, to_list_recursive from llmfoundry.utils.registry_utils import construct_from_registry from llmfoundry.utils.warnings import VersionedDeprecationWarning @@ -484,7 +484,7 @@ def build_icl_evaluators( log.info(f'Extracting ICL task config from path: {icl_tasks}') with open(icl_tasks, 'r') as icl_f: icl_task_cfg = om.load(icl_f) - icl_tasks_list = to_dict_recursive(icl_task_cfg.icl_tasks) + icl_tasks_list = to_list_recursive(icl_task_cfg.icl_tasks) else: icl_tasks_list = icl_tasks diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index db29bcf5c5..97195a8a0f 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -118,7 +118,7 @@ def make_dataclass_and_log_config( for key in extraneous_keys: warnings.warn( f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. 
Top-level variables are deprecated and will not be supported in future releases.', - DeprecationWarning) + category=DeprecationWarning) unstructured_config['variables'][key] = unstructured_config.pop(key) # Create copy of config for logging diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 3c4f760cf2..2517fdfc24 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -255,7 +255,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: run_name = eval_config.run_name if eval_config.run_name else default_run_name reproducibility.seed_all(eval_config.seed) - dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) + # dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) logging.basicConfig( # Example of format string diff --git a/scripts/eval/yamls/hf_eval.yaml b/scripts/eval/yamls/hf_eval.yaml index 15e53edcaa..708c871d88 100644 --- a/scripts/eval/yamls/hf_eval.yaml +++ b/scripts/eval/yamls/hf_eval.yaml @@ -3,8 +3,11 @@ variables: model_name_or_path: EleutherAI/gpt-neo-125m # otherwise, write a block for each model you want to test in the `models` section -precision: fp32 -max_seq_len: 1024 + precision: fp32 + max_seq_len: 1024 + +precision: ${variables.precision} +max_seq_len: ${variables.max_seq_len} models: - From 060c216b8013f84e196daa0b77c1160baef119a7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 07:04:09 +0000 Subject: [PATCH 143/201] I don't understand why this doesn't work --- scripts/eval/eval.py | 2 +- tests/a_scripts/train/test_train.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 2517fdfc24..3c4f760cf2 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -255,7 +255,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: run_name = eval_config.run_name if eval_config.run_name else default_run_name reproducibility.seed_all(eval_config.seed) - # dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) + dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) logging.basicConfig( # Example of format string diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 26dcfb082e..67630d0cd3 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -11,9 +11,11 @@ from omegaconf import DictConfig, ListConfig from omegaconf import OmegaConf as om -from llmfoundry.utils.config_utils import (to_dict_recursive, +from llmfoundry.utils.config_utils import (make_dataclass_and_log_config, + to_dict_recursive, update_batch_size_info) -from scripts.train.train import TrainConfig, main, validate_config # noqa: E402 +from scripts.train.train import TrainConfig # noqa: E402 +from scripts.train.train import TRAIN_CONFIG_KEYS, main, validate_config from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg from tests.fixtures.autouse import REPO_DIR @@ -163,7 +165,9 @@ def test_validate_config(): match= 'MoEs with expert parallelism (.*) require `use_orig_params=True`.' 
): - validate_config(om.structured(TrainConfig(**test_cfg_dict))) + _, cfg_obj = make_dataclass_and_log_config(test_cfg_dict, TrainConfig, + TRAIN_CONFIG_KEYS) + validate_config(cfg_obj) def test_eval_metrics_with_no_train_metrics(tmp_path: pathlib.Path): From feea2d12d3a26457ae13c6c8e49891abcdadba42 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 07:20:28 +0000 Subject: [PATCH 144/201] that was the sneakiest bug I've ever fixed --- tests/data_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/data_utils.py b/tests/data_utils.py index fd24d4cbbf..e407d2b01c 100644 --- a/tests/data_utils.py +++ b/tests/data_utils.py @@ -293,13 +293,13 @@ def gpt_tiny_cfg(dataset_name: str, device: str): test_cfg = om.load(f) assert isinstance(test_cfg, DictConfig) - test_cfg.data_local = dataset_name + test_cfg.variables.data_local = dataset_name test_cfg.global_train_batch_size = 8 test_cfg.device_eval_batch_size = 4 test_cfg.device_train_microbatch_size = 4 test_cfg.max_duration = '4ba' test_cfg.eval_interval = '4ba' - test_cfg.run_name = 'gpt-mini-integration-test' + test_cfg.variables.run_name = 'gpt-mini-integration-test' if device == 'cpu': test_cfg.model.init_device = 'cpu' From adfa1659ac266d8e2b1a1d887cd7308bd20cfdab Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 07:24:04 +0000 Subject: [PATCH 145/201] try to fix the regression test --- llmfoundry/utils/config_utils.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 97195a8a0f..96dfcc958d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -109,6 +109,15 @@ def make_dataclass_and_log_config( if icl_tasks_required: raise ValueError('icl_tasks must be specified in the config') + # Create copy of config for logging + logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) + + # apply transforms to the unstructured config before constructing dataclass + for transform in transforms or []: + unstructured_config = transform(unstructured_config) + + logged_cfg.update(unstructured_config, merge=True) + arg_config_keys = set(unstructured_config.keys()) extraneous_keys = set.difference(arg_config_keys, dataclass_fields) @@ -121,15 +130,6 @@ def make_dataclass_and_log_config( category=DeprecationWarning) unstructured_config['variables'][key] = unstructured_config.pop(key) - # Create copy of config for logging - logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) - - # apply transforms to the unstructured config before constructing dataclass - for transform in transforms or []: - unstructured_config = transform(unstructured_config) - - logged_cfg.update(unstructured_config, merge=True) - dataclass_config: T = om.structured( dataclass_constructor(**unstructured_config)) From 2c1f4d61e5c0f5060320b642b113c79f51e15a28 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 07:33:43 +0000 Subject: [PATCH 146/201] remove device train grad accum --- scripts/train/train.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 11feb19f9c..7663a6f373 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -129,9 +129,6 @@ class TrainConfig: # Resumption autoresume: bool = False - # Gradient accumulation - device_train_grad_accum: Optional[int] = None - # Profiling profiler: Optional[Dict[str, Any]] = None From 774db460c26e01613f4194d1e65833d1bc83e96a Mon Sep 17 
00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 07:34:23 +0000 Subject: [PATCH 147/201] fix validate config --- tests/a_scripts/train/test_train.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 67630d0cd3..0056032442 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -158,15 +158,16 @@ def test_validate_config(): test_cfg: DictConfig = om.load(f) # type: ignore test_cfg.model.ffn_config.moe_world_size = 4 test_cfg.fsdp_config.use_orig_params = False - test_cfg_dict = to_dict_recursive(test_cfg) - test_cfg_dict = update_batch_size_info(test_cfg_dict) with pytest.raises( ValueError, match= 'MoEs with expert parallelism (.*) require `use_orig_params=True`.' ): - _, cfg_obj = make_dataclass_and_log_config(test_cfg_dict, TrainConfig, - TRAIN_CONFIG_KEYS) + _, cfg_obj = make_dataclass_and_log_config( + test_cfg, + TrainConfig, + TRAIN_CONFIG_KEYS, + transforms=[update_batch_size_info]) validate_config(cfg_obj) From c014baaaeda492747a016d2b21a457e394e803d3 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 07:34:54 +0000 Subject: [PATCH 148/201] removed unused import --- tests/a_scripts/train/test_train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/a_scripts/train/test_train.py b/tests/a_scripts/train/test_train.py index 0056032442..86cfbb0138 100644 --- a/tests/a_scripts/train/test_train.py +++ b/tests/a_scripts/train/test_train.py @@ -12,7 +12,6 @@ from omegaconf import OmegaConf as om from llmfoundry.utils.config_utils import (make_dataclass_and_log_config, - to_dict_recursive, update_batch_size_info) from scripts.train.train import TrainConfig # noqa: E402 from scripts.train.train import TRAIN_CONFIG_KEYS, main, validate_config From 05d9c685646fe093375d02c036587ae7909856f8 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 08:01:38 +0000 Subject: [PATCH 149/201] use variables --- tests/data/test_dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 5db45e3afe..4374100191 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -214,7 +214,7 @@ def test_correct_padding(tokenizer_name: str, test_cfg = get_config( conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml') - test_cfg.data_local = data_local + test_cfg.variables.data_local = data_local test_cfg.eval_loader.dataset.split = split test_cfg.dataset = om.create({ 'num_canonical_nodes': 1, From c2e1c4fc8440dcc97a616b39d1cdb7c464dd803a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 08:27:23 +0000 Subject: [PATCH 150/201] missing mandatory value fix --- tests/a_scripts/eval/test_eval_inputs.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/a_scripts/eval/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py index 8694546c4f..47757029bb 100644 --- a/tests/a_scripts/eval/test_eval_inputs.py +++ b/tests/a_scripts/eval/test_eval_inputs.py @@ -38,7 +38,8 @@ def test_mispelled_mandatory_params_fail(self, cfg: DictConfig) -> None: mandatory_configs = ['models', 'icl_tasks'] for p in mandatory_params + mandatory_configs: with pytest.raises((omegaconf.errors.ConfigKeyError, - omegaconf.errors.InterpolationKeyError)): + omegaconf.errors.InterpolationKeyError, + omegaconf.errors.MissingMandatoryValue)): cfg[p + '-mispelled'] = cfg.pop(p) main(cfg) cfg[p] = cfg.pop(p + '-mispelled') From 
e733b9f15a2ed08e12e45be81e56cceb6fcd40fc Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 14:43:38 +0000 Subject: [PATCH 151/201] use correct type of error --- llmfoundry/utils/config_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 96dfcc958d..9108eb7d22 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -10,7 +10,7 @@ Tuple, TypeVar, Union) from composer.utils import dist -from omegaconf import DictConfig, ListConfig +from omegaconf import DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om from llmfoundry.layers_registry import ffns_with_megablocks @@ -107,7 +107,8 @@ def make_dataclass_and_log_config( 'icl_tasks') else: if icl_tasks_required: - raise ValueError('icl_tasks must be specified in the config') + raise MissingMandatoryValue( + 'icl_tasks must be specified in the config') # Create copy of config for logging logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) From 3a5a960356fa68fd7bb477eefcb729307709c3ff Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 16:30:53 +0000 Subject: [PATCH 152/201] fix --- scripts/eval/eval.py | 5 +++-- scripts/train/train.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 3c4f760cf2..788424c00c 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -29,7 +29,8 @@ from llmfoundry.utils.config_utils import (log_config, make_dataclass_and_log_config, process_init_device, - to_container_recursive) + to_container_recursive, + to_list_recursive) from llmfoundry.utils.registry_utils import import_file log = logging.getLogger(__name__) @@ -218,7 +219,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for code_path in (eval_config.code_paths or []): import_file(code_path) - model_configs = eval_config.models + model_configs = to_list_recursive(eval_config.models) eval_gauntlet_config = to_container_recursive( eval_config.eval_gauntlet) or eval_config.eval_gauntlet_str assert eval_gauntlet_config is None or isinstance( diff --git a/scripts/train/train.py b/scripts/train/train.py index 7663a6f373..5fbf016277 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -409,19 +409,23 @@ def main(cfg: DictConfig) -> Trainer: trace_handlers=profiler_trace_handlers, schedule=profiler_schedule) + callback_configs = to_dict_recursive( + train_cfg.callbacks) if train_cfg.callbacks is not None else {} # Callbacks callbacks: List[Callback] = [ build_callback(str(name), callback_cfg, logged_cfg) - for name, callback_cfg in train_cfg.callbacks.items() - ] if train_cfg.callbacks else [] + for name, callback_cfg in callback_configs.items() + ] use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) + algorithm_configs = to_dict_recursive( + train_cfg.algorithms) if train_cfg.algorithms is not None else {} # Algorithms algorithms = [ build_algorithm(str(name), algorithm_cfg) - for name, algorithm_cfg in train_cfg.algorithms.items() - ] if train_cfg.algorithms else None + for name, algorithm_cfg in algorithm_configs.items() + ] # Dataloaders log.info('Building train loader...') From e309366b2e0b2ed7fd1aef065e8cea5d460b5684 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 18:17:12 +0000 Subject: [PATCH 153/201] import TrainConfig just in case? 
--- scripts/inference/convert_composer_to_hf.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py index 51afb105c8..6ac1c4d91e 100644 --- a/scripts/inference/convert_composer_to_hf.py +++ b/scripts/inference/convert_composer_to_hf.py @@ -19,6 +19,8 @@ from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility +from scripts.eval.eval import EvalConfig +from scripts.train.train import TrainConfig def write_huggingface_pretrained_from_composer_checkpoint( From 8704f3f7f64abdab406641a95214bbce88b34bb6 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 18:23:25 +0000 Subject: [PATCH 154/201] moved trainconfig and evalconfig into utils --- llmfoundry/utils/config_utils.py | 144 +++++++++++++++++++- scripts/eval/eval.py | 52 +------ scripts/inference/convert_composer_to_hf.py | 3 +- scripts/train/train.py | 100 +------------- 4 files changed, 149 insertions(+), 150 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 9108eb7d22..e6d43aa969 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -6,11 +6,12 @@ import logging import math import warnings +from dataclasses import dataclass, fields from typing import (Any, Callable, Dict, List, Literal, Mapping, Optional, Set, Tuple, TypeVar, Union) from composer.utils import dist -from omegaconf import DictConfig, ListConfig, MissingMandatoryValue +from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om from llmfoundry.layers_registry import ffns_with_megablocks @@ -27,6 +28,147 @@ ] +@dataclass +class EvalConfig: + # Eval Config required parameters: + models: List[Dict[str, Any]] = MISSING + max_seq_len: int = MISSING + device_eval_batch_size: int = MISSING + + # Eval Config optional parameters: + code_paths: Optional[List[str]] = None + + # Eval hyperparameters + eval_gauntlet: Optional[Dict[str, Any]] = None + eval_gauntlet_str: Optional[str] = None + eval_loader: Optional[Dict[str, Any]] = None + eval_loaders: Optional[List[Dict[str, Any]]] = None + eval_subset_num_batches: int = -1 + icl_subset_num_batches: Optional[int] = None + # One of icl_tasks or icl_tasks_str must be specified + icl_tasks: Optional[List[Dict[str, Any]]] = None + icl_tasks_str: Optional[str] = None + + # Logging parameters + python_log_level: str = 'debug' + loggers: Optional[Dict[str, Any]] = None + log_config: bool = True + + # Model/run parameters + seed: int = 17 + precision: str = 'amp_bf16' + run_name: Optional[str] = None + metadata: Optional[Dict[str, str]] = None + + # Distributed parameters + dist_timeout: Union[float, int] = 600.0 + fsdp_config: Optional[Dict[str, Any]] = None + + # Callback parameters + callbacks: Optional[Dict[str, Any]] = None + + # Variables to ignore + variables: Optional[Dict[str, Any]] = None + + +EVAL_CONFIG_KEYS = set(field.name for field in fields(EvalConfig)) + + +@dataclass +class TrainConfig: + """Dataclass for training configuration.""" + + # Mandatory model training parameters + model: Dict[str, Any] = MISSING + tokenizer: Dict[str, Any] = MISSING + optimizer: Dict[str, Any] = MISSING + scheduler: Dict[str, Any] = MISSING + train_loader: Dict[str, Any] = MISSING + device_train_batch_size: int = MISSING + device_eval_batch_size: int = MISSING + max_duration: Union[int, str] = MISSING + 
eval_interval: Union[int, str] = MISSING + precision: str = 'amp_bf16' + max_seq_len: int = MISSING + seed: int = MISSING + + # Optional model training parameters + + # Code paths to import + code_paths: Optional[List[str]] = None + + # Cuda allocation configuration + max_split_size_mb: Optional[int] = None + expandable_segments: bool = False + cuda_load_lazy: bool = False + + # Distributed training parameters + dist_timeout: Union[int, float] = 600.0 + fsdp_config: Optional[Dict[str, Any]] = None + + # Evaluation parameters + eval_loader: Optional[Dict[str, Any]] = None + eval_loaders: Optional[List[Dict[ + str, Any]]] = None # should not be set by the user + icl_tasks: Optional[List[Dict[str, Any]]] = None + icl_tasks_str: Optional[str] = None # should not be set by the user + eval_gauntlet: Optional[Dict[str, Any]] = None + eval_gauntlet_str: Optional[str] = None # should not be set by the user + icl_subset_num_batches: Optional[int] = None + icl_seq_len: Optional[int] = None + + # Logging + loggers: Optional[Dict[str, Any]] = None + progress_bar: bool = False + log_to_console: bool = True + python_log_level: Optional[str] = 'debug' + console_log_interval: Union[int, str] = '1ba' + log_config: bool = True + + # Callbacks + callbacks: Optional[Dict[str, Any]] = None + algorithms: Optional[Dict[str, Any]] = None + + # Checkpoints + save_folder: Optional[str] = None + save_latest_filename: Optional[str] = None + save_overwrite: bool = False + save_weights_only: bool = False + save_filename: Optional[str] = None + save_interval: Union[str, int] = '1000ba' + save_num_checkpoints_to_keep: int = -1 + load_path: Optional[str] = None + load_weights_only: bool = False + load_strict_model_weights: bool = True + load_ignore_keys: Optional[List[str]] = None + save_ignore_keys: Optional[List[str]] = None + + # Dataloader + device_train_microbatch_size: Union[str, int] = 'auto' + global_train_batch_size: Optional[int] = None + + # Eval dataloader + eval_subset_num_batches: int = -1 + eval_first: bool = False + compile_config: Optional[Dict[str, Any]] = None + + # Metadata + metadata: Optional[Dict[str, Any]] = None + run_name: Optional[str] = None + + # Resumption + autoresume: bool = False + + # Profiling + profiler: Optional[Dict[str, Any]] = None + + # Variables to ignore + variables: Optional[Dict[str, Any]] = None + + +TRAIN_CONFIG_KEYS = set(field.name for field in fields(TrainConfig)) + + def forbid_config_key(cfg_dict: Dict[str, Any], key: str): if key in cfg_dict: raise ValueError( diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 788424c00c..359ca68b1c 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -5,7 +5,6 @@ import os import sys import time -from dataclasses import dataclass, fields from typing import Any, Dict, List, Optional, Tuple, Union import pandas as pd @@ -14,7 +13,7 @@ from composer.loggers.logger_destination import LoggerDestination from composer.trainer import Trainer from composer.utils import dist, get_device, reproducibility -from omegaconf import MISSING, DictConfig +from omegaconf import DictConfig from omegaconf import OmegaConf as om from rich.traceback import install @@ -26,7 +25,8 @@ build_callback, build_composer_model, build_evaluators, build_logger, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, +from llmfoundry.utils.config_utils import (EVAL_CONFIG_KEYS, EvalConfig, + log_config, make_dataclass_and_log_config, process_init_device, to_container_recursive, @@ -164,52 +164,6 @@ def evaluate_model( return 
(trainer, logger_keys, eval_gauntlet_callback, eval_gauntlet_df) -@dataclass -class EvalConfig: - # Eval Config required parameters: - models: List[Dict[str, Any]] = MISSING - max_seq_len: int = MISSING - device_eval_batch_size: int = MISSING - - # Eval Config optional parameters: - code_paths: Optional[List[str]] = None - - # Eval hyperparameters - eval_gauntlet: Optional[Dict[str, Any]] = None - eval_gauntlet_str: Optional[str] = None - eval_loader: Optional[Dict[str, Any]] = None - eval_loaders: Optional[List[Dict[str, Any]]] = None - eval_subset_num_batches: int = -1 - icl_subset_num_batches: Optional[int] = None - # One of icl_tasks or icl_tasks_str must be specified - icl_tasks: Optional[List[Dict[str, Any]]] = None - icl_tasks_str: Optional[str] = None - - # Logging parameters - python_log_level: str = 'debug' - loggers: Optional[Dict[str, Any]] = None - log_config: bool = True - - # Model/run parameters - seed: int = 17 - precision: str = 'amp_bf16' - run_name: Optional[str] = None - metadata: Optional[Dict[str, str]] = None - - # Distributed parameters - dist_timeout: Union[float, int] = 600.0 - fsdp_config: Optional[Dict[str, Any]] = None - - # Callback parameters - callbacks: Optional[Dict[str, Any]] = None - - # Variables to ignore - variables: Optional[Dict[str, Any]] = None - - -EVAL_CONFIG_KEYS = set(field.name for field in fields(EvalConfig)) - - def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: cfgs: Tuple[Dict[str, Any], EvalConfig] = make_dataclass_and_log_config( cfg, EvalConfig, EVAL_CONFIG_KEYS, icl_tasks_required=True) diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py index 6ac1c4d91e..acf33dd7f4 100644 --- a/scripts/inference/convert_composer_to_hf.py +++ b/scripts/inference/convert_composer_to_hf.py @@ -17,10 +17,9 @@ from llmfoundry import MPTConfig, MPTForCausalLM from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer +from llmfoundry.utils.config_utils import EvalConfig, TrainConfig from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility -from scripts.eval.eval import EvalConfig -from scripts.train.train import TrainConfig def write_huggingface_pretrained_from_composer_checkpoint( diff --git a/scripts/train/train.py b/scripts/train/train.py index 5fbf016277..d419171378 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -6,7 +6,6 @@ import sys import time import warnings -from dataclasses import dataclass, fields from typing import Any, Dict, List, Optional, Tuple, Union import torch @@ -24,7 +23,6 @@ maybe_create_mosaicml_logger) install() -from omegaconf import MISSING from llmfoundry.callbacks import AsyncEval from llmfoundry.data.dataloader import build_dataloader @@ -34,7 +32,8 @@ build_composer_model, build_evaluators, build_logger, build_optimizer, build_scheduler, build_tokenizer) -from llmfoundry.utils.config_utils import (log_config, +from llmfoundry.utils.config_utils import (TRAIN_CONFIG_KEYS, TrainConfig, + log_config, make_dataclass_and_log_config, pop_config, process_init_device, to_dict_recursive, to_list_recursive, @@ -44,101 +43,6 @@ log = logging.getLogger(__name__) -@dataclass -class TrainConfig: - """Dataclass for training configuration.""" - - # Mandatory model training parameters - model: Dict[str, Any] = MISSING - tokenizer: Dict[str, Any] = MISSING - optimizer: Dict[str, Any] = MISSING - scheduler: Dict[str, Any] = MISSING - train_loader: 
Dict[str, Any] = MISSING - device_train_batch_size: int = MISSING - device_eval_batch_size: int = MISSING - max_duration: Union[int, str] = MISSING - eval_interval: Union[int, str] = MISSING - precision: str = 'amp_bf16' - max_seq_len: int = MISSING - seed: int = MISSING - - # Optional model training parameters - - # Code paths to import - code_paths: Optional[List[str]] = None - - # Cuda allocation configuration - max_split_size_mb: Optional[int] = None - expandable_segments: bool = False - cuda_load_lazy: bool = False - - # Distributed training parameters - dist_timeout: Union[int, float] = 600.0 - fsdp_config: Optional[Dict[str, Any]] = None - - # Evaluation parameters - eval_loader: Optional[Dict[str, Any]] = None - eval_loaders: Optional[List[Dict[ - str, Any]]] = None # should not be set by the user - icl_tasks: Optional[List[Dict[str, Any]]] = None - icl_tasks_str: Optional[str] = None # should not be set by the user - eval_gauntlet: Optional[Dict[str, Any]] = None - eval_gauntlet_str: Optional[str] = None # should not be set by the user - icl_subset_num_batches: Optional[int] = None - icl_seq_len: Optional[int] = None - - # Logging - loggers: Optional[Dict[str, Any]] = None - progress_bar: bool = False - log_to_console: bool = True - python_log_level: Optional[str] = 'debug' - console_log_interval: Union[int, str] = '1ba' - log_config: bool = True - - # Callbacks - callbacks: Optional[Dict[str, Any]] = None - algorithms: Optional[Dict[str, Any]] = None - - # Checkpoints - save_folder: Optional[str] = None - save_latest_filename: Optional[str] = None - save_overwrite: bool = False - save_weights_only: bool = False - save_filename: Optional[str] = None - save_interval: Union[str, int] = '1000ba' - save_num_checkpoints_to_keep: int = -1 - load_path: Optional[str] = None - load_weights_only: bool = False - load_strict_model_weights: bool = True - load_ignore_keys: Optional[List[str]] = None - save_ignore_keys: Optional[List[str]] = None - - # Dataloader - device_train_microbatch_size: Union[str, int] = 'auto' - global_train_batch_size: Optional[int] = None - - # Eval dataloader - eval_subset_num_batches: int = -1 - eval_first: bool = False - compile_config: Optional[Dict[str, Any]] = None - - # Metadata - metadata: Optional[Dict[str, Any]] = None - run_name: Optional[str] = None - - # Resumption - autoresume: bool = False - - # Profiling - profiler: Optional[Dict[str, Any]] = None - - # Variables to ignore - variables: Optional[Dict[str, Any]] = None - - -TRAIN_CONFIG_KEYS = set(field.name for field in fields(TrainConfig)) - - def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" # Check for missing mandatory fields From 98164c835a35038109ebbba586c2b84e4a78b4e3 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 18:28:04 +0000 Subject: [PATCH 155/201] works --- scripts/inference/convert_composer_to_hf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py index acf33dd7f4..ef48a6055b 100644 --- a/scripts/inference/convert_composer_to_hf.py +++ b/scripts/inference/convert_composer_to_hf.py @@ -17,7 +17,8 @@ from llmfoundry import MPTConfig, MPTForCausalLM from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer -from llmfoundry.utils.config_utils import EvalConfig, TrainConfig +from llmfoundry.utils.config_utils import ( # pyright: 
ignore (needed for loading state dict) + EvalConfig, TrainConfig) from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility From 9ab6b8f0497b5f2633b0062ea3dcf16eca04517d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 19:55:20 +0000 Subject: [PATCH 156/201] no cheating --- scripts/inference/convert_composer_to_hf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/scripts/inference/convert_composer_to_hf.py b/scripts/inference/convert_composer_to_hf.py index ef48a6055b..51afb105c8 100644 --- a/scripts/inference/convert_composer_to_hf.py +++ b/scripts/inference/convert_composer_to_hf.py @@ -17,8 +17,6 @@ from llmfoundry import MPTConfig, MPTForCausalLM from llmfoundry.utils import get_hf_tokenizer_from_composer_state_dict from llmfoundry.utils.checkpoint_conversion_helpers import load_tokenizer -from llmfoundry.utils.config_utils import ( # pyright: ignore (needed for loading state dict) - EvalConfig, TrainConfig) from llmfoundry.utils.huggingface_hub_utils import \ edit_files_for_hf_compatibility From 94fd55b47d069fa9a1869d2acda9dbccfc6334d4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 20:01:29 +0000 Subject: [PATCH 157/201] dicts everywhere gah --- scripts/train/train.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index d419171378..ab1a0051a3 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -380,6 +380,7 @@ def main(cfg: DictConfig) -> Trainer: log.info('Initializing model...') name = model_config.pop('name') assert isinstance(name, str) + assert isinstance(model_config, dict) model = build_composer_model( name=name, tokenizer=tokenizer, @@ -392,7 +393,8 @@ def main(cfg: DictConfig) -> Trainer: # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') - optimizer = build_optimizer(model, optimizer_name, train_cfg.optimizer) + optimizer_cfg = to_dict_recursive(train_cfg.optimizer) + optimizer = build_optimizer(model, optimizer_name, optimizer_cfg) # Now add the eval metrics try: @@ -409,6 +411,9 @@ def main(cfg: DictConfig) -> Trainer: mosaicml_logger.log_exception(e) raise e + compile_config = to_dict_recursive( + train_cfg.compile_config + ) if train_cfg.compile_config is not None else None # Build the Trainer log.info('Building trainer...') trainer = Trainer( @@ -447,7 +452,7 @@ def main(cfg: DictConfig) -> Trainer: python_log_level=train_cfg.python_log_level, dist_timeout=dist_timeout, profiler=profiler, - compile_config=train_cfg.compile_config, + compile_config=compile_config, ) if train_cfg.log_config: From 84ba9171bb9e22c484a68a9bb7a666827d72dfdc Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 20:30:38 +0000 Subject: [PATCH 158/201] try no recursive just --- llmfoundry/utils/config_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index e6d43aa969..a0f636255d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -205,9 +205,15 @@ def to_container_recursive( def rh(x: Any) -> Any: # recursive helper if isinstance(x, DictConfig): - return {k: rh(v) for k, v in x.items()} + ret = om.to_container(x) + assert isinstance(ret, dict) + return ret + # return {k: rh(v) for k, v in x.items()} elif isinstance(x, ListConfig): - return [rh(v) for v in x] + # return [rh(v) for v in x] + ret = om.to_container(x) + assert isinstance(ret, list) + return ret else: return x From 
155a484e273f7a63157e49f3f1b9f05c89e45581 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 20:49:14 +0000 Subject: [PATCH 159/201] rename typed helpers --- llmfoundry/data/finetuning/dataloader.py | 4 +-- llmfoundry/utils/builders.py | 6 ++-- llmfoundry/utils/config_utils.py | 36 ++++++++----------- scripts/eval/eval.py | 21 +++++------ scripts/train/train.py | 26 +++++++------- tests/a_scripts/eval/test_eval.py | 4 +-- .../inference/test_convert_composer_to_hf.py | 6 ++-- .../callbacks/test_eval_gauntlet_callback.py | 6 ++-- tests/data/test_icl_datasets.py | 4 +-- tests/models/hf/test_hf_config.py | 8 ++--- tests/models/hf/test_hf_v_mpt.py | 6 ++-- .../inference_api_wrapper/test_fmapi.py | 6 ++-- .../test_inference_api_eval_wrapper.py | 6 ++-- tests/models/test_model.py | 16 ++++----- 14 files changed, 73 insertions(+), 82 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 7e9704020d..ac7d7bd773 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -18,7 +18,7 @@ dataset_constructor) from llmfoundry.data.packing import BinPackCollator, auto_packing_ratio from llmfoundry.data.text_data import build_streams, get_tokens_per_batch_func -from llmfoundry.utils.config_utils import to_dict_recursive +from llmfoundry.utils.config_utils import to_dict_container from llmfoundry.utils.exceptions import (MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError) @@ -170,7 +170,7 @@ def build_finetuning_dataloader( 'streams') is not None: # Build streaming dataloader streams_cfg = dataset_cfg.get('streams', None) - streams_cfg = to_dict_recursive( + streams_cfg = to_dict_container( streams_cfg) if streams_cfg is not None else None streams = build_streams( streams_cfg) if streams_cfg is not None else None diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 16881b860a..bf74c6746f 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -29,7 +29,7 @@ from llmfoundry.eval.datasets.in_context_learning_evaluation import \ get_icl_task_dataloader from llmfoundry.tokenizers.tiktoken import TiktokenTokenizerWrapper -from llmfoundry.utils.config_utils import to_dict_recursive, to_list_recursive +from llmfoundry.utils.config_utils import to_dict_container, to_list_container from llmfoundry.utils.registry_utils import construct_from_registry from llmfoundry.utils.warnings import VersionedDeprecationWarning @@ -156,7 +156,7 @@ def build_icl_data_and_gauntlet( with open(eval_gauntlet_config, 'r') as icl_f: eval_gauntlet_cfg = om.load(icl_f) assert isinstance(eval_gauntlet_cfg, DictConfig) - eval_gauntlet = to_dict_recursive( + eval_gauntlet = to_dict_container( eval_gauntlet_cfg['eval_gauntlet']) elif isinstance(eval_gauntlet_config, dict): # pyright: ignore eval_gauntlet = eval_gauntlet_config @@ -484,7 +484,7 @@ def build_icl_evaluators( log.info(f'Extracting ICL task config from path: {icl_tasks}') with open(icl_tasks, 'r') as icl_f: icl_task_cfg = om.load(icl_f) - icl_tasks_list = to_list_recursive(icl_task_cfg.icl_tasks) + icl_tasks_list = to_list_container(icl_task_cfg.icl_tasks) else: icl_tasks_list = icl_tasks diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index a0f636255d..dab00a67b4 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -176,24 +176,24 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str): ) -def to_dict_recursive(cfg: 
Union[DictConfig, Dict[str, Any]]) -> Dict[str, Any]: - maybe_dict = to_container_recursive(cfg) +def to_dict_container(cfg: Union[DictConfig, Dict[str, Any]]) -> Dict[str, Any]: + maybe_dict = to_container(cfg) if isinstance(maybe_dict, dict): return maybe_dict else: raise ValueError(f'Expected a dict-like type, got {type(maybe_dict)}') -def to_list_recursive( +def to_list_container( cfg: Union[ListConfig, List[Dict[str, Any]]]) -> List[Dict[str, Any]]: - maybe_list = to_container_recursive(cfg) + maybe_list = to_container(cfg) if isinstance(maybe_list, list): return maybe_list else: raise ValueError(f'Expected a list-like type, got {type(maybe_list)}') -def to_container_recursive( +def to_container( cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], List[Dict[str, Any]]]] ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: @@ -202,22 +202,16 @@ def to_container_recursive( `omegaconf.to_container` does not handle nested DictConfig or ListConfig objects, so this function is used to convert them to dicts or lists. """ - - def rh(x: Any) -> Any: # recursive helper - if isinstance(x, DictConfig): - ret = om.to_container(x) - assert isinstance(ret, dict) - return ret - # return {k: rh(v) for k, v in x.items()} - elif isinstance(x, ListConfig): - # return [rh(v) for v in x] - ret = om.to_container(x) - assert isinstance(ret, list) - return ret - else: - return x - - return rh(cfg) + if isinstance(cfg, DictConfig): + ret = om.to_container(cfg) + assert isinstance(ret, dict) + return ret + elif isinstance(cfg, ListConfig): + ret = om.to_container(cfg) + assert isinstance(ret, list) + return ret + else: + return cfg T = TypeVar('T') diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 359ca68b1c..bfb39dbba2 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -28,9 +28,8 @@ from llmfoundry.utils.config_utils import (EVAL_CONFIG_KEYS, EvalConfig, log_config, make_dataclass_and_log_config, - process_init_device, - to_container_recursive, - to_list_recursive) + process_init_device, to_container, + to_list_container) from llmfoundry.utils.registry_utils import import_file log = logging.getLogger(__name__) @@ -173,8 +172,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for code_path in (eval_config.code_paths or []): import_file(code_path) - model_configs = to_list_recursive(eval_config.models) - eval_gauntlet_config = to_container_recursive( + model_configs = to_list_container(eval_config.models) + eval_gauntlet_config = to_container( eval_config.eval_gauntlet) or eval_config.eval_gauntlet_str assert eval_gauntlet_config is None or isinstance( eval_gauntlet_config, dict @@ -186,7 +185,7 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: # despite the type hint being Dict[str, Any] and the `cfg` object being sent to `to_container`. # I think it might be rewrapped in DictConfig during the `structured` call in `_make_eval_and_log_config`. # this redundant check is necessary to avoid a pyright error. 
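# Illustrative aside (a minimal sketch, not taken from the diffs themselves): expected
# behaviour of the renamed helper. The import path is the one introduced in this PR;
# the config values below are invented for the example.
from omegaconf import DictConfig, ListConfig

from llmfoundry.utils.config_utils import to_container

# Plain dicts and lists pass through unchanged; OmegaConf containers are unwrapped.
assert to_container({'activation_checkpointing': True}) == {
    'activation_checkpointing': True
}
assert to_container(DictConfig({'activation_checkpointing': True})) == {
    'activation_checkpointing': True
}
assert to_container(ListConfig([{'label': 'arc_easy'}])) == [{'label': 'arc_easy'}]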
- fsdp_config = to_container_recursive(eval_config.fsdp_config) + fsdp_config = to_container(eval_config.fsdp_config) assert isinstance( fsdp_config, Dict ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}' @@ -194,18 +193,16 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: } if fsdp_config else None # pyright fix # Mandatory Evaluation Parameters - icl_tasks = to_container_recursive( - eval_config.icl_tasks) or eval_config.icl_tasks_str + icl_tasks = to_container(eval_config.icl_tasks) or eval_config.icl_tasks_str assert isinstance(icl_tasks, list) or isinstance( icl_tasks, str ), f'icl_tasks must be a list or a string but is {type(icl_tasks)}, {icl_tasks=}' assert icl_tasks is not None, 'icl_tasks must be specified in the config' # Optional Evaluation Parameters with default values - eval_loader_config = to_container_recursive( - eval_config.eval_loader - ) if eval_config.eval_loader else to_container_recursive( - eval_config.eval_loaders) + eval_loader_config = to_container( + eval_config.eval_loader) if eval_config.eval_loader else to_container( + eval_config.eval_loaders) default_run_name: str = os.environ.get('RUN_NAME', 'llm') run_name = eval_config.run_name if eval_config.run_name else default_run_name diff --git a/scripts/train/train.py b/scripts/train/train.py index ab1a0051a3..db87a8b523 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -36,7 +36,7 @@ log_config, make_dataclass_and_log_config, pop_config, process_init_device, - to_dict_recursive, to_list_recursive, + to_dict_container, to_list_container, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -200,20 +200,20 @@ def main(cfg: DictConfig) -> Trainer: dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - model_config = to_dict_recursive(train_cfg.model) - train_loader_config = to_dict_recursive(train_cfg.train_loader) + model_config = to_dict_container(train_cfg.model) + train_loader_config = to_dict_container(train_cfg.train_loader) # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = to_dict_recursive( + fsdp_config: Optional[Dict[str, Any]] = to_dict_container( train_cfg.fsdp_config) if train_cfg.fsdp_config is not None else None - eval_loader_config = to_dict_recursive( + eval_loader_config = to_dict_container( train_cfg.eval_loader - ) if train_cfg.eval_loader is not None else to_list_recursive( + ) if train_cfg.eval_loader is not None else to_list_container( train_cfg.eval_loaders) if train_cfg.eval_loaders is not None else None - icl_tasks_config = to_list_recursive( + icl_tasks_config = to_list_container( train_cfg.icl_tasks) if train_cfg.icl_tasks is not None else None - eval_gauntlet_config = to_dict_recursive( + eval_gauntlet_config = to_dict_container( train_cfg.eval_gauntlet ) if train_cfg.eval_gauntlet is not None else None @@ -293,7 +293,7 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg = to_dict_recursive( + profiler_cfg = to_dict_container( train_cfg.profiler) if train_cfg.profiler is not None else None if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, @@ -313,7 +313,7 @@ def main(cfg: DictConfig) -> Trainer: trace_handlers=profiler_trace_handlers, schedule=profiler_schedule) - callback_configs = to_dict_recursive( + callback_configs = to_dict_container( train_cfg.callbacks) if train_cfg.callbacks is not None else {} # Callbacks 
callbacks: List[Callback] = [ @@ -323,7 +323,7 @@ def main(cfg: DictConfig) -> Trainer: use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) - algorithm_configs = to_dict_recursive( + algorithm_configs = to_dict_container( train_cfg.algorithms) if train_cfg.algorithms is not None else {} # Algorithms algorithms = [ @@ -393,7 +393,7 @@ def main(cfg: DictConfig) -> Trainer: # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') - optimizer_cfg = to_dict_recursive(train_cfg.optimizer) + optimizer_cfg = to_dict_container(train_cfg.optimizer) optimizer = build_optimizer(model, optimizer_name, optimizer_cfg) # Now add the eval metrics @@ -411,7 +411,7 @@ def main(cfg: DictConfig) -> Trainer: mosaicml_logger.log_exception(e) raise e - compile_config = to_dict_recursive( + compile_config = to_dict_container( train_cfg.compile_config ) if train_cfg.compile_config is not None else None # Build the Trainer diff --git a/tests/a_scripts/eval/test_eval.py b/tests/a_scripts/eval/test_eval.py index 52db5e7937..ef3c12d1d3 100644 --- a/tests/a_scripts/eval/test_eval.py +++ b/tests/a_scripts/eval/test_eval.py @@ -13,7 +13,7 @@ from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_dict_recursive +from llmfoundry.utils.config_utils import to_dict_container from scripts.eval.eval import main # noqa: E402 from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg @@ -51,7 +51,7 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]): name = model_cfg.model.pop('name') model = build_composer_model(name=name, tokenizer=tokenizer, - cfg=to_dict_recursive(model_cfg.model)) + cfg=to_dict_container(model_cfg.model)) # create mocked save checkpoint trainer = Trainer(model=model, device=device) diff --git a/tests/a_scripts/inference/test_convert_composer_to_hf.py b/tests/a_scripts/inference/test_convert_composer_to_hf.py index 8d326d89dd..01ffed41cf 100644 --- a/tests/a_scripts/inference/test_convert_composer_to_hf.py +++ b/tests/a_scripts/inference/test_convert_composer_to_hf.py @@ -28,7 +28,7 @@ from llmfoundry.models.mpt import MPTConfig from llmfoundry.utils.builders import (build_composer_model, build_optimizer, build_tokenizer) -from llmfoundry.utils.config_utils import process_init_device, to_dict_recursive +from llmfoundry.utils.config_utils import process_init_device, to_dict_container from scripts.inference.convert_composer_to_hf import convert_composer_to_hf from tests.data_utils import make_tiny_ft_dataset @@ -875,7 +875,7 @@ def test_convert_and_generate(model: str, tie_word_embeddings: bool, original_model = build_composer_model( name=name, tokenizer=tokenizer, - cfg=to_dict_recursive(om_cfg['model']), + cfg=to_dict_container(om_cfg['model']), ) trainer = Trainer(model=original_model, device='cpu' if not model == 'mptmoe' else 'gpu') @@ -948,7 +948,7 @@ def test_convert_and_generate_meta(tie_word_embeddings: str, original_model = build_composer_model( name=name, tokenizer=tokenizer, - cfg=to_dict_recursive(om_cfg['model']), + cfg=to_dict_container(om_cfg['model']), ) trainer = Trainer(model=original_model, device='cpu' if not 'moe' in conf_path else 'gpu') diff --git a/tests/callbacks/test_eval_gauntlet_callback.py b/tests/callbacks/test_eval_gauntlet_callback.py index acf6950518..cdd2324b60 100644 --- a/tests/callbacks/test_eval_gauntlet_callback.py +++ b/tests/callbacks/test_eval_gauntlet_callback.py @@ -13,7 +13,7 @@ from llmfoundry.eval.metrics.nlp import 
InContextLearningLMAccuracy from llmfoundry.utils.builders import build_icl_data_and_gauntlet -from llmfoundry.utils.config_utils import to_dict_recursive +from llmfoundry.utils.config_utils import to_dict_container @pytest.fixture(autouse=True) @@ -98,8 +98,8 @@ def test_gauntlet_callback(averages: Optional[dict]): # test loading functionality _, _, eval_gauntlet_callback = build_icl_data_and_gauntlet( - [to_dict_recursive(c) for c in icl_task_config_list], - to_dict_recursive(eval_gauntlet_config), tokenizer, 4, 1024, 1) + [to_dict_container(c) for c in icl_task_config_list], + to_dict_container(eval_gauntlet_config), tokenizer, 4, 1024, 1) assert eval_gauntlet_callback is not None state = MockState(eval_gauntlet_callback.logger_keys) logger = MockLogger(state) diff --git a/tests/data/test_icl_datasets.py b/tests/data/test_icl_datasets.py index ae679a191f..4cf5cb56dd 100644 --- a/tests/data/test_icl_datasets.py +++ b/tests/data/test_icl_datasets.py @@ -8,7 +8,7 @@ from transformers import AutoTokenizer, PreTrainedTokenizerBase from llmfoundry.utils.builders import build_icl_evaluators -from llmfoundry.utils.config_utils import to_list_recursive +from llmfoundry.utils.config_utils import to_list_container def load_icl_config(conf_path: str = 'tests/data/test_tasks.yaml'): @@ -21,7 +21,7 @@ def run_test(dir: pathlib.Path, tokenizer: PreTrainedTokenizerBase, bos_tok: str = ''): task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(to_list_recursive(task_cfg.icl_tasks), + evaluators, _ = build_icl_evaluators(to_list_container(task_cfg.icl_tasks), tokenizer, 1024, 8, diff --git a/tests/models/hf/test_hf_config.py b/tests/models/hf/test_hf_config.py index 2cdc37b797..e765ea4966 100644 --- a/tests/models/hf/test_hf_config.py +++ b/tests/models/hf/test_hf_config.py @@ -16,7 +16,7 @@ from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_dict_recursive +from llmfoundry.utils.config_utils import to_dict_container def test_remote_code_false_mpt( @@ -49,7 +49,7 @@ def test_remote_code_false_mpt( name = test_cfg.model.pop('name') _ = build_composer_model( name=name, - cfg=to_dict_recursive(test_cfg.model), + cfg=to_dict_container(test_cfg.model), tokenizer=tokenizer, ) @@ -142,7 +142,7 @@ def test_hf_config_override( name = test_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_dict_recursive(test_cfg.model), + cfg=to_dict_container(test_cfg.model), tokenizer=tokenizer, ) @@ -167,7 +167,7 @@ def test_hf_config_override( name = hf_model_config.model.pop('name') hf_model = build_composer_model( name=name, - cfg=to_dict_recursive(hf_model_config.model), + cfg=to_dict_container(hf_model_config.model), tokenizer=tokenizer, ) diff --git a/tests/models/hf/test_hf_v_mpt.py b/tests/models/hf/test_hf_v_mpt.py index b25f2d4476..8f2aa6e5a7 100644 --- a/tests/models/hf/test_hf_v_mpt.py +++ b/tests/models/hf/test_hf_v_mpt.py @@ -9,7 +9,7 @@ from omegaconf import OmegaConf as om from llmfoundry.utils.builders import build_composer_model, build_tokenizer -from llmfoundry.utils.config_utils import to_dict_recursive +from llmfoundry.utils.config_utils import to_dict_container @pytest.mark.gpu @@ -63,7 +63,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, hf_cfg.model.pop('device') hf_model = build_composer_model( name=name, - cfg=to_dict_recursive(hf_cfg.model), + cfg=to_dict_container(hf_cfg.model), 
tokenizer=tokenizer, ).to(device) hf_n_params = sum(p.numel() for p in hf_model.parameters()) @@ -119,7 +119,7 @@ def test_compare_hf_v_mpt(attn_impl: str, dropout: float, alibi: bool, model_cfg.pop('device') model = build_composer_model( name=name, - cfg=to_dict_recursive(model_cfg), + cfg=to_dict_container(model_cfg), tokenizer=tokenizer, ).to(device) n_params = sum(p.numel() for p in model.parameters()) diff --git a/tests/models/inference_api_wrapper/test_fmapi.py b/tests/models/inference_api_wrapper/test_fmapi.py index 9654de8f04..3e85a924cc 100644 --- a/tests/models/inference_api_wrapper/test_fmapi.py +++ b/tests/models/inference_api_wrapper/test_fmapi.py @@ -12,7 +12,7 @@ FMAPIChatAPIEvalWrapper) from llmfoundry.models.inference_api_wrapper.fmapi import FMAPIEvalInterface from llmfoundry.utils.builders import build_icl_evaluators -from llmfoundry.utils.config_utils import to_list_recursive +from llmfoundry.utils.config_utils import to_list_container def load_icl_config(): @@ -105,7 +105,7 @@ def test_causal_fmapi_wrapper(tmp_path: str): mock.completions.create = mock_create task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(to_list_recursive( + evaluators, _ = build_icl_evaluators(to_list_container( task_cfg.icl_tasks), tokenizer, 1024, @@ -142,7 +142,7 @@ def test_chat_fmapi_wrapper(tmp_path: str): 'Treason!') task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(to_list_recursive( + evaluators, _ = build_icl_evaluators(to_list_container( task_cfg.icl_tasks), tokenizer, 1024, diff --git a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py index e05593cfbc..8dd1a61698 100644 --- a/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py +++ b/tests/models/inference_api_wrapper/test_inference_api_eval_wrapper.py @@ -12,7 +12,7 @@ OpenAIChatAPIEvalWrapper) from llmfoundry.tokenizers import TiktokenTokenizerWrapper from llmfoundry.utils.builders import build_icl_evaluators -from llmfoundry.utils.config_utils import to_list_recursive +from llmfoundry.utils.config_utils import to_list_container @pytest.fixture(scope='module') @@ -107,7 +107,7 @@ def test_openai_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): mock.completions.create = mock_create task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(to_list_recursive( + evaluators, _ = build_icl_evaluators(to_list_container( task_cfg.icl_tasks), tokenizer, 1024, @@ -140,7 +140,7 @@ def test_chat_api_eval_wrapper(tmp_path: str, openai_api_key_env_var: str): 'Treason!') task_cfg = load_icl_config() - evaluators, _ = build_icl_evaluators(to_list_recursive( + evaluators, _ = build_icl_evaluators(to_list_container( task_cfg.icl_tasks), tokenizer, 1024, diff --git a/tests/models/test_model.py b/tests/models/test_model.py index 3a1ab1ce7b..1c5fac5463 100644 --- a/tests/models/test_model.py +++ b/tests/models/test_model.py @@ -35,7 +35,7 @@ from llmfoundry.models.mpt import MPTConfig, MPTForCausalLM from llmfoundry.utils import build_tokenizer from llmfoundry.utils.builders import build_composer_model -from llmfoundry.utils.config_utils import to_dict_recursive +from llmfoundry.utils.config_utils import to_dict_container def get_config( @@ -50,7 +50,7 @@ def get_config( def _load_tokenizer_cfg(cfg: Union[Dict[str, Any], DictConfig]) -> Dict: if isinstance(cfg, DictConfig): - config = to_dict_recursive(cfg) + config = to_dict_container(cfg) else: assert isinstance(cfg, dict) config = cfg 
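# Illustrative sketch of the list-flavoured helper exercised by the ICL test changes
# above (the task entry below is invented; only the helper and its import path come
# from this PR).
from omegaconf import ListConfig

from llmfoundry.utils.config_utils import to_list_container

icl_tasks = ListConfig([{
    'label': 'lambada',
    'dataset_uri': 'eval/local_data/lambada.jsonl',
    'num_fewshot': [0],
}])
assert to_list_container(icl_tasks) == [{
    'label': 'lambada',
    'dataset_uri': 'eval/local_data/lambada.jsonl',
    'num_fewshot': [0],
}]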
@@ -96,7 +96,7 @@ def _get_objs(request: pytest.FixtureRequest, name = test_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_dict_recursive(test_cfg.model), + cfg=to_dict_container(test_cfg.model), tokenizer=tokenizer, ) @@ -299,7 +299,7 @@ def test_full_forward_and_backward_gpt2_small(batch_size: int = 2): name = neo_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_dict_recursive(neo_cfg.model), + cfg=to_dict_container(neo_cfg.model), tokenizer=tokenizer, ).to(device) @@ -349,7 +349,7 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2): name = t5_cfg.model.pop('name') model = build_composer_model( name=name, - cfg=to_dict_recursive(t5_cfg.model), + cfg=to_dict_container(t5_cfg.model), tokenizer=tokenizer, ).to(device) @@ -427,7 +427,7 @@ def test_determinism(attn_impl: str, precision: torch.dtype, ffn_type: str, name = test_cfg.model.pop('name') model_1 = build_composer_model( name=name, - cfg=to_dict_recursive(test_cfg.model), + cfg=to_dict_container(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -498,7 +498,7 @@ def test_loss_fn(): name = test_cfg.model.pop('name') model_1 = build_composer_model( name=name, - cfg=to_dict_recursive(test_cfg.model), + cfg=to_dict_container(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) @@ -585,7 +585,7 @@ def test_loss_reduction(loss_fn_config: str): name = test_cfg.model.pop('name') model_1 = build_composer_model( name=name, - cfg=to_dict_recursive(test_cfg.model), + cfg=to_dict_container(test_cfg.model), tokenizer=tokenizer, ) model_2 = copy.deepcopy(model_1) From 04033307d6a04bbef8fa7b011079ac7e3c87538e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 21:05:33 +0000 Subject: [PATCH 160/201] fix the test cases with deep magic --- llmfoundry/utils/config_utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index dab00a67b4..040cc5a856 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -202,16 +202,16 @@ def to_container( `omegaconf.to_container` does not handle nested DictConfig or ListConfig objects, so this function is used to convert them to dicts or lists. """ - if isinstance(cfg, DictConfig): - ret = om.to_container(cfg) - assert isinstance(ret, dict) - return ret - elif isinstance(cfg, ListConfig): - ret = om.to_container(cfg) - assert isinstance(ret, list) - return ret - else: - return cfg + + def rh(x: Any) -> Any: # recursive helper + if isinstance(x, DictConfig): + return {k: rh(v) for k, v in x.items()} + elif isinstance(x, ListConfig): + return [rh(v) for v in x] + else: + return x + + return rh(cfg) T = TypeVar('T') From d33eb10e9ff3d89dba0cabcb1d5214d99c7d159c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 21:32:42 +0000 Subject: [PATCH 161/201] towards a peaceful resolution --- llmfoundry/utils/config_utils.py | 40 ++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 040cc5a856..dd8ce03771 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -202,16 +202,18 @@ def to_container( `omegaconf.to_container` does not handle nested DictConfig or ListConfig objects, so this function is used to convert them to dicts or lists. 
""" - - def rh(x: Any) -> Any: # recursive helper - if isinstance(x, DictConfig): - return {k: rh(v) for k, v in x.items()} - elif isinstance(x, ListConfig): - return [rh(v) for v in x] - else: - return x - - return rh(cfg) + if isinstance(cfg, DictConfig): + ret = om.to_container(cfg, resolve=True) + assert isinstance(ret, dict) + return ret + # return {k: rh(v) for k, v in cfg.items()} + elif isinstance(cfg, ListConfig): + # return [rh(v) for v in cfg] + ret = om.to_container(cfg, resolve=True) + assert isinstance(ret, list) + return ret + else: + return cfg T = TypeVar('T') @@ -438,3 +440,21 @@ def log_config(cfg: Dict[str, Any]) -> None: raise e if mlflow.active_run(): mlflow.log_params(params=cfg) + + +if __name__ == '__main__': + my_dict_config = DictConfig({ + 'a': 1, + 'b': 2, + 'c': None, + 'd': { + 'e': 3, + 'f': 4, + 'g': None + } + }) + + print(to_dict_container(my_dict_config)) + print(om.to_container(my_dict_config)) + assert to_dict_container(my_dict_config) == om.to_container( + my_dict_config) # passes From bd203e60fa24ec1cc3f5d0e8bae25e38ce7db757 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 22 Apr 2024 21:33:03 +0000 Subject: [PATCH 162/201] remove comments --- llmfoundry/utils/config_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index dd8ce03771..847cfd214a 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -206,9 +206,7 @@ def to_container( ret = om.to_container(cfg, resolve=True) assert isinstance(ret, dict) return ret - # return {k: rh(v) for k, v in cfg.items()} elif isinstance(cfg, ListConfig): - # return [rh(v) for v in cfg] ret = om.to_container(cfg, resolve=True) assert isinstance(ret, list) return ret From 853c173b7254ebb60b7dbdb43d6d2a6aa5aa87a1 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 00:52:33 +0000 Subject: [PATCH 163/201] fix type warnings --- llmfoundry/utils/config_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 847cfd214a..f0c2b9c036 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -205,13 +205,13 @@ def to_container( if isinstance(cfg, DictConfig): ret = om.to_container(cfg, resolve=True) assert isinstance(ret, dict) - return ret + return ret # type: ignore (return type is correct and converting all keys to str would be unnecessarily costly) elif isinstance(cfg, ListConfig): ret = om.to_container(cfg, resolve=True) assert isinstance(ret, list) - return ret + return ret # type: ignore (see above) else: - return cfg + return cfg # type: ignore (dicts and lists are already in the correct format) T = TypeVar('T') From 0e74185d73ff94cde52a743754c5fd1df2ab5532 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 09:41:46 -0400 Subject: [PATCH 164/201] Update llmfoundry/utils/config_utils.py Co-authored-by: Daniel King <43149077+dakinggg@users.noreply.github.com> --- llmfoundry/utils/config_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index f0c2b9c036..24582bfd09 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -255,7 +255,7 @@ def make_dataclass_and_log_config( # Create copy of config for logging logged_cfg: Dict[str, Any] = copy.deepcopy(unstructured_config) - # apply transforms to the unstructured config before 
constructing dataclass + # Apply transforms to the unstructured config before constructing dataclass for transform in transforms or []: unstructured_config = transform(unstructured_config) From 9fa0418efe2f36d7fca8363212d2a5ef2a8207f2 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 14:29:01 +0000 Subject: [PATCH 165/201] address low-hanging fruit --- llmfoundry/data/finetuning/dataloader.py | 25 ------------------------ llmfoundry/data/text_data.py | 2 +- llmfoundry/models/hf/hf_causal_lm.py | 3 --- llmfoundry/models/hf/hf_t5.py | 9 ++++----- llmfoundry/utils/builders.py | 4 +--- llmfoundry/utils/config_utils.py | 24 ++--------------------- scripts/eval/eval.py | 8 ++++---- scripts/train/train.py | 17 ++++++++-------- tests/models/hf/test_hf_peft_wrapping.py | 1 - 9 files changed, 20 insertions(+), 73 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index ac7d7bd773..c527ce1717 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -280,31 +280,6 @@ def build_finetuning_dataloader( return DataSpec(dataloader=dl, get_num_tokens_in_batch=token_counting_func) -# local=dataset_cfg.get('local', None), -# remote=dataset_cfg.get('remote', None), -# split=dataset_cfg.get('split', None), -# download_retry=dataset_cfg.get('download_retry', 2), -# download_timeout=dataset_cfg.get('download_timeout', 60), -# validate_hash=dataset_cfg.get('validate_hash', None), -# keep_zip=dataset_cfg.get('keep_zip', False), -# epoch_size=dataset_cfg.get('epoch_size', None), -# predownload=dataset_cfg.get('predownload', None), -# cache_limit=dataset_cfg.get('cache_limit', None), -# partition_algo=dataset_cfg.get('partition_algo', 'relaxed'), -# num_canonical_nodes=dataset_cfg.get('num_canonical_nodes', None), -# batch_size=device_batch_size, -# shuffle=dataset_cfg.get('shuffle', False), -# shuffle_algo=dataset_cfg.get('shuffle_algo', 'py1e'), -# shuffle_seed=dataset_cfg.get('shuffle_seed', 9176), -# shuffle_block_size=dataset_cfg.get('shuffle_block_size', None), -# sampling_method=dataset_cfg.get('sampling_method', 'balanced'), -# sampling_granularity=dataset_cfg.get('sampling_granularity', 1), -# batching_method=dataset_cfg.get('batching_method', 'random'), -# max_seq_len=dataset_cfg.max_seq_len, -# allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False), -# replication=dataset_cfg.get('replication', None), - - def _validate_config( max_seq_len: int, decoder_only_format: bool = False, diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index c15ba11183..8c3a2dde3e 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -248,7 +248,7 @@ def get_sequence_id_from_batch( def build_streams(streams: Optional[Dict[str, Any]] = None,): streams_dict = streams # build streams - streams_ret: List = [] + streams_ret = [] if streams_dict is not None: for _, stream in streams_dict.items(): # stream is the streams kwargs diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 1f8881b942..258632a2eb 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -79,9 +79,6 @@ def __init__( additional_train_metrics = additional_train_metrics or [] additional_eval_metrics = additional_eval_metrics or [] - pretrained_model_name_or_path = pretrained_model_name_or_path - pretrained_lora_id_or_path = pretrained_lora_id_or_path - if not trust_remote_code and 
pretrained_model_name_or_path.startswith( 'mosaicml/mpt'): raise ValueError( diff --git a/llmfoundry/models/hf/hf_t5.py b/llmfoundry/models/hf/hf_t5.py index 409093f271..2bae8fbf39 100644 --- a/llmfoundry/models/hf/hf_t5.py +++ b/llmfoundry/models/hf/hf_t5.py @@ -57,6 +57,7 @@ def __init__( from llmfoundry.utils.builders import build_metric config_overrides = config_overrides or {} + additional_train_metrics = additional_train_metrics or [] config = AutoConfig.from_pretrained( pretrained_model_name_or_path, @@ -65,7 +66,7 @@ def __init__( ) # set config overrides - for k, v in (config_overrides or {}).items(): + for k, v in config_overrides.items(): if not hasattr(config, k): raise ValueError( f'config does not have attribute "{k}" to override ({k}: {v}).' @@ -87,8 +88,6 @@ def __init__( raise ValueError(f'Model type "hf_t5" currently only supports T5 models ' +\ f'using configs where `is_encoder_decoder` is ``True``.') - init_device = init_device - # Get the device we want to initialize, and use the # resolved version to initialize the HF model resolved_init_device = hf_get_init_device(init_device) @@ -116,8 +115,8 @@ def __init__( f'init_device="{init_device}" must be either "cpu" or "meta".') metrics = [ - build_metric(metric, {}) for metric in DEFAULT_ENC_DEC_METRICS + - (additional_train_metrics or []) + build_metric(metric, {}) + for metric in DEFAULT_ENC_DEC_METRICS + additional_train_metrics ] composer_model = super().__init__(model=model, diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index bf74c6746f..a9e3ecb224 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -103,9 +103,7 @@ def build_eval_loaders( ) for eval_config in eval_configs: - label = None - if 'label' in eval_config: - label = eval_config.pop('label') + label = eval_config.pop('label') eval_dataloader = build_dataloader(eval_config, tokenizer, device_eval_batch_size) eval_loader: Evaluator = Evaluator( diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index f0c2b9c036..39ec8775a0 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -92,8 +92,6 @@ class TrainConfig: max_seq_len: int = MISSING seed: int = MISSING - # Optional model training parameters - # Code paths to import code_paths: Optional[List[str]] = None @@ -197,7 +195,7 @@ def to_container( cfg: Optional[Union[DictConfig, ListConfig, Dict[str, Any], List[Dict[str, Any]]]] ) -> Union[Dict[str, Any], List[Dict[str, Any]]]: - """Converts a DictConfig or ListConfig to a dict or list recursively. + """Converts a DictConfig or ListConfig to a dict or list. `omegaconf.to_container` does not handle nested DictConfig or ListConfig objects, so this function is used to convert them to dicts or lists. @@ -269,7 +267,7 @@ def make_dataclass_and_log_config( for key in extraneous_keys: warnings.warn( - f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. Top-level variables are deprecated and will not be supported in future releases.', + f'Unused parameter {key} found in cfg. Please check your yaml to ensure this parameter is necessary. Interpreting {key} as a variable for logging purposes. Top-level variables are deprecated and will not be supported in future releases. 
Please place any variables under the `variables` key.', category=DeprecationWarning) unstructured_config['variables'][key] = unstructured_config.pop(key) @@ -438,21 +436,3 @@ def log_config(cfg: Dict[str, Any]) -> None: raise e if mlflow.active_run(): mlflow.log_params(params=cfg) - - -if __name__ == '__main__': - my_dict_config = DictConfig({ - 'a': 1, - 'b': 2, - 'c': None, - 'd': { - 'e': 3, - 'f': 4, - 'g': None - } - }) - - print(to_dict_container(my_dict_config)) - print(om.to_container(my_dict_config)) - assert to_dict_container(my_dict_config) == om.to_container( - my_dict_config) # passes diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index bfb39dbba2..c8d2f4c46d 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -164,14 +164,14 @@ def evaluate_model( def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: + # Run user provided code if specified + for code_path in cfg.get('code_paths', []): + import_file(code_path) + cfgs: Tuple[Dict[str, Any], EvalConfig] = make_dataclass_and_log_config( cfg, EvalConfig, EVAL_CONFIG_KEYS, icl_tasks_required=True) logged_cfg, eval_config = cfgs - # Run user provided code if specified - for code_path in (eval_config.code_paths or []): - import_file(code_path) - model_configs = to_list_container(eval_config.models) eval_gauntlet_config = to_container( eval_config.eval_gauntlet) or eval_config.eval_gauntlet_str diff --git a/scripts/train/train.py b/scripts/train/train.py index db87a8b523..424d981a08 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -45,7 +45,7 @@ def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" - # Check for missing mandatory fields + # Check for missing mandatory fields and throw error early. for field in TRAIN_CONFIG_KEYS: _ = getattr(train_config, field) @@ -85,19 +85,18 @@ def validate_config(train_config: TrainConfig): if (train_config.model.get('fc_type', 'torch') == 'te' or 'te' in train_config.model.get('ffn_config', {}).get( 'ffn_type', 'mptmlp')): - if train_config.fsdp_config is None: - train_config.fsdp_config = {} fsdp_config = train_config.fsdp_config act_ckpt = fsdp_config.get('activation_checkpointing', False) if fsdp_config else False act_ckpt_reentrant = fsdp_config.get( 'activation_checkpointing_reentrant', False) - if act_ckpt == True and act_ckpt_reentrant == True: + if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == True: warnings.warn( '`te.Linear` layers do not support activation_checkpointing with ' + '`activation_checkpointing_reentrant = True`. ' + 'Setting cfg.fsdp_config.activation_checkpointing_reentrant=False.' 
) + assert train_config.fsdp_config is not None # pyright (this is known because fsdp_config is not None) train_config.fsdp_config[ 'activation_checkpointing_reentrant'] = False @@ -149,6 +148,11 @@ def _log_num_params(model: ComposerModel, logged_cfg: Dict[str, Any]): def main(cfg: DictConfig) -> Trainer: + code_paths = cfg.get('code_paths', []) + # Import any user provided code + for code_path in code_paths: + import_file(code_path) + cfgs: Tuple[Dict[str, Any], TrainConfig] = make_dataclass_and_log_config( cfg, TrainConfig, @@ -156,11 +160,6 @@ def main(cfg: DictConfig) -> Trainer: transforms=[update_batch_size_info]) logged_cfg, train_cfg = cfgs - code_paths = train_cfg.code_paths if train_cfg.code_paths else [] - # Import any user provided code - for code_path in code_paths: - import_file(code_path) - # Filter deprecation warning from torch internal usage warnings.filterwarnings( action='ignore', diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index b31eaa12eb..83e079e5fb 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -100,7 +100,6 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, model = trainer.state.model underlying_model = model.model.base_model.model.model - # assert False, f"underlying_model: {underlying_model}" lora_A = underlying_model.layers[0].self_attn.q_proj.lora_A['default'] lora_B = underlying_model.layers[0].self_attn.q_proj.lora_B['default'] From 66655756b799ecfa8a25dbad89867b3b2f479437 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 14:34:10 +0000 Subject: [PATCH 166/201] remove peft wrapping extra model --- tests/models/hf/test_hf_peft_wrapping.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/hf/test_hf_peft_wrapping.py b/tests/models/hf/test_hf_peft_wrapping.py index 83e079e5fb..052704e785 100644 --- a/tests/models/hf/test_hf_peft_wrapping.py +++ b/tests/models/hf/test_hf_peft_wrapping.py @@ -99,9 +99,9 @@ def test_lora_mixed_init(peft_config: Optional[dict], tmp_path: pathlib.Path, ) model = trainer.state.model - underlying_model = model.model.base_model.model.model - lora_A = underlying_model.layers[0].self_attn.q_proj.lora_A['default'] - lora_B = underlying_model.layers[0].self_attn.q_proj.lora_B['default'] + underlying_model = model.model.base_model.model + lora_A = underlying_model.model.layers[0].self_attn.q_proj.lora_A['default'] + lora_B = underlying_model.model.layers[0].self_attn.q_proj.lora_B['default'] assert (lora_A.weight == 1).all() assert (lora_B.weight == 0).all() From 9cdc7a483039b8f856bb65d928973ecf2d4b5c9c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 14:38:54 +0000 Subject: [PATCH 167/201] python :handshake: haskell --- scripts/eval/eval.py | 3 +-- scripts/train/train.py | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index c8d2f4c46d..ebfba4529b 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -168,9 +168,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: for code_path in cfg.get('code_paths', []): import_file(code_path) - cfgs: Tuple[Dict[str, Any], EvalConfig] = make_dataclass_and_log_config( + logged_cfg, eval_config = make_dataclass_and_log_config( cfg, EvalConfig, EVAL_CONFIG_KEYS, icl_tasks_required=True) - logged_cfg, eval_config = cfgs model_configs = to_list_container(eval_config.models) eval_gauntlet_config = to_container( 
diff --git a/scripts/train/train.py b/scripts/train/train.py index 424d981a08..ef63be54ea 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -6,7 +6,7 @@ import sys import time import warnings -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Union import torch from composer import ComposerModel, Trainer @@ -153,12 +153,11 @@ def main(cfg: DictConfig) -> Trainer: for code_path in code_paths: import_file(code_path) - cfgs: Tuple[Dict[str, Any], TrainConfig] = make_dataclass_and_log_config( + logged_cfg, train_cfg = make_dataclass_and_log_config( cfg, TrainConfig, TRAIN_CONFIG_KEYS, transforms=[update_batch_size_info]) - logged_cfg, train_cfg = cfgs # Filter deprecation warning from torch internal usage warnings.filterwarnings( From 08814e1022380aa8909df08d7bf3802b45a9d7e0 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 14:43:02 +0000 Subject: [PATCH 168/201] dataset config should be dict --- llmfoundry/data/text_data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 152ffd1a20..61daac1165 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -14,7 +14,6 @@ import transformers from composer.core.data_spec import DataSpec from composer.core.types import Batch -from omegaconf import DictConfig from omegaconf import OmegaConf as om from streaming import Stream, StreamingDataset from torch.utils.data import DataLoader @@ -267,7 +266,7 @@ def build_streams(streams: Optional[Dict[str, Any]] = None,): def build_text_dataloader( tokenizer: PreTrainedTokenizerBase, device_batch_size: int, - dataset: DictConfig, + dataset: Dict[str, Any], drop_last: bool, num_workers: int, pin_memory: bool = True, From 80acfb39faf10393e68bfe4fb17f24d24d82882c Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 14:47:00 +0000 Subject: [PATCH 169/201] just because omega starts with OMMMM does not mean it's zen --- llmfoundry/data/text_data.py | 3 +-- tests/data/test_dataloader.py | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 61daac1165..47d4709eee 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -14,7 +14,6 @@ import transformers from composer.core.data_spec import DataSpec from composer.core.types import Batch -from omegaconf import OmegaConf as om from streaming import Stream, StreamingDataset from torch.utils.data import DataLoader from transformers import PreTrainedTokenizerBase @@ -274,6 +273,7 @@ def build_text_dataloader( persistent_workers: bool = True, timeout: int = 0, ) -> DataSpec: + dataset_cfg = dataset # get kwargs @@ -450,7 +450,6 @@ def get_num_samples_in_batch(batch: Batch) -> int: 'drop_last': False, 'num_workers': 4, } - cfg = om.create(cfg) device_batch_size = 2 tokenizer_name = args.tokenizer diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 1b141af6b2..e584c8c11e 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -1119,7 +1119,7 @@ def test_token_counting_func_dataloader_setting( device_batch_size=batch_size, **cfg) elif dataloader_type == 'text': - cfg = DictConfig({ + cfg = { 'name': 'text', 'dataset': { 'local': 'dummy-path', @@ -1130,7 +1130,7 @@ def test_token_counting_func_dataloader_setting( 'shuffle_seed': 0, }, **common_args - }) + } ds_mock = MagicMock() ds_mock.tokenizer = gptt 
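# Aside (illustrative only; the test name is invented): the stubbing pattern used in
# the surrounding dataloader test, shown self-contained. monkeypatch swaps the
# streaming dataset class for a lambda returning a MagicMock, so collation and token
# counting can run without real shards.
from unittest.mock import MagicMock

import pytest


def test_streaming_text_dataset_can_be_stubbed(monkeypatch: pytest.MonkeyPatch):
    fake_ds = MagicMock()
    monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset',
                        lambda *args, **kwargs: fake_ds)
    import llmfoundry.data.text_data as text_data
    assert text_data.StreamingTextDataset() is fake_ds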
monkeypatch.setattr('llmfoundry.data.text_data.StreamingTextDataset', @@ -1142,8 +1142,6 @@ def test_token_counting_func_dataloader_setting( else: raise NotImplementedError() - cfg = om.create(cfg) - batch_collated = dl.dataloader.collate_fn(batch_tokenized) # type: ignore actual_token_count = dl.get_num_tokens_in_batch(batch_collated) From 2dd350c1beecd89df5c681d0a1ef4d0ad8767a83 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 15:02:32 +0000 Subject: [PATCH 170/201] fix --- scripts/train/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index ef63be54ea..9f5a7f2b6a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -89,7 +89,8 @@ def validate_config(train_config: TrainConfig): act_ckpt = fsdp_config.get('activation_checkpointing', False) if fsdp_config else False act_ckpt_reentrant = fsdp_config.get( - 'activation_checkpointing_reentrant', False) + 'activation_checkpointing_reentrant', + False) if fsdp_config else False if fsdp_config is not None and act_ckpt == True and act_ckpt_reentrant == True: warnings.warn( '`te.Linear` layers do not support activation_checkpointing with ' From e8ecfcda7aea83e2aa67aff80bebc43caf43827e Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 15:14:14 +0000 Subject: [PATCH 171/201] fix --- llmfoundry/utils/builders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index a9e3ecb224..bf0b446357 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -103,7 +103,7 @@ def build_eval_loaders( ) for eval_config in eval_configs: - label = eval_config.pop('label') + label = eval_config.pop('label') if is_multi_eval else None eval_dataloader = build_dataloader(eval_config, tokenizer, device_eval_batch_size) eval_loader: Evaluator = Evaluator( From 0842b36057869c1aac11dcfa08524277899194c5 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 15:48:03 +0000 Subject: [PATCH 172/201] structured settlement --- llmfoundry/utils/config_utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 24f336a4fb..4ebc8dc63d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -6,12 +6,12 @@ import logging import math import warnings -from dataclasses import dataclass, fields +from dataclasses import MISSING, dataclass, fields from typing import (Any, Callable, Dict, List, Literal, Mapping, Optional, Set, Tuple, TypeVar, Union) from composer.utils import dist -from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue +from omegaconf import DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om from llmfoundry.layers_registry import ffns_with_megablocks @@ -271,8 +271,7 @@ def make_dataclass_and_log_config( category=DeprecationWarning) unstructured_config['variables'][key] = unstructured_config.pop(key) - dataclass_config: T = om.structured( - dataclass_constructor(**unstructured_config)) + dataclass_config: T = dataclass_constructor(**unstructured_config) return logged_cfg, dataclass_config From 4141d4809b30949d9534964d9a59531c84bd7d3d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 16:23:50 +0000 Subject: [PATCH 173/201] precision further down --- llmfoundry/utils/config_utils.py | 4 +++- scripts/train/train.py | 2 +- 2 files changed, 4 insertions(+), 2 
deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 4ebc8dc63d..beafcbb961 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -88,10 +88,12 @@ class TrainConfig: device_eval_batch_size: int = MISSING max_duration: Union[int, str] = MISSING eval_interval: Union[int, str] = MISSING - precision: str = 'amp_bf16' max_seq_len: int = MISSING seed: int = MISSING + # Precision + precision: str = 'amp_bf16' + # Code paths to import code_paths: Optional[List[str]] = None diff --git a/scripts/train/train.py b/scripts/train/train.py index 9f5a7f2b6a..a1f5f9d47f 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -196,7 +196,7 @@ def main(cfg: DictConfig) -> Trainer: # Initialize pytorch distributed training process groups dist_timeout: Union[int, float] = train_cfg.dist_timeout - dist.initialize_dist(get_device(None), timeout=dist_timeout) + # dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs model_config = to_dict_container(train_cfg.model) From 53a2a809a4bb229fedfa4ac94a3f9e71c7b31c87 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 16:27:19 +0000 Subject: [PATCH 174/201] throws TypeError instead of MissingMandatoryValue or whatever --- tests/a_scripts/train/test_train_inputs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index c2dd5b3d27..41475535ab 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -79,8 +79,7 @@ def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: for param in mandatory_params: orig_param = cfg.pop(param) with pytest.raises( - (omegaconf.errors.MissingMandatoryValue, NameError, - omegaconf.errors.InterpolationKeyError)): + (TypeError, NameError, omegaconf.errors.InterpolationKeyError)): main(cfg) cfg[param] = orig_param From fc86f6f673e03a4ac95fae1b0642e2537f5a3f6a Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 16:28:01 +0000 Subject: [PATCH 175/201] remove debugging statement --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index a1f5f9d47f..9f5a7f2b6a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -196,7 +196,7 @@ def main(cfg: DictConfig) -> Trainer: # Initialize pytorch distributed training process groups dist_timeout: Union[int, float] = train_cfg.dist_timeout - # dist.initialize_dist(get_device(None), timeout=dist_timeout) + dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs model_config = to_dict_container(train_cfg.model) From dc73a4ff77ef9d07d3c0ef80b4589cf044ab708f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 16:36:17 +0000 Subject: [PATCH 176/201] remove to_container calls everywhere --- scripts/eval/eval.py | 38 +++++++++----------------------------- scripts/train/train.py | 39 ++++++++++++++------------------------- 2 files changed, 23 insertions(+), 54 deletions(-) diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index ebfba4529b..8e6caf8b8f 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -28,8 +28,7 @@ from llmfoundry.utils.config_utils import (EVAL_CONFIG_KEYS, EvalConfig, log_config, make_dataclass_and_log_config, - process_init_device, to_container, - to_list_container) + process_init_device) from 
llmfoundry.utils.registry_utils import import_file log = logging.getLogger(__name__) @@ -171,37 +170,18 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: logged_cfg, eval_config = make_dataclass_and_log_config( cfg, EvalConfig, EVAL_CONFIG_KEYS, icl_tasks_required=True) - model_configs = to_list_container(eval_config.models) - eval_gauntlet_config = to_container( - eval_config.eval_gauntlet) or eval_config.eval_gauntlet_str - assert eval_gauntlet_config is None or isinstance( - eval_gauntlet_config, dict - ) or isinstance( - eval_gauntlet_config, str - ), f'eval_gauntlet_config must be a dict or a string but is {type(eval_gauntlet_config)}, {eval_gauntlet_config=}' - - # the below line fixes a strange issue where the fsdp_config is a DictConfig rather than a Dict, - # despite the type hint being Dict[str, Any] and the `cfg` object being sent to `to_container`. - # I think it might be rewrapped in DictConfig during the `structured` call in `_make_eval_and_log_config`. - # this redundant check is necessary to avoid a pyright error. - fsdp_config = to_container(eval_config.fsdp_config) - assert isinstance( - fsdp_config, Dict - ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}' - fsdp_config = {str(k): v for k, v in fsdp_config.items() - } if fsdp_config else None # pyright fix + model_configs = eval_config.models + eval_gauntlet_config = eval_config.eval_gauntlet + + fsdp_config = eval_config.fsdp_config # Mandatory Evaluation Parameters - icl_tasks = to_container(eval_config.icl_tasks) or eval_config.icl_tasks_str - assert isinstance(icl_tasks, list) or isinstance( - icl_tasks, str - ), f'icl_tasks must be a list or a string but is {type(icl_tasks)}, {icl_tasks=}' - assert icl_tasks is not None, 'icl_tasks must be specified in the config' + icl_tasks = eval_config.icl_tasks or eval_config.icl_tasks_str + if icl_tasks is None: + raise ValueError('icl_tasks must be specified in the config') # Optional Evaluation Parameters with default values - eval_loader_config = to_container( - eval_config.eval_loader) if eval_config.eval_loader else to_container( - eval_config.eval_loaders) + eval_loader_config = eval_config.eval_loader or eval_config.eval_loaders default_run_name: str = os.environ.get('RUN_NAME', 'llm') run_name = eval_config.run_name if eval_config.run_name else default_run_name diff --git a/scripts/train/train.py b/scripts/train/train.py index 9f5a7f2b6a..cbc76c4f35 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -36,7 +36,6 @@ log_config, make_dataclass_and_log_config, pop_config, process_init_device, - to_dict_container, to_list_container, update_batch_size_info) from llmfoundry.utils.registry_utils import import_file @@ -199,22 +198,15 @@ def main(cfg: DictConfig) -> Trainer: dist.initialize_dist(get_device(None), timeout=dist_timeout) # Mandatory model training configs - model_config = to_dict_container(train_cfg.model) - train_loader_config = to_dict_container(train_cfg.train_loader) + model_config = train_cfg.model + train_loader_config = train_cfg.train_loader # Optional fsdp data, fine-tuning, and eval configs - fsdp_config: Optional[Dict[str, Any]] = to_dict_container( - train_cfg.fsdp_config) if train_cfg.fsdp_config is not None else None - - eval_loader_config = to_dict_container( - train_cfg.eval_loader - ) if train_cfg.eval_loader is not None else to_list_container( - train_cfg.eval_loaders) if train_cfg.eval_loaders is not None else None - icl_tasks_config = to_list_container( - 
train_cfg.icl_tasks) if train_cfg.icl_tasks is not None else None - eval_gauntlet_config = to_dict_container( - train_cfg.eval_gauntlet - ) if train_cfg.eval_gauntlet is not None else None + fsdp_config: Optional[Dict[str, Any]] = train_cfg.fsdp_config + + eval_loader_config = train_cfg.eval_loader if train_cfg.eval_loader is not None else train_cfg.eval_loaders + icl_tasks_config = train_cfg.icl_tasks + eval_gauntlet_config = train_cfg.eval_gauntlet # Optional parameters will be set to default values if not specified. default_run_name: str = os.environ.get('RUN_NAME', 'llm') @@ -292,8 +284,7 @@ def main(cfg: DictConfig) -> Trainer: # Profiling profiler: Optional[Profiler] = None - profiler_cfg = to_dict_container( - train_cfg.profiler) if train_cfg.profiler is not None else None + profiler_cfg = train_cfg.profiler if profiler_cfg: profiler_schedule_cfg: Dict = pop_config(profiler_cfg, 'schedule', @@ -312,8 +303,8 @@ def main(cfg: DictConfig) -> Trainer: trace_handlers=profiler_trace_handlers, schedule=profiler_schedule) - callback_configs = to_dict_container( - train_cfg.callbacks) if train_cfg.callbacks is not None else {} + callback_configs = train_cfg.callbacks + # Callbacks callbacks: List[Callback] = [ build_callback(str(name), callback_cfg, logged_cfg) @@ -322,8 +313,8 @@ def main(cfg: DictConfig) -> Trainer: use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) - algorithm_configs = to_dict_container( - train_cfg.algorithms) if train_cfg.algorithms is not None else {} + algorithm_configs = train_cfg.algorithms + # Algorithms algorithms = [ build_algorithm(str(name), algorithm_cfg) @@ -392,7 +383,7 @@ def main(cfg: DictConfig) -> Trainer: # Optimizer optimizer_name: str = train_cfg.optimizer.pop('name') - optimizer_cfg = to_dict_container(train_cfg.optimizer) + optimizer_cfg = train_cfg.optimizer optimizer = build_optimizer(model, optimizer_name, optimizer_cfg) # Now add the eval metrics @@ -410,9 +401,7 @@ def main(cfg: DictConfig) -> Trainer: mosaicml_logger.log_exception(e) raise e - compile_config = to_dict_container( - train_cfg.compile_config - ) if train_cfg.compile_config is not None else None + compile_config = train_cfg.compile_config # Build the Trainer log.info('Building trainer...') trainer = Trainer( From 4987145e4262df5586f3d92cdaded11523ecee4f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 17:12:21 +0000 Subject: [PATCH 177/201] wrap then unwrap --- llmfoundry/utils/config_utils.py | 9 ++++++--- tests/a_scripts/eval/test_eval_inputs.py | 7 ++++--- tests/a_scripts/train/test_train_inputs.py | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index beafcbb961..b4288c166d 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -6,12 +6,12 @@ import logging import math import warnings -from dataclasses import MISSING, dataclass, fields +from dataclasses import dataclass, fields from typing import (Any, Callable, Dict, List, Literal, Mapping, Optional, Set, Tuple, TypeVar, Union) from composer.utils import dist -from omegaconf import DictConfig, ListConfig, MissingMandatoryValue +from omegaconf import MISSING, DictConfig, ListConfig, MissingMandatoryValue from omegaconf import OmegaConf as om from llmfoundry.layers_registry import ffns_with_megablocks @@ -273,7 +273,10 @@ def make_dataclass_and_log_config( category=DeprecationWarning) unstructured_config['variables'][key] = unstructured_config.pop(key) - dataclass_config: T = 
dataclass_constructor(**unstructured_config) + dataclass_dict_config: DictConfig = om.structured( + dataclass_constructor(**unstructured_config)) + dataclass_config: T = dataclass_constructor( + **to_dict_container(dataclass_dict_config)) return logged_cfg, dataclass_config diff --git a/tests/a_scripts/eval/test_eval_inputs.py b/tests/a_scripts/eval/test_eval_inputs.py index 47757029bb..3593e89280 100644 --- a/tests/a_scripts/eval/test_eval_inputs.py +++ b/tests/a_scripts/eval/test_eval_inputs.py @@ -37,9 +37,10 @@ def test_mispelled_mandatory_params_fail(self, cfg: DictConfig) -> None: ] mandatory_configs = ['models', 'icl_tasks'] for p in mandatory_params + mandatory_configs: - with pytest.raises((omegaconf.errors.ConfigKeyError, - omegaconf.errors.InterpolationKeyError, - omegaconf.errors.MissingMandatoryValue)): + with pytest.raises( + (omegaconf.errors.ConfigKeyError, + omegaconf.errors.InterpolationKeyError, + omegaconf.errors.MissingMandatoryValue, TypeError)): cfg[p + '-mispelled'] = cfg.pop(p) main(cfg) cfg[p] = cfg.pop(p + '-mispelled') diff --git a/tests/a_scripts/train/test_train_inputs.py b/tests/a_scripts/train/test_train_inputs.py index 41475535ab..b0dee0f5fd 100644 --- a/tests/a_scripts/train/test_train_inputs.py +++ b/tests/a_scripts/train/test_train_inputs.py @@ -79,7 +79,8 @@ def test_missing_mandatory_parameters_fail(self, cfg: DictConfig) -> None: for param in mandatory_params: orig_param = cfg.pop(param) with pytest.raises( - (TypeError, NameError, omegaconf.errors.InterpolationKeyError)): + (TypeError, NameError, omegaconf.errors.InterpolationKeyError, + omegaconf.errors.MissingMandatoryValue)): main(cfg) cfg[param] = orig_param From b9c3cbf49ff03969224fba9075f2b8d442be5db7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 17:42:31 +0000 Subject: [PATCH 178/201] pyright --- llmfoundry/utils/config_utils.py | 3 +++ scripts/train/train.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index b4288c166d..2860b82059 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -275,6 +275,9 @@ def make_dataclass_and_log_config( dataclass_dict_config: DictConfig = om.structured( dataclass_constructor(**unstructured_config)) + + # Convert DictConfig to dict for dataclass constructor so that child + # configs are not DictConfigs dataclass_config: T = dataclass_constructor( **to_dict_container(dataclass_dict_config)) diff --git a/scripts/train/train.py b/scripts/train/train.py index cbc76c4f35..139f2ff7ff 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -303,7 +303,7 @@ def main(cfg: DictConfig) -> Trainer: trace_handlers=profiler_trace_handlers, schedule=profiler_schedule) - callback_configs = train_cfg.callbacks + callback_configs = train_cfg.callbacks or {} # Callbacks callbacks: List[Callback] = [ @@ -313,7 +313,7 @@ def main(cfg: DictConfig) -> Trainer: use_async_eval = any(isinstance(c, AsyncEval) for c in callbacks) - algorithm_configs = train_cfg.algorithms + algorithm_configs = train_cfg.algorithms or {} # Algorithms algorithms = [ From cbfec68f945d487513271f6539c2bef51d8a4a8d Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 17:45:55 +0000 Subject: [PATCH 179/201] error early on missing mandatory values --- llmfoundry/utils/config_utils.py | 4 ++++ scripts/train/train.py | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llmfoundry/utils/config_utils.py 
b/llmfoundry/utils/config_utils.py index 2860b82059..f460da8344 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -276,6 +276,10 @@ def make_dataclass_and_log_config( dataclass_dict_config: DictConfig = om.structured( dataclass_constructor(**unstructured_config)) + # Error on missing mandatory values: + for key in dataclass_fields: + _ = dataclass_dict_config[key] + # Convert DictConfig to dict for dataclass constructor so that child # configs are not DictConfigs dataclass_config: T = dataclass_constructor( diff --git a/scripts/train/train.py b/scripts/train/train.py index 139f2ff7ff..19693852a2 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -44,10 +44,6 @@ def validate_config(train_config: TrainConfig): """Validates compatible model and dataloader selection.""" - # Check for missing mandatory fields and throw error early. - for field in TRAIN_CONFIG_KEYS: - _ = getattr(train_config, field) - # Validate the rest of the config loaders = [train_config.train_loader] if train_config.eval_loaders is not None: From f2ed1d74fef12484a51d3da38308066277f2bf77 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Tue, 23 Apr 2024 19:51:12 +0000 Subject: [PATCH 180/201] remove unnecessory ignore --- llmfoundry/utils/builders.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index bf0b446357..9940921e4a 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -443,9 +443,7 @@ def build_tokenizer( int(1e30), ) - if not hasattr( - tokenizer, 'eos_token' - ) or tokenizer.eos_token is None: # type: ignore (sometime's it's not none but that's ok too) + if not hasattr(tokenizer, 'eos_token') or tokenizer.eos_token is None: raise ValueError( f'The tokenizer {tokenizer_name} must have an eos_token.') From c586978ac54c827c7333ccf08ddfa7eca9718ea7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 24 Apr 2024 02:17:41 +0000 Subject: [PATCH 181/201] update unit tests --- tests/utils/test_mlflow_logging.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/utils/test_mlflow_logging.py b/tests/utils/test_mlflow_logging.py index b8dd0becdf..81c1be5048 100644 --- a/tests/utils/test_mlflow_logging.py +++ b/tests/utils/test_mlflow_logging.py @@ -5,7 +5,6 @@ from unittest.mock import MagicMock, patch import pytest -from omegaconf import OmegaConf from llmfoundry.utils.config_utils import (_log_dataset_uri, _parse_source_dataset) @@ -16,7 +15,7 @@ def create_config(**kwargs: Any): """Helper function to create OmegaConf configurations.""" - return OmegaConf.create(kwargs) + return kwargs def test_parse_source_dataset_delta_table(): @@ -95,7 +94,7 @@ def test_log_dataset_uri(): def test_multiple_eval_datasets(): # Setup a configuration with multiple evaluation datasets - cfg = OmegaConf.create({ + cfg = { 'train_loader': { 'dataset': { 'hf_name': 'huggingface/train_dataset', @@ -110,7 +109,7 @@ def test_multiple_eval_datasets(): 'hf_name': 'huggingface/eval_dataset2', }, }] - }) + } expected_data_paths = [('hf', 'huggingface/train_dataset', 'train'), ('hf', 'huggingface/eval_dataset1', 'eval'), From fbe436ea0b9b062ff721344fb1275f55b6b923be Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 24 Apr 2024 17:41:03 +0000 Subject: [PATCH 182/201] update eval yamls --- scripts/eval/yamls/hf_8bit_eval.yaml | 3 ++- scripts/eval/yamls/hf_lora_eval.yml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/scripts/eval/yamls/hf_8bit_eval.yaml b/scripts/eval/yamls/hf_8bit_eval.yaml index 482c6d7da7..30da2e5ef3 100644 --- a/scripts/eval/yamls/hf_8bit_eval.yaml +++ b/scripts/eval/yamls/hf_8bit_eval.yaml @@ -1,9 +1,10 @@ variables: model_name_or_path: bigscience/bloom-1b7 + max_seq_len: 1024 seed: 1 precision: amp_fp16 -max_seq_len: 1024 +max_seq_len: ${variables.max_seq_len} models: - diff --git a/scripts/eval/yamls/hf_lora_eval.yml b/scripts/eval/yamls/hf_lora_eval.yml index f2bc637cac..e1e87968bc 100644 --- a/scripts/eval/yamls/hf_lora_eval.yml +++ b/scripts/eval/yamls/hf_lora_eval.yml @@ -3,10 +3,11 @@ variables: # If you are using a seperated lora weight, put it here: # lora weights must be compatible with the specified model lora_id_or_path: ybelkada/opt-350m-lora # Example lora weights for opt-350m + max_seq_len: 2048 seed: 1 precision: amp_fp16 -max_seq_len: 2048 +max_seq_len: ${variables.max_seq_len} models: - From 7c611f000f05985ccb87483b420e027b8d388b60 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 24 Apr 2024 17:27:30 -0400 Subject: [PATCH 183/201] Update train.py --- scripts/train/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index f5df0e00df..c926322edc 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -228,9 +228,9 @@ def main(cfg: DictConfig) -> Trainer: f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' ) logging.getLogger('llmfoundry').setLevel( - python_log_level.upper()) # Foundry module + train_cfg.python_log_level.upper()) # Foundry module logging.getLogger(__name__).setLevel( - python_log_level.upper()) # Train script + train_cfg.python_log_level.upper()) # Train script _initialize_gloo_and_nccl(dist_timeout=dist_timeout) From 96a620dc469f370c9276413983d53999592b7ba4 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 24 Apr 2024 21:32:13 +0000 Subject: [PATCH 184/201] make log level optional again --- llmfoundry/utils/config_utils.py | 2 +- scripts/eval/eval.py | 17 +++++++++-------- scripts/train/train.py | 23 ++++++++++++----------- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py index 905d99c79b..0a8dd76aa0 100644 --- a/llmfoundry/utils/config_utils.py +++ b/llmfoundry/utils/config_utils.py @@ -52,7 +52,7 @@ class EvalConfig: icl_tasks_str: Optional[str] = None # Logging parameters - python_log_level: str = 'debug' + python_log_level: Optional[str] = 'debug' loggers: Optional[Dict[str, Any]] = None log_config: bool = True diff --git a/scripts/eval/eval.py b/scripts/eval/eval.py index 8e6caf8b8f..a29148573b 100644 --- a/scripts/eval/eval.py +++ b/scripts/eval/eval.py @@ -188,14 +188,15 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]: reproducibility.seed_all(eval_config.seed) dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout) - logging.basicConfig( - # Example of format string - # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here - format= - f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' - ) - logging.getLogger('llmfoundry').setLevel( - eval_config.python_log_level.upper()) + if eval_config.python_log_level is not None: + logging.basicConfig( + # Example of format string + # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here + format= + f'%(asctime)s: 
rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' + ) + logging.getLogger('llmfoundry').setLevel( + eval_config.python_log_level.upper()) # Default argument values for evaluate_model eval_gauntlet_df = None diff --git a/scripts/train/train.py b/scripts/train/train.py index c926322edc..ac00cde5da 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -220,17 +220,18 @@ def main(cfg: DictConfig) -> Trainer: # Initialize pytorch distributed training process groups dist_timeout: Union[int, float] = train_cfg.dist_timeout - # Set logging level - logging.basicConfig( - # Example of format string - # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here - format= - f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' - ) - logging.getLogger('llmfoundry').setLevel( - train_cfg.python_log_level.upper()) # Foundry module - logging.getLogger(__name__).setLevel( - train_cfg.python_log_level.upper()) # Train script + if train_cfg.python_log_level is not None: + # Set logging level + logging.basicConfig( + # Example of format string + # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here + format= + f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' + ) + logging.getLogger('llmfoundry').setLevel( + train_cfg.python_log_level.upper()) # Foundry module + logging.getLogger(__name__).setLevel( + train_cfg.python_log_level.upper()) # Train script _initialize_gloo_and_nccl(dist_timeout=dist_timeout) From e6d092328fdddaf0741610dbaa8b3d33ae8ea7e6 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Thu, 25 Apr 2024 16:00:45 +0000 Subject: [PATCH 185/201] oopsie --- scripts/train/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 106c4c8bda..61aaac2400 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -245,7 +245,7 @@ def main(cfg: DictConfig) -> Trainer: fsdp_config = None # set logging level - if python_log_level is not None: + if train_cfg.python_log_level is not None: logging.basicConfig( # Example of format string # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here @@ -253,9 +253,9 @@ def main(cfg: DictConfig) -> Trainer: f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s' ) logging.getLogger('llmfoundry').setLevel( - python_log_level.upper()) # Foundry module + train_cfg.python_log_level.upper()) # Foundry module logging.getLogger(__name__).setLevel( - python_log_level.upper()) # Train script + train_cfg.python_log_level.upper()) # Train script # Initialize context init_context = process_init_device(model_config, fsdp_config) From f7cede68afa13aff1c21dc18e5be642257126afe Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 29 Apr 2024 19:20:16 +0000 Subject: [PATCH 186/201] use keywords for arg clarity --- scripts/train/train.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 872012aeab..d283c2bc33 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -332,7 +332,9 @@ def main(cfg: DictConfig) -> Trainer: # Callbacks callbacks: List[Callback] = [ - build_callback(str(name), callback_cfg, logged_cfg) + build_callback(name=str(name), + kwargs=callback_cfg, + train_config=logged_cfg) for name, callback_cfg in callback_configs.items() ] 
From e13c75a672b76c9c91948b63b7cc3c79ee9c8619 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 29 Apr 2024 19:29:56 +0000 Subject: [PATCH 187/201] use keywords for arg clarity --- llmfoundry/models/hf/hf_causal_lm.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 48af1da96c..a91057837f 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -87,7 +87,7 @@ def __init__( use_auth_token=use_auth_token, config_overrides=config_overrides, load_in_8bit=load_in_8bit, - pef_config=peft_config, + pretrained=pretrained, prepare_for_fsdp=True, ) @@ -157,6 +157,7 @@ def build_inner_model( use_auth_token: bool, config_overrides: Dict[str, Any], load_in_8bit: bool, + pretrained: bool, prepare_for_fsdp: bool = False, ) -> Union[PreTrainedModel, 'PeftModel']: """Builds the inner model for the ComposerHFCausalLM. From a1b9adf7cc32ffb5c8a62769ed8e22f79450f917 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 29 Apr 2024 19:30:12 +0000 Subject: [PATCH 188/201] style --- llmfoundry/models/hf/hf_causal_lm.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index a91057837f..db02ae1d83 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -339,6 +339,7 @@ def _autoset_attn_implementation_monkeypatch( ComposerHFCausalLM.prepare_inner_model(model, init_device) return model + @staticmethod def _get_peft_config(peft_config_dict: Dict[str, Any]) -> 'PeftConfig': if peft_installed: From 459fce50110abfd34634197352cb9c10fe7c38d7 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 29 Apr 2024 19:30:32 +0000 Subject: [PATCH 189/201] style --- llmfoundry/models/hf/hf_causal_lm.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index db02ae1d83..a91057837f 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -339,7 +339,6 @@ def _autoset_attn_implementation_monkeypatch( ComposerHFCausalLM.prepare_inner_model(model, init_device) return model - @staticmethod def _get_peft_config(peft_config_dict: Dict[str, Any]) -> 'PeftConfig': if peft_installed: From 27c81a50521e0fd344d6c215c762cebb84e55976 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 29 Apr 2024 19:41:07 +0000 Subject: [PATCH 190/201] dist timeout --- scripts/train/train.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index d283c2bc33..9a2b92b9f2 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -182,7 +182,7 @@ def main(cfg: DictConfig) -> Trainer: logging.getLogger(__name__).setLevel( train_cfg.python_log_level.upper()) # Train script - _initialize_dist_with_barrier(dist_timeout=dist_timeout) + _initialize_dist_with_barrier(dist_timeout=train_cfg.dist_timeout) # Filter deprecation warning from torch internal usage warnings.filterwarnings( @@ -218,10 +218,6 @@ def main(cfg: DictConfig) -> Trainer: seed: int = train_cfg.seed reproducibility.seed_all(seed) - # Initialize pytorch distributed training process groups - dist_timeout: Union[int, float] = train_cfg.dist_timeout - _initialize_dist_with_barrier(dist_timeout=dist_timeout) - # Mandatory model training configs model_config = train_cfg.model train_loader_config = train_cfg.train_loader @@ -465,7 +461,7 @@ def main(cfg: DictConfig) -> 
Trainer: save_ignore_keys=train_cfg.save_ignore_keys, autoresume=train_cfg.autoresume, python_log_level=train_cfg.python_log_level, - dist_timeout=dist_timeout, + dist_timeout=train_cfg.dist_timeout, profiler=profiler, compile_config=compile_config, ) From bcaad4bcc0e09c31f77313fdd74e1bb9665c399f Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 17:55:29 +0000 Subject: [PATCH 191/201] resolve deeper conflict issues --- llmfoundry/data/finetuning/dataloader.py | 6 ++-- llmfoundry/data/text_data.py | 17 ++++++++-- llmfoundry/data/utils.py | 40 +++++++++++++----------- llmfoundry/registry.py | 11 ++++--- 4 files changed, 44 insertions(+), 30 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 331a87cc3c..9305bc4f94 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -176,7 +176,7 @@ def build_finetuning_dataloader( registry=registry.dataset_replication_validators, partial_function=False, kwargs={ - 'cfg': dataloader_cfg, + 'cfg': dataset_cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size, }, @@ -334,7 +334,7 @@ def build_finetuning_dataloader( partial_function=False, kwargs={ 'dl': dl, - 'dataset_cfg': dataset, + 'dataset_cfg': dataset_cfg, }, ) @@ -575,7 +575,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: return finetune_dir -def _build_collate_fn( +def build_collate_fn( dataloader_cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: int, diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index e07967ba38..0634b87642 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -324,13 +324,24 @@ def build_text_dataloader( **dataset_config_subset_for_streaming_text_dataset, ) + dataloader_cfg = { + 'name': 'text', + 'dataset': dataset_cfg, + 'drop_last': drop_last, + 'num_workers': num_workers, + 'pin_memory': pin_memory, + 'prefetch_factor': prefetch_factor, + 'persistent_workers': persistent_workers, + 'timeout': timeout, + } + collate_fn, dataloader_batch_size = construct_from_registry( name='text_collator', registry=registry.collators, partial_function=False, kwargs={ - 'cfg': dataset_cfg, - 'tokenizer': dataset.tokenizer, + 'cfg': dataloader_cfg, + 'tokenizer': tokenizer, 'dataset_batch_size': dataset_batch_size, }, ) @@ -353,7 +364,7 @@ def build_text_dataloader( partial_function=False, kwargs={ 'dl': dl, - 'dataset_cfg': cfg.dataset, + 'dataset_cfg': dataset_cfg, }, ) diff --git a/llmfoundry/data/utils.py b/llmfoundry/data/utils.py index 96847c5409..e222aa1d9a 100644 --- a/llmfoundry/data/utils.py +++ b/llmfoundry/data/utils.py @@ -2,13 +2,12 @@ # SPDX-License-Identifier: Apache-2.0 import logging -from typing import Callable, Iterable, Mapping, Tuple, Union +from typing import Any, Callable, Dict, Iterable, Mapping, Tuple, Union import torch import transformers from composer.core.data_spec import DataSpec from composer.core.types import Batch -from omegaconf import DictConfig from torch.utils.data import DataLoader as TorchDataloader from transformers import PreTrainedTokenizerBase @@ -20,9 +19,12 @@ log = logging.getLogger(__name__) -def _validate_cfg(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase): - eos_token_id = cfg.dataset.get('eos_token_id', None) - bos_token_id = cfg.dataset.get('bos_token_id', None) +def _validate_cfg( + dataset_cfg: Dict[str, Any], + tokenizer: PreTrainedTokenizerBase, +): + eos_token_id = 
dataset_cfg.get('eos_token_id', None) + bos_token_id = dataset_cfg.get('bos_token_id', None) if eos_token_id is None and bos_token_id is None and ( hasattr(tokenizer, 'eos_token_id') or @@ -35,7 +37,7 @@ def _validate_cfg(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase): tokenizer_eos_token_id = getattr(tokenizer, 'eos_token_id', None) if eos_token_id is not None and eos_token_id != tokenizer_eos_token_id: eos_mismatch_str = f'Provided {eos_token_id=} does not match the eos_token_id of the tokenizer={tokenizer_eos_token_id}.' - if cfg.dataset.pop('override_eos_token_id_mismatch_error', False): + if dataset_cfg.pop('override_eos_token_id_mismatch_error', False): log.warning(eos_mismatch_str) else: raise ValueError( @@ -46,7 +48,7 @@ def _validate_cfg(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase): tokenizer_bos_token_id = getattr(tokenizer, 'bos_token_id', None) if bos_token_id is not None and bos_token_id != tokenizer_bos_token_id: bos_mismatch_str = f'Provided {bos_token_id=} does not match the bos_token_id of the tokenizer={tokenizer_bos_token_id}.' - if cfg.dataset.pop('override_bos_token_id_mismatch_error', False): + if dataset_cfg.pop('override_bos_token_id_mismatch_error', False): log.warning(bos_mismatch_str) else: raise ValueError( @@ -54,20 +56,19 @@ def _validate_cfg(cfg: DictConfig, tokenizer: PreTrainedTokenizerBase): ' To override this error, set the override_bos_token_id_mismatch_error flag to True in the dataset config section of the YAML.', ) - max_seq_len = cfg.dataset.get('max_seq_len') + max_seq_len = dataset_cfg.get('max_seq_len') if max_seq_len is not None: if max_seq_len != int(max_seq_len): raise ValueError('max_seq_len must be an integer') - cfg.dataset['max_seq_len'] = int(max_seq_len) + dataset_cfg['max_seq_len'] = int(max_seq_len) def validate_ds_replication( - cfg: DictConfig, + dataset_cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, device_batch_size: Union[int, float], ) -> Tuple[int, int]: - _validate_cfg(cfg, tokenizer) - dataset_cfg = cfg.dataset + _validate_cfg(dataset_cfg, tokenizer) if (dataset_cfg.get('seq_parallel_replication', 1) or 1) > 1: raise NotImplementedError('Sequence parallelism is not supported.') if not isinstance(device_batch_size, int): @@ -77,7 +78,7 @@ def validate_ds_replication( def get_data_spec( dl: Union[Iterable, TorchDataloader], - dataset_cfg: DictConfig, + dataset_cfg: Dict[str, Any], ) -> DataSpec: del dataset_cfg token_counting_func = get_tokens_per_batch_func() @@ -134,14 +135,15 @@ def get_num_tokens_in_batch(batch: Batch) -> int: def get_text_collator( - cfg: DictConfig, + dataloader_cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, dataset_batch_size: int = -1, ) -> Tuple[Union[transformers.DataCollatorForLanguageModeling, ConcatenatedSequenceCollatorWrapper], int]: - eos_token_id = cfg.dataset.get('eos_token_id', None) - bos_token_id = cfg.dataset.get('bos_token_id', None) - mlm_probability = cfg.dataset.pop('mlm_probability', None) + dataset_cfg = dataloader_cfg.get('dataset') + eos_token_id = dataset_cfg.get('eos_token_id', None) + bos_token_id = dataset_cfg.get('bos_token_id', None) + mlm_probability = dataset_cfg.pop('mlm_probability', None) collate_fn = transformers.DataCollatorForLanguageModeling( tokenizer=tokenizer, mlm=mlm_probability is not None, @@ -160,8 +162,8 @@ def get_text_collator( def get_finetuning_collator( - cfg: DictConfig, + dataloader_cfg: Dict[str, Any], tokenizer: PreTrainedTokenizerBase, dataset_batch_size: int, ) -> Tuple[Union[Seq2SeqFinetuningCollator, 
BinPackCollator], int]: - return build_collate_fn(cfg, tokenizer, dataset_batch_size) + return build_collate_fn(dataloader_cfg, tokenizer, dataset_batch_size) diff --git a/llmfoundry/registry.py b/llmfoundry/registry.py index cc4a053a36..0c8e64b759 100644 --- a/llmfoundry/registry.py +++ b/llmfoundry/registry.py @@ -1,6 +1,6 @@ # Copyright 2024 MosaicML LLM Foundry authors # SPDX-License-Identifier: Apache-2.0 -from typing import Any, Callable, Iterable, Tuple, Type, Union +from typing import Any, Callable, Dict, Iterable, Tuple, Type, Union from composer.core import Algorithm, Callback, DataSpec from composer.loggers import LoggerDestination @@ -9,6 +9,7 @@ from torch.optim import Optimizer from torch.utils.data import DataLoader as TorchDataloader from torchmetrics import Metric +from transformers import PreTrainedTokenizerBase from llmfoundry.interfaces import CallbackWithConfig from llmfoundry.layers_registry import ( @@ -152,8 +153,8 @@ 'llmfoundry', 'dataset_replication_validators', generic_type=Callable[ - [DictConfig, PreTrainedTokenizerBase, Union[int, float]], Tuple[int, - int]], + [Dict[str, Any], PreTrainedTokenizerBase, Union[int, float]], + Tuple[int, int]], entry_points=True, description=_dataset_replication_validators_description, ) @@ -171,7 +172,7 @@ collators = create_registry( 'llmfoundry', 'collators', - generic_type=Callable[[DictConfig, PreTrainedTokenizerBase, int], + generic_type=Callable[[Dict[str, Any], PreTrainedTokenizerBase, int], Tuple[Any, int]], entry_points=True, description=_collators_description, @@ -188,7 +189,7 @@ data_specs = create_registry( 'llmfoundry', 'data_specs', - generic_type=Callable[[Union[Iterable, TorchDataloader], DictConfig], + generic_type=Callable[[Union[Iterable, TorchDataloader], Dict[str, Any]], DataSpec], entry_points=True, description=_data_specs_description, From 108268d7c01533d85a918928dc750109ea3bddac Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 19:42:17 +0000 Subject: [PATCH 192/201] fix train.py --- scripts/train/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 5fba296998..689e270ac9 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -150,7 +150,7 @@ def validate_config(train_config: TrainConfig): f'MoEs with expert parallelism (moe_world_size {moe_world_size} > 1) require `use_orig_params=True`.', ) - attn_config = cfg.model.get('attn_config', None) + attn_config = train_config.model.get('attn_config', None) if attn_config is not None: seq_parallel_world_size = attn_config.get( 'seq_parallel_world_size', From 6ab9bc29d3099ba1d5e12b6bdb824e50e6f732ed Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 19:51:22 +0000 Subject: [PATCH 193/201] fix registry --- llmfoundry/data/finetuning/dataloader.py | 4 ++-- llmfoundry/data/text_data.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 9305bc4f94..c11e252f00 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -176,7 +176,7 @@ def build_finetuning_dataloader( registry=registry.dataset_replication_validators, partial_function=False, kwargs={ - 'cfg': dataset_cfg, + 'dataset_cfg': dataset_cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size, }, @@ -187,7 +187,7 @@ def build_finetuning_dataloader( registry=registry.collators, partial_function=False, kwargs={ - 'cfg': 
dataloader_cfg, + 'dataloader_cfg': dataloader_cfg, 'tokenizer': tokenizer, 'dataset_batch_size': dataset_batch_size, }, diff --git a/llmfoundry/data/text_data.py b/llmfoundry/data/text_data.py index 0634b87642..60b81cd145 100644 --- a/llmfoundry/data/text_data.py +++ b/llmfoundry/data/text_data.py @@ -295,7 +295,7 @@ def build_text_dataloader( registry=registry.dataset_replication_validators, partial_function=False, kwargs={ - 'cfg': dataset_cfg, + 'dataset_cfg': dataset_cfg, 'tokenizer': tokenizer, 'device_batch_size': device_batch_size, }, @@ -340,7 +340,7 @@ def build_text_dataloader( registry=registry.collators, partial_function=False, kwargs={ - 'cfg': dataloader_cfg, + 'dataloader_cfg': dataloader_cfg, 'tokenizer': tokenizer, 'dataset_batch_size': dataset_batch_size, }, From 61e30caede853cc77aa2e6325bb1667718e35016 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 20:02:50 +0000 Subject: [PATCH 194/201] fix dataloader --- llmfoundry/data/finetuning/dataloader.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index c11e252f00..e689860534 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -286,10 +286,6 @@ def build_finetuning_dataloader( # Ensure dataset is large enough. if drop_last: - world_size = dist.get_world_size() - - # Ensure dataset is large enough. - if cfg.drop_last: world_size = dist.get_world_size() // replication_factor minimum_dataset_size = world_size * dataloader_batch_size if hasattr(streaming_dataset, '__len__'): @@ -303,6 +299,7 @@ def build_finetuning_dataloader( full_dataset_size=full_dataset_size, minimum_dataset_size=minimum_dataset_size, ) + # Initialize sampler. 
sampler = dist.get_sampler( streaming_dataset, From b3cd8cee0b8803557beeba9e1d5e117a61c85543 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 20:04:25 +0000 Subject: [PATCH 195/201] fix train II --- scripts/train/train.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/scripts/train/train.py b/scripts/train/train.py index 689e270ac9..e8f5b8220a 100644 --- a/scripts/train/train.py +++ b/scripts/train/train.py @@ -299,21 +299,6 @@ def main(cfg: DictConfig) -> Trainer: ) fsdp_config = None - # set logging level - if train_cfg.python_log_level is not None: - logging.basicConfig( - # Example of format string - # 2022-06-29 11:22:26,152: rank0[822018][MainThread]: INFO: Message here - format= - f'%(asctime)s: rank{dist.get_global_rank()}[%(process)d][%(threadName)s]: %(levelname)s: %(name)s: %(message)s', - ) - logging.getLogger('llmfoundry').setLevel( - train_cfg.python_log_level.upper(), - ) # Foundry module - logging.getLogger(__name__).setLevel( - train_cfg.python_log_level.upper(), - ) # Train script - # Initialize context init_context = process_init_device(model_config, fsdp_config) logged_cfg.update({'fsdp_config': fsdp_config}, merge=True) From 760abb48426d4e90ace4fda9ab5f2b9a94519a17 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 20:16:32 +0000 Subject: [PATCH 196/201] fix dataloader and utils --- llmfoundry/data/finetuning/dataloader.py | 2 +- llmfoundry/data/utils.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index e689860534..df61d36a91 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -231,7 +231,7 @@ def build_finetuning_dataloader( sampling_method=dataset_cfg.get('sampling_method', 'balanced'), sampling_granularity=dataset_cfg.get('sampling_granularity', 1), batching_method=dataset_cfg.get('batching_method', 'random'), - max_seq_len=dataset_cfg.max_seq_len, + max_seq_len=dataset_cfg['max_seq_len'], allow_unsafe_types=dataset_cfg.get('allow_unsafe_types', False), replication=replication_factor, ) diff --git a/llmfoundry/data/utils.py b/llmfoundry/data/utils.py index 5b94fad7f3..a5fe3a1022 100644 --- a/llmfoundry/data/utils.py +++ b/llmfoundry/data/utils.py @@ -141,6 +141,7 @@ def get_text_collator( ) -> Tuple[Union[transformers.DataCollatorForLanguageModeling, ConcatenatedSequenceCollatorWrapper], int]: dataset_cfg = dataloader_cfg.get('dataset') + assert isinstance(dataset_cfg, dict) eos_token_id = dataset_cfg.get('eos_token_id', None) bos_token_id = dataset_cfg.get('bos_token_id', None) mlm_probability = dataset_cfg.pop('mlm_probability', None) From c896437a449b8b34bf8fc721033823b0e0451ee2 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 20:35:07 +0000 Subject: [PATCH 197/201] fix dictconfig --- tests/data/test_dataloader.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index a1a9248e90..7852f32c64 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -40,6 +40,7 @@ ) from llmfoundry.data.utils import get_tokens_per_batch_func from llmfoundry.utils.builders import build_tokenizer +from llmfoundry.utils.config_utils import to_dict_container # yapf: disable from llmfoundry.utils.exceptions import ( ConsecutiveRepeatedChatRolesError, @@ -259,8 +260,9 @@ def test_correct_padding( # Dataloaders test_cfg.eval_loader.pop('name') + test_cfg = 
to_dict_container(test_cfg) eval_loader = build_text_dataloader( - **test_cfg.eval_loader, + **test_cfg['eval_loader'], tokenizer=tokenizer, device_batch_size=batch_size, ).dataloader From 682d2bfffdbf8f7795387e2e383841fe32b29bd5 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 20:45:24 +0000 Subject: [PATCH 198/201] skill issue --- tests/data/test_dataloader.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 7852f32c64..0da518d2e7 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -260,6 +260,7 @@ def test_correct_padding( # Dataloaders test_cfg.eval_loader.pop('name') + assert isinstance(test_cfg, DictConfig) test_cfg = to_dict_container(test_cfg) eval_loader = build_text_dataloader( **test_cfg['eval_loader'], From 9d229b9be0a660bcfccae4cdb0d8eae8b5ddf1fa Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Fri, 3 May 2024 21:04:06 +0000 Subject: [PATCH 199/201] add new keys --- llmfoundry/data/finetuning/dataloader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index df61d36a91..af5eccbc77 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -389,6 +389,8 @@ def _validate_config( 'replication', 'packing_ratio', 'allow_pad_trimming', + 'seq_parallel_replication', + 'auto_packing_replication', } if not set(kwargs.keys()).issubset(allowed_additional_kwargs): raise ValueError( From eccf8499cc2b2f78e16a94876961dfdda23360ed Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Mon, 6 May 2024 23:30:06 +0000 Subject: [PATCH 200/201] remove pop_config --- llmfoundry/models/hf/hf_causal_lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/models/hf/hf_causal_lm.py b/llmfoundry/models/hf/hf_causal_lm.py index 66293f668c..5f3a53ed18 100644 --- a/llmfoundry/models/hf/hf_causal_lm.py +++ b/llmfoundry/models/hf/hf_causal_lm.py @@ -36,7 +36,7 @@ from llmfoundry.models.hf.model_wrapper import HuggingFaceModelWithFSDP from llmfoundry.models.layers.attention import is_flash_v2_installed from llmfoundry.models.utils import init_empty_weights -from llmfoundry.utils.config_utils import get_hf_config_value, pop_config +from llmfoundry.utils.config_utils import get_hf_config_value if TYPE_CHECKING: from peft import PeftConfig, PeftModel From 8fb5e4c5c26e3e8fd28dd49cbe55d421b4e0fc77 Mon Sep 17 00:00:00 2001 From: Milo Cress Date: Wed, 8 May 2024 02:14:20 +0000 Subject: [PATCH 201/201] fix --- llmfoundry/utils/builders.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/utils/builders.py b/llmfoundry/utils/builders.py index 4b4badfb36..73eb026d98 100644 --- a/llmfoundry/utils/builders.py +++ b/llmfoundry/utils/builders.py @@ -536,8 +536,8 @@ def _validate_cfg(icl_cfg: Dict[str, Any]): icl_cfg['metric_names'] = [ 'InContextLearningMultipleChoiceAccuracy', ] - elif icl_cfg.icl_task_type == 'generation_task_with_answers': - icl_cfg.metric_names = [ + elif icl_cfg['icl_task_type'] == 'generation_task_with_answers': + icl_cfg['metric_names'] = [ 'InContextLearningGenerationExactMatchAccuracy', ] else: