Commit

fixed

milocress committed Apr 20, 2024
1 parent b9db81f commit e87b9b2

Showing 4 changed files with 51 additions and 25 deletions.
13 changes: 7 additions & 6 deletions llmfoundry/utils/builders.py
@@ -500,7 +500,8 @@ def _validate_cfg(icl_cfg: Dict[str, Any]):
                     'InContextLearningMultipleChoiceAccuracy'
                 ]
             elif icl_cfg[
-                    'icl_task_type'] == 'generation_task_with_answers' or icl_cfg.icl_task_type == 'question_answering':
+                    'icl_task_type'] == 'generation_task_with_answers' or icl_cfg[
+                        'icl_task_type'] == 'question_answering':
                 if icl_cfg['icl_task_type'] == 'question_answering':
                     warnings.warn(
                         VersionedDeprecationWarning(
@@ -513,7 +514,7 @@ def _validate_cfg(icl_cfg: Dict[str, Any]):
                 icl_cfg['metric_names'] = ['InContextLearningCodeEvalAccuracy']
             else:
                 raise ValueError(
-                    f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg.icl_task_type}.'
+                    f'No metric_names defined, unable to build default metrics for icl_task_type={icl_cfg["icl_task_type"]}.'
                 )

         if 'prompt_string' not in icl_cfg:
@@ -539,7 +540,8 @@ def _validate_cfg(icl_cfg: Dict[str, Any]):
                     'Please use generation_kwargs.num_beams instead.')

     for icl_cfg in icl_tasks_list:
-        assert isinstance(icl_cfg, dict)
+        assert isinstance(
+            icl_cfg, dict), f'Expected dict, got {type(icl_cfg)}, {icl_cfg=}'
         _validate_cfg(icl_cfg)
         for num_fewshot in list(icl_cfg['num_fewshot']):
             if tokenizer.pad_token_id is None:
@@ -585,9 +587,8 @@ def _validate_cfg(icl_cfg: Dict[str, Any]):
                 generation_kwargs=icl_cfg.get('generation_kwargs', {}),
                 early_stopping_criteria=early_stopping_criteria,
                 do_normalization=icl_cfg.get('do_normalization', True))
-            if hasattr(icl_cfg, 'has_categories'
-                      ) and icl_cfg['has_categories'] and isinstance(
-                          dataloaders, dict):
+            if 'has_categories' in icl_cfg and icl_cfg[
+                    'has_categories'] and isinstance(dataloaders, dict):
                 for category in dataloaders.keys():
                     logger_keys.extend([
                         f'metrics/{label}/{category}/{m}' for m in metric_names
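
Why the membership check above changed (a minimal sketch of my own, not part of the diff): once icl_cfg is a plain dict, hasattr inspects attributes rather than keys and always returns False, so `'has_categories' in icl_cfg` is the check that actually works.

    # Key membership vs. attribute lookup on a plain dict.
    icl_cfg = {'has_categories': True}

    print(hasattr(icl_cfg, 'has_categories'))    # False: dicts expose keys, not attributes
    print('has_categories' in icl_cfg)           # True: the check the new code uses
    print(icl_cfg.get('has_categories', False))  # True: equivalent, with an explicit default
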
17 changes: 16 additions & 1 deletion llmfoundry/utils/config_utils.py
@@ -5,7 +5,7 @@
 import logging
 import math
 import warnings
-from typing import Any, Dict, Literal, Mapping, Optional, Tuple, Union
+from typing import Any, Dict, List, Literal, Mapping, Optional, Tuple, Union

 from composer.utils import dist
 from omegaconf import DictConfig, ListConfig
@@ -39,6 +39,21 @@ def forbid_config_key(cfg_dict: Dict[str, Any], key: str):
     )


+def to_container_recursive(
+        cfg: Union[DictConfig, ListConfig]
+) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+
+    def rh(x: Any) -> Any:  # recursive helper
+        if isinstance(x, DictConfig):
+            return {k: rh(v) for k, v in x.items()}
+        elif isinstance(x, ListConfig):
+            return [rh(v) for v in x]
+        else:
+            return x
+
+    return rh(cfg)
+
+
 def pop_config(cfg: Union[Dict[str, Any], DictConfig],
                key: str,
                must_exist: bool = True,
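
A small usage sketch for the new helper (illustrative values of my own, not from the commit): nested DictConfig/ListConfig values are unwrapped into plain dict/list objects all the way down, which is what the isinstance and key-style checks elsewhere in this commit rely on.

    from omegaconf import OmegaConf

    from llmfoundry.utils.config_utils import to_container_recursive

    cfg = OmegaConf.create({
        'fsdp_config': {'sharding_strategy': 'FULL_SHARD'},
        'icl_tasks': [{'label': 'lambada', 'num_fewshot': [0]}],
    })

    plain = to_container_recursive(cfg)
    print(type(plain))                 # <class 'dict'>
    print(type(plain['fsdp_config']))  # <class 'dict'>, not DictConfig
    print(type(plain['icl_tasks']))    # <class 'list'>, not ListConfig
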
39 changes: 25 additions & 14 deletions scripts/eval/eval.py
@@ -16,7 +16,7 @@
 from composer.loggers.logger_destination import LoggerDestination
 from composer.trainer import Trainer
 from composer.utils import dist, get_device, reproducibility
-from omegaconf import MISSING, DictConfig, ListConfig
+from omegaconf import MISSING, DictConfig
 from omegaconf import OmegaConf as om
 from rich.traceback import install

@@ -29,7 +29,8 @@
                                        build_evaluators, build_logger,
                                        build_tokenizer)
 from llmfoundry.utils.config_utils import (forbid_config_key, log_config,
-                                           process_init_device)
+                                           process_init_device,
+                                           to_container_recursive)
 from llmfoundry.utils.registry_utils import import_file

 log = logging.getLogger(__name__)
@@ -117,9 +118,9 @@ def evaluate_model(
         eval_gauntlet_df = pd.DataFrame(
             columns=['model_name'] +
             [avg for avg in eval_gauntlet_callback.averages] +
-            [t.name for t in eval_gauntlet_callback.categories])
+            [t['name'] for t in eval_gauntlet_callback.categories])

-    if model['name'] == 'mpt_causal_lm' and load_path is None:
+    if name == 'mpt_causal_lm' and load_path is None:
         raise ValueError(
             'MPT causal LMs require a load_path to the checkpoint for model evaluation.'
             +
@@ -263,33 +264,43 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
         import_file(code_path)

     model_configs = eval_config.models
-    eval_gauntlet_config = eval_config.eval_gauntlet if eval_config.eval_gauntlet else eval_config.eval_gauntlet_str
+    eval_gauntlet_config = to_container_recursive(
+        eval_config.eval_gauntlet) or eval_config.eval_gauntlet_str
+    assert eval_gauntlet_config is None or isinstance(
+        eval_gauntlet_config, dict
+    ) or isinstance(
+        eval_gauntlet_config, str
+    ), f'eval_gauntlet_config must be a dict or a string but is {type(eval_gauntlet_config)}, {eval_gauntlet_config=}'

     # the below line fixes a strange issue where the fsdp_config is a DictConfig rather than a Dict,
     # despite the type hint being Dict[str, Any] and the `cfg` object being sent to `to_container`.
     # I think it might be rewrapped in DictConfig during the `structured` call in `_make_eval_and_log_config`.
     # this redundant check is necessary to avoid a pyright error.
-    fsdp_config = om.to_container(
-        eval_config.fsdp_config) if eval_config.fsdp_config else None
+    fsdp_config = to_container_recursive(eval_config.fsdp_config)
     assert isinstance(
         fsdp_config, Dict
     ) or fsdp_config is None, f'fsdp_config must be a Dict or None but is {type(fsdp_config)}'
     fsdp_config = {str(k): v for k, v in fsdp_config.items()
                   } if fsdp_config else None  # pyright fix

     # Mandatory Evaluation Parameters
-    icl_tasks: Union[
-        ListConfig, str,
-        None] = eval_config.icl_tasks if eval_config.icl_tasks else eval_config.icl_tasks_str
+    icl_tasks = to_container_recursive(
+        eval_config.icl_tasks) or eval_config.icl_tasks_str
+    assert isinstance(icl_tasks, list) or isinstance(
+        icl_tasks, str
+    ), f'icl_tasks must be a list or a string but is {type(icl_tasks)}, {icl_tasks=}'
     assert icl_tasks is not None, 'icl_tasks must be specified in the config'

     # Optional Evaluation Parameters with default values
-    eval_loader_config = eval_config.eval_loader if eval_config.eval_loader else eval_config.eval_loaders
+    eval_loader_config = to_container_recursive(
+        eval_config.eval_loader
+    ) if eval_config.eval_loader else to_container_recursive(
+        eval_config.eval_loaders)
     default_run_name: str = os.environ.get('RUN_NAME', 'llm')
     run_name = eval_config.run_name if eval_config.run_name else default_run_name

     reproducibility.seed_all(eval_config.seed)
-    dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout)
+    # dist.initialize_dist(get_device(None), timeout=eval_config.dist_timeout)

     logging.basicConfig(
         # Example of format string
@@ -356,8 +367,8 @@ def main(cfg: DictConfig) -> Tuple[List[Trainer], pd.DataFrame]:
         benchmark_to_taxonomy = {}
         if eval_gauntlet_callback is not None:
             for t in eval_gauntlet_callback.categories:
-                for b in t.benchmarks:
-                    benchmark_to_taxonomy[b.name] = t.name
+                for b in t['benchmarks']:
+                    benchmark_to_taxonomy[b['name']] = t['name']

         assert 'model_name' in model_cfg, 'model_name must be specified in model config'
         model_results = calculate_markdown_results(logger_keys, trainer,
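
One detail of the `to_container_recursive(...) or fallback` pattern used above, sketched with illustrative values (not from the commit): inputs that are not OmegaConf containers, such as None, pass through unchanged, so the `or` falls back to the string variant of a setting when the structured form is absent.

    from omegaconf import OmegaConf

    from llmfoundry.utils.config_utils import to_container_recursive

    # Illustrative values; the real ones come from the eval config.
    eval_gauntlet = None
    eval_gauntlet_str = 'eval/yamls/eval_gauntlet.yaml'

    # None passes straight through, so the `or` picks the string form.
    eval_gauntlet_config = to_container_recursive(eval_gauntlet) or eval_gauntlet_str
    print(eval_gauntlet_config)  # eval/yamls/eval_gauntlet.yaml

    # With the structured form present, a plain dict comes back instead.
    gauntlet_cfg = OmegaConf.create({'weighting': 'EQUAL', 'categories': []})
    print(to_container_recursive(gauntlet_cfg))  # {'weighting': 'EQUAL', 'categories': []}
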
7 changes: 3 additions & 4 deletions tests/a_scripts/eval/test_eval.py
@@ -15,8 +15,7 @@
 from llmfoundry.utils.builders import build_composer_model
 from llmfoundry.utils.config_utils import to_str_dict
 from scripts.eval.eval import main  # noqa: E402
-from tests.data_utils import (create_arxiv_dataset, create_c4_dataset_xxsmall,
-                              gpt_tiny_cfg)
+from tests.data_utils import create_c4_dataset_xxsmall, gpt_tiny_cfg


 @pytest.fixture(autouse=True)
@@ -40,6 +39,7 @@ def eval_cfg(foundry_dir: str) -> Union[om.ListConfig, om.DictConfig]:

 @pytest.fixture()
 def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]):
+    eval_cfg = copy.deepcopy(eval_cfg)  # copy config before modifying
     model_cfg = eval_cfg.models[0]
     # set device to cpu
     device = 'cpu'
@@ -65,6 +65,7 @@ def mock_saved_model_path(eval_cfg: Union[om.ListConfig, om.DictConfig]):

 def test_icl_eval(eval_cfg: Union[om.ListConfig, om.DictConfig], capfd: Any,
                   mock_saved_model_path: Any):
+    eval_cfg = copy.deepcopy(eval_cfg)
     eval_cfg.models[0].load_path = mock_saved_model_path
     assert isinstance(eval_cfg, om.DictConfig)
     main(eval_cfg)
@@ -113,8 +114,6 @@ def test_loader_eval(capfd: Any, mock_saved_model_path: Any,
     first_eval_loader.label = 'c4'
     # Create second eval dataloader using the arxiv dataset.
     second_eval_loader = copy.deepcopy(first_eval_loader)
-    arxiv_dataset_name = create_arxiv_dataset(tmp_path)
-    second_eval_loader.data_local = arxiv_dataset_name
     second_eval_loader.label = 'arxiv'
     test_cfg.eval_loader = om.OmegaConf.create(
         [first_eval_loader, second_eval_loader])
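
The copy.deepcopy additions in these tests keep one test's edits from leaking into another through a shared config object. A minimal sketch of the failure mode they guard against (assuming the fixture value is reused; names and paths are illustrative):

    import copy

    from omegaconf import OmegaConf

    shared_cfg = OmegaConf.create({'models': [{'load_path': None}]})

    # Without a copy, the mutation is visible to every later user of shared_cfg.
    cfg = shared_cfg
    cfg.models[0].load_path = '/tmp/checkpoint.pt'
    print(shared_cfg.models[0].load_path)  # /tmp/checkpoint.pt

    # With a deepcopy, the original stays untouched.
    shared_cfg = OmegaConf.create({'models': [{'load_path': None}]})
    cfg = copy.deepcopy(shared_cfg)
    cfg.models[0].load_path = '/tmp/checkpoint.pt'
    print(shared_cfg.models[0].load_path)  # None
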
