Set up the ability to run eval suites #114

Merged
Changes from 7 commits
10 changes: 10 additions & 0 deletions lm_eval/prompts.py
@@ -0,0 +1,10 @@
PROMPT_CODES = {
"user": "0.0",
"jgpt": "0.1",
"fintan": "0.2",
"fintan2": "0.2.1",
"ja-alpaca": "0.3",
"rinna-sft": "0.4",
"rinna-bilingual": "0.5",
"llama2": "0.6",
}
Collaborator Author:
Let me know if these names make sense or could be improved.
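
For context on how these codes get used: `build_eval_args` in `scripts/run_suite.py` (later in this PR) appends the prompt code, together with the task version, to the task name. A minimal sketch of that lookup (the task name and version here are only illustrative):

from lm_eval.prompts import PROMPT_CODES

# e.g. the jsquad task, version 1.1, evaluated with the rinna-sft prompt
prompt_code = PROMPT_CODES["rinna-sft"]  # -> "0.4"
task_name = "jsquad" + "-" + "1.1" + "-" + prompt_code
print(task_name)  # -> jsquad-1.1-0.4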

56 changes: 56 additions & 0 deletions lm_eval/suites/__init__.py
@@ -0,0 +1,56 @@
# Functionality related to "eval suites". A suite is a collection of tasks with
# options pre-configured. Different models can be run with the same suite to
# compare them.
import configparser
from dataclasses import dataclass
from typing import Optional
import os
from pathlib import Path

# This is the directory where suite config files live
SUITE_DIR = Path(os.path.dirname(os.path.realpath(__file__))) / "configs"


@dataclass
class TaskSpec:
"""Specification of a task in an eval suite.

A suite is a list of these specs, plus a prompt."""

# The real arguments have to be massaged into messy strings and parallel
# lists, but this is a more reasonable structure - we can handle conversion
# separately.

name: str
fewshot: int
version: Optional[str]


def load_suite(name):
"""Read in configuration for a test suite.

A suite will have a config file named something like `my_suite.conf`. For
each task in the file, a version, fewshot config, and any other details
will be specified.

Example entry:

[tasks.mgsm]
version = 1.0
fewshot = 5
"""
conf = configparser.ConfigParser()
conf.read(SUITE_DIR / (name + ".conf"))

specs = []
for key, val in conf.items():
if not key.startswith("tasks."):
continue

spec = TaskSpec(
name=key.split(".", 1)[1],
version=val.get("version", None),
fewshot=int(val["fewshot"]),
)
specs.append(spec)
return specs
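
A small usage sketch for the loader, assuming the `ja8` suite config shown below is present under `lm_eval/suites/configs/`:

from lm_eval.suites import load_suite

# Reads lm_eval/suites/configs/ja8.conf and builds one TaskSpec per [tasks.*] section.
specs = load_suite("ja8")
for spec in specs:
    print(spec.name, spec.version, spec.fewshot)
# The first few lines of output would look like:
#   mgsm 1.0 5
#   xwinograd_ja None 0
#   xlsum_ja 1.0 1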
33 changes: 33 additions & 0 deletions lm_eval/suites/configs/ja8.conf
@@ -0,0 +1,33 @@
# This is the standard eight-task eval suite.

[tasks.mgsm]
version = 1.0
fewshot = 5

[tasks.xwinograd_ja]
# this has no version
fewshot = 0

[tasks.xlsum_ja]
version = 1.0
fewshot = 1

[tasks.jaqket_v2]
version = 0.2
fewshot = 1

[tasks.marc_ja]
version = 1.1
fewshot = 3

[tasks.jnli]
version = 1.3
fewshot = 3

[tasks.jcommonsenseqa]
version = 1.1
fewshot = 3

[tasks.jsquad]
version = 1.1
fewshot = 2
99 changes: 99 additions & 0 deletions scripts/run_suite.py
mrorii marked this conversation as resolved.
@@ -0,0 +1,99 @@
#!/usr/bin/env python
# Run a suite of tests

import argparse

from lm_eval import evaluator
from lm_eval.prompts import PROMPT_CODES
from lm_eval.suites import TaskSpec, load_suite


def build_eval_args(specs: list[TaskSpec], prompt: str) -> tuple[list[str], list[int]]:
"""Convert list of TaskSpecs into args for simple_evaluate."""

tasks = []
fewshot = []
prompt_code = PROMPT_CODES[prompt]
for spec in specs:
task_name = spec.name
if spec.version is not None:
task_name += "-" + spec.version + "-" + prompt_code

tasks.append(task_name)
fewshot.append(spec.fewshot)

return (tasks, fewshot)


def run_suite(
    model_args,
    suite,
    prompt,
    *,
    model_type="hf-causal",
    output=None,
    verbose=False,
    limit=None,
):
    # Confusing detail: in `simple_evaluate`, "model" is the HF model type,
    # which is almost always hf-causal or hf-causal-experimental. `model_args`
    # looks like this:
    #
    #     pretrained=hoge/piyo,tokenizer=...,asdf=...

    # device never changes in practice
    device = "cuda"

    specs = load_suite(suite)
    tasks, num_fewshot = build_eval_args(specs, prompt)

    evaluator.simple_evaluate(
        model=model_type,
        model_args=model_args,
        tasks=tasks,
        num_fewshot=num_fewshot,
        device=device,
        verbose=verbose,
        limit=limit,
    )


def main():
    parser = argparse.ArgumentParser(
        prog="run_suite.py", description="Run a test suite with a model"
    )
    parser.add_argument("model", help="Model path (or HF spec)")
    parser.add_argument("suite", help="Test suite to run")
    parser.add_argument("prompt", help="Prompt to use")
    parser.add_argument("-m", "--model_args", help="Additional model arguments")
    parser.add_argument(
        "-t", "--model_type", default="hf-causal-experimental", help="Model type"
    )
    parser.add_argument("-o", "--output", help="Output file")
    parser.add_argument("-v", "--verbose", action="store_true")

    # TODO would it be better to just use a "quick" setting that runs 10
    # iterations? We don't need arbitrary numeric control
    parser.add_argument(
        "-l", "--limit", type=int, help="number of iterations to run (for testing)"
    )

    args = parser.parse_args()

    margs = f"pretrained={args.model}"
    if args.model_args:
        margs = margs + "," + args.model_args

    run_suite(
        margs,
        args.suite,
        args.prompt,
        model_type=args.model_type,
        output=args.output,
        verbose=args.verbose,
        limit=args.limit,
    )


if __name__ == "__main__":
    main()
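
To make the wiring concrete, here is a sketch of the strings the script ends up handing to `simple_evaluate` for the `ja8` suite with the `rinna-sft` prompt. The equivalent command line would be something like `python scripts/run_suite.py <model> ja8 rinna-sft -l 10`, where `<model>` is whatever HF model path you want to evaluate (the model name below is a placeholder):

from lm_eval.prompts import PROMPT_CODES
from lm_eval.suites import load_suite

# model_args as built in main(): "pretrained=<model>" plus any -m extras
model_args = "pretrained=rinna/some-model"  # placeholder model name

specs = load_suite("ja8")
prompt_code = PROMPT_CODES["rinna-sft"]  # "0.4"

# Same logic as build_eval_args: versioned tasks get "-<version>-<prompt code>" appended.
tasks = [
    s.name if s.version is None else f"{s.name}-{s.version}-{prompt_code}"
    for s in specs
]
fewshot = [s.fewshot for s in specs]
# tasks   -> ['mgsm-1.0-0.4', 'xwinograd_ja', 'xlsum_ja-1.0-0.4', 'jaqket_v2-0.2-0.4', ...]
# fewshot -> [5, 0, 1, 1, 3, 3, 3, 2]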