diff --git a/README.md b/README.md
index 0fe428a493..828c6c4b17 100644
--- a/README.md
+++ b/README.md
@@ -122,6 +122,8 @@ python main.py \
     --device cuda:0
 ```
 
+Also check the script for running [evaluation suites](#evaluation-suites).
+
 Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints:
 
 ```bash
@@ -181,6 +183,24 @@ python write_out.py \
 
 This will write out one text file for each task.
 
+## Evaluation Suites
+
+If you have multiple tasks that you routinely run as an evaluation suite, you can save the suite configuration in a single file and run it with different models. Save a suite config to `lm_eval/suites/configs/[suite].conf`, formatted like this:
+
+    [tasks.my_task]
+    version = 1.0
+    fewshot = 2
+
+    [tasks.other_task]
+    version = 1.1
+    fewshot = 3
+
+Then you can run the suite like this:
+
+    python scripts/run_suite.py [model_path] [suite_name] [prompt_name] -m [model_args]
+
+For the available prompt names and the versions they map to, see the [prompt docs](docs/prompt_templates.md) and the [list of prompt names](lm_eval/prompts.py).
+
 ## Advanced Usage
 
 For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument:
diff --git a/docs/prompt_templates.md b/docs/prompt_templates.md
index 1c3e7e908e..d4a93f93ca 100644
--- a/docs/prompt_templates.md
+++ b/docs/prompt_templates.md
@@ -1,5 +1,6 @@
 # Prompt Templates
-Before evaluation, you can choose suitable prompt template for your model.
+
+Before evaluation, you can choose a suitable prompt template for your model. Prompts can be referred to by version numbers (like `0.0`) or by short names (like `user`). You can check the mapping in [`prompts.py`](../lm_eval/prompts.py).
 
 Once you found the best one of the following supported templates, replace `TEMPLATE` to the template version.
 
@@ -15,11 +16,12 @@ python main.py \
     --output_path "result.json"
 ```
 
-## `0.0`
+## `0.0 user`
 This version uses plausible prompt templates the contributor made. In most cases, templates in paper are well-investigated so that they should be good to use. But, the reality is that some eval tasks we want to support are never used before. In this case, the contributors would carefully think of the plausible prompt template as this version.
 
-## `0.1`
+## `0.1 jgpt`
+
 - **Reference:** [日本語に特化した60億パラメータ規模のGPTモデルの構築と評価](https://www.anlp.jp/proceedings/annual_meeting/2023/pdf_dir/H9-4.pdf)
 - **Supported Tasks:** `jsquad`, `jaquad`, `jcommonsenseqa`, `jaqket_v2`
 - **Format:**
@@ -33,7 +35,8 @@ This version uses plausible prompt templates the contributor made. In most cases
 ```
 For formats for other tasks, please see `lm_eval/tasks/TASK.py`.
 
-## `0.2`
+## `0.2 fintan`
+
 - **Reference:** [ChatGPT vs BERT: どちらが日本語をより理解できるのか?](https://fintan.jp/page/9126/)
 - **Supported Tasks:** `jsquad`, `jaquad`, `jcommonsenseqa`, `jnli`, `marc_ja`, `jaqket_v2`
 - **Format:**
@@ -48,7 +51,8 @@ This version uses plausible prompt templates the contributor made. In most cases
 ```
 For formats for other tasks, please see `lm_eval/tasks/TASK.py`.
 
-## `0.3`
+## `0.3 ja-alpaca`
+
 This is intended to use for instruction-tuned models trained on [Japanese Alpaca](https://huggingface.co/datasets/fujiki/japanese_alpaca_data)
 
 - **Reference:**
@@ -80,7 +84,8 @@ japanese-alpaca-lora
 
 For formats for other tasks, please see `lm_eval/tasks/TASK.py`.
 
-## `0.4`
+## `0.4 rinna-sft`
+
 
 This is intended to use for [rinna/japanese-gpt-neox-3.6b-instruction-sft](https://huggingface.co/rinna/japanese-gpt-neox-3.6b-instruction-sft).
 
@@ -97,7 +102,8 @@ This is intended to use for [rinna/japanese-gpt-neox-3.6b-instruction-sft](https
 ```
 For formats for other tasks, please see `lm_eval/tasks/TASK.py`.
 
-## `0.5`
+## `0.5 rinna-bilingual`
+
 
 This is intended to use for [rinna/bilingual-gpt-neox-4b-instruction-sft](https://huggingface.co/rinna/bilingual-gpt-neox-4b-instruction-sft).
 
@@ -119,8 +125,9 @@ This is intended to use for [rinna/bilingual-gpt-neox-4b-instruction-sft](https:
 
 For formats for other tasks, please see `lm_eval/tasks/TASK.py`.
 
-## `0.6`
-This is intended to used for Llama2-chat variants
+## `0.6 llama2`
+
+This is intended to be used for Llama2-chat variants.
 
 - **Reference:** https://huggingface.co/blog/llama2#how-to-prompt-llama-2
 - **Supported Tasks:** `jsquad`, `jaquad`, `jcommonsenseqa`, `jnli`, `marc_ja`, `jcola`, `jaqket_v2`, `xlsum_ja`, `mgsm`
diff --git a/lm_eval/prompts.py b/lm_eval/prompts.py
new file mode 100644
index 0000000000..9a72bc6cb8
--- /dev/null
+++ b/lm_eval/prompts.py
@@ -0,0 +1,33 @@
+def jslm_beta(task):
+    """JSLM Beta uses a different prompt for JCommonSenseQA."""
+    if task == "jcommonsenseqa":
+        return "0.2.1"
+    else:
+        return "0.2"
+
+
+PROMPT_CODES = {
+    "user": "0.0",
+    "jgpt": "0.1",
+    "fintan": "0.2",
+    "fintan2": "0.2.1",
+    "ja-alpaca": "0.3",
+    "rinna-sft": "0.4",
+    "rinna-bilingual": "0.5",
+    "llama2": "0.6",
+    "jslm-beta": jslm_beta,
+}
+
+
+def get_prompt_code(short_name, task=None):
+    """Get the prompt code given a short name.
+
+    Usually this is a simple dictionary lookup, but for some models the code
+    depends on the task.
+    """
+    code = PROMPT_CODES[short_name]
+
+    if callable(code):
+        return code(task)
+    else:
+        return code
diff --git a/lm_eval/suites/__init__.py b/lm_eval/suites/__init__.py
new file mode 100644
index 0000000000..6d4afbaed7
--- /dev/null
+++ b/lm_eval/suites/__init__.py
@@ -0,0 +1,56 @@
+# Functionality related to "eval suites". A suite is a collection of tasks with
+# options pre-configured. Different models can be run with the same suite to
+# compare them.
+import configparser
+from dataclasses import dataclass
+from typing import Optional
+import os
+from pathlib import Path
+
+# Directory where suite config files are stored
+SUITE_DIR = Path(os.path.dirname(os.path.realpath(__file__))) / "configs"
+
+
+@dataclass
+class TaskSpec:
+    """Specification of a task in an eval suite.
+
+    A suite is a list of these specs, plus a prompt."""
+
+    # The real arguments have to be massaged into messy strings and parallel
+    # lists, but this is a more reasonable structure - we can handle conversion
+    # separately.
+
+    name: str
+    fewshot: int
+    version: Optional[str]
+
+
+def load_suite(name):
+    """Read in configuration for an eval suite.
+
+    A suite will have a config file named something like `my_suite.conf`. For
+    each task in the file, a version, fewshot config, and any other details
+    will be specified.
+
+    Example entry:
+
+        [tasks.mgsm]
+        version = 1.0
+        fewshot = 5
+    """
+    conf = configparser.ConfigParser()
+    conf.read(SUITE_DIR / (name + ".conf"))
+
+    specs = []
+    for key, val in conf.items():
+        if not key.startswith("tasks."):
+            continue
+
+        spec = TaskSpec(
+            name=key.split(".", 1)[1],
+            version=val.get("version", None),
+            fewshot=int(val["fewshot"]),
+        )
+        specs.append(spec)
+    return specs
diff --git a/lm_eval/suites/configs/ja8.conf b/lm_eval/suites/configs/ja8.conf
new file mode 100644
index 0000000000..94e4cadfcd
--- /dev/null
+++ b/lm_eval/suites/configs/ja8.conf
@@ -0,0 +1,33 @@
+# This is the standard eight-task eval suite.
+
+[tasks.mgsm]
+version = 1.0
+fewshot = 5
+
+[tasks.xwinograd_ja]
+# this has no version
+fewshot = 0
+
+[tasks.xlsum_ja]
+version = 1.0
+fewshot = 1
+
+[tasks.jaqket_v2]
+version = 0.2
+fewshot = 1
+
+[tasks.marc_ja]
+version = 1.1
+fewshot = 3
+
+[tasks.jnli]
+version = 1.3
+fewshot = 3
+
+[tasks.jcommonsenseqa]
+version = 1.1
+fewshot = 3
+
+[tasks.jsquad]
+version = 1.1
+fewshot = 2
diff --git a/scripts/run_suite.py b/scripts/run_suite.py
new file mode 100644
index 0000000000..f3fa20fc69
--- /dev/null
+++ b/scripts/run_suite.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+# Run an eval suite with a model
+
+import argparse
+import json
+
+from lm_eval import evaluator
+from lm_eval.prompts import get_prompt_code
+from lm_eval.suites import TaskSpec, load_suite
+
+
+def build_eval_args(specs: list[TaskSpec], prompt: str) -> tuple[list[str], list[int]]:
+    """Convert list of TaskSpecs into args for simple_evaluate."""
+
+    tasks = []
+    fewshot = []
+    for spec in specs:
+        task_name = spec.name
+
+        code = get_prompt_code(prompt, task_name)
+
+        if spec.version is not None:
+            task_name += "-" + spec.version + "-" + code
+
+        tasks.append(task_name)
+        fewshot.append(spec.fewshot)
+
+    return (tasks, fewshot)
+
+
+def run_suite(
+    model_args,
+    suite,
+    prompt,
+    *,
+    model_type="hf-causal",
+    output=None,
+    verbose=False,
+    limit=None,
+):
+    # Confusing detail: in the "simple evaluate", "model" is the HF model type,
+    # which is almost always hf-causal or hf-causal-experimental. `model_args`
+    # looks like this:
+    #
+    #     pretrained=hoge/piyo,tokenizer=...,asdf=...
+
+    # device never changes in practice
+    device = "cuda"
+
+    specs = load_suite(suite)
+    tasks, num_fewshot = build_eval_args(specs, prompt)
+
+    results = evaluator.simple_evaluate(
+        model=model_type,
+        model_args=model_args,
+        tasks=tasks,
+        num_fewshot=num_fewshot,
+        device=device,
+        verbose=verbose,
+        limit=limit,
+    )
+
+    if output:
+        # Save the full results dict so different runs can be compared later.
+        with open(output, "w", encoding="utf-8") as out_file:
+            json.dump(results, out_file, indent=2, ensure_ascii=False, default=str)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        prog="run_suite.py", description="Run an eval suite with a model"
+    )
+    parser.add_argument("model", help="Model path (or HF spec)")
+    parser.add_argument("suite", help="Eval suite to run")
+    parser.add_argument("prompt", help="Prompt to use")
+    parser.add_argument("-m", "--model_args", help="Additional model arguments")
+    parser.add_argument(
+        "-t", "--model_type", default="hf-causal-experimental", help="Model type"
+    )
+    parser.add_argument("-o", "--output", help="Output file")
+    parser.add_argument("-v", "--verbose", action="store_true")
+
+    # TODO would it be better to just use a "quick" setting that runs 10
+    # iterations? We don't need arbitrary numeric control.
+    parser.add_argument(
+        "-l", "--limit", type=int, help="Limit the number of examples per task (for testing)"
+    )
+
+    args = parser.parse_args()
+
+    margs = f"pretrained={args.model}"
+    if args.model_args:
+        margs += "," + args.model_args
+
+    run_suite(
+        margs,
+        args.suite,
+        args.prompt,
+        model_type=args.model_type,
+        output=args.output,
+        verbose=args.verbose,
+        limit=args.limit,
+    )
+
+
+if __name__ == "__main__":
+    main()
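
To see how the pieces above fit together without launching a full evaluation, here is a minimal sketch (not part of the patch itself) that loads a suite config and prints the task strings and fewshot counts that `run_suite.py` would hand to `evaluator.simple_evaluate`. It assumes the repo root is on `PYTHONPATH`; the `ja8` suite and the `llama2` prompt name are used purely as examples, and whether each resolved string is a registered task still depends on the harness's task registry.

```python
# Sketch only: preview what a suite resolves to before spending GPU time on it.
from lm_eval.prompts import get_prompt_code
from lm_eval.suites import load_suite


def preview_suite(suite: str, prompt: str) -> None:
    """Print the task string and fewshot count each TaskSpec resolves to."""
    for spec in load_suite(suite):
        code = get_prompt_code(prompt, spec.name)
        task = spec.name
        if spec.version is not None:
            # Same "<task>-<version>-<prompt code>" convention used by
            # build_eval_args() in scripts/run_suite.py.
            task += f"-{spec.version}-{code}"
        print(f"{task}\tfewshot={spec.fewshot}")


if __name__ == "__main__":
    preview_suite("ja8", "llama2")  # example suite and prompt name
```

With those arguments the first line printed should be `mgsm-1.0-0.6	fewshot=5`, i.e. the same `<task>-<version>-<prompt code>` strings that `build_eval_args` constructs.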