Update gauntlet v0.2 to reflect results of calibration (#791)

* add calibrated tasks as v0.2 gauntlet
mosaicml · Dec 9, 2023 · 8d96f9d
1 parent 75cc1e1
commit 8d96f9d
Show file tree

Hide file tree

Showing 8 changed files with 327 additions and 13 deletions.
diff --git a/llmfoundry/callbacks/eval_gauntlet_callback.py b/llmfoundry/callbacks/eval_gauntlet_callback.py
@@ -59,7 +59,7 @@ class EvalGauntlet(Callback):
                             logged under in the logger after eval
         categories (dict): This contains the list of categories, as well as the subtasks within them, the
                       random baseline accuracy of each subtask, and the number of fewshot examples
-                      used for the task. See `llmfoundry/scripts/eval/yamls/eval_gauntlet.yaml` to see the structure.
+                      used for the task. See `scripts/eval/yamls/eval_gauntlet_v0.2.yaml` to see the structure.
         weighting (Weighting): The weighting scheme used to balance different tasks within each category.
                                Either assign them all equal weight, assign them weight proportional
                                to the dataset size, or assign them weight proportional to the log2 of the dataset size.

diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml
@@ -55,5 +55,5 @@ parameters:
     forward_prefetch: True
     limit_all_gathers: True
 
-  icl_tasks: 'eval/yamls/tasks.yaml'
-  eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
+  icl_tasks: 'eval/yamls/tasks_v0.2.yaml'
+  eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
@@ -50,5 +50,5 @@ parameters:
     limit_all_gathers: True
 
 
-  icl_tasks: 'eval/yamls/tasks.yaml'
-  eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
+  icl_tasks: 'eval/yamls/tasks_v0.2.yaml'
+  eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
diff --git a/scripts/eval/README.md b/scripts/eval/README.md
@@ -27,7 +27,7 @@ composer eval/eval.py eval/yamls/hf_eval.yaml \
     model_name_or_path=mosaicml/mpt-7b
 ```
 
-You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet.yaml`.
+You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet_v0.2.yaml`.
 
 
 ### Evaluation during training
@@ -38,7 +38,7 @@ To run evaluation during training, download this repo, follow the instructions i
 cd llm-foundry/scripts/train
 composer train.py yamls/pretrain/mpt-125m_eval.yaml train_loader.dataset.split=train_small eval_loader.dataset.split=val_small
 ```
-You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet.yaml`. You can also choose to either run the full evaluation or run on a subset number of batches per benchmark by setting `icl_subset_num_batches`.
+You can also modify the specific benchmarks executed and their formatting by modifying the contents of `tasks.yaml` and you can modify the choice of composite scores and the set of tasks they consist of by modifying `eval_gauntlet_v0.2.yaml`. You can also choose to either run the full evaluation or run on a subset of batches per benchmark by setting `icl_subset_num_batches`.
 
 ----
 ## In-depth walkthrough
@@ -131,7 +131,7 @@ An example is given below:
 ```
   icl_tasks: eval/yamls/tasks.yaml # or use tasks_light.yaml
   icl_subset_num_batches: 100 # -1, or omit this key entirely, to evaluate on all batches
-  eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
+  eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
   icl_seq_len: 1024
 ```
 

diff --git a/scripts/eval/yamls/eval_gauntlet_v0.2.yaml b/scripts/eval/yamls/eval_gauntlet_v0.2.yaml
@@ -0,0 +1,121 @@
+eval_gauntlet:
+  weighting: EQUAL
+  subtract_random_baseline: true
+  rescale_accuracy: true
+  averages:
+    core_average:
+    - world_knowledge
+    - commonsense_reasoning
+    - language_understanding
+    - symbolic_problem_solving
+    - reading_comprehension
+  categories:
+  - name: world_knowledge
+    benchmarks:
+    - name: jeopardy
+      num_fewshot: 3
+      random_baseline: 0
+    - name: bigbench_qa_wikidata
+      num_fewshot: 3
+      random_baseline: 0
+    - name: arc_easy
+      num_fewshot: 3
+      random_baseline: 0.25
+    - name: arc_challenge
+      num_fewshot: 3
+      random_baseline: 0.25
+    - name: mmlu
+      num_fewshot: 5
+      random_baseline: 0.25
+    - name: triviaqa_sm_sub
+      num_fewshot: 3
+      random_baseline: 0.0
+  - name: commonsense_reasoning
+    benchmarks:
+    - name: copa
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: siqa
+      num_fewshot: 3
+      random_baseline: 0.5
+    - name: commonsense_qa
+      num_fewshot: 0
+      random_baseline: 0.25
+    - name: piqa
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: openbook_qa
+      num_fewshot: 10
+      random_baseline: 0.25
+    - name: bigbench_strange_stories
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: bigbench_strategy_qa
+      num_fewshot: 0
+      random_baseline: 0.5
+  - name: language_understanding
+    benchmarks:
+    - name: lambada_openai
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: hellaswag
+      num_fewshot: 0
+      random_baseline: 0.25
+    - name: winograd
+      num_fewshot: 3
+      random_baseline: 0.5
+    - name: winogrande
+      num_fewshot: 5
+      random_baseline: 0.5
+  - name: symbolic_problem_solving
+    benchmarks:
+    - name: bigbench_elementary_math_qa
+      num_fewshot: 1
+      random_baseline: 0.25
+    - name: bigbench_dyck_languages
+      num_fewshot: 5
+      random_baseline: 0
+    - name: bigbench_operators
+      num_fewshot: 3
+      random_baseline: 0.0
+    - name: simple_arithmetic_withspaces
+      num_fewshot: 5
+      random_baseline: 0.0
+    - name: simple_arithmetic_nospaces
+      num_fewshot: 5
+      random_baseline: 0.0
+    - name: aqua
+      num_fewshot: 3
+      random_baseline: 0.0
+    - name: gsm8k
+      num_fewshot: 8
+      random_baseline: 0.0
+    - name: svamp
+      num_fewshot: 5
+      random_baseline: 0
+    - name: agi_eval_sat_math
+      num_fewshot: 3
+      random_baseline: 0.0
+    - name: agi_eval_lsat_ar
+      num_fewshot: 5
+      random_baseline: 0.25
+  - name: reading_comprehension
+    benchmarks:
+    - name: squad
+      num_fewshot: 3
+      random_baseline: 0
+    - name: boolq
+      num_fewshot: 0
+      random_baseline: 0.5
+    - name: coqa
+      num_fewshot: 0
+      random_baseline: 0.0
+    - name: agi_eval_lsat_rc
+      num_fewshot: 5
+      random_baseline: 0.25
+    - name: agi_eval_lsat_lr
+      num_fewshot: 5
+      random_baseline: 0.25
+    - name: agi_eval_sat_en
+      num_fewshot: 5
+      random_baseline: 0.25
diff --git a/scripts/eval/yamls/hf_lora_eval.yml b/scripts/eval/yamls/hf_lora_eval.yml
@@ -46,5 +46,5 @@ fsdp_config:
   sharding_strategy: FULL_SHARD
   mixed_precision: FULL
 
-icl_tasks: 'eval/yamls/tasks_light.yaml'
-eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
+icl_tasks: 'eval/yamls/tasks_v0.2.yaml'
+eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
diff --git a/scripts/eval/yamls/tasks_v0.2.yaml b/scripts/eval/yamls/tasks_v0.2.yaml
@@ -0,0 +1,194 @@
+icl_tasks:
+-
+  label: jeopardy
+  dataset_uri: eval/local_data/world_knowledge/jeopardy_all.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+  has_categories: true
+-
+  label: triviaqa_sm_sub
+  dataset_uri: eval/local_data/world_knowledge/triviaqa_sm_sub.jsonl
+  num_fewshot: [3]
+  icl_task_type: question_answering
+-
+  label: gsm8k
+  dataset_uri: eval/local_data/symbolic_problem_solving/gsm8k.jsonl
+  num_fewshot: [8, 5]
+  icl_task_type: question_answering
+  cot_delimiter: ' #### '
+  continuation_delimiter: "\nA: Let's think step by step. "
+  question_prelimiter: "Q: "
+-
+  label: agi_eval_sat_math
+  dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_sat_math.jsonl
+  num_fewshot: [3]
+  icl_task_type: question_answering
+  cot_delimiter: ' #### '
+  continuation_delimiter: "\nA: Let's think step by step. "
+-
+  label: aqua
+  dataset_uri: eval/local_data/symbolic_problem_solving/aqua.jsonl
+  num_fewshot: [3]
+  icl_task_type: question_answering
+  cot_delimiter: ' #### '
+  continuation_delimiter: "\nA: Let's think step by step. "
+-
+  label: svamp
+  dataset_uri: eval/local_data/symbolic_problem_solving/svamp.jsonl
+  num_fewshot: [5]
+  icl_task_type: question_answering
+  continuation_delimiter: "\nUsing the formula below:\n"
+  cot_delimiter: ' #### '
+  question_prelimiter: "Q: "
+-
+  label: bigbench_qa_wikidata
+  dataset_uri: eval/local_data/world_knowledge/bigbench_qa_wikidata.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+-
+  label: arc_easy
+  dataset_uri: eval/local_data/world_knowledge/arc_easy.jsonl
+  num_fewshot: [3]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: arc_challenge
+  dataset_uri: eval/local_data/world_knowledge/arc_challenge.jsonl
+  num_fewshot: [3, 25]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: mmlu
+  dataset_uri: eval/local_data/world_knowledge/mmlu.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+  has_categories: true
+-
+  label: copa
+  dataset_uri: eval/local_data/commonsense_reasoning/copa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: siqa
+  dataset_uri: eval/local_data/commonsense_reasoning/siqa.jsonl
+  num_fewshot: [3]
+  icl_task_type: multiple_choice
+-
+  label: commonsense_qa
+  dataset_uri: eval/local_data/commonsense_reasoning/commonsense_qa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: piqa
+  dataset_uri: eval/local_data/commonsense_reasoning/piqa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: openbook_qa
+  dataset_uri: eval/local_data/commonsense_reasoning/openbook_qa.jsonl
+  num_fewshot: [10]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_strange_stories
+  dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strange_stories.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_strategy_qa
+  dataset_uri: eval/local_data/commonsense_reasoning/bigbench_strategy_qa.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_dyck_languages
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_dyck_languages.jsonl
+  num_fewshot: [5]
+  icl_task_type: language_modeling
+-
+  label: lambada_openai
+  dataset_uri: eval/local_data/language_understanding/lambada_openai.jsonl
+  num_fewshot: [0]
+  icl_task_type: language_modeling
+-
+  label: hellaswag
+  dataset_uri: eval/local_data/language_understanding/hellaswag.jsonl
+  num_fewshot: [0, 10]
+  icl_task_type: multiple_choice
+-
+  label: winograd
+  dataset_uri: eval/local_data/language_understanding/winograd_wsc.jsonl
+  num_fewshot: [3]
+  icl_task_type: schema
+-
+  label: winogrande
+  dataset_uri: eval/local_data/language_understanding/winogrande.jsonl
+  num_fewshot: [5]
+  icl_task_type: schema
+-
+  label: bigbench_elementary_math_qa
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_elementary_math_qa.jsonl
+  num_fewshot: [1]
+  icl_task_type: multiple_choice
+-
+  label: agi_eval_lsat_ar
+  dataset_uri: eval/local_data/symbolic_problem_solving/agi_eval_lsat_ar.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+-
+  label: bigbench_cs_algorithms
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_cs_algorithms.jsonl
+  num_fewshot: [10]
+  icl_task_type: language_modeling
+-
+  label: bigbench_operators
+  dataset_uri: eval/local_data/symbolic_problem_solving/bigbench_operators.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+-
+  label: simple_arithmetic_nospaces
+  dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_nospaces.jsonl
+  num_fewshot: [5]
+  icl_task_type: language_modeling
+-
+  label: simple_arithmetic_withspaces
+  dataset_uri: eval/local_data/symbolic_problem_solving/simple_arithmetic_withspaces.jsonl
+  num_fewshot: [5]
+  icl_task_type: language_modeling
+-
+  label: pubmed_qa_labeled
+  dataset_uri: eval/local_data/reading_comprehension/pubmed_qa_labeled.jsonl
+  num_fewshot: [10]
+  icl_task_type: language_modeling
+-
+  label: squad
+  dataset_uri: eval/local_data/reading_comprehension/squad.jsonl
+  num_fewshot: [3]
+  icl_task_type: language_modeling
+-
+  label: agi_eval_lsat_rc
+  dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_rc.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+-
+  label: agi_eval_lsat_lr
+  dataset_uri: eval/local_data/reading_comprehension/agi_eval_lsat_lr.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
+-
+  label: coqa
+  dataset_uri: eval/local_data/reading_comprehension/coqa.jsonl
+  num_fewshot: [0]
+  icl_task_type: language_modeling
+-
+  label: boolq
+  dataset_uri: eval/local_data/reading_comprehension/boolq.jsonl
+  num_fewshot: [0]
+  icl_task_type: multiple_choice
+  continuation_delimiter: "\nAnswer: " # this separates questions from answers
+-
+  label: agi_eval_sat_en
+  dataset_uri: eval/local_data/reading_comprehension/agi_eval_sat_en.jsonl
+  num_fewshot: [5]
+  icl_task_type: multiple_choice
diff --git a/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml b/scripts/train/yamls/pretrain/gpt-neo-125m_eval.yaml
@@ -98,9 +98,8 @@ progress_bar: false
 log_to_console: true
 console_log_interval: 1ba
 
-icl_tasks: eval/yamls/tasks.yaml # or use tasks_light.yaml
-icl_subset_num_batches: 2 # -1, or omit this key entirely, to evaluate on all batches
-eval_gauntlet: 'eval/yamls/eval_gauntlet.yaml'
+icl_tasks: eval/yamls/tasks_v0.2.yaml # or use tasks_light.yaml
+eval_gauntlet: 'eval/yamls/eval_gauntlet_v0.2.yaml'
 icl_seq_len: 1024
 
 callbacks: