Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support HELMET #182

Merged
merged 15 commits into from
Nov 1, 2024
506 changes: 506 additions & 0 deletions evals/evaluation/HELMET/README.md

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions evals/evaluation/HELMET/arguments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import argparse
import ast
import os

import yaml


def parse_arguments():
    """Parse evaluation options from the command line and an optional YAML config.

    The command line is parsed twice: a first pass only to discover
    ``--config``; the YAML mapping found there is then installed as the
    parser defaults, and a second pass lets explicit command-line flags
    override the config values.

    Returns:
        argparse.Namespace: the parsed options. ``output_dir`` is always set:
        it defaults to ``output/<basename of model_name_or_path>`` and gets a
        ``-override-rope<theta>`` suffix when ``--rope_theta`` is given.
        ``input_max_length`` and ``generation_max_length`` are always strings
        (optionally comma separated).

    Raises:
        SystemExit: on invalid command-line input, or when neither
            ``output_dir`` nor ``model_name_or_path`` is provided.
    """
    parser = argparse.ArgumentParser(description="evaluation on downstream tasks")
    parser.add_argument("--config", type=str, default=None, help="path to config file")
    parser.add_argument("--tag", type=str, default="eval", help="tag to add to the output file")

    # model setting
    parser.add_argument("--model_name_or_path", type=str, default=None)
    parser.add_argument("--use_vllm", action="store_true", help="whether to use vllm engine")

    # data paths
    parser.add_argument("--datasets", type=str, default=None)
    parser.add_argument("--demo_files", type=str, default=None)
    parser.add_argument("--test_files", type=str, default=None)
    parser.add_argument("--output_dir", type=str, default=None, help="path to save the predictions")
    parser.add_argument("--overwrite", action="store_true", help="whether to overwrite the saved file")
    parser.add_argument("--max_test_samples", type=int, default=None)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--num_depths", type=int, default=10)

    # dataset specific settings
    parser.add_argument("--popularity_threshold", type=int, default=3)

    # evaluation settings
    parser.add_argument("--shots", type=int, default=5, help="total number of demos (encoder + decoder)")
    parser.add_argument(
        "--input_max_length",
        type=str,
        default="8192",
        help="the maximum number of tokens of the input, we truncate the end of the context; can be separated by comma to match the specified datasets",
    )

    # generation settings
    parser.add_argument(
        "--do_sample",
        type=ast.literal_eval,
        choices=[True, False],
        default=False,
        help="whether to use sampling (false is greedy), overwrites temperature",
    )
    parser.add_argument(
        "--generation_max_length",
        type=str,
        default="10",
        help="max number of tokens to generate, can be separated by comma to match the specified datasets",
    )
    parser.add_argument("--generation_min_length", type=int, default=0, help="min number of tokens to generate")
    parser.add_argument("--temperature", type=float, default=1.0, help="generation temperature")
    parser.add_argument("--top_p", type=float, default=1.0, help="top-p parameter for nucleus sampling")
    parser.add_argument(
        "--stop_newline",
        type=ast.literal_eval,
        choices=[True, False],
        default=False,
        help="whether to stop generation at newline",
    )

    # model specific settings
    parser.add_argument("--seed", type=int, default=42, help="random seed")
    parser.add_argument("--no_cuda", action="store_true", help="disable cuda")
    parser.add_argument("--no_bf16", action="store_true", help="disable bf16 and use fp32")
    parser.add_argument("--no_torch_compile", action="store_true", help="disable torch compile")
    parser.add_argument(
        "--use_chat_template",
        type=ast.literal_eval,
        choices=[True, False],
        default=False,
        help="whether to use chat template",
    )
    parser.add_argument("--rope_theta", type=int, default=None, help="override rope theta")

    # misc
    parser.add_argument("--debug", action="store_true", help="for debugging")
    parser.add_argument(
        "--count_tokens",
        action="store_true",
        help="instead of running generation, just count the number of tokens (only for HF models not API)",
    )

    # First pass: only needed to find out whether a config file was given.
    args = parser.parse_args()

    config = {}
    if args.config is not None:
        with open(args.config, encoding="utf-8") as config_file:
            # An empty YAML document loads as None; treat it as "no overrides".
            config = yaml.safe_load(config_file) or {}

    # The YAML configs shipped next to this module spell the key
    # ``stop_new_line`` while the option is ``--stop_newline``. set_defaults()
    # matches on the option's dest, so without this alias the config value
    # would never reach ``args.stop_newline``. The original key is kept so any
    # consumer reading ``args.stop_new_line`` still finds it.
    if "stop_new_line" in config and "stop_newline" not in config:
        config["stop_newline"] = config["stop_new_line"]

    # Second pass: config values act as defaults, explicit CLI flags still win.
    parser.set_defaults(**config)
    args = parser.parse_args()

    # argparse applies ``type=`` only to string defaults, so a bare YAML
    # integer (e.g. ``input_max_length: 131072``) would stay an int. These two
    # options are declared as (comma-separated) strings; normalise them.
    for name in ("input_max_length", "generation_max_length"):
        value = getattr(args, name)
        if value is not None:
            setattr(args, name, str(value))

    if args.output_dir is None:
        if args.model_name_or_path is None:
            # os.path.basename(None) would raise an opaque TypeError here.
            parser.error("either --output_dir or --model_name_or_path must be provided")
        args.output_dir = f"output/{os.path.basename(args.model_name_or_path)}"

    if args.rope_theta is not None:
        args.output_dir = args.output_dir + f"-override-rope{args.rope_theta}"

    return args
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added evals/evaluation/HELMET/assets/logo.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/cite.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072
datasets: alce_asqa_700,alce_qampari_700
generation_max_length: 300,300
test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json
demo_files: prompts/asqa_revised.json,prompts/qampari_revised.json
use_chat_template: true
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/cite_short.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536
datasets: alce_asqa_30,alce_asqa_75,alce_asqa_165,alce_asqa_345,alce_qampari_30,alce_qampari_75,alce_qampari_165,alce_qampari_345
generation_max_length: 300,300,300,300,300,300,300,300
test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json
demo_files: prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json
use_chat_template: true
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/icl.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072,131072,131072
datasets: icl_trec_coarse_6600shot_balance,icl_trec_fine_6400shot_balance,icl_banking77_5900shot_balance,icl_clinic150_7050shot_balance,icl_nlu_8296shot_balance
generation_max_length: 20,20,20,20,20
test_files: ',,,,'
demo_files: ',,,,'
use_chat_template: false
max_test_samples: 100
shots: 0
stop_new_line: true
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/icl_short.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: icl_trec_coarse_400shot_balance,icl_trec_coarse_800shot_balance,icl_trec_coarse_1600shot_balance,icl_trec_coarse_3300shot_balance,icl_trec_fine_400shot_balance,icl_trec_fine_800shot_balance,icl_trec_fine_1600shot_balance,icl_trec_fine_3200shot_balance,icl_banking77_360shot_balance,icl_banking77_720shot_balance,icl_banking77_1450shot_balance,icl_banking77_2900shot_balance,icl_clinic150_440shot_balance,icl_clinic150_880shot_balance,icl_clinic150_1750shot_balance,icl_clinic150_3525shot_balance,icl_nlu_510shot_balance,icl_nlu_1020shot_balance,icl_nlu_2040shot_balance,icl_nlu_4080shot_balance
generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
test_files: ',,,,,,,,,,,,,,,,,,,'
demo_files: ',,,,,,,,,,,,,,,,,,,'
use_chat_template: false
max_test_samples: 100
shots: 0
stop_new_line: true
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/longqa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072
datasets: narrativeqa_130772,infbench_qa_eng_130862,infbench_choice_eng_130862
generation_max_length: 100,10,10
test_files: ',,'
demo_files: ',,'
use_chat_template: true
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/longqa_short.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: narrativeqa_7892,narrativeqa_16084,narrativeqa_32468,narrativeqa_65236,infbench_qa_eng_7982,infbench_qa_eng_16174,infbench_qa_eng_32558,infbench_qa_eng_65326,infbench_choice_eng_7982,infbench_choice_eng_16174,infbench_choice_eng_32558,infbench_choice_eng_65326
generation_max_length: 100,100,100,100,10,10,10,10,10,10,10,10
test_files: ',,,,,,,,,,,'
demo_files: ',,,,,,,,,,,'
use_chat_template: true
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
8 changes: 8 additions & 0 deletions evals/evaluation/HELMET/configs/niah.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: '131072'
datasets: ruler_niah_s_2
generation_max_length: '50'
test_files: data/ruler/niah_single_2/validation_131072.jsonl
demo_files: ''
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/niah_long.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072
datasets: ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mq,ruler_niah_mv,ruler_niah_mv,ruler_cwe,ruler_cwe,ruler_fwe,ruler_fwe,ruler_vt,ruler_vt,ruler_qa_1,ruler_qa_1,ruler_qa_2,ruler_qa_2
generation_max_length: 50,50,50,50,50,50,50,50,50,50,100,100,100,100,50,50,100,100,50,50,50,50,50,50,50,50
test_files: data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/few/validation_65536.jsonl,data/ruler/few/validation_131072.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_65536.jsonl,data/ruler/qa_2/validation_131072.jsonl
demo_files: ',,,,,,,,,,,,,,,,,,,,,,,,,'
use_chat_template: false
max_test_samples: 100
shots: 0
stop_new_line: false
model_name_or_path: /scratch/gpfs/hyen/models/Meta-Llama-3.1-8B
output_dir: output/Meta-Llama-3.1-8B
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/rag.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072,131072
datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3
generation_max_length: 20,20,20,20
test_files: data/kilt/nq-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k1000_dep3.jsonl,data/kilt/popqa_test_1000_k1000_dep6.jsonl
demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl
use_chat_template: false
max_test_samples: 100
shots: 2
stop_new_line: true
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/rag_short.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: kilt_nq,kilt_nq,kilt_nq,kilt_nq,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3
generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
test_files: data/kilt/nq-dev-multikilt_1000_k50_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k105_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k220_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k440_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k50_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k105_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k220_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k440_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k50_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k105_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k220_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k440_dep3.jsonl,data/kilt/popqa_test_1000_k50_dep6.jsonl,data/kilt/popqa_test_1000_k105_dep6.jsonl,data/kilt/popqa_test_1000_k220_dep6.jsonl,data/kilt/popqa_test_1000_k440_dep6.jsonl
demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl
use_chat_template: false
max_test_samples: 100
shots: 2
stop_new_line: true
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/recall.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072,131072,131072
datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv
generation_max_length: 50,100,50,100
test_files: data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/json_kv/test_k1800_dep6.jsonl
demo_files: ',,,'
use_chat_template: false
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/recall_short.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536
datasets: ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,json_kv,json_kv,json_kv,json_kv
generation_max_length: 50,50,50,50,100,100,100,100,50,50,50,50,100,100,100,100
test_files: data/ruler/niah_multikey_2/validation_8192.jsonl,data/ruler/niah_multikey_2/validation_16384.jsonl,data/ruler/niah_multikey_2/validation_32768.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_8192.jsonl,data/ruler/niah_multikey_3/validation_16384.jsonl,data/ruler/niah_multikey_3/validation_32768.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multivalue/validation_8192.jsonl,data/ruler/niah_multivalue/validation_16384.jsonl,data/ruler/niah_multivalue/validation_32768.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/json_kv/test_k105_dep6.jsonl,data/json_kv/test_k220_dep6.jsonl,data/json_kv/test_k440_dep6.jsonl,data/json_kv/test_k900_dep6.jsonl
demo_files: ',,,,,,,,,,,,,,,'
use_chat_template: false
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/rerank.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: '131072'
datasets: msmarco_rerank_psg
generation_max_length: '200'
test_files: data/msmarco/test_reranking_data_k1000_dep3.jsonl
demo_files: data/msmarco/test_reranking_data_k10_dep3.jsonl
use_chat_template: false
max_test_samples: 100
shots: 2
stop_new_line: true
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/rerank_short.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536
datasets: msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg
generation_max_length: 200,200,200,200
test_files: data/msmarco/test_reranking_data_k50_dep3.jsonl,data/msmarco/test_reranking_data_k130_dep3.jsonl,data/msmarco/test_reranking_data_k285_dep3.jsonl,data/msmarco/test_reranking_data_k600_dep3.jsonl
demo_files: data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl
use_chat_template: false
max_test_samples: 100
shots: 2
stop_new_line: true
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/summ.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 131072,131072
datasets: infbench_sum_eng_129672,multi_lexsum_130372
generation_max_length: 1200,400
test_files: ','
demo_files: ','
use_chat_template: true
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
14 changes: 14 additions & 0 deletions evals/evaluation/HELMET/configs/summ_short.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536
datasets: infbench_sum_eng_6792,infbench_sum_eng_14984,infbench_sum_eng_31368,infbench_sum_eng_64136,multi_lexsum_7492,multi_lexsum_15684,multi_lexsum_32068,multi_lexsum_64836
generation_max_length: 1200,1200,1200,1200,400,400,400,400
test_files: ',,,,,,,'
demo_files: ',,,,,,,'
use_chat_template: true
max_test_samples: 100
shots: 2
stop_new_line: false
model_name_or_path: meta-llama/Llama-3.2-1B-Instruct
output_dir: output/Llama-3.2-1B-Instruct
Loading