
Commit

update model adapter
Yunnglin committed Dec 19, 2024
1 parent 655d49c commit 2f941a6
Showing 15 changed files with 272 additions and 407 deletions.
3 changes: 2 additions & 1 deletion docs/zh/user_guides/backend/rageval_backend/mteb.md
@@ -102,7 +102,8 @@ one_stage_task_cfg = {


### Two-stage evaluation
An example configuration is shown below; retrieval is performed first, followed by reranking:
Evaluating a reranker requires a retrieval dataset: the embedding model first retrieves the top-k candidates, which the reranker then re-orders. An example configuration is shown below:

```python
two_stage_task_cfg = {
"eval_backend": "RAGEval",
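The full two_stage_task_cfg is collapsed in this view. As a rough illustration of the two-stage layout described above — an embedding-model entry for top-k retrieval followed by a reranker entry — here is a hypothetical sketch; the model ids, the `is_cross_encoder` flag, and the keys under `eval_config` are assumptions rather than the backend's confirmed schema, so the collapsed example above (and the one_stage_task_cfg earlier in the document) remains the reference:

```python
# Hypothetical outline of a two-stage RAGEval/MTEB config; field names are
# assumptions, refer to the real two_stage_task_cfg in the document.
two_stage_task_cfg = {
    "eval_backend": "RAGEval",
    "eval_config": {
        "tool": "MTEB",
        "model": [
            # Stage 1: embedding model used to retrieve the top-k candidates.
            {"model_name_or_path": "<embedding-model-id>"},
            # Stage 2: cross-encoder reranker that re-orders the retrieved candidates.
            {"model_name_or_path": "<reranker-model-id>", "is_cross_encoder": True},
        ],
        "eval": {
            "tasks": ["<retrieval-task-name>"],
            "top_k": 100,  # how many candidates stage 1 hands to the reranker
        },
    },
}
```

Such a config would then be passed to evalscope's usual run entry point, as with the one-stage example.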
10 changes: 5 additions & 5 deletions evalscope/benchmarks/data_adapter.py
@@ -139,11 +139,11 @@ def gen_prompts(self, data_dict: dict) -> dict:
prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data)
prompt_d[AnswerKeys.RAW_INPUT] = sample_d
res_dict[sub_name].append(prompt_d)

rnd = random.Random()
rnd.seed(42)
for k, v in res_dict.items():
rnd.shuffle(v)
# Note: for multiprocess
# rnd = random.Random()
# rnd.seed(42)
# for k, v in res_dict.items():
# rnd.shuffle(v)

return res_dict

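The shuffle block above is disabled rather than rewritten; the added note only says it conflicts with multiprocess execution. For reference, a minimal sketch of one way deterministic shuffling could be kept without sharing RNG state across workers — shuffling once, with a per-subset seed, before prompts are handed out. This is not what the commit does, and the helper below is hypothetical:

```python
import random


def shuffle_prompts(res_dict: dict, seed: int = 42) -> dict:
    """Deterministically shuffle each subset's prompt list.

    A fresh Random instance is created per subset (seeded from the global seed
    plus the subset name), so the result does not depend on call order or on
    RNG state shared between worker processes.
    """
    shuffled = {}
    for sub_name, prompts in res_dict.items():
        rnd = random.Random(f'{seed}-{sub_name}')  # independent, reproducible stream
        prompts = list(prompts)                    # copy instead of shuffling in place
        rnd.shuffle(prompts)
        shuffled[sub_name] = prompts
    return shuffled
```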
3 changes: 2 additions & 1 deletion evalscope/benchmarks/gsm8k/gsm8k_adapter.py
@@ -17,13 +17,14 @@
@Benchmark.register(
name='gsm8k',
dataset_id='modelscope/gsm8k',
model_adapter=ChatGenerationModelAdapter,
subset_list=['main'],
metric_list=[WeightedAverageAccuracy],
few_shot_num=4,
train_split='train',
eval_split='test',
prompt_template='',
model_adapter=ChatGenerationModelAdapter)
)
class GSM8KAdapter(DataAdapter):

def __init__(self, **kwargs):
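With this change the model adapter is declared alongside the other registration arguments instead of trailing them. A sketch of how a new benchmark might be registered with the same decorator follows; the registry name, dataset id, and adapter class are made up, and the `ChatGenerationModelAdapter` import path mirrors the `evalscope.models` import visible in the hellaswag diff below rather than anything confirmed for gsm8k:

```python
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy
from evalscope.models import ChatGenerationModelAdapter  # assumed import path


# Hypothetical benchmark registration mirroring the decorator arguments above.
@Benchmark.register(
    name='my_benchmark',                       # key under which the adapter is registered
    dataset_id='my-org/my-dataset',            # hypothetical ModelScope dataset id
    model_adapter=ChatGenerationModelAdapter,  # generation-style evaluation, as for gsm8k
    subset_list=['main'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=4,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyBenchmarkAdapter(DataAdapter):

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
```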
5 changes: 0 additions & 5 deletions evalscope/benchmarks/hellaswag/__init__.py
@@ -1,6 +1 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST
from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter
from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass
from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa
108 changes: 17 additions & 91 deletions evalscope/benchmarks/hellaswag/hellaswag_adapter.py
@@ -3,54 +3,40 @@
import os
import re

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import normalize_score
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import ContinuationLogitsModelAdapter
from evalscope.utils.io_utils import jsonl_to_list
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/hellaswag'
SUBSET_LIST = ['default']


@Benchmark.register(
name='hellaswag',
dataset_id='modelscope/hellaswag',
model_adapter=ContinuationLogitsModelAdapter,
subset_list=['default'],
metric_list=[WeightedAverageAccuracy],
few_shot_num=0,
train_split='train',
eval_split='validation',
prompt_template='',
)
class HellaSwagAdapter(DataAdapter):

choices = ['0', '1', '2', '3']

def __init__(self,
subset_list: list = None,
metric_list: list = None,
few_shot_num: int = None,
train_split: str = 'train',
eval_split: str = 'validation',
**kwargs):

if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]

if few_shot_num is None:
# Use 0-shot by default
logger.info(f'Set 0-shot examples by system for HellaSwag.')
few_shot_num = 0
def __init__(self, **kwargs):

few_shot_num = kwargs.get('few_shot_num', None)
if few_shot_num != 0:
logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.')
few_shot_num = 0

super().__init__(
subset_list=subset_list,
metric_list=metric_list,
few_shot_num=few_shot_num,
train_split=train_split,
eval_split=eval_split,
**kwargs)
super().__init__(**kwargs)

def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
data_dict = {}
@@ -136,66 +136,6 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s
def match(self, gold: str, pred: str) -> float:
return exact_match(gold=str(gold), pred=str(pred))

def compute_metric(self, review_res_list: list) -> float:
"""
Compute evaluation result by specific metric.
Args:
review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
Returns:
The metric score.
"""
items = [(score, 1.0) for score in review_res_list]
return weighted_mean(items)

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate the report for the model output.
Args:
subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
report_name: The user-defined report name.
Returns: A dict of metric calculation results. The format is like:
{
"name":"HellaSwag",
"metric":"WeightedAverageAccuracy",
"score":0.3389,
"category":[
{
"name":"DEFAULT",
"score":0.4128,
"subset":[
{
"name":"default",
"score":0.5632
},
]
}
],
"total_num":7800
}
"""
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score)
} for subset_name, (score, _) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'hellaswag',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

@classmethod
def _preprocess(cls, text):
text = text.strip()
2 changes: 1 addition & 1 deletion evalscope/evaluator/reviewer/auto_reviewer.py
@@ -11,7 +11,7 @@
from typing import Any, List

from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation
from evalscope.models.openai_model import OpenAIModel
from evalscope.models.model import OpenAIModel
from evalscope.utils import completion_parsers, random_seeded_choice
from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences
from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list
1 change: 1 addition & 0 deletions evalscope/models/__init__.py
@@ -1,4 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.models.custom import *
from evalscope.models.model import BaseModel, ChatBaseModel
from evalscope.models.model_adapter import *
3 changes: 0 additions & 3 deletions evalscope/models/api/__init__.py

This file was deleted.

49 changes: 0 additions & 49 deletions evalscope/models/dummy_chat_model.py

This file was deleted.

