add server
Yunnglin committed Dec 20, 2024
1 parent a3b9b9f commit 85b6577
Showing 10 changed files with 99 additions and 149 deletions.
13 changes: 10 additions & 3 deletions evalscope/arguments.py
@@ -1,6 +1,8 @@
import argparse
import json

from evalscope.constants import EvalBackend, EvalStage, EvalType


class ParseStrArgsAction(argparse.Action):

@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

# Evaluation-related arguments
parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.EVAL])
parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

# Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
# yapf: enable


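With choices attached, invalid values for these flags are now rejected at parse time. A minimal standalone sketch (not the evalscope parser itself; it assumes the imported constants are plain strings, which their use with type=str above implies):

    import argparse

    from evalscope.constants import EvalBackend, EvalType

    parser = argparse.ArgumentParser()
    parser.add_argument('--eval-type', type=str,
                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
    parser.add_argument('--eval-backend', type=str,
                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS,
                                 EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])

    # Passing one of the constants is always accepted; any other string makes
    # argparse exit with an "invalid choice" error.
    args = parser.parse_args(['--eval-type', EvalType.SERVICE])
    print(args.eval_type)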
103 changes: 18 additions & 85 deletions evalscope/benchmarks/bbh/bbh_adapter.py
@@ -5,18 +5,17 @@
import random
import re

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import AnswerKeys
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import ResponseParser, normalize_score
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models.chat_adapter import ChatGenerationModelAdapter
from evalscope.utils import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/bbh'

# BBH multiple choice subset list
MULTIPLE_CHOICE = 'multiple_choice'
MULTIPLE_CHOICE_LIST = [
@@ -59,25 +58,25 @@
SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST


@Benchmark.register(
name='bbh',
dataset_id='modelscope/bbh',
model_adapter=ChatGenerationModelAdapter,
subset_list=SUBSET_LIST,
metric_list=[WeightedAverageAccuracy],
few_shot_num=0,
train_split=None,
eval_split='test',
prompt_template='',
)
class BBHAdapter(DataAdapter):
"""
Adapter for BBH free-form and multiple-choices sub-tasks.
"""

def __init__(self,
subset_list: list = None,
metric_list: list = None,
few_shot_num: int = None,
train_split: str = None,
eval_split: str = 'test',
**kwargs):

if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
def __init__(self, **kwargs):

few_shot_num = kwargs.get('few_shot_num', None)
if few_shot_num is None:
logger.info(f'Set 3-shot examples by system for BBH.')
few_shot_num = 3
@@ -87,13 +86,7 @@ def __init__(self,
f'Use 3-shot by default.')
few_shot_num = 3

super().__init__(
subset_list=subset_list,
metric_list=metric_list,
few_shot_num=few_shot_num,
train_split=train_split,
eval_split=eval_split,
**kwargs)
super().__init__(**kwargs)

def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
data_dict = {}
@@ -217,66 +210,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str
def match(self, gold: str, pred: str) -> float:
return exact_match(gold=gold, pred=pred)

def compute_metric(self, review_res_list: list) -> float:
"""
Compute evaluation result by specific metric.
Args:
review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
Returns:
The metric score.
"""
items = [(score, 1.0) for score in review_res_list]
return weighted_mean(items)

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate the report for the model output.
Args:
subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
report_name: The user-defined report name.
Returns: A dict of metric calculation results. The format is like:
{
"name":"BBH",
"metric":"WeightedAverageAccuracy",
"score":0.3389,
"category":[
{
"name":"DEFAULT",
"score":0.3389,
"subset":[
{
"name":"BBH",
"score":0.3389
},
]
}
],
"total_num":100
}
"""
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score)
} for subset_name, (score, _) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'bbh',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

@classmethod
def _extract_mc_answer(cls, ans: str) -> str:
"""
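The adapter's defaults now come from the Benchmark.register decorator instead of keyword arguments in __init__. A generic sketch of that decorator-registry pattern (illustrative only; evalscope's actual Benchmark implementation is not reproduced here):

    # Hypothetical stand-in registry, not evalscope's Benchmark class.
    _BENCHMARKS = {}

    def register(name, **defaults):
        def wrapper(cls):
            _BENCHMARKS[name] = (cls, defaults)
            return cls
        return wrapper

    @register(name='bbh', few_shot_num=0, eval_split='test')
    class BBHAdapterSketch:
        def __init__(self, **kwargs):
            # Defaults declared on the decorator arrive via **kwargs, mirroring
            # how BBHAdapter.__init__ now only takes **kwargs.
            self.few_shot_num = kwargs.get('few_shot_num')
            self.eval_split = kwargs.get('eval_split')

    adapter_cls, defaults = _BENCHMARKS['bbh']
    adapter = adapter_cls(**defaults)
    assert adapter.eval_split == 'test'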
2 changes: 2 additions & 0 deletions evalscope/config.py
@@ -65,6 +65,8 @@ class TaskConfig:
debug: bool = False
dry_run: bool = False
seed: int = 42
api_url: Optional[str] = None # Only used for server model
api_key: Optional[str] = 'EMPTY' # Only used for server model

def __post_init__(self):
if (not self.model_id) and self.model:
32 changes: 5 additions & 27 deletions evalscope/constants.py
@@ -140,30 +140,8 @@ class EvalType:


class EvalBackend:

class _Backend:
# compatible with old version, set 'value'

def __init__(self, value):
self._value = value

@property
def value(self):
return self._value

def __str__(self):
return self._value

def __repr__(self):
return f"'{self._value}'"

def __eq__(self, other):
if isinstance(other, str):
return self._value == other
return NotImplemented

NATIVE = _Backend('Native')
OPEN_COMPASS = _Backend('OpenCompass')
VLM_EVAL_KIT = _Backend('VLMEvalKit')
RAG_EVAL = _Backend('RAGEval')
THIRD_PARTY = _Backend('ThirdParty')
NATIVE = 'Native'
OPEN_COMPASS = 'OpenCompass'
VLM_EVAL_KIT = 'VLMEvalKit'
RAG_EVAL = 'RAGEval'
THIRD_PARTY = 'ThirdParty'
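With the _Backend wrapper removed, the members are ordinary strings, so the comparison the wrapper's __eq__ emulated now comes for free. A minimal sketch:

    from evalscope.constants import EvalBackend

    backend = EvalBackend.NATIVE
    assert backend == 'Native'        # plain string comparison, no custom __eq__ needed
    assert isinstance(backend, str)
    # Code that relied on the old .value accessor should use the plain string,
    # or the Enum added in evalscope/utils/model_utils.py later in this commit.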
8 changes: 5 additions & 3 deletions evalscope/models/base_adapter.py
@@ -1,15 +1,17 @@
import torch
from abc import ABC, abstractmethod
from typing import Any, Union
from typing import Any, Optional, Union

from evalscope.models.custom import CustomModel
from evalscope.models.local_model import LocalModel


class BaseModelAdapter(ABC):

def __init__(self, model: Union[LocalModel, CustomModel], **kwargs):
if isinstance(model, LocalModel):
def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
if model is None:
self.model_cfg = kwargs.get('model_cfg', None)
elif isinstance(model, LocalModel):
self.model = model.model
self.model_id = model.model_id
self.model_revision = model.model_revision
59 changes: 30 additions & 29 deletions evalscope/models/server_adapter.py
@@ -3,25 +3,28 @@
from typing import Union

from evalscope.models.base_adapter import BaseModelAdapter
from evalscope.models.custom import CustomModel
from evalscope.models.local_model import LocalModel
from evalscope.utils.chat_service import ChatCompletionResponse
from evalscope.utils.logger import get_logger

logger = get_logger()


class ServerModelAdapter(BaseModelAdapter):
"""
Server model adapter to request remote API model and generate results.
"""

def __init__(self, model: Union[LocalModel, CustomModel], api_url: str, **kwargs):
def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
"""
Args:
model: The model instance.
api_url: The URL of the remote API model.
**kwargs: Other args.
model_id: The ID of the remote API model.
api_key: The API key of the remote API model.
"""
super().__init__(model, **kwargs)
self.api_url = api_url
self.model_id = model_id
self.api_key = api_key
self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
"""
@@ -48,33 +51,31 @@ def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
raise TypeError(f'Unsupported inputs type: {type(inputs)}')

# Format request JSON according to OpenAI API format
# do not sample by default
request_json = {
'model': self.model_id,
'prompt': query,
'messages': [{
'role': 'user',
'content': query
}],
'max_tokens': infer_cfg.get('max_tokens', 2048),
'temperature': infer_cfg.get('temperature', 1.0),
'temperature': infer_cfg.get('temperature', 0.0),
'top_p': infer_cfg.get('top_p', 1.0),
'n': infer_cfg.get('num_return_sequences', 1),
'stop': infer_cfg.get('stop', None)
}

# Request to remote API
response = requests.post(self.api_url, json=request_json)
response_data = response.json()

choices_list = [{
'index': i,
'message': {
'content': choice['text'],
'role': 'assistant'
}
} for i, choice in enumerate(response_data['choices'])]

res_d = ChatCompletionResponse(
model=self.model_id,
choices=choices_list,
object='chat.completion',
created=int(time.time()),
usage=response_data.get('usage', None)).model_dump(exclude_unset=True)

return res_d
# Request to remote API with retry mechanism
max_retries = 3
for attempt in range(max_retries):
response = requests.post(
self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'})
if response.status_code == 200:
response_data = response.json()
return response_data
logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
if attempt < max_retries - 1:
time.sleep(5) # Sleep for 5 seconds before retrying
else:
raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: '
f'{response.status_code} {response.text}')
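A hedged usage sketch of the new adapter (it assumes an OpenAI-compatible chat-completions server is already running; the URL and model name are the same placeholders used by the test at the end of this commit):

    from evalscope.models import ServerModelAdapter

    adapter = ServerModelAdapter(
        api_url='http://127.0.0.1:8801/v1/chat/completions',  # assumed local endpoint
        model_id='qwen2.5',                                    # assumed served model name
        api_key='EMPTY',
    )
    result = adapter.predict('What is 1 + 1?', infer_cfg={'max_tokens': 64})
    # The adapter returns the server's raw chat.completion payload, so the generated
    # text is typically at result['choices'][0]['message']['content'].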
2 changes: 1 addition & 1 deletion evalscope/run.py
@@ -160,7 +160,7 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model
return CustomModelAdapter(custom_model=task_cfg.model)
elif task_cfg.eval_type == EvalType.SERVICE:
from evalscope.models import ServerModelAdapter
return ServerModelAdapter(url=task_cfg.model, model_id=task_cfg.model_id)
return ServerModelAdapter(api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key)
else:
return model_adapter_cls(
model=base_model or get_base_model(task_cfg),
2 changes: 1 addition & 1 deletion evalscope/utils/__init__.py
@@ -1,4 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.constants import *
from evalscope.utils.model_utils import EvalBackend
from evalscope.utils.utils import *
9 changes: 9 additions & 0 deletions evalscope/utils/model_utils.py
@@ -1,6 +1,15 @@
from enum import Enum
from transformers import GenerationConfig


class EvalBackend(Enum):
NATIVE = 'Native'
OPEN_COMPASS = 'OpenCompass'
VLM_EVAL_KIT = 'VLMEvalKit'
RAG_EVAL = 'RAGEval'
THIRD_PARTY = 'ThirdParty'


def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
# Use the default values of temperature/top_p/top_k in generation_config.
if generation_config.temperature == 0:
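Unlike the plain string constants in evalscope/constants.py, this Enum keeps a .value accessor and can be resolved from the string it wraps. A minimal sketch:

    from evalscope.utils.model_utils import EvalBackend

    assert EvalBackend('Native') is EvalBackend.NATIVE   # look up a member by its value
    assert EvalBackend.NATIVE.value == 'Native'          # .value accessor is preserved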
18 changes: 18 additions & 0 deletions tests/cli/test_run.py
Expand Up @@ -4,6 +4,7 @@
import torch
import unittest

from evalscope.constants import EvalType
from evalscope.run import run_task
from evalscope.utils import is_module_installed, test_level_list
from evalscope.utils.logger import get_logger
@@ -110,5 +111,22 @@ def test_run_humaneval(self):

run_task(task_cfg=task_cfg)

@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_server_model(self):
from evalscope.config import TaskConfig

task_cfg = TaskConfig(
model='qwen2.5',
api_url='http://127.0.0.1:8801/v1/chat/completions',
api_key='EMPTY',
eval_type=EvalType.SERVICE,
datasets=['gsm8k', 'bbh'],
limit=2,
debug=True
)

run_task(task_cfg=task_cfg)


if __name__ == '__main__':
unittest.main()
