add server
Yunnglin committed Dec 20, 2024
1 parent a3b9b9f commit 85b6577
Showing 10 changed files with 99 additions and 149 deletions.
13 changes: 10 additions & 3 deletions evalscope/arguments.py
@@ -1,6 +1,8 @@
import argparse
import json

from evalscope.constants import EvalBackend, EvalStage, EvalType


class ParseStrArgsAction(argparse.Action):

@@ -47,10 +49,13 @@ def add_argument(parser: argparse.ArgumentParser):
parser.add_argument('--generation-config', type=str, action=ParseStrArgsAction, help='The generation config, should be a string.') # noqa: E501

# Evaluation-related arguments
parser.add_argument('--eval-type', type=str, help='The type for evaluating.')
parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.')
parser.add_argument('--eval-type', type=str, help='The type for evaluating.',
choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
parser.add_argument('--eval-backend', type=str, help='The evaluation backend to use.',
choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL]) # noqa: E501
parser.add_argument('--eval-config', type=str, required=False, help='The eval task config file path for evaluation backend.') # noqa: E501
parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.')
parser.add_argument('--stage', type=str, default='all', help='The stage of evaluation pipeline.',
choices=[EvalStage.ALL, EvalStage.INFER, EvalStage.EVAL])
parser.add_argument('--limit', type=int, default=None, help='Max evaluation samples num for each subset.')

# Cache and working directory arguments
@@ -62,6 +67,8 @@ def add_argument(parser: argparse.ArgumentParser):
parser.add_argument('--debug', action='store_true', default=False, help='Debug mode, will print information for debugging.') # noqa: E501
parser.add_argument('--dry-run', action='store_true', default=False, help='Dry run in single processing mode.')
parser.add_argument('--seed', type=int, default=42, help='Random seed for reproducibility.')
parser.add_argument('--api-key', type=str, default='EMPTY', help='The API key for the remote API model.')
parser.add_argument('--api-url', type=str, default=None, help='The API url for the remote API model.')
# yapf: enable


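With choices attached, invalid values for these flags are now rejected at parse time. A minimal standalone sketch (not the evalscope parser itself; it assumes the imported constants are plain strings, which their use with type=str above implies):

    import argparse

    from evalscope.constants import EvalBackend, EvalType

    parser = argparse.ArgumentParser()
    parser.add_argument('--eval-type', type=str,
                        choices=[EvalType.CHECKPOINT, EvalType.CUSTOM, EvalType.SERVICE])
    parser.add_argument('--eval-backend', type=str,
                        choices=[EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS,
                                 EvalBackend.VLM_EVAL_KIT, EvalBackend.RAG_EVAL])

    # Passing one of the constants is always accepted; any other string makes
    # argparse exit with an "invalid choice" error.
    args = parser.parse_args(['--eval-type', EvalType.SERVICE])
    print(args.eval_type)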
103 changes: 18 additions & 85 deletions evalscope/benchmarks/bbh/bbh_adapter.py
@@ -5,18 +5,17 @@
import random
import re

from evalscope.benchmarks.data_adapter import DataAdapter
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.constants import AnswerKeys
from evalscope.metrics.metrics import exact_match, weighted_mean
from evalscope.utils import ResponseParser, normalize_score
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models.chat_adapter import ChatGenerationModelAdapter
from evalscope.utils import ResponseParser
from evalscope.utils.logger import get_logger

# flake8: noqa

logger = get_logger()

DATASET_ID = 'modelscope/bbh'

# BBH multiple choice subset list
MULTIPLE_CHOICE = 'multiple_choice'
MULTIPLE_CHOICE_LIST = [
@@ -59,25 +58,25 @@
SUBSET_LIST = MULTIPLE_CHOICE_LIST + FREE_FORM_LIST


@Benchmark.register(
name='bbh',
dataset_id='modelscope/bbh',
model_adapter=ChatGenerationModelAdapter,
subset_list=SUBSET_LIST,
metric_list=[WeightedAverageAccuracy],
few_shot_num=0,
train_split=None,
eval_split='test',
prompt_template='',
)
class BBHAdapter(DataAdapter):
"""
Adapter for BBH free-form and multiple-choices sub-tasks.
"""

def __init__(self,
subset_list: list = None,
metric_list: list = None,
few_shot_num: int = None,
train_split: str = None,
eval_split: str = 'test',
**kwargs):

if subset_list is None:
subset_list = SUBSET_LIST

if metric_list is None:
metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}]
def __init__(self, **kwargs):

few_shot_num = kwargs.get('few_shot_num', None)
if few_shot_num is None:
logger.info(f'Set 3-shot examples by system for BBH.')
few_shot_num = 3
@@ -87,13 +86,7 @@ def __init__(self,
f'Use 3-shot by default.')
few_shot_num = 3

super().__init__(
subset_list=subset_list,
metric_list=metric_list,
few_shot_num=few_shot_num,
train_split=train_split,
eval_split=eval_split,
**kwargs)
super().__init__(**kwargs)

def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict:
data_dict = {}
@@ -217,66 +210,6 @@ def parse_pred_result(self, result: str, raw_input_d: dict = None, eval_type: str
def match(self, gold: str, pred: str) -> float:
return exact_match(gold=gold, pred=pred)

def compute_metric(self, review_res_list: list) -> float:
"""
Compute evaluation result by specific metric.
Args:
review_res_list: review score list, e.g. [0, 1, 1, 0, ...]
Returns:
The metric score.
"""
items = [(score, 1.0) for score in review_res_list]
return weighted_mean(items)

def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict:
"""
Generate the report for the model output.
Args:
subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...}
report_name: The user-defined report name.
Returns: A dict of metric calculation results. The format is like:
{
"name":"BBH",
"metric":"WeightedAverageAccuracy",
"score":0.3389,
"category":[
{
"name":"DEFAULT",
"score":0.3389,
"subset":[
{
"name":"BBH",
"score":0.3389
},
]
}
],
"total_num":100
}
"""
total_num: int = sum([num for _, num in subset_score_map.values()])
weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num
weighted_avg_acc = normalize_score(score=weighted_avg_acc)
cate_avg_list = [{
'name': subset_name,
'score': normalize_score(score=score)
} for subset_name, (score, _) in subset_score_map.items()]

category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list)

res_map = dict(
name=report_name or 'bbh',
metric=self.metric_list[0]['name'],
score=weighted_avg_acc,
category=[category_d],
total_num=total_num)

return res_map

@classmethod
def _extract_mc_answer(cls, ans: str) -> str:
"""
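The adapter's defaults now come from the Benchmark.register decorator instead of keyword arguments in __init__. A generic sketch of that decorator-registry pattern (illustrative only; evalscope's actual Benchmark implementation is not reproduced here):

    # Hypothetical stand-in registry, not evalscope's Benchmark class.
    _BENCHMARKS = {}

    def register(name, **defaults):
        def wrapper(cls):
            _BENCHMARKS[name] = (cls, defaults)
            return cls
        return wrapper

    @register(name='bbh', few_shot_num=0, eval_split='test')
    class BBHAdapterSketch:
        def __init__(self, **kwargs):
            # Defaults declared on the decorator arrive via **kwargs, mirroring
            # how BBHAdapter.__init__ now only takes **kwargs.
            self.few_shot_num = kwargs.get('few_shot_num')
            self.eval_split = kwargs.get('eval_split')

    adapter_cls, defaults = _BENCHMARKS['bbh']
    adapter = adapter_cls(**defaults)
    assert adapter.eval_split == 'test'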
2 changes: 2 additions & 0 deletions evalscope/config.py
@@ -65,6 +65,8 @@ class TaskConfig:
debug: bool = False
dry_run: bool = False
seed: int = 42
api_url: Optional[str] = None # Only used for server model
api_key: Optional[str] = 'EMPTY' # Only used for server model

def __post_init__(self):
if (not self.model_id) and self.model:
32 changes: 5 additions & 27 deletions evalscope/constants.py
@@ -140,30 +140,8 @@ class EvalType:


class EvalBackend:

class _Backend:
# compatible with old version, set 'value'

def __init__(self, value):
self._value = value

@property
def value(self):
return self._value

def __str__(self):
return self._value

def __repr__(self):
return f"'{self._value}'"

def __eq__(self, other):
if isinstance(other, str):
return self._value == other
return NotImplemented

NATIVE = _Backend('Native')
OPEN_COMPASS = _Backend('OpenCompass')
VLM_EVAL_KIT = _Backend('VLMEvalKit')
RAG_EVAL = _Backend('RAGEval')
THIRD_PARTY = _Backend('ThirdParty')
NATIVE = 'Native'
OPEN_COMPASS = 'OpenCompass'
VLM_EVAL_KIT = 'VLMEvalKit'
RAG_EVAL = 'RAGEval'
THIRD_PARTY = 'ThirdParty'
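With the _Backend wrapper removed, the members are ordinary strings, so the comparison the wrapper's __eq__ emulated now comes for free. A minimal sketch:

    from evalscope.constants import EvalBackend

    backend = EvalBackend.NATIVE
    assert backend == 'Native'        # plain string comparison, no custom __eq__ needed
    assert isinstance(backend, str)
    # Code that relied on the old .value accessor should use the plain string,
    # or the Enum added in evalscope/utils/model_utils.py later in this commit.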
8 changes: 5 additions & 3 deletions evalscope/models/base_adapter.py
@@ -1,15 +1,17 @@
import torch
from abc import ABC, abstractmethod
from typing import Any, Union
from typing import Any, Optional, Union

from evalscope.models.custom import CustomModel
from evalscope.models.local_model import LocalModel


class BaseModelAdapter(ABC):

def __init__(self, model: Union[LocalModel, CustomModel], **kwargs):
if isinstance(model, LocalModel):
def __init__(self, model: Optional[Union[LocalModel, CustomModel]], **kwargs):
if model is None:
self.model_cfg = kwargs.get('model_cfg', None)
elif isinstance(model, LocalModel):
self.model = model.model
self.model_id = model.model_id
self.model_revision = model.model_revision
59 changes: 30 additions & 29 deletions evalscope/models/server_adapter.py
@@ -3,25 +3,28 @@
from typing import Union

from evalscope.models.base_adapter import BaseModelAdapter
from evalscope.models.custom import CustomModel
from evalscope.models.local_model import LocalModel
from evalscope.utils.chat_service import ChatCompletionResponse
from evalscope.utils.logger import get_logger

logger = get_logger()


class ServerModelAdapter(BaseModelAdapter):
"""
Server model adapter to request remote API model and generate results.
"""

def __init__(self, model: Union[LocalModel, CustomModel], api_url: str, **kwargs):
def __init__(self, api_url: str, model_id: str, api_key: str = 'EMPTY', **kwargs):
"""
Args:
model: The model instance.
api_url: The URL of the remote API model.
**kwargs: Other args.
model_id: The ID of the remote API model.
api_key: The API key of the remote API model.
"""
super().__init__(model, **kwargs)
self.api_url = api_url
self.model_id = model_id
self.api_key = api_key
self.model_cfg = {'api_url': api_url, 'model_id': model_id, 'api_key': api_key}
super().__init__(model=None, model_cfg=self.model_cfg, **kwargs)

def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
"""
@@ -48,33 +51,31 @@ def predict(self, inputs: Union[str, dict, list], infer_cfg: dict = None) -> dict:
raise TypeError(f'Unsupported inputs type: {type(inputs)}')

# Format request JSON according to OpenAI API format
# do not sample by default
request_json = {
'model': self.model_id,
'prompt': query,
'messages': [{
'role': 'user',
'content': query
}],
'max_tokens': infer_cfg.get('max_tokens', 2048),
'temperature': infer_cfg.get('temperature', 1.0),
'temperature': infer_cfg.get('temperature', 0.0),
'top_p': infer_cfg.get('top_p', 1.0),
'n': infer_cfg.get('num_return_sequences', 1),
'stop': infer_cfg.get('stop', None)
}

# Request to remote API
response = requests.post(self.api_url, json=request_json)
response_data = response.json()

choices_list = [{
'index': i,
'message': {
'content': choice['text'],
'role': 'assistant'
}
} for i, choice in enumerate(response_data['choices'])]

res_d = ChatCompletionResponse(
model=self.model_id,
choices=choices_list,
object='chat.completion',
created=int(time.time()),
usage=response_data.get('usage', None)).model_dump(exclude_unset=True)

return res_d
# Request to remote API with retry mechanism
max_retries = 3
for attempt in range(max_retries):
response = requests.post(
self.api_url, json=request_json, headers={'Authorization': f'Bearer {self.api_key}'})
if response.status_code == 200:
response_data = response.json()
return response_data
logger.warning(f'Failed to request to remote API: {response.status_code} {response.text}')
if attempt < max_retries - 1:
time.sleep(5) # Sleep for 5 seconds before retrying
else:
raise RuntimeError(f'Failed to request to remote API after {max_retries} attempts: '
f'{response.status_code} {response.text}')
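A hedged usage sketch of the new adapter (it assumes an OpenAI-compatible chat-completions server is already running; the URL and model name are the same placeholders used by the test at the end of this commit):

    from evalscope.models import ServerModelAdapter

    adapter = ServerModelAdapter(
        api_url='http://127.0.0.1:8801/v1/chat/completions',  # assumed local endpoint
        model_id='qwen2.5',                                    # assumed served model name
        api_key='EMPTY',
    )
    result = adapter.predict('What is 1 + 1?', infer_cfg={'max_tokens': 64})
    # The adapter returns the server's raw chat.completion payload, so the generated
    # text is typically at result['choices'][0]['message']['content'].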
2 changes: 1 addition & 1 deletion evalscope/run.py
@@ -160,7 +160,7 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model
return CustomModelAdapter(custom_model=task_cfg.model)
elif task_cfg.eval_type == EvalType.SERVICE:
from evalscope.models import ServerModelAdapter
return ServerModelAdapter(url=task_cfg.model, model_id=task_cfg.model_id)
return ServerModelAdapter(api_url=task_cfg.api_url, model_id=task_cfg.model, api_key=task_cfg.api_key)
else:
return model_adapter_cls(
model=base_model or get_base_model(task_cfg),
2 changes: 1 addition & 1 deletion evalscope/utils/__init__.py
@@ -1,4 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

from evalscope.constants import *
from evalscope.utils.model_utils import EvalBackend
from evalscope.utils.utils import *
9 changes: 9 additions & 0 deletions evalscope/utils/model_utils.py
@@ -1,6 +1,15 @@
from enum import Enum
from transformers import GenerationConfig


class EvalBackend(Enum):
NATIVE = 'Native'
OPEN_COMPASS = 'OpenCompass'
VLM_EVAL_KIT = 'VLMEvalKit'
RAG_EVAL = 'RAGEval'
THIRD_PARTY = 'ThirdParty'


def fix_do_sample_warning(generation_config: GenerationConfig) -> None:
# Use the default values of temperature/top_p/top_k in generation_config.
if generation_config.temperature == 0:
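Unlike the plain string constants in evalscope/constants.py, this Enum keeps a .value accessor and can be resolved from the string it wraps. A minimal sketch:

    from evalscope.utils.model_utils import EvalBackend

    assert EvalBackend('Native') is EvalBackend.NATIVE   # look up a member by its value
    assert EvalBackend.NATIVE.value == 'Native'          # .value accessor is preserved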
18 changes: 18 additions & 0 deletions tests/cli/test_run.py
Expand Up @@ -4,6 +4,7 @@
import torch
import unittest

from evalscope.constants import EvalType
from evalscope.run import run_task
from evalscope.utils import is_module_installed, test_level_list
from evalscope.utils.logger import get_logger
@@ -110,5 +111,22 @@ def test_run_humaneval(self):

run_task(task_cfg=task_cfg)

@unittest.skipUnless(0 in test_level_list(), 'skip test in current test level')
def test_run_server_model(self):
from evalscope.config import TaskConfig

task_cfg = TaskConfig(
model='qwen2.5',
api_url='http://127.0.0.1:8801/v1/chat/completions',
api_key='EMPTY',
eval_type=EvalType.SERVICE,
datasets=['gsm8k', 'bbh'],
limit=2,
debug=True
)

run_task(task_cfg=task_cfg)


if __name__ == '__main__':
unittest.main()
