From 2f941a66ed7d07a09e458d0bb0af855c4251aabb Mon Sep 17 00:00:00 2001 From: Yunnglin Date: Thu, 19 Dec 2024 20:46:39 +0800 Subject: [PATCH] update model adapter --- .../backend/rageval_backend/mteb.md | 3 +- evalscope/benchmarks/data_adapter.py | 10 +- evalscope/benchmarks/gsm8k/gsm8k_adapter.py | 3 +- evalscope/benchmarks/hellaswag/__init__.py | 5 - .../benchmarks/hellaswag/hellaswag_adapter.py | 108 ++-------- evalscope/evaluator/reviewer/auto_reviewer.py | 2 +- evalscope/models/__init__.py | 1 + evalscope/models/api/__init__.py | 3 - evalscope/models/dummy_chat_model.py | 49 ----- evalscope/models/model.py | 141 ++++++++++++ evalscope/models/model_adapter.py | 204 +++++++----------- evalscope/models/openai_model.py | 103 --------- evalscope/run.py | 45 ++-- .../longbench_write/tools}/openai_api.py | 0 tests/cli/test_run.py | 2 +- 15 files changed, 272 insertions(+), 407 deletions(-) delete mode 100644 evalscope/models/api/__init__.py delete mode 100644 evalscope/models/dummy_chat_model.py delete mode 100644 evalscope/models/openai_model.py rename evalscope/{models/api => third_party/longbench_write/tools}/openai_api.py (100%) diff --git a/docs/zh/user_guides/backend/rageval_backend/mteb.md b/docs/zh/user_guides/backend/rageval_backend/mteb.md index 1a864a55..0e0937d5 100644 --- a/docs/zh/user_guides/backend/rageval_backend/mteb.md +++ b/docs/zh/user_guides/backend/rageval_backend/mteb.md @@ -102,7 +102,8 @@ one_stage_task_cfg = { ### 两阶段评测 -配置文件示例如下,先进行检索,再进行reranking: +评测reranker需要用retrieval数据集,先用embedding模型检索topk,再进行排序。配置文件示例如下: + ```python two_stage_task_cfg = { "eval_backend": "RAGEval", diff --git a/evalscope/benchmarks/data_adapter.py b/evalscope/benchmarks/data_adapter.py index da3a72e2..18f823ed 100644 --- a/evalscope/benchmarks/data_adapter.py +++ b/evalscope/benchmarks/data_adapter.py @@ -139,11 +139,11 @@ def gen_prompts(self, data_dict: dict) -> dict: prompt_d = self.gen_prompt(input_d=sample_d, subset_name=sub_name, few_shot_list=few_shot_data) prompt_d[AnswerKeys.RAW_INPUT] = sample_d res_dict[sub_name].append(prompt_d) - - rnd = random.Random() - rnd.seed(42) - for k, v in res_dict.items(): - rnd.shuffle(v) + # Note: for multiprocess + # rnd = random.Random() + # rnd.seed(42) + # for k, v in res_dict.items(): + # rnd.shuffle(v) return res_dict diff --git a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py index 5aa67e97..450df31d 100644 --- a/evalscope/benchmarks/gsm8k/gsm8k_adapter.py +++ b/evalscope/benchmarks/gsm8k/gsm8k_adapter.py @@ -17,13 +17,14 @@ @Benchmark.register( name='gsm8k', dataset_id='modelscope/gsm8k', + model_adapter=ChatGenerationModelAdapter, subset_list=['main'], metric_list=[WeightedAverageAccuracy], few_shot_num=4, train_split='train', eval_split='test', prompt_template='', - model_adapter=ChatGenerationModelAdapter) +) class GSM8KAdapter(DataAdapter): def __init__(self, **kwargs): diff --git a/evalscope/benchmarks/hellaswag/__init__.py b/evalscope/benchmarks/hellaswag/__init__.py index 5899f3de..b937315b 100644 --- a/evalscope/benchmarks/hellaswag/__init__.py +++ b/evalscope/benchmarks/hellaswag/__init__.py @@ -1,6 +1 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
- -from evalscope.benchmarks.hellaswag.hellaswag_adapter import DATASET_ID, SUBSET_LIST -from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter -from evalscope.benchmarks.hellaswag.hellaswag_adapter import HellaSwagAdapter as DataAdapterClass -from evalscope.models.model_adapter import ContinuationLogitsModelAdapter as ModelAdapterClass # noqa diff --git a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py index 4d5f7ef0..afae5570 100644 --- a/evalscope/benchmarks/hellaswag/hellaswag_adapter.py +++ b/evalscope/benchmarks/hellaswag/hellaswag_adapter.py @@ -3,9 +3,9 @@ import os import re -from evalscope.benchmarks.data_adapter import DataAdapter -from evalscope.metrics.metrics import exact_match, weighted_mean -from evalscope.utils import normalize_score +from evalscope.benchmarks import Benchmark, DataAdapter +from evalscope.metrics import WeightedAverageAccuracy, exact_match +from evalscope.models import ContinuationLogitsModelAdapter from evalscope.utils.io_utils import jsonl_to_list from evalscope.utils.logger import get_logger @@ -13,44 +13,30 @@ logger = get_logger() -DATASET_ID = 'modelscope/hellaswag' -SUBSET_LIST = ['default'] - +@Benchmark.register( + name='hellaswag', + dataset_id='modelscope/hellaswag', + model_adapter=ContinuationLogitsModelAdapter, + subset_list=['default'], + metric_list=[WeightedAverageAccuracy], + few_shot_num=0, + train_split='train', + eval_split='validation', + prompt_template='', +) class HellaSwagAdapter(DataAdapter): choices = ['0', '1', '2', '3'] - def __init__(self, - subset_list: list = None, - metric_list: list = None, - few_shot_num: int = None, - train_split: str = 'train', - eval_split: str = 'validation', - **kwargs): - - if subset_list is None: - subset_list = SUBSET_LIST - - if metric_list is None: - metric_list = [{'name': 'WeightedAverageAccuracy', 'object': weighted_mean}] - - if few_shot_num is None: - # Use 0-shot by default - logger.info(f'Set 0-shot examples by system for HellaSwag.') - few_shot_num = 0 + def __init__(self, **kwargs): + few_shot_num = kwargs.get('few_shot_num', None) if few_shot_num != 0: logger.warning(f'few_shot_num should be 0 for HellaSwag, but got {few_shot_num}. Use 0-shot by default.') few_shot_num = 0 - super().__init__( - subset_list=subset_list, - metric_list=metric_list, - few_shot_num=few_shot_num, - train_split=train_split, - eval_split=eval_split, - **kwargs) + super().__init__(**kwargs) def load_from_disk(self, dataset_name_or_path, subset_list, work_dir, **kwargs) -> dict: data_dict = {} @@ -136,66 +122,6 @@ def parse_pred_result(self, result: list, raw_input_d: dict = None, eval_type: s def match(self, gold: str, pred: str) -> float: return exact_match(gold=str(gold), pred=str(pred)) - def compute_metric(self, review_res_list: list) -> float: - """ - Compute evaluation result by specific metric. - - Args: - review_res_list: review score list, e.g. [0, 1, 1, 0, ...] - - Returns: - The metric score. - """ - items = [(score, 1.0) for score in review_res_list] - return weighted_mean(items) - - def gen_report(self, subset_score_map: dict, report_name: str = None) -> dict: - """ - Generate the report for the model output. - - Args: - subset_score_map: The subset-score mapping. e.g. {subset_name: (score, num), ...} - report_name: The user-defined report name. - - Returns: A dict of metric calculation results. 
The format is like: - { - "name":"HellaSwag", - "metric":"WeightedAverageAccuracy", - "score":0.3389, - "category":[ - { - "name":"DEFAULT", - "score":0.4128, - "subset":[ - { - "name":"default", - "score":0.5632 - }, - ] - } - ], - "total_num":7800 - } - """ - total_num: int = sum([num for _, num in subset_score_map.values()]) - weighted_avg_acc: float = sum([score * num for score, num in subset_score_map.values()]) / total_num - weighted_avg_acc = normalize_score(score=weighted_avg_acc) - cate_avg_list = [{ - 'name': subset_name, - 'score': normalize_score(score=score) - } for subset_name, (score, _) in subset_score_map.items()] - - category_d = dict(name='DEFAULT', score=weighted_avg_acc, subset=cate_avg_list) - - res_map = dict( - name=report_name or 'hellaswag', - metric=self.metric_list[0]['name'], - score=weighted_avg_acc, - category=[category_d], - total_num=total_num) - - return res_map - @classmethod def _preprocess(cls, text): text = text.strip() diff --git a/evalscope/evaluator/reviewer/auto_reviewer.py b/evalscope/evaluator/reviewer/auto_reviewer.py index 01902f45..4144f111 100644 --- a/evalscope/evaluator/reviewer/auto_reviewer.py +++ b/evalscope/evaluator/reviewer/auto_reviewer.py @@ -11,7 +11,7 @@ from typing import Any, List from evalscope.constants import ArenaMode, EvalConfigKeys, FnCompletionParser, PositionBiasMitigation -from evalscope.models.openai_model import OpenAIModel +from evalscope.models.model import OpenAIModel from evalscope.utils import completion_parsers, random_seeded_choice from evalscope.utils.arena_utils import get_battle_pairs, merge_ques_ans, shuffle_pairwise_preferences from evalscope.utils.io_utils import dump_jsonl_data, jsonl_to_list diff --git a/evalscope/models/__init__.py b/evalscope/models/__init__.py index 9afbad48..8fc22ebf 100644 --- a/evalscope/models/__init__.py +++ b/evalscope/models/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +from evalscope.models.custom import * from evalscope.models.model import BaseModel, ChatBaseModel from evalscope.models.model_adapter import * diff --git a/evalscope/models/api/__init__.py b/evalscope/models/api/__init__.py deleted file mode 100644 index a19bf86e..00000000 --- a/evalscope/models/api/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -from evalscope.models.api.openai_api import OpenaiApi diff --git a/evalscope/models/dummy_chat_model.py b/evalscope/models/dummy_chat_model.py deleted file mode 100644 index 578b5f59..00000000 --- a/evalscope/models/dummy_chat_model.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -import random -import time - -from evalscope.models import ChatBaseModel -from evalscope.utils.logger import get_logger - -logger = get_logger() - - -class DummyChatModel(ChatBaseModel): - - MODEL_ID = 'dummy_chat_model_0801' - REVISION = 'v1.0.0' - - def __init__(self, model_cfg: dict, **kwargs): - model_cfg['model_id'] = self.MODEL_ID - model_cfg['revision'] = self.REVISION - super(DummyChatModel, self).__init__(model_cfg=model_cfg) - - def predict(self, inputs: dict, **kwargs) -> dict: - - debug: bool = False - if debug: - messages = inputs['messages'] - history = inputs['history'] - - logger.info(f'** messages: {messages}') - logger.info(f'** history: {history}') - - choice = random.choice(['A', 'B', 'C', 'D']) - - # Build response - res = { - 'choices': [{ - 'index': 0, - 'message': { - 'content': choice, - 'role': 'assistant' - } - }], - 'created': time.time(), - 'model': self.MODEL_ID + '-' + self.REVISION, - 'object': 'chat.completion', - 'usage': {} - } - - return res diff --git a/evalscope/models/model.py b/evalscope/models/model.py index 826fb879..7f32f7b9 100644 --- a/evalscope/models/model.py +++ b/evalscope/models/model.py @@ -1,7 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import openai +import os +import random +import time from abc import ABC, abstractmethod from typing import Any +from evalscope.utils.logger import get_logger + +logger = get_logger() + class BaseModel(ABC): @@ -86,3 +94,136 @@ def predict(self, inputs: dict, **kwargs) -> dict: } """ raise NotImplementedError + + +class OpenAIModel(ChatBaseModel): + """ + APIs of OpenAI models. + Available models: gpt-3.5-turbo, gpt-4 + """ + + MAX_RETRIES = 3 + + def __init__(self, model_cfg: dict, **kwargs): + super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs) + + openai_api_key = os.environ.get('OPENAI_API_KEY', None) + self.api_key = self.model_cfg.get('api_key', openai_api_key) + + if not self.api_key: + logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY') + # raise ValueError( + # 'OpenAI API key is not provided, ' + # 'please set it in environment variable OPENAI_API_KEY') + + def predict(self, model_id: str, inputs: dict, **kwargs) -> dict: + + sys_prompt: str = inputs.get('sys_prompt', '') + user_prompt: str = inputs.get('user_prompt', '') + + # model_id: str = kwargs.get('model_id', '') + temperature: float = kwargs.pop('temperature', 0.2) + max_tokens: int = kwargs.pop('max_tokens', 1024) + mode: str = kwargs.pop('mode', 'chat.completion') + + logger.info(f'Using OpenAI model_id: {model_id}') + + res = self._predict( + model_id=model_id, + sys_prompt=sys_prompt, + user_prompt=user_prompt, + temperature=temperature, + max_tokens=max_tokens, + mode=mode) + + return res + + def _predict( + self, + model_id, + sys_prompt, + user_prompt, + temperature, + max_tokens, + mode: str = 'chat.completion', + ) -> dict: + + res = {} + openai.api_key = self.api_key + + for i in range(self.MAX_RETRIES): + try: + if mode == 'chat.completion': + resp = openai.ChatCompletion.create( + model=model_id, + messages=[{ + 'role': 'system', + 'content': sys_prompt + }, { + 'role': 'user', + 'content': user_prompt + }], + temperature=temperature, + max_tokens=max_tokens) + + if resp: + ans_text = resp['choices'][0]['message']['content'] + model_id = resp['model'] + else: + logger.warning(f'OpenAI GPT API call failed: got empty response ' + f'for input {sys_prompt} {user_prompt}') + ans_text = '' + model_id = '' + + res['ans_text'] = ans_text + res['model_id'] 
= model_id + else: + raise ValueError(f'Invalid mode: {mode}') + + return res + + except Exception as e: + logger.warning(f'OpenAI API call failed: {e}') + time.sleep(3) + logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries') + return res + + +class DummyChatModel(ChatBaseModel): + + MODEL_ID = 'dummy_chat_model_0801' + REVISION = 'v1.0.0' + + def __init__(self, model_cfg: dict, **kwargs): + model_cfg['model_id'] = self.MODEL_ID + model_cfg['revision'] = self.REVISION + super(DummyChatModel, self).__init__(model_cfg=model_cfg) + + def predict(self, inputs: dict, **kwargs) -> dict: + + debug: bool = False + if debug: + messages = inputs['messages'] + history = inputs['history'] + + logger.info(f'** messages: {messages}') + logger.info(f'** history: {history}') + + choice = random.choice(['A', 'B', 'C', 'D']) + + # Build response + res = { + 'choices': [{ + 'index': 0, + 'message': { + 'content': choice, + 'role': 'assistant' + } + }], + 'created': time.time(), + 'model': self.MODEL_ID + '-' + self.REVISION, + 'object': 'chat.completion', + 'usage': {} + } + + return res diff --git a/evalscope/models/model_adapter.py b/evalscope/models/model_adapter.py index d52bdf72..a3d56500 100644 --- a/evalscope/models/model_adapter.py +++ b/evalscope/models/model_adapter.py @@ -3,11 +3,9 @@ # flake8: noqa import numpy as np import os -import sys import time import torch from abc import ABC, abstractmethod -from copy import deepcopy from modelscope import AutoModelForCausalLM, AutoTokenizer, GenerationConfig from torch import dtype from typing import Any, Dict, List, Union @@ -21,77 +19,39 @@ logger = get_logger() -class BaseModelAdapter(ABC): - """ - Base class for model adapter. - """ - - def __init__(self, model, tokenizer, model_cfg: dict): - """ - Args: - model: The model instance which is compatible with - AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers. - tokenizer: The tokenizer instance which is compatible with AutoTokenizer of transformers. - model_cfg: - Attributes: model_id, model_revision, device_map, torch_dtype - """ - self.model = model - self.tokenizer = tokenizer - self.model_cfg = model_cfg - - @abstractmethod - @torch.no_grad() - def predict(self, *args, **kwargs) -> Any: - """ - Model prediction func. - """ - raise NotImplementedError - - -class MultiChoiceModelAdapter(BaseModelAdapter): - """ The multi-choice model adapter. """ - - _DEFAULT_MAX_LENGTH = 2048 +class LocalModel: def __init__(self, model_id: str, + model_revision: str = 'master', device_map: str = 'auto', torch_dtype: dtype = torch.bfloat16, - model_revision: str = None, - max_length: int = None, cache_dir: str = None, **kwargs): """ Args: - model_id: The model id on ModelScope, or local model_dir. TODO: torch.nn.module to be supported. + model_id: The model id on ModelScope, or local model_dir. + model_revision: The model revision on ModelScope. device_map: The device map for model inference. - torch_dtype: The torch dtype for model inference. Default: torch.bfloat16. - model_revision: The model revision on ModelScope. Default: None. - max_length: The max length of input sequence. Default: None. - **kwargs: Other args. + torch_dtype: The torch dtype for model inference. + cache_dir: Directory to cache the models. 
""" model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR - self.model_id: str = model_id + self.model_id = model_id + self.model_revision = model_revision self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - logger.warning(f'Device: {self.device}') - - torch_dtype = torch_dtype if torch_dtype is not None else 'auto' - - model_cfg: dict = dict() - model_cfg['model_id'] = model_id - model_cfg['device_map'] = device_map - model_cfg['torch_dtype'] = str(torch_dtype) + logger.info(f'Device: {self.device}') - tokenizer = AutoTokenizer.from_pretrained( - self.model_id, # self.model_id + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_id, revision=model_revision, trust_remote_code=True, cache_dir=model_cache_dir, ) - model = AutoModelForCausalLM.from_pretrained( - self.model_id, # self.model_id + self.model = AutoModelForCausalLM.from_pretrained( + self.model_id, revision=model_revision, device_map=device_map, trust_remote_code=True, @@ -99,9 +59,54 @@ def __init__(self, cache_dir=model_cache_dir, ) - super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg) + self.model_cfg = { + 'model_id': model_id, + 'device_map': device_map, + 'torch_dtype': str(torch_dtype), + } + + +class BaseModelAdapter(ABC): + """ + Base class for model adapter. + """ + + def __init__(self, model: Union[LocalModel, CustomModel], **kwargs): + """ + Args: + model: The model instance which is compatible with + AutoModel/AutoModelForCausalLM/AutoModelForSeq2SeqLM of transformers. + """ + if isinstance(model, LocalModel): + self.model = model.model + self.model_id = model.model_id + self.model_revision = model.model_revision + self.device = model.device + self.tokenizer = model.tokenizer + self.model_cfg = model.model_cfg + elif isinstance(model, CustomModel): + pass + else: + raise ValueError(f'Unsupported model type: {type(model)}') + + @abstractmethod + @torch.no_grad() + def predict(self, *args, **kwargs) -> Any: + """ + Model prediction func. + """ + raise NotImplementedError + - self._max_length = max_length +class MultiChoiceModelAdapter(BaseModelAdapter): + """ The multi-choice model adapter. """ + + _DEFAULT_MAX_LENGTH = 2048 + + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) + + self._max_length = kwargs.get('max_length') @property def max_length(self): @@ -198,32 +203,12 @@ def _get_logits(tokenizer, model, inputs: List[str]): class ContinuationLogitsModelAdapter(MultiChoiceModelAdapter): + """ + Continuation-logits model adapter. + """ - def __init__(self, - model_id: str, - device_map: str = 'auto', - torch_dtype: dtype = torch.bfloat16, - model_revision: str = None, - cache_dir: str = None, - **kwargs): - """ - Continuation-logits model adapter. - - Args: - model_id: The model id on ModelScope, or local model_dir. - device_map: The device map for model inference. - torch_dtype: The torch dtype for model inference. Default: torch.bfloat16. - model_revision: The model revision on ModelScope. Default: None. - **kwargs: Other args. - """ - - super().__init__( - model_id=model_id, - device_map=device_map, - torch_dtype=torch_dtype, - model_revision=model_revision, - cache_dir=cache_dir, - **kwargs) + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model, **kwargs) @torch.no_grad() def predict(self, inputs: dict, infer_cfg: dict = None) -> dict: @@ -321,69 +306,26 @@ def _encode_pair(self, context, continuation): class ChatGenerationModelAdapter(BaseModelAdapter): + """ + Chat generation model adapter. 
+ """ - def __init__(self, - model_id: str, - model_revision: str = 'master', - device_map: str = 'auto', - torch_dtype: dtype = 'auto', - cache_dir: str = None, - **kwargs): - """ - Chat completion model adapter. Tasks of chat and generation are supported. + def __init__(self, model: LocalModel, **kwargs): + super().__init__(model) - Args: - model_id: The model id on ModelScope, or local model_dir. - model_revision: The model revision on ModelScope. Default: None. - device_map: The device map for model inference. - torch_dtype: The torch dtype for model inference. Default: 'auto'. - **kwargs: Other args. - """ + self.generation_config = self._parse_generation_config(self.tokenizer, self.model) custom_generation_config = kwargs.pop('generation_config', None) custom_chat_template = kwargs.pop('chat_template', None) - model_cache_dir = cache_dir or DEFAULT_MODEL_CACHE_DIR - - self.model_id: str = model_id - self.model_revision: str = model_revision - self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - logger.warning(f'Device: {self.device}') - - torch_dtype = torch_dtype if torch_dtype is not None else 'auto' - - model_cfg: dict = dict() - model_cfg['model_id'] = model_id - model_cfg['device_map'] = device_map - model_cfg['torch_dtype'] = str(torch_dtype) - - tokenizer = AutoTokenizer.from_pretrained( - self.model_id, - revision=model_revision, - trust_remote_code=True, - cache_dir=model_cache_dir, - ) - - model = AutoModelForCausalLM.from_pretrained( - self.model_id, - revision=model_revision, - device_map=device_map, - trust_remote_code=True, - torch_dtype=torch_dtype, - cache_dir=model_cache_dir, - ) - - self.generation_config = self._parse_generation_config(tokenizer, model) if custom_generation_config: logger.info('Updating generation config ...') self.generation_config.update(**custom_generation_config) if custom_chat_template: - tokenizer.chat_template = custom_chat_template + self.tokenizer.chat_template = custom_chat_template logger.info(f'Using custom chat template: {custom_chat_template}') - super().__init__(model=model, tokenizer=tokenizer, model_cfg=model_cfg) - def _parse_generation_config(self, tokenizer, model): generation_config = getattr(model, 'generation_config', GenerationConfig(do_sample=False)) @@ -473,7 +415,7 @@ def __init__(self, custom_model: CustomModel, **kwargs): **kwargs: Other args. """ self.custom_model = custom_model - super(CustomModelAdapter, self).__init__(model=None, tokenizer=None, model_cfg=custom_model.config) + super(CustomModelAdapter, self).__init__(model=custom_model) def predict(self, inputs: Union[str, dict, list], **kwargs) -> List[Dict[str, Any]]: """ diff --git a/evalscope/models/openai_model.py b/evalscope/models/openai_model.py deleted file mode 100644 index 3caa9c4b..00000000 --- a/evalscope/models/openai_model.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. - -import openai -import os -import time - -from evalscope.models import ChatBaseModel -from evalscope.utils.logger import get_logger - -logger = get_logger() - - -class OpenAIModel(ChatBaseModel): - """ - APIs of OpenAI models. 
- Available models: gpt-3.5-turbo, gpt-4 - """ - - MAX_RETRIES = 3 - - def __init__(self, model_cfg: dict, **kwargs): - super(OpenAIModel, self).__init__(model_cfg=model_cfg, **kwargs) - - openai_api_key = os.environ.get('OPENAI_API_KEY', None) - self.api_key = self.model_cfg.get('api_key', openai_api_key) - - if not self.api_key: - logger.error('OpenAI API key is not provided, please set it in environment variable OPENAI_API_KEY') - # raise ValueError( - # 'OpenAI API key is not provided, ' - # 'please set it in environment variable OPENAI_API_KEY') - - def predict(self, model_id: str, inputs: dict, **kwargs) -> dict: - - sys_prompt: str = inputs.get('sys_prompt', '') - user_prompt: str = inputs.get('user_prompt', '') - - # model_id: str = kwargs.get('model_id', '') - temperature: float = kwargs.pop('temperature', 0.2) - max_tokens: int = kwargs.pop('max_tokens', 1024) - mode: str = kwargs.pop('mode', 'chat.completion') - - logger.info(f'Using OpenAI model_id: {model_id}') - - res = self._predict( - model_id=model_id, - sys_prompt=sys_prompt, - user_prompt=user_prompt, - temperature=temperature, - max_tokens=max_tokens, - mode=mode) - - return res - - def _predict( - self, - model_id, - sys_prompt, - user_prompt, - temperature, - max_tokens, - mode: str = 'chat.completion', - ) -> dict: - - res = {} - openai.api_key = self.api_key - - for i in range(self.MAX_RETRIES): - try: - if mode == 'chat.completion': - resp = openai.ChatCompletion.create( - model=model_id, - messages=[{ - 'role': 'system', - 'content': sys_prompt - }, { - 'role': 'user', - 'content': user_prompt - }], - temperature=temperature, - max_tokens=max_tokens) - - if resp: - ans_text = resp['choices'][0]['message']['content'] - model_id = resp['model'] - else: - logger.warning(f'OpenAI GPT API call failed: got empty response ' - f'for input {sys_prompt} {user_prompt}') - ans_text = '' - model_id = '' - - res['ans_text'] = ans_text - res['model_id'] = model_id - else: - raise ValueError(f'Invalid mode: {mode}') - - return res - - except Exception as e: - logger.warning(f'OpenAI API call failed: {e}') - time.sleep(3) - logger.error(f'OpenAI API call failed after {self.MAX_RETRIES} retries') - return res diff --git a/evalscope/run.py b/evalscope/run.py index 3b531732..e7473293 100644 --- a/evalscope/run.py +++ b/evalscope/run.py @@ -13,7 +13,7 @@ from evalscope.config import TaskConfig, parse_task_config from evalscope.constants import DEFAULT_MODEL_REVISION, DEFAULT_WORK_DIR, EvalBackend, EvalType from evalscope.evaluator import Evaluator -from evalscope.models.custom import CustomModel +from evalscope.models import CustomModel, LocalModel from evalscope.utils import seed_everything from evalscope.utils.io_utils import OutputsStructure, are_paths_same from evalscope.utils.logger import configure_logging, get_logger @@ -99,22 +99,21 @@ def evaluate_model(task_cfg: TaskConfig, outputs: OutputsStructure) -> dict: """Evaluate the model based on the provided task configuration.""" # Initialize evaluator eval_results = {} - + base_model = get_base_model(task_cfg) for dataset_name in task_cfg.datasets: - evaluator = create_evaluator(task_cfg, dataset_name, outputs) + evaluator = create_evaluator(task_cfg, dataset_name, outputs, base_model) res_dict = evaluator.eval(infer_cfg=task_cfg.generation_config, debug=task_cfg.debug, limit=task_cfg.limit) eval_results[dataset_name] = res_dict return eval_results -def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsStructure): +def create_evaluator(task_cfg: 
TaskConfig, dataset_name: str, outputs: OutputsStructure, base_model: LocalModel): """Create an evaluator object for the specified dataset.""" - # imported_modules = import_module_util(BENCHMARK_PATH_PREFIX, dataset_name, MEMBERS_TO_IMPORT) benchmark: BenchmarkMeta = Benchmark.get(dataset_name) data_adapter = benchmark.get_data_adapter(config=task_cfg.dataset_args) - model_adapter = initialize_model_adapter(task_cfg, model_adapter_cls=benchmark.model_adapter) + model_adapter = initialize_model_adapter(task_cfg, benchmark.model_adapter, base_model) return Evaluator( dataset_name_or_path=benchmark.dataset_id, @@ -131,10 +130,31 @@ def create_evaluator(task_cfg: TaskConfig, dataset_name: str, outputs: OutputsSt ) -def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls): +def get_base_model(task_cfg: TaskConfig) -> Optional[LocalModel]: + """Get the base model for the task.""" + if task_cfg.eval_type != EvalType.CHECKPOINT: + return None + else: + device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None + cache_dir = task_cfg.model_args.get('cache_dir', None) + model_precision = task_cfg.model_args.get('precision', torch.float16) + model_revision = task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION) + if isinstance(model_precision, str) and model_precision != 'auto': + model_precision = eval(model_precision) + + base_model = LocalModel( + model_id=task_cfg.model, + model_revision=model_revision, + device_map=device_map, + torch_dtype=model_precision, + cache_dir=cache_dir) + return base_model + + +def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls, base_model: LocalModel): """Initialize the model adapter based on the task configuration.""" if task_cfg.dry_run: - from evalscope.models.dummy_chat_model import DummyChatModel + from evalscope.models.model import DummyChatModel return DummyChatModel(model_cfg=dict()) elif task_cfg.eval_type == EvalType.CUSTOM: if not isinstance(task_cfg.model, CustomModel): @@ -142,15 +162,8 @@ def initialize_model_adapter(task_cfg: TaskConfig, model_adapter_cls): from evalscope.models.model_adapter import CustomModelAdapter return CustomModelAdapter(custom_model=task_cfg.model) else: - device_map = task_cfg.model_args.get('device_map', 'auto') if torch.cuda.is_available() else None - model_precision = task_cfg.model_args.get('precision', torch.float16) - if isinstance(model_precision, str) and model_precision != 'auto': - model_precision = eval(model_precision) return model_adapter_cls( - model_id=task_cfg.model, - model_revision=task_cfg.model_args.get('revision', DEFAULT_MODEL_REVISION), - device_map=device_map, - torch_dtype=model_precision, + model=base_model or get_base_model(task_cfg), generation_config=task_cfg.generation_config, chat_template=task_cfg.chat_template) diff --git a/evalscope/models/api/openai_api.py b/evalscope/third_party/longbench_write/tools/openai_api.py similarity index 100% rename from evalscope/models/api/openai_api.py rename to evalscope/third_party/longbench_write/tools/openai_api.py diff --git a/tests/cli/test_run.py b/tests/cli/test_run.py index caaac0b2..9370e2bb 100644 --- a/tests/cli/test_run.py +++ b/tests/cli/test_run.py @@ -70,7 +70,7 @@ def test_run_eval_with_args(self): @unittest.skipUnless(0 in test_level_list(), 'skip test in current test level') def test_run_task(self): - task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': ['gsm8k', 'arc'], 'limit': 2, 'debug': True} + task_cfg = {'model': 'qwen/Qwen2-0.5B-Instruct', 'datasets': 
['hellaswag', 'gsm8k', 'arc'], 'limit': 2, 'debug': True} run_task(task_cfg=task_cfg)
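
Example: the new `@Benchmark.register` wiring. The patch replaces the per-package constants (`DATASET_ID`, `SUBSET_LIST`, and the `DataAdapterClass`/`ModelAdapterClass` aliases in `__init__.py`) with a single `@Benchmark.register` decorator that also names the model adapter class (`ChatGenerationModelAdapter` for GSM8K, `ContinuationLogitsModelAdapter` for HellaSwag). Below is a minimal sketch of registering a hypothetical new benchmark in the same style; the benchmark name, dataset id, and method bodies are placeholders, not a real dataset.

```python
# Hypothetical registration sketch following the pattern introduced in this patch.
# 'my_bench' / 'my-org/my-bench' are placeholders.
from evalscope.benchmarks import Benchmark, DataAdapter
from evalscope.metrics import WeightedAverageAccuracy, exact_match
from evalscope.models import ChatGenerationModelAdapter


@Benchmark.register(
    name='my_bench',
    dataset_id='my-org/my-bench',
    model_adapter=ChatGenerationModelAdapter,  # how samples are run against the model
    subset_list=['default'],
    metric_list=[WeightedAverageAccuracy],
    few_shot_num=0,
    train_split='train',
    eval_split='test',
    prompt_template='',
)
class MyBenchAdapter(DataAdapter):

    def __init__(self, **kwargs):
        # Defaults now come from the decorator; callers can still override via kwargs.
        super().__init__(**kwargs)

    def gen_prompt(self, input_d: dict, subset_name: str, few_shot_list: list, **kwargs) -> dict:
        # Return shape is an assumption here; see the gsm8k/hellaswag adapters for the real format.
        return {'data': [input_d['question']]}

    def get_gold_answer(self, input_d: dict) -> str:
        return input_d['answer']

    def parse_pred_result(self, result, raw_input_d: dict = None, eval_type: str = 'checkpoint') -> str:
        return str(result)

    def match(self, gold: str, pred: str) -> float:
        return exact_match(gold=str(gold), pred=str(pred))
```

Note that `compute_metric` and `gen_report` no longer need to be overridden per benchmark: the HellaSwag adapter deletes them and relies on the base `DataAdapter` implementation.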
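
Example: what `ContinuationLogitsModelAdapter` computes. HellaSwag is now registered with `ContinuationLogitsModelAdapter`, i.e. the model never generates text; each candidate ending is scored by the log-likelihood of its tokens conditioned on the context, and the index of the best-scoring ending is the prediction. The snippet below is a generic illustration of that scoring with plain `transformers`, not the adapter's actual implementation (which batches requests and encodes the context/continuation pair via `_encode_pair`).

```python
# Generic continuation log-likelihood scoring (illustration only).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


def continuation_logprob(model, tokenizer, context: str, continuation: str) -> float:
    """Sum of log-probabilities of the continuation tokens given the context."""
    ctx_ids = tokenizer(context, return_tensors='pt').input_ids
    full_ids = tokenizer(context + continuation, return_tensors='pt').input_ids
    with torch.no_grad():
        logits = model(full_ids).logits              # [1, seq_len, vocab]
    cont_len = full_ids.shape[1] - ctx_ids.shape[1]
    # Logits at position i predict token i + 1, hence the shift by one.
    cont_logits = torch.log_softmax(logits[0, -cont_len - 1:-1, :], dim=-1)
    cont_ids = full_ids[0, -cont_len:]
    return cont_logits.gather(1, cont_ids.unsqueeze(-1)).sum().item()


tokenizer = AutoTokenizer.from_pretrained('gpt2')    # small stand-in model
model = AutoModelForCausalLM.from_pretrained('gpt2')

context = 'A man is sitting on a roof. He'
endings = [' starts pulling up roofing on a roof.', ' is ripping level tiles off.']
scores = [continuation_logprob(model, tokenizer, context, e) for e in endings]
prediction = int(torch.tensor(scores).argmax())      # best ending index, like the '0'-'3' choices
```

Re-tokenizing `context + continuation` in one pass is a simplification; tokens can merge differently at the boundary, which is exactly why the adapter encodes the pair explicitly.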
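
Example: the new `LocalModel` split. Checkpoint loading (tokenizer plus `AutoModelForCausalLM`) now lives in `LocalModel`, and the local adapters take a ready model instead of a `model_id`; `run.get_base_model` builds it once per task and `initialize_model_adapter` reuses it for every dataset. A sketch of constructing the adapters directly, assuming both adapter classes are exported from `evalscope.models` as the patch's imports suggest:

```python
import torch

from evalscope.models import ChatGenerationModelAdapter, ContinuationLogitsModelAdapter, LocalModel

# Load the checkpoint once (this is what run.get_base_model now does per task).
base_model = LocalModel(
    model_id='qwen/Qwen2-0.5B-Instruct',   # ModelScope id or a local model dir
    model_revision='master',
    device_map='auto',
    torch_dtype=torch.float16,
)

# Different benchmarks can wrap the same weights/tokenizer, so a run over
# several datasets no longer reloads the model for each one.
chat_adapter = ChatGenerationModelAdapter(model=base_model, generation_config={'max_new_tokens': 512})
loglik_adapter = ContinuationLogitsModelAdapter(model=base_model)
```

Keeping model construction out of the adapters is the design point of the refactor: adapter classes now only decide *how* to query the model, while `LocalModel` owns *what* is loaded and where it lives.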
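
Example: calling the relocated `OpenAIModel`. The class moves verbatim from `evalscope/models/openai_model.py` into `evalscope/models/model.py`, and `auto_reviewer` now imports it from there. Usage is unchanged; the sketch below follows the `predict` signature shown in the patch and assumes a minimal `model_cfg` containing only `api_key` is acceptable to `ChatBaseModel`.

```python
import os

from evalscope.models.model import OpenAIModel

# The API key comes from model_cfg['api_key'] or the OPENAI_API_KEY environment variable.
judge = OpenAIModel(model_cfg={'api_key': os.environ.get('OPENAI_API_KEY')})

result = judge.predict(
    model_id='gpt-4',
    inputs={
        'sys_prompt': 'You are a strict grader.',
        'user_prompt': 'Rate the answer "4" to the question "What is 2 + 2?" on a 1-10 scale.',
    },
    temperature=0.2,
    max_tokens=64,
)
print(result.get('ans_text'), result.get('model_id'))
```

Note that `_predict` still calls the pre-1.0 `openai.ChatCompletion.create` interface, so this code path assumes `openai<1.0`; the class was moved, not modernized, in this patch.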
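
Example: an end-to-end run with the new wiring, mirroring the updated `tests/cli/test_run.py` (import path assumed from `evalscope/run.py`). With each benchmark's `model_adapter` declared in `@Benchmark.register` and the shared base model from `get_base_model`, one task config exercises both the continuation-logits adapter (hellaswag) and the generation adapter (gsm8k) against a single loaded checkpoint.

```python
from evalscope.run import run_task

task_cfg = {
    'model': 'qwen/Qwen2-0.5B-Instruct',          # loaded once via LocalModel
    'datasets': ['hellaswag', 'gsm8k', 'arc'],    # logits-based and generation-based benchmarks
    'limit': 2,                                   # evaluate only a couple of samples per subset
    'debug': True,
}
run_task(task_cfg=task_cfg)
```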