From ba1b898f095a859501eb0c6d4360e1b80e358bf9 Mon Sep 17 00:00:00 2001
From: "Yao, Qing"
Date: Mon, 2 Sep 2024 15:45:58 +0800
Subject: [PATCH] Support bigcode eval for codegen v0.1

Signed-off-by: Yao, Qing
---
 .../stresscli/locust/codegenbench.py          |   2 +-
 .../bigcode_evaluation_harness/accuracy.py    |   8 ++
 .../api_evaluator.py                          | 100 ++++++++++++++++++
 .../bigcode_evaluation_harness/arguments.py   |   5 +
 4 files changed, 114 insertions(+), 1 deletion(-)
 create mode 100644 evals/evaluation/bigcode_evaluation_harness/api_evaluator.py

diff --git a/evals/benchmark/stresscli/locust/codegenbench.py b/evals/benchmark/stresscli/locust/codegenbench.py
index bdf1fefa..3a34b57c 100644
--- a/evals/benchmark/stresscli/locust/codegenbench.py
+++ b/evals/benchmark/stresscli/locust/codegenbench.py
@@ -20,7 +20,7 @@
 
 
 def getUrl():
-    return "/v1/chatqna"
+    return "/v1/codegen"
 
 
 def getReqData():
diff --git a/evals/evaluation/bigcode_evaluation_harness/accuracy.py b/evals/evaluation/bigcode_evaluation_harness/accuracy.py
index 31132726..023f1b3f 100644
--- a/evals/evaluation/bigcode_evaluation_harness/accuracy.py
+++ b/evals/evaluation/bigcode_evaluation_harness/accuracy.py
@@ -21,6 +21,7 @@
 from bigcode_eval.evaluator import Evaluator
 from bigcode_eval.tasks import ALL_TASKS
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
+from evals.evaluation.bigcode_evaluation_harness.api_evaluator import APIEvaluator
 
 
 def pattern_match(patterns, source_list):
@@ -68,6 +69,13 @@ def evaluate(args):
         evaluator = Evaluator(accelerator, None, None, args)
         for task in task_names:
             results[task] = evaluator.evaluate(task)
+    elif args.codegen_url:
+        # here we generate code using an OPEA codegen API
+        if accelerator.is_main_process:
+            print("OPEA codegen API generation mode")
+        evaluator = APIEvaluator(accelerator, args.model, None, args)
+        for task in task_names:
+            results[task] = evaluator.evaluate(task)
     else:
         # here we generate code and save it (evaluation is optional but True by default)
         dict_precisions = {
diff --git a/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
new file mode 100644
index 00000000..7650482d
--- /dev/null
+++ b/evals/evaluation/bigcode_evaluation_harness/api_evaluator.py
@@ -0,0 +1,100 @@
+import inspect
+import json
+import warnings
+
+import aiohttp
+
+from bigcode_eval import tasks
+from bigcode_eval.evaluator import Evaluator
+
+
+class APIEvaluator(Evaluator):
+    def generate_text(self, task_name, intermediate_generations=None):
+        task = tasks.get_task(task_name, self.args)
+        dataset = task.get_dataset()
+        # if args.limit is None, use all samples
+        # if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
+        n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
+        print(n_tasks)
+        # when args.limit is None
+        # adjust n_tasks by args.limit_start to prevent out of bounds issues
+        if not self.args.limit:
+            n_tasks -= self.args.limit_start
+        references = [task.get_reference(dataset[i]) for i in
+                      range(self.args.limit_start, self.args.limit_start + n_tasks)]
+
+        if self.args.check_references:
+            if "get_solution" in inspect.signature(task.get_reference).parameters:
+                solutions = [[task.get_reference(dataset[i], get_solution=True)] for i in
+                             range(self.args.limit_start, self.args.limit_start + n_tasks)]
+            else:
+                solutions = [[ref] for ref in references]
+            return solutions, references
+
+        if intermediate_generations:
+            curr_generations = [gen for gen in intermediate_generations if gen]
+            n_tasks -= len(curr_generations)
+
+        generations = parallel_generations_by_api(
+            task,
+            dataset,
+            self.accelerator,
+            n_tasks=n_tasks,
+            args=self.args,
+
+        )
+
+        if len(generations[0]) > self.args.n_samples:
+            generations = [l[: self.args.n_samples] for l in generations]
+            warnings.warn(
+                f"Number of tasks wasn't proportional to number of devices, we removed extra predictions to only keep nsamples={self.args.n_samples}"
+            )
+        return generations, references
+
+
+def parallel_generations_by_api(
+    task,
+    dataset,
+    accelerator,
+    n_tasks,
+    args,
+):
+    if args.load_generations_path:
+        # load generated code
+        with open(args.load_generations_path) as fp:
+            generations = json.load(fp)
+            if accelerator.is_main_process:
+                print(
+                    f"generations loaded, {n_tasks} selected from {len(generations)} with {len(generations[0])} candidates"
+                )
+        return generations[:n_tasks]
+
+    if codegen_url := args.codegen_url:
+        assert "/codegen" in codegen_url, "Only OPEA codegen compatible APIs are supported"
+        import asyncio
+        import os
+        from tqdm.asyncio import tqdm
+        import requests
+        async def get_res(prompt):
+            headers = {"Content-Type": "application/json"}
+            data = {
+                "messages": prompt,
+                "max_tokens": 2048,
+                "stream": False,
+                "temperature": args.temperature,
+                "top_p": args.top_p,
+                "top_k": args.top_k,
+            }
+            async with aiohttp.ClientSession() as session:
+                async with session.post(codegen_url, json=data, headers=headers, timeout=600) as response:
+                    text = await response.text()
+            return text
+
+        prompts = [task.get_prompt(doc) for doc in dataset]
+        awaitables = [get_res(prompt=prompt) for prompt in prompts]
+        responses = asyncio.run(tqdm.gather(*awaitables))
+        generations = []
+        for i, (prompt, response) in enumerate(zip(prompts, responses)):
+            texts = [prompt + choice["message"]["content"] for choice in json.loads(response)['choices']]
+            generations.append([task.postprocess_generation(text, i) for text in texts])
+        return generations
diff --git a/evals/evaluation/bigcode_evaluation_harness/arguments.py b/evals/evaluation/bigcode_evaluation_harness/arguments.py
index cec695b8..b2303a47 100644
--- a/evals/evaluation/bigcode_evaluation_harness/arguments.py
+++ b/evals/evaluation/bigcode_evaluation_harness/arguments.py
@@ -204,6 +204,11 @@ def setup_parser():
         action="store_true",
         help="Don't run generation but benchmark groundtruth (useful for debugging)",
     )
+    parser.add_argument(
+        "--codegen_url",
+        default=None,
+        help="Base URL of the OPEA CodeGen API.",
+    )
     return parser.parse_args()
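
For reference, below is a minimal standalone sketch of the per-prompt request that the new api_evaluator.py issues against an OPEA CodeGen endpoint. The endpoint address, the example prompt, and the concrete sampling values are illustrative assumptions rather than part of this patch; the payload fields and the response handling mirror get_res() above.

# Hypothetical standalone client mirroring get_res() in api_evaluator.py.
# CODEGEN_URL and the prompt are placeholders for a real OPEA CodeGen deployment.
import asyncio
import json

import aiohttp

CODEGEN_URL = "http://localhost:7778/v1/codegen"  # assumed host and port


async def generate(prompt: str) -> str:
    headers = {"Content-Type": "application/json"}
    data = {
        "messages": prompt,      # same payload shape the evaluator sends
        "max_tokens": 2048,
        "stream": False,
        "temperature": 0.2,      # illustrative sampling parameters
        "top_p": 0.95,
        "top_k": 40,
    }
    async with aiohttp.ClientSession() as session:
        async with session.post(
            CODEGEN_URL, json=data, headers=headers, timeout=aiohttp.ClientTimeout(total=600)
        ) as response:
            body = json.loads(await response.text())
    # As in the evaluator, prepend the prompt to the returned completion.
    return prompt + body["choices"][0]["message"]["content"]


if __name__ == "__main__":
    print(asyncio.run(generate("def fibonacci(n):\n    ")))

Because parallel_generations_by_api fires one such request per dataset prompt concurrently via tqdm.gather, the CodeGen service should be provisioned for that parallel load.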