diff --git a/GenAIEval/evaluation/bigcode_evaluation_harness/main.py b/GenAIEval/evaluation/bigcode_evaluation_harness/main.py
new file mode 100644
index 00000000..4e103158
--- /dev/null
+++ b/GenAIEval/evaluation/bigcode_evaluation_harness/main.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from GenAIEval.evaluation.bigcode_evaluation_harness import evaluate, setup_parser
+
+
+def main():
+    eval_args = setup_parser()
+    results = evaluate(eval_args)
+    print(results)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/main.py b/GenAIEval/evaluation/lm_evaluation_harness/main.py
similarity index 90%
rename from main.py
rename to GenAIEval/evaluation/lm_evaluation_harness/main.py
index b7a2a832..6fdd27db 100644
--- a/main.py
+++ b/GenAIEval/evaluation/lm_evaluation_harness/main.py
@@ -16,8 +16,6 @@
 # limitations under the License.
 from GenAIEval.evaluation.lm_evaluation_harness import evaluate, setup_parser
 
-# from GenAIEval.evaluation.bigcode_evaluation_harness import evaluate, setup_parser
-
 
 def main():
     eval_args = setup_parser()
diff --git a/README.md b/README.md
index 80eb676f..830274de 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ For evaluating the models on text-generation tasks, we follow the [lm-evaluation
 
 ```shell
 # pip install --upgrade-strategy eager optimum[habana]
-
+cd GenAIEval/evaluation/lm_evaluation_harness
 python main.py \
     --model gaudi-hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -28,6 +28,8 @@ python main.py \
 
 ##### CPU
 ```shell
+
+cd GenAIEval/evaluation/lm_evaluation_harness
 python main.py \
     --model hf \
     --model_args pretrained=EleutherAI/gpt-j-6B \
@@ -53,12 +55,9 @@ results = evaluate(args)
 ### bigcode-evaluation-harness
 For evaluating the models on coding tasks or specifically coding LLMs, we follow the [bigcode-evaluation-harness](https://github.com/bigcode-project/bigcode-evaluation-harness) and provide the command line usage and function call usage. [HumanEval](https://huggingface.co/datasets/openai_humaneval), [HumanEval+](https://huggingface.co/datasets/evalplus/humanevalplus), [InstructHumanEval](https://huggingface.co/datasets/codeparrot/instructhumaneval), [APPS](https://huggingface.co/datasets/codeparrot/apps), [MBPP](https://huggingface.co/datasets/mbpp), [MBPP+](https://huggingface.co/datasets/evalplus/mbppplus), and [DS-1000](https://github.com/HKUNLP/DS-1000/) for both completion (left-to-right) and insertion (FIM) mode are available.
 #### command line usage
-There is a small code change in `main.py` regarding the import path.
-```diff
-- from GenAIEval.evaluation.lm_evaluation_harness import evaluate, setup_parser
-+ from GenAIEval.evaluation.bigcode_evaluation_harness import evaluate, setup_parser
-```
+
 ```shell
+cd GenAIEval/evaluation/bigcode_evaluation_harness
 python main.py \
     --model "codeparrot/codeparrot-small" \
     --tasks "humaneval" \
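
Note: the README mentions function-call usage in addition to the command line. Below is a minimal sketch of how the new bigcode entry point could be driven programmatically, mirroring the `main.py` added in this diff. The attribute overrides on the parsed arguments (`model`, `tasks`) are assumptions inferred from the CLI flags above, not a confirmed API.

```python
# Minimal sketch (not part of this diff) of programmatic use of the
# bigcode harness, mirroring GenAIEval/evaluation/bigcode_evaluation_harness/main.py.
# Assumption: setup_parser() returns an argparse-style namespace whose
# attributes mirror the CLI flags and can be overridden before evaluation.
from GenAIEval.evaluation.bigcode_evaluation_harness import evaluate, setup_parser

eval_args = setup_parser()                        # parse command-line arguments
eval_args.model = "codeparrot/codeparrot-small"   # hypothetical override of --model
eval_args.tasks = "humaneval"                     # hypothetical override of --tasks

results = evaluate(eval_args)                     # run the selected coding tasks
print(results)
```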