From 077667c34353d4c330e26e6fe067c975e5078f10 Mon Sep 17 00:00:00 2001
From: Lev Kurilenko <113481193+lekurile@users.noreply.github.com>
Date: Thu, 22 Feb 2024 14:55:40 -0800
Subject: [PATCH] Inference V2 Human Eval (#4804)

This PR adds a Human Eval CI workflow and associated unit test for
Inference V2.

---------

Co-authored-by: Arash Bakhtiari
Co-authored-by: Michael Wyatt
Co-authored-by: Masahiro Tanaka
Co-authored-by: Masahiro Tanaka <81312776+tohtana@users.noreply.github.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 .github/workflows/nv-human-eval.yml     | 53 ++++++++++++++++++
 tests/pytest.ini                        |  7 ++-
 tests/unit/inference/test_human_eval.py | 73 +++++++++++++++++++++++++
 3 files changed, 130 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/nv-human-eval.yml
 create mode 100644 tests/unit/inference/test_human_eval.py

diff --git a/.github/workflows/nv-human-eval.yml b/.github/workflows/nv-human-eval.yml
new file mode 100644
index 000000000000..82f7e9beacca
--- /dev/null
+++ b/.github/workflows/nv-human-eval.yml
@@ -0,0 +1,53 @@
+name: nv-human-eval
+
+on:
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  unit-tests:
+    runs-on: [self-hosted, nvidia, a6000]
+    container:
+      image: nvcr.io/nvidia/pytorch:23.03-py3
+      ports:
+        - 80
+      options: --gpus all --shm-size "8G"
+
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Check container state
+        run: |
+          ldd --version
+          nvcc --version
+          nvidia-smi
+          python -c "import torch; print('torch:', torch.__version__, torch)"
+          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+      - name: Install transformers
+        run: |
+          git clone --depth=1 https://github.com/huggingface/transformers
+          cd transformers
+          git rev-parse --short HEAD
+          python -m pip install .
+      - name: Clone Human Eval
+        run: |
+          git clone --depth=1 https://github.com/openai/human-eval.git
+          sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
+          cd human-eval
+          git rev-parse --short HEAD
+          python -m pip install .
+      - name: Install deepspeed
+        run: |
+          python -m pip install .[dev,1bit,autotuning]
+          ds_report
+      - name: Python environment
+        run: |
+          python -m pip list
+      - name: Unit tests
+        run: |
+          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+          cd tests
+          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
diff --git a/tests/pytest.ini b/tests/pytest.ini
index 8d043c8b3f9d..f841c47afc0c 100644
--- a/tests/pytest.ini
+++ b/tests/pytest.ini
@@ -1,12 +1,13 @@
 [pytest]
-addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion"
+addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion and not evaluation"
 markers =
     sequential:Tests that need to be run sequentially
     inference:Inference model tests
     inference_ops:Individual inference operator tests
-    inference_v2: Inference tests for the v2 stack
-    inference_v2_ops: Op tests for the v2 stack
+    inference_v2:Inference tests for the v2 stack
+    inference_v2_ops:Op tests for the v2 stack
     seq_inference:Inference model tests to run sequentially
     nightly:Tests that should be run nightly
     world_size:Change world size of individual tests in a class
     stable_diffusion:Tests that run Stable Diffusion
+    evaluation:Tests that evaluate model correctness
diff --git a/tests/unit/inference/test_human_eval.py b/tests/unit/inference/test_human_eval.py
new file mode 100644
index 000000000000..2525aeb5aa0e
--- /dev/null
+++ b/tests/unit/inference/test_human_eval.py
@@ -0,0 +1,73 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+import pytest
+import os
+import torch
+from deepspeed.accelerator import get_accelerator
+
+
+@pytest.mark.evaluation
+@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"])
+def test_human_eval(model_name):
+    import mii
+    import numpy
+    from transformers import pipeline
+    from human_eval.data import write_jsonl, read_problems
+    from human_eval.evaluation import evaluate_functional_correctness
+
+    def generate_base_completion(pipe, problem_prompt: str) -> str:
+        return pipe(problem_prompt, do_sample=True)[0]["generated_text"]
+
+    def generate_mii_completion(pipe, problem_prompt: str) -> str:
+        return pipe(problem_prompt, max_new_tokens=512)[0].generated_text
+
+    def generate_samples(pipe, generation_function):
+        samples = [
+            dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"]))
+            for task_id in problems for _ in range(num_samples_per_task)
+        ]
+        return samples
+
+    # Loading Problems
+    problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz")
+    num_samples_per_task = 20
+
+    # Initializing HuggingFace Pipeline
+    local_rank = os.getenv("LOCAL_RANK", "0")
+    device = torch.device(get_accelerator().device_name(local_rank))
+    base_pipe = pipeline(model=model_name,
+                         device=torch.device(get_accelerator().device_name(local_rank)),
+                         max_length=512,
+                         return_full_text=False)
+
+    # Generating Base Samples
+    base_samples = generate_samples(base_pipe, generate_base_completion)
+
+    # Base Pipeline Teardown
+    del base_pipe
+    get_accelerator().empty_cache()
+
+    # Initializing DeepSpeed-MII Pipeline
+    mii_pipe = mii.pipeline(model_name)
+
+    # Generating MII Samples
+    mii_samples = generate_samples(mii_pipe, generate_mii_completion)
+
+    # MII Pipeline Teardown
+    mii_pipe.destroy()
+
+    # Writing Samples
+    write_jsonl("base_samples.jsonl", base_samples)
+    write_jsonl("mii_samples.jsonl", mii_samples)
+
+    # Evaluating Samples
+    base_results = evaluate_functional_correctness("base_samples.jsonl")
+    mii_results = evaluate_functional_correctness("mii_samples.jsonl")
+
+    # Executing Assertions
+    for key in base_results.keys():
+        assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
+            f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."
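
For context on what the final assertion compares: `evaluate_functional_correctness` runs each generated completion against the HumanEval unit tests and, as far as I can tell from the human-eval package, returns a dictionary of pass@k scores estimated from the samples written to the jsonl file. A rough paraphrase of the estimator it uses internally (see `human_eval/evaluation.py`); this sketch is for illustration only and is not code from this PR:

```python
# Paraphrase of the pass@k estimator used by the human-eval package.
# n = completions sampled per task, c = completions that pass the task's
# unit tests, k = sampling budget being scored.
import numpy as np


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased estimate of P(at least one of k sampled completions passes)."""
    if n - c < k:
        return 1.0  # fewer than k failing samples, so every size-k draw contains a pass
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))


# Example: 20 samples per task (as in the test above), 6 of which pass.
print(pass_at_k(20, 6, 1), pass_at_k(20, 6, 10))
```

With 20 samples per task, only the pass@k values whose k does not exceed the sample count should be populated in the returned dictionary, so pass@1 and pass@10 are the scores the test ends up comparing.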
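The assertion loop treats the HuggingFace pipeline's scores as the reference and accepts the DeepSpeed-MII scores if each agrees within 10% relative tolerance. A toy illustration of that tolerance with made-up numbers (not measured results):

```python
# Toy illustration of the test's tolerance check, using hypothetical pass@k values.
import numpy

base_results = {"pass@1": 0.30, "pass@10": 0.55}  # hypothetical HF baseline scores
mii_results = {"pass@1": 0.28, "pass@10": 0.57}   # hypothetical DeepSpeed-MII scores

for key in base_results:
    # numpy.allclose(a, b, rtol=0.10) passes when |a - b| <= atol + 0.10 * |b|
    assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
        f"{key} differs by more than 10%: {base_results[key]} vs {mii_results[key]}"
```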
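Because `evaluation` is appended to the `not ...` expression in `tests/pytest.ini`, the new test is excluded from default pytest runs and only executes when the marker is selected explicitly, which is what the workflow's "Unit tests" step does with `-m 'evaluation'`. A minimal sketch of an equivalent local invocation, assuming the same setup the workflow performs (transformers, deepspeed, and deepspeed-mii installed; the human-eval repository cloned where the test's relative data path expects it, with its `exec(check_program, exec_globals)` line un-commented as the "Clone Human Eval" step's sed does, since upstream ships it disabled as a safety precaution; access to the CodeLlama-7b-Python checkpoint; and a working directory of the repository's `tests/` folder):

```python
# Local-run sketch (not part of this PR). Assumes the dependencies and paths
# described above and that this is executed from the repository's tests/ directory.
import pytest

# Opt in to the 'evaluation' marker that tests/pytest.ini now excludes by default;
# the command-line -m overrides the marker expression set in addopts.
exit_code = pytest.main(["-m", "evaluation", "-k", "test_human_eval", "unit/"])
```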