Inference V2 Human Eval (microsoft#4804)
This PR adds a Human Eval CI workflow and associated unit test for Inference V2.

Co-authored-by: Arash Bakhtiari <[email protected]>
Co-authored-by: Michael Wyatt <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
Showing 3 changed files with 130 additions and 3 deletions.
@@ -0,0 +1,53 @@

name: nv-human-eval

on:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:23.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v3

      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
      - name: Install transformers
        run: |
          git clone --depth=1 https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .
      - name: Clone Human Eval
        run: |
          git clone --depth=1 https://github.com/openai/human-eval.git
          sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
          cd human-eval
          git rev-parse --short HEAD
          python -m pip install .
      - name: Install deepspeed
        run: |
          python -m pip install .[dev,1bit,autotuning]
          ds_report
      - name: Python environment
        run: |
          python -m pip list
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
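Note that the sed line in the "Clone Human Eval" step re-enables program execution in human-eval's execution.py, which upstream ships commented out behind a safety warning; until it is re-enabled, evaluate_functional_correctness cannot actually run generated completions. The following is a minimal smoke-test sketch, not part of this PR, for checking the patched install; the file names one_problem.jsonl and smoke_samples.jsonl are illustrative.

# Smoke-test sketch for a patched human-eval install (illustrative file names).
from human_eval.data import read_problems, write_jsonl
from human_eval.evaluation import evaluate_functional_correctness

problems = read_problems()  # defaults to the bundled HumanEval.jsonl.gz
task_id = "HumanEval/0"

# Restrict scoring to a single problem and submit its own canonical solution.
write_jsonl("one_problem.jsonl", [problems[task_id]])
write_jsonl("smoke_samples.jsonl",
            [dict(task_id=task_id, completion=problems[task_id]["canonical_solution"])])

results = evaluate_functional_correctness("smoke_samples.jsonl",
                                          k=[1],
                                          problem_file="one_problem.jsonl")
print(results)  # expect {"pass@1": 1.0} once execution is re-enabled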
@@ -1,12 +1,13 @@

 [pytest]
-addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion"
+addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion and not evaluation"
 markers =
     sequential:Tests that need to be run sequentially
     inference:Inference model tests
     inference_ops:Individual inference operator tests
-    inference_v2: Inference tests for the v2 stack
-    inference_v2_ops: Op tests for the v2 stack
+    inference_v2:Inference tests for the v2 stack
+    inference_v2_ops:Op tests for the v2 stack
     seq_inference:Inference model tests to run sequentially
     nightly:Tests that should be run nightly
     world_size:Change world size of individual tests in a class
     stable_diffusion:Tests that run Stable Diffusion
+    evaluation:Tests that evaluate model correctness
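Because the new marker is also added to the "not ..." expression in addopts, evaluation tests are deselected by a plain pytest run and only collected when the marker expression is overridden on the command line (the last -m value wins, which is how the workflow's -m 'evaluation' selects them). A minimal illustrative test, not from this PR:

import pytest

@pytest.mark.evaluation
def test_marked_as_evaluation():
    # Deselected by a default `pytest` invocation via addopts; collected when
    # the command line passes `-m 'evaluation'`, as the workflow above does.
    assert True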
@@ -0,0 +1,73 @@

# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import os
import torch
from deepspeed.accelerator import get_accelerator


@pytest.mark.evaluation
@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"])
def test_human_eval(model_name):
    import mii
    import numpy
    from transformers import pipeline
    from human_eval.data import write_jsonl, read_problems
    from human_eval.evaluation import evaluate_functional_correctness

    def generate_base_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, do_sample=True)[0]["generated_text"]

    def generate_mii_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, max_new_tokens=512)[0].generated_text

    def generate_samples(pipe, generation_function):
        samples = [
            dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"]))
            for task_id in problems for _ in range(num_samples_per_task)
        ]
        return samples

    # Loading Problems
    problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz")
    num_samples_per_task = 20

    # Initializing HuggingFace Pipeline
    local_rank = os.getenv("LOCAL_RANK", "0")
    device = torch.device(get_accelerator().device_name(local_rank))
    base_pipe = pipeline(model=model_name,
                         device=device,
                         max_length=512,
                         return_full_text=False)

    # Generating Base Samples
    base_samples = generate_samples(base_pipe, generate_base_completion)

    # Base Pipeline Teardown
    del base_pipe
    get_accelerator().empty_cache()

    # Initializing DeepSpeed-MII Pipeline
    mii_pipe = mii.pipeline(model_name)

    # Generating MII Samples
    mii_samples = generate_samples(mii_pipe, generate_mii_completion)

    # MII Pipeline Teardown
    mii_pipe.destroy()

    # Writing Samples
    write_jsonl("base_samples.jsonl", base_samples)
    write_jsonl("mii_samples.jsonl", mii_samples)

    # Evaluating Samples
    base_results = evaluate_functional_correctness("base_samples.jsonl")
    mii_results = evaluate_functional_correctness("mii_samples.jsonl")

    # Executing Assertions
    for key in base_results.keys():
        assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
            f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."