Inference V2 Human Eval (microsoft#4804)
This PR adds a Human Eval CI workflow and associated unit test for
Inference V2.

---------

Co-authored-by: Arash Bakhtiari <[email protected]>
Co-authored-by: Michael Wyatt <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
6 people authored and rraminen committed May 9, 2024
1 parent b3df7bd commit 077667c
Showing 3 changed files with 130 additions and 3 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/nv-human-eval.yml
@@ -0,0 +1,53 @@
name: nv-human-eval

on:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:23.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v3

      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
      - name: Install transformers
        run: |
          git clone --depth=1 https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .
      - name: Clone Human Eval
        run: |
          git clone --depth=1 https://github.com/openai/human-eval.git
          sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
          cd human-eval
          git rev-parse --short HEAD
          python -m pip install .
      - name: Install deepspeed
        run: |
          python -m pip install .[dev,1bit,autotuning]
          ds_report
      - name: Python environment
        run: |
          python -m pip list
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
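Note on the "Clone Human Eval" step above: upstream human-eval ships with the exec(check_program, exec_globals) call commented out as a safety guard, and the sed command re-enables it so evaluate_functional_correctness can actually run generated completions. Below is a conceptual sketch of what that check reduces to; run_check is a hypothetical helper, not part of human-eval, and the real runner adds timeouts and process isolation around the exec call.

def run_check(prompt: str, completion: str, test: str, entry_point: str) -> bool:
    # HumanEval stitches the prompt, the model completion, and the unit tests
    # into one program and executes it; exec() is the line the sed command
    # uncomments in human_eval/execution.py.
    check_program = prompt + completion + "\n" + test + "\n" + f"check({entry_point})\n"
    exec_globals: dict = {}
    try:
        exec(check_program, exec_globals)
        return True
    except Exception:
        return False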
7 changes: 4 additions & 3 deletions tests/pytest.ini
@@ -1,12 +1,13 @@
 [pytest]
-addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion"
+addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion and not evaluation"
 markers =
     sequential:Tests that need to be run sequentially
     inference:Inference model tests
     inference_ops:Individual inference operator tests
-    inference_v2: Inference tests for the v2 stack
-    inference_v2_ops: Op tests for the v2 stack
+    inference_v2:Inference tests for the v2 stack
+    inference_v2_ops:Op tests for the v2 stack
     seq_inference:Inference model tests to run sequentially
     nightly:Tests that should be run nightly
     world_size:Change world size of individual tests in a class
     stable_diffusion:Tests that run Stable Diffusion
+    evaluation:Tests that evaluate model correctness
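Because the new evaluation marker is added to the addopts exclusion list, a plain pytest run in tests/ skips the Human Eval test; it only runs when selected explicitly with -m 'evaluation', as the workflow above does. A minimal sketch of how such an opt-in marker behaves (the test name here is illustrative):

import pytest


# Skipped by the default addopts filter ("not evaluation"); selected only with
# e.g.: python -m pytest -m 'evaluation' -k "test_human_eval" unit/
@pytest.mark.evaluation
def test_marker_opt_in_example():
    assert True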
73 changes: 73 additions & 0 deletions tests/unit/inference/test_human_eval.py
@@ -0,0 +1,73 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import os
import torch
from deepspeed.accelerator import get_accelerator


@pytest.mark.evaluation
@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"])
def test_human_eval(model_name):
    import mii
    import numpy
    from transformers import pipeline
    from human_eval.data import write_jsonl, read_problems
    from human_eval.evaluation import evaluate_functional_correctness

    def generate_base_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, do_sample=True)[0]["generated_text"]

    def generate_mii_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, max_new_tokens=512)[0].generated_text

    def generate_samples(pipe, generation_function):
        samples = [
            dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"]))
            for task_id in problems for _ in range(num_samples_per_task)
        ]
        return samples

    # Loading Problems
    problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz")
    num_samples_per_task = 20

    # Initializing HuggingFace Pipeline
    local_rank = os.getenv("LOCAL_RANK", "0")
    device = torch.device(get_accelerator().device_name(local_rank))
    base_pipe = pipeline(model=model_name,
                         device=device,
                         max_length=512,
                         return_full_text=False)

    # Generating Base Samples
    base_samples = generate_samples(base_pipe, generate_base_completion)

    # Base Pipeline Teardown
    del base_pipe
    get_accelerator().empty_cache()

    # Initializing DeepSpeed-MII Pipeline
    mii_pipe = mii.pipeline(model_name)

    # Generating MII Samples
    mii_samples = generate_samples(mii_pipe, generate_mii_completion)

    # MII Pipeline Teardown
    mii_pipe.destroy()

    # Writing Samples
    write_jsonl("base_samples.jsonl", base_samples)
    write_jsonl("mii_samples.jsonl", mii_samples)

    # Evaluating Samples
    base_results = evaluate_functional_correctness("base_samples.jsonl")
    mii_results = evaluate_functional_correctness("mii_samples.jsonl")

    # Executing Assertions
    for key in base_results.keys():
        assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
            f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."
