Inference V2 Human Eval (microsoft#4804)
This PR adds a Human Eval CI workflow and associated unit test for
Inference V2.

---------

Co-authored-by: Arash Bakhtiari <[email protected]>
Co-authored-by: Michael Wyatt <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
Co-authored-by: Logan Adams <[email protected]>
6 people authored and rraminen committed May 9, 2024
1 parent b3df7bd commit 077667c
Showing 3 changed files with 130 additions and 3 deletions.
53 changes: 53 additions & 0 deletions .github/workflows/nv-human-eval.yml
@@ -0,0 +1,53 @@
name: nv-human-eval

on:
  workflow_dispatch:

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: [self-hosted, nvidia, a6000]
    container:
      image: nvcr.io/nvidia/pytorch:23.03-py3
      ports:
        - 80
      options: --gpus all --shm-size "8G"

    steps:
      - uses: actions/checkout@v3

      - name: Check container state
        run: |
          ldd --version
          nvcc --version
          nvidia-smi
          python -c "import torch; print('torch:', torch.__version__, torch)"
          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
      - name: Install transformers
        run: |
          git clone --depth=1 https://github.com/huggingface/transformers
          cd transformers
          git rev-parse --short HEAD
          python -m pip install .
      - name: Clone Human Eval
        run: |
          git clone --depth=1 https://github.com/openai/human-eval.git
          sed -i '/exec(check_program, exec_globals)/ s/^# //' human-eval/human_eval/execution.py
          cd human-eval
          git rev-parse --short HEAD
          python -m pip install .
      - name: Install deepspeed
        run: |
          python -m pip install .[dev,1bit,autotuning]
          ds_report
      - name: Python environment
        run: |
          python -m pip list
      - name: Unit tests
        run: |
          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
          cd tests
          python -m pytest --color=yes --durations=0 --verbose -rF -m 'evaluation' -k "test_human_eval" unit/ --torch_ver="2.0" --cuda_ver="12"
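Note on the "Clone Human Eval" step above: upstream human-eval ships with the exec(check_program, exec_globals) call commented out as a safety guard, and the sed command re-enables it so evaluate_functional_correctness can actually run generated completions. Below is a conceptual sketch of what that check reduces to; run_check is a hypothetical helper, not part of human-eval, and the real runner adds timeouts and process isolation around the exec call.

def run_check(prompt: str, completion: str, test: str, entry_point: str) -> bool:
    # HumanEval stitches the prompt, the model completion, and the unit tests
    # into one program and executes it; exec() is the line the sed command
    # uncomments in human_eval/execution.py.
    check_program = prompt + completion + "\n" + test + "\n" + f"check({entry_point})\n"
    exec_globals: dict = {}
    try:
        exec(check_program, exec_globals)
        return True
    except Exception:
        return False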
7 changes: 4 additions & 3 deletions tests/pytest.ini
@@ -1,12 +1,13 @@
 [pytest]
-addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion"
+addopts = -m "not sequential and not nightly and not inference and not seq_inference and not inference_ops and not inference_v2 and not inference_v2_ops and not stable_diffusion and not evaluation"
 markers =
     sequential:Tests that need to be run sequentially
     inference:Inference model tests
     inference_ops:Individual inference operator tests
-    inference_v2: Inference tests for the v2 stack
-    inference_v2_ops: Op tests for the v2 stack
+    inference_v2:Inference tests for the v2 stack
+    inference_v2_ops:Op tests for the v2 stack
     seq_inference:Inference model tests to run sequentially
     nightly:Tests that should be run nightly
     world_size:Change world size of individual tests in a class
     stable_diffusion:Tests that run Stable Diffusion
+    evaluation:Tests that evaluate model correctness
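Because the new evaluation marker is added to the addopts exclusion list, a plain pytest run in tests/ skips the Human Eval test; it only runs when selected explicitly with -m 'evaluation', as the workflow above does. A minimal sketch of how such an opt-in marker behaves (the test name here is illustrative):

import pytest


# Skipped by the default addopts filter ("not evaluation"); selected only with
# e.g.: python -m pytest -m 'evaluation' -k "test_human_eval" unit/
@pytest.mark.evaluation
def test_marker_opt_in_example():
    assert True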
73 changes: 73 additions & 0 deletions tests/unit/inference/test_human_eval.py
@@ -0,0 +1,73 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import pytest
import os
import torch
from deepspeed.accelerator import get_accelerator


@pytest.mark.evaluation
@pytest.mark.parametrize("model_name", ["codellama/CodeLlama-7b-Python-hf"])
def test_human_eval(model_name):
    import mii
    import numpy
    from transformers import pipeline
    from human_eval.data import write_jsonl, read_problems
    from human_eval.evaluation import evaluate_functional_correctness

    def generate_base_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, do_sample=True)[0]["generated_text"]

    def generate_mii_completion(pipe, problem_prompt: str) -> str:
        return pipe(problem_prompt, max_new_tokens=512)[0].generated_text

    def generate_samples(pipe, generation_function):
        samples = [
            dict(task_id=task_id, completion=generation_function(pipe, problems[task_id]["prompt"]))
            for task_id in problems for _ in range(num_samples_per_task)
        ]
        return samples

    # Loading Problems
    problems = read_problems("../../human-eval/data/HumanEval.jsonl.gz")
    num_samples_per_task = 20

    # Initializing HuggingFace Pipeline
    local_rank = os.getenv("LOCAL_RANK", "0")
    device = torch.device(get_accelerator().device_name(local_rank))
    base_pipe = pipeline(model=model_name,
                         device=device,
                         max_length=512,
                         return_full_text=False)

    # Generating Base Samples
    base_samples = generate_samples(base_pipe, generate_base_completion)

    # Base Pipeline Teardown
    del base_pipe
    get_accelerator().empty_cache()

    # Initializing DeepSpeed-MII Pipeline
    mii_pipe = mii.pipeline(model_name)

    # Generating MII Samples
    mii_samples = generate_samples(mii_pipe, generate_mii_completion)

    # MII Pipeline Teardown
    mii_pipe.destroy()

    # Writing Samples
    write_jsonl("base_samples.jsonl", base_samples)
    write_jsonl("mii_samples.jsonl", mii_samples)

    # Evaluating Samples
    base_results = evaluate_functional_correctness("base_samples.jsonl")
    mii_results = evaluate_functional_correctness("mii_samples.jsonl")

    # Executing Assertions
    for key in base_results.keys():
        assert numpy.allclose(base_results[key], mii_results[key], rtol=0.10), \
            f"Base result: {base_results[key]}, MII result: {mii_results[key]}, outside of rtol."
