forked from opendatahub-io/vllm
Merge pull request opendatahub-io#109 from ROCm/simple_fp8_inference_example

adding a simple model invocation involving fp8 calculation/storage
Showing 2 changed files with 27 additions and 1 deletion.
@@ -0,0 +1,22 @@
from vllm import LLM, SamplingParams


def test_fp8_offline_inference():
    # Create a sampling params object.
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # Create an LLM
    llm = LLM(
        model="/data/models/llama-2-7b-chat-hf",
        kv_cache_dtype="fp8",
        quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json",
    )

    prompt = "London is the capital of"

    # Generate model response
    out = llm.generate(prompt, sampling_params)[0].outputs[0].text

    assert out == (" England and the United Kingdom."
                   " It is located in the southeastern part of")