From 9c69c196eeafe0f48bcb311d46fe398f9a96fa4b Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 00:31:19 +0000
Subject: [PATCH 01/11] Adding a simple model invocation involving fp8
 calculation/storage

---
 tests/fp8_offline_inference.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 tests/fp8_offline_inference.py

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
new file mode 100644
index 0000000000000..526f830757c43
--- /dev/null
+++ b/tests/fp8_offline_inference.py
@@ -0,0 +1,19 @@
+from vllm import LLM, SamplingParams
+
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(
+        model="/data/models/llama-2-7b-chat-hf",
+        kv_cache_dtype="fp8",
+        quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
+    )
+
+prompt = "London is the capital of"
+
+# Generate model response
+out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+
+print(out)
+

From fe5828d2867a3b947f5a047459182a81e6c2f446 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:06:32 +0000
Subject: [PATCH 02/11] Adding pytest wrapper.

---
 tests/fp8_offline_inference.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 526f830757c43..9f4e306e9df5e 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,19 +1,23 @@
+import pytest
 from vllm import LLM, SamplingParams
 
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(
+def test_fp8_offline_inference():
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    # Create an LLM.
+    llm = LLM(
         model="/data/models/llama-2-7b-chat-hf",
         kv_cache_dtype="fp8",
         quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
-    )
+        )
+
+    prompt = "London is the capital of"
+
+    # Generate model response
+    out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
-prompt = "London is the capital of"
-
-# Generate model response
-out = llm.generate(prompt, sampling_params)[0].outputs[0].text
-
-print(out)
+    assert out == " England and the United Kingdom. It is located in the southeastern part of"
+    #print(out)
 

From f344924ca54839365e5b5e9e9b2e2cb6069cd3b6 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:15:31 +0000
Subject: [PATCH 03/11] Dropping the unused pytest import and wrapping long
 lines.

---
 tests/fp8_offline_inference.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 9f4e306e9df5e..578e40970de86 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,4 +1,3 @@
-import pytest
 from vllm import LLM, SamplingParams
 
 
@@ -10,7 +9,8 @@ def test_fp8_offline_inference():
     llm = LLM(
         model="/data/models/llama-2-7b-chat-hf",
         kv_cache_dtype="fp8",
-        quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
+        quantization_param_path = \
+            "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
         )
 
     prompt = "London is the capital of"
@@ -18,6 +18,7 @@ def test_fp8_offline_inference():
     # Generate model response
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
-    assert out == " England and the United Kingdom. It is located in the southeastern part of"
+    assert out == ( " England and the United Kingdom."
+        " It is located in the southeastern part of")
     #print(out)
 

From 9a1e2b5504985829fd61b4683c0df0fe99daffe7 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:18:28 +0000
Subject: [PATCH 04/11] Removing the commented-out print.

---
 tests/fp8_offline_inference.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 578e40970de86..e09dd2ff48005 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -20,5 +20,4 @@ def test_fp8_offline_inference():
 
     assert out == ( " England and the United Kingdom."
         " It is located in the southeastern part of")
-    #print(out)
 

From 6be15abe93e8177606522dc995acfaed5ec310e7 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:21:55 +0000
Subject: [PATCH 05/11] Re-indenting the assert continuation line.

---
 tests/fp8_offline_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index e09dd2ff48005..eabd5eb297919 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -19,5 +19,5 @@ def test_fp8_offline_inference():
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
     assert out == ( " England and the United Kingdom."
-        " It is located in the southeastern part of")
+                    " It is located in the southeastern part of")
 

From 94dd71d4a5b9b53d69ec0bfd4fc528b7df5407f9 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:24:36 +0000
Subject: [PATCH 06/11] Removing the stray space inside the assert parentheses.

---
 tests/fp8_offline_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index eabd5eb297919..29684382b46ff 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -18,6 +18,6 @@ def test_fp8_offline_inference():
     # Generate model response
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
-    assert out == ( " England and the United Kingdom."
-                    " It is located in the southeastern part of")
+    assert out == (" England and the United Kingdom."
+                   " It is located in the southeastern part of")
 

From c3291752f42eba471beb9bb83aeef78208486370 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:28:43 +0000
Subject: [PATCH 07/11] Stripping trailing whitespace and the trailing blank
 line.

---
 tests/fp8_offline_inference.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 29684382b46ff..65bff1a1a49e1 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,10 +1,9 @@
 from vllm import LLM, SamplingParams
 
-
 def test_fp8_offline_inference():
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    
+
     # Create an LLM.
     llm = LLM(
         model="/data/models/llama-2-7b-chat-hf",
@@ -11,13 +10,12 @@ def test_fp8_offline_inference():
         kv_cache_dtype="fp8",
         quantization_param_path = \
             "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
         )
-    
+
     prompt = "London is the capital of"
-    
+
     # Generate model response
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
     assert out == (" England and the United Kingdom."
                    " It is located in the southeastern part of")
-

From b174e5803a4ffc65e4471b1f0cfcd4e5d6ec8b08 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:32:29 +0000
Subject: [PATCH 08/11] Restoring the second blank line after the import
 (PEP 8).
---
 tests/fp8_offline_inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 65bff1a1a49e1..7102563208697 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,5 +1,6 @@
 from vllm import LLM, SamplingParams
 
+
 def test_fp8_offline_inference():
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

From 6a2d00ed2434c1e7e2f42b5846211b8a1da3a4f1 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:35:54 +0000
Subject: [PATCH 09/11] Dropping the period from the "Create an LLM" comment.

---
 tests/fp8_offline_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 7102563208697..61fd262ca53d9 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -5,7 +5,7 @@ def test_fp8_offline_inference():
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-    # Create an LLM.
+    # Create an LLM
     llm = LLM(
         model="/data/models/llama-2-7b-chat-hf", 
         kv_cache_dtype="fp8", 

From 0aa3ef8f2e8fb2af5744c4b137fe8aeb6f9de10f Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:43:19 +0000
Subject: [PATCH 10/11] Stripping trailing whitespace from the LLM arguments.

---
 tests/fp8_offline_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 61fd262ca53d9..79454458d920c 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -7,8 +7,8 @@ def test_fp8_offline_inference():
 
     # Create an LLM
     llm = LLM(
-        model="/data/models/llama-2-7b-chat-hf", 
-        kv_cache_dtype="fp8", 
+        model="/data/models/llama-2-7b-chat-hf",
+        kv_cache_dtype="fp8",
         quantization_param_path = \
             "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
         )

From 98c2e720b215b2a3a8da42e565e651d974e1ba5c Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Fri, 26 Jul 2024 20:48:55 -0500
Subject: [PATCH 11/11] Update test-pipeline.yaml

---
 .buildkite/test-pipeline.yaml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ff39d189760f7..7b59439d5bf56 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -79,6 +79,10 @@ steps:
   - python3 llava_example.py
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 
+- label: FP8 Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s fp8_offline_inference.py
+
 - label: Kernels Test %N
   #mirror_hardwares: [amd]
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -163,4 +167,4 @@ steps:
   no_gpu: True
   commands:
   - pip install -r requirements-docs.txt
-  - SPHINXOPTS=\"-W\" make html
\ No newline at end of file
+  - SPHINXOPTS=\"-W\" make html
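
For reference, applying all eleven patches should leave tests/fp8_offline_inference.py
in the state sketched below. This is reconstructed from the hunks above: the exact
blank lines and continuation-line indentation are inferred where the flattened patch
formatting was ambiguous, and the model path and the kv_cache_scales.json scales file
are environment-specific fixtures the test assumes to exist, exactly as in the patches
themselves.

    from vllm import LLM, SamplingParams


    def test_fp8_offline_inference():
        # Create a sampling params object.
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

        # Create an LLM
        llm = LLM(
            model="/data/models/llama-2-7b-chat-hf",
            kv_cache_dtype="fp8",
            quantization_param_path = \
                "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
            )

        prompt = "London is the capital of"

        # Generate model response
        out = llm.generate(prompt, sampling_params)[0].outputs[0].text

        assert out == (" England and the United Kingdom."
                       " It is located in the southeastern part of")

The new "FP8 Test" CI step in patch 11 invokes the test as
"pytest -v -s fp8_offline_inference.py", which implies it runs with the tests/
directory as the working directory; locally the equivalent would presumably be:

    cd tests && pytest -v -s fp8_offline_inference.py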