From 9c69c196eeafe0f48bcb311d46fe398f9a96fa4b Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 00:31:19 +0000
Subject: [PATCH 01/11] Adding a simple model invocation involving fp8
 calculation/storage

---
 tests/fp8_offline_inference.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 tests/fp8_offline_inference.py

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
new file mode 100644
index 0000000000000..526f830757c43
--- /dev/null
+++ b/tests/fp8_offline_inference.py
@@ -0,0 +1,19 @@
+from vllm import LLM, SamplingParams
+
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(
+        model="/data/models/llama-2-7b-chat-hf",
+        kv_cache_dtype="fp8",
+        quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
+    )
+
+prompt = "London is the capital of"
+
+# Generate model response
+out = llm.generate(prompt, sampling_params)[0].outputs[0].text
+
+print(out)
+

From fe5828d2867a3b947f5a047459182a81e6c2f446 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:06:32 +0000
Subject: [PATCH 02/11] Adding pytest wrapper.

---
 tests/fp8_offline_inference.py | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 526f830757c43..9f4e306e9df5e 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,19 +1,23 @@
+import pytest
 from vllm import LLM, SamplingParams
 
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-# Create an LLM.
-llm = LLM(
+def test_fp8_offline_inference():
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+    # Create an LLM.
+    llm = LLM(
         model="/data/models/llama-2-7b-chat-hf",
         kv_cache_dtype="fp8",
         quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
-    )
+        )
+
+    prompt = "London is the capital of"
+
+    # Generate model response
+    out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
-prompt = "London is the capital of"
-
-# Generate model response
-out = llm.generate(prompt, sampling_params)[0].outputs[0].text
-
-print(out)
+    assert out == " England and the United Kingdom. It is located in the southeastern part of"
+    #print(out)
 

From f344924ca54839365e5b5e9e9b2e2cb6069cd3b6 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:15:31 +0000
Subject: [PATCH 03/11] Dropping the unused pytest import and wrapping long
 lines.

---
 tests/fp8_offline_inference.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 9f4e306e9df5e..578e40970de86 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,4 +1,3 @@
-import pytest
 from vllm import LLM, SamplingParams
 
 
@@ -10,7 +9,8 @@ def test_fp8_offline_inference():
     llm = LLM(
         model="/data/models/llama-2-7b-chat-hf",
         kv_cache_dtype="fp8",
-        quantization_param_path="./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
+        quantization_param_path = \
+            "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
         )
 
     prompt = "London is the capital of"
@@ -18,6 +18,7 @@ def test_fp8_offline_inference():
     # Generate model response
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
-    assert out == " England and the United Kingdom. It is located in the southeastern part of"
+    assert out == ( " England and the United Kingdom."
+        " It is located in the southeastern part of")
     #print(out)
 

From 9a1e2b5504985829fd61b4683c0df0fe99daffe7 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:18:28 +0000
Subject: [PATCH 04/11] Removing the commented-out print.

---
 tests/fp8_offline_inference.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 578e40970de86..e09dd2ff48005 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -20,5 +20,4 @@ def test_fp8_offline_inference():
 
     assert out == ( " England and the United Kingdom."
         " It is located in the southeastern part of")
-    #print(out)
 

From 6be15abe93e8177606522dc995acfaed5ec310e7 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:21:55 +0000
Subject: [PATCH 05/11] Re-indenting the assert continuation line.

---
 tests/fp8_offline_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index e09dd2ff48005..eabd5eb297919 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -19,5 +19,5 @@ def test_fp8_offline_inference():
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
     assert out == ( " England and the United Kingdom."
-        " It is located in the southeastern part of")
+                    " It is located in the southeastern part of")
 

From 94dd71d4a5b9b53d69ec0bfd4fc528b7df5407f9 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:24:36 +0000
Subject: [PATCH 06/11] Removing the stray space inside the assert parentheses.

---
 tests/fp8_offline_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index eabd5eb297919..29684382b46ff 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -18,6 +18,6 @@ def test_fp8_offline_inference():
     # Generate model response
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
-    assert out == ( " England and the United Kingdom."
-                    " It is located in the southeastern part of")
+    assert out == (" England and the United Kingdom."
+                   " It is located in the southeastern part of")
 

From c3291752f42eba471beb9bb83aeef78208486370 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:28:43 +0000
Subject: [PATCH 07/11] Stripping trailing whitespace and the trailing blank
 line.

---
 tests/fp8_offline_inference.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 29684382b46ff..65bff1a1a49e1 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,10 +1,9 @@
 from vllm import LLM, SamplingParams
 
-
 def test_fp8_offline_inference():
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-    
+
     # Create an LLM.
     llm = LLM(
         model="/data/models/llama-2-7b-chat-hf",
@@ -11,13 +10,12 @@ def test_fp8_offline_inference():
         kv_cache_dtype="fp8",
         quantization_param_path = \
             "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
         )
-    
+
     prompt = "London is the capital of"
-    
+
     # Generate model response
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 
     assert out == (" England and the United Kingdom."
                    " It is located in the southeastern part of")
-

From b174e5803a4ffc65e4471b1f0cfcd4e5d6ec8b08 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:32:29 +0000
Subject: [PATCH 08/11] Restoring the second blank line after the import
 (PEP 8).
---
 tests/fp8_offline_inference.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 65bff1a1a49e1..7102563208697 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -1,5 +1,6 @@
 from vllm import LLM, SamplingParams
 
+
 def test_fp8_offline_inference():
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

From 6a2d00ed2434c1e7e2f42b5846211b8a1da3a4f1 Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:35:54 +0000
Subject: [PATCH 09/11] Dropping the period from the "Create an LLM" comment.

---
 tests/fp8_offline_inference.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 7102563208697..61fd262ca53d9 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -5,7 +5,7 @@ def test_fp8_offline_inference():
     # Create a sampling params object.
     sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 
-    # Create an LLM.
+    # Create an LLM
     llm = LLM(
         model="/data/models/llama-2-7b-chat-hf", 
         kv_cache_dtype="fp8", 

From 0aa3ef8f2e8fb2af5744c4b137fe8aeb6f9de10f Mon Sep 17 00:00:00 2001
From: "Alexei V. Ivanov"
Date: Sat, 27 Jul 2024 01:43:19 +0000
Subject: [PATCH 10/11] Stripping trailing whitespace from the LLM arguments.

---
 tests/fp8_offline_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/fp8_offline_inference.py b/tests/fp8_offline_inference.py
index 61fd262ca53d9..79454458d920c 100644
--- a/tests/fp8_offline_inference.py
+++ b/tests/fp8_offline_inference.py
@@ -7,8 +7,8 @@ def test_fp8_offline_inference():
 
     # Create an LLM
     llm = LLM(
-        model="/data/models/llama-2-7b-chat-hf", 
-        kv_cache_dtype="fp8", 
+        model="/data/models/llama-2-7b-chat-hf",
+        kv_cache_dtype="fp8",
         quantization_param_path = \
             "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
         )

From 98c2e720b215b2a3a8da42e565e651d974e1ba5c Mon Sep 17 00:00:00 2001
From: Alexei-V-Ivanov-AMD <156011006+Alexei-V-Ivanov-AMD@users.noreply.github.com>
Date: Fri, 26 Jul 2024 20:48:55 -0500
Subject: [PATCH 11/11] Update test-pipeline.yaml

---
 .buildkite/test-pipeline.yaml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ff39d189760f7..7b59439d5bf56 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -79,6 +79,10 @@ steps:
   - python3 llava_example.py
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 
+- label: FP8 Test
+  mirror_hardwares: [amd]
+  command: pytest -v -s fp8_offline_inference.py
+
 - label: Kernels Test %N
   #mirror_hardwares: [amd]
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
@@ -163,4 +167,4 @@ steps:
   no_gpu: True
   commands:
   - pip install -r requirements-docs.txt
-  - SPHINXOPTS=\"-W\" make html
\ No newline at end of file
+  - SPHINXOPTS=\"-W\" make html
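
For reference, applying all eleven patches should leave tests/fp8_offline_inference.py
in the state sketched below. This is reconstructed from the hunks above: the exact
blank lines and continuation-line indentation are inferred where the flattened patch
formatting was ambiguous, and the model path and the kv_cache_scales.json scales file
are environment-specific fixtures the test assumes to exist, exactly as in the patches
themselves.

    from vllm import LLM, SamplingParams


    def test_fp8_offline_inference():
        # Create a sampling params object.
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

        # Create an LLM
        llm = LLM(
            model="/data/models/llama-2-7b-chat-hf",
            kv_cache_dtype="fp8",
            quantization_param_path = \
                "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
            )

        prompt = "London is the capital of"

        # Generate model response
        out = llm.generate(prompt, sampling_params)[0].outputs[0].text

        assert out == (" England and the United Kingdom."
                       " It is located in the southeastern part of")

The new "FP8 Test" CI step in patch 11 invokes the test as
"pytest -v -s fp8_offline_inference.py", which implies it runs with the tests/
directory as the working directory; locally the equivalent would presumably be:

    cd tests && pytest -v -s fp8_offline_inference.py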