From 02d4f62a1f6dd440476c65140a880601cd6babf1 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 25 Sep 2023 09:19:12 +0000
Subject: [PATCH] Make awq install optional + integration tests values.

---
 Dockerfile                                     |   2 +
 .../test_flash_awq/test_flash_llama_awq.json   |  89 +++++
 .../test_flash_llama_awq_all_params.json       |  89 +++++
 .../test_flash_llama_awq_load.json             | 358 ++++++++++++++++++
 integration-tests/models/test_flash_awq.py     |  22 +-
 server/text_generation_server/utils/layers.py  |   8 +-
 6 files changed, 556 insertions(+), 12 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json

diff --git a/Dockerfile b/Dockerfile
index 16a2dbdf785..84b8504066d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -175,6 +175,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy build artifacts from exllama kernels builder
 COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
new file mode 100644
index 00000000000..0d8c05ed9fa
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -8.515625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -15.4140625,
+        "text": "request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 29896,
+        "logprob": -2.0292969,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 13,
+        "logprob": -2.2597656,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 30166,
+        "logprob": -3.8671875,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -1.0488281,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.24523926,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.07897949,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.023513794,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.011444092,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.008430481,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.007648468,
+        "special": false,
+        "text": "​"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "1\n​​​​​​​​"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
new file mode 100644
index 00000000000..8e3f5571625
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -8.515625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -15.4140625,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 29896,
+        "logprob": 0.0,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 13,
+        "logprob": -0.6254883,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 29918,
+        "logprob": -0.20141602,
+        "special": false,
+        "text": "_"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.6254883,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29871,
+        "logprob": 0.0,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request1\n​_2 ​​​​"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
new file mode 100644
index 00000000000..42b085f8de0
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
@@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  }
+]
diff --git a/integration-tests/models/test_flash_awq.py b/integration-tests/models/test_flash_awq.py
index f25f7f4e18c..ca474d37ffb 100644
--- a/integration-tests/models/test_flash_awq.py
+++ b/integration-tests/models/test_flash_awq.py
@@ -2,21 +2,21 @@
 
 
 @pytest.fixture(scope="module")
-def flash_llama_gptq_handle(launcher):
+def flash_llama_awq_handle(launcher):
     with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=2, quantize="awq") as handle:
         yield handle
 
 
 @pytest.fixture(scope="module")
-async def flash_llama_gptq(flash_llama_gptq_handle):
-    await flash_llama_gptq_handle.health(300)
-    return flash_llama_gptq_handle.client
+async def flash_llama_awq(flash_llama_awq_handle):
+    await flash_llama_awq_handle.health(300)
+    return flash_llama_awq_handle.client
 
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request", max_new_tokens=10, decoder_input_details=True
     )
 
@@ -26,8 +26,8 @@ async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request",
         max_new_tokens=10,
         repetition_penalty=1.2,
@@ -48,11 +48,11 @@ async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_load(
-    flash_llama_gptq, generate_load, response_snapshot
+async def test_flash_llama_awq_load(
+    flash_llama_awq, generate_load, response_snapshot
 ):
     responses = await generate_load(
-        flash_llama_gptq, "Test request", max_new_tokens=10, n=4
+        flash_llama_awq, "Test request", max_new_tokens=10, n=4
     )
 
     assert len(responses) == 4
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index cfec58597f9..fb27764cd41 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -17,7 +17,13 @@
 from accelerate import init_empty_weights
 
 from text_generation_server.utils.gptq.quant_linear import QuantLinear
-from text_generation_server.utils.awq.quantize.qmodule import WQLinear
+
+
+HAS_AWQ = True
+try:
+    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
+except ImportError:
+    HAS_AWQ = False
 
 try:
     major, _minor = torch.cuda.get_device_capability()
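
Note (editor's sketch, not part of the patch): HAS_AWQ only records whether the optional AWQ kernels imported successfully; any code path that later builds a WQLinear still has to check the flag and fail with a clear message when the kernels are missing. A minimal guard of the kind the quantize == "awq" path in layers.py could apply is sketched below; the function name and the error wording are illustrative, not taken from this commit.

    # Sketch only: HAS_AWQ and WQLinear come from the optional import above;
    # assert_awq_available and its error message are hypothetical.
    def assert_awq_available(quantize: str) -> None:
        if quantize == "awq" and not HAS_AWQ:
            raise NotImplementedError(
                "AWQ kernels are not installed; rebuild the image with the "
                "awq-kernels-builder stage (or install the AWQ kernels in the "
                "server environment), or choose a different --quantize value."
            )

Keeping the import soft means a deployment that never requests AWQ can still import layers.py without pulling in the CUDA extension; the cost is only a later, clearer runtime error at the point where AWQ quantization is actually requested.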