From 02d4f62a1f6dd440476c65140a880601cd6babf1 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Mon, 25 Sep 2023 09:19:12 +0000
Subject: [PATCH] Make awq install optional + integration tests values.

---
 Dockerfile                                     |   2 +
 .../test_flash_awq/test_flash_llama_awq.json   |  89 +++++
 .../test_flash_llama_awq_all_params.json       |  89 +++++
 .../test_flash_llama_awq_load.json             | 358 ++++++++++++++++++
 integration-tests/models/test_flash_awq.py     |  22 +-
 server/text_generation_server/utils/layers.py  |   8 +-
 6 files changed, 556 insertions(+), 12 deletions(-)
 create mode 100644 integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
 create mode 100644 integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json

diff --git a/Dockerfile b/Dockerfile
index 16a2dbdf785..84b8504066d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -175,6 +175,8 @@ COPY --from=flash-att-v2-builder /usr/src/flash-attention-v2/build/lib.linux-x86
 COPY --from=custom-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy build artifacts from exllama kernels builder
 COPY --from=exllama-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
+# Copy build artifacts from awq kernels builder
+COPY --from=awq-kernels-builder /usr/src/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 # Copy builds artifacts from vllm builder
 COPY --from=vllm-builder /usr/src/vllm/build/lib.linux-x86_64-cpython-39 /opt/conda/lib/python3.9/site-packages
 
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
new file mode 100644
index 00000000000..0d8c05ed9fa
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -8.515625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -15.4140625,
+        "text": "request"
+      }
+    ],
+    "seed": null,
+    "tokens": [
+      {
+        "id": 29896,
+        "logprob": -2.0292969,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 13,
+        "logprob": -2.2597656,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 30166,
+        "logprob": -3.8671875,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -1.0488281,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.24523926,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.07897949,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.023513794,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.011444092,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.008430481,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": -0.007648468,
+        "special": false,
+        "text": "​"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "1\n​​​​​​​​"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
new file mode 100644
index 00000000000..8e3f5571625
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_all_params.json
@@ -0,0 +1,89 @@
+{
+  "details": {
+    "best_of_sequences": null,
+    "finish_reason": "length",
+    "generated_tokens": 10,
+    "prefill": [
+      {
+        "id": 1,
+        "logprob": null,
+        "text": "<s>"
+      },
+      {
+        "id": 4321,
+        "logprob": -8.515625,
+        "text": "Test"
+      },
+      {
+        "id": 2009,
+        "logprob": -15.4140625,
+        "text": "request"
+      }
+    ],
+    "seed": 0,
+    "tokens": [
+      {
+        "id": 29896,
+        "logprob": 0.0,
+        "special": false,
+        "text": "1"
+      },
+      {
+        "id": 13,
+        "logprob": -0.6254883,
+        "special": false,
+        "text": "\n"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 29918,
+        "logprob": -0.20141602,
+        "special": false,
+        "text": "_"
+      },
+      {
+        "id": 29906,
+        "logprob": -0.6254883,
+        "special": false,
+        "text": "2"
+      },
+      {
+        "id": 29871,
+        "logprob": 0.0,
+        "special": false,
+        "text": " "
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      },
+      {
+        "id": 30166,
+        "logprob": 0.0,
+        "special": false,
+        "text": "​"
+      }
+    ],
+    "top_tokens": null
+  },
+  "generated_text": "Test request1\n​_2 ​​​​"
+}
diff --git a/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
new file mode 100644
index 00000000000..42b085f8de0
--- /dev/null
+++ b/integration-tests/models/__snapshots__/test_flash_awq/test_flash_llama_awq_load.json
@@ -0,0 +1,358 @@
+[
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  },
+  {
+    "details": {
+      "best_of_sequences": null,
+      "finish_reason": "length",
+      "generated_tokens": 10,
+      "prefill": [
+        {
+          "id": 1,
+          "logprob": null,
+          "text": "<s>"
+        },
+        {
+          "id": 4321,
+          "logprob": -8.515625,
+          "text": "Test"
+        },
+        {
+          "id": 2009,
+          "logprob": -15.4140625,
+          "text": "request"
+        }
+      ],
+      "seed": null,
+      "tokens": [
+        {
+          "id": 29896,
+          "logprob": -2.0292969,
+          "special": false,
+          "text": "1"
+        },
+        {
+          "id": 13,
+          "logprob": -2.2617188,
+          "special": false,
+          "text": "\n"
+        },
+        {
+          "id": 30166,
+          "logprob": -3.8671875,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -1.0498047,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.24523926,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.07897949,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.023529053,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.011444092,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.008300781,
+          "special": false,
+          "text": "​"
+        },
+        {
+          "id": 30166,
+          "logprob": -0.007648468,
+          "special": false,
+          "text": "​"
+        }
+      ],
+      "top_tokens": null
+    },
+    "generated_text": "1\n​​​​​​​​"
+  }
+]
diff --git a/integration-tests/models/test_flash_awq.py b/integration-tests/models/test_flash_awq.py
index f25f7f4e18c..ca474d37ffb 100644
--- a/integration-tests/models/test_flash_awq.py
+++ b/integration-tests/models/test_flash_awq.py
@@ -2,21 +2,21 @@
 
 
 @pytest.fixture(scope="module")
-def flash_llama_gptq_handle(launcher):
+def flash_llama_awq_handle(launcher):
     with launcher("abhinavkulkarni/codellama-CodeLlama-7b-Python-hf-w4-g128-awq", num_shard=2, quantize="awq") as handle:
         yield handle
 
 
 @pytest.fixture(scope="module")
-async def flash_llama_gptq(flash_llama_gptq_handle):
-    await flash_llama_gptq_handle.health(300)
-    return flash_llama_gptq_handle.client
+async def flash_llama_awq(flash_llama_awq_handle):
+    await flash_llama_awq_handle.health(300)
+    return flash_llama_awq_handle.client
 
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request", max_new_tokens=10, decoder_input_details=True
     )
 
@@ -26,8 +26,8 @@ async def test_flash_llama_gptq(flash_llama_gptq, response_snapshot):
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
-    response = await flash_llama_gptq.generate(
+async def test_flash_llama_awq_all_params(flash_llama_awq, response_snapshot):
+    response = await flash_llama_awq.generate(
         "Test request",
         max_new_tokens=10,
         repetition_penalty=1.2,
@@ -48,11 +48,11 @@ async def test_flash_llama_gptq_all_params(flash_llama_gptq, response_snapshot):
 
 @pytest.mark.asyncio
 @pytest.mark.private
-async def test_flash_llama_gptq_load(
-    flash_llama_gptq, generate_load, response_snapshot
+async def test_flash_llama_awq_load(
+    flash_llama_awq, generate_load, response_snapshot
 ):
     responses = await generate_load(
-        flash_llama_gptq, "Test request", max_new_tokens=10, n=4
+        flash_llama_awq, "Test request", max_new_tokens=10, n=4
     )
 
     assert len(responses) == 4
diff --git a/server/text_generation_server/utils/layers.py b/server/text_generation_server/utils/layers.py
index cfec58597f9..fb27764cd41 100644
--- a/server/text_generation_server/utils/layers.py
+++ b/server/text_generation_server/utils/layers.py
@@ -17,7 +17,13 @@
 from accelerate import init_empty_weights
 
 from text_generation_server.utils.gptq.quant_linear import QuantLinear
-from text_generation_server.utils.awq.quantize.qmodule import WQLinear
+
+
+HAS_AWQ = True
+try:
+    from text_generation_server.utils.awq.quantize.qmodule import WQLinear
+except ImportError:
+    HAS_AWQ = False
 
 try:
     major, _minor = torch.cuda.get_device_capability()
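
Note (editor's sketch, not part of the patch): HAS_AWQ only records whether the optional AWQ kernels imported successfully; any code path that later builds a WQLinear still has to check the flag and fail with a clear message when the kernels are missing. A minimal guard of the kind the quantize == "awq" path in layers.py could apply is sketched below; the function name and the error wording are illustrative, not taken from this commit.

    # Sketch only: HAS_AWQ and WQLinear come from the optional import above;
    # assert_awq_available and its error message are hypothetical.
    def assert_awq_available(quantize: str) -> None:
        if quantize == "awq" and not HAS_AWQ:
            raise NotImplementedError(
                "AWQ kernels are not installed; rebuild the image with the "
                "awq-kernels-builder stage (or install the AWQ kernels in the "
                "server environment), or choose a different --quantize value."
            )

Keeping the import soft means a deployment that never requests AWQ can still import layers.py without pulling in the CUDA extension; the cost is only a later, clearer runtime error at the point where AWQ quantization is actually requested.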