From f9a629df549c34213507a824809eddbc929cb33c Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 21 May 2024 18:37:59 +0000
Subject: [PATCH] int4 and int8

---
 benchmarks/models_config/llama-2-7b.yaml | 52 ++++++++++++++++--------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/benchmarks/models_config/llama-2-7b.yaml b/benchmarks/models_config/llama-2-7b.yaml
index c5fbeb25e6..7021642892 100644
--- a/benchmarks/models_config/llama-2-7b.yaml
+++ b/benchmarks/models_config/llama-2-7b.yaml
@@ -1,24 +1,24 @@
 ---
 llama-2-7b:
-  # int8:
-  #   benchmark_engine: "ab"
-  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
-  #   workers:
-  #     - 1
-  #   batch_delay: 100
-  #   batch_size:
-  #     - 1
-  #   input: "./examples/large_models/gpt_fast/request.json"
-  #   requests: 1000
-  #   concurrency: 1
-  #   backend_profiling: False
-  #   exec_env: "local"
-  #   processors:
-  #     - "gpus": "all"
-  #   stream: "false"
-  base:
+  int8:
+    benchmark_engine: "ab"
+    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
+    workers:
+      - 1
+    batch_delay: 100
+    batch_size:
+      - 1
+    input: "./examples/large_models/gpt_fast/request.json"
+    requests: 1000
+    concurrency: 1
+    backend_profiling: False
+    exec_env: "local"
+    processors:
+      - "gpus": "all"
+    stream: "false"
+  int4:
     benchmark_engine: "ab"
-    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
+    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int4.mar
     workers:
       - 1
     batch_delay: 100
@@ -32,6 +32,22 @@ llama-2-7b:
     processors:
       - "gpus": "all"
     stream: "false"
+  # base:
+  #   benchmark_engine: "ab"
+  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
+  #   workers:
+  #     - 1
+  #   batch_delay: 100
+  #   batch_size:
+  #     - 1
+  #   input: "./examples/large_models/gpt_fast/request.json"
+  #   requests: 1000
+  #   concurrency: 1
+  #   backend_profiling: False
+  #   exec_env: "local"
+  #   processors:
+  #     - "gpus": "all"
+  #   stream: "false"
   # int8-tp:
   #   benchmark_engine: "ab"
   #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/llama-2-7b-int8-tp.mar