Skip to content

Commit

Permalink
int4 and int8
Browse files — browse the repository at this point in the history
  • Loading branch information
Ubuntu committed May 21, 2024
1 parent 65fcb53 commit f9a629d
Showing 1 changed file with 34 additions and 18 deletions.
52 changes: 34 additions & 18 deletions benchmarks/models_config/llama-2-7b.yaml
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
---
llama-2-7b:
# int8:
# benchmark_engine: "ab"
# url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
# workers:
# - 1
# batch_delay: 100
# batch_size:
# - 1
# input: "./examples/large_models/gpt_fast/request.json"
# requests: 1000
# concurrency: 1
# backend_profiling: False
# exec_env: "local"
# processors:
# - "gpus": "all"
# stream: "false"
base:
int8:
benchmark_engine: "ab"
url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
workers:
- 1
batch_delay: 100
batch_size:
- 1
input: "./examples/large_models/gpt_fast/request.json"
requests: 1000
concurrency: 1
backend_profiling: False
exec_env: "local"
processors:
- "gpus": "all"
stream: "false"
int4:
benchmark_engine: "ab"
url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int4.mar
workers:
- 1
batch_delay: 100
Expand All
@@ -32,6 +32,22 @@ llama-2-7b:
processors:
- "gpus": "all"
stream: "false"
# base:
# benchmark_engine: "ab"
# url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
# workers:
# - 1
# batch_delay: 100
# batch_size:
# - 1
# input: "./examples/large_models/gpt_fast/request.json"
# requests: 1000
# concurrency: 1
# backend_profiling: False
# exec_env: "local"
# processors:
# - "gpus": "all"
# stream: "false"
# int8-tp:
# benchmark_engine: "ab"
# url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/llama-2-7b-int8-tp.mar
Expand Down

0 comments on commit f9a629d

Please sign in to comment.