From f9a629df549c34213507a824809eddbc929cb33c Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 21 May 2024 18:37:59 +0000
Subject: [PATCH] int4 and int8

---
 benchmarks/models_config/llama-2-7b.yaml | 52 ++++++++++++++++--------
 1 file changed, 34 insertions(+), 18 deletions(-)

diff --git a/benchmarks/models_config/llama-2-7b.yaml b/benchmarks/models_config/llama-2-7b.yaml
index c5fbeb25e6..7021642892 100644
--- a/benchmarks/models_config/llama-2-7b.yaml
+++ b/benchmarks/models_config/llama-2-7b.yaml
@@ -1,24 +1,24 @@
 ---
 llama-2-7b:
-  # int8:
-  #   benchmark_engine: "ab"
-  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
-  #   workers:
-  #     - 1
-  #   batch_delay: 100
-  #   batch_size:
-  #     - 1
-  #   input: "./examples/large_models/gpt_fast/request.json"
-  #   requests: 1000
-  #   concurrency: 1
-  #   backend_profiling: False
-  #   exec_env: "local"
-  #   processors:
-  #     - "gpus": "all"
-  #   stream: "false"
-  base:
+  int8:
+    benchmark_engine: "ab"
+    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int8.mar
+    workers:
+      - 1
+    batch_delay: 100
+    batch_size:
+      - 1
+    input: "./examples/large_models/gpt_fast/request.json"
+    requests: 1000
+    concurrency: 1
+    backend_profiling: False
+    exec_env: "local"
+    processors:
+      - "gpus": "all"
+    stream: "false"
+  int4:
     benchmark_engine: "ab"
-    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
+    url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-int4.mar
     workers:
       - 1
     batch_delay: 100
@@ -32,6 +32,22 @@ llama-2-7b:
     processors:
       - "gpus": "all"
     stream: "false"
+  # base:
+  #   benchmark_engine: "ab"
+  #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/mar+files/llama-2-7b-base.mar
+  #   workers:
+  #     - 1
+  #   batch_delay: 100
+  #   batch_size:
+  #     - 1
+  #   input: "./examples/large_models/gpt_fast/request.json"
+  #   requests: 1000
+  #   concurrency: 1
+  #   backend_profiling: False
+  #   exec_env: "local"
+  #   processors:
+  #     - "gpus": "all"
+  #   stream: "false"
   # int8-tp:
   #   benchmark_engine: "ab"
   #   url: https://torchserve.s3.amazonaws.com/mar_files/llama-2/llama-2-7b-int8-tp.mar