From 0fb7bfad5c4eef4d46e7d33c6735e1e360eeb29c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 3 Apr 2024 16:46:10 +0200 Subject: [PATCH 1/7] ci: bench: change trigger path to not spawn on each PR --- .github/workflows/bench.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 949d806f67bf4..ae9be5ebc9ef4 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -24,10 +24,10 @@ on: push: branches: - master - paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] pull_request_target: types: [opened, synchronize, reopened] - paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*'] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] schedule: - cron: '04 2 * * *' From 22597a48488993ea4c640921e068a3498921117e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 3 Apr 2024 17:16:05 +0200 Subject: [PATCH 2/7] ci: bench: add more file type for phi-2: q8_0 and f16. - do not show the comment by default --- .github/workflows/bench.yml | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index ae9be5ebc9ef4..8ff124052241f 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -42,6 +42,16 @@ jobs: RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it N_USERS: 8 DURATION: 10m + + strategy: + matrix: + model: [phi-2] + ftype: [q4_0, q8_0, f16] + include: + - model: phi-2 + ftype: q4_0 + pr_comment_enabled: "true" + if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }} steps: - name: Clone @@ -116,7 +126,7 @@ jobs: --scenario script.js \ --duration ${{ github.event.inputs.duration || env.DURATION }} \ --hf-repo ggml-org/models \ - --hf-file phi-2/ggml-model-q4_0.gguf \ + --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ --model-path-prefix /models \ --parallel ${{ env.N_USERS }} \ -ngl 33 \ @@ -146,7 +156,7 @@ jobs: with: authToken: ${{secrets.GITHUB_TOKEN}} sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} - context: bench-server-baseline + context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} description: | ${{ env.BENCH_RESULTS }} state: 'success' @@ -203,11 +213,12 @@ jobs: - name: Comment PR uses: mshick/add-pr-comment@v2 id: comment_pr - if: ${{ github.event.pull_request != '' }} + if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} with: - message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }} + message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} message: | - 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 + + 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for ${{ matrix.model }} ${{ matrix.ftype }}: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} From b996b00d6fce156abb010940c4e53912bfd4a99e Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 3 Apr 2024 17:18:54 +0200 Subject: [PATCH 3/7] ci: bench: add seed parameter in k6 script --- examples/server/bench/script.js | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index a4f5ac5ab22ad..dc41e8d937c1b 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -87,6 +87,7 @@ export default function () { ], "model": model, "stream": false, + "seed": 42, "max_tokens": max_tokens } From a380b95274c74e18720f734975d8c636eab3fd54 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 3 Apr 2024 20:25:50 +0200 Subject: [PATCH 4/7] ci: bench: artefact name perf job --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 8ff124052241f..d2ef2bbd59a14 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -144,7 +144,7 @@ jobs: - uses: actions/upload-artifact@v4 with: - name: benchmark-results + name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} compression-level: 9 path: | examples/server/bench/*.jpg From 04e1ce34985c2c118a57aa7ca9ca15ba101d8a36 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 3 Apr 2024 21:24:06 +0200 Subject: [PATCH 5/7] Add iteration in the commit status, reduce again the autocomment --- .github/workflows/bench.yml | 14 +++++++++----- examples/server/bench/bench.py | 1 + 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index d2ef2bbd59a14..5f5b3d212ab3b 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -217,8 +217,15 @@ jobs: with: message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} message: | - - 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for ${{ matrix.model }} ${{ matrix.ftype }}: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 +

+ + 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 + +

+ +
+ + Expand details for performance related PR only - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} @@ -226,9 +233,6 @@ jobs: - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s** - ${{ env.BENCH_GRAPH_XLABEL }} -
- - Time series

diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index ea5d3854d561a..672fb57e8dfe1 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -176,6 +176,7 @@ def main(args_in: list[str] | None = None) -> None: # 140 chars max for commit status description bench_results = { + "i": iterations, "req": { "p90": data['metrics']["http_req_duration"]["p(90)"], "avg": data['metrics']["http_req_duration"]["avg"], From 64c7534b00df754a31ff1ec7906716418642d9d4 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 3 Apr 2024 21:33:41 +0200 Subject: [PATCH 6/7] ci: bench: add per slot metric in the commit status --- examples/server/bench/bench.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py index 672fb57e8dfe1..86eeeccf874cd 100644 --- a/examples/server/bench/bench.py +++ b/examples/server/bench/bench.py @@ -16,6 +16,7 @@ import matplotlib.dates import matplotlib.pyplot as plt import requests +from statistics import mean def main(args_in: list[str] | None = None) -> None: @@ -109,6 +110,7 @@ def main(args_in: list[str] | None = None) -> None: # Prometheus end_time = time.time() + prometheus_metrics = {} if is_server_listening("0.0.0.0", 9090): metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds', 'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred'] @@ -127,6 +129,7 @@ def main(args_in: list[str] | None = None) -> None: values = metric_data['data']['result'][0]['values'] timestamps, metric_values = zip(*values) metric_values = [float(value) for value in metric_values] + prometheus_metrics[metric] = metric_values timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] plt.figure(figsize=(16, 10), dpi=80) plt.plot(timestamps_dt, metric_values, label=metric) @@ -178,16 +181,18 @@ def main(args_in: list[str] | None = None) -> None: bench_results = { "i": iterations, "req": { - "p90": data['metrics']["http_req_duration"]["p(90)"], - "avg": data['metrics']["http_req_duration"]["avg"], + "p90": round(data['metrics']["http_req_duration"]["p(90)"], 2), + "avg": round(data['metrics']["http_req_duration"]["avg"], 2), }, "pp": { - "p90": data['metrics']["llamacpp_prompt_tokens"]["p(90)"], - "avg": data['metrics']["llamacpp_prompt_tokens"]["avg"], + "p90": round(data['metrics']["llamacpp_prompt_tokens"]["p(90)"], 2), + "avg": round(data['metrics']["llamacpp_prompt_tokens"]["avg"], 2), + "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2), }, "tg": { - "p90": data['metrics']["llamacpp_tokens_second"]["p(90)"], - "avg": data['metrics']["llamacpp_tokens_second"]["avg"], + "p90": round(data['metrics']["llamacpp_tokens_second"]["p(90)"], 2), + "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2), + "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2), }, } with open("results.github.env", 'a') as github_env: From 8685c2cda25b84906a3cf080b8cc4dcc0d8f22d2 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Wed, 3 Apr 2024 21:59:30 +0200 Subject: [PATCH 7/7] Fix trailing spaces --- .github/workflows/bench.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 5f5b3d212ab3b..af6060e085a20 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -220,7 +220,7 @@ jobs:

📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 - +