merge from upstream #7

Merged: 48 commits (Apr 6, 2024)

Commits (changes from all commits)
be55134
convert : refactor vocab selection logic (#6355)
cebtenzzre Mar 28, 2024
5106ef4
[SYCL] Revisited & updated SYCL build documentation (#6141)
OuadiElfarouki Mar 28, 2024
bfe7daf
readme : add notice for UI list
ggerganov Mar 28, 2024
b75c381
convert : allow conversion of Mistral HF models (#6144)
pcuenca Mar 29, 2024
057400a
llama : remove redundant reshape in build_kv_store (#6369)
danbev Mar 29, 2024
8093987
cmake : add explicit metal version options (#6370)
mattjcly Mar 29, 2024
b910287
readme : add project (#6356)
zhouwg Mar 29, 2024
cfde806
ci : fix BGE wget (#6383)
ggerganov Mar 29, 2024
0695747
[Model] Add support for xverse (#6301)
hxer7963 Mar 29, 2024
d48ccf3
sync : ggml (#6351)
ggerganov Mar 29, 2024
ba0c7c7
Vulkan k-quant mmq and ggml-backend offload functionality (#6155)
0cc4m Mar 29, 2024
f7fc5f6
split: allow --split-max-size option (#6343)
ngxson Mar 29, 2024
c342d07
Fedora build update (#6388)
Man2Dev Mar 29, 2024
37e7854
ci: bench: fix Resource not accessible by integration on PR event (#6…
phymbert Mar 30, 2024
c50a82c
readme : update hot topics
ggerganov Mar 31, 2024
226e819
ci: server: verify deps are coherent with the commit (#6409)
phymbert Apr 1, 2024
33a5244
compare-llama-bench.py: fix long hexsha args (#6424)
JohannesGaessler Apr 1, 2024
f87f7b8
flake.lock: Update (#6402)
ggerganov Apr 1, 2024
5260486
[SYCL] Disable iqx on windows as WA (#6435)
airMeng Apr 3, 2024
08a0c02
ggml : mul_mat_id use the same tensor for all the experts (#6387)
slaren Apr 3, 2024
076b086
readme : update hot topics
ggerganov Apr 3, 2024
1ff4d9f
Add OpenChat, Alpaca, Vicuna chat templates (#6397)
kaizau Apr 3, 2024
db214fa
Missing tokenizer.model error during gguf conversion (#6443)
overtunned Apr 3, 2024
e69945d
security : create policy (#6354)
joycebrum Apr 3, 2024
154d4ee
readme : add feature-rich rust bindings (#6465)
francis2tm Apr 3, 2024
5d4f12e
server: add cURL support to `server.Dockerfile` (#6461)
elepedus Apr 3, 2024
9f62c01
ci : update checkout, setup-python and upload-artifact to latest (#6456)
EwoutH Apr 3, 2024
bb43cf7
llama : add SEA-LION support (#6448)
bryanSwk Apr 3, 2024
60cdf40
server : handle exception on wrong type in request (#6452)
JH23X Apr 3, 2024
5fb1574
A few small fixes to server's README docs (#6428)
fat-tire Apr 3, 2024
72d73af
convert : fix for lint error complaining of bare except (#6470)
HanClinto Apr 4, 2024
1a43c72
server : add option to disable KV offload (#6468)
jxy Apr 4, 2024
4399f13
server : remove obsolete --memory-f32 option
ggerganov Apr 4, 2024
9b84ae1
examples : add GBNF validator program (#5948)
HanClinto Apr 4, 2024
4bcd6b9
common: remove duplicate check for curl (#6471)
danbev Apr 4, 2024
7a2c926
ci: bench: add more ftype, fix triggers and bot comment (#6466)
phymbert Apr 4, 2024
a74401f
Correct README link (#6458)
limitedAtonement Apr 4, 2024
8120efe
ci: bench fix concurrency for workflow trigger dispatch with sha1 (#6…
phymbert Apr 4, 2024
2e66913
server: allow penalizing repetition of newlines on server webpage (#6…
sha224 Apr 4, 2024
c666ba2
build CI: Name artifacts (#6482)
EwoutH Apr 4, 2024
7dda1b7
ci: exempt master branch workflows from getting cancelled (#6486)
mscheong01 Apr 4, 2024
0a1d889
server: add cURL support to server Dockerfiles (#6474)
elepedus Apr 4, 2024
b660a57
readme : fix typo (#6481)
junnjiee16 Apr 4, 2024
a307375
readme : add Dot to UI list (#6487)
alexpinel Apr 4, 2024
1b496a7
[SYCL] Fixed minor bug when enabling FP16 for non intel targets (#6464)
OuadiElfarouki Apr 5, 2024
87e21bb
bench : make n_batch and n_ubatch configurable in Batched bench (#6500)
Sunt-ing Apr 5, 2024
d0f5dee
readme : update UI list (#6503)
hugo53 Apr 5, 2024
a8bd14d
gguf.py : add licence and version to gguf writer (#6504)
mofosyne Apr 5, 2024
4 changes: 3 additions & 1 deletion .devops/full-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt requirements.txt
 COPY requirements   requirements
@@ -28,6 +28,8 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1

 RUN make
5 changes: 5 additions & 0 deletions .devops/full-rocm.Dockerfile
@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 RUN make

 ENTRYPOINT ["/app/.devops/tools.sh"]
5 changes: 4 additions & 1 deletion .devops/full.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential python3 python3-pip git
+    apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev

 COPY requirements.txt requirements.txt
 COPY requirements   requirements
@@ -15,6 +15,9 @@ WORKDIR /app

 COPY . .

+ENV LLAMA_CURL=1
+
+
 RUN make

 ENV LC_ALL=C.utf8
2 changes: 1 addition & 1 deletion .devops/llama-cpp-clblast.srpm.spec
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - [email protected]
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

Expand Down
2 changes: 1 addition & 1 deletion .devops/llama-cpp-cuda.srpm.spec
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - [email protected]
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

Expand Down
2 changes: 1 addition & 1 deletion .devops/llama-cpp.srpm.spec
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
# Built and maintained by John Boero - [email protected]
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

Expand Down
7 changes: 6 additions & 1 deletion .devops/server-cuda.Dockerfile
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git libcurl4-openssl-dev

 WORKDIR /app

@@ -22,11 +22,16 @@ COPY . .
 ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1
+# Enable cURL
+ENV LLAMA_CURL=1

 RUN make

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/server /server

 ENTRYPOINT [ "/server" ]
7 changes: 5 additions & 2 deletions .devops/server-intel.Dockerfile
@@ -4,7 +4,7 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
-    apt-get install -y git
+    apt-get install -y git libcurl4-openssl-dev

 WORKDIR /app

@@ -16,11 +16,14 @@ RUN mkdir build && \
         echo "LLAMA_SYCL_F16 is set" && \
         export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
     fi && \
-    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+    cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
     cmake --build . --config Release --target server

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/build/bin/server /server

 ENV LC_ALL=C.utf8
5 changes: 5 additions & 0 deletions .devops/server-rocm.Dockerfile
@@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

+# Enable cURL
+ENV LLAMA_CURL=1
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 RUN make

 ENTRYPOINT [ "/app/server" ]
6 changes: 5 additions & 1 deletion .devops/server-vulkan.Dockerfile
@@ -11,12 +11,16 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
     apt update -y && \
     apt-get install -y vulkan-sdk

+# Install cURL
+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 # Build it
 WORKDIR /app
 COPY . .
 RUN mkdir build && \
     cd build && \
-    cmake .. -DLLAMA_VULKAN=1 && \
+    cmake .. -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
     cmake --build . --config Release --target server

 # Clean up
7 changes: 6 additions & 1 deletion .devops/server.Dockerfile
@@ -3,16 +3,21 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION as build

 RUN apt-get update && \
-    apt-get install -y build-essential git
+    apt-get install -y build-essential git libcurl4-openssl-dev

 WORKDIR /app

 COPY . .

+ENV LLAMA_CURL=1
+
 RUN make

 FROM ubuntu:$UBUNTU_VERSION as runtime

+RUN apt-get update && \
+    apt-get install -y libcurl4-openssl-dev
+
 COPY --from=build /app/server /server

 ENV LC_ALL=C.utf8
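Not part of the diff — a hedged sketch of what the new cURL support enables at runtime. It assumes the server's model-download flag (`--model-url`, a.k.a. `-mu`) that `LLAMA_CURL` gates; the image tag and model URL are placeholders, so treat this as a usage sketch rather than a tested recipe:

```sh
# Build the server image (LLAMA_CURL=1 is set inside the Dockerfile)
docker build -f .devops/server.Dockerfile -t llamacpp-server .

# With libcurl compiled in, the server can fetch a GGUF over HTTP at startup
# instead of requiring a bind-mounted model file (placeholder URL below).
docker run -p 8080:8080 llamacpp-server \
    --host 0.0.0.0 --port 8080 \
    --model-url https://example.com/models/phi-2.Q4_0.gguf
```

Without `LLAMA_CURL` the download flags are compiled out, and the dynamically linked binary needs libcurl present at runtime — which is why the runtime stages above now install `libcurl4-openssl-dev` too.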
44 changes: 29 additions & 15 deletions .github/workflows/bench.yml
@@ -24,15 +24,15 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
-  pull_request:
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+  pull_request_target:
     types: [opened, synchronize, reopened]
-    paths: ['.github/workflows/bench.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/bench/**.*']
+    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
   schedule:
     - cron: '04 2 * * *'

 concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
+  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}
   cancel-in-progress: true
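For readers unfamiliar with GitHub expression short-circuiting, the new `group` key can be sketched in Python (`concurrency_group` is a hypothetical helper, not part of the PR): `head_ref && ref || run_id` evaluates to `github.ref` for PR-like runs, where `head_ref` is non-empty, and falls back to the unique `run_id` otherwise, so pushes to master no longer cancel each other.

```python
def concurrency_group(workflow: str, head_ref: str, ref: str,
                      run_id: str, input_sha: str = "") -> str:
    """Mimic `${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}-${{ github.event.inputs.sha }}`.

    GitHub Actions expressions short-circuit like JavaScript:
    `a && b` yields b when a is truthy, and `x || y` yields y when x is falsy.
    """
    middle = ref if head_ref else run_id
    return f"{workflow}-{middle}-{input_sha}"

# PR run: head_ref is set, so runs on the same ref share a group and supersede each other.
print(concurrency_group("bench", "my-feature", "refs/pull/7/merge", "8841", "deadbeef"))

# Push or schedule run: head_ref is empty, so every run gets its own group via run_id.
print(concurrency_group("bench", "", "refs/heads/master", "8842"))
```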

jobs:
@@ -42,11 +42,21 @@ jobs:
       RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
       N_USERS: 8
       DURATION: 10m
+
+    strategy:
+      matrix:
+        model: [phi-2]
+        ftype: [q4_0, q8_0, f16]
+        include:
+          - model: phi-2
+            ftype: q4_0
+            pr_comment_enabled: "true"
+
     if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
     steps:
       - name: Clone
         id: checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           fetch-depth: 0
           ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
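The `strategy.matrix` added above fans the bench job out to one run per quantization type, with the `include` entry enabling the PR comment only for the `q4_0` variant. A rough Python model of how GitHub expands such a matrix (`expand_matrix` is a hypothetical helper, simplified to the rules this matrix actually uses):

```python
from itertools import product

def expand_matrix(matrix, include):
    """Expand a GitHub Actions-style build matrix: the cartesian product of
    the axis values, with each `include` entry merged into matching combos."""
    axes = {k: v for k, v in matrix.items() if k != "include"}
    keys = list(axes)
    combos = [dict(zip(keys, values)) for values in product(*axes.values())]
    for extra in include:
        for combo in combos:
            # Merge when every axis key the include entry mentions matches.
            if all(combo.get(k) == v for k, v in extra.items() if k in combo):
                combo.update(extra)
    return combos

matrix = {"model": ["phi-2"], "ftype": ["q4_0", "q8_0", "f16"]}
include = [{"model": "phi-2", "ftype": "q4_0", "pr_comment_enabled": "true"}]

jobs = expand_matrix(matrix, include)
print(len(jobs))  # 3 — one job per ftype; only the q4_0 job carries pr_comment_enabled
```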
@@ -116,7 +126,7 @@ jobs:
             --scenario script.js \
             --duration ${{ github.event.inputs.duration || env.DURATION }} \
             --hf-repo ggml-org/models \
-            --hf-file phi-2/ggml-model-q4_0.gguf \
+            --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
             --model-path-prefix /models \
             --parallel ${{ env.N_USERS }} \
             -ngl 33 \
@@ -134,7 +144,7 @@

       - uses: actions/upload-artifact@v4
         with:
-          name: benchmark-results
+          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
           compression-level: 9
           path: |
             examples/server/bench/*.jpg
@@ -143,11 +153,10 @@

       - name: Commit status
         uses: Sibz/github-status-action@v1
-        continue-on-error: true # If not authorized on external repo
         with:
           authToken: ${{secrets.GITHUB_TOKEN}}
           sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
-          context: bench-server-baseline
+          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
           description: |
             ${{ env.BENCH_RESULTS }}
           state: 'success'
@@ -204,21 +213,26 @@ jobs:
       - name: Comment PR
         uses: mshick/add-pr-comment@v2
         id: comment_pr
-        if: ${{ github.event.pull_request != '' }}
+        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
         with:
-          message-id: bench-${{ github.job }}-${{ env.RUNNER_LABEL }}
+          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
           message: |
-            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
-            <p align="center">
+
+            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀
+
+            </p>
+
             <details>

             <summary>Expand details for performance related PR only</summary>

             - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
             - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(90)=${{ env.HTTP_REQ_DURATION_P_90_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
             - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_TOKENS_AVG }}tk/s p(90)=${{ env.LLAMACPP_PROMPT_TOKENS_P_90_ }}tk/s **total=${{ env.LLAMACPP_PROMPT_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
             - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(90)=${{ env.LLAMACPP_TOKENS_SECOND_P_90_ }}tk/s **total=${{ env.LLAMACPP_COMPLETION_TOKENS_TOTAL_COUNTER_RATE }}tk/s**
             - ${{ env.BENCH_GRAPH_XLABEL }}

             <details>

             <summary>Time series</summary>

             <p align="center">