Merge branch 'master' into layla-build

l3utterfly · Feb 26, 2024 · 684fea9 · 684fea9
2 parents f507a78 + c393733
commit 684fea9
Show file tree

Hide file tree

Showing 64 changed files with 6,267 additions and 1,729 deletions.
diff --git a/.devops/nix/docker.nix b/.devops/nix/docker.nix
@@ -0,0 +1,37 @@
+{
+  lib,
+  dockerTools,
+  buildEnv,
+  llama-cpp,
+  interactive ? true,
+  coreutils,
+}:
+
+# A tar that can be fed into `docker load`:
+#
+# $ nix build .#llamaPackages.docker
+# $ docker load < result
+
+# For details and variations cf.
+# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
+# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
+# - https://nixery.dev/
+
+# Approximate (compressed) sizes, at the time of writing, are:
+#
+# .#llamaPackages.docker: 125M;
+# .#llamaPackagesCuda.docker: 537M;
+# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.
+
+dockerTools.buildLayeredImage {
+  name = llama-cpp.pname;
+  tag = "latest";
+
+  contents =
+    [ llama-cpp ]
+    ++ lib.optionals interactive [
+      coreutils
+      dockerTools.binSh
+      dockerTools.caCertificates
+    ];
+}
diff --git a/.devops/nix/scope.nix b/.devops/nix/scope.nix
@@ -12,5 +12,8 @@ lib.makeScope newScope (
   self: {
     inherit llamaVersion;
     llama-cpp = self.callPackage ./package.nix { };
+    docker = self.callPackage ./docker.nix { };
+    docker-min = self.callPackage ./docker.nix { interactive = false; };
+    sif = self.callPackage ./sif.nix { };
   }
 )
diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix
@@ -0,0 +1,27 @@
+{
+  lib,
+  singularity-tools,
+  llama-cpp,
+  bashInteractive,
+  interactive ? false,
+}:
+
+let
+    optionalInt = cond: x: if cond then x else 0;
+in
+singularity-tools.buildImage rec {
+  inherit (llama-cpp) name;
+  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];
+
+  # These are excessive (but safe) for most variants. Building singularity
+  # images requires superuser privileges, so we build them inside a VM in a
+  # writable image of pre-determined size.
+  #
+  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
+  #
+  # Expected image sizes:
+  # - cpu/blas: 150M,
+  # - cuda, all gencodes: 560M,
+  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
+  memSize = diskSize;
+}
diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md
@@ -7,3 +7,5 @@ assignees: ''
 ---
 
 Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
+
+If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -669,8 +669,7 @@ jobs:
         run: |
           cd examples/llama.android
 
-          # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
-          ./gradlew build --no-daemon -Pskip-armeabi-v7a
+          ./gradlew build --no-daemon
 
 #  freeBSD-latest:
 #    runs-on: macos-12

diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml
@@ -19,7 +19,6 @@ on:
 
 jobs:
   nix-build-aarch64:
-    if: ${{ vars.CACHIX_NAME != '' }}
     runs-on: ubuntu-latest
     steps:
     - name: Checkout repository
@@ -37,16 +36,16 @@ jobs:
         extra-conf: |
           extra-platforms = aarch64-linux
           extra-system-features = nixos-test kvm
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
     - uses: DeterminateSystems/magic-nix-cache-action@v2
       with:
         upstream-cache: https://${{ matrix.cachixName }}.cachix.org
     - name: Set-up cachix to push the results to
       uses: cachix/cachix-action@v13
       with:
         authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
+        name: llama-cpp
     - name: Show all output paths
       run: >
           nix run github:nix-community/nix-eval-jobs

diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml
@@ -23,8 +23,8 @@ jobs:
       with:
         github-token: ${{ secrets.GITHUB_TOKEN }}
         extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
     - uses: DeterminateSystems/magic-nix-cache-action@v2
       with:
         upstream-cache: https://${{ matrix.cachixName }}.cachix.org
@@ -37,7 +37,6 @@ jobs:
           --flake
           ".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
   nix-build:
-    if: ${{ vars.CACHIX_NAME != '' }}
     strategy:
       fail-fast: false
       matrix:
@@ -51,16 +50,16 @@ jobs:
       with:
         github-token: ${{ secrets.GITHUB_TOKEN }}
         extra-conf: |
-          extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
-          extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
+          extra-substituters = https://llama-cpp.cachix.org https://cuda-maintainers.cachix.org
+          extra-trusted-public-keys = llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc= cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
     - uses: DeterminateSystems/magic-nix-cache-action@v2
       with:
         upstream-cache: https://${{ matrix.cachixName }}.cachix.org
     - name: Set-up cachix to push the results to
       uses: cachix/cachix-action@v13
       with:
         authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
-        name: ${{ vars.CACHIX_NAME }}
+        name: llama-cpp
     - name: Build
       run: >
           nix run github:Mic92/nix-fast-build

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
@@ -0,0 +1,127 @@
+# Server build and tests
+name: Server
+
+on:
+  workflow_dispatch: # allows manual triggering
+  push:
+    branches:
+      - master
+      - test/server-add-ci-test # FIXME remove
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+
+jobs:
+  server:
+    runs-on: ubuntu-latest
+
+    strategy:
+      matrix:
+        build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan]
+        sanitizer: [ADDRESS, THREAD, UNDEFINED]
+        build_type: [Debug, Release]
+        include:
+          - build: 'noavx'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
+            image: ubuntu:latest
+          - build: 'avx2'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
+            image: ubuntu:latest
+          - build: 'avx'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
+            image: ubuntu:latest
+          - build: 'avx512'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON'
+            image: ubuntu:latest
+            experimental: true
+          - build: 'cublas'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON'
+            image: nvidia/cuda:12.3.1-devel-ubuntu22.04
+            arch_not_available: true # require nvidia docker engine
+          - build: 'clblast'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON'
+            image: ubuntu:latest
+            arch_not_available: true
+          - build: 'openblas'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
+            image: ubuntu:latest
+          - build: 'kompute'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
+            image: ubuntu:latest
+            arch_not_available: true
+          - build: 'vulkan'
+            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON'
+            image: ubuntu:latest
+            arch_not_available: true
+
+    container:
+      image: ${{ matrix.image }}
+      ports:
+        - 8888
+      options: --cpus 4
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          apt-get update
+          apt-get -y install \
+            build-essential \
+            pkg-config \
+            git \
+            cmake \
+            python3-pip \
+            wget \
+            psmisc
+
+      - name: Download CLBlast
+        id: get_clblast
+        if: ${{ matrix.build == 'clblast' }}
+        run: |
+          apt install -y libclblast-dev
+
+      - name: Download OpenBLAS
+        id: get_openblas
+        if: ${{ matrix.build == 'openblas' }}
+        run: |
+          apt-get -y install libopenblas-dev
+
+      - name: Install Vulkan SDK
+        id: get_vulkan
+        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
+        run: |
+          wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc
+          wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          apt-get update
+          apt-get -y install vulkan-sdk
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }}
+          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
+      - name: Download models
+        id: download_models
+        run: |
+          cd examples/server/tests
+          ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
+
+      - name: Tests
+        id: server_integration_test
+        continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }}
+        run: |
+          cd examples/server/tests
+          PORT=8888 ./tests.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -938,10 +938,16 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
             list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
-            # Raspberry Pi 2
-            list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+                # Android armeabi-v7a
+                list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
+            else()
+                # Raspberry Pi 2
+                list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
+            endif()
         endif()
         if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
+            # Android arm64-v8a
             # Raspberry Pi 3, 4, Zero 2 (32-bit)
             list(APPEND ARCH_FLAGS -mno-unaligned-access)
         endif()

diff --git a/Makefile b/Makefile
@@ -173,7 +173,7 @@ ifdef LLAMA_DEBUG
 	MK_LDFLAGS  += -g
 
 	ifeq ($(UNAME_S),Linux)
-		MK_CXXFLAGS += -Wp,-D_GLIBCXX_ASSERTIONS
+		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
 	MK_CPPFLAGS += -DNDEBUG
@@ -597,7 +597,7 @@ $(info I CC:        $(shell $(CC)   --version | head -n 1))
 $(info I CXX:       $(shell $(CXX)  --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC:      $(shell $(NVCC) --version | tail -n 1))
-CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])')
 ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
 ifndef CUDA_DOCKER_ARCH
 ifndef CUDA_POWER_ARCH
@@ -719,7 +719,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
 	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)

diff --git a/README.md b/README.md
@@ -10,13 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Hot topics
 
-- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com/ggerganov/llama.cpp/pull/5240
-- Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
-  - [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
-- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
-- Collecting Apple Silicon performance stats:
-  - M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
-  - A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
+- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
+- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
+- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
 - Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
 
 ----
@@ -107,6 +103,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
+- [x] [Gemma](https://ai.google.dev/gemma)
 
 **Multimodal models:**
 
@@ -117,6 +114,9 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 
+**HTTP server**
+
+[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
 
 **Bindings:**
 
@@ -145,6 +145,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
 - [nomic-ai/gpt4all](https://github.com/nomic-ai/gpt4all)
@@ -157,6 +158,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [semperai/amica](https://github.com/semperai/amica)
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
+- [Msty](https://msty.app) (proprietary)
 
 ---
 

diff --git a/build.zig b/build.zig
@@ -123,6 +123,7 @@ pub fn build(b: *std.build.Builder) !void {
     const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
     const train = make.obj("train", "common/train.cpp");
     const clip = make.obj("clip", "examples/llava/clip.cpp");
+    const llava = make.obj("llava", "examples/llava/llava.cpp");
 
     _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
     _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
@@ -131,7 +132,7 @@ pub fn build(b: *std.build.Builder) !void {
     _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
     _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }
Original file line number	Diff line number	Diff line change
Expand Up		@@ -7,3 +7,5 @@ assignees: ''
		---

		Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.

		If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).