[pull] master from ggerganov:master #10

Closed · wants to merge 33 commits

Commits (33)
58ba655
metal : enable shader debugging (cmake option) (#4705)
ggerganov Jan 2, 2024
775ac87
finetune: fix typo in README.md (#4733)
danbev Jan 2, 2024
26f3071
py : re-enable mmap in convert hf (#4732)
namtranase Jan 2, 2024
5d7002d
server : add --override-kv parameter (#4710)
minarchist Jan 2, 2024
32866c5
editorconfig : fix whitespace and indentation #4710
ggerganov Jan 2, 2024
83e633c
llama : differentiate the KV dims in the attention (#4657)
postmasters Jan 2, 2024
0040d42
llama : replace all API facing `int`'s with `int32_t` (#4577)
MarcusDunn Jan 2, 2024
540938f
llama : llama_model_desc print number of experts
ggerganov Jan 2, 2024
0ef3ca2
server : add token counts to html footer (#4738)
phiharri Jan 2, 2024
f3f62f0
metal : optimize ggml_mul_mat_id (faster Mixtral PP) (#4725)
ggerganov Jan 2, 2024
f2eb19b
server : throw an error when `slot unavailable` (#4741)
jparkerweb Jan 3, 2024
5f66ebc
ggml : extend ggml_get_rows, ggml_repeat, ggml_concat (ggml/639)
gwenzek Dec 29, 2023
ab62fc3
scripts : fix sync order + metal sed
ggerganov Jan 3, 2024
2893137
metal : add kernel_get_rows_i32
ggerganov Jan 3, 2024
75e3fd8
sync : ggml
ggerganov Jan 3, 2024
d55356d
cuda : mark I16 and I32 ops as unsupported
ggerganov Jan 3, 2024
7bed7eb
cuda : simplify expression
ggerganov Jan 3, 2024
ece9a45
swift : update Package.swift to use ggml as dependency (#4691)
1-ashraful-islam Jan 3, 2024
cb1e281
train : fix typo in overlapping-samples help msg (#4758)
danbev Jan 3, 2024
46cea79
llama.swiftui : fix build of ggml.metallib (#4754)
singularity-s0 Jan 4, 2024
dc891b7
ggml : include stdlib.h before intrin.h (#4736)
ggerganov Jan 4, 2024
e580431
server : fix options in README.md (#4765)
m18coppola Jan 4, 2024
3c0b585
llama.swiftui : support loading custom model from file picker (#4767)
singularity-s0 Jan 4, 2024
a919280
Print backend name on test-backend-ops failure (#4751)
JohannesGaessler Jan 4, 2024
012cf34
server : send token probs for "stream == false" (#4714)
ggerganov Jan 4, 2024
b3a7c20
finetune : remove unused includes (#4756)
danbev Jan 4, 2024
3681f22
examples : add few-shot translation example (#4783)
ggerganov Jan 5, 2024
c1d7cb2
ggml : do not sched_yield when calling BLAS (#4761)
ggerganov Jan 5, 2024
1bf681f
ggml : add error handling to graph_compute (whisper/1714)
finnvoor Jan 3, 2024
d061bf9
ggml : fix q2_k bpw in comments (ggml/680)
ggerganov Jan 5, 2024
91d3887
metal : switch back to default.metallib (ggml/681)
ggerganov Jan 5, 2024
be36bb9
flake.nix : fix typo (#4700)
eltociear Jan 5, 2024
eec22a1
cmake : check for openblas64 (#4134)
Jan 5, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion .devops/nix/package.nix
@@ -9,7 +9,7 @@
git,
python3,
mpi,
openblas, # TODO: Use the generic `blas` so users could switch betwen alternative implementations
openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
cudaPackages,
darwin,
rocmPackages,
42 changes: 38 additions & 4 deletions CMakeLists.txt
@@ -95,6 +95,7 @@ option(LLAMA_HIP_UMA "llama: use HIP unified memory arch
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)

@@ -154,9 +155,9 @@ if (APPLE AND LLAMA_ACCELERATE)
endif()

if (LLAMA_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

message(STATUS "Metal framework found")
set(GGML_HEADERS_METAL ggml-metal.h)
@@ -173,6 +174,35 @@ if (LLAMA_METAL)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

if (LLAMA_METAL_SHADER_DEBUG)
# custom command to do the following:
# xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
# xcrun -sdk macosx metallib ggml-metal.air -o default.metallib
#
# note: this is the only way I found to disable fast-math in Metal. it's ugly, but at least it works
# disabling fast math is needed in order to pass tests/test-backend-ops
# note: adding -fno-inline fixes the tests when using MTL_SHADER_VALIDATION=1
# note: unfortunately, we have to call it default.metallib instead of ggml.metallib
# ref: https://github.com/ggerganov/whisper.cpp/issues/1720
set(XC_FLAGS -fno-fast-math -fno-inline -g)
if (LLAMA_QKK_64)
set(XC_FLAGS ${XC_FLAGS} -DQK_K=64)
endif()

add_custom_command(
OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
COMMAND xcrun -sdk macosx metallib ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
DEPENDS ggml-metal.metal
COMMENT "Compiling Metal kernels"
)

add_custom_target(
ggml-metal ALL
DEPENDS ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
)
endif()

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
@@ -200,7 +230,11 @@ if (LLAMA_BLAS)
if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
pkg_check_modules(DepBLAS REQUIRED openblas)
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS REQUIRED openblas)
endif()
elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
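As a usage sketch (not part of the diff): the new `LLAMA_METAL_SHADER_DEBUG` option is enabled at configure time; the build directory layout and test binary name below are assumptions based on the repository's usual conventions.

```bash
# configure a Metal build with shader debugging (disables fast-math, keeps symbols)
mkdir -p build && cd build
cmake -DLLAMA_METAL=ON -DLLAMA_METAL_SHADER_DEBUG=ON ..
make -j

# the custom command above emits default.metallib next to the binaries;
# shader validation can then be enabled when running the backend tests
MTL_SHADER_VALIDATION=1 ./bin/test-backend-ops
```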
12 changes: 4 additions & 8 deletions Package.swift
@@ -13,21 +13,17 @@ let package = Package(
products: [
.library(name: "llama", targets: ["llama"]),
],
dependencies: [
.package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
],
targets: [
.target(
name: "llama",
dependencies: ["ggml"],
path: ".",
exclude: [],
sources: [
"ggml.c",
"llama.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m",
],
resources: [
.process("ggml-metal.metal")
],
publicHeadersPath: "spm-headers",
cSettings: [
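For context, a minimal sketch of building the package after this change; it assumes a recent Swift toolchain on macOS and is not part of the diff.

```bash
# run from the repository root; ggml sources are now compiled directly into the
# llama target instead of being fetched as a remote SwiftPM dependency
swift build -c release
```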
2 changes: 1 addition & 1 deletion awq-py/requirements.txt
@@ -1,2 +1,2 @@
torch>=2.0.0
torch>=2.1.1
transformers>=4.32.0
14 changes: 10 additions & 4 deletions ci/run.sh
@@ -30,6 +30,12 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA=""

if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
fi

## helpers

# download a file if it does not exist or if it is outdated
@@ -81,8 +87,8 @@ function gg_run_ctest_debug {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Debug .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

@@ -109,8 +115,8 @@ function gg_run_ctest_release {

set -e

(time cmake -DCMAKE_BUILD_TYPE=Release .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

if [ -z ${GG_BUILD_LOW_PERF} ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
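A hedged example of exercising the new `GG_BUILD_METAL` path locally; the result and mount directories follow the CI script's usual calling convention and are placeholders.

```bash
# run the CI script with Metal enabled, which now also turns on
# LLAMA_METAL_SHADER_DEBUG via CMAKE_EXTRA
mkdir -p tmp/results tmp/mnt
GG_BUILD_METAL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```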
2 changes: 1 addition & 1 deletion common/train.cpp
@@ -1107,7 +1107,7 @@ void print_common_train_usage(int /*argc*/, char ** /*argv*/, const struct train
fprintf(stderr, " --sample-start STR Sets the starting point for samples after the specified pattern. If empty use every token position as sample start. (default '%s')\n", params->sample_start.c_str());
fprintf(stderr, " --include-sample-start Include the sample start in the samples. (default off)\n");
fprintf(stderr, " --escape process sample start escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
fprintf(stderr, " --overlapping-samples Samples my overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --overlapping-samples Samples may overlap, will include sample-start of second and following samples. When off, samples will end at begin of next sample. (default off)\n");
fprintf(stderr, " --fill-with-next-samples Samples shorter than context length will be followed by the next (shuffled) samples. (default off)\n");
fprintf(stderr, " --separate-with-eos When fill-with-next-samples, insert end-of-sequence token between samples.%s\n", params->separate_with_eos ? " (default)" : "");
fprintf(stderr, " --separate-with-bos When fill-with-next-samples, insert begin-of-sequence token between samples.%s\n", params->separate_with_bos ? " (default)" : "");
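For illustration only, a sketch of how the corrected `--overlapping-samples` flag might be combined with the other sampling options described in this help text; the model and data file names are placeholders.

```bash
# hypothetical finetune invocation: samples start at "<s>" and may overlap
./finetune \
  --model-base open-llama-3b-v2-q8_0.gguf \
  --train-data shakespeare.txt \
  --sample-start "<s>" \
  --overlapping-samples
```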
2 changes: 1 addition & 1 deletion convert-hf-to-gguf.py
@@ -59,7 +59,7 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]:
from safetensors import safe_open
ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu"))
else:
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", weights_only=True))
ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True))

with ctx as model_part:
for name in model_part.keys():
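For reference, the converter is typically invoked along these lines (path and flags here are assumptions, not part of the change); with mmap re-enabled, non-safetensors shards are memory-mapped instead of being read fully into RAM.

```bash
# convert a local Hugging Face checkout to GGUF; torch.load now uses mmap=True
python3 convert-hf-to-gguf.py /path/to/hf-model --outfile model-f16.gguf --outtype f16
```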
56 changes: 56 additions & 0 deletions examples/base-translate.sh
@@ -0,0 +1,56 @@
#!/bin/bash
#
# Few-shot translation example.
# Requires a base model (i.e. no fine-tuned or instruct models).
#
# Usage:
#
# cd llama.cpp
# make -j
#
# ./examples/base-translate.sh <model-base> "<text>"
#

if [ $# -ne 2 ]; then
echo "Usage: ./base-translate.sh <model-base> \"<text>\""
exit 1
fi

ftmp="__llama.cpp_example_tmp__.txt"
trap "rm -f $ftmp" EXIT

echo "Translate from English to French:

===

sea otter, peppermint, plush girafe:

sea otter => loutre de mer
peppermint => menthe poivrée
plush girafe => girafe peluche

===

violin

violin => violon

===

phone, computer, mouse, keyboard:

phone => téléphone
computer => ordinateur
mouse => souris
keyboard => clavier

===
" > $ftmp

echo "$2
" >> $ftmp

model=$1

# generate the most likely continuation, run on the CPU until the string "===" is found
./main -m $model -f $ftmp -n 64 --temp 0 --repeat-penalty 1.0 --no-penalize-nl -ngl 0 -r "==="
2 changes: 1 addition & 1 deletion examples/finetune/README.md
@@ -61,7 +61,7 @@ For example to apply 40% of the 'shakespeare' LORA adapter, 80% of the 'bible' L
--lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```

The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values to big will sometimes result in worse output. Play around to find good values.
The scale numbers don't need to add up to one, and you can also use numbers greater than 1 to further increase the influence of an adapter. But making the values too big will sometimes result in worse output. Play around to find good values.

Gradient checkpointing reduces the memory requirements by ~50% but increases the runtime.
If you have enough RAM, you can make finetuning a bit faster by disabling checkpointing with `--no-checkpointing`.
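As a sketch of the scaling described above (the adapter file names are placeholders patterned on the README's example):

```bash
# mix multiple adapters with per-adapter weights; the values need not sum to 1
./main -m open-llama-3b-v2-q8_0.gguf \
  --lora-scaled lora-open-llama-3b-v2-q8_0-shakespeare-LATEST.bin 0.4 \
  --lora-scaled lora-open-llama-3b-v2-q8_0-bible-LATEST.bin 0.8 \
  --lora lora-open-llama-3b-v2-q8_0-yet-another-one-LATEST.bin
```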
6 changes: 0 additions & 6 deletions examples/finetune/finetune.cpp
@@ -3,15 +3,9 @@
#include "llama.h"
#include "common.h"
#include "train.h"
#include <unordered_map>
#include <vector>
#include <cassert>
#include <climits>
#include <cstring>
#include <cstdarg>
#include <ctime>
#include <random>
#include <stdexcept>
#include <algorithm>
#include <string>

examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj
@@ -9,7 +9,6 @@
/* Begin PBXBuildFile section */
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 542376072B0D9BFB008E6A1C /* ggml-quants.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
5423760B2B0D9C4B008E6A1C /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 5423760A2B0D9C4B008E6A1C /* ggml-backend.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09B2AC8723900A8AEE9 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL -DGGML_USE_K_QUANTS -O3"; }; };
542EA0A02AC8725700A8AEE9 /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 542EA09F2AC8725700A8AEE9 /* ggml-alloc.c */; settings = {COMPILER_FLAGS = "-O3"; }; };
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 542EA0A12AC8729100A8AEE9 /* llama.cpp */; settings = {COMPILER_FLAGS = "-DGGML_USE_K_QUANTS -DGGML_USE_METAL -O3"; }; };
@@ -24,6 +23,8 @@
8A3F84242AC4C891005E2EE8 /* models in Resources */ = {isa = PBXBuildFile; fileRef = 8A3F84232AC4C891005E2EE8 /* models */; };
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A907F322AC7134E006146EA /* LibLlama.swift */; };
8A9F7C4D2AC332EE008AE1EA /* LlamaState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */; };
F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 549479C82AC9E10B00E0F78B /* ggml-metal.metal */; };
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */; };
/* End PBXBuildFile section */

/* Begin PBXFileReference section */
@@ -52,6 +53,7 @@
8A3F84232AC4C891005E2EE8 /* models */ = {isa = PBXFileReference; lastKnownFileType = folder; name = models; path = llama.swiftui/Resources/models; sourceTree = "<group>"; };
8A907F322AC7134E006146EA /* LibLlama.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibLlama.swift; sourceTree = "<group>"; };
8A9F7C4C2AC332EE008AE1EA /* LlamaState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaState.swift; sourceTree = "<group>"; };
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LoadCustomButton.swift; sourceTree = "<group>"; };
/* End PBXFileReference section */

/* Begin PBXFrameworksBuildPhase section */
@@ -166,6 +168,7 @@
children = (
7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */,
8A1C83782AC328BD0096AF73 /* ContentView.swift */,
F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */,
);
path = UI;
sourceTree = "<group>";
@@ -241,7 +244,7 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
542378792ACE3F3500834A7B /* ggml-metal.metal in Resources */,
F1FE20DC2B465C4500B45541 /* ggml-metal.metal in Resources */,
8A3F84242AC4C891005E2EE8 /* models in Resources */,
8A1C837E2AC328BE0096AF73 /* Preview Assets.xcassets in Resources */,
8A1C837B2AC328BE0096AF73 /* Assets.xcassets in Resources */,
@@ -257,6 +260,7 @@
files = (
542376082B0D9BFB008E6A1C /* ggml-quants.c in Sources */,
549479CD2AC9E42A00E0F78B /* ggml-metal.m in Sources */,
F1FE20E22B465ECA00B45541 /* LoadCustomButton.swift in Sources */,
542EA09D2AC8723900A8AEE9 /* ggml.c in Sources */,
8A907F332AC7138A006146EA /* LibLlama.swift in Sources */,
542EA0A32AC8729100A8AEE9 /* llama.cpp in Sources */,
2 changes: 2 additions & 0 deletions examples/llama.swiftui/llama.swiftui/UI/ContentView.swift
@@ -103,6 +103,8 @@ struct ContentView: View {
ContentView.cleanupModelCaches()
llamaState.cacheCleared = true
}

LoadCustomButton(llamaState: llamaState)
}
.padding(.top, 4)
.font(.system(size: 12))
44 changes: 44 additions & 0 deletions examples/llama.swiftui/llama.swiftui/UI/LoadCustomButton.swift
@@ -0,0 +1,44 @@
import SwiftUI
import UniformTypeIdentifiers

struct LoadCustomButton: View {
@ObservedObject private var llamaState: LlamaState
@State private var showFileImporter = false

init(llamaState: LlamaState) {
self.llamaState = llamaState
}

var body: some View {
VStack {
Button(action: {
showFileImporter = true
}) {
Text("Load Custom Model")
}
}
.fileImporter(
isPresented: $showFileImporter,
allowedContentTypes: [UTType(filenameExtension: "gguf", conformingTo: .data)!],
allowsMultipleSelection: false
) { result in
switch result {
case .success(let files):
files.forEach { file in
let gotAccess = file.startAccessingSecurityScopedResource()
if !gotAccess { return }

do {
try llamaState.loadModel(modelUrl: file.absoluteURL)
} catch let err {
print("Error: \(err.localizedDescription)")
}

file.stopAccessingSecurityScopedResource()
}
case .failure(let error):
print(error)
}
}
}
}
12 changes: 6 additions & 6 deletions examples/server/README.md
@@ -168,6 +168,12 @@ node index.js

`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)

`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

*Result JSON:*

Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion.
@@ -198,12 +204,6 @@

`truncated`: Boolean indicating if the context size was exceeded during generation, i.e. the number of tokens provided in the prompt (`tokens_evaluated`) plus tokens generated (`tokens predicted`) exceeded the context size (`n_ctx`)

`slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1)

`cache_prompt`: Save the prompt and generation for avoid reprocess entire prompt if a part of this isn't change (default: false)

`system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

- **POST** `/tokenize`: Tokenize a given text.

*Options:*
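A hedged example of a `/completion` request exercising the relocated fields (`slot_id`, `cache_prompt`); the host, port, and prompt are placeholders.

```bash
curl -s http://127.0.0.1:8080/completion \
  -H "Content-Type: application/json" \
  -d '{
        "prompt": "Building a website can be done in 10 simple steps:",
        "n_predict": 64,
        "slot_id": -1,
        "cache_prompt": true
      }'
```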