From 5ef07e25ac39e62297a67208c5bcced50835a2dd Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 12 Aug 2024 10:21:50 +0300
Subject: [PATCH 01/11] server : handle models with missing EOS token (#8997)

ggml-ci
---
 examples/server/server.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 360f571e42867..1621c7c43961c 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -631,6 +631,7 @@ struct server_context {
 
     bool clean_kv_cache = true;
     bool add_bos_token  = true;
+    bool has_eos_token  = false;
 
     int32_t n_ctx; // total context for all clients / slots
 
@@ -693,7 +694,7 @@ struct server_context {
         n_ctx = llama_n_ctx(ctx);
 
         add_bos_token = llama_should_add_bos_token(model);
-        GGML_ASSERT(llama_add_eos_token(model) != 1);
+        has_eos_token = llama_add_eos_token(model) != 1;
 
         return true;
     }
@@ -1031,7 +1032,7 @@ struct server_context {
         {
             slot.sparams.logit_bias.clear();
 
-            if (json_value(data, "ignore_eos", false)) {
+            if (json_value(data, "ignore_eos", false) && has_eos_token) {
                 slot.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
             }
 

From d3ae0ee8d75033921a076131d4d0fa1c6ec579a7 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Mon, 12 Aug 2024 11:02:01 +0300
Subject: [PATCH 02/11] py : fix requirements check '==' -> '~=' (#8982)

* py : fix requirements check '==' -> '~='

* cont : fix the fix

* ci : run on all requirements.txt
---
 .github/workflows/python-check-requirements.yml | 6 ++----
 examples/llava/requirements.txt                 | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml
index 4e0374fc63d95..46e80aecd0a0c 100644
--- a/.github/workflows/python-check-requirements.yml
+++ b/.github/workflows/python-check-requirements.yml
@@ -6,15 +6,13 @@ on:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'
   pull_request:
     paths:
       - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
-      - 'requirements.txt'
-      - 'requirements/*.txt'
+      - '**/requirements*.txt'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index dfe5fbe62cea6..cbcbf26c9b4e9 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -2,4 +2,4 @@
 --extra-index-url https://download.pytorch.org/whl/cpu
 pillow~=10.2.0
 torch~=2.2.1
-torchvision==0.17.1
+torchvision~=0.17.1

From 2589292cde038ba876c041bcd7b3f0c81f3f11fe Mon Sep 17 00:00:00 2001
From: Liu Jia <jia3.liu@intel.com>
Date: Mon, 12 Aug 2024 17:46:03 +0800
Subject: [PATCH 03/11] Fix a spelling mistake (#9001)

---
 src/llama-sampling.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp
index 8910f6d6542e9..8f4841d9daf7b 100644
--- a/src/llama-sampling.cpp
+++ b/src/llama-sampling.cpp
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
             constexpr float bucket_low   = -10.0f;
             constexpr float bucket_high  =  10.0f;
             constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
-            constexpr float bucker_inter = -bucket_low * bucket_scale;
+            constexpr float bucket_inter = -bucket_low * bucket_scale;
 
             std::vector<int> bucket_idx(candidates->size);
             std::vector<int> histo(nbuckets, 0);
 
             for (int i = 0; i < (int)candidates->size; ++i) {
                 const float val = candidates->data[i].logit;
-                int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+                int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
                 ib = std::max(0, std::min(nbuckets-1, ib));
                 bucket_idx[i] = ib;
                 ++histo[ib];

From df5478fbea7e652cfad4ee7974ac3b624fd6c7f6 Mon Sep 17 00:00:00 2001
From: DavidKorczynski <david@adalogics.com>
Date: Mon, 12 Aug 2024 13:21:41 +0100
Subject: [PATCH 04/11] ggml: fix div-by-zero (#9003)

Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=70724

In order to access the above bug you need to login using one of the
emails in
https://github.com/google/oss-fuzz/blob/master/projects/llamacpp/project.yaml#L3-L5

Signed-off-by: David Korczynski <david@adalogics.com>
---
 ggml/src/ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 38990e3a05a3f..c9b0e81684903 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -21129,7 +21129,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                 (int64_t) info->ne[2] *
                 (int64_t) info->ne[3];
 
-            if (ne % ggml_blck_size(info->type) != 0) {
+            if (ggml_blck_size(info->type) == 0 || ne % ggml_blck_size(info->type) != 0) {
                 fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                         __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
                 fclose(file);

From 1262e7ed13ac197c944f15e1ddb083cb4f36cf65 Mon Sep 17 00:00:00 2001
From: DavidKorczynski <david@adalogics.com>
Date: Mon, 12 Aug 2024 13:36:41 +0100
Subject: [PATCH 05/11] grammar-parser : fix possible null-deref (#9004)

Fixes: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=70680

Signed-off-by: David Korczynski <david@adalogics.com>
---
 common/grammar-parser.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp
index a518b766dc33e..438452eab570f 100644
--- a/common/grammar-parser.cpp
+++ b/common/grammar-parser.cpp
@@ -369,6 +369,9 @@ namespace grammar_parser {
             }
             // Validate the state to ensure that all rules are defined
             for (const auto & rule : state.rules) {
+                if (rule.empty()) {
+                    throw std::runtime_error("Undefined rule");
+                }
                 for (const auto & elem : rule) {
                     if (elem.type == LLAMA_GRETYPE_RULE_REF) {
                         // Ensure that the rule at that location exists

From 84eb2f4fad28ceadd415a4e775320c983f4d9a7d Mon Sep 17 00:00:00 2001
From: Frank Mai <thxcode0824@gmail.com>
Date: Mon, 12 Aug 2024 20:45:50 +0800
Subject: [PATCH 06/11] docs: introduce gpustack and gguf-parser (#8873)

* readme: introduce gpustack

GPUStack is an open-source GPU cluster manager for running large
language models, which uses llama.cpp as the backend.

Signed-off-by: thxCode <thxcode0824@gmail.com>

* readme: introduce gguf-parser

GGUF Parser is a tool to review/check the GGUF file and estimate the
memory usage without downloading the whole model.

Signed-off-by: thxCode <thxcode0824@gmail.com>

---------

Signed-off-by: thxCode <thxcode0824@gmail.com>
---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 1283f6805874e..7f48fde6e5f5a 100644
--- a/README.md
+++ b/README.md
@@ -186,10 +186,12 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 
 - [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
 - [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption
+- [gpustack/gguf-parser](https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser) - review/check the GGUF file and estimate the memory usage
 
 **Infrastructure:**
 
 - [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp
+- [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs
 
 **Games:**
 - [Lucy's Labyrinth](https://github.com/MorganRO8/Lucys_Labyrinth) - A simple maze game where agents controlled by an AI model will try to trick you.

From 0fd93cdef5e583aa980b3c0d693c0d207f0787a7 Mon Sep 17 00:00:00 2001
From: Nico Bosshard <nico@bosshome.ch>
Date: Mon, 12 Aug 2024 17:13:59 +0200
Subject: [PATCH 07/11] llama : model-based max number of graph nodes
 calculation (#8970)

* llama : model-based max number of graph nodes calculation

* Update src/llama.cpp

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 src/llama.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index aaf8db496ecbd..7f2f0003142a3 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -3575,13 +3575,8 @@ namespace GGUFMeta {
 
 using llama_buf_map = std::unordered_map<uint32_t, ggml_backend_buffer_t>;
 
-// TODO: update when needed or think of some clever automatic way to do this
-static size_t llama_model_max_nodes(const llama_model & /*model*/) {
-    //if (model.arch == LLM_ARCH_LLAMA && model.hparams.n_layer > ??) { // llama-3 405B
-    //    return 32768;
-    //}
-
-    return 8192;
+static size_t llama_model_max_nodes(const llama_model & model) {
+    return std::max<size_t>(8192, model.tensors_by_name.size()*5);
 }
 
 struct llama_model_loader {

From 1f67436c5ee6f4c99e71a8518bdfc214c27ce934 Mon Sep 17 00:00:00 2001
From: Radoslav Gerganov <rgerganov@gmail.com>
Date: Mon, 12 Aug 2024 19:17:03 +0300
Subject: [PATCH 08/11] ci : enable RPC in all of the released builds (#9006)

ref: #8912
---
 .github/workflows/build.yml | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b9246659a6ef0..74b5d4f69d790 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -47,7 +47,7 @@ jobs:
           sysctl -a
           mkdir build
           cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF ..
+          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF ..
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -105,7 +105,7 @@ jobs:
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
-          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
 
       - name: Test
@@ -222,7 +222,7 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DBUILD_SHARED_LIBS=OFF
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=OFF
           cmake --build . --config Release -j $(nproc)
 
       - name: Test
@@ -696,22 +696,20 @@ jobs:
     strategy:
       matrix:
         include:
-          - build: 'rpc-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'noavx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx2-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'avx-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
           - build: 'avx512-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'openblas-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
           - build: 'kompute-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'vulkan-x64'
-            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+            defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'llvm-arm64'
             defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'msvc-arm64'

From fc4ca27b25464a11b3b86c9dbb5b6ed6065965c2 Mon Sep 17 00:00:00 2001
From: Diogo Teles Sant'Anna <diogoteles@google.com>
Date: Mon, 12 Aug 2024 13:28:23 -0300
Subject: [PATCH 09/11] ci : fix github workflow vulnerable to script injection
 (#9008)

Signed-off-by: Diogo Teles Sant'Anna <diogoteles@google.com>
---
 .github/workflows/bench.yml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index eb69b82c47e64..56d22bc0cc394 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -129,6 +129,8 @@ jobs:
 
       - name: Server bench
         id: server_bench
+        env:
+            HEAD_REF: ${{ github.head_ref || github.ref_name }}
         run: |
           set -eux
 
@@ -137,7 +139,7 @@ jobs:
           python bench.py \
               --runner-label ${{ env.RUNNER_LABEL }} \
               --name ${{ github.job }} \
-              --branch ${{ github.head_ref || github.ref_name }} \
+              --branch $HEAD_REF \
               --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
               --scenario script.js \
               --duration ${{ github.event.inputs.duration || env.DURATION }} \

From 828d6ff7d796f48b2c345f6be2805a3c531a089c Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen <thichthat@gmail.com>
Date: Tue, 13 Aug 2024 11:41:14 +0200
Subject: [PATCH 10/11] export-lora : throw error if lora is quantized (#9002)

---
 examples/export-lora/README.md       |  6 ++---
 examples/export-lora/export-lora.cpp | 35 +++++++++++++++++++++-------
 2 files changed, 29 insertions(+), 12 deletions(-)

diff --git a/examples/export-lora/README.md b/examples/export-lora/README.md
index 91c33c34acaa9..7dce99c9a9e61 100644
--- a/examples/export-lora/README.md
+++ b/examples/export-lora/README.md
@@ -17,9 +17,9 @@ For example:
 
 ```bash
 ./bin/llama-export-lora \
-    -m open-llama-3b-v2-q8_0.gguf \
-    -o open-llama-3b-v2-q8_0-english2tokipona-chat.gguf \
-    --lora lora-open-llama-3b-v2-q8_0-english2tokipona-chat-LATEST.gguf
+    -m open-llama-3b-v2.gguf \
+    -o open-llama-3b-v2-english2tokipona-chat.gguf \
+    --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf
 ```
 
 Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters:
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp
index 3176d6e26ef8b..c7e5ca78845ee 100644
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -10,6 +10,12 @@
 
 static bool g_verbose = false;
 
+struct tensor_transformation {
+    struct ggml_tensor * in;
+    struct ggml_tensor * out;
+    bool is_copy;
+};
+
 static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
     int id = gguf_find_key(ctx_gguf, key.c_str());
     return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
@@ -198,8 +204,7 @@ struct lora_merge_ctx {
         }
 
         // mapping base tensor to out tensor (same shape with base, but different type)
-        // if out_tensor == nullptr, we only copy it
-        std::vector<std::pair<struct ggml_tensor *, struct ggml_tensor *>> base_to_out_tensors;
+        std::vector<tensor_transformation> trans;
         for (auto & it : base_model.tensors) {
             bool t_a = true;
             bool t_b = true;
@@ -212,14 +217,22 @@ struct lora_merge_ctx {
                 // only copy
                 struct ggml_tensor * cpy_tensor = ggml_dup_tensor(ctx_out_ggml, base_tensor);
                 ggml_set_name(cpy_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(cpy_tensor, nullptr));
+                trans.push_back({
+                    cpy_tensor,
+                    cpy_tensor,
+                    true,
+                });
                 gguf_add_tensor(ctx_out, cpy_tensor);
             } else if (t_a && t_b) {
                 // need merging
                 struct ggml_tensor * out_tensor = ggml_new_tensor(
                     ctx_out_ggml, get_out_tensor_type(base_tensor), GGML_MAX_DIMS, base_tensor->ne);
                 ggml_set_name(out_tensor, base_tensor->name);
-                base_to_out_tensors.push_back(std::make_pair(base_tensor, out_tensor));
+                trans.push_back({
+                    base_tensor,
+                    out_tensor,
+                    false,
+                });
                 gguf_add_tensor(ctx_out, out_tensor);
             } else {
                 throw std::runtime_error("tensor " + it.first + " missing either lora_a or lora_b");
@@ -234,12 +247,12 @@ struct lora_merge_ctx {
 
         // process base model tensors
         size_t n_merged = 0;
-        for (auto & it : base_to_out_tensors) {
-            if (it.second != nullptr) {
-                merge_tensor(it.first, it.second);
+        for (auto & it : trans) {
+            if (!it.is_copy) {
+                merge_tensor(it.in, it.out);
                 n_merged++;
             } else {
-                copy_tensor(it.first);
+                copy_tensor(it.in);
             }
         }
 
@@ -252,7 +265,7 @@ struct lora_merge_ctx {
         }
 
         printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged);
-        printf("%s : wrote %ld tensors to output file\n", __func__, base_to_out_tensors.size());
+        printf("%s : wrote %ld tensors to output file\n", __func__, trans.size());
     }
 
     void copy_tensor(struct ggml_tensor * base) {
@@ -285,6 +298,10 @@ struct lora_merge_ctx {
         for (size_t i = 0; i < adapters.size(); ++i) {
             auto t_a = adapters[i]->get_tensor(name_lora_a);
             auto t_b = adapters[i]->get_tensor(name_lora_b);
+            // TODO: add support for quantized lora
+            if (ggml_is_quantized(t_a->type) || ggml_is_quantized(t_b->type)) {
+                throw std::runtime_error("quantized LoRA adapters is not supported, please retry with f16 or f32");
+            }
             inp_a[i] = ggml_dup_tensor(ctx, t_a);
             inp_b[i] = ggml_dup_tensor(ctx, t_b);
         }

From 06943a69f678fb32829ff06d9c18367b17d4b361 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 13 Aug 2024 21:13:15 +0200
Subject: [PATCH 11/11] ggml : move rope type enum to ggml.h (#8949)

* ggml : move rope type enum to ggml.h

This commit moves the `llama_rope_type` enum from `llama.h` to
`ggml.h` and changes its name to `ggml_rope_type`.

The motivation for this change is to address the TODO in `llama.h` and
use the enum in ggml.

Note: This commit does not change the `mode` parameter to be of type
`enum ggml_rope_type`. The name `mode` and its usage suggest that it
might be more generic and possibly used as a bit field for multiple
flags. Further investigation/discussion may be needed to determine
if `mode` should be restricted to RoPE types.

* squash! ggml : move rope type enum to ggml.h

This commit removes GGML_ROPE_TYPE_NONE and GGML_ROPE_TYPE_GLM from
ggml.h, and back the llama_rope_type enum.

I've kept the assert for GGML_ROPE_TYPE_GLM as I'm not sure if it is
safe to remove it yet.

* squash! ggml : move rope type enum to ggml.h

This commit removes the enum ggml_rope_type from ggml.h and replaces it
with a define (GGML_ROPE_TYPE_NEOX). This define is used in the code to
check if the mode is set to GPT-NeoX. Also the enum llama_rope_type has
been updated to reflect this change.

* squash! ggml : move rope type enum to ggml.h

This commit contains a suggestion enable the GGML_ROPE_TYPE_NEOX
macro/define to be passed to the shader compiler.

* squash! ggml : move rope type enum to ggml.h

This commit fixes the editorconfig-checker warnings.

* squash! ggml : move rope type enum to ggml.h

Update comment for ggml_rope function.

* Revert "squash! ggml : move rope type enum to ggml.h"

This reverts commit 6261222bd0dc0efd51f0fb0435ad3f16a5b52fd6.

* squash! ggml : move rope type enum to ggml.h

Add GGML_ROPE_TYPE_NEOX to rope_common.comp.

* remove extra line

---------

Co-authored-by: slaren <slarengh@gmail.com>
---
 ggml/include/ggml.h                       | 6 ++++--
 ggml/src/ggml-cann/aclnn_ops.cpp          | 2 +-
 ggml/src/ggml-cuda/rope.cu                | 2 +-
 ggml/src/ggml-metal.m                     | 2 +-
 ggml/src/ggml-sycl/rope.cpp               | 2 +-
 ggml/src/ggml-vulkan.cpp                  | 2 +-
 ggml/src/ggml.c                           | 4 ++--
 ggml/src/kompute-shaders/op_rope_f16.comp | 2 +-
 ggml/src/kompute-shaders/op_rope_f32.comp | 2 +-
 ggml/src/kompute-shaders/rope_common.comp | 2 ++
 include/llama.h                           | 7 ++-----
 11 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 15602a96df7ad..1d2a354024675 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -244,6 +244,8 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
+#define GGML_ROPE_TYPE_NEOX 2
+
 #define GGUF_MAGIC "GGUF"
 
 #define GGUF_VERSION 3
@@ -1453,8 +1455,8 @@ extern "C" {
             struct ggml_tensor  * b);
 
     // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
-    // if mode & 2 == 1, GPT-NeoX style
+    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+    // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
     GGML_API struct ggml_tensor * ggml_rope(
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 8c4132f5bb7ad..a4ec8418e2ab3 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -2881,7 +2881,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast,
                              beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     // init cos/sin cache
     ggml_cann_pool_alloc sin_allocator(
diff --git a/ggml/src/ggml-cuda/rope.cu b/ggml/src/ggml-cuda/rope.cu
index 99ec1dd98ca9c..88f586d689cfd 100644
--- a/ggml/src/ggml-cuda/rope.cu
+++ b/ggml/src/ggml-cuda/rope.cu
@@ -226,7 +226,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     const int32_t * pos = (const int32_t *) src1_d;
 
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m
index aad189430ab0b..995f1934bc73b 100644
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -2313,7 +2313,7 @@ static enum ggml_status ggml_metal_graph_compute(
                         memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
                         memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
-                        const bool is_neox = mode & 2;
+                        const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
                         id<MTLComputePipelineState> pipeline = nil;
 
diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp
index c7545bcc1a8a9..1f06f78fa3d91 100644
--- a/ggml/src/ggml-sycl/rope.cpp
+++ b/ggml/src/ggml-sycl/rope.cpp
@@ -226,7 +226,7 @@ void ggml_sycl_op_rope(
     memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
     memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     const int32_t * pos = (const int32_t *) src1_dd;
 
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp
index 86732837254f0..c0504e43429be 100644
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -4053,7 +4053,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
     case GGML_OP_ROPE:
         {
             const int mode = ((const int32_t *) dst->op_params)[2];
-            const bool is_neox = mode & 2;
+            const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
             if (is_neox) {
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index c9b0e81684903..88e4fb7325dd9 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -14094,7 +14094,7 @@ static void ggml_compute_forward_rope_f32(
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
@@ -14219,7 +14219,7 @@ static void ggml_compute_forward_rope_f16(
     float corr_dims[2];
     ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
 
-    const bool is_neox = mode & 2;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
 
     const float * freq_factors = NULL;
     if (src2 != NULL) {
diff --git a/ggml/src/kompute-shaders/op_rope_f16.comp b/ggml/src/kompute-shaders/op_rope_f16.comp
index 1a4058b3f1f10..0ecfb2eab527c 100644
--- a/ggml/src/kompute-shaders/op_rope_f16.comp
+++ b/ggml/src/kompute-shaders/op_rope_f16.comp
@@ -11,7 +11,7 @@ void main() {
     const uint i2 = gl_WorkGroupID.y;
     const uint i1 = gl_WorkGroupID.x;
 
-    const bool is_neox = (pcs.mode & 2) != 0;
+    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
 
     float corr_dims[2];
     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
diff --git a/ggml/src/kompute-shaders/op_rope_f32.comp b/ggml/src/kompute-shaders/op_rope_f32.comp
index 65e03827a2660..cec0fd9a5d10c 100644
--- a/ggml/src/kompute-shaders/op_rope_f32.comp
+++ b/ggml/src/kompute-shaders/op_rope_f32.comp
@@ -11,7 +11,7 @@ void main() {
     const uint i2 = gl_WorkGroupID.y;
     const uint i1 = gl_WorkGroupID.x;
 
-    const bool is_neox = (pcs.mode & 2) != 0;
+    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
 
     float corr_dims[2];
     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
diff --git a/ggml/src/kompute-shaders/rope_common.comp b/ggml/src/kompute-shaders/rope_common.comp
index 7b9394cb2fffc..df4702896d46f 100644
--- a/ggml/src/kompute-shaders/rope_common.comp
+++ b/ggml/src/kompute-shaders/rope_common.comp
@@ -1,5 +1,7 @@
 #include "common.comp"
 
+#define GGML_ROPE_TYPE_NEOX 2
+
 // TODO: use a local size of 32 or more (Metal uses 1024)
 layout(local_size_x = 1) in;
 
diff --git a/include/llama.h b/include/llama.h
index ce07f4fac8f10..3c28cf0b509fb 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -95,13 +95,10 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
     };
 
-    // note: these values should be synchronized with ggml_rope
-    // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
         LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM =  0,
-        LLAMA_ROPE_TYPE_NEOX =  2,
-        LLAMA_ROPE_TYPE_GLM  =  4,
+        LLAMA_ROPE_TYPE_NORM = 0,
+        LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
     };
 
     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file