From e0324285a569d0583cf2f4a07a2402221ee25f58 Mon Sep 17 00:00:00 2001
From: stduhpf <stephduh@live.fr>
Date: Tue, 16 Jan 2024 12:04:32 +0100
Subject: [PATCH 01/25] speculative : threading options (#4959)

* speculative: expose draft threading

* fix usage format

* accept -td and -tbd args

* speculative: revert default behavior when -td is unspecified

* fix trailing whitespace
---
 common/common.cpp                    | 22 ++++++++++++++++++++++
 common/common.h                      |  2 ++
 examples/speculative/speculative.cpp |  4 ++++
 3 files changed, 28 insertions(+)

diff --git a/common/common.cpp b/common/common.cpp
index c11006bcb9175..2b0865fff0e62 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -167,6 +167,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             if (params.n_threads_batch <= 0) {
                 params.n_threads_batch = std::thread::hardware_concurrency();
             }
+        } else if (arg == "-td" || arg == "--threads-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_draft = std::stoi(argv[i]);
+            if (params.n_threads_draft <= 0) {
+                params.n_threads_draft = std::thread::hardware_concurrency();
+            }
+        } else if (arg == "-tbd" || arg == "--threads-batch-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads_batch_draft = std::stoi(argv[i]);
+            if (params.n_threads_batch_draft <= 0) {
+                params.n_threads_batch_draft = std::thread::hardware_concurrency();
+            }
         } else if (arg == "-p" || arg == "--prompt") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -845,6 +863,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -t N, --threads N     number of threads to use during generation (default: %d)\n", params.n_threads);
     printf("  -tb N, --threads-batch N\n");
     printf("                        number of threads to use during batch and prompt processing (default: same as --threads)\n");
+    printf("  -td N, --threads-draft N\n");
+    printf("                        number of threads to use during generation (default: same as --threads)\n");
+    printf("  -tbd N, --threads-batch-draft N\n");
+    printf("                        number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
     printf("  -p PROMPT, --prompt PROMPT\n");
     printf("                        prompt to start generation with (default: empty)\n");
     printf("  -e, --escape          process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
diff --git a/common/common.h b/common/common.h
index 096468243d88c..1f43e6282f48d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -46,7 +46,9 @@ struct gpt_params {
     uint32_t seed                           = -1;    // RNG seed
 
     int32_t n_threads                       = get_num_physical_cores();
+    int32_t n_threads_draft                 = -1;    // number of threads to use for the draft model (-1 = use n_threads)
     int32_t n_threads_batch                 = -1;    // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft           = -1;    // number of threads to use for draft model batch processing (-1 = same as the draft generation threads)
     int32_t n_predict                       = -1;    // new tokens to predict
     int32_t n_ctx                           = 512;   // context size
     int32_t n_batch                         = 512;   // batch size for prompt processing (must be >=32 to use BLAS)
diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp
index 20f1fb5bfcd99..7b3af01f339a9 100644
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@@ -65,6 +65,10 @@ int main(int argc, char ** argv) {
     // load the draft model
     params.model = params.model_draft;
     params.n_gpu_layers = params.n_gpu_layers_draft;
+    if (params.n_threads_draft > 0) {
+        params.n_threads = params.n_threads_draft;
+    }
+    params.n_threads_batch = params.n_threads_batch_draft;
     std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
 
     {

From d75c232e1da56f19ac4d2530dadbe0ab3a11fde5 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Jan 2024 12:14:19 +0100
Subject: [PATCH 02/25] finetune : use LLAMA_FILE_MAGIC_GGLA (#4961)

This commit replaces the magic number LLAMA_FILE_MAGIC_LORA used in
finetune.cpp with LLAMA_FILE_MAGIC_GGLA defined in llama.h.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/finetune/finetune.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index eaca42fc1c356..a6620fd73ca18 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1138,9 +1138,8 @@ static void save_as_llama_lora(const char * filename, struct my_llama_lora * lor
         return tn_buf.data();
     };
 
-    uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla'
     // write_magic
-    file.write_u32(LLAMA_FILE_MAGIC_LORA);   // magic
+    file.write_u32(LLAMA_FILE_MAGIC_GGLA);   // magic
     file.write_u32(1); // version
     // write_hparams
     file.write_u32(lora->hparams.lora_r);

From a0b3ac8c48b66206b9c5921ce57bd5c0ea6557c3 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Tue, 16 Jan 2024 03:16:33 -0800
Subject: [PATCH 03/25] ggml : introduce GGML_CALL function annotation (#4850)

This change makes it possible to build ggml-cuda.cu and ggml-metal.m as
independent dynamic shared objects, that may be conditionally linked at
runtime in a multiplatform binary. It introduces a GGML_CALL annotation
that documents which functions have a cyclic call relationship, between
the application code and GPU modules.

This change does nothing unless the build defines -DGGML_MULTIPLATFORM,
which causes back-references and function pointers to conform to the MS
ABI, which is supported by NVCC, ROCm, XCode, GCC and Clang across platforms.
---
 ggml-backend-impl.h |  60 +++++++++++-----------
 ggml-backend.c      |  80 ++++++++++++++---------------
 ggml-backend.h      |  50 +++++++++---------
 ggml-cuda.cu        | 121 ++++++++++++++++++++++----------------------
 ggml-cuda.h         |  32 ++++++------
 ggml-metal.h        |   4 +-
 ggml-metal.m        |  42 +++++++--------
 ggml.c              |  32 ++++++------
 ggml.h              |  58 ++++++++++++---------
 9 files changed, 244 insertions(+), 235 deletions(-)

diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h
index 1db32901fe6c7..1397828d9ac71 100644
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@@ -16,14 +16,14 @@ extern "C" {
     typedef void * ggml_backend_buffer_type_context_t;
 
     struct ggml_backend_buffer_type_i {
-        const char *          (*get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
         // check if tensor data is in host memory
         // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*is_host)         (ggml_backend_buffer_type_t buft);
+        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
     };
 
     struct ggml_backend_buffer_type {
@@ -35,15 +35,15 @@ extern "C" {
     typedef void * ggml_backend_buffer_context_t;
 
     struct ggml_backend_buffer_i {
-        const char * (*get_name)   (ggml_backend_buffer_t buffer);
-        void         (*free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*get_base)   (ggml_backend_buffer_t buffer);
-        void         (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
     };
 
     struct ggml_backend_buffer {
@@ -54,7 +54,7 @@ extern "C" {
         enum ggml_backend_buffer_usage usage;
     };
 
-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                    ggml_backend_buffer_type_t      buft,
             struct ggml_backend_buffer_i           iface,
                    ggml_backend_buffer_context_t   context,
@@ -70,31 +70,31 @@ extern "C" {
     typedef void * ggml_backend_context_t;
 
     struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
 
-        void (*free)(ggml_backend_t backend);
+        void (*GGML_CALL free)(ggml_backend_t backend);
 
         // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
 
         // (optional) asynchronous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
 
         // (optional) complete all pending operations
-        void (*synchronize)(ggml_backend_t backend);
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);
 
         // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 
         // compute graph without a plan (async)
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
 
         // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
     };
 
     struct ggml_backend {
@@ -107,9 +107,9 @@ extern "C" {
     // Backend registry
     //
 
-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
 
-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml-backend.c b/ggml-backend.c
index 505dbba476253..f5424fb904117 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -19,7 +19,7 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_name(buft);
 }
 
-ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     return buft->iface.alloc_buffer(buft, size);
 }
 
@@ -27,7 +27,7 @@ size_t ggml_backend_buft_get_alignment(ggml_backend_buffer_type_t buft) {
     return buft->iface.get_alignment(buft);
 }
 
-size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
     // get_alloc_size is optional, defaults to ggml_nbytes
     if (buft->iface.get_alloc_size) {
         return buft->iface.get_alloc_size(buft, tensor);
@@ -48,7 +48,7 @@ bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
 
 // backend buffer
 
-ggml_backend_buffer_t ggml_backend_buffer_init(
+GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                ggml_backend_buffer_type_t      buft,
         struct ggml_backend_buffer_i           iface,
                ggml_backend_buffer_context_t   context,
@@ -95,7 +95,7 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
     return base;
 }
 
-void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+GGML_CALL void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
     // init_tensor is optional
     if (buffer->iface.init_tensor) {
         buffer->iface.init_tensor(buffer, tensor);
@@ -191,7 +191,7 @@ void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_ten
     }
 }
 
-void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -201,7 +201,7 @@ void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, siz
     tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size);
 }
 
-void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
 
     GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
@@ -318,9 +318,9 @@ struct ggml_backend_reg {
 static struct ggml_backend_reg ggml_backend_registry[GGML_MAX_BACKENDS_REG];
 static size_t ggml_backend_registry_count = 0;
 
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data);
 
-static void ggml_backend_registry_init(void) {
+GGML_CALL static void ggml_backend_registry_init(void) {
     static bool initialized = false;
 
     if (initialized) {
@@ -333,18 +333,18 @@ static void ggml_backend_registry_init(void) {
 
     // add forward decls here to avoid including the backend headers
 #ifdef GGML_USE_CUBLAS
-    extern void ggml_backend_cuda_reg_devices(void);
+    extern GGML_CALL void ggml_backend_cuda_reg_devices(void);
     ggml_backend_cuda_reg_devices();
 #endif
 
 #ifdef GGML_USE_METAL
-    extern ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
-    extern ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+    extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
+    extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
     ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
 #endif
 }
 
-void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
+GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
     GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
 
     size_t id = ggml_backend_registry_count;
@@ -439,33 +439,33 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) {
 
 // backend CPU
 
-static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
     return "CPU";
 
     GGML_UNUSED(buffer);
 }
 
-static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
     return (void *)buffer->context;
 }
 
-static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     free(buffer->context);
 }
 
-static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     GGML_UNUSED(buffer);
 }
 
-static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
@@ -475,7 +475,7 @@ static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, con
     GGML_UNUSED(buffer);
 }
 
-static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     memset(buffer->context, value, buffer->size);
 }
 
@@ -506,13 +506,13 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
 
 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
 
-static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU";
 
     GGML_UNUSED(buft);
 }
 
-static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     size += TENSOR_ALIGNMENT;   // malloc may return an address that is not aligned
     void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
 
@@ -521,25 +521,25 @@ static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_back
     return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
 }
 
-static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return TENSOR_ALIGNMENT;
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cpu(backend);
 
     GGML_UNUSED(buft);
 }
 
-static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     GGML_UNUSED(buft);
 }
 
-ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
         /* .iface = */ {
             /* .get_name         = */ ggml_backend_cpu_buffer_type_get_name,
@@ -561,23 +561,23 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
 
 #include <hbwmalloc.h>
 
-static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "CPU_HBM";
 
     GGML_UNUSED(buft);
 }
 
-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+GGML_CALL static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
     return "CPU_HBM";
 
     GGML_UNUSED(buf);
 }
 
-static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     hbw_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     //void * ptr = hbw_malloc(size);
     void * ptr;
     int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
@@ -617,20 +617,20 @@ struct ggml_backend_cpu_context {
     size_t work_size;
 };
 
-static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
     return "CPU";
 
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cpu_free(ggml_backend_t backend) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
     free(cpu_ctx->work_data);
     free(cpu_ctx);
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_cpu_buffer_type();
 
     GGML_UNUSED(backend);
@@ -641,7 +641,7 @@ struct ggml_backend_plan_cpu {
     struct ggml_cgraph cgraph;
 };
 
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
+GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
@@ -656,7 +656,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
     return cpu_plan;
 }
 
-static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     free(cpu_plan->cplan.work_data);
@@ -665,7 +665,7 @@ static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backen
     GGML_UNUSED(backend);
 }
 
-static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+GGML_CALL static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
     ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
@@ -673,7 +673,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
     GGML_UNUSED(backend);
 }
 
-static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
     struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@@ -690,7 +690,7 @@ static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
     return true;
 }
 
-static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
@@ -732,7 +732,7 @@ ggml_backend_t ggml_backend_cpu_init(void) {
     return cpu_backend;
 }
 
-bool ggml_backend_is_cpu(ggml_backend_t backend) {
+GGML_CALL bool ggml_backend_is_cpu(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_cpu_name;
 }
 
@@ -743,11 +743,11 @@ void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
     ctx->n_threads = n_threads;
 }
 
-ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
     return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), cpu_backend_buffer_i_from_ptr, ptr, size);
 }
 
-static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
+GGML_CALL static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user_data) {
     return ggml_backend_cpu_init();
 
     GGML_UNUSED(params);
diff --git a/ggml-backend.h b/ggml-backend.h
index 4eb244af1d3e7..12b4b4ab74935 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -17,12 +17,12 @@ extern "C" {
     //
 
     // buffer type
-    GGML_API const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
+    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
 
     // buffer
     enum ggml_backend_buffer_usage {
@@ -30,18 +30,18 @@ extern "C" {
         GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
     };
 
-    GGML_API const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
 
     //
     // Backend
@@ -58,8 +58,8 @@ extern "C" {
     GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
 
     GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
 
@@ -80,13 +80,13 @@ extern "C" {
 
     GGML_API ggml_backend_t ggml_backend_cpu_init(void);
 
-    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu           (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
 
     // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
 
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
 #ifdef GGML_USE_CPU_HBM
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
@@ -183,7 +183,7 @@ extern "C" {
     GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
     GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
 
-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
 
     // Compare the output of two backends
     GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index c3e14bc96ec38..568c411afd3ee 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -7615,11 +7615,11 @@ struct cuda_pool_alloc {
 
 static bool g_cublas_loaded = false;
 
-bool ggml_cublas_loaded(void) {
+GGML_CALL bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
 
-void ggml_init_cublas() {
+GGML_CALL void ggml_init_cublas() {
     static bool initialized = false;
 
     if (!initialized) {
@@ -7707,7 +7707,7 @@ void ggml_init_cublas() {
     }
 }
 
-void * ggml_cuda_host_malloc(size_t size) {
+GGML_CALL void * ggml_cuda_host_malloc(size_t size) {
     if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
         return nullptr;
     }
@@ -7725,7 +7725,7 @@ void * ggml_cuda_host_malloc(size_t size) {
     return ptr;
 }
 
-void ggml_cuda_host_free(void * ptr) {
+GGML_CALL void ggml_cuda_host_free(void * ptr) {
     CUDA_CHECK(cudaFreeHost(ptr));
 }
 
@@ -9242,7 +9242,7 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
     if (!g_cublas_loaded) return false;
 
     const int64_t ne10 = src1->ne[0];
@@ -10013,7 +10013,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }
 
-static void ggml_cuda_set_main_device(const int main_device) {
+GGML_CALL static void ggml_cuda_set_main_device(const int main_device) {
     if (main_device >= g_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
                 main_device, g_device_count, g_main_device);
@@ -10028,7 +10028,7 @@ static void ggml_cuda_set_main_device(const int main_device) {
     }
 }
 
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     if (!g_cublas_loaded) return false;
 
     ggml_cuda_func_t func;
@@ -10186,7 +10186,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
 }
 
-int ggml_cuda_get_device_count() {
+GGML_CALL int ggml_cuda_get_device_count() {
     int device_count;
     if (cudaGetDeviceCount(&device_count) != cudaSuccess) {
         return 0;
@@ -10194,7 +10194,7 @@ int ggml_cuda_get_device_count() {
     return device_count;
 }
 
-void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) {
     cudaDeviceProp prop;
     CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
     snprintf(description, description_size, "%s", prop.name);
@@ -10244,27 +10244,27 @@ struct ggml_backend_cuda_buffer_context {
     }
 };
 
-static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->name.c_str();
 }
 
-static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
 }
 
-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     CUDA_CHECK(cudaFree(ctx->dev_ptr));
     delete ctx;
 }
 
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
     return ctx->dev_ptr;
 }
 
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
@@ -10296,7 +10296,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g
     }
 }
 
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10307,7 +10307,7 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, gg
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10318,7 +10318,7 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, co
     CUDA_CHECK(cudaDeviceSynchronize());
 }
 
-static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
     if (ggml_backend_buffer_is_cuda(src->buffer)) {
         ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
         ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
@@ -10335,7 +10335,7 @@ static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, co
     return false;
 }
 
-static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
@@ -10357,19 +10357,18 @@ static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
 };
 
 // cuda buffer type
-
 struct ggml_backend_cuda_buffer_type_context {
     int device;
     std::string name;
 };
 
-static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
     ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     return ctx->name.c_str();
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
 
     ggml_cuda_set_device(buft_ctx->device);
@@ -10388,13 +10387,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
 
     UNUSED(buft);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     int64_t row_low = 0;
     int64_t row_high = ggml_nrows(tensor);
     int64_t nrows_split = row_high - row_low;
@@ -10414,7 +10413,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     if (!ggml_backend_is_cuda(backend)) {
         return false;
     }
@@ -10434,7 +10433,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .is_host          = */ NULL,
 };
 
-ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
     // FIXME: this is not thread safe
     if (device >= ggml_backend_cuda_get_device_count()) {
         return nullptr;
@@ -10479,7 +10478,7 @@ struct ggml_backend_cuda_split_buffer_context {
     std::vector<ggml_tensor_extra_gpu *> tensor_extras;
 };
 
-static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
     return GGML_CUDA_NAME "_Split";
 
     UNUSED(buffer);
@@ -10490,19 +10489,19 @@ static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_
 //    return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
 //}
 
-static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
     delete ctx;
 }
 
-static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
     // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced
     return (void *)0x1000;
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
     GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
 
     ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
@@ -10552,7 +10551,7 @@ static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buf
     tensor->extra = extra;
 }
 
-static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10586,7 +10585,7 @@ static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buff
     }
 }
 
-static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     // split tensors must always be set in their entirety at once
     GGML_ASSERT(offset == 0);
     GGML_ASSERT(size == ggml_nbytes(tensor));
@@ -10620,7 +10619,7 @@ static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buff
     }
 }
 
-static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     UNUSED(buffer);
     UNUSED(value);
 }
@@ -10639,13 +10638,13 @@ static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
 
 // cuda split buffer type
 
-static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_CUDA_NAME "_Split";
 
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
     // instead, we allocate them for each tensor separately in init_tensor
     // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
@@ -10655,13 +10654,13 @@ static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(gg
     return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
 }
 
-static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
 
     UNUSED(buft);
 }
 
-static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
 
     size_t total_size = 0;
@@ -10688,13 +10687,13 @@ static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_bu
     return total_size;
 }
 
-static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_cuda(backend);
 
     UNUSED(buft);
 }
 
-static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return false;
 
     UNUSED(buft);
@@ -10709,7 +10708,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface
     /* .is_host          = */ ggml_backend_cuda_split_buffer_type_is_host,
 };
 
-ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
     // FIXME: this is not thread safe
     static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 
@@ -10745,23 +10744,23 @@ ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * ten
 
 // host buffer type
 
-static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return GGML_CUDA_NAME "_Host";
 
     UNUSED(buft);
 }
 
-static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
     return GGML_CUDA_NAME "_Host";
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_cuda_host_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * ptr = ggml_cuda_host_malloc(size);
 
     if (ptr == nullptr) {
@@ -10777,7 +10776,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     return buffer;
 }
 
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
         /* .iface    = */ {
             /* .get_name         = */ ggml_backend_cuda_host_buffer_type_name,
@@ -10795,26 +10794,26 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
 
 // backend
 
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     return cuda_ctx->name.c_str();
 }
 
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cuda_free(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     delete cuda_ctx;
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     return ggml_backend_cuda_buffer_type(cuda_ctx->device);
 }
 
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10823,7 +10822,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens
     CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
@@ -10832,7 +10831,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm
     CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
 }
 
-static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
@@ -10843,7 +10842,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggm
     return false;
 }
 
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
@@ -10851,7 +10850,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
     UNUSED(backend);
 }
 
-static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
 
     ggml_cuda_set_main_device(cuda_ctx->device);
@@ -10890,7 +10889,7 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     return true;
 }
 
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -11016,7 +11015,7 @@ static ggml_backend_i ggml_backend_cuda_interface = {
     /* .supports_op             = */ ggml_backend_cuda_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init(int device) {
+GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
     ggml_init_cublas(); // TODO: remove from ggml.c
 
     if (device < 0 || device >= ggml_cuda_get_device_count()) {
@@ -11040,35 +11039,35 @@ ggml_backend_t ggml_backend_cuda_init(int device) {
     return cuda_backend;
 }
 
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
+GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) {
     return backend && backend->iface.get_name == ggml_backend_cuda_name;
 }
 
-int ggml_backend_cuda_get_device_count() {
+GGML_CALL int ggml_backend_cuda_get_device_count() {
     return ggml_cuda_get_device_count();
 }
 
-void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
+GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
     ggml_cuda_get_device_description(device, description, description_size);
 }
 
-void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
+GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
     ggml_cuda_set_device(device);
 
     CUDA_CHECK(cudaMemGetInfo(free, total));
 }
 
 // backend registry
-static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
     ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
     return cuda_backend;
 
     UNUSED(params);
 }
 
-extern "C" int ggml_backend_cuda_reg_devices();
+extern "C" GGML_CALL int ggml_backend_cuda_reg_devices();
 
-int ggml_backend_cuda_reg_devices() {
+GGML_CALL int ggml_backend_cuda_reg_devices() {
     int device_count = ggml_cuda_get_device_count();
     //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
     for (int i = 0; i < device_count; i++) {
diff --git a/ggml-cuda.h b/ggml-cuda.h
index d19cbf3fdd04b..b1ebd61d7fb66 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -18,34 +18,34 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16
 
 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API void   ggml_init_cublas(void);
+GGML_API GGML_CALL void   ggml_init_cublas(void);
 
 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool   ggml_cublas_loaded(void);
+GGML_API GGML_CALL bool   ggml_cublas_loaded(void);
 
-GGML_API void * ggml_cuda_host_malloc(size_t size);
-GGML_API void   ggml_cuda_host_free(void * ptr);
+GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+GGML_API GGML_CALL void   ggml_cuda_host_free(void * ptr);
 
-GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API GGML_CALL bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API GGML_CALL bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
-GGML_API int    ggml_cuda_get_device_count(void);
-GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL int    ggml_cuda_get_device_count(void);
+GGML_API GGML_CALL void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
 
 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
 
-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
 
-GGML_API int  ggml_backend_cuda_get_device_count(void);
-GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
 
 #ifdef  __cplusplus
 }
diff --git a/ggml-metal.h b/ggml-metal.h
index cd5e2995f66f6..8b0bfc5f10329 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -47,11 +47,11 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
 
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
 
-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
diff --git a/ggml-metal.m b/ggml-metal.m
index 2ca726055f9ea..867f2fd48cbd2 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2294,13 +2294,13 @@ static void ggml_backend_metal_free_device(void) {
     }
 }
 
-static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
+GGML_CALL static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
     return "Metal";
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     for (int i = 0; i < ctx->n_buffers; i++) {
@@ -2315,25 +2315,25 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
     free(ctx);
 }
 
-static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+GGML_CALL static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     return ctx->all_data;
 }
 
-static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     memcpy((char *)tensor->data + offset, data, size);
 
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+GGML_CALL static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     memcpy(data, (const char *)tensor->data + offset, size);
 
     UNUSED(buffer);
 }
 
-static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+GGML_CALL static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
     if (ggml_backend_buffer_is_host(src->buffer)) {
         memcpy(dst->data, src->data, ggml_nbytes(src));
         return true;
@@ -2343,7 +2343,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
     UNUSED(buffer);
 }
 
-static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
     struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
 
     memset(ctx->all_data, value, ctx->all_size);
@@ -2363,13 +2363,13 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
 
 // default buffer type
 
-static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
     return "Metal";
 
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     const size_t size_page = sysconf(_SC_PAGESIZE);
@@ -2421,24 +2421,24 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
     return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
 
-static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+GGML_CALL static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 32;
     UNUSED(buft);
 }
 
-static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
     return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
 
     UNUSED(buft);
 }
 
-static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
     return true;
 
     UNUSED(buft);
 }
 
-ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
+GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = {
         /* .iface = */ {
             /* .get_name         = */ ggml_backend_metal_buffer_type_get_name,
@@ -2456,7 +2456,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
 
 // buffer from ptr
 
-ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
     ctx->all_data = data;
@@ -2543,31 +2543,31 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
 
 // backend
 
-static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";
 
     UNUSED(backend);
 }
 
-static void ggml_backend_metal_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_metal_free(ggml_backend_t backend) {
     struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
     ggml_metal_free(ctx);
     free(backend);
 }
 
-static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();
 
     UNUSED(backend);
 }
 
-static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
-static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
 
     return ggml_metal_supports_op(metal_ctx, op);
@@ -2630,9 +2630,9 @@ bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
     return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
 
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
+GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data); // silence warning
 
-ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
+GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data) {
     return ggml_backend_metal_init();
 
     GGML_UNUSED(params);
diff --git a/ggml.c b/ggml.c
index ef5888ab21538..5779f32d297e3 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1990,19 +1990,19 @@ void ggml_print_objects(const struct ggml_context * ctx) {
     GGML_PRINT("%s: --- end ---\n", __func__);
 }
 
-int64_t ggml_nelements(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-int64_t ggml_nrows(const struct ggml_tensor * tensor) {
+GGML_CALL int64_t ggml_nrows(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->ne[1]*tensor->ne[2]*tensor->ne[3];
 }
 
-size_t ggml_nbytes(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_nbytes(const struct ggml_tensor * tensor) {
     size_t nbytes;
     size_t blck_size = ggml_blck_size(tensor->type);
     if (blck_size == 1) {
@@ -2025,15 +2025,15 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
     return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
 }
 
-int ggml_blck_size(enum ggml_type type) {
+GGML_CALL int ggml_blck_size(enum ggml_type type) {
     return type_traits[type].blck_size;
 }
 
-size_t ggml_type_size(enum ggml_type type) {
+GGML_CALL size_t ggml_type_size(enum ggml_type type) {
     return type_traits[type].type_size;
 }
 
-size_t ggml_row_size(enum ggml_type type, int64_t ne) {
+GGML_CALL size_t ggml_row_size(enum ggml_type type, int64_t ne) {
     assert(ne % ggml_blck_size(type) == 0);
     return ggml_type_size(type)*ne/ggml_blck_size(type);
 }
@@ -2042,15 +2042,15 @@ double ggml_type_sizef(enum ggml_type type) {
     return ((double)(type_traits[type].type_size))/type_traits[type].blck_size;
 }
 
-const char * ggml_type_name(enum ggml_type type) {
+GGML_CALL const char * ggml_type_name(enum ggml_type type) {
     return type_traits[type].type_name;
 }
 
-bool ggml_is_quantized(enum ggml_type type) {
+GGML_CALL bool ggml_is_quantized(enum ggml_type type) {
     return type_traits[type].is_quantized;
 }
 
-const char * ggml_op_name(enum ggml_op op) {
+GGML_CALL const char * ggml_op_name(enum ggml_op op) {
     return GGML_OP_NAME[op];
 }
 
@@ -2062,7 +2062,7 @@ const char * ggml_unary_op_name(enum ggml_unary_op op) {
     return GGML_UNARY_OP_NAME[op];
 }
 
-const char * ggml_op_desc(const struct ggml_tensor * t) {
+GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) {
     if (t->op == GGML_OP_UNARY) {
         enum ggml_unary_op uop = ggml_get_unary_op(t);
         return ggml_unary_op_name(uop);
@@ -2072,7 +2072,7 @@ const char * ggml_op_desc(const struct ggml_tensor * t) {
     }
 }
 
-size_t ggml_element_size(const struct ggml_tensor * tensor) {
+GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) {
     return ggml_type_size(tensor->type);
 }
 
@@ -2154,11 +2154,11 @@ size_t ggml_tensor_overhead(void) {
     return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
 }
 
-bool ggml_is_transposed(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor) {
     return tensor->nb[0] > tensor->nb[1];
 }
 
-bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return
@@ -2177,7 +2177,7 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-bool ggml_is_permuted(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
     return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3];
@@ -3079,7 +3079,7 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
     return (float *)(tensor->data);
 }
 
-enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
+GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor) {
     GGML_ASSERT(tensor->op == GGML_OP_UNARY);
     return (enum ggml_unary_op) ggml_get_op_params_i32(tensor, 0);
 }
@@ -11653,7 +11653,7 @@ static void ggml_rope_cache_init(
     }
 }
 
-void ggml_rope_yarn_corr_dims(
+GGML_CALL void ggml_rope_yarn_corr_dims(
     int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]
 ) {
     // start and end correction dims
diff --git a/ggml.h b/ggml.h
index 1187074f7f174..837c52e68c90c 100644
--- a/ggml.h
+++ b/ggml.h
@@ -187,6 +187,16 @@
 #    define GGML_API
 #endif
 
+#ifdef GGML_MULTIPLATFORM
+#    if defined(_WIN32)
+#        define GGML_CALL
+#    else
+#        define GGML_CALL __attribute__((__ms_abi__))
+#    endif
+#else
+#    define GGML_CALL
+#endif
+
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -649,41 +659,41 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
 
-    GGML_API int    ggml_blck_size(enum ggml_type type);
-    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
+    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
 
     GGML_DEPRECATED(
     GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
     "use ggml_row_size() instead");
 
-    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);
 
-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
 
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
 
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);
 
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
@@ -770,7 +780,7 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
 
     GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
@@ -1413,7 +1423,7 @@ extern "C" {
             float                 beta_slow);
 
     // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_CALL void ggml_rope_yarn_corr_dims(
         int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
 
     // xPos RoPE, in-place, returns view(a)

From 122ed4840cc6d209df6043e027f9f8a03aee01da Mon Sep 17 00:00:00 2001
From: Maximilian Winter <maximilian.winter.91@gmail.com>
Date: Tue, 16 Jan 2024 13:10:48 +0100
Subject: [PATCH 04/25] examples : fix and improve docs for the grammar
 generator (#4909)

* Create pydantic-models-to-grammar.py

* Added some comments for usage

* Refactored Grammar Generator

Added example and usage instruction.

* Update pydantic_models_to_grammar.py

* Update pydantic-models-to-grammar-examples.py

* Renamed module and imported it.

* Update pydantic-models-to-grammar.py

* Renamed file and fixed grammar generator issue.

* Fixed some issues and bugs of the grammar generator. Improved Documentation

* Update pydantic_models_to_grammar.py
---
 examples/pydantic_models_to_grammar.py | 835 +++++++++++++++----------
 1 file changed, 498 insertions(+), 337 deletions(-)

diff --git a/examples/pydantic_models_to_grammar.py b/examples/pydantic_models_to_grammar.py
index 41b98fdc1fcb4..848c1c367d701 100644
--- a/examples/pydantic_models_to_grammar.py
+++ b/examples/pydantic_models_to_grammar.py
@@ -4,6 +4,7 @@
 from inspect import isclass, getdoc
 from types import NoneType
 
+from docstring_parser import parse
 from pydantic import BaseModel, create_model, Field
 from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
 from enum import Enum
@@ -25,9 +26,10 @@ class PydanticDataType(Enum):
         ENUM (str): Represents an enum data type.
         CUSTOM_CLASS (str): Represents a custom class data type.
     """
+
     STRING = "string"
     TRIPLE_QUOTED_STRING = "triple_quoted_string"
-    MARKDOWN_STRING = "markdown_string"
+    MARKDOWN_CODE_BLOCK = "markdown_code_block"
     BOOLEAN = "boolean"
     INTEGER = "integer"
     FLOAT = "float"
@@ -78,10 +80,10 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
 
 
 def format_model_and_field_name(model_name: str) -> str:
-    parts = re.findall('[A-Z][^A-Z]*', model_name)
+    parts = re.findall("[A-Z][^A-Z]*", model_name)
     if not parts:  # Check if the list is empty
         return model_name.lower().replace("_", "-")
-    return '-'.join(part.lower().replace("_", "-") for part in parts)
+    return "-".join(part.lower().replace("_", "-") for part in parts)
 
 
 def generate_list_rule(element_type):
@@ -93,29 +95,31 @@ def generate_list_rule(element_type):
     """
     rule_name = f"{map_pydantic_type_to_gbnf(element_type)}-list"
     element_rule = map_pydantic_type_to_gbnf(element_type)
-    list_rule = fr'{rule_name} ::= "["  {element_rule} (","  {element_rule})* "]"'
+    list_rule = rf'{rule_name} ::= "["  {element_rule} (","  {element_rule})* "]"'
     return list_rule
 
 
 def get_members_structure(cls, rule_name):
     if issubclass(cls, Enum):
         # Handle Enum types
-        members = [f'\"\\\"{member.value}\\\"\"' for name, member in cls.__members__.items()]
+        members = [f'"\\"{member.value}\\""' for name, member in cls.__members__.items()]
         return f"{cls.__name__.lower()} ::= " + " | ".join(members)
     if cls.__annotations__ and cls.__annotations__ != {}:
         result = f'{rule_name} ::= "{{"'
         type_list_rules = []
         # Modify this comprehension
-        members = [f'  \"\\\"{name}\\\"\" ":"  {map_pydantic_type_to_gbnf(param_type)}'
-                   for name, param_type in cls.__annotations__.items()
-                   if name != 'self']
+        members = [
+            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param_type)}'
+            for name, param_type in cls.__annotations__.items()
+            if name != "self"
+        ]
 
         result += '"," '.join(members)
         result += '  "}"'
         return result, type_list_rules
     elif rule_name == "custom-class-any":
-        result = f'{rule_name} ::= '
-        result += 'value'
+        result = f"{rule_name} ::= "
+        result += "value"
         type_list_rules = []
         return result, type_list_rules
     else:
@@ -124,9 +128,11 @@ def get_members_structure(cls, rule_name):
         result = f'{rule_name} ::=  "{{"'
         type_list_rules = []
         # Modify this comprehension too
-        members = [f'  \"\\\"{name}\\\"\" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
-                   for name, param in parameters.items()
-                   if name != 'self' and param.annotation != inspect.Parameter.empty]
+        members = [
+            f'  "\\"{name}\\"" ":"  {map_pydantic_type_to_gbnf(param.annotation)}'
+            for name, param in parameters.items()
+            if name != "self" and param.annotation != inspect.Parameter.empty
+        ]
 
         result += '", "'.join(members)
         result += '  "}"'
@@ -141,8 +147,8 @@ def regex_to_gbnf(regex_pattern: str) -> str:
     gbnf_rule = regex_pattern
 
     # Translate common regex components to GBNF
-    gbnf_rule = gbnf_rule.replace('\\d', '[0-9]')
-    gbnf_rule = gbnf_rule.replace('\\s', '[ \t\n]')
+    gbnf_rule = gbnf_rule.replace("\\d", "[0-9]")
+    gbnf_rule = gbnf_rule.replace("\\s", "[ \t\n]")
 
     # Handle quantifiers and other regex syntax that is similar in GBNF
     # (e.g., '*', '+', '?', character classes)
@@ -158,12 +164,12 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None):
     Generates GBNF (Generalized Backus-Naur Form) rules for integers based on the given maximum and minimum digits.
 
     Parameters:
-    max_digit (int): The maximum number of digits for the integer. Default is None.
-    min_digit (int): The minimum number of digits for the integer. Default is None.
+        max_digit (int): The maximum number of digits for the integer. Default is None.
+        min_digit (int): The minimum number of digits for the integer. Default is None.
 
     Returns:
-    integer_rule (str): The identifier for the integer rule generated.
-    additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits.
+        integer_rule (str): The identifier for the integer rule generated.
+        additional_rules (list): A list of additional rules generated based on the given maximum and minimum digits.
 
     """
     additional_rules = []
@@ -178,21 +184,21 @@ def generate_gbnf_integer_rules(max_digit=None, min_digit=None):
     # Handling Integer Rules
     if max_digit is not None or min_digit is not None:
         # Start with an empty rule part
-        integer_rule_part = ''
+        integer_rule_part = ""
 
         # Add mandatory digits as per min_digit
         if min_digit is not None:
-            integer_rule_part += '[0-9] ' * min_digit
+            integer_rule_part += "[0-9] " * min_digit
 
         # Add optional digits up to max_digit
         if max_digit is not None:
             optional_digits = max_digit - (min_digit if min_digit is not None else 0)
-            integer_rule_part += ''.join(['[0-9]? ' for _ in range(optional_digits)])
+            integer_rule_part += "".join(["[0-9]? " for _ in range(optional_digits)])
 
         # Trim the rule part and append it to additional rules
         integer_rule_part = integer_rule_part.strip()
         if integer_rule_part:
-            additional_rules.append(f'{integer_rule} ::= {integer_rule_part}')
+            additional_rules.append(f"{integer_rule} ::= {integer_rule_part}")
 
     return integer_rule, additional_rules
 
@@ -224,21 +230,26 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
     additional_rules = []
 
     # Define the integer part rule
-    integer_part_rule = "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + (
+    integer_part_rule = (
+        "integer-part" + (f"-max{max_digit}" if max_digit is not None else "") + (
         f"-min{min_digit}" if min_digit is not None else "")
+    )
 
     # Define the fractional part rule based on precision constraints
     fractional_part_rule = "fractional-part"
-    fractional_rule_part = ''
+    fractional_rule_part = ""
     if max_precision is not None or min_precision is not None:
         fractional_part_rule += (f"-max{max_precision}" if max_precision is not None else "") + (
-            f"-min{min_precision}" if min_precision is not None else "")
+            f"-min{min_precision}" if min_precision is not None else ""
+        )
         # Minimum number of digits
-        fractional_rule_part = '[0-9]' * (min_precision if min_precision is not None else 1)
+        fractional_rule_part = "[0-9]" * (min_precision if min_precision is not None else 1)
         # Optional additional digits
-        fractional_rule_part += ''.join([' [0-9]?'] * (
-            (max_precision - (min_precision if min_precision is not None else 1)) if max_precision is not None else 0))
-        additional_rules.append(f'{fractional_part_rule} ::= {fractional_rule_part}')
+        fractional_rule_part += "".join(
+            [" [0-9]?"] * ((max_precision - (
+                min_precision if min_precision is not None else 1)) if max_precision is not None else 0)
+        )
+        additional_rules.append(f"{fractional_part_rule} ::= {fractional_rule_part}")
 
     # Define the float rule
     float_rule = f"float-{max_digit if max_digit is not None else 'X'}-{min_digit if min_digit is not None else 'X'}-{max_precision if max_precision is not None else 'X'}-{min_precision if min_precision is not None else 'X'}"
@@ -246,20 +257,19 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
 
     # Generating the integer part rule definition, if necessary
     if max_digit is not None or min_digit is not None:
-        integer_rule_part = '[0-9]'
+        integer_rule_part = "[0-9]"
         if min_digit is not None and min_digit > 1:
-            integer_rule_part += ' [0-9]' * (min_digit - 1)
+            integer_rule_part += " [0-9]" * (min_digit - 1)
         if max_digit is not None:
-            integer_rule_part += ''.join([' [0-9]?'] * (max_digit - (min_digit if min_digit is not None else 1)))
-        additional_rules.append(f'{integer_part_rule} ::= {integer_rule_part.strip()}')
+            integer_rule_part += "".join([" [0-9]?"] * (max_digit - (min_digit if min_digit is not None else 1)))
+        additional_rules.append(f"{integer_part_rule} ::= {integer_rule_part.strip()}")
 
     return float_rule, additional_rules
 
 
-def generate_gbnf_rule_for_type(model_name, field_name,
-                                field_type, is_optional, processed_models, created_rules,
-                                field_info=None) -> \
-    Tuple[str, list]:
+def generate_gbnf_rule_for_type(
+    model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
+) -> Tuple[str, list]:
     """
     Generate GBNF rule for a given field type.
 
@@ -282,20 +292,19 @@ def generate_gbnf_rule_for_type(model_name, field_name,
 
     if isclass(field_type) and issubclass(field_type, BaseModel):
         nested_model_name = format_model_and_field_name(field_type.__name__)
-        nested_model_rules = generate_gbnf_grammar(field_type, processed_models, created_rules)
+        nested_model_rules, _ = generate_gbnf_grammar(field_type, processed_models, created_rules)
         rules.extend(nested_model_rules)
         gbnf_type, rules = nested_model_name, rules
     elif isclass(field_type) and issubclass(field_type, Enum):
-        enum_values = [f'\"\\\"{e.value}\\\"\"' for e in field_type]  # Adding escaped quotes
+        enum_values = [f'"\\"{e.value}\\""' for e in field_type]  # Adding escaped quotes
         enum_rule = f"{model_name}-{field_name} ::= {' | '.join(enum_values)}"
         rules.append(enum_rule)
         gbnf_type, rules = model_name + "-" + field_name, rules
-    elif get_origin(field_type) == list or field_type == list:  # Array
+    elif get_origin(field_type) == list:  # Array
         element_type = get_args(field_type)[0]
-        element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
-                                                                          f"{field_name}-element",
-                                                                          element_type, is_optional, processed_models,
-                                                                          created_rules)
+        element_rule_name, additional_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
+        )
         rules.extend(additional_rules)
         array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})*  "]" """
         rules.append(array_rule)
@@ -303,10 +312,9 @@ def generate_gbnf_rule_for_type(model_name, field_name,
 
     elif get_origin(field_type) == set or field_type == set:  # Array
         element_type = get_args(field_type)[0]
-        element_rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
-                                                                          f"{field_name}-element",
-                                                                          element_type, is_optional, processed_models,
-                                                                          created_rules)
+        element_rule_name, additional_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-element", element_type, is_optional, processed_models, created_rules
+        )
         rules.extend(additional_rules)
         array_rule = f"""{model_name}-{field_name} ::= "[" ws {element_rule_name} ("," ws {element_rule_name})*  "]" """
         rules.append(array_rule)
@@ -318,15 +326,13 @@ def generate_gbnf_rule_for_type(model_name, field_name,
     elif gbnf_type.startswith("custom-dict-"):
         key_type, value_type = get_args(field_type)
 
-        additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(model_name,
-                                                                                f"{field_name}-key-type",
-                                                                                key_type, is_optional, processed_models,
-                                                                                created_rules)
-        additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(model_name,
-                                                                                    f"{field_name}-value-type",
-                                                                                    value_type, is_optional,
-                                                                                    processed_models, created_rules)
-        gbnf_type = fr'{gbnf_type} ::= "{{"  ( {additional_key_type} ":"  {additional_value_type} (","  {additional_key_type} ":"  {additional_value_type})*  )? "}}" '
+        additional_key_type, additional_key_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-key-type", key_type, is_optional, processed_models, created_rules
+        )
+        additional_value_type, additional_value_rules = generate_gbnf_rule_for_type(
+            model_name, f"{field_name}-value-type", value_type, is_optional, processed_models, created_rules
+        )
+        gbnf_type = rf'{gbnf_type} ::= "{{"  ( {additional_key_type} ": "  {additional_value_type} ("," "\n" ws {additional_key_type} ":"  {additional_value_type})*  )? "}}" '
 
         rules.extend(additional_key_rules)
         rules.extend(additional_value_rules)
@@ -336,19 +342,16 @@ def generate_gbnf_rule_for_type(model_name, field_name,
 
         for union_type in union_types:
             if isinstance(union_type, _GenericAlias):
-                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name,
-                                                                                field_name, union_type,
-                                                                                False,
-                                                                                processed_models, created_rules)
+                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
+                    model_name, field_name, union_type, False, processed_models, created_rules
+                )
                 union_rules.append(union_gbnf_type)
                 rules.extend(union_rules_list)
 
-
             elif not issubclass(union_type, NoneType):
-                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(model_name,
-                                                                                field_name, union_type,
-                                                                                False,
-                                                                                processed_models, created_rules)
+                union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
+                    model_name, field_name, union_type, False, processed_models, created_rules
+                )
                 union_rules.append(union_gbnf_type)
                 rules.extend(union_rules_list)
 
@@ -363,45 +366,58 @@ def generate_gbnf_rule_for_type(model_name, field_name,
         else:
             gbnf_type = f"{model_name}-{field_name}-union"
     elif isclass(field_type) and issubclass(field_type, str):
-        if field_info and hasattr(field_info, 'json_schema_extra') and field_info.json_schema_extra is not None:
-
-            triple_quoted_string = field_info.json_schema_extra.get('triple_quoted_string', False)
-            markdown_string = field_info.json_schema_extra.get('markdown_string', False)
+        if field_info and hasattr(field_info, "json_schema_extra") and field_info.json_schema_extra is not None:
+            triple_quoted_string = field_info.json_schema_extra.get("triple_quoted_string", False)
+            markdown_string = field_info.json_schema_extra.get("markdown_code_block", False)
 
             gbnf_type = PydanticDataType.TRIPLE_QUOTED_STRING.value if triple_quoted_string else PydanticDataType.STRING.value
-            gbnf_type = PydanticDataType.MARKDOWN_STRING.value if markdown_string else gbnf_type
+            gbnf_type = PydanticDataType.MARKDOWN_CODE_BLOCK.value if markdown_string else gbnf_type
 
-        elif field_info and hasattr(field_info, 'pattern'):
+        elif field_info and hasattr(field_info, "pattern"):
             # Convert regex pattern to grammar rule
             regex_pattern = field_info.regex.pattern
             gbnf_type = f"pattern-{field_name} ::= {regex_to_gbnf(regex_pattern)}"
         else:
             gbnf_type = PydanticDataType.STRING.value
 
-    elif isclass(field_type) and issubclass(field_type, float) and field_info and hasattr(field_info,
-                                                                                          'json_schema_extra') and field_info.json_schema_extra is not None:
+    elif (
+        isclass(field_type)
+        and issubclass(field_type, float)
+        and field_info
+        and hasattr(field_info, "json_schema_extra")
+        and field_info.json_schema_extra is not None
+    ):
         # Retrieve precision attributes for floats
-        max_precision = field_info.json_schema_extra.get('max_precision') if field_info and hasattr(field_info,
-                                                                                                    'json_schema_extra') else None
-        min_precision = field_info.json_schema_extra.get('min_precision') if field_info and hasattr(field_info,
-                                                                                                    'json_schema_extra') else None
-        max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
-        min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
+        max_precision = (
+            field_info.json_schema_extra.get("max_precision") if field_info and hasattr(field_info,
+                                                                                        "json_schema_extra") else None
+        )
+        min_precision = (
+            field_info.json_schema_extra.get("min_precision") if field_info and hasattr(field_info,
+                                                                                        "json_schema_extra") else None
+        )
+        max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
+        min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
 
         # Generate GBNF rule for float with given attributes
-        gbnf_type, rules = generate_gbnf_float_rules(max_digit=max_digits, min_digit=min_digits,
-                                                     max_precision=max_precision,
-                                                     min_precision=min_precision)
-
-    elif isclass(field_type) and issubclass(field_type, int) and field_info and hasattr(field_info,
-                                                                                        'json_schema_extra') and field_info.json_schema_extra is not None:
+        gbnf_type, rules = generate_gbnf_float_rules(
+            max_digit=max_digits, min_digit=min_digits, max_precision=max_precision, min_precision=min_precision
+        )
+
+    elif (
+        isclass(field_type)
+        and issubclass(field_type, int)
+        and field_info
+        and hasattr(field_info, "json_schema_extra")
+        and field_info.json_schema_extra is not None
+    ):
         # Retrieve digit attributes for integers
-        max_digits = field_info.json_schema_extra.get('max_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
-        min_digits = field_info.json_schema_extra.get('min_digit') if field_info and hasattr(field_info,
-                                                                                             'json_schema_extra') else None
+        max_digits = field_info.json_schema_extra.get("max_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
+        min_digits = field_info.json_schema_extra.get("min_digit") if field_info and hasattr(field_info,
+                                                                                             "json_schema_extra") else None
 
         # Generate GBNF rule for integer with given attributes
         gbnf_type, rules = generate_gbnf_integer_rules(max_digit=max_digits, min_digit=min_digits)
@@ -443,13 +459,13 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
 
     if not issubclass(model, BaseModel):
         # For non-Pydantic classes, generate model_fields from __annotations__ or __init__
-        if hasattr(model, '__annotations__') and model.__annotations__:
+        if hasattr(model, "__annotations__") and model.__annotations__:
             model_fields = {name: (typ, ...) for name, typ in model.__annotations__.items()}
         else:
             init_signature = inspect.signature(model.__init__)
             parameters = init_signature.parameters
-            model_fields = {name: (param.annotation, param.default) for name, param in parameters.items()
-                            if name != 'self'}
+            model_fields = {name: (param.annotation, param.default) for name, param in parameters.items() if
+                            name != "self"}
     else:
         # For Pydantic models, use model_fields and check for ellipsis (required fields)
         model_fields = model.__annotations__
@@ -469,51 +485,55 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
             field_type = field_info
             field_info = model.model_fields[field_name]
             is_optional = field_info.is_required is False and get_origin(field_type) is Optional
-        rule_name, additional_rules = generate_gbnf_rule_for_type(model_name,
-                                                                  format_model_and_field_name(field_name),
-                                                                  field_type, is_optional,
-                                                                  processed_models, created_rules, field_info)
-        look_for_markdown_code_block = True if rule_name == "markdown_string" else False
+        rule_name, additional_rules = generate_gbnf_rule_for_type(
+            model_name, format_model_and_field_name(field_name), field_type, is_optional, processed_models,
+            created_rules, field_info
+        )
+        look_for_markdown_code_block = True if rule_name == "markdown_code_block" else False
         look_for_triple_quoted_string = True if rule_name == "triple_quoted_string" else False
         if not look_for_markdown_code_block and not look_for_triple_quoted_string:
             if rule_name not in created_rules:
                 created_rules[rule_name] = additional_rules
-            model_rule_parts.append(f' ws \"\\\"{field_name}\\\"\" ": "  {rule_name}')  # Adding escaped quotes
+            model_rule_parts.append(f' ws "\\"{field_name}\\"" ":" ws {rule_name}')  # Adding escaped quotes
             nested_rules.extend(additional_rules)
         else:
-            has_triple_quoted_string = look_for_markdown_code_block
-            has_markdown_code_block = look_for_triple_quoted_string
+            has_triple_quoted_string = look_for_triple_quoted_string
+            has_markdown_code_block = look_for_markdown_code_block
 
     fields_joined = r' "," "\n" '.join(model_rule_parts)
-    model_rule = fr'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"'
-
-    if look_for_markdown_code_block or look_for_triple_quoted_string:
-        model_rule += ' ws "}"'
+    model_rule = rf'{model_name} ::= "{{" "\n" {fields_joined} "\n" ws "}}"'
 
+    has_special_string = False
     if has_triple_quoted_string:
+        model_rule += '"\\n" ws "}"'
         model_rule += '"\\n" triple-quoted-string'
+        has_special_string = True
     if has_markdown_code_block:
+        model_rule += '"\\n" ws "}"'
         model_rule += '"\\n" markdown-code-block'
+        has_special_string = True
     all_rules = [model_rule] + nested_rules
 
-    return all_rules, has_markdown_code_block, has_triple_quoted_string
+    return all_rules, has_special_string
 
 
-def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], outer_object_name: str = None,
-                                               outer_object_content: str = None, list_of_outputs: bool = False) -> str:
+def generate_gbnf_grammar_from_pydantic_models(
+    models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
+    list_of_outputs: bool = False
+) -> str:
     """
     Generate GBNF Grammar from Pydantic Models.
 
     This method takes a list of Pydantic models and uses them to generate a GBNF grammar string. The generated grammar string can be used for parsing and validating data using the generated
     * grammar.
 
-    Parameters:
-    models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    list_of_outputs (str, optional): Allows a list of output objects
+    Args:
+        models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        list_of_outputs (str, optional): Allows a list of output objects
     Returns:
-    str: The generated GBNF grammar string.
+        str: The generated GBNF grammar string.
 
     Examples:
         models = [UserModel, PostModel]
@@ -527,52 +547,53 @@ def generate_gbnf_grammar_from_pydantic_models(models: List[Type[BaseModel]], ou
     all_rules = []
     created_rules = {}
     if outer_object_name is None:
-
         for model in models:
-            model_rules, _, _ = generate_gbnf_grammar(model,
-                                                      processed_models, created_rules)
+            model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
             all_rules.extend(model_rules)
 
         if list_of_outputs:
-            root_rule = r'root ::= ws "["  grammar-models (","  grammar-models)*  "]"' + "\n"
+            root_rule = r'root ::= (" "| "\n") "[" ws grammar-models ("," ws grammar-models)* ws "]"' + "\n"
         else:
-            root_rule = r'root ::= ws grammar-models' + "\n"
+            root_rule = r'root ::= (" "| "\n") grammar-models' + "\n"
         root_rule += "grammar-models ::= " + " | ".join(
             [format_model_and_field_name(model.__name__) for model in models])
         all_rules.insert(0, root_rule)
         return "\n".join(all_rules)
     elif outer_object_name is not None:
         if list_of_outputs:
-            root_rule = fr'root ::= ws "["  {format_model_and_field_name(outer_object_name)} (","  {format_model_and_field_name(outer_object_name)})*  "]"' + "\n"
+            root_rule = (
+                rf'root ::= (" "| "\n") "[" ws {format_model_and_field_name(outer_object_name)} ("," ws {format_model_and_field_name(outer_object_name)})* ws "]"'
+                + "\n"
+            )
         else:
             root_rule = f"root ::= {format_model_and_field_name(outer_object_name)}\n"
 
-        model_rule = fr'{format_model_and_field_name(outer_object_name)} ::= ws "{{" ws "\"{outer_object_name}\""  ": "  grammar-models'
+        model_rule = (
+            rf'{format_model_and_field_name(outer_object_name)} ::= (" "| "\n") "{{" ws "\"{outer_object_name}\""  ":" ws grammar-models'
+        )
 
         fields_joined = " | ".join(
-            [fr'{format_model_and_field_name(model.__name__)}-grammar-model' for model in models])
+            [rf"{format_model_and_field_name(model.__name__)}-grammar-model" for model in models])
 
-        grammar_model_rules = f'\ngrammar-models ::= {fields_joined}'
+        grammar_model_rules = f"\ngrammar-models ::= {fields_joined}"
         mod_rules = []
         for model in models:
-            mod_rule = fr'{format_model_and_field_name(model.__name__)}-grammar-model ::= ws'
-            mod_rule += fr'"\"{format_model_and_field_name(model.__name__)}\"" "," ws "\"{outer_object_content}\"" ws ":" ws {format_model_and_field_name(model.__name__)}' + '\n'
+            mod_rule = rf"{format_model_and_field_name(model.__name__)}-grammar-model ::= "
+            mod_rule += (
+                rf'"\"{model.__name__}\"" "," ws "\"{outer_object_content}\"" ":" ws {format_model_and_field_name(model.__name__)}' + "\n"
+            )
             mod_rules.append(mod_rule)
         grammar_model_rules += "\n" + "\n".join(mod_rules)
-        look_for_markdown_code_block = False
-        look_for_triple_quoted_string = False
+
         for model in models:
-            model_rules, markdown_block, triple_quoted_string = generate_gbnf_grammar(model,
-                                                                                      processed_models, created_rules)
-            all_rules.extend(model_rules)
-            if markdown_block:
-                look_for_markdown_code_block = True
+            model_rules, has_special_string = generate_gbnf_grammar(model, processed_models,
+                                                                    created_rules)
 
-            if triple_quoted_string:
-                look_for_triple_quoted_string = True
+            if not has_special_string:
+                model_rules[0] += r'"\n" ws "}"'
+
+            all_rules.extend(model_rules)
 
-        if not look_for_markdown_code_block and not look_for_triple_quoted_string:
-            model_rule += ' ws "}"'
         all_rules.insert(0, root_rule + model_rule + grammar_model_rules)
         return "\n".join(all_rules)
 
@@ -582,10 +603,10 @@ def get_primitive_grammar(grammar):
     Returns the needed GBNF primitive grammar for a given GBNF grammar string.
 
     Args:
-    grammar (str): The string containing the GBNF grammar.
+        grammar (str): The string containing the GBNF grammar.
 
     Returns:
-    str: GBNF primitive grammar string.
+        str: GBNF primitive grammar string.
     """
     type_list = []
     if "string-list" in grammar:
@@ -611,7 +632,7 @@ def get_primitive_grammar(grammar):
 
     any_block = ""
     if "custom-class-any" in grammar:
-        any_block = '''
+        any_block = """
 value ::= object | array | string | number | boolean | null
 
 object ::=
@@ -626,7 +647,7 @@ def get_primitive_grammar(grammar):
     ("," ws value)*
   )? "]" ws
 
-number ::= integer | float'''
+number ::= integer | float"""
 
     markdown_code_block_grammar = ""
     if "markdown-code-block" in grammar:
@@ -641,47 +662,140 @@ def get_primitive_grammar(grammar):
 triple-quoted-string ::= triple-quotes triple-quoted-string-content triple-quotes
 triple-quoted-string-content ::= ( [^'] | "'" [^'] |  "'"  "'" [^']  )*
 triple-quotes ::= "'''" """
-    return "\n" + '\n'.join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar
-
+    return "\n" + "\n".join(additional_grammar) + any_block + primitive_grammar + markdown_code_block_grammar
 
-def generate_field_markdown(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1) -> str:
-    indent = '  ' * depth
-    field_markdown = f"{indent}- **{field_name}** (`{field_type.__name__}`): "
-
-    # Extracting field description from Pydantic Field using __model_fields__
-    field_info = model.model_fields.get(field_name)
-    field_description = field_info.description if field_info and field_info.description else "No description available."
 
-    field_markdown += field_description + '\n'
-
-    # Handling nested BaseModel fields
-    if isclass(field_type) and issubclass(field_type, BaseModel):
-        field_markdown += f"{indent}  - Details:\n"
-        for name, type_ in field_type.__annotations__.items():
-            field_markdown += generate_field_markdown(name, type_, field_type, depth + 2)
+def generate_markdown_documentation(
+    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    documentation_with_field_description=True
+) -> str:
+    """
+    Generate markdown documentation for a list of Pydantic models.
 
-    return field_markdown
+    Args:
+        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        model_prefix (str): Prefix for the model section.
+        fields_prefix (str): Prefix for the fields section.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
+    Returns:
+        str: Generated text documentation.
+    """
+    documentation = ""
+    pyd_models = [(model, True) for model in pydantic_models]
+    for model, add_prefix in pyd_models:
+        if add_prefix:
+            documentation += f"{model_prefix}: {model.__name__}\n"
+        else:
+            documentation += f"Model: {model.__name__}\n"
 
-def generate_markdown_report(pydantic_models: List[Type[BaseModel]]) -> str:
-    markdown = ""
-    for model in pydantic_models:
-        markdown += f"### {format_model_and_field_name(model.__name__)}\n"
+        # Handling multi-line model description with proper indentation
 
-        # Check if the model's docstring is different from BaseModel's docstring
         class_doc = getdoc(model)
         base_class_doc = getdoc(BaseModel)
-        class_description = class_doc if class_doc and class_doc != base_class_doc else "No specific description available."
-
-        markdown += f"{class_description}\n\n"
-        markdown += "#### Fields\n"
+        class_description = class_doc if class_doc and class_doc != base_class_doc else ""
+        if class_description != "":
+            documentation += "  Description: "
+            documentation += format_multiline_description(class_description, 0) + "\n"
 
+        if add_prefix:
+            # Indenting the fields section
+            documentation += f"  {fields_prefix}:\n"
+        else:
+            documentation += f"  Fields:\n"
         if isclass(model) and issubclass(model, BaseModel):
             for name, field_type in model.__annotations__.items():
-                markdown += generate_field_markdown(format_model_and_field_name(name), field_type, model)
-        markdown += "\n"
+                # if name == "markdown_code_block":
+                #    continue
+                if get_origin(field_type) == list:
+                    element_type = get_args(field_type)[0]
+                    if isclass(element_type) and issubclass(element_type, BaseModel):
+                        pyd_models.append((element_type, False))
+                if get_origin(field_type) == Union:
+                    element_types = get_args(field_type)
+                    for element_type in element_types:
+                        if isclass(element_type) and issubclass(element_type, BaseModel):
+                            pyd_models.append((element_type, False))
+                documentation += generate_field_markdown(
+                    name, field_type, model, documentation_with_field_description=documentation_with_field_description
+                )
+            documentation += "\n"
+
+        if hasattr(model, "Config") and hasattr(model.Config,
+                                                "json_schema_extra") and "example" in model.Config.json_schema_extra:
+            documentation += f"  Expected Example Output for {format_model_and_field_name(model.__name__)}:\n"
+            json_example = json.dumps(model.Config.json_schema_extra["example"])
+            documentation += format_multiline_description(json_example, 2) + "\n"
 
-    return markdown
+    return documentation
+
+
+def generate_field_markdown(
+    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    documentation_with_field_description=True
+) -> str:
+    """
+    Generate markdown documentation for a Pydantic model field.
+
+    Args:
+        field_name (str): Name of the field.
+        field_type (Type[Any]): Type of the field.
+        model (Type[BaseModel]): Pydantic model class.
+        depth (int): Indentation depth in the documentation.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
+
+    Returns:
+        str: Generated text documentation for the field.
+    """
+    indent = "    " * depth
+
+    field_info = model.model_fields.get(field_name)
+    field_description = field_info.description if field_info and field_info.description else ""
+
+    if get_origin(field_type) == list:
+        element_type = get_args(field_type)[0]
+        field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)} of {format_model_and_field_name(element_type.__name__)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+    elif get_origin(field_type) == Union:
+        element_types = get_args(field_type)
+        types = []
+        for element_type in element_types:
+            types.append(format_model_and_field_name(element_type.__name__))
+        field_text = f"{indent}{field_name} ({' or '.join(types)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+    else:
+        field_text = f"{indent}{field_name} ({format_model_and_field_name(field_type.__name__)})"
+        if field_description != "":
+            field_text += ":\n"
+        else:
+            field_text += "\n"
+
+    if not documentation_with_field_description:
+        return field_text
+
+    if field_description != "":
+        field_text += f"        Description: " + field_description + "\n"
+
+    # Check for and include field-specific examples if available
+    if hasattr(model, "Config") and hasattr(model.Config,
+                                            "json_schema_extra") and "example" in model.Config.json_schema_extra:
+        field_example = model.Config.json_schema_extra["example"].get(field_name)
+        if field_example is not None:
+            example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
+            field_text += f"{indent}  Example: {example_text}\n"
+
+    if isclass(field_type) and issubclass(field_type, BaseModel):
+        field_text += f"{indent}  Details:\n"
+        for name, type_ in field_type.__annotations__.items():
+            field_text += generate_field_markdown(name, type_, field_type, depth + 2)
+
+    return field_text
 
 
 def format_json_example(example: dict, depth: int) -> str:
@@ -689,42 +803,44 @@ def format_json_example(example: dict, depth: int) -> str:
     Format a JSON example into a readable string with indentation.
 
     Args:
-    example (dict): JSON example to be formatted.
-    depth (int): Indentation depth.
+        example (dict): JSON example to be formatted.
+        depth (int): Indentation depth.
 
     Returns:
-    str: Formatted JSON example string.
+        str: Formatted JSON example string.
     """
-    indent = '    ' * depth
-    formatted_example = '{\n'
+    indent = "    " * depth
+    formatted_example = "{\n"
     for key, value in example.items():
         value_text = f"'{value}'" if isinstance(value, str) else value
         formatted_example += f"{indent}{key}: {value_text},\n"
-    formatted_example = formatted_example.rstrip(',\n') + '\n' + indent + '}'
+    formatted_example = formatted_example.rstrip(",\n") + "\n" + indent + "}"
     return formatted_example
 
 
-def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_prefix="Model",
-                                fields_prefix="Fields", documentation_with_field_description=True) -> str:
+def generate_text_documentation(
+    pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
+    documentation_with_field_description=True
+) -> str:
     """
     Generate text documentation for a list of Pydantic models.
 
     Args:
-    pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
-    model_prefix (str): Prefix for the model section.
-    fields_prefix (str): Prefix for the fields section.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
+        model_prefix (str): Prefix for the model section.
+        fields_prefix (str): Prefix for the fields section.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    str: Generated text documentation.
+        str: Generated text documentation.
     """
     documentation = ""
     pyd_models = [(model, True) for model in pydantic_models]
     for model, add_prefix in pyd_models:
         if add_prefix:
-            documentation += f"{model_prefix}: {format_model_and_field_name(model.__name__)}\n"
+            documentation += f"{model_prefix}: {model.__name__}\n"
         else:
-            documentation += f"Model: {format_model_and_field_name(model.__name__)}\n"
+            documentation += f"Model: {model.__name__}\n"
 
         # Handling multi-line model description with proper indentation
 
@@ -735,12 +851,8 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr
             documentation += "  Description: "
             documentation += "\n" + format_multiline_description(class_description, 2) + "\n"
 
-        if add_prefix:
-            # Indenting the fields section
-            documentation += f"  {fields_prefix}:\n"
-        else:
-            documentation += f"  Fields:\n"
         if isclass(model) and issubclass(model, BaseModel):
+            documentation_fields = ""
             for name, field_type in model.__annotations__.items():
                 # if name == "markdown_code_block":
                 #    continue
@@ -753,35 +865,43 @@ def generate_text_documentation(pydantic_models: List[Type[BaseModel]], model_pr
                     for element_type in element_types:
                         if isclass(element_type) and issubclass(element_type, BaseModel):
                             pyd_models.append((element_type, False))
-                documentation += generate_field_text(name, field_type, model,
-                                                     documentation_with_field_description=documentation_with_field_description)
+                documentation_fields += generate_field_text(
+                    name, field_type, model, documentation_with_field_description=documentation_with_field_description
+                )
+            if documentation_fields != "":
+                if add_prefix:
+                    documentation += f"  {fields_prefix}:\n{documentation_fields}"
+                else:
+                    documentation += f"  Fields:\n{documentation_fields}"
             documentation += "\n"
 
-        if hasattr(model, 'Config') and hasattr(model.Config,
-                                                'json_schema_extra') and 'example' in model.Config.json_schema_extra:
+        if hasattr(model, "Config") and hasattr(model.Config,
+                                                "json_schema_extra") and "example" in model.Config.json_schema_extra:
             documentation += f"  Expected Example Output for {format_model_and_field_name(model.__name__)}:\n"
-            json_example = json.dumps(model.Config.json_schema_extra['example'])
+            json_example = json.dumps(model.Config.json_schema_extra["example"])
             documentation += format_multiline_description(json_example, 2) + "\n"
 
     return documentation
 
 
-def generate_field_text(field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
-                        documentation_with_field_description=True) -> str:
+def generate_field_text(
+    field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
+    documentation_with_field_description=True
+) -> str:
     """
     Generate text documentation for a Pydantic model field.
 
     Args:
-    field_name (str): Name of the field.
-    field_type (Type[Any]): Type of the field.
-    model (Type[BaseModel]): Pydantic model class.
-    depth (int): Indentation depth in the documentation.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        field_name (str): Name of the field.
+        field_type (Type[Any]): Type of the field.
+        model (Type[BaseModel]): Pydantic model class.
+        depth (int): Indentation depth in the documentation.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    str: Generated text documentation for the field.
+        str: Generated text documentation for the field.
     """
-    indent = '    ' * depth
+    indent = "    " * depth
 
     field_info = model.model_fields.get(field_name)
     field_description = field_info.description if field_info and field_info.description else ""
@@ -817,9 +937,9 @@ def generate_field_text(field_name: str, field_type: Type[Any], model: Type[Base
         field_text += f"{indent}  Description: " + field_description + "\n"
 
     # Check for and include field-specific examples if available
-    if hasattr(model, 'Config') and hasattr(model.Config,
-                                            'json_schema_extra') and 'example' in model.Config.json_schema_extra:
-        field_example = model.Config.json_schema_extra['example'].get(field_name)
+    if hasattr(model, "Config") and hasattr(model.Config,
+                                            "json_schema_extra") and "example" in model.Config.json_schema_extra:
+        field_example = model.Config.json_schema_extra["example"].get(field_name)
         if field_example is not None:
             example_text = f"'{field_example}'" if isinstance(field_example, str) else field_example
             field_text += f"{indent}  Example: {example_text}\n"
@@ -837,39 +957,40 @@ def format_multiline_description(description: str, indent_level: int) -> str:
     Format a multiline description with proper indentation.
 
     Args:
-    description (str): Multiline description.
-    indent_level (int): Indentation level.
+        description (str): Multiline description.
+        indent_level (int): Indentation level.
 
     Returns:
-    str: Formatted multiline description.
+        str: Formatted multiline description.
     """
-    indent = '    ' * indent_level
-    return indent + description.replace('\n', '\n' + indent)
+    indent = "    " * indent_level
+    return indent + description.replace("\n", "\n" + indent)
 
 
-def save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path="./grammar.gbnf",
-                                        documentation_file_path="./grammar_documentation.md"):
+def save_gbnf_grammar_and_documentation(
+    grammar, documentation, grammar_file_path="./grammar.gbnf", documentation_file_path="./grammar_documentation.md"
+):
     """
     Save GBNF grammar and documentation to specified files.
 
     Args:
-    grammar (str): GBNF grammar string.
-    documentation (str): Documentation string.
-    grammar_file_path (str): File path to save the GBNF grammar.
-    documentation_file_path (str): File path to save the documentation.
+        grammar (str): GBNF grammar string.
+        documentation (str): Documentation string.
+        grammar_file_path (str): File path to save the GBNF grammar.
+        documentation_file_path (str): File path to save the documentation.
 
     Returns:
-    None
+        None
     """
     try:
-        with open(grammar_file_path, 'w') as file:
+        with open(grammar_file_path, "w") as file:
             file.write(grammar + get_primitive_grammar(grammar))
         print(f"Grammar successfully saved to {grammar_file_path}")
     except IOError as e:
         print(f"An error occurred while saving the grammar file: {e}")
 
     try:
-        with open(documentation_file_path, 'w') as file:
+        with open(documentation_file_path, "w") as file:
             file.write(documentation)
         print(f"Documentation successfully saved to {documentation_file_path}")
     except IOError as e:
@@ -881,10 +1002,10 @@ def remove_empty_lines(string):
     Remove empty lines from a string.
 
     Args:
-    string (str): Input string.
+        string (str): Input string.
 
     Returns:
-    str: String with empty lines removed.
+        str: String with empty lines removed.
     """
     lines = string.splitlines()
     non_empty_lines = [line for line in lines if line.strip() != ""]
@@ -892,95 +1013,109 @@ def remove_empty_lines(string):
     return string_no_empty_lines
 
 
-def generate_and_save_gbnf_grammar_and_documentation(pydantic_model_list,
-                                                     grammar_file_path="./generated_grammar.gbnf",
-                                                     documentation_file_path="./generated_grammar_documentation.md",
-                                                     outer_object_name: str = None,
-                                                     outer_object_content: str = None,
-                                                     model_prefix: str = "Output Model",
-                                                     fields_prefix: str = "Output Fields",
-                                                     list_of_outputs: bool = False,
-                                                     documentation_with_field_description=True):
+def generate_and_save_gbnf_grammar_and_documentation(
+    pydantic_model_list,
+    grammar_file_path="./generated_grammar.gbnf",
+    documentation_file_path="./generated_grammar_documentation.md",
+    outer_object_name: str = None,
+    outer_object_content: str = None,
+    model_prefix: str = "Output Model",
+    fields_prefix: str = "Output Fields",
+    list_of_outputs: bool = False,
+    documentation_with_field_description=True,
+):
     """
     Generate GBNF grammar and documentation, and save them to specified files.
 
     Args:
-    pydantic_model_list: List of Pydantic model classes.
-    grammar_file_path (str): File path to save the generated GBNF grammar.
-    documentation_file_path (str): File path to save the generated documentation.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    model_prefix (str): Prefix for the model section in the documentation.
-    fields_prefix (str): Prefix for the fields section in the documentation.
-    list_of_outputs (bool): Whether the output is a list of items.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        pydantic_model_list: List of Pydantic model classes.
+        grammar_file_path (str): File path to save the generated GBNF grammar.
+        documentation_file_path (str): File path to save the generated documentation.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        model_prefix (str): Prefix for the model section in the documentation.
+        fields_prefix (str): Prefix for the fields section in the documentation.
+        list_of_outputs (bool): Whether the output is a list of items.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    None
+        None
     """
-    documentation = generate_text_documentation(pydantic_model_list, model_prefix, fields_prefix,
-                                                documentation_with_field_description=documentation_with_field_description)
-    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
-                                                         outer_object_content, list_of_outputs)
+    documentation = generate_markdown_documentation(
+        pydantic_model_list, model_prefix, fields_prefix,
+        documentation_with_field_description=documentation_with_field_description
+    )
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content,
+                                                         list_of_outputs)
     grammar = remove_empty_lines(grammar)
     save_gbnf_grammar_and_documentation(grammar, documentation, grammar_file_path, documentation_file_path)
 
 
-def generate_gbnf_grammar_and_documentation(pydantic_model_list, outer_object_name: str = None,
-                                            outer_object_content: str = None,
-                                            model_prefix: str = "Output Model",
-                                            fields_prefix: str = "Output Fields", list_of_outputs: bool = False,
-                                            documentation_with_field_description=True):
+def generate_gbnf_grammar_and_documentation(
+    pydantic_model_list,
+    outer_object_name: str = None,
+    outer_object_content: str = None,
+    model_prefix: str = "Output Model",
+    fields_prefix: str = "Output Fields",
+    list_of_outputs: bool = False,
+    documentation_with_field_description=True,
+):
     """
     Generate GBNF grammar and documentation for a list of Pydantic models.
 
     Args:
-    pydantic_model_list: List of Pydantic model classes.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    model_prefix (str): Prefix for the model section in the documentation.
-    fields_prefix (str): Prefix for the fields section in the documentation.
-    list_of_outputs (bool): Whether the output is a list of items.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        pydantic_model_list: List of Pydantic model classes.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        model_prefix (str): Prefix for the model section in the documentation.
+        fields_prefix (str): Prefix for the fields section in the documentation.
+        list_of_outputs (bool): Whether the output is a list of items.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    tuple: GBNF grammar string, documentation string.
+        tuple: GBNF grammar string, documentation string.
     """
-    documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix,
-                                                documentation_with_field_description=documentation_with_field_description)
-    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
-                                                         outer_object_content, list_of_outputs)
+    documentation = generate_markdown_documentation(
+        copy(pydantic_model_list), model_prefix, fields_prefix,
+        documentation_with_field_description=documentation_with_field_description
+    )
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content,
+                                                         list_of_outputs)
     grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar))
     return grammar, documentation
 
 
-def generate_gbnf_grammar_and_documentation_from_dictionaries(dictionaries: List[dict],
-                                                              outer_object_name: str = None,
-                                                              outer_object_content: str = None,
-                                                              model_prefix: str = "Output Model",
-                                                              fields_prefix: str = "Output Fields",
-                                                              list_of_outputs: bool = False,
-                                                              documentation_with_field_description=True):
+def generate_gbnf_grammar_and_documentation_from_dictionaries(
+    dictionaries: List[dict],
+    outer_object_name: str = None,
+    outer_object_content: str = None,
+    model_prefix: str = "Output Model",
+    fields_prefix: str = "Output Fields",
+    list_of_outputs: bool = False,
+    documentation_with_field_description=True,
+):
     """
     Generate GBNF grammar and documentation from a list of dictionaries.
 
     Args:
-    dictionaries (List[dict]): List of dictionaries representing Pydantic models.
-    outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
-    outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
-    model_prefix (str): Prefix for the model section in the documentation.
-    fields_prefix (str): Prefix for the fields section in the documentation.
-    list_of_outputs (bool): Whether the output is a list of items.
-    documentation_with_field_description (bool): Include field descriptions in the documentation.
+        dictionaries (List[dict]): List of dictionaries representing Pydantic models.
+        outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
+        outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
+        model_prefix (str): Prefix for the model section in the documentation.
+        fields_prefix (str): Prefix for the fields section in the documentation.
+        list_of_outputs (bool): Whether the output is a list of items.
+        documentation_with_field_description (bool): Include field descriptions in the documentation.
 
     Returns:
-    tuple: GBNF grammar string, documentation string.
+        tuple: GBNF grammar string, documentation string.
     """
     pydantic_model_list = create_dynamic_models_from_dictionaries(dictionaries)
-    documentation = generate_text_documentation(copy(pydantic_model_list), model_prefix, fields_prefix,
-                                                documentation_with_field_description=documentation_with_field_description)
-    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name,
-                                                         outer_object_content, list_of_outputs)
+    documentation = generate_markdown_documentation(
+        copy(pydantic_model_list), model_prefix, fields_prefix,
+        documentation_with_field_description=documentation_with_field_description
+    )
+    grammar = generate_gbnf_grammar_from_pydantic_models(pydantic_model_list, outer_object_name, outer_object_content,
+                                                         list_of_outputs)
     grammar = remove_empty_lines(grammar + get_primitive_grammar(grammar))
     return grammar, documentation
 
@@ -990,41 +1125,61 @@ def create_dynamic_model_from_function(func: Callable):
     Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
 
     Args:
-    func (Callable): A function with type hints from which to create the model.
+        func (Callable): A function with type hints from which to create the model.
 
     Returns:
-    A dynamic Pydantic model class with the provided function as a 'run' method.
+        A dynamic Pydantic model class with the provided function as a 'run' method.
     """
-    # Extracting type hints from the provided function
-    type_hints = get_type_hints(func)
-    type_hints.pop('return', None)
 
-    # Handling default values and annotations
-    dynamic_fields = {}
-    defaults = getattr(func, '__defaults__', ()) or ()
-    defaults_index = len(type_hints) - len(defaults)
+    # Get the signature of the function
+    sig = inspect.signature(func)
 
-    for index, (name, typ) in enumerate(type_hints.items()):
-        if index >= defaults_index:
-            default_value = defaults[index - defaults_index]
-            dynamic_fields[name] = (typ, default_value)
-        else:
-            dynamic_fields[name] = (typ, ...)
+    # Parse the docstring
+    docstring = parse(func.__doc__)
 
+    dynamic_fields = {}
+    param_docs = []
+    for param in sig.parameters.values():
+        # Exclude 'self' parameter
+        if param.name == "self":
+            continue
+
+        # Assert that the parameter has a type annotation
+        if param.annotation == inspect.Parameter.empty:
+            raise TypeError(f"Parameter '{param.name}' in function '{func.__name__}' lacks a type annotation")
+
+        # Find the parameter's description in the docstring
+        param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
+
+        # Assert that the parameter has a description
+        if not param_doc or not param_doc.description:
+            raise ValueError(
+                f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
+
+        # Add parameter details to the schema
+        param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
+        param_docs.append((param.name, param_doc))
+        if param.default == inspect.Parameter.empty:
+            default_value = ...
+        else:
+            default_value = param.default
+        dynamic_fields[param.name] = (
+            param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
     # Creating the dynamic model
-    dynamicModel = create_model(f'{func.__name__}', **dynamic_fields)
+    dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
+
+    for param_doc in param_docs:
+        dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
 
-    dynamicModel.__doc__ = getdoc(func)
+    dynamic_model.__doc__ = docstring.short_description
 
-    # Wrapping the original function to handle instance 'self'
     def run_method_wrapper(self):
-        func_args = {name: getattr(self, name) for name in type_hints}
+        func_args = {name: getattr(self, name) for name, _ in dynamic_fields.items()}
         return func(**func_args)
 
     # Adding the wrapped function as a 'run' method
-    setattr(dynamicModel, 'run', run_method_wrapper)
-
-    return dynamicModel
+    setattr(dynamic_model, "run", run_method_wrapper)
+    return dynamic_model
 
 
 def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
@@ -1032,11 +1187,11 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
     Add a 'run' method to a dynamic Pydantic model, using the provided function.
 
     Args:
-    - model (Type[BaseModel]): Dynamic Pydantic model class.
-    - func (Callable): Function to be added as a 'run' method to the model.
+        model (Type[BaseModel]): Dynamic Pydantic model class.
+        func (Callable): Function to be added as a 'run' method to the model.
 
     Returns:
-    - Type[BaseModel]: Pydantic model class with the added 'run' method.
+        Type[BaseModel]: Pydantic model class with the added 'run' method.
     """
 
     def run_method_wrapper(self):
@@ -1044,7 +1199,7 @@ def run_method_wrapper(self):
         return func(**func_args)
 
     # Adding the wrapped function as a 'run' method
-    setattr(model, 'run', run_method_wrapper)
+    setattr(model, "run", run_method_wrapper)
 
     return model
 
@@ -1054,15 +1209,15 @@ def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
     Create a list of dynamic Pydantic model classes from a list of dictionaries.
 
     Args:
-    - dictionaries (List[dict]): List of dictionaries representing model structures.
+        dictionaries (List[dict]): List of dictionaries representing model structures.
 
     Returns:
-    - List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
+        List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
     """
     dynamic_models = []
     for func in dictionaries:
         model_name = format_model_and_field_name(func.get("name", ""))
-        dyn_model = convert_dictionary_to_to_pydantic_model(func, model_name)
+        dyn_model = convert_dictionary_to_pydantic_model(func, model_name)
         dynamic_models.append(dyn_model)
     return dynamic_models
 
@@ -1080,12 +1235,12 @@ def map_grammar_names_to_pydantic_model_class(pydantic_model_list):
 
 def json_schema_to_python_types(schema):
     type_map = {
-        'any': Any,
-        'string': str,
-        'number': float,
-        'integer': int,
-        'boolean': bool,
-        'array': list,
+        "any": Any,
+        "string": str,
+        "number": float,
+        "integer": int,
+        "boolean": bool,
+        "array": list,
     }
     return type_map[schema]
 
@@ -1094,58 +1249,64 @@ def list_to_enum(enum_name, values):
     return Enum(enum_name, {value: value for value in values})
 
 
-def convert_dictionary_to_to_pydantic_model(dictionary: dict, model_name: str = 'CustomModel') -> Type[BaseModel]:
+def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
     """
     Convert a dictionary to a Pydantic model class.
 
     Args:
-    - dictionary (dict): Dictionary representing the model structure.
-    - model_name (str): Name of the generated Pydantic model.
+        dictionary (dict): Dictionary representing the model structure.
+        model_name (str): Name of the generated Pydantic model.
 
     Returns:
-    - Type[BaseModel]: Generated Pydantic model class.
+        Type[BaseModel]: Generated Pydantic model class.
     """
     fields = {}
 
     if "properties" in dictionary:
         for field_name, field_data in dictionary.get("properties", {}).items():
-            if field_data == 'object':
-                submodel = convert_dictionary_to_to_pydantic_model(dictionary, f'{model_name}_{field_name}')
+            if field_data == "object":
+                submodel = convert_dictionary_to_pydantic_model(dictionary, f"{model_name}_{field_name}")
                 fields[field_name] = (submodel, ...)
             else:
-                field_type = field_data.get('type', 'str')
+                field_type = field_data.get("type", "str")
 
                 if field_data.get("enum", []):
                     fields[field_name] = (list_to_enum(field_name, field_data.get("enum", [])), ...)
-                if field_type == "array":
+                elif field_type == "array":
                     items = field_data.get("items", {})
                     if items != {}:
                         array = {"properties": items}
-                        array_type = convert_dictionary_to_to_pydantic_model(array, f'{model_name}_{field_name}_items')
+                        array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
                         fields[field_name] = (List[array_type], ...)
                     else:
                         fields[field_name] = (list, ...)
-                elif field_type == 'object':
-                    submodel = convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}_{field_name}')
+                elif field_type == "object":
+                    submodel = convert_dictionary_to_pydantic_model(field_data, f"{model_name}_{field_name}")
                     fields[field_name] = (submodel, ...)
+                elif field_type == "required":
+                    required = field_data.get("enum", [])
+                    for key, field in fields.items():
+                        if key not in required:
+                            fields[key] = (Optional[fields[key][0]], ...)
                 else:
                     field_type = json_schema_to_python_types(field_type)
                     fields[field_name] = (field_type, ...)
     if "function" in dictionary:
-
         for field_name, field_data in dictionary.get("function", {}).items():
             if field_name == "name":
                 model_name = field_data
             elif field_name == "description":
                 fields["__doc__"] = field_data
             elif field_name == "parameters":
-                return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}')
+                return convert_dictionary_to_pydantic_model(field_data, f"{model_name}")
+
     if "parameters" in dictionary:
         field_data = {"function": dictionary}
-        return convert_dictionary_to_to_pydantic_model(field_data, f'{model_name}')
-
+        return convert_dictionary_to_pydantic_model(field_data, f"{model_name}")
+    if "required" in dictionary:
+        required = dictionary.get("required", [])
+        for key, field in fields.items():
+            if key not in required:
+                fields[key] = (Optional[fields[key][0]], ...)
     custom_model = create_model(model_name, **fields)
     return custom_model
-
-
-

From 7c8d3abd1a17c28fc56b1a4814bc4b29f91d7454 Mon Sep 17 00:00:00 2001
From: Alex Azarov <alex@azarov.by>
Date: Tue, 16 Jan 2024 14:33:02 +0100
Subject: [PATCH 05/25] metal : log `recommendedMaxWorkingSetSize` on iOS 16+
 (#4936)

* metal: Log `recommendedMaxWorkingSetSize` on iOS 16+

* Only log on iOS and macOS, ignoring tvOS and other platforms

* Check for Xcode version before using recommendedMaxWorkingSetSize

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml-metal.m | 58 ++++++++++++++++++++++++----------------------------
 1 file changed, 27 insertions(+), 31 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 867f2fd48cbd2..44134d1d92494 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -369,8 +369,12 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
     GGML_METAL_LOG_INFO("%s: simdgroup reduction support   = %s\n",       __func__, ctx->support_simdgroup_reduction ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: simdgroup matrix mul. support = %s\n",       __func__, ctx->support_simdgroup_mm ? "true" : "false");
     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
+
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
+    }
+#elif TARGET_OS_OSX
     if (ctx->device.maxTransferRate != 0) {
         GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
     } else {
@@ -2369,6 +2373,25 @@ GGML_CALL static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buff
     UNUSED(buft);
 }
 
+static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device) {
+#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
+    if (@available(macOS 10.12, iOS 16.0, *)) {
+        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+                device.currentAllocatedSize / 1024.0 / 1024.0,
+                device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+        if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+        } else {
+            GGML_METAL_LOG_INFO("\n");
+        }
+    } else {
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+    }
+#endif
+    UNUSED(device);
+}
+
 GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
 
@@ -2401,22 +2424,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
     }
 
     GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
-
-
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            device.currentAllocatedSize / 1024.0 / 1024.0,
-            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-    } else {
-        GGML_METAL_LOG_INFO("\n");
-    }
-#else
-    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
-
+    ggml_backend_metal_log_allocated_size(device);
 
     return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }
@@ -2524,19 +2532,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data,
         }
     }
 
-#if TARGET_OS_OSX
-    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            device.currentAllocatedSize / 1024.0 / 1024.0,
-            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
-
-    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
-        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
-    } else {
-        GGML_METAL_LOG_INFO("\n");
-    }
-#else
-    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
-#endif
+    ggml_backend_metal_log_allocated_size(device);
 
     return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
 }

From 3a48d558a69c88ac17efcaa5900cd9eb19596ac4 Mon Sep 17 00:00:00 2001
From: Alex Azarov <alex@azarov.by>
Date: Tue, 16 Jan 2024 14:41:27 +0100
Subject: [PATCH 06/25] metal : replace loop of dispatch_async with
 dispatch_apply (#4934)

* Replace loop of dispatch_async with dispatch_apply

* Update ggml-metal.m

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ggml-metal.m | 2796 +++++++++++++++++++++++++-------------------------
 1 file changed, 1396 insertions(+), 1400 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index 44134d1d92494..c21dc465ae50c 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -737,1521 +737,1517 @@ static bool ggml_metal_graph_compute(
         ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
     }
 
-    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-        const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
-
-        dispatch_async(ctx->d_queue, ^{
-            size_t offs_src0 = 0;
-            size_t offs_src1 = 0;
-            size_t offs_dst  = 0;
-
-            id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-            id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
-
-            const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
-            const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
-
-            for (int ind = node_start; ind < node_end; ++ind) {
-                const int i = ind;
-
-                if (i == -1) {
-                    [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
-                    continue;
-                }
-
-                //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
-
-                struct ggml_tensor * src0 = gf->nodes[i]->src[0];
-                struct ggml_tensor * src1 = gf->nodes[i]->src[1];
-                struct ggml_tensor * dst  = gf->nodes[i];
-
-                switch (dst->op) {
-                    case GGML_OP_NONE:
-                    case GGML_OP_RESHAPE:
-                    case GGML_OP_VIEW:
-                    case GGML_OP_TRANSPOSE:
-                    case GGML_OP_PERMUTE:
-                        {
-                            // noop -> next node
-                        } continue;
-                    default:
-                        {
-                        } break;
-                }
-
-                if (!ggml_metal_supports_op(ctx, dst)) {
-                    GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
-                    GGML_ASSERT(!"unsupported op");
-                }
-
-#ifndef GGML_METAL_NDEBUG
-                [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
-#endif
-
-                const int64_t  ne00 = src0 ? src0->ne[0] : 0;
-                const int64_t  ne01 = src0 ? src0->ne[1] : 0;
-                const int64_t  ne02 = src0 ? src0->ne[2] : 0;
-                const int64_t  ne03 = src0 ? src0->ne[3] : 0;
-
-                const uint64_t nb00 = src0 ? src0->nb[0] : 0;
-                const uint64_t nb01 = src0 ? src0->nb[1] : 0;
-                const uint64_t nb02 = src0 ? src0->nb[2] : 0;
-                const uint64_t nb03 = src0 ? src0->nb[3] : 0;
-
-                const int64_t  ne10 = src1 ? src1->ne[0] : 0;
-                const int64_t  ne11 = src1 ? src1->ne[1] : 0;
-                const int64_t  ne12 = src1 ? src1->ne[2] : 0;
-                const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
-
-                const uint64_t nb10 = src1 ? src1->nb[0] : 0;
-                const uint64_t nb11 = src1 ? src1->nb[1] : 0;
-                const uint64_t nb12 = src1 ? src1->nb[2] : 0;
-                const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
-
-                const int64_t  ne0  = dst ? dst->ne[0] : 0;
-                const int64_t  ne1  = dst ? dst->ne[1] : 0;
-                const int64_t  ne2  = dst ? dst->ne[2] : 0;
-                const int64_t  ne3  = dst ? dst->ne[3] : 0;
-
-                const uint64_t nb0  = dst ? dst->nb[0] : 0;
-                const uint64_t nb1  = dst ? dst->nb[1] : 0;
-                const uint64_t nb2  = dst ? dst->nb[2] : 0;
-                const uint64_t nb3  = dst ? dst->nb[3] : 0;
-
-                const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
-                const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
-                const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
-
-                id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
-                id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
-                id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
-
-                //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
-                //if (src0) {
-                //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
-                //            ggml_is_contiguous(src0), src0->name);
-                //}
-                //if (src1) {
-                //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
-                //            ggml_is_contiguous(src1), src1->name);
-                //}
-                //if (dst) {
-                //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
-                //            dst->name);
-                //}
-
-                switch (dst->op) {
-                    case GGML_OP_CONCAT:
-                        {
-                            const int64_t nb = ne00;
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
-
-                            const int nth = MIN(1024, ne0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ADD:
-                    case GGML_OP_MUL:
-                    case GGML_OP_DIV:
-                        {
-                            const size_t offs = 0;
-
-                            bool bcast_row = false;
-
-                            int64_t nb = ne00;
+    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
+    dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
+        const int cb_idx = iter;
 
-                            id<MTLComputePipelineState> pipeline = nil;
+        size_t offs_src0 = 0;
+        size_t offs_src1 = 0;
+        size_t offs_dst  = 0;
 
-                            if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
-                                GGML_ASSERT(ggml_is_contiguous(src0));
+        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
+        id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
 
-                                // src1 is a row
-                                GGML_ASSERT(ne11 == 1);
+        const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
+        const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
 
-                                nb = ne00 / 4;
-                                switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
-                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
-                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
-                                    default: GGML_ASSERT(false);
-                                }
+        for (int ind = node_start; ind < node_end; ++ind) {
+            const int i = ind;
 
-                                bcast_row = true;
-                            } else {
-                                switch (dst->op) {
-                                    case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
-                                    case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
-                                    case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
-                                    default: GGML_ASSERT(false);
-                                }
-                            }
+            if (i == -1) {
+                [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
+                continue;
+            }
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
-                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-                            [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
-
-                            if (bcast_row) {
-                                const int64_t n = ggml_nelements(dst)/4;
+            //GGML_METAL_LOG_INFO("%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+
+            struct ggml_tensor * src0 = gf->nodes[i]->src[0];
+            struct ggml_tensor * src1 = gf->nodes[i]->src[1];
+            struct ggml_tensor * dst  = gf->nodes[i];
+
+            switch (dst->op) {
+                case GGML_OP_NONE:
+                case GGML_OP_RESHAPE:
+                case GGML_OP_VIEW:
+                case GGML_OP_TRANSPOSE:
+                case GGML_OP_PERMUTE:
+                    {
+                        // noop -> next node
+                    } continue;
+                default:
+                    {
+                    } break;
+            }
 
-                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            } else {
-                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+            if (!ggml_metal_supports_op(ctx, dst)) {
+                GGML_METAL_LOG_ERROR("%s: error: unsupported op '%s'\n", __func__, ggml_op_desc(dst));
+                GGML_ASSERT(!"unsupported op");
+            }
 
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                            }
-                        } break;
-                    case GGML_OP_ACC:
-                        {
-                            GGML_ASSERT(src0t == GGML_TYPE_F32);
-                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                            GGML_ASSERT(dstt  == GGML_TYPE_F32);
+#ifndef GGML_METAL_NDEBUG
+            [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
+#endif
 
+            const int64_t  ne00 = src0 ? src0->ne[0] : 0;
+            const int64_t  ne01 = src0 ? src0->ne[1] : 0;
+            const int64_t  ne02 = src0 ? src0->ne[2] : 0;
+            const int64_t  ne03 = src0 ? src0->ne[3] : 0;
+
+            const uint64_t nb00 = src0 ? src0->nb[0] : 0;
+            const uint64_t nb01 = src0 ? src0->nb[1] : 0;
+            const uint64_t nb02 = src0 ? src0->nb[2] : 0;
+            const uint64_t nb03 = src0 ? src0->nb[3] : 0;
+
+            const int64_t  ne10 = src1 ? src1->ne[0] : 0;
+            const int64_t  ne11 = src1 ? src1->ne[1] : 0;
+            const int64_t  ne12 = src1 ? src1->ne[2] : 0;
+            const int64_t  ne13 = src1 ? src1->ne[3] : 0; UNUSED(ne13);
+
+            const uint64_t nb10 = src1 ? src1->nb[0] : 0;
+            const uint64_t nb11 = src1 ? src1->nb[1] : 0;
+            const uint64_t nb12 = src1 ? src1->nb[2] : 0;
+            const uint64_t nb13 = src1 ? src1->nb[3] : 0; UNUSED(nb13);
+
+            const int64_t  ne0  = dst ? dst->ne[0] : 0;
+            const int64_t  ne1  = dst ? dst->ne[1] : 0;
+            const int64_t  ne2  = dst ? dst->ne[2] : 0;
+            const int64_t  ne3  = dst ? dst->ne[3] : 0;
+
+            const uint64_t nb0  = dst ? dst->nb[0] : 0;
+            const uint64_t nb1  = dst ? dst->nb[1] : 0;
+            const uint64_t nb2  = dst ? dst->nb[2] : 0;
+            const uint64_t nb3  = dst ? dst->nb[3] : 0;
+
+            const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT;
+            const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
+            const enum ggml_type dstt  = dst  ? dst->type  : GGML_TYPE_COUNT;
+
+            id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
+            id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
+            id<MTLBuffer> id_dst  = dst  ? ggml_metal_get_buffer(ctx, dst,  &offs_dst)  : nil;
+
+            //GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
+            //if (src0) {
+            //    GGML_METAL_LOG_INFO("%s: src0 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src0t), ne00, ne01, ne02,
+            //            ggml_is_contiguous(src0), src0->name);
+            //}
+            //if (src1) {
+            //    GGML_METAL_LOG_INFO("%s: src1 - %4s [%5lld, %5lld, %5lld], %d, %s\n", __func__, ggml_type_name(src1t), ne10, ne11, ne12,
+            //            ggml_is_contiguous(src1), src1->name);
+            //}
+            //if (dst) {
+            //    GGML_METAL_LOG_INFO("%s: dst  - %4s [%5lld, %5lld, %5lld], 1, %s\n",  __func__, ggml_type_name(dstt),  ne0,  ne1,  ne2,
+            //            dst->name);
+            //}
+
+            switch (dst->op) {
+                case GGML_OP_CONCAT:
+                    {
+                        const int64_t nb = ne00;
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CONCAT].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                        [encoder setBytes:&nb   length:sizeof(nb)   atIndex:27];
+
+                        const int nth = MIN(1024, ne0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_ADD:
+                case GGML_OP_MUL:
+                case GGML_OP_DIV:
+                    {
+                        const size_t offs = 0;
+
+                        bool bcast_row = false;
+
+                        int64_t nb = ne00;
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) {
                             GGML_ASSERT(ggml_is_contiguous(src0));
-                            GGML_ASSERT(ggml_is_contiguous(src1));
-
-                            const size_t pnb1 = ((int32_t *) dst->op_params)[0];
-                            const size_t pnb2 = ((int32_t *) dst->op_params)[1];
-                            const size_t pnb3 = ((int32_t *) dst->op_params)[2];
-                            const size_t offs = ((int32_t *) dst->op_params)[3];
-
-                            const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
-
-                            if (!inplace) {
-                                // run a separete kernel to cpy src->dst
-                                // not sure how to avoid this
-                                // TODO: make a simpler cpy_bytes kernel
 
-                                const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
+                            // src1 is a row
+                            GGML_ASSERT(ne11 == 1);
 
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                                [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                                [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                                [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                                [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                                [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                                [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                                [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                                [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                                [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                                [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                                [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                                [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                                [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                                [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                                [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                                [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                                [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
-
-                                const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                            nb = ne00 / 4;
+                            switch (dst->op) {
+                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
+                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
+                                default: GGML_ASSERT(false);
                             }
 
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
-                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
-                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
-                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
-                            [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
-                            [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
-                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
-                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
-
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_SCALE:
-                        {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
-
-                            const float scale = *(const float *) dst->op_params;
-
-                            int64_t n = ggml_nelements(dst);
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (n % 4 == 0) {
-                                n /= 4;
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
+                            bcast_row = true;
+                        } else {
+                            switch (dst->op) {
+                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
+                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
+                                default: GGML_ASSERT(false);
                             }
+                        }
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:8];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:9];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:10];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:24];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:25];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:26];
+                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+                        [encoder setBytes:&nb   length:sizeof(nb)   atIndex:28];
+
+                        if (bcast_row) {
+                            const int64_t n = ggml_nelements(dst)/4;
 
                             [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_UNARY:
-                        switch (ggml_get_unary_op(gf->nodes[i])) {
-                            case GGML_UNARY_OP_TANH:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_RELU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_GELU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_GELU_QUICK:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
-
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            case GGML_UNARY_OP_SILU:
-                                {
-                                    id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
-
-                                    [encoder setComputePipelineState:pipeline];
-                                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                    [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-
-                                    const int64_t n = ggml_nelements(dst);
-                                    GGML_ASSERT(n % 4 == 0);
+                        } else {
+                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
 
-                                    [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                                } break;
-                            default:
-                                {
-                                    GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                                    GGML_ASSERT(false);
-                                }
-                        } break;
-                    case GGML_OP_SQR:
-                        {
-                            GGML_ASSERT(ggml_is_contiguous(src0));
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        }
+                    } break;
+                case GGML_OP_ACC:
+                    {
+                        GGML_ASSERT(src0t == GGML_TYPE_F32);
+                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                        GGML_ASSERT(dstt  == GGML_TYPE_F32);
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+                        GGML_ASSERT(ggml_is_contiguous(src1));
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+                        const size_t pnb1 = ((int32_t *) dst->op_params)[0];
+                        const size_t pnb2 = ((int32_t *) dst->op_params)[1];
+                        const size_t pnb3 = ((int32_t *) dst->op_params)[2];
+                        const size_t offs = ((int32_t *) dst->op_params)[3];
 
-                            const int64_t n = ggml_nelements(dst);
+                        const bool inplace = (bool) ((int32_t *) dst->op_params)[4];
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_SUM_ROWS:
-                        {
-                            GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+                        if (!inplace) {
+                            // run a separate kernel to cpy src->dst
+                            // not sure how to avoid this
+                            // TODO: make a simpler cpy_bytes kernel
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;
 
                             [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
-                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                            [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_SOFT_MAX:
-                        {
-                            int nth = 32; // SIMD width
-
-                            id<MTLComputePipelineState> pipeline = nil;
-
-                            if (ne00%4 == 0) {
-                                while (nth < ne00/4 && nth < 256) {
-                                    nth *= 2;
-                                }
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline;
-                            } else {
-                                while (nth < ne00 && nth < 1024) {
-                                    nth *= 2;
-                                }
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
-                            }
+                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
 
-                            const float scale = ((float *) dst->op_params)[0];
+                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                            if (id_src1) {
-                                [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
-                            } else {
-                                [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
-                            }
-                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
-                            [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
-                            [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
-                            [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
-                            [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_DIAG_MASK_INF:
-                        {
-                            const int n_past = ((int32_t *)(dst->op_params))[0];
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                        }
 
-                            id<MTLComputePipelineState> pipeline = nil;
+                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:6];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:7];
+                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:8];
+                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:9];
+                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:10];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:11];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:12];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:13];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:14];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:15];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:16];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:17];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:18];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:19];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:20];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:21];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:22];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:23];
+                        [encoder setBytes:&pnb1 length:sizeof(pnb1) atIndex:24];
+                        [encoder setBytes:&pnb2 length:sizeof(pnb2) atIndex:25];
+                        [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
+                        [encoder setBytes:&offs length:sizeof(offs) atIndex:27];
+
+                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne00);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_SCALE:
+                    {
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+
+                        const float scale = *(const float *) dst->op_params;
+
+                        int64_t n = ggml_nelements(dst);
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        if (n % 4 == 0) {
+                            n /= 4;
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE_4].pipeline;
+                        } else {
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SCALE].pipeline;
+                        }
 
-                            if (ne00%8 == 0) {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
-                            } else {
-                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
-                            }
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&scale length:sizeof(scale) atIndex:2];
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_UNARY:
+                    switch (ggml_get_unary_op(gf->nodes[i])) {
+                        case GGML_UNARY_OP_TANH:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline;
 
-                            if (ne00%8 == 0) {
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            }
-                            else {
-                                [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                            }
-                        } break;
-                    case GGML_OP_MUL_MAT:
-                        {
-                            GGML_ASSERT(ne00 == ne10);
+                                [encoder setComputePipelineState:pipeline];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
-                            // TODO: assert that dim2 and dim3 are contiguous
-                            GGML_ASSERT(ne12 % ne02 == 0);
-                            GGML_ASSERT(ne13 % ne03 == 0);
+                                const int64_t n = ggml_nelements(dst);
 
-                            const uint r2 = ne12/ne02;
-                            const uint r3 = ne13/ne03;
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_RELU:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RELU].pipeline;
 
-                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                            // to the matrix-vector kernel
-                            int ne11_mm_min = 1;
+                                [encoder setComputePipelineState:pipeline];
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
-#if 0
-                            // the numbers below are measured on M2 Ultra for 7B and 13B models
-                            // these numbers do not translate to other devices or model sizes
-                            // TODO: need to find a better approach
-                            if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
-                                switch (src0t) {
-                                    case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
-                                    case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
-                                    case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q4_0:
-                                    case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
-                                    case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
-                                    case GGML_TYPE_Q5_0:                          // not tested yet
-                                    case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
-                                    case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
-                                    case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
-                                    default:             ne11_mm_min = 1;  break;
-                                }
-                            }
-#endif
+                                const int64_t n = ggml_nelements(dst);
 
-                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                !ggml_is_transposed(src0) &&
-                                !ggml_is_transposed(src1) &&
-                                src1t == GGML_TYPE_F32 &&
-                                ne00 % 32 == 0 && ne00 >= 64 &&
-                                (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
-                                //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                switch (src0->type) {
-                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
-                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
-                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
-                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
-                                    default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
-                                }
+                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_GELU:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline;
 
                                 [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
-                                [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
-                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
-                                [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
-                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-                                [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                            } else {
-                                int nth0 = 32;
-                                int nth1 = 1;
-                                int nrows = 1;
-                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                // use custom matrix x vector kernel
-                                switch (src0t) {
-                                    case GGML_TYPE_F32:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
-                                            nrows = 4;
-                                        } break;
-                                    case GGML_TYPE_F16:
-                                        {
-                                            nth0 = 32;
-                                            nth1 = 1;
-                                            if (src1t == GGML_TYPE_F32) {
-                                                if (ne11 * ne12 < 4) {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
-                                                } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
-                                                    nrows = ne11;
-                                                } else {
-                                                    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
-                                                    nrows = 4;
-                                                }
-                                            } else {
-                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
-                                                nrows = 4;
-                                            }
-                                        } break;
-                                    case GGML_TYPE_Q4_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q8_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q2_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q3_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_K:
-                                        {
-                                            nth0 = 4; //1;
-                                            nth1 = 8; //32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q6_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
-                                        } break;
-                                    default:
-                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
-                                            GGML_ASSERT(false && "not implemented");
-                                        }
-                                };
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
-                                if (ggml_is_quantized(src0t)) {
-                                    GGML_ASSERT(ne00 >= nth0*nth1);
-                                }
+                                const int64_t n = ggml_nelements(dst);
+                                GGML_ASSERT(n % 4 == 0);
 
-                                [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                                [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
-                                [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
-                                [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
-                                [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                                [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
-                                [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
-                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
-                                [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
-                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:17];
-                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
-
-                                if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
-                                    src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
-                                    src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                                }
-                                else if (src0t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src0t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                } else {
-                                    const int64_t ny = (ne11 + nrows - 1)/nrows;
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                            }
-                        } break;
-                    case GGML_OP_MUL_MAT_ID:
-                        {
-                            //GGML_ASSERT(ne00 == ne10);
-                            //GGML_ASSERT(ne03 == ne13);
-
-                            GGML_ASSERT(src0t == GGML_TYPE_I32);
-
-                            const int n_as = ((int32_t *) dst->op_params)[1];
-
-                            // TODO: make this more general
-                            GGML_ASSERT(n_as <= 8);
-
-                            // max size of the src1ids array in the kernel stack
-                            GGML_ASSERT(ne11 <= 512);
-
-                            struct ggml_tensor * src2 = gf->nodes[i]->src[2];
-
-                            const int64_t  ne20 = src2 ? src2->ne[0] : 0;
-                            const int64_t  ne21 = src2 ? src2->ne[1] : 0;
-                            const int64_t  ne22 = src2 ? src2->ne[2] : 0;
-                            const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
-
-                            const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
-                            const uint64_t nb21 = src2 ? src2->nb[1] : 0;
-                            const uint64_t nb22 = src2 ? src2->nb[2] : 0;
-                            const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
-
-                            const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
-
-                            GGML_ASSERT(!ggml_is_transposed(src2));
-                            GGML_ASSERT(!ggml_is_transposed(src1));
-
-                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-
-                            const uint r2 = ne12/ne22;
-                            const uint r3 = ne13/ne23;
-
-                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
-                            // to the matrix-vector kernel
-                            int ne11_mm_min = n_as;
-
-                            const int idx = ((int32_t *) dst->op_params)[0];
-
-                            // batch size
-                            GGML_ASSERT(ne01 == ne11);
-
-                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
-                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
-                            // !!!
-                            // TODO: for now, always use mat-vec kernels until we figure out how to improve the
-                            //       indirect matrix multiplication
-                            // !!!
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
-                                ne20 % 32 == 0 && ne20 >= 64 &&
-                                ne11 > ne11_mm_min) {
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                switch (src2->type) {
-                                    case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
-                                    case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
-                                    case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
-                                    case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
-                                    case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
-                                    default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
-                                }
+                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_GELU_QUICK:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline;
 
                                 [encoder setComputePipelineState:pipeline];
-                                [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
-                                [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
-                                [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
-                                [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
-                                [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
-                                [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
-                                [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
-                                [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
-                                [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
-                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
-                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
-                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
-                                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
-                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
-                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
-                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
-                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < 8; ++j) {
-                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                    size_t offs_src_cur = 0;
-                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
-                                }
-
-                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];
-
-                                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
-                            } else {
-                                int nth0 = 32;
-                                int nth1 = 1;
-                                int nrows = 1;
-                                //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
-
-                                id<MTLComputePipelineState> pipeline = nil;
-
-                                // use custom matrix x vector kernel
-                                switch (src2t) {
-                                    case GGML_TYPE_F32:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_F16:
-                                        {
-                                            GGML_ASSERT(src1t == GGML_TYPE_F32);
-                                            nth0 = 32;
-                                            nth1 = 1;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_1:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q8_0:
-                                        {
-                                            nth0 = 8;
-                                            nth1 = 8;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q2_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q3_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q4_K:
-                                        {
-                                            nth0 = 4; //1;
-                                            nth1 = 8; //32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q5_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_Q6_K:
-                                        {
-                                            nth0 = 2;
-                                            nth1 = 32;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XXS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
-                                        } break;
-                                    case GGML_TYPE_IQ2_XS:
-                                        {
-                                            nth0 = 4;
-                                            nth1 = 16;
-                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
-                                        } break;
-                                    default:
-                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
-                                            GGML_ASSERT(false && "not implemented");
-                                        }
-                                };
+                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
-                                if (ggml_is_quantized(src2t)) {
-                                    GGML_ASSERT(ne20 >= nth0*nth1);
-                                }
+                                const int64_t n = ggml_nelements(dst);
+                                GGML_ASSERT(n % 4 == 0);
 
-                                const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        case GGML_UNARY_OP_SILU:
+                            {
+                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline;
 
                                 [encoder setComputePipelineState:pipeline];
                                 [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
-                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
-                                [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
-                                [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
-                                [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
-                                [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
-                                [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
-                                [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
-                                [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
-                                [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
-                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
-                                [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
-                                [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
-                                [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
-                                [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
-                                [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
-                                [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
-                                [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
-                                [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
-                                [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
-                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
-                                [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
-                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < 8; ++j) {
-                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
-                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
-
-                                    size_t offs_src_cur = 0;
-                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
-
-                                    [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
-                                }
-
-                                if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
-                                    src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
-                                    src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
-                                    const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
-                                    [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q4_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q3_K) {
-#ifdef GGML_QKK_64
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#else
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-#endif
-                                }
-                                else if (src2t == GGML_TYPE_Q5_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                                else if (src2t == GGML_TYPE_Q6_K) {
-                                    [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                } else {
-                                    const int64_t ny = (_ne1 + nrows - 1)/nrows;
-                                    [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
-                                }
-                            }
-                        } break;
-                    case GGML_OP_GET_ROWS:
-                        {
-                            id<MTLComputePipelineState> pipeline = nil;
+                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
 
-                            switch (src0->type) {
-                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
-                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
-                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
-                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
-                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
-                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
-                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
-                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
-                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
-                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
-                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
-                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
-                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
-                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
-                                case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
-                                default: GGML_ASSERT(false && "not implemented");
-                            }
+                                const int64_t n = ggml_nelements(dst);
+                                GGML_ASSERT(n % 4 == 0);
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
-                            [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
-                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
-                            [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7];
-                            [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)];
-                        } break;
-                    case GGML_OP_RMS_NORM:
-                        {
-                            GGML_ASSERT(ne00 % 4 == 0);
-
-                            float eps;
-                            memcpy(&eps, dst->op_params, sizeof(float));
-
-                            int nth = 32; // SIMD width
-
-                            while (nth < ne00/4 && nth < 1024) {
+                                [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                            } break;
+                        default:
+                            {
+                                GGML_METAL_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                                GGML_ASSERT(false);
+                            }
+                    } break;
+                case GGML_OP_SQR:
+                    {
+                        GGML_ASSERT(ggml_is_contiguous(src0));
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQR].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_SUM_ROWS:
+                    {
+                        GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                        [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                        [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11];
+                        [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                        [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
+                        [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                        [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                        [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                        [encoder setBytes:&nb13 length:sizeof(nb13) atIndex:17];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:18];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:19];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:20];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:21];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:22];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:23];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:24];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:25];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_SOFT_MAX:
+                    {
+                        int nth = 32; // SIMD width
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        if (ne00%4 == 0) {
+                            while (nth < ne00/4 && nth < 256) {
                                 nth *= 2;
                             }
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline;
+                        } else {
+                            while (nth < ne00 && nth < 1024) {
+                                nth *= 2;
+                            }
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline;
+                        }
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            const int64_t nrows = ggml_nrows(src0);
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_GROUP_NORM:
-                        {
-                            GGML_ASSERT(ne00 % 4 == 0);
-
-                            //float eps;
-                            //memcpy(&eps, dst->op_params, sizeof(float));
-
-                            const float eps = 1e-6f; // TODO: temporarily hardcoded
-
-                            const int32_t n_groups = ((int32_t *) dst->op_params)[0];
-
-                            int nth = 32; // SIMD width
-
-                            //while (nth < ne00/4 && nth < 1024) {
-                            //    nth *= 2;
-                            //}
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
-                            [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
-                            [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
-                            [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_NORM:
-                        {
-                            float eps;
-                            memcpy(&eps, dst->op_params, sizeof(float));
-
-                            const int nth = MIN(256, ne00);
-
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
-
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
-                            [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
-                            [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0];
+                        const float scale = ((float *) dst->op_params)[0];
 
-                            const int64_t nrows = ggml_nrows(src0);
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                        if (id_src1) {
+                            [encoder setBuffer:id_src1 offset:offs_src1   atIndex:1];
+                        } else {
+                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:1];
+                        }
+                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:2];
+                        [encoder setBytes:&ne00  length:sizeof(ne00)  atIndex:3];
+                        [encoder setBytes:&ne01  length:sizeof(ne01)  atIndex:4];
+                        [encoder setBytes:&ne02  length:sizeof(ne02)  atIndex:5];
+                        [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                case GGML_OP_DIAG_MASK_INF:
+                    {
+                        const int n_past = ((int32_t *)(dst->op_params))[0];
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        if (ne00%8 == 0) {
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8].pipeline;
+                        } else {
+                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF].pipeline;
+                        }
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ALIBI:
-                        {
-                            GGML_ASSERT((src0t == GGML_TYPE_F32));
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00   length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01   length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&n_past length:sizeof(int)  atIndex:4];
 
-                            const int nth = MIN(1024, ne00);
+                        if (ne00%8 == 0) {
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne00*ne01*ne02/8, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        }
+                        else {
+                            [encoder dispatchThreadgroups:MTLSizeMake(ne00, ne01, ne02) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                        }
+                    } break;
+                case GGML_OP_MUL_MAT:
+                    {
+                        GGML_ASSERT(ne00 == ne10);
 
-                            //const int n_past = ((int32_t *) dst->op_params)[0];
-                            const int n_head = ((int32_t *) dst->op_params)[1];
-                            float max_bias;
-                            memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+                        // TODO: assert that dim2 and dim3 are contiguous
+                        GGML_ASSERT(ne12 % ne02 == 0);
+                        GGML_ASSERT(ne13 % ne03 == 0);
 
-                            const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-                            const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-                            const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+                        const uint r2 = ne12/ne02;
+                        const uint r3 = ne13/ne03;
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
+                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                        // to the matrix-vector kernel
+                        int ne11_mm_min = 1;
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
-                            [encoder setBytes:&m1   length:sizeof(   float) atIndex:19];
-                            [encoder setBytes:&n_heads_log2_floor   length:sizeof(int) atIndex:20];
+#if 0
+                        // the numbers below are measured on M2 Ultra for 7B and 13B models
+                        // these numbers do not translate to other devices or model sizes
+                        // TODO: need to find a better approach
+                        if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) {
+                            switch (src0t) {
+                                case GGML_TYPE_F16:  ne11_mm_min = 2;  break;
+                                case GGML_TYPE_Q8_0: ne11_mm_min = 7;  break;
+                                case GGML_TYPE_Q2_K: ne11_mm_min = 15; break;
+                                case GGML_TYPE_Q3_K: ne11_mm_min = 7;  break;
+                                case GGML_TYPE_Q4_0:
+                                case GGML_TYPE_Q4_1: ne11_mm_min = 15; break;
+                                case GGML_TYPE_Q4_K: ne11_mm_min = 11; break;
+                                case GGML_TYPE_Q5_0:                          // not tested yet
+                                case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet
+                                case GGML_TYPE_Q5_K: ne11_mm_min = 7;  break;
+                                case GGML_TYPE_Q6_K: ne11_mm_min = 7;  break;
+                                default:             ne11_mm_min = 1;  break;
+                            }
+                        }
+#endif
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ROPE:
-                        {
-                            GGML_ASSERT(ne10 == ne02);
-
-                            const int nth = MIN(1024, ne00);
-
-                            const int n_past     = ((int32_t *) dst->op_params)[0];
-                            const int n_dims     = ((int32_t *) dst->op_params)[1];
-                            const int mode       = ((int32_t *) dst->op_params)[2];
-                            // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
-                            const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
-
-                            float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-                            memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-                            memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-                            memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-                            memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-                            memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-                            memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                            !ggml_is_transposed(src0) &&
+                            !ggml_is_transposed(src1) &&
+                            src1t == GGML_TYPE_F32 &&
+                            ne00 % 32 == 0 && ne00 >= 64 &&
+                            (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) {
+                            //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
                             id<MTLComputePipelineState> pipeline = nil;
 
                             switch (src0->type) {
-                                case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
+                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32    ].pipeline; break;
+                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32    ].pipeline; break;
+                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32   ].pipeline; break;
+                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32].pipeline; break;
+                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32 ].pipeline; break;
+                                default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
+                            }
 
                             [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
-                            [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
-                            [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
-                            [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
-                            [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
-                            [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
-                            [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
-                            [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
-                            [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
-                            [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
-                            [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
-                            [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
-                            [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
-                            [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
-                            [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
-                            [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
-                            [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_IM2COL:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F16);
-                            GGML_ASSERT(src1->type == GGML_TYPE_F32);
-                            GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-                            const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-                            const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-                            const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-                            const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-                            const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-                            const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-                            const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
-
-                            const int32_t N  = src1->ne[is_2D ? 3 : 2];
-                            const int32_t IC = src1->ne[is_2D ? 2 : 1];
-                            const int32_t IH = is_2D ? src1->ne[1] : 1;
-                            const int32_t IW =         src1->ne[0];
-
-                            const int32_t KH = is_2D ? src0->ne[1] : 1;
-                            const int32_t KW =         src0->ne[0];
-
-                            const int32_t OH = is_2D ? dst->ne[2] : 1;
-                            const int32_t OW =         dst->ne[1];
-
-                            const int32_t CHW = IC * KH * KW;
-
-                            const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4;
-                            const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4;
+                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                            [encoder setBytes:&ne00    length:sizeof(ne00) atIndex:3];
+                            [encoder setBytes:&ne02    length:sizeof(ne02) atIndex:4];
+                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:5];
+                            [encoder setBytes:&nb02    length:sizeof(nb02) atIndex:6];
+                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:7];
+                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:8];
+                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:9];
+                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:10];
+                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:11];
+                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:12];
+                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:13];
+                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:14];
+                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
+                            [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                        } else {
+                            int nth0 = 32;
+                            int nth1 = 1;
+                            int nrows = 1;
+                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
                             id<MTLComputePipelineState> pipeline = nil;
 
-                            switch (src0->type) {
-                                case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break;
-                                case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
-                                default: GGML_ASSERT(false);
+                            // use custom matrix x vector kernel
+                            switch (src0t) {
+                                case GGML_TYPE_F32:
+                                    {
+                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                                        nrows = 4;
+                                    } break;
+                                case GGML_TYPE_F16:
+                                    {
+                                        nth0 = 32;
+                                        nth1 = 1;
+                                        if (src1t == GGML_TYPE_F32) {
+                                            if (ne11 * ne12 < 4) {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
+                                            } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
+                                                nrows = ne11;
+                                            } else {
+                                                pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32].pipeline;
+                                                nrows = 4;
+                                            }
+                                        } else {
+                                            pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16].pipeline;
+                                            nrows = 4;
+                                        }
+                                    } break;
+                                case GGML_TYPE_Q4_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q8_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q2_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q3_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_K:
+                                    {
+                                        nth0 = 4; //1;
+                                        nth1 = 8; //32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q6_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XXS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32].pipeline;
+                                    } break;
+                                default:
+                                    {
+                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                        GGML_ASSERT(false && "not implemented");
+                                    }
                             };
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
-                            [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
-                            [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
-                            [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
-                            [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
-                            [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
-                            [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
-                            [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
-                            [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
-                            [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
-                            [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
-
-                            [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)];
-                        } break;
-                    case GGML_OP_UPSCALE:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-
-                            const int sf = dst->op_params[0];
-
-                            const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
+                            if (ggml_is_quantized(src0t)) {
+                                GGML_ASSERT(ne00 >= nth0*nth1);
+                            }
 
                             [encoder setComputePipelineState:pipeline];
                             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
+                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
+                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
                             [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
                             [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
                             [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
-                            [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:9];
+                            [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:10];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:11];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:12];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:13];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:14];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:15];
+                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:16];
+                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:17];
+                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:18];
+
+                            if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 ||
+                                src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 ||
+                                src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
+                                const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_Q3_K) {
+#ifdef GGML_QKK_64
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+                            }
+                            else if (src0t == GGML_TYPE_Q5_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src0t == GGML_TYPE_Q6_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            } else {
+                                const int64_t ny = (ne11 + nrows - 1)/nrows;
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                        }
+                    } break;
+                case GGML_OP_MUL_MAT_ID:
+                    {
+                        //GGML_ASSERT(ne00 == ne10);
+                        //GGML_ASSERT(ne03 == ne13);
 
-                            const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0);
+                        GGML_ASSERT(src0t == GGML_TYPE_I32);
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_PAD:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                        const int n_as = ((int32_t *) dst->op_params)[1];
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
+                        // TODO: make this more general
+                        GGML_ASSERT(n_as <= 8);
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
-                            [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
-                            [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
-                            [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
-                            [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
-                            [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
-                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
-                            [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
-                            [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
-                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
-                            [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
-                            [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
-                            [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
-                            [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
-                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
-                            [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
-                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                        // max size of the src1ids array in the kernel stack
+                        GGML_ASSERT(ne11 <= 512);
 
-                            const int nth = MIN(1024, ne0);
+                        struct ggml_tensor * src2 = gf->nodes[i]->src[2];
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    case GGML_OP_ARGSORT:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
-                            GGML_ASSERT( dst->type == GGML_TYPE_I32);
+                        const int64_t  ne20 = src2 ? src2->ne[0] : 0;
+                        const int64_t  ne21 = src2 ? src2->ne[1] : 0;
+                        const int64_t  ne22 = src2 ? src2->ne[2] : 0;
+                        const int64_t  ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23);
 
-                            const int nrows = ggml_nrows(src0);
+                        const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20);
+                        const uint64_t nb21 = src2 ? src2->nb[1] : 0;
+                        const uint64_t nb22 = src2 ? src2->nb[2] : 0;
+                        const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23);
 
-                            enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+                        const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
 
-                            id<MTLComputePipelineState> pipeline = nil;
+                        GGML_ASSERT(!ggml_is_transposed(src2));
+                        GGML_ASSERT(!ggml_is_transposed(src1));
 
-                            switch (order) {
-                                case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
-                                case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
-                                default: GGML_ASSERT(false);
-                            };
+                        GGML_ASSERT(src1t == GGML_TYPE_F32);
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        const uint r2 = ne12/ne22;
+                        const uint r3 = ne13/ne23;
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
-                        } break;
-                    case GGML_OP_LEAKY_RELU:
-                        {
-                            GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                        // find the break-even point where the matrix-matrix kernel becomes more efficient compared
+                        // to the matrix-vector kernel
+                        int ne11_mm_min = n_as;
 
-                            float slope;
-                            memcpy(&slope, dst->op_params, sizeof(float));
+                        const int idx = ((int32_t *) dst->op_params)[0];
 
-                            id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
+                        // batch size
+                        GGML_ASSERT(ne01 == ne11);
 
-                            [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
-                            [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
+                        // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
+                        // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
+                        // !!!
+                        // TODO: for now, always use mat-vec kernels until we figure out how to improve the
+                        //       indirect matrix multiplication
+                        // !!!
+                        if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                            ne20 % 32 == 0 && ne20 >= 64 &&
+                            ne11 > ne11_mm_min) {
+
+                            id<MTLComputePipelineState> pipeline = nil;
 
-                            const int64_t n = ggml_nelements(dst);
+                            switch (src2->type) {
+                                case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32    ].pipeline; break;
+                                case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32    ].pipeline; break;
+                                case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32   ].pipeline; break;
+                                case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32   ].pipeline; break;
+                                case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32   ].pipeline; break;
+                                case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32   ].pipeline; break;
+                                case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32].pipeline; break;
+                                case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32 ].pipeline; break;
+                                default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
+                            }
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
-                        } break;
-                    case GGML_OP_DUP:
-                    case GGML_OP_CPY:
-                    case GGML_OP_CONT:
-                        {
-                            GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+                            [encoder setComputePipelineState:pipeline];
+                            [encoder setBuffer:id_src0 offset:offs_src0    atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1    atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst     atIndex:2];
+                            [encoder setBytes:&nb01    length:sizeof(nb01) atIndex:3];
+                            [encoder setBytes:&ne20    length:sizeof(ne20) atIndex:4];
+                            [encoder setBytes:&ne22    length:sizeof(ne22) atIndex:5];
+                            [encoder setBytes:&nb21    length:sizeof(nb21) atIndex:6];
+                            [encoder setBytes:&nb22    length:sizeof(nb22) atIndex:7];
+                            [encoder setBytes:&ne12    length:sizeof(ne12) atIndex:8];
+                            [encoder setBytes:&ne13    length:sizeof(ne13) atIndex:9];
+                            [encoder setBytes:&nb10    length:sizeof(nb10) atIndex:10];
+                            [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
+                            [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
+                            [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
+                            [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
+                            [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
+                            [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
+                            [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
+                            [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
+                            // TODO: how to make this an array? read Metal docs
+                            for (int j = 0; j < 8; ++j) {
+                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
+
+                                size_t offs_src_cur = 0;
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+
+                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
+                            }
+
+                            [encoder setThreadgroupMemoryLength:8192 atIndex:0];
 
-                            int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
+                            [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                        } else {
+                            int nth0 = 32;
+                            int nth1 = 1;
+                            int nrows = 1;
+                            //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12);
 
                             id<MTLComputePipelineState> pipeline = nil;
 
-                            switch (src0t) {
+                            // use custom matrix x vector kernel
+                            switch (src2t) {
                                 case GGML_TYPE_F32:
                                     {
-                                        GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
-
-                                        switch (dstt) {
-                                            case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline;  break;
-                                            case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;  break;
-                                            case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
-                                            case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
-                                            case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
-                                          //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
-                                          //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
-                                            default: GGML_ASSERT(false && "not implemented");
-                                        };
+                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32].pipeline;
                                     } break;
                                 case GGML_TYPE_F16:
                                     {
-                                        switch (dstt) {
-                                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
-                                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
-                                            default: GGML_ASSERT(false && "not implemented");
-                                        };
+                                        GGML_ASSERT(src1t == GGML_TYPE_F32);
+                                        nth0 = 32;
+                                        nth1 = 1;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q4_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_1:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q8_0:
+                                    {
+                                        nth0 = 8;
+                                        nth1 = 8;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q2_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q3_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32].pipeline;
                                     } break;
-                                default: GGML_ASSERT(false && "not implemented");
+                                case GGML_TYPE_Q4_K:
+                                    {
+                                        nth0 = 4; //1;
+                                        nth1 = 8; //32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q5_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_Q6_K:
+                                    {
+                                        nth0 = 2;
+                                        nth1 = 32;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XXS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32].pipeline;
+                                    } break;
+                                case GGML_TYPE_IQ2_XS:
+                                    {
+                                        nth0 = 4;
+                                        nth1 = 16;
+                                        pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32].pipeline;
+                                    } break;
+                                default:
+                                    {
+                                        GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
+                                        GGML_ASSERT(false && "not implemented");
+                                    }
+                            };
+
+                            if (ggml_is_quantized(src2t)) {
+                                GGML_ASSERT(ne20 >= nth0*nth1);
                             }
 
+                            const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+
                             [encoder setComputePipelineState:pipeline];
-                            [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
-                            [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
-                            [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
-                            [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
-                            [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
-                            [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
-                            [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
-                            [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
-                            [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
-                            [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
-                            [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
-                            [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
-                            [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
-                            [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
-                            [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
-                            [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
-                            [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
+                            [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3];
+                            [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4];
+                            [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5];
+                            [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6];
+                            [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7];
+                            [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8];
+                            [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9];
+                            [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10];
+                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11];
+                            [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12];
+                            [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13];
+                            [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14];
+                            [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15];
+                            [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16];
+                            [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:17];
+                            [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18];
+                            [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:19];
+                            [encoder setBytes:&r2   length:sizeof(r2)   atIndex:20];
+                            [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
+                            [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
+                            // TODO: how to make this an array? read Metal docs
+                            for (int j = 0; j < 8; ++j) {
+                                // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                                struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
+
+                                size_t offs_src_cur = 0;
+                                id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
+
+                                [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
+                            }
 
-                            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-                        } break;
-                    default:
-                        {
-                            GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
-                            GGML_ASSERT(false);
+                            if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 ||
+                                src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 ||
+                                src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
+                                const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
+                                [encoder setThreadgroupMemoryLength:mem_size atIndex:0];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_Q4_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_Q3_K) {
+#ifdef GGML_QKK_64
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#else
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+#endif
+                            }
+                            else if (src2t == GGML_TYPE_Q5_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                            else if (src2t == GGML_TYPE_Q6_K) {
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            } else {
+                                const int64_t ny = (_ne1 + nrows - 1)/nrows;
+                                [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
+                            }
+                        }
+                    } break;
+                case GGML_OP_GET_ROWS: // encodes the get-rows kernel; the variant is picked from src0's type
+                    {
+                        id<MTLComputePipelineState> pipeline = nil; // assigned by the switch below
+
+                        switch (src0->type) {
+                            case GGML_TYPE_F32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F32    ].pipeline; break;
+                            case GGML_TYPE_F16:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_F16    ].pipeline; break;
+                            case GGML_TYPE_Q4_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0   ].pipeline; break;
+                            case GGML_TYPE_Q4_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1   ].pipeline; break;
+                            case GGML_TYPE_Q5_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0   ].pipeline; break;
+                            case GGML_TYPE_Q5_1:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1   ].pipeline; break;
+                            case GGML_TYPE_Q8_0:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0   ].pipeline; break;
+                            case GGML_TYPE_Q2_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K   ].pipeline; break;
+                            case GGML_TYPE_Q3_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K   ].pipeline; break;
+                            case GGML_TYPE_Q4_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K   ].pipeline; break;
+                            case GGML_TYPE_Q5_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K   ].pipeline; break;
+                            case GGML_TYPE_Q6_K:    pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K   ].pipeline; break;
+                            case GGML_TYPE_IQ2_XXS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS].pipeline; break;
+                            case GGML_TYPE_IQ2_XS:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS ].pipeline; break;
+                            case GGML_TYPE_I32:     pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32    ].pipeline; break;
+                            default: GGML_ASSERT(false && "not implemented"); // every src0 type not listed above trips this assert
                         }
-                }
 
-#ifndef GGML_METAL_NDEBUG
-                [encoder popDebugGroup];
-#endif
-            }
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0     offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_src1     offset:offs_src1 atIndex:1];
+                        [encoder setBuffer:id_dst      offset:offs_dst  atIndex:2];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:4];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:5];
+                        [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:6];
+                        [encoder setBytes:&nb10 length:sizeof( int64_t) atIndex:7]; // NOTE(review): length is spelled sizeof(int64_t) while the other nb* arguments use sizeof(uint64_t) - presumably the same size; confirm
+                        [encoder setBytes:&nb11 length:sizeof( int64_t) atIndex:8]; // NOTE(review): same sizeof spelling as the line above
+                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:10];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne10, ne11, 1) threadsPerThreadgroup:MTLSizeMake(32, 1, 1)]; // grid of ne10 x ne11 threadgroups, 32 threads each
+                    } break;
+                case GGML_OP_RMS_NORM: // encodes the RMS_NORM kernel with one threadgroup per value of ggml_nrows(src0)
+                    {
+                        GGML_ASSERT(ne00 % 4 == 0); // ne00 must be a multiple of 4
+
+                        float eps;
+                        memcpy(&eps, dst->op_params, sizeof(float)); // eps is the float stored at the start of dst->op_params
+
+                        int nth = 32; // SIMD width
+
+                        while (nth < ne00/4 && nth < 1024) { // double nth from 32 until it reaches ne00/4 or 1024
+                            nth *= 2;
+                        }
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_RMS_NORM].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; // threadgroup memory slot 0 holds 32 floats
+
+                        const int64_t nrows = ggml_nrows(src0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; // nrows threadgroups of nth threads
+                    } break;
+                case GGML_OP_GROUP_NORM: // encodes the GROUP_NORM kernel with one threadgroup per group
+                    {
+                        GGML_ASSERT(ne00 % 4 == 0); // ne00 must be a multiple of 4
+
+                        //float eps;
+                        //memcpy(&eps, dst->op_params, sizeof(float));
+
+                        const float eps = 1e-6f; // TODO: temporarily hardcoded
+
+                        const int32_t n_groups = ((int32_t *) dst->op_params)[0]; // group count is read from op_params slot 0
+
+                        int nth = 32; // SIMD width; stays at 32 because the growth loop below is commented out
+
+                        //while (nth < ne00/4 && nth < 1024) {
+                        //    nth *= 2;
+                        //}
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GROUP_NORM].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0  offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst   offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00     length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01     length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02     length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&nb00     length:sizeof(uint64_t) atIndex:5];
+                        [encoder setBytes:&nb01     length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb02     length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&n_groups length:sizeof( int32_t) atIndex:8];
+                        [encoder setBytes:&eps      length:sizeof(   float) atIndex:9];
+                        [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; // threadgroup memory slot 0 holds 32 floats
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n_groups, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; // n_groups threadgroups of nth threads
+                    } break;
+                case GGML_OP_NORM: // encodes the NORM kernel with one threadgroup per value of ggml_nrows(src0)
+                    {
+                        float eps;
+                        memcpy(&eps, dst->op_params, sizeof(float)); // eps is the float stored at the start of dst->op_params
+
+                        const int nth = MIN(256, ne00); // at most 256 threads, fewer when ne00 is smaller
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NORM].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:3];
+                        [encoder setBytes:&eps     length:sizeof(   float) atIndex:4];
+                        [encoder setThreadgroupMemoryLength:GGML_PAD(nth*sizeof(float), 16) atIndex:0]; // nth floats of threadgroup memory, passed through GGML_PAD(..., 16)
+
+                        const int64_t nrows = ggml_nrows(src0);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; // nrows threadgroups of nth threads
+                    } break;
+                case GGML_OP_ALIBI: // encodes the ALIBI_F32 kernel; m0, m1 and n_heads_log2_floor are computed here and passed as arguments
+                    {
+                        GGML_ASSERT((src0t == GGML_TYPE_F32)); // only F32 src0 is accepted
+
+                        const int nth = MIN(1024, ne00); // at most 1024 threads, fewer when ne00 is smaller
+
+                        //const int n_past = ((int32_t *) dst->op_params)[0];
+                        const int n_head = ((int32_t *) dst->op_params)[1];
+                        float max_bias;
+                        memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); // max_bias is the float stored in op_params slot 2
+
+                        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); // power of two obtained by flooring log2(n_head)
+                        const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); // m0 = 2^(-max_bias / n_heads_log2_floor)
+                        const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); // m1 = 2^(-(max_bias/2) / n_heads_log2_floor)
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof( int64_t) atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(uint64_t) atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&m0   length:sizeof(   float) atIndex:18];
+                        [encoder setBytes:&m1   length:sizeof(   float) atIndex:19];
+                        [encoder setBytes:&n_heads_log2_floor   length:sizeof(int) atIndex:20];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; // grid of ne01 x ne02 x ne03 threadgroups, nth threads each
+                    } break;
+                case GGML_OP_ROPE: // encodes the ROPE kernel (F32 or F16 variant by src0 type); all scalar parameters come from dst->op_params
+                    {
+                        GGML_ASSERT(ne10 == ne02); // src1's first dimension must equal ne02
+
+                        const int nth = MIN(1024, ne00); // at most 1024 threads, fewer when ne00 is smaller
+
+                        const int n_past     = ((int32_t *) dst->op_params)[0];
+                        const int n_dims     = ((int32_t *) dst->op_params)[1];
+                        const int mode       = ((int32_t *) dst->op_params)[2];
+                        // skip 3, n_ctx, used in GLM RoPE, unimplemented in metal
+                        const int n_orig_ctx = ((int32_t *) dst->op_params)[4];
+
+                        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; // filled from op_params slots 5..10 below
+                        memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+                        memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+                        memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+                        memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+                        memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+                        memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+                        id<MTLComputePipelineState> pipeline = nil; // assigned by the switch below
+
+                        switch (src0->type) {
+                            case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F32].pipeline; break;
+                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ROPE_F16].pipeline; break;
+                            default: GGML_ASSERT(false); // any src0 type other than F32/F16 trips this assert
+                        };
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0     offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_src1     offset:offs_src1        atIndex:1];
+                        [encoder setBuffer:id_dst      offset:offs_dst         atIndex:2];
+                        [encoder setBytes:&ne00        length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne01        length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne02        length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&ne03        length:sizeof( int64_t) atIndex:6];
+                        [encoder setBytes:&nb00        length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb01        length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb02        length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&nb03        length:sizeof(uint64_t) atIndex:10];
+                        [encoder setBytes:&ne0         length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne1         length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne2         length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&ne3         length:sizeof( int64_t) atIndex:14];
+                        [encoder setBytes:&nb0         length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb1         length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb2         length:sizeof(uint64_t) atIndex:17];
+                        [encoder setBytes:&nb3         length:sizeof(uint64_t) atIndex:18];
+                        [encoder setBytes:&n_past      length:sizeof(     int) atIndex:19];
+                        [encoder setBytes:&n_dims      length:sizeof(     int) atIndex:20];
+                        [encoder setBytes:&mode        length:sizeof(     int) atIndex:21];
+                        [encoder setBytes:&n_orig_ctx  length:sizeof(     int) atIndex:22];
+                        [encoder setBytes:&freq_base   length:sizeof(   float) atIndex:23];
+                        [encoder setBytes:&freq_scale  length:sizeof(   float) atIndex:24];
+                        [encoder setBytes:&ext_factor  length:sizeof(   float) atIndex:25];
+                        [encoder setBytes:&attn_factor length:sizeof(   float) atIndex:26];
+                        [encoder setBytes:&beta_fast   length:sizeof(   float) atIndex:27];
+                        [encoder setBytes:&beta_slow   length:sizeof(   float) atIndex:28];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; // grid of ne01 x ne02 x ne03 threadgroups, nth threads each
+                    } break;
+                case GGML_OP_IM2COL: // encodes the IM2COL_F16 kernel; geometry is derived from op_params and the shapes of src0, src1 and dst
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F16);
+                        GGML_ASSERT(src1->type == GGML_TYPE_F32);
+                        GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+                        const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+                        const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+                        const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+                        const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+                        const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+                        const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+                        const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1; // op_params slot 6 equal to 1 selects the 2D indexing used below
+
+                        const int32_t N  = src1->ne[is_2D ? 3 : 2];
+                        const int32_t IC = src1->ne[is_2D ? 2 : 1];
+                        const int32_t IH = is_2D ? src1->ne[1] : 1;
+                        const int32_t IW =         src1->ne[0];
+
+                        const int32_t KH = is_2D ? src0->ne[1] : 1;
+                        const int32_t KW =         src0->ne[0];
+
+                        const int32_t OH = is_2D ? dst->ne[2] : 1;
+                        const int32_t OW =         dst->ne[1];
+
+                        const int32_t CHW = IC * KH * KW;
+
+                        const int32_t ofs0 = src1->nb[is_2D ? 3 : 2] / 4; // NOTE(review): presumably converts a byte stride to float elements (src1 is asserted F32 above) - confirm
+                        const int32_t ofs1 = src1->nb[is_2D ? 2 : 1] / 4; // NOTE(review): same conversion as ofs0
+
+                        id<MTLComputePipelineState> pipeline = nil; // assigned by the switch below
+
+                        switch (src0->type) {
+                            case GGML_TYPE_F32: GGML_ASSERT(false && "not implemented"); break; // F32 src0 is also rejected by the F16 assert at the top of this case
+                            case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_IM2COL_F16].pipeline; break;
+                            default: GGML_ASSERT(false);
+                        };
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src1 offset:offs_src1        atIndex:0]; // src1 (not src0) is bound at index 0; src0 contributes only KH/KW above
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ofs0    length:sizeof( int32_t) atIndex:2];
+                        [encoder setBytes:&ofs1    length:sizeof( int32_t) atIndex:3];
+                        [encoder setBytes:&IW      length:sizeof( int32_t) atIndex:4];
+                        [encoder setBytes:&IH      length:sizeof( int32_t) atIndex:5];
+                        [encoder setBytes:&CHW     length:sizeof( int32_t) atIndex:6];
+                        [encoder setBytes:&s0      length:sizeof( int32_t) atIndex:7];
+                        [encoder setBytes:&s1      length:sizeof( int32_t) atIndex:8];
+                        [encoder setBytes:&p0      length:sizeof( int32_t) atIndex:9];
+                        [encoder setBytes:&p1      length:sizeof( int32_t) atIndex:10];
+                        [encoder setBytes:&d0      length:sizeof( int32_t) atIndex:11];
+                        [encoder setBytes:&d1      length:sizeof( int32_t) atIndex:12];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(IC, OH, OW) threadsPerThreadgroup:MTLSizeMake(N, KH, KW)]; // NOTE(review): threadgroup size N*KH*KW is not clamped in this block - confirm it fits the pipeline limit
+                    } break;
+                case GGML_OP_UPSCALE: // encodes the UPSCALE_F32 kernel; the factor sf is read from op_params slot 0
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32); // only F32 src0 is accepted
+
+                        const int sf = dst->op_params[0];
+
+                        const id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_UPSCALE_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+                        [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];
+
+                        const int nth = MIN((int) pipeline.maxTotalThreadsPerThreadgroup, ne0); // capped by the pipeline's maxTotalThreadsPerThreadgroup and by ne0
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; // grid of ne1 x ne2 x ne3 threadgroups, nth threads each
+                    } break;
+                case GGML_OP_PAD: // encodes the PAD_F32 kernel, passing the full src0 and dst shape/stride sets
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32); // only F32 src0 is accepted
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_PAD_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                        [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:2];
+                        [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:3];
+                        [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:4];
+                        [encoder setBytes:&ne03 length:sizeof(ne03) atIndex:5];
+                        [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:6];
+                        [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:7];
+                        [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:8];
+                        [encoder setBytes:&nb03 length:sizeof(nb03) atIndex:9];
+                        [encoder setBytes:&ne0  length:sizeof(ne0)  atIndex:10];
+                        [encoder setBytes:&ne1  length:sizeof(ne1)  atIndex:11];
+                        [encoder setBytes:&ne2  length:sizeof(ne2)  atIndex:12];
+                        [encoder setBytes:&ne3  length:sizeof(ne3)  atIndex:13];
+                        [encoder setBytes:&nb0  length:sizeof(nb0)  atIndex:14];
+                        [encoder setBytes:&nb1  length:sizeof(nb1)  atIndex:15];
+                        [encoder setBytes:&nb2  length:sizeof(nb2)  atIndex:16];
+                        [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
+
+                        const int nth = MIN(1024, ne0); // at most 1024 threads, fewer when ne0 is smaller
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; // grid of ne1 x ne2 x ne3 threadgroups, nth threads each
+                    } break;
+                case GGML_OP_ARGSORT:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+                        GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+                        const int nrows = ggml_nrows(src0);
+
+                        enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (order) {
+                            case GGML_SORT_ASC:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC].pipeline;  break;
+                            case GGML_SORT_DESC: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC].pipeline; break;
+                            default: GGML_ASSERT(false);
+                        };
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)];
+                    } break;
+                case GGML_OP_LEAKY_RELU:
+                    {
+                        GGML_ASSERT(src0->type == GGML_TYPE_F32);
+
+                        float slope;
+                        memcpy(&slope, dst->op_params, sizeof(float));
+
+                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32].pipeline;
+
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst    atIndex:1];
+                        [encoder setBytes:&slope length:sizeof(slope) atIndex:2];
+
+                        const int64_t n = ggml_nelements(dst);
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                    } break;
+                case GGML_OP_DUP:
+                case GGML_OP_CPY:
+                case GGML_OP_CONT:
+                    {
+                        GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0);
+
+                        int nth = MIN(1024, ne00/ggml_blck_size(src0->type));
+
+                        id<MTLComputePipelineState> pipeline = nil;
+
+                        switch (src0t) {
+                            case GGML_TYPE_F32:
+                                {
+                                    GGML_ASSERT(ne0 % ggml_blck_size(dst->type) == 0);
+
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F16].pipeline;  break;
+                                        case GGML_TYPE_F32:  pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline;  break;
+                                        case GGML_TYPE_Q8_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0].pipeline; break;
+                                        case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0].pipeline; break;
+                                        case GGML_TYPE_Q4_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1].pipeline; break;
+                                      //case GGML_TYPE_Q5_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0].pipeline; break;
+                                      //case GGML_TYPE_Q5_1: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1].pipeline; break;
+                                        default: GGML_ASSERT(false && "not implemented");
+                                    };
+                                } break;
+                            case GGML_TYPE_F16:
+                                {
+                                    switch (dstt) {
+                                        case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline; break;
+                                        case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F32].pipeline; break;
+                                        default: GGML_ASSERT(false && "not implemented");
+                                    };
+                                } break;
+                            default: GGML_ASSERT(false && "not implemented");
+                        }
 
-            if (encoder != nil) {
-                [encoder endEncoding];
-                encoder = nil;
+                        [encoder setComputePipelineState:pipeline];
+                        [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
+                        [encoder setBuffer:id_dst  offset:offs_dst         atIndex:1];
+                        [encoder setBytes:&ne00    length:sizeof( int64_t) atIndex:2];
+                        [encoder setBytes:&ne01    length:sizeof( int64_t) atIndex:3];
+                        [encoder setBytes:&ne02    length:sizeof( int64_t) atIndex:4];
+                        [encoder setBytes:&ne03    length:sizeof( int64_t) atIndex:5];
+                        [encoder setBytes:&nb00    length:sizeof(uint64_t) atIndex:6];
+                        [encoder setBytes:&nb01    length:sizeof(uint64_t) atIndex:7];
+                        [encoder setBytes:&nb02    length:sizeof(uint64_t) atIndex:8];
+                        [encoder setBytes:&nb03    length:sizeof(uint64_t) atIndex:9];
+                        [encoder setBytes:&ne0     length:sizeof( int64_t) atIndex:10];
+                        [encoder setBytes:&ne1     length:sizeof( int64_t) atIndex:11];
+                        [encoder setBytes:&ne2     length:sizeof( int64_t) atIndex:12];
+                        [encoder setBytes:&ne3     length:sizeof( int64_t) atIndex:13];
+                        [encoder setBytes:&nb0     length:sizeof(uint64_t) atIndex:14];
+                        [encoder setBytes:&nb1     length:sizeof(uint64_t) atIndex:15];
+                        [encoder setBytes:&nb2     length:sizeof(uint64_t) atIndex:16];
+                        [encoder setBytes:&nb3     length:sizeof(uint64_t) atIndex:17];
+
+                        [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
+                    } break;
+                default:
+                    {
+                        GGML_METAL_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op));
+                        GGML_ASSERT(false);
+                    }
             }
 
-            [command_buffer commit];
-        });
-    }
+#ifndef GGML_METAL_NDEBUG
+            [encoder popDebugGroup];
+#endif
+        }
+
+        if (encoder != nil) {
+            [encoder endEncoding];
+            encoder = nil;
+        }
 
-    // wait for all threads to finish
-    dispatch_barrier_sync(ctx->d_queue, ^{});
+        [command_buffer commit];
+    });
 
     // check status of command buffers
     // needed to detect if the device ran out-of-memory for example (#1881)

From 862f5e41ab1fdf12d6f59455aad3f5dd8258f805 Mon Sep 17 00:00:00 2001
From: Neuman Vong <neuman.vong@gmail.com>
Date: Wed, 17 Jan 2024 00:47:34 +1100
Subject: [PATCH 07/25] android : introduce starter project example (#4926)

* Introduce starter project for Android

Based on examples/llama.swiftui.

* Add github workflow

* Set NDK version

* Only build arm64-v8a in CI

* Sync bench code

* Rename CI prop to skip-armeabi-v7a

* Remove unused tests
---
 .github/workflows/build.yml                   |  25 ++
 examples/llama.android/.gitignore             |  33 ++
 examples/llama.android/README.md              |   0
 examples/llama.android/app/.gitignore         |   1 +
 examples/llama.android/app/build.gradle.kts   |  91 ++++
 examples/llama.android/app/proguard-rules.pro |  21 +
 .../app/src/main/AndroidManifest.xml          |  30 ++
 .../app/src/main/cpp/CMakeLists.txt           |  50 +++
 .../app/src/main/cpp/llama-android.cpp        | 394 ++++++++++++++++++
 .../java/com/example/llama/Downloadable.kt    | 119 ++++++
 .../src/main/java/com/example/llama/Llm.kt    | 172 ++++++++
 .../java/com/example/llama/MainActivity.kt    | 154 +++++++
 .../java/com/example/llama/MainViewModel.kt   | 104 +++++
 .../java/com/example/llama/ui/theme/Color.kt  |  11 +
 .../java/com/example/llama/ui/theme/Theme.kt  |  70 ++++
 .../java/com/example/llama/ui/theme/Type.kt   |  34 ++
 .../res/drawable/ic_launcher_background.xml   | 170 ++++++++
 .../res/drawable/ic_launcher_foreground.xml   |  30 ++
 .../main/res/mipmap-anydpi/ic_launcher.xml    |   6 +
 .../res/mipmap-anydpi/ic_launcher_round.xml   |   6 +
 .../src/main/res/mipmap-hdpi/ic_launcher.webp | Bin 0 -> 1404 bytes
 .../res/mipmap-hdpi/ic_launcher_round.webp    | Bin 0 -> 2898 bytes
 .../src/main/res/mipmap-mdpi/ic_launcher.webp | Bin 0 -> 982 bytes
 .../res/mipmap-mdpi/ic_launcher_round.webp    | Bin 0 -> 1772 bytes
 .../main/res/mipmap-xhdpi/ic_launcher.webp    | Bin 0 -> 1900 bytes
 .../res/mipmap-xhdpi/ic_launcher_round.webp   | Bin 0 -> 3918 bytes
 .../main/res/mipmap-xxhdpi/ic_launcher.webp   | Bin 0 -> 2884 bytes
 .../res/mipmap-xxhdpi/ic_launcher_round.webp  | Bin 0 -> 5914 bytes
 .../main/res/mipmap-xxxhdpi/ic_launcher.webp  | Bin 0 -> 3844 bytes
 .../res/mipmap-xxxhdpi/ic_launcher_round.webp | Bin 0 -> 7778 bytes
 .../app/src/main/res/values/colors.xml        |  10 +
 .../app/src/main/res/values/strings.xml       |   3 +
 .../app/src/main/res/values/themes.xml        |   5 +
 .../app/src/main/res/xml/backup_rules.xml     |  13 +
 .../main/res/xml/data_extraction_rules.xml    |  19 +
 examples/llama.android/build.gradle.kts       |   5 +
 examples/llama.android/gradle.properties      |  23 +
 .../gradle/wrapper/gradle-wrapper.jar         | Bin 0 -> 59203 bytes
 .../gradle/wrapper/gradle-wrapper.properties  |   6 +
 examples/llama.android/gradlew                | 185 ++++++++
 examples/llama.android/settings.gradle.kts    |  17 +
 41 files changed, 1807 insertions(+)
 create mode 100644 examples/llama.android/.gitignore
 create mode 100644 examples/llama.android/README.md
 create mode 100644 examples/llama.android/app/.gitignore
 create mode 100644 examples/llama.android/app/build.gradle.kts
 create mode 100644 examples/llama.android/app/proguard-rules.pro
 create mode 100644 examples/llama.android/app/src/main/AndroidManifest.xml
 create mode 100644 examples/llama.android/app/src/main/cpp/CMakeLists.txt
 create mode 100644 examples/llama.android/app/src/main/cpp/llama-android.cpp
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
 create mode 100644 examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
 create mode 100644 examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
 create mode 100644 examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
 create mode 100644 examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
 create mode 100644 examples/llama.android/app/src/main/res/values/colors.xml
 create mode 100644 examples/llama.android/app/src/main/res/values/strings.xml
 create mode 100644 examples/llama.android/app/src/main/res/values/themes.xml
 create mode 100644 examples/llama.android/app/src/main/res/xml/backup_rules.xml
 create mode 100644 examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
 create mode 100644 examples/llama.android/build.gradle.kts
 create mode 100644 examples/llama.android/gradle.properties
 create mode 100644 examples/llama.android/gradle/wrapper/gradle-wrapper.jar
 create mode 100644 examples/llama.android/gradle/wrapper/gradle-wrapper.properties
 create mode 100755 examples/llama.android/gradlew
 create mode 100644 examples/llama.android/settings.gradle.kts

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0a28a11112251..367df07a7e497 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -515,6 +515,31 @@ jobs:
       - name: Build Xcode project
         run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
 
+  android-build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v3
+
+      - name: Set up JDK
+        uses: actions/setup-java@v3
+        with:
+          java-version: 17
+          distribution: zulu
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@v3
+        with:
+          log-accepted-android-sdk-licenses: false
+
+      - name: Build
+        run: |
+          cd examples/llama.android
+
+          # Skip armeabi-v7a for now (https://github.com/llvm/llvm-project/issues/65820).
+          ./gradlew build --no-daemon -Pskip-armeabi-v7a
+
 #  freeBSD-latest:
 #    runs-on: macos-12
 #    steps:
diff --git a/examples/llama.android/.gitignore b/examples/llama.android/.gitignore
new file mode 100644
index 0000000000000..347e252ef10e9
--- /dev/null
+++ b/examples/llama.android/.gitignore
@@ -0,0 +1,33 @@
+# Gradle files
+.gradle/
+build/
+
+# Local configuration file (sdk path, etc)
+local.properties
+
+# Log/OS Files
+*.log
+
+# Android Studio generated files and folders
+captures/
+.externalNativeBuild/
+.cxx/
+*.apk
+output.json
+
+# IntelliJ
+*.iml
+.idea/
+misc.xml
+deploymentTargetDropDown.xml
+render.experimental.xml
+
+# Keystore files
+*.jks
+*.keystore
+
+# Google Services (e.g. APIs or Firebase)
+google-services.json
+
+# Android Profiling
+*.hprof
diff --git a/examples/llama.android/README.md b/examples/llama.android/README.md
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/examples/llama.android/app/.gitignore b/examples/llama.android/app/.gitignore
new file mode 100644
index 0000000000000..796b96d1c4023
--- /dev/null
+++ b/examples/llama.android/app/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts
new file mode 100644
index 0000000000000..7815a802593ba
--- /dev/null
+++ b/examples/llama.android/app/build.gradle.kts
@@ -0,0 +1,91 @@
+plugins {
+    id("com.android.application")
+    id("org.jetbrains.kotlin.android")
+}
+
+android {
+    namespace = "com.example.llama"
+    compileSdk = 34
+
+    ndkVersion = "26.1.10909125"
+
+    defaultConfig {
+        applicationId = "com.example.llama"
+        minSdk = 33
+        targetSdk = 34
+        versionCode = 1
+        versionName = "1.0"
+
+        testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+        vectorDrawables {
+            useSupportLibrary = true
+        }
+        ndk {
+            // Workaround for https://github.com/llvm/llvm-project/issues/65820
+            // affecting armeabi-v7a. Skip armeabi-v7a when invoked with
+            // -Pskip-armeabi-v7a (e.g., ./gradlew build -Pskip-armeabi-v7a).
+            if (project.hasProperty("skip-armeabi-v7a")) {
+                abiFilters += listOf("arm64-v8a", "x86_64", "x86")
+            }
+        }
+        externalNativeBuild {
+            cmake {
+                cppFlags += listOf()
+                arguments += listOf()
+            }
+        }
+    }
+
+    buildTypes {
+        release {
+            isMinifyEnabled = false
+            proguardFiles(
+                getDefaultProguardFile("proguard-android-optimize.txt"),
+                "proguard-rules.pro"
+            )
+        }
+    }
+    compileOptions {
+        sourceCompatibility = JavaVersion.VERSION_1_8
+        targetCompatibility = JavaVersion.VERSION_1_8
+    }
+    kotlinOptions {
+        jvmTarget = "1.8"
+    }
+    buildFeatures {
+        compose = true
+    }
+    composeOptions {
+        kotlinCompilerExtensionVersion = "1.5.1"
+    }
+    packaging {
+        resources {
+            excludes += "/META-INF/{AL2.0,LGPL2.1}"
+        }
+    }
+    externalNativeBuild {
+        cmake {
+            path = file("src/main/cpp/CMakeLists.txt")
+            version = "3.22.1"
+        }
+    }
+}
+
+dependencies {
+
+    implementation("androidx.core:core-ktx:1.12.0")
+    implementation("androidx.lifecycle:lifecycle-runtime-ktx:2.6.2")
+    implementation("androidx.activity:activity-compose:1.8.2")
+    implementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    implementation("androidx.compose.ui:ui")
+    implementation("androidx.compose.ui:ui-graphics")
+    implementation("androidx.compose.ui:ui-tooling-preview")
+    implementation("androidx.compose.material3:material3")
+    testImplementation("junit:junit:4.13.2")
+    androidTestImplementation("androidx.test.ext:junit:1.1.5")
+    androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+    androidTestImplementation(platform("androidx.compose:compose-bom:2023.08.00"))
+    androidTestImplementation("androidx.compose.ui:ui-test-junit4")
+    debugImplementation("androidx.compose.ui:ui-tooling")
+    debugImplementation("androidx.compose.ui:ui-test-manifest")
+}
diff --git a/examples/llama.android/app/proguard-rules.pro b/examples/llama.android/app/proguard-rules.pro
new file mode 100644
index 0000000000000..f1b424510da51
--- /dev/null
+++ b/examples/llama.android/app/proguard-rules.pro
@@ -0,0 +1,21 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
diff --git a/examples/llama.android/app/src/main/AndroidManifest.xml b/examples/llama.android/app/src/main/AndroidManifest.xml
new file mode 100644
index 0000000000000..41a358a299154
--- /dev/null
+++ b/examples/llama.android/app/src/main/AndroidManifest.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:tools="http://schemas.android.com/tools">
+
+    <uses-permission android:name="android.permission.INTERNET" />
+
+    <application
+        android:allowBackup="true"
+        android:dataExtractionRules="@xml/data_extraction_rules"
+        android:fullBackupContent="@xml/backup_rules"
+        android:icon="@mipmap/ic_launcher"
+        android:label="@string/app_name"
+        android:roundIcon="@mipmap/ic_launcher_round"
+        android:supportsRtl="true"
+        android:theme="@style/Theme.LlamaAndroid"
+        >
+
+        <activity
+            android:name=".MainActivity"
+            android:exported="true"
+            android:theme="@style/Theme.LlamaAndroid">
+            <intent-filter>
+                <action android:name="android.intent.action.MAIN" />
+
+                <category android:name="android.intent.category.LAUNCHER" />
+            </intent-filter>
+        </activity>
+    </application>
+
+</manifest>
diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
new file mode 100644
index 0000000000000..85139329aa082
--- /dev/null
+++ b/examples/llama.android/app/src/main/cpp/CMakeLists.txt
@@ -0,0 +1,50 @@
+
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${PROJECT_NAME}.
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+        llama
+        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+        GIT_TAG        master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+    # List C/C++ source files with relative paths to this CMakeLists.txt.
+    llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+    # List the libraries to link to the target library
+    llama
+    common
+    android
+    log)
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp
new file mode 100644
index 0000000000000..d5e705dce6ca0
--- /dev/null
+++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -0,0 +1,394 @@
+#include <android/log.h>
+#include <jni.h>
+#include <iomanip>
+#include <math.h>
+#include <string>
+#include <unistd.h>
+#include "llama.h"
+#include "common/common.h"
+
+// Write C++ code here.
+//
+// Do not forget to dynamically load the C++ library into your application.
+//
+// For instance,
+//
+// In MainActivity.java:
+//    static {
+//       System.loadLibrary("llama-android");
+//    }
+//
+// Or, in MainActivity.kt:
+//    companion object {
+//      init {
+//         System.loadLibrary("llama-android")
+//      }
+//    }
+
+#define TAG "llama-android.cpp" // logcat tag attached to every message logged from this file
+#define LOGi(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__) // printf-style, info priority
+#define LOGe(...) __android_log_print(ANDROID_LOG_ERROR, TAG, __VA_ARGS__) // printf-style, error priority
+// NOTE(review): presumably cached JNI handles for an int-holder class; they are assigned elsewhere - confirm.
+jclass la_int_var;
+jmethodID la_int_var_value;
+jmethodID la_int_var_inc;
+
+// Forwards one llama/ggml log message to logcat. The text is printed through "%s" so a '%' in it is never parsed as a format.
+static void log_callback(ggml_log_level level, const char * fmt, void * /* data: unused */) {
+    const int prio = level == GGML_LOG_LEVEL_ERROR ? ANDROID_LOG_ERROR : level == GGML_LOG_LEVEL_INFO ? ANDROID_LOG_INFO
+                   : level == GGML_LOG_LEVEL_WARN  ? ANDROID_LOG_WARN  : ANDROID_LOG_DEFAULT;
+    __android_log_print(prio, TAG, "%s", fmt);
+}
+
+// JNI: Llm.load_model(filename). Returns the llama_model pointer as a jlong; on failure returns 0 with a Java exception pending.
+extern "C" JNIEXPORT jlong JNICALL
+Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+    const char * path_to_model = env->GetStringUTFChars(filename, nullptr);
+    if (!path_to_model) { // NULL means the JNI call failed; never pass it on to "%s" or to the loader
+        LOGe("load_model(): could not read the model path");
+        if (!env->ExceptionCheck()) env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
+        return 0;
+    }
+    LOGi("Loading model from %s", path_to_model);
+    llama_model * model = llama_load_model_from_file(path_to_model, llama_model_default_params());
+    env->ReleaseStringUTFChars(filename, path_to_model); // the UTF-8 copy is not used after this point
+    if (!model) {
+        LOGe("load_model() failed");
+        env->ThrowNew(env->FindClass("java/lang/IllegalStateException"), "load_model() failed");
+        return 0;
+    }
+    return reinterpret_cast<jlong>(model);
+}
+
+extern "C" JNIEXPORT void JNICALL Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+    // The jlong is reinterpreted as a llama_model pointer and handed to llama_free_model().
+    auto * const handle = reinterpret_cast<llama_model *>(model);
+    llama_free_model(handle);
+}
+
+// JNI: Llm.new_context(model). Returns the llama_context pointer as a jlong, or throws and returns 0.
+extern "C" JNIEXPORT jlong JNICALL
+Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+    llama_model * const model = reinterpret_cast<llama_model *>(jmodel);
+    if (model == nullptr) {
+        LOGe("new_context(): model cannot be null");
+        env->ThrowNew(env->FindClass("java/lang/IllegalArgumentException"), "Model cannot be null");
+        return 0;
+    }
+
+    // Leave two online cores unused, then clamp the thread count to the range [1, 8].
+    const int n_online  = (int) sysconf(_SC_NPROCESSORS_ONLN);
+    const int n_threads = std::max(1, std::min(8, n_online - 2));
+    LOGi("Using %d threads", n_threads);
+
+    // Start from the library defaults and override only the fields fixed by this app.
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_threads_batch = n_threads;
+    ctx_params.n_threads       = n_threads;
+    ctx_params.n_ctx           = 2048;
+    ctx_params.seed            = 1234;
+
+    llama_context * const context = llama_new_context_with_model(model, ctx_params);
+    if (context == nullptr) {
+        LOGe("llama_new_context_with_model() returned null)");
+        jclass ex = env->FindClass("java/lang/IllegalStateException");
+        env->ThrowNew(ex, "llama_new_context_with_model() returned null)");
+        return 0;
+    }
+    return reinterpret_cast<jlong>(context);
+}
+
+extern "C" JNIEXPORT void JNICALL Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+    // The jlong is reinterpreted as a llama_context pointer and handed to llama_free().
+    auto * const handle = reinterpret_cast<llama_context *>(context);
+    llama_free(handle);
+}
+
+// JNI: Llm.backend_free() - forwards to llama_backend_free(); takes no arguments and returns nothing.
+extern "C" JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_backend_1free(JNIEnv * /*env*/, jobject /*thiz*/) {
+    llama_backend_free();
+}
+
+// JNI: Llm.log_to_android() - registers log_callback with llama_log_set(), passing no user data.
+extern "C" JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_log_1to_1android(JNIEnv * /*env*/, jobject /*thiz*/) {
+    llama_log_set(log_callback, nullptr);
+}
+
+extern "C" // JNI: Llm.bench_model(...) - times prompt processing and text generation, returns the result as a Markdown table
+JNIEXPORT jstring JNICALL
+Java_com_example_llama_Llm_bench_1model(
+        JNIEnv *env,
+        jobject,
+        jlong context_pointer, // llama_context * carried as a jlong
+        jlong model_pointer, // llama_model * carried as a jlong
+        jlong batch_pointer, // llama_batch * carried as a jlong
+        jint pp, // number of entries in the prompt-processing batch
+        jint tg, // number of decode steps in the text-generation phase
+        jint pl, // number of entries added to the batch for each text-generation step
+        jint nr // number of times the whole benchmark is repeated
+        ) {
+    auto pp_avg = 0.0;
+    auto tg_avg = 0.0;
+    auto pp_std = 0.0;
+    auto tg_std = 0.0;
+    // pp_avg/tg_avg collect the sum of per-run speeds and pp_std/tg_std the sum of squares; both are reduced after the loop.
+    const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto model = reinterpret_cast<llama_model *>(model_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+
+    const int n_ctx = llama_n_ctx(context);
+
+    LOGi("n_ctx = %d", n_ctx);
+
+    int i, j;
+    int nri;
+    for (nri = 0; nri < nr; nri++) {
+        LOGi("Benchmark prompt processing (pp)");
+
+        llama_batch_clear(*batch);
+        // Queue pp entries, then set batch->logits for the last entry.
+        const int n_tokens = pp;
+        for (i = 0; i < n_tokens; i++) {
+            llama_batch_add(*batch, 0, i, { 0 }, false);
+        }
+        // NOTE(review): if pp is 0 and n_tokens is 0 after the clear, this writes logits[-1] - confirm callers pass pp > 0.
+        batch->logits[batch->n_tokens - 1] = true;
+        llama_kv_cache_clear(context);
+
+        const auto t_pp_start = ggml_time_us();
+        if (llama_decode(context, *batch) != 0) {
+            LOGi("llama_decode() failed during prompt processing");
+        }
+        const auto t_pp_end = ggml_time_us();
+
+        // bench text generation
+
+        LOGi("Benchmark text generation (tg)");
+
+        llama_kv_cache_clear(context);
+        const auto t_tg_start = ggml_time_us();
+        for (i = 0; i < tg; i++) {
+            // Every step rebuilds the batch with pl entries and decodes it once.
+            llama_batch_clear(*batch);
+            for (j = 0; j < pl; j++) {
+                llama_batch_add(*batch, 0, i, { j }, true);
+            }
+
+            LOGi("llama_decode() text generation: %d", i);
+            if (llama_decode(context, *batch) != 0) {
+                LOGi("llama_decode() failed during text generation");
+            }
+        }
+
+        const auto t_tg_end = ggml_time_us();
+
+        llama_kv_cache_clear(context);
+        // Elapsed times in seconds: the ggml_time_us() differences are divided by 1e6.
+        const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
+        const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
+
+        const auto speed_pp = double(pp) / t_pp;
+        const auto speed_tg = double(pl * tg) / t_tg;
+        // Accumulate the per-run speeds and their squares.
+        pp_avg += speed_pp;
+        tg_avg += speed_tg;
+
+        pp_std += speed_pp * speed_pp;
+        tg_std += speed_tg * speed_tg;
+
+        LOGi("pp %f t/s, tg %f t/s", speed_pp, speed_tg);
+    }
+    // Reduce the sums to means. NOTE(review): nr == 0 makes these 0.0/0.0 - confirm callers pass nr >= 1.
+    pp_avg /= double(nr);
+    tg_avg /= double(nr);
+    // Sample standard deviation from the sum of squares and the mean; a single run reports 0.
+    if (nr > 1) {
+        pp_std = sqrt(pp_std / double(nr - 1) - pp_avg * pp_avg * double(nr) / double(nr - 1));
+        tg_std = sqrt(tg_std / double(nr - 1) - tg_avg * tg_avg * double(nr) / double(nr - 1));
+    } else {
+        pp_std = 0;
+        tg_std = 0;
+    }
+
+    char model_desc[128];
+    llama_model_desc(model, model_desc, sizeof(model_desc));
+
+    const auto model_size     = double(llama_model_size(model)) / 1024.0 / 1024.0 / 1024.0;
+    const auto model_n_params = double(llama_model_n_params(model)) / 1e9;
+
+    const auto backend    = "(Android)"; // TODO: What should this be?
+    // Two-row Markdown table: one row for prompt processing, one for text generation.
+    std::stringstream result;
+    result << std::setprecision(2);
+    result << "| model | size | params | backend | test | t/s |\n";
+    result << "| --- | --- | --- | --- | --- | --- |\n";
+    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | pp " << pp << " | " << pp_avg << " ± " << pp_std << " |\n";
+    result << "| " << model_desc << " | " << model_size << "GiB | " << model_n_params << "B | " << backend << " | tg " << tg << " | " << tg_avg << " ± " << tg_std << " |\n";
+
+    return env->NewStringUTF(result.str().c_str());
+}
+
+// Frees a batch made by new_batch: its buffers via llama_batch_free, then the `new`-allocated struct itself.
+extern "C" JNIEXPORT void JNICALL Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+    llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
+    delete reinterpret_cast<llama_batch *>(batch_pointer);
+}
+
+/// Allocates a llama_batch on the heap and returns its address as an opaque handle.
+///
+/// Same layout as llama.cpp's llama_batch_init, but the struct itself is heap-allocated
+/// so that the handle can be held on the Kotlin side as a Long.
+/// Release the handle with free_batch.
+///
+/// @param n_tokens  number of token slots every per-token array is sized for
+/// @param embd      when non-zero, a buffer of n_tokens * embd floats is allocated
+///                  in place of the token-id buffer
+/// @param n_seq_max number of sequence ids reserved for each token slot
+/// @return the llama_batch pointer reinterpreted as a jlong
+extern "C" JNIEXPORT jlong JNICALL
+Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+    // Value-initialization zeroes every field: counters are 0, pointers are nullptr.
+    auto * const result = new llama_batch {};
+
+    // Exactly one of the two input buffers is allocated; the other stays nullptr.
+    if (embd == 0) {
+        result->token = static_cast<llama_token *>(malloc(sizeof(llama_token) * n_tokens));
+    } else {
+        result->embd = static_cast<float *>(malloc(sizeof(float) * n_tokens * embd));
+    }
+
+    result->pos      = static_cast<llama_pos *>(malloc(sizeof(llama_pos) * n_tokens));
+    result->n_seq_id = static_cast<int32_t *>(malloc(sizeof(int32_t) * n_tokens));
+    result->logits   = static_cast<int8_t *>(malloc(sizeof(int8_t) * n_tokens));
+
+    // One row of n_seq_max sequence ids per token slot.
+    result->seq_id = static_cast<llama_seq_id **>(malloc(sizeof(llama_seq_id *) * n_tokens));
+    for (int slot = 0; slot < n_tokens; ++slot) {
+        result->seq_id[slot] = static_cast<llama_seq_id *>(malloc(sizeof(llama_seq_id) * n_seq_max));
+    }
+
+    return reinterpret_cast<jlong>(result);
+}
+
+// JNI bridge: hands the Kotlin-side `numa` flag straight to llama_backend_init().
+extern "C" JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_backend_1init(JNIEnv * /*env*/, jobject /*thiz*/, jboolean numa) {
+    llama_backend_init(numa);
+}
+
+// JNI bridge: returns the text of llama_print_system_info() to Kotlin as a Java string.
+extern "C" JNIEXPORT jstring JNICALL
+Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject /*thiz*/) {
+    return env->NewStringUTF(llama_print_system_info());
+}
+
+/// Tokenizes the prompt, fills the batch with it and runs the first decode pass.
+/// Logits are requested only for the last prompt token.
+/// @param n_len generation length limit; used here only for the KV-cache size check
+/// @return number of tokens placed in the batch, or 0 if the prompt chars are unavailable
+extern "C" JNIEXPORT jint JNICALL
+Java_com_example_llama_Llm_completion_1init(
+        JNIEnv *env, jobject, jlong context_pointer, jlong batch_pointer, jstring jtext, jint n_len) {
+    const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+
+    // GetStringUTFChars returns nullptr (with an exception pending) when it cannot allocate.
+    const auto text = env->GetStringUTFChars(jtext, 0);
+    if (text == nullptr) {
+        LOGe("GetStringUTFChars() failed");
+        return 0;
+    }
+    const auto tokens_list = llama_tokenize(context, text, 1);
+    env->ReleaseStringUTFChars(jtext, text); // not read again below, so release it right away
+
+    const auto n_ctx = llama_n_ctx(context);
+    const auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
+
+    // n_kv_req has the unsigned type of tokens_list.size(), not int; cast it to match %d.
+    LOGi("n_len = %d, n_ctx = %d, n_kv_req = %d", n_len, n_ctx, static_cast<int>(n_kv_req));
+    if (n_kv_req > n_ctx) {
+        LOGe("error: n_kv_req > n_ctx, the required KV cache size is not big enough");
+    }
+
+    for (auto id : tokens_list) {
+        LOGi("%s", llama_token_to_piece(context, id).c_str());
+    }
+
+    // evaluate the initial prompt
+    llama_batch_clear(*batch);
+    for (int i = 0; i < static_cast<int>(tokens_list.size()); i++) {
+        llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
+    }
+
+    // llama_decode will output logits only for the last token of the prompt
+    if (batch->n_tokens > 0) {
+        batch->logits[batch->n_tokens - 1] = true;
+    }
+    if (llama_decode(context, *batch) != 0) {
+        LOGe("llama_decode() failed");
+    }
+
+    return batch->n_tokens;
+}
+
+/// Samples the next token greedily from the logits of the previous decode, then decodes it.
+///
+/// @param n_len        an empty string is returned once the counter equals this value
+/// @param intvar_ncur  Kotlin IntVar with the current position; incremented for every token
+/// @return the new token's text, or "" on end-of-sequence or when n_len is reached
+extern "C" JNIEXPORT jstring JNICALL
+Java_com_example_llama_Llm_completion_1loop(
+        JNIEnv * env, jobject, jlong context_pointer, jlong batch_pointer, jint n_len, jobject intvar_ncur) {
+    const auto context = reinterpret_cast<llama_context *>(context_pointer);
+    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
+    const auto model = llama_get_model(context);
+
+    // GetObjectClass returns a local reference, which is only valid until this native call
+    // returns; promote it to a global reference before caching it across calls.
+    if (!la_int_var) la_int_var = static_cast<jclass>(env->NewGlobalRef(env->GetObjectClass(intvar_ncur)));
+    if (!la_int_var_value) la_int_var_value = env->GetMethodID(la_int_var, "getValue", "()I");
+    if (!la_int_var_inc) la_int_var_inc = env->GetMethodID(la_int_var, "inc", "()V");
+
+    auto n_vocab = llama_n_vocab(model);
+    auto logits = llama_get_logits_ith(context, batch->n_tokens - 1);
+
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+    // sample the most likely token
+    const auto new_token_id = llama_sample_token_greedy(context, &candidates_p);
+
+    const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
+    if (new_token_id == llama_token_eos(model) || n_cur == n_len) {
+        return env->NewStringUTF("");
+    }
+
+    auto new_token_chars = llama_token_to_piece(context, new_token_id);
+    LOGi("new_token_chars: `%s`", new_token_chars.c_str());
+    auto new_token = env->NewStringUTF(new_token_chars.c_str());
+
+    llama_batch_clear(*batch);
+    llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
+
+    env->CallVoidMethod(intvar_ncur, la_int_var_inc);
+
+    // llama_decode reports failure through a non-zero status, not a null value.
+    if (llama_decode(context, *batch) != 0) {
+        LOGe("llama_decode() failed");
+    }
+
+    return new_token;
+}
+
+// JNI bridge: clears the KV cache of the llama_context behind the `context` handle.
+extern "C" JNIEXPORT void JNICALL
+Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv * /*env*/, jobject /*thiz*/, jlong context) {
+    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
new file mode 100644
index 0000000000000..78c231ae55d8c
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Downloadable.kt
@@ -0,0 +1,119 @@
+package com.example.llama
+
+import android.app.DownloadManager
+import android.net.Uri
+import android.util.Log
+import androidx.compose.material3.Button
+import androidx.compose.material3.Text
+import androidx.compose.runtime.Composable
+import androidx.compose.runtime.getValue
+import androidx.compose.runtime.mutableDoubleStateOf
+import androidx.compose.runtime.mutableStateOf
+import androidx.compose.runtime.remember
+import androidx.compose.runtime.rememberCoroutineScope
+import androidx.compose.runtime.setValue
+import androidx.core.database.getLongOrNull
+import androidx.core.net.toUri
+import kotlinx.coroutines.delay
+import kotlinx.coroutines.launch
+import java.io.File
+
+data class Downloadable(val name: String, val source: Uri, val destination: File) { // one model: label, URL, local file
+    companion object {
+        @JvmStatic
+        private val tag: String? = this::class.qualifiedName
+        // Download lifecycle of one item, as tracked by Button() below.
+        sealed interface State
+        data object Ready: State // nothing downloaded yet (also used when the download row has disappeared)
+        data class Downloading(val id: Long): State // id returned by DownloadManager.enqueue()
+        data class Downloaded(val downloadable: Downloadable): State // bytes reached the total, or the file already existed
+        data class Error(val message: String): State // DownloadManager could not be queried
+        // Renders one button that downloads `item`, shows progress, and loads the model once it is downloaded.
+        @JvmStatic
+        @Composable
+        fun Button(viewModel: MainViewModel, dm: DownloadManager, item: Downloadable) {
+            var status: State by remember {
+                mutableStateOf(
+                    if (item.destination.exists()) Downloaded(item)
+                    else Ready
+                )
+            }
+            var progress by remember { mutableDoubleStateOf(0.0) } // sofar / total of the running download
+
+            val coroutineScope = rememberCoroutineScope()
+            // Polls DownloadManager once per second and returns the state that should follow Downloading.
+            suspend fun waitForDownload(result: Downloading, item: Downloadable): State {
+                while (true) {
+                    val cursor = dm.query(DownloadManager.Query().setFilterById(result.id))
+
+                    if (cursor == null) {
+                        Log.e(tag, "dm.query() returned null")
+                        return Error("dm.query() returned null")
+                    }
+
+                    if (!cursor.moveToFirst() || cursor.count < 1) {
+                        cursor.close()
+                        Log.i(tag, "cursor.moveToFirst() returned false or cursor.count < 1, download canceled?")
+                        return Ready
+                    }
+
+                    val pix = cursor.getColumnIndex(DownloadManager.COLUMN_BYTES_DOWNLOADED_SO_FAR)
+                    val tix = cursor.getColumnIndex(DownloadManager.COLUMN_TOTAL_SIZE_BYTES)
+                    val sofar = cursor.getLongOrNull(pix) ?: 0
+                    val total = cursor.getLongOrNull(tix) ?: 1 // missing values fall back to 0 of 1, i.e. 0% progress
+                    cursor.close()
+                    // NOTE(review): only byte counts are compared; the status column is never read, so a failed download presumably keeps this loop polling — confirm.
+                    if (sofar == total) {
+                        return Downloaded(item)
+                    }
+
+                    progress = (sofar * 1.0) / total
+
+                    delay(1000L)
+                }
+            }
+            // Click handler: Downloaded -> load the model, Downloading -> start polling, otherwise enqueue a download.
+            fun onClick() {
+                when (val s = status) {
+                    is Downloaded -> {
+                        viewModel.load(item.destination.path)
+                    }
+
+                    is Downloading -> {
+                        coroutineScope.launch {
+                            status = waitForDownload(s, item)
+                        }
+                    }
+
+                    else -> {
+                        item.destination.delete()
+
+                        val request = DownloadManager.Request(item.source).apply {
+                            setTitle("Downloading model")
+                            setDescription("Downloading model: ${item.name}")
+                            setAllowedNetworkTypes(DownloadManager.Request.NETWORK_WIFI)
+                            setDestinationUri(item.destination.toUri())
+                        }
+
+                        viewModel.log("Saving ${item.name} to ${item.destination.path}")
+                        Log.i(tag, "Saving ${item.name} to ${item.destination.path}")
+
+                        val id = dm.enqueue(request)
+                        status = Downloading(id)
+                        onClick() // re-enter: status is now Downloading, so this launches the polling coroutine
+                    }
+                }
+            }
+
+            Button(onClick = { onClick() }, enabled = status !is Downloading) { // disabled while a download is in flight
+                when (status) {
+                    is Downloading -> Text(text = "Downloading ${(progress * 100).toInt()}%")
+                    is Downloaded -> Text("Load ${item.name}")
+                    is Ready -> Text("Download ${item.name}")
+                    is Error -> Text("Download ${item.name}")
+                }
+            }
+        }
+
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
new file mode 100644
index 0000000000000..5f32703724a49
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
@@ -0,0 +1,172 @@
+package com.example.llama
+
+import android.util.Log
+import kotlinx.coroutines.CoroutineDispatcher
+import kotlinx.coroutines.asCoroutineDispatcher
+import kotlinx.coroutines.flow.Flow
+import kotlinx.coroutines.flow.flow
+import kotlinx.coroutines.flow.flowOn
+import kotlinx.coroutines.withContext
+import java.util.concurrent.Executors
+import kotlin.concurrent.thread
+
+class Llm { // Kotlin facade over the llama-android native library; all native work runs on one dedicated thread
+    private val tag: String? = this::class.simpleName
+    // Model state; every access in this class happens in code dispatched to runLoop.
+    private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
+    // Single-thread dispatcher for all native calls; its thread loads the library and initializes the backend.
+    private val runLoop: CoroutineDispatcher = Executors.newSingleThreadExecutor {
+        thread(start = false, name = "Llm-RunLoop") {
+            Log.d(tag, "Dedicated thread for native code: ${Thread.currentThread().name}")
+            // No-op if called more than once.
+            System.loadLibrary("llama-android")
+            // Set llama log handler to Android
+            log_to_android()
+            backend_init(false)
+            Log.d(tag, system_info())
+            it.run()
+        }.apply {
+            uncaughtExceptionHandler = Thread.UncaughtExceptionHandler { _, exception: Throwable ->
+                Log.e(tag, "Unhandled exception", exception)
+            }
+        }
+    }.asCoroutineDispatcher()
+    // Generation limit passed to completion_init/completion_loop as nLen.
+    private val nlen: Int = 64
+    // JNI entry points; the names pair with the Java_com_example_llama_Llm_* native functions.
+    private external fun log_to_android()
+    private external fun load_model(filename: String): Long
+    private external fun free_model(model: Long)
+    private external fun new_context(model: Long): Long
+    private external fun free_context(context: Long)
+    private external fun backend_init(numa: Boolean)
+    private external fun backend_free()
+    private external fun free_batch(batch: Long)
+    private external fun new_batch(nTokens: Int, embd: Int, nSeqMax: Int): Long
+    private external fun bench_model(
+        context: Long,
+        model: Long,
+        batch: Long,
+        pp: Int,
+        tg: Int,
+        pl: Int,
+        nr: Int
+    ): String
+    private external fun system_info(): String
+    private external fun completion_init(
+        context: Long,
+        batch: Long,
+        text: String,
+        nLen: Int
+    ): Int
+    private external fun completion_loop(
+        context: Long,
+        batch: Long,
+        nLen: Int,
+        ncur: IntVar
+    ): String
+    private external fun kv_cache_clear(context: Long)
+    // Runs the native benchmark on the loaded model; throws IllegalStateException when none is loaded.
+    suspend fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1): String {
+        return withContext(runLoop) {
+            when (val state = threadLocalState.get()) {
+                is State.Loaded -> {
+                    Log.d(tag, "bench(): $state")
+                    bench_model(state.context, state.model, state.batch, pp, tg, pl, nr)
+                }
+
+                else -> throw IllegalStateException("No model loaded")
+            }
+        }
+    }
+    // Loads the model at pathToModel and creates its context and batch. Throws IllegalStateException when a
+    // model is already loaded or a native step fails; handles created before the failure are freed first.
+    suspend fun load(pathToModel: String) {
+        withContext(runLoop) {
+            when (threadLocalState.get()) {
+                is State.Idle -> {
+                    val model = load_model(pathToModel)
+                    if (model == 0L)  throw IllegalStateException("load_model() failed")
+
+                    val context = new_context(model)
+                    if (context == 0L) {
+                        free_model(model) // do not leak the model when no context could be created
+                        throw IllegalStateException("new_context() failed")
+                    }
+
+                    val batch = new_batch(512, 0, 1)
+                    if (batch == 0L) {
+                        free_context(context)
+                        free_model(model)
+                        throw IllegalStateException("new_batch() failed")
+                    }
+
+                    Log.i(tag, "Loaded model $pathToModel")
+                    threadLocalState.set(State.Loaded(model, context, batch))
+                }
+                else -> throw IllegalStateException("Model already loaded")
+            }
+        }
+    }
+    // Streams the completion of `message` piece by piece; emits nothing when no model is loaded.
+    fun send(message: String): Flow<String> = flow {
+        when (val state = threadLocalState.get()) {
+            is State.Loaded -> {
+                val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
+                while (ncur.value <= nlen) {
+                    val str = completion_loop(state.context, state.batch, nlen, ncur)
+                    if (str.isEmpty()) { // native side returns "" on end-of-sequence or when nlen is reached
+                        break
+                    }
+                    emit(str)
+                }
+                kv_cache_clear(state.context)
+            }
+            else -> {}
+        }
+    }.flowOn(runLoop)
+
+    /**
+     * Unloads the model and frees resources.
+     *
+     * This is a no-op if there's no model loaded.
+     */
+    suspend fun unload() {
+        withContext(runLoop) {
+            when (val state = threadLocalState.get()) {
+                is State.Loaded -> {
+                    free_context(state.context)
+                    free_model(state.model)
+                    free_batch(state.batch)
+
+                    threadLocalState.set(State.Idle)
+                }
+                else -> {}
+            }
+        }
+    }
+
+    companion object {
+        private class IntVar(value: Int) { // position counter; native code calls getValue() and inc() through JNI
+            @Volatile
+            var value: Int = value
+                private set
+
+            fun inc() {
+                synchronized(this) {
+                    value += 1
+                }
+            }
+        }
+
+        private sealed interface State {
+            data object Idle: State
+            data class Loaded(val model: Long, val context: Long, val batch: Long): State // native handles
+        }
+
+        // Enforce only one instance of Llm.
+        private val _instance: Llm = Llm()
+
+        fun instance(): Llm = _instance
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
new file mode 100644
index 0000000000000..9da04f7d3c32e
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainActivity.kt
@@ -0,0 +1,154 @@
+package com.example.llama
+
+import android.app.ActivityManager
+import android.app.DownloadManager
+import android.content.ClipData
+import android.content.ClipboardManager
+import android.net.Uri
+import android.os.Bundle
+import android.os.StrictMode
+import android.os.StrictMode.VmPolicy
+import android.text.format.Formatter
+import androidx.activity.ComponentActivity
+import androidx.activity.compose.setContent
+import androidx.activity.viewModels
+import androidx.compose.foundation.layout.Box
+import androidx.compose.foundation.layout.Column
+import androidx.compose.foundation.layout.Row
+import androidx.compose.foundation.layout.fillMaxSize
+import androidx.compose.foundation.layout.padding
+import androidx.compose.foundation.lazy.LazyColumn
+import androidx.compose.foundation.lazy.items
+import androidx.compose.foundation.lazy.rememberLazyListState
+import androidx.compose.material3.Button
+import androidx.compose.material3.LocalContentColor
+import androidx.compose.material3.MaterialTheme
+import androidx.compose.material3.OutlinedTextField
+import androidx.compose.material3.Surface
+import androidx.compose.material3.Text
+import androidx.compose.runtime.Composable
+import androidx.compose.ui.Modifier
+import androidx.compose.ui.unit.dp
+import androidx.core.content.getSystemService
+import com.example.llama.ui.theme.LlamaAndroidTheme
+import java.io.File
+
+class MainActivity( // managers may be passed in; a null one is looked up via getSystemService on first use
+    activityManager: ActivityManager? = null,
+    downloadManager: DownloadManager? = null,
+    clipboardManager: ClipboardManager? = null,
+): ComponentActivity() {
+    private val tag: String? = this::class.simpleName
+
+    private val activityManager by lazy { activityManager ?: getSystemService<ActivityManager>()!! }
+    private val downloadManager by lazy { downloadManager ?: getSystemService<DownloadManager>()!! }
+    private val clipboardManager by lazy { clipboardManager ?: getSystemService<ClipboardManager>()!! }
+
+    private val viewModel: MainViewModel by viewModels()
+
+    // Get a MemoryInfo object for the device's current memory status.
+    private fun availableMemory(): ActivityManager.MemoryInfo {
+        return ActivityManager.MemoryInfo().also { memoryInfo ->
+            activityManager.getMemoryInfo(memoryInfo)
+        }
+    }
+    // Enables leaked-closable detection, logs memory and the download directory, then shows the UI.
+    override fun onCreate(savedInstanceState: Bundle?) {
+        super.onCreate(savedInstanceState)
+
+        StrictMode.setVmPolicy(
+            VmPolicy.Builder(StrictMode.getVmPolicy())
+                .detectLeakedClosableObjects()
+                .build()
+        )
+
+        val free = Formatter.formatFileSize(this, availableMemory().availMem)
+        val total = Formatter.formatFileSize(this, availableMemory().totalMem)
+
+        viewModel.log("Current memory: $free / $total")
+        viewModel.log("Downloads directory: ${getExternalFilesDir(null)}")
+
+        val extFilesDir = getExternalFilesDir(null)
+        // Models offered for download; each is saved under getExternalFilesDir(null).
+        val models = listOf(
+            Downloadable(
+                "Phi-2 7B (Q4_0, 1.6 GiB)", // NOTE(review): label says 7B — confirm the intended parameter count
+                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
+                File(extFilesDir, "phi-2-q4_0.gguf"),
+            ),
+            Downloadable(
+                "TinyLlama 1.1B (f16, 2.2 GiB)",
+                Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
+                File(extFilesDir, "tinyllama-1.1-f16.gguf"),
+            ),
+            Downloadable(
+                "Phi 2 DPO (Q3_K_M, 1.48 GiB)",
+                Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
+                File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
+            ),
+        )
+
+        setContent {
+            LlamaAndroidTheme {
+                // A surface container using the 'background' color from the theme
+                Surface(
+                    modifier = Modifier.fillMaxSize(),
+                    color = MaterialTheme.colorScheme.background
+                ) {
+                    MainCompose(
+                        viewModel,
+                        clipboardManager,
+                        downloadManager,
+                        models,
+                    )
+                }
+
+            }
+        }
+    }
+}
+
+@Composable
+fun MainCompose( // root screen: console, message field, action buttons, one download button per model
+    viewModel: MainViewModel,
+    clipboard: ClipboardManager,
+    dm: DownloadManager,
+    models: List<Downloadable>
+) {
+    Column {
+        val scrollState = rememberLazyListState()
+        // Console: one Text per entry of viewModel.messages.
+        Box(modifier = Modifier.weight(1f)) {
+            LazyColumn(state = scrollState) {
+                items(viewModel.messages) {
+                    Text(
+                        it,
+                        style = MaterialTheme.typography.bodyLarge.copy(color = LocalContentColor.current),
+                        modifier = Modifier.padding(16.dp)
+                    )
+                }
+            }
+        }
+        OutlinedTextField( // input field bound to viewModel.message
+            value = viewModel.message,
+            onValueChange = { viewModel.updateMessage(it) },
+            label = { Text("Message") },
+        )
+        Row {
+            Button({ viewModel.send() }) { Text("Send") }
+            Button({ viewModel.bench(8, 4, 1) }) { Text("Bench") } // small run first; the view model follows up with the full benchmark
+            Button({ viewModel.clear() }) { Text("Clear") }
+            Button({
+                viewModel.messages.joinToString("\n").let { // copies the whole console, one entry per line
+                    clipboard.setPrimaryClip(ClipData.newPlainText("", it))
+                }
+            }) { Text("Copy") }
+        }
+        // One download/load button per model.
+        Column {
+            for (model in models) {
+                Downloadable.Button(viewModel, dm, model)
+            }
+        }
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
new file mode 100644
index 0000000000000..be95e22218332
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -0,0 +1,104 @@
+package com.example.llama
+
+import android.util.Log
+import androidx.compose.runtime.getValue
+import androidx.compose.runtime.mutableStateOf
+import androidx.compose.runtime.setValue
+import androidx.lifecycle.ViewModel
+import androidx.lifecycle.viewModelScope
+import kotlinx.coroutines.flow.catch
+import kotlinx.coroutines.launch
+
+class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
+    companion object {
+        @JvmStatic
+        private val NanosPerSecond = 1_000_000_000.0
+    }
+    // View model behind MainCompose: holds the console lines and the input text, and forwards actions to Llm.
+    private val tag: String? = this::class.simpleName
+    // Lines shown in the on-screen console.
+    var messages by mutableStateOf(listOf("Initializing..."))
+        private set
+    // Current content of the message input field.
+    var message by mutableStateOf("")
+        private set
+    // NOTE(review): viewModelScope may already be cancelled when onCleared() runs — confirm unload() actually executes.
+    override fun onCleared() {
+        super.onCleared()
+
+        viewModelScope.launch {
+            try {
+                llm.unload()
+            } catch (exc: IllegalStateException) {
+                messages += exc.message ?: exc.toString() // message may be null; the handler must not throw itself
+            }
+        }
+    }
+    // Echoes the typed message to the console, then streams the model's reply into the last console line.
+    fun send() {
+        val text = message
+        message = ""
+
+        // Add to messages console.
+        messages += text
+        messages += ""
+
+        viewModelScope.launch {
+            llm.send(text)
+                .catch {
+                    Log.e(tag, "send() failed", it)
+                    messages += it.message ?: it.toString()
+                }
+                .collect { messages = messages.dropLast(1) + (messages.last() + it) }
+        }
+    }
+    // Runs a warm-up benchmark with the given sizes; if it takes at most 5 s, runs the full one (512, 128, 1, 3).
+    fun bench(pp: Int, tg: Int, pl: Int, nr: Int = 1) {
+        viewModelScope.launch {
+            try {
+                val start = System.nanoTime()
+                val warmupResult = llm.bench(pp, tg, pl, nr)
+                val end = System.nanoTime()
+
+                messages += warmupResult
+
+                val warmup = (end - start).toDouble() / NanosPerSecond
+                messages += "Warm up time: $warmup seconds, please wait..."
+
+                if (warmup > 5.0) {
+                    messages += "Warm up took too long, aborting benchmark"
+                    return@launch
+                }
+
+                messages += llm.bench(512, 128, 1, 3)
+            } catch (exc: IllegalStateException) {
+                Log.e(tag, "bench() failed", exc)
+                messages += exc.message ?: exc.toString()
+            }
+        }
+    }
+    // Loads the model file at pathToModel and reports the outcome in the console.
+    fun load(pathToModel: String) {
+        viewModelScope.launch {
+            try {
+                llm.load(pathToModel)
+                messages += "Loaded $pathToModel"
+            } catch (exc: IllegalStateException) {
+                Log.e(tag, "load() failed", exc)
+                messages += exc.message ?: exc.toString()
+            }
+        }
+    }
+
+    fun updateMessage(newMessage: String) {
+        message = newMessage
+    }
+
+    fun clear() {
+        messages = listOf()
+    }
+    // Appends one line to the console.
+    fun log(message: String) {
+        messages += message
+    }
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
new file mode 100644
index 0000000000000..40c30e8d97077
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Color.kt
@@ -0,0 +1,11 @@
+package com.example.llama.ui.theme
+
+import androidx.compose.ui.graphics.Color
+
+val Purple80 = Color(0xFFD0BCFF) // the three *80 colors are the primary/secondary/tertiary of DarkColorScheme (Theme.kt)
+val PurpleGrey80 = Color(0xFFCCC2DC)
+val Pink80 = Color(0xFFEFB8C8)
+
+val Purple40 = Color(0xFF6650a4) // the three *40 colors are the primary/secondary/tertiary of LightColorScheme (Theme.kt)
+val PurpleGrey40 = Color(0xFF625b71)
+val Pink40 = Color(0xFF7D5260)
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
new file mode 100644
index 0000000000000..e742220a8d719
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Theme.kt
@@ -0,0 +1,70 @@
+package com.example.llama.ui.theme
+
+import android.app.Activity
+import android.os.Build
+import androidx.compose.foundation.isSystemInDarkTheme
+import androidx.compose.material3.MaterialTheme
+import androidx.compose.material3.darkColorScheme
+import androidx.compose.material3.dynamicDarkColorScheme
+import androidx.compose.material3.dynamicLightColorScheme
+import androidx.compose.material3.lightColorScheme
+import androidx.compose.runtime.Composable
+import androidx.compose.runtime.SideEffect
+import androidx.compose.ui.graphics.toArgb
+import androidx.compose.ui.platform.LocalContext
+import androidx.compose.ui.platform.LocalView
+import androidx.core.view.WindowCompat
+
+private val DarkColorScheme = darkColorScheme( // static dark palette; LlamaAndroidTheme uses it when no dynamic scheme is chosen
+    primary = Purple80,
+    secondary = PurpleGrey80,
+    tertiary = Pink80
+)
+
+private val LightColorScheme = lightColorScheme( // static light palette; LlamaAndroidTheme uses it when no dynamic scheme is chosen
+    primary = Purple40,
+    secondary = PurpleGrey40,
+    tertiary = Pink40
+
+    /* Other default colors to override
+    background = Color(0xFFFFFBFE),
+    surface = Color(0xFFFFFBFE),
+    onPrimary = Color.White,
+    onSecondary = Color.White,
+    onTertiary = Color.White,
+    onBackground = Color(0xFF1C1B1F),
+    onSurface = Color(0xFF1C1B1F),
+    */
+)
+
+@Composable
+fun LlamaAndroidTheme( // app theme wrapper: picks a color scheme, tints the status bar, applies MaterialTheme to `content`
+    darkTheme: Boolean = isSystemInDarkTheme(),
+    // Dynamic color is available on Android 12+
+    dynamicColor: Boolean = true,
+    content: @Composable () -> Unit
+) {
+    val colorScheme = when {
+        dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> { // dynamic scheme wins when requested and supported
+            val context = LocalContext.current
+            if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
+        }
+        // Otherwise fall back to the static schemes defined in this file.
+        darkTheme -> DarkColorScheme
+        else -> LightColorScheme
+    }
+    val view = LocalView.current
+    if (!view.isInEditMode) { // status bar is left untouched while the view is in edit mode
+        SideEffect {
+            val window = (view.context as Activity).window // assumes the view's context is an Activity
+            window.statusBarColor = colorScheme.primary.toArgb()
+            WindowCompat.getInsetsController(window, view).isAppearanceLightStatusBars = darkTheme
+        }
+    }
+
+    MaterialTheme(
+        colorScheme = colorScheme,
+        typography = Typography,
+        content = content
+    )
+}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
new file mode 100644
index 0000000000000..0b87946ca3ab1
--- /dev/null
+++ b/examples/llama.android/app/src/main/java/com/example/llama/ui/theme/Type.kt
@@ -0,0 +1,34 @@
+package com.example.llama.ui.theme
+
+import androidx.compose.material3.Typography
+import androidx.compose.ui.text.TextStyle
+import androidx.compose.ui.text.font.FontFamily
+import androidx.compose.ui.text.font.FontWeight
+import androidx.compose.ui.unit.sp
+
+// Set of Material typography styles to start with
+val Typography = Typography( // handed to MaterialTheme by LlamaAndroidTheme
+    bodyLarge = TextStyle( // the only style overridden here; MainCompose uses it for console entries
+        fontFamily = FontFamily.Default,
+        fontWeight = FontWeight.Normal,
+        fontSize = 16.sp,
+        lineHeight = 24.sp,
+        letterSpacing = 0.5.sp
+    )
+    /* Other default text styles to override
+    titleLarge = TextStyle(
+        fontFamily = FontFamily.Default,
+        fontWeight = FontWeight.Normal,
+        fontSize = 22.sp,
+        lineHeight = 28.sp,
+        letterSpacing = 0.sp
+    ),
+    labelSmall = TextStyle(
+        fontFamily = FontFamily.Default,
+        fontWeight = FontWeight.Medium,
+        fontSize = 11.sp,
+        lineHeight = 16.sp,
+        letterSpacing = 0.5.sp
+    )
+    */
+)
diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
new file mode 100644
index 0000000000000..07d5da9cbf141
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_background.xml
@@ -0,0 +1,170 @@
+<?xml version="1.0" encoding="utf-8"?>
+<vector xmlns:android="http://schemas.android.com/apk/res/android"
+    android:width="108dp"
+    android:height="108dp"
+    android:viewportWidth="108"
+    android:viewportHeight="108">
+    <path
+        android:fillColor="#3DDC84"
+        android:pathData="M0,0h108v108h-108z" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M9,0L9,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,0L19,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M29,0L29,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M39,0L39,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M49,0L49,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M59,0L59,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M69,0L69,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M79,0L79,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M89,0L89,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M99,0L99,108"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,9L108,9"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,19L108,19"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,29L108,29"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,39L108,39"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,49L108,49"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,59L108,59"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,69L108,69"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,79L108,79"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,89L108,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M0,99L108,99"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,29L89,29"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,39L89,39"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,49L89,49"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,59L89,59"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,69L89,69"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M19,79L89,79"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M29,19L29,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M39,19L39,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M49,19L49,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M59,19L59,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M69,19L69,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+    <path
+        android:fillColor="#00000000"
+        android:pathData="M79,19L79,89"
+        android:strokeWidth="0.8"
+        android:strokeColor="#33FFFFFF" />
+</vector>
diff --git a/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
new file mode 100644
index 0000000000000..7706ab9e6d407
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/drawable/ic_launcher_foreground.xml
@@ -0,0 +1,30 @@
+<vector xmlns:android="http://schemas.android.com/apk/res/android"
+    xmlns:aapt="http://schemas.android.com/aapt"
+    android:width="108dp"
+    android:height="108dp"
+    android:viewportWidth="108"
+    android:viewportHeight="108">
+    <path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
+        <aapt:attr name="android:fillColor">
+            <gradient
+                android:endX="85.84757"
+                android:endY="92.4963"
+                android:startX="42.9492"
+                android:startY="49.59793"
+                android:type="linear">
+                <item
+                    android:color="#44000000"
+                    android:offset="0.0" />
+                <item
+                    android:color="#00000000"
+                    android:offset="1.0" />
+            </gradient>
+        </aapt:attr>
+    </path>
+    <path
+        android:fillColor="#FFFFFF"
+        android:fillType="nonZero"
+        android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
+        android:strokeWidth="1"
+        android:strokeColor="#00000000" />
+</vector>
diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
new file mode 100644
index 0000000000000..b3e26b4c60c27
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
+    <background android:drawable="@drawable/ic_launcher_background" />
+    <foreground android:drawable="@drawable/ic_launcher_foreground" />
+    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
+</adaptive-icon>
diff --git a/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
new file mode 100644
index 0000000000000..b3e26b4c60c27
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/mipmap-anydpi/ic_launcher_round.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
+    <background android:drawable="@drawable/ic_launcher_background" />
+    <foreground android:drawable="@drawable/ic_launcher_foreground" />
+    <monochrome android:drawable="@drawable/ic_launcher_foreground" />
+</adaptive-icon>
diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..c209e78ecd372343283f4157dcfd918ec5165bb3
GIT binary patch
literal 1404
zcmV-?1%vuhNk&F=1pok7MM6+kP&il$0000G0000-002h-06|PpNX!5L00Dqw+t%{r
zzW2vH!KF=w&cMnnN@{whkTw+#mAh0SV?YL=)3MimFYCWp#fpdtz~8$hD5VPuQgtcN
zXl<@<#Cm<R)d19?=)E<{+g@mp0C)CAX%7ksNnpX=jPlJEkqD9%o*fC(U7iySOYHHS
zCLH@bXPyI|^Z)Mc^PG7Oc+NfBJO`d7p5;U`M53Wbd!w|M(MUoNRc2m{@^!wFKzL?&
zx_RAc-^9Azxo%DmXW^9GV0~;n_G9&dym)|QrMEx!y_F=oDunn;1y)cvAc6z{0MHiz
zodGIH07w8nF%*bGq9Gv`YHnm80|d1T$uW=u4vGV%tX#>e5f5yr2h%@8TWh?)bSK`O
z^Z@d={gn7J{iyxL_y_%J|L>ep{dUxUP8a{byupH&!UNR*OutO~0{*T4q5R6@ApLF!
z5{w?Z150gC7#>(VHFJZ-^6O@PYp{t!<r*4*+?g*sysFgiN}}d!-T(>jH(_Z*nzTK4
zkc{fLE4Q3|mA2`CWQ3{8;gxGizgM!zccbdQoOLZc8hThi-IhN90RFT|zlxh3Ty&VG
z?Fe{#<RA5Hg_mG&BZS>9RrRnxzsu|Lg2ddugg7k%>0JeD+{XZ7>Z~{=|M+sh1MF7~
zz>To~`~LVQe1nNoR-gEzkpe{Ak^7{{ZBk2i_<+`Bq<^GB!RYG+z)h;Y3+<{zlMUYd
zrd*W4w&jZ0%kBuDZ1EW&KLpyR7r2=}fF2%0VwHM4pUs}ZI2egi#DRMYZPek*^H9YK
zay4Iy3WXFG(F14xYsoDA|KXgGc5%2DhmQ1gFCkrgHBm!lXG<W<SKpuS2eaDGkL-J-
z!+&UV_N0e<DV*|igwqdC*l{lj0T&&`uq=ycU@f9x0i8jzLa(y*X(YV?PCtUw;ks}p
z4zFn7N(-OG^(9ot1ZhYISOWe9?+l%f6v41n+j<OM$_uJgNP?ZJy}hEMMH=;WiG4I`
zs(bIWwSD<LJnAN(E(Xjr4k(^UYF37F_3f{;E%%FEa&I3I0GbdH@{pD5$m+1PN5CW)
zyZ&*9o#8wstdx@^rci;0B4BP2+H4Y<KbJI5L(bXG(k@`Kp=d>8I5h*uf{rn48Z!_@
z4Bk6TJAB2CKYqPjiX&mWoW>OPFGd$wqro<oln8GL@_6LPC)kg1!M)Y!|NCn7b*0sG
zEN=&c2xMM<>a($ne7EUK;#3V<N-jQ7j($tREa0F&-HzYCQtR#fTCZMRN*ZSm;T7a+
zuxa$}zDL8R9wGYkHb$+gJoiM@z+u{u7a_VUBwtd)bPzHxH}C`W=^2PsBr`s7taBMG
zm#Ss=-o`)W8%%x%>YkXaew%Kh^3OrMht<?zOY6P}#rBhn_hrWY$_P`{#CBR9w?+E6
zt7r#NN-tjxFY{9q75P<|L<ZJBHwn4FhG(&i>jYN?XEoY`tRPQsAkH-DSL^QqyN0>^
zmC>{#F14jz4GeW{pJoRpLFa_*GI{?T93^rX7SPQgT@LbLqpNA}<@2wH;q493)G=1Y
z#-sCiRNX~qf3KgiFzB3I>4Z%AfS(3$`-aMIBU+6?gbgDb!)L~A)je+;fR0jWLL-Fu
z4)<UE%?IC0Du41FrE~F_qc8nOq>P{c7{B4Hp91&%??2$v9iRSFnuckHUm}or9seH6
z>%NbT+5*@L5(I9<vs%8>j@06@(!<!eaZcF<_le1MVaYMg=gRy*f2#IaBH-mJIpy+L
z=Gsbhd6=3>{ZI?U0=pKn8uwIg&L{JV14+8s2hnvbRrU|hZCd}IJu7*;;ECgO%8_*W
Kmw_-CKmY()leWbG

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-hdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..b2dfe3d1ba5cf3ee31b3ecc1ced89044a1f3b7a9
GIT binary patch
literal 2898
zcmV-Y3$650Nk&FW3jhFDMM6+kP&il$0000G0000-002h-06|PpNWB9900E$G+qN-D
z+81ABX7q?;bwx%xBg?kcwr$(C-Tex-ZCkHUw(Y9#+`E5-zuONG5fgw~E2WDng@Bc@
z24xy+R1n%~6xI#u9vJ8zREI)sb<&Il(016}Z~V1n^PU3-_H17A*Bf^o)&{_uBv}Py
zulRfeE8g(g6<I*pq!<m%e0eKLCnC;y@4a&(;z@3Oc_yGuA}p}r2a+O=6+01<KP&)j
z?X*V!2PDO_%3er*&=0L^6alyFqZiK_dhy(U3lP;LLerOI%$mpKS51g&5Mj)6#*PVe
zF_(`;R5goZ_A_QeW9~l|wn`DsB!!6;@*G4}iEuP2Ot6s0BdUVMnEezcTDX4#Y(*N4
z%PCB_a77DrWnVI8;$wcJDzUhkF$0WwCu~^;eS7IbaNIi-ro8tUGsu`9t8y&n(KArb
zV_-{Zd`}5Q__NX_45pDj6i?2BDQ58^g~1BnfGwhN=w`Zb9Jh2q7g$_Q$ABGgfGsfU
zQ%Xp}ueAZ7(7KK;B*zUoD8S$_dIs%z0xV#07bPs=!^#3izY*Sh+CVAuM|l6Flv1c)
zL>HFhk_?o_;0@tz?1I+l+Y#Q*;RVC?(ud`_cU-~n|AX-b`JHrOIqn(-t&rOg-o`#C
zh0LPxmbOAEb;zHTu!R3LDh1Q<R(Kya7{I0;4Dacb1&lp`!Jlm{pmgtAx{w^#57P>O
zZTf-|lJNUxi-PpcbRjw3n~n-pG;$+dIF6eqM5+L();B2O2tQ~|p{PlpNcvDbd1l%c
zLtXn%lu(3!<myn;X3ipg7@oW+6O}@J$7hVgi1~F#J<2qhIXme>aNK!V#+HNn_D3lp
z2%l+hK-nsj|Bi9;V*WIcQRTt5j9<byX)%{hZi!H7I(x!SO0tBzPRXWGd1Ke*j*=vy
zyQaHQRY5iP-Q*Z2C#JituSKDnx+Q<*4#r7|x$~NQt44KoTmQ*R>0A<=<I>am+cc`J
zTYIN|PsYAhJ|=&h*4wI4ebv-C=Be#u>}%m;a{IGmJDU`0snWS&$9zdrT(z8#{OZ_Y
zxwJx!ZClUi%YJjD6Xz@OP8{ieyJB=tn?>zaI-4JN;rr`JQbb%y5h2O-?_V@7pG_+y
z(lqAsqYr!NyVb0C^|uclHaeecG)Sz;WV?rtoqOdAAN{j%?Uo%owya(F&qps@Id|Of
zo@~Y-(YmfB+chv^%*3g4k3R0WqvuYUIA+8^SGJ{2Bl$X&X&v02>+0$4?di(34{pt*
zG=f#yMs@Y|b&=HyH3k4yP&goF2LJ#tBLJNNDo6lG06r}ghC-pC4Q*=x3;|+W04zte
zAl>l4kzUBQFYF(E`KJy?ZXd1tnfbH+Z~SMmA21KokJNs#eqcXWKUIC>{TuoKe^vhF
z);H)o`t9j~`$h1D`#bxe@E`oE`cM9w(@)5Bp8BNukIwM>wZHfd0S;5bcXA*5KT3bj
zc&_~`&{z7u{Et!Z_k78H75gXf4g8<_ul!H$eVspPeU3j&&Au=2R*Z<QVlXG0%J7Qu
z`uQlm{Q{cWVD7XACdR6KeMUk-Q7>p#M9$9s;fqwgzfiX=E_?BwVcfx3tG9Q-+<5fw
z%Hs64<N1NYeh_oukcz%rOcU>z)@Q*%s3_Xd5>S4d<X%6~`O&m@p+WTqnB(reB<gqb
zpaA~={ur+R)J6BZ_}KqfN1AF`u0i5>g$s>@rN^ixeVj*tqu3ZV)biDcFf&l?lGwsa
zWj3rvK}?43c{IruV2L`hUU0t^MemAn3U~x3$4mFDxj=Byowu^Q+#wKRPrWywLjIAp
z9*n}<YIhnms>eQ9-gZmnd9Y0WHtwi2sn6n~?i#n9VN1B*074_VbZZ=WrpkMYr{RsI
ztM_8X1)J*DZejxkjOTRJ&a*lrvMKBQURNP#K)a5wIitfu(CFYV4FT?LUB$jVwJSZz
zNBFTWg->Y<QdHGXO6(B7DL40#@QH~&1bt_RGfAlw%_YsP19wAkHXw%~G9G(zw;=yC
z_Wta^hs{<khF)Et{~KQ(Y!<^`L|pYl%vB@$I(;3RmQHq?VZ^(}{nUdkKh|wO|NXu)
ze|eLtM-LNkZU|pzO^)wX4?x7Y#55_{=sp>k0j&h3e*a5><wP*B;A~Y_-J8$UU=+E3
zs|^$XdARfHEBrp-b3qaNg~XRwL;d6S=>B=-xM7dE`IuOQna!u$OoxLlE;WdrNlN)1
z7**de7-hZ!(%_ZllHBLg`Ir#|t>2$*xVOZ-ADZKTN?{(NUeLU9GbuG-+Axf*AZ-P1
z0ZZ*f<D6L!WI}YtFrx~d;ZCS=O$ReN3~!sEoYV$RgCJx3D(Cp-Mie$*C4cS*q~E}&
z0BT11xQ>x+ck4{XtFsbcc%GRStht@q!m*ImssGwuK+P@%gEK!f5dHymg<9nSCXsB6
zQ*{<`%^bxB($Z@5286^-A(tR;r+p7B%^%$N5h%lb*Vlz-?DL9x;!j<5>~kmXP$E}m
zQV|7uv4SwFs0jUervsxVUm>&9Y3DBIzc1XW|CUZrUdb<&{@D5yuLe%Xniw^x&{A2s
z0q1<L&7;HiAPZm8Z=iQR8>+owDSfc3Gs?ht;3jw49c#mmrViUfX-yvc_B*wY|Lo7;
zGh!t2R#BHx{1wFXReX*~`NS-<fA!XHlF+kxYYK8u1|b%w@Tz%ELs#ab^++6I>LpSX
z#TV*miO^~B9PF%O0huw!1Zv>^d0G3$^8dsC6VI!$oK<B%_ozoN7z7_(zzYjWYY9bu
zd)NEdFua83uR-Vf-s4v#aHcT*T0qDHMRnnTV@TqU{LFRZ2dsH&3pJ!02lVAX&;IMb
z^MANDir>DKiXdJt{mGkyA`+Gwd4D-^1qtNTUK)`N*=NTG-6}=5k6suNfdLt*dt8D|
z%H#$k)z#ZRcf|zDWB|pn<3+7Nz>?WW9WdkO5(a^m+D4WRJ9{wc>Y}IN)2Kbgn;_O?
zGqdr&9~|$Y0tP=N(k7^Eu;iO*w+f%W`20BNo)=Xa@M_)+o$4LXJyiw{F?a633SC{B
zl~9FH%?^Rm*LVz`lkULs)%idDX^O)SxQol(3jDRyBVR!7d`;ar+D7do)jQ}m`g$<s
z-6tu{nP5&-otsZNY)-$k`{Pj80gwuW=4gjb+bXY>TevUD5@?*P8)vo<u;hmO(wx=4
zu#Ty4#N8dV+4db_oTh<$^Q+`f9^xq{WR#>a?kEe@_hl{_h8j&5eB-5FrYW&*FHVt$
z$kRF9Nstj<DlnDleF4(_XZ^q<)s2!0YS`L=!d-ZCs(bT}fT({j8NU<*U4dqQq?|<5
zrM4G6K$2co@=m3s4&j>%KRzpjdd_9wO=4zO8ritN*NPk_9avYrsF(!4))tm{Ga#OY
z(r{0buexOzu7+<C7l)}{Nc<qc*P;@OPvjmTK3RfnIjfpHVr4;vhpzPB(e56`ue)+^
zV<puQ4Ra`IJ1<xY9>rw8E08Gxd`LTOID{*AC1m*6Nw@osfB%0oBF5sf<~wH1kL;sd
zo)k6^VyRFU<BuKKXLDd>`)dt*iX^9&QtWbo6yE8XXH?`ztvpiOLgI3R+=MOBQ<kj1
z^+$eZoWa#nXjJMS{t(g~l-@9Ro*c@Zd2iRE?D?Zo&wSDp9cqKFwo)iB{||Ez9c*1E
z4LKsK`*%O!d#7>9<gyqCJnWR~?z%;3dw3=(Pq|GAF4ceN5fzvX+wwedai5kotW7if
w9)|ozV<th{;5oaSc=(C`Xv64I>=rMVgi<*CU%+d1PQQ0a1U=&b0vkF207%xU0ssI2

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..4f0f1d64e58ba64d180ce43ee13bf9a17835fbca
GIT binary patch
literal 982
zcmV;{11bDcNk&G_0{{S5MM6+kP&il$0000G0000l001ul06|PpNU8t;00Dqo+t#w^
z^1csucXz7-Qrhzl9HuH<!ckn_w-(t15itRHmqN0O$B3XH(E|jyV^QXq8=yM`Q**vy
zpEpgQd+no=J<Tlv&+_>B%l>&>1tG2^vb*E&k^T3$FG1eQZ51g$uv4V+kI`0<^1Z@N
zk?Jjh$olyC%l>)Xq;7!>{iBj&BjJ`P&$fsCfpve_epJOBkTF?nu-B7D!hO=2ZR}<p
z;bEy|mw1;}P&gp|0ssKe4*;D3Dlh;r06r}ehC-pC4mGy`3;|+W04+#B4r_x~@mHHy
z)H}bD|I2-n_L$pW;*I)~?=#N<)`92&<$3IR`#<SH6@I&FRQa6xBmQ5wPwJ2PAI(ne
z$L2Yb@JHxb`+bLk*AjR$^`b?pr|?!6=+AboIQ2D-p)UI7x(J0|5(5~ur$_+)`>C%4
zc_9eOXvPbC4kzU8YowIA8cW~Uv|eB&yYwAObSwL2vY~UYI7NXPvf3b+c^?wcs~_t{
ze_m66-0)^{JdOMKPwjpQ@Sna!*?$wTZ~su*tNv7o!gXT!GRgivP}ec?5>l1!7<(rT
zds|8x(qGc673zrvYIz;J23FG{9nHMnAuP}NpAED^laz3mAN1sy+NXK)!6v1FxQ;lh
zOBLA>$~P3r4b*NcqR;y6pwyhZ<hjKiZs6mOSFB&+cIl`GV$93-<ciUjF#*1^<p~gh
ziQ_{)r0dA7$It&Fe=obxu8n!+elxmgqxPbUL!FxW0;AOfqz@8JOz9Qbm)m-9!^7D)
z480@BoIIb<oT``+rVla8L)8fXO&6}3P9n4v$`6WG<DUNWuKb9J9rUsAn7d-_YWT^U
z{NXl@OAPIJ!>3_PiDb|%n1gGjl3ZU}ujInlP{eks-#oA6>rh&g+!f`hv#_%JrgYPu
z(U^&XLW^Q<WhKYr9rXr6*~Tmpuq6NjnD6;;NNBGIg-1ZvfACQ4{ocrwM0)?`oL2ts
zCXY5KT@`(ir63J0?%+_(-dDgf<6R$u{lCdy6Zi5d+Bf;1OXyD;xe3#Gug*&T|0o41
zD8;$|JvUv&@vsLIH&C5+S{!k&{~Z54^y@9r>X7F9Z*SRPpQl{B%x)_AMp^}_v~?j7
zapvHMKxSf*Mtyx8I}-<*UGn3)oHd(nn=)BZ`d$lDBwq_GL($_TPaS{UeevT(AJ`p0
z9%+hQb6z)U9qjbuXjg|dExCLjpS8$VKQ55VsIC%@{N5t{NsW)=hNGI`J=x97_kbz@
E0Of=7!T<mO

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-mdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..62b611da081676d42f6c3f78a2c91e7bcedddedb
GIT binary patch
literal 1772
zcmV<I1{3*GNk&HG1^@t8MM6+kP&il$0000G0000l001ul06|PpNL&H{00E$D+qP-j
z>Qj4N+cqN`nQhxvX7dAV-`K|Ub$-q+H-5I?Tx0g9jWxd@A|?POE8`3b8fO$T))xP*
z(X?&brZw<itFzD+K&M3~p69>({`)WU&rdAs1i<RDIiSY82S2mupC8Pt4!H6t1GTb(
zWRM~Q$%>T<R+Yg3{a%pbg++@O@<l(ulw^R7DJ5kYQ(?LhFeMn^80iDc8a#OdFhyzL
zDn(d!5nfX;MJV7nMVO%oPb#QF78@wSOhvdEwtz$5l){XK=|H&u(ZCCOX72e)L;uHO
z1tnw`glk~|XjH3U$_P_d)`A8s=1~}>a0x6F@PIxJ&&L|dpySV!ID|iUhjCcKz(@mE
z!x@~W#3H<)4Ae(4eQJRk`Iz3<1)6^m)0b_4_TR<yeHWl(T-||IU&i!Rd!TMUruU72
z<l~rLRD-qWW4hw3Q)?MQ93gOvat1wqq{JcosXwejji>Z+cz#eD3f8V;2r-1fE!F}W
zEi0MEkTTx}8i1{`l_6vo0(Vuh0HD$I4SjZ=?^?k82R51bC)2D_{y8mi_<vjH0E1*B
zfk*0C6jY|!Rf=RG!W+$uDg^D?-XzoVrR42)&Y)P6w7*9BP@dq)8yymh;%(CA$R7+o
zloov8A4l6H7NzQ3vsrJ*;3X6j#0T=toMt(L(p9c**S(b_DMgZG<-Trpa|&g3Ns}I1
zKlp(~|M0=q9!(O5aw}K0apy5RZoJ5U{>?X^=U?2|F{Vr7s!k(AZC$O#ZMyavHhlQ7
zUR~QXuH~#o#>(b$u4?s~HLF*3IcF7023AlwAYudn0FV~|odGH^05AYPEfR)8p`i{n
zwg3zPVp{+wOsxKc>)(pMupKF!Y2HoUqQ3|Yu|8lwR=?5zZuhG6J?H`bSNk_wPoM{u
zSL{c@pY7+c2kck>`^q1^^gR0QB7Y?KUD{vz-uVX~;V-rW)PDcI)$_UjgVV?S?=oLR
zf4}zz{#*R_{LkiJ#0RdQLNC^2Vp%JPEUvG9ra2BVZ92(p9h7Ka@!yf9(lj#}>+|u*
z;^_?KWdzkM`6gqPo9;;r6&JEa)}R3X{(CWv?NvgLeOTq$cZXqf7|sPImi-7cS8DCN
zGf;DVt3Am`>hH3{4-WzH43Ftx)SofNe^-#|0HdCo<+8Qs!}TZP{HH8~z5n`ExcHuT
zDL1m&|DVpI<IwA;|3z1u>y=xsLO>8k92HcmfSKhflQ0H~9=^-{#!I1g(;+44xw~=*
zxvNz35vfsQE)@)Zsp*6_GjYD};Squ83<_?^SbALb{a`j<0Gn%6JY!zhp=Fg}Ga2|8
z52e1W<DmYW|KhLyAh*AQ$=bd-79$cFL1=dC7E!?lJ(DK_A2rbd*I!fTiWjU@hO@LO
z{34r?8R+y6;5?)6c=hv86*TVD<6h<-YN#p%M+B*z{-U|t?d%$+^@~OhgQ=;&eE7WW
zQMm4(i7@Afmhf}Dnwx!Q1lKgexn~licBP}_&7QY=>U%^L1}15Ex0fF$e@eCT(()_P
zvV?CA<sp1RgQ~qYDHIC(K$HgNSDgI7aFI{AcoU=(>%#Sy08_U6VPt4EtmVQraWJX`
zh=N|WQ>LgrvF~R&qOfB$!%D3cGv?;Xh_z$z7k&s4N)$WYf*k=|*jCEkO19{h_(%W4
zPuOqbCw`SeAX*R}UUsbVsgtuG?xs(#Ikx9`JZoQFz0n*7ZG@Fv@kZk`gzO$HoA9kN
z8U5{-<bq**{p6!H-(%Tic#_E`wcN6#HU8-OK@OS$MA~<4ln|3Duf90UXNW1nMhk@X
z!<X~il$GI)0FveT${!;q6+#ptj}^6#CM6bt!8aB|<oIwiQzNU~!^v#E0ATVF@f>yY
zvV{`&WKU2$mZeoBmiJrEd<YP=_2@e1bJ|tRh6}2@09)72_kFh|s|{=Q%;lrD1V0sq
z5(|fB{Q};57E-A$Y;tLp9MPkkDs1?cxgaM#DX)SROj{lUu_=U;L%&QSd(1lwW9=M~
zPXv~y>zUZAv1sRxpePdg1)F*X^Y)zp^Y*R;;z~vOv-z&)&G)JQ{m!C9cmziu1^nHA
z`#`0c>@PnQ9CJKgC5NjJD8HM3|KC(g5nnCq$n0Gsu_DXk36@ql%npEye|?%RmG)<p
z|22C&(o0<{zD=}o7hFmrnHiNsKS+q5do@k^v7dAg(j37~!7%msUYhV9SAD*hicVK@
zd=IyocF&y5dH^sh4`7M2vQg8OP##~+Eu~vo(S~k<e%FqF9ffGv{w_F?KH5TRvvnu}
O>FJ$wK}0tWNB{uH;AM~i

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..948a3070fe34c611c42c0d3ad3013a0dce358be0
GIT binary patch
literal 1900
zcmV-y2b1_xNk&Fw2LJ$9MM6+kP&il$0000G0001A003VA06|PpNH75a00DqwTbm-~
zullQTcXxO9ki!OCRx^i?oR|n!<8G0=kI^!JSjFi-LL*`V;ET0H2IXfU0*i>o6o6Gy
zRq6Ap5(_{XLdXcL-MzlN`ugSdZY_`jXhcENAu)N_0?GhF))9R;E`!bo9p?g?SRgw_
zEXHhFG$0{<gIr?LrJWRzItY~y<Z<EAV-uj3XnE%(*emp8D=Y7PQV-i%2@c@D|9<;;
zH`2jMaL`24BPUPYdJ=PY$>qYOqhdX<(wE4N@es3VIo$%il%6xP9gjiBri+2pI6aY4
zJbgh-Ud|V%3O!IcHKQx1FQH(_*TK;1>FQWbt^$K1zNn^cczkBs=QHCYZ8b&l!UV{K
z{L0$KCf_&KR^}&2Fe|L&?1I7~pBENnCtCuH3sjcx6$<!T3RX}!APxoq0Pr3FodGIf
z0AK(<F%pMDq9F}cg8&c#f?65g)=yLs(=CjK<M1`M`}BX}chg7A2kI}XuSf^#uUY@|
zuTu{#uVwGqpV;qc@BVqrAKdd@@EN}QTvvI{5IcyyCGks*4qjQY^_27g{caAK)e)>c
zwqkNkru);ie``q+_QI;IYLD9OV0ZxkuyBz|5<<a3;s18CO(E<oo}Dr9FcHGkDGekL
z%8=D|QqLZ6i9<4n7z0@0Z!*y6<{tFE?Q(JSs1PS)KpVZ-UuvI<x@J>$1BH|vtey$>
z5oto4=l-R-Aaq`Dk0}o9N<n-Lxhke(>0VrkqW_#;!u{!bJLDq%0092{Ghe=F;(kn}
z+sQ@1=UlX30+2n<VrE^M(W<0s>WjkL$B^b!H2^QYO@iFc0{(-~yXj2TWz?VG{v`Jg
zg}WyYnwGgn>{HFaG7E~pt=)sOO}*yd(UU-D(E&x{xKEl6OcU?pl)K%#U$dn1mDF19
zSw@l8G!GN<gn3;8HSds=>FB3c3VVK0?uyqN&utT-D5%NM4g-3@Sii9tSXKtwce~uF
zS&Jn746EW^wV~8zdQ1XC28~kXu8+Yo9p!<8h&(Q({J*4DBglPdpe4M_mD8AguZFn~
ztiuO~{6Bx<ZU5#l%0-dq__bYvK~-`BMo2EW*Vk@0Uv@y205m+Q&aq=TSlpam*A$L@
zZ$K+cMvxib3m9dD17_p){u>?SfO~_ZV(GIboeR9~hAym{{fV|VM=77MxDrbW6`ujX
z<3HF(>Zr;#*uCvC*bpoSr~C$h?_%nXps@A)=l_;({Fo#6Y1+Zv`!T5HB+)#^-Ud_;
zBwftPN=d8Vx)*O1Mj+0oO=mZ+NVH*ptNDC-&<HMad!<Q5dhOvyth5Fc&!i0MbxZ%N
zU%|-$yCvba94#fAF;MI_OEH#`2k(1(gihK2jMyvsOoHYgzVHUqgQ68^-GY7|rOOyF
zoC~vHfip03zI!qe_AurbxIn0~<I(%>zZ7Hwho6UQ#l-yNvc0Cm+2$$6YUk2<tEyER
zK*7f=uUP>D2t#vdZX-u3>-Be1u9gtTBiMB^xwWQ_rgvGpZ6(C@e23c!^K=>ai-Rqu
zhqT`ZQof;9Bu!AD(i^PCbYV%yha9zuoKMp`U^z;3!+&d@Hud&_iy!O-$b9ZLcSRh?
z)R|826w}TU!J#X6P%@Zh=La$I6zXa#h!B;{qfug}O%z@K{EZECu6zl)7CiNi%xti0
zB{OKfAj83~iJvmpTU|&q1^?^cIMn2RQ?jeSB95l}{DrEPTW{_gmU_pqTc)h@4T>~&
zluq3)GM=xa(#^VU5}@FNqpc$?#SbVsX!~RH*5p0<xA(I&qyn@)(mw0@a&Pg3L>p@w
z;~v{QMX0^bFT1!cXGM8K9FP+=9~-d~#TK#ZE{4umGT=;dfvWi?rYj;^l_Zxywze`W
z^Cr{55U@*BalS}K%Czii_80e0#0#Zkhlij4-~I@}`-JFJ7$5{>LnoJSs??J8<Z-XK
zj&@i^7ta>kWVl6|8A}RCGAu9^rAsfCE=2}tHwl93t0C?#+jMpvr7O3`2=tr{Hg<Kw
z(MWiv`>$=HlnjVG^ewm|Js0J*kfPa6*GhtB>`fN!m#9J(sU!?(OSfzY*zS(FJ<-Vb
zfAIg<xCfqXs~;Xmq<7KOO96xsPR{hU&apj;5A)}6v`#`8fe>+`U)YaXv#sY(c--|X
zEB+TVyZ%Ie4L$gi#Fc++`h6%vzsS$pjz9aLt+ZL(g;n$Dzy5=m=_TV(3H8^C{r0xd
zp#a%}ht55dOq?yhwYPrtp-m1xXp;4X;)NhxxUp<Z`Z=zPQ_3&gbp_8a`>gP%XTLmO
zcjaFva^}dP3$&sfFTIR_jC=2pHh9kpI@2(6V*GQo7Ws)`j)hd+tr@P~gR*2gO@+1?
zG<`_tB+LJuF|SZ9tIec;h%}}6WClT`L>HSW?E{Hp1h^+mlbf_$9zA>!ug>NALJsO{
mU%z=YwVD?}XMya)Bp;vlyE5&E_6!fzx9pwrdz474!~g(M6R?N?

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..1b9a6956b3acdc11f40ce2bb3f6efbd845cc243f
GIT binary patch
literal 3918
zcmV-U53%r4Nk&FS4*&pHMM6+kP&il$0000G0001A003VA06|PpNSy@$00HoY|G(*G
z+qV7x14$dSO^Re!iqt-AAIE9iwr$(CZQJL$blA4B`>;C3fBY6Q8_YSjb2%a=fc}4E
zrSzssacq<^nmW|Rs93PJni30R<8w<(bK_$LO4L?!_OxLl$}K$MUEllnMK|rg=f3;y
z*?;3j|Nh>)p0JQ3A~rf(MibH2r+)3cyV1qF&;8m{w-S*y+0mM){KTK^M5}ksc`qX3
zy>rf^b>~l>SSHds8(I@hz3&PD@LmEs4&prkT=BjsBCXTMhN$_)+kvnl0bLKW5rEsj
z*d#KXGDB4P&>etx0X+`R19yC=LS)j!mgs5M0L~+o-T~J<oyc-(l&0FxfDJ)vWdrzG
zjkHRMCVIq8fJ3SsaN{G0bSezdyMc{>l!p!AJxnGAhV%~rhYUL4hlWhgES3Kb5oA&X
z{}?3OBSS-{!v$nCIGj->(-TAG)8LR{htr41^gxsT8yqt2@DEG6Yl`Uma3Nd4;YUoW
zTbkYl3CMU5ypMF3EIkYmWL|*BknM`0+Kq6CpvO(y$#j94e+q{vI{Zp8cV_6RK!`&C
zo<pW1O@mj#Ba$B1jF9e#KLC$tdVGRA(KNLm5)Z-c3uM|e{5g0;)Z=U1o}r1okeCSe
z&690M^SdF4s^BB6+fY=x1U@bvmsd$`X83VHh)V#T!DbU?^&>b$*5Q|$IZ09dW=L!V
zw@#2wviu|<#3lgG<y?|hUxpyMg6}AupvayTr}GM=TMW<L9;Z9j*(N<6al*6Nv}pBq
zNQh4md`M{`Vm9A~M}$3oY?+A^<^B<?{}o6PDK4EKtBb3wi8R-)jnxf}q~=aYj0C%v
z6V$@(vAW|xm9TnOtnNN6L9fN@aGkJpd#vs_IB9lgtah&@ZNCmaMjkgzYeS?|_54^}
zTvwWi)xbYv^}ni8L~Kgmjn&V}hKa})-h&Y069PU_u+-A`bU@-Gz>E8GEhcx+zBt`}
zOwP8j9X%^f7i_bth4PiJ$LYtFJSCN$3xwDN;8mr*B;CJwBP2G0TMq0uNt7S^DO_wE
zepk!Wrn#Z#03j{`c*Rf~y3o7?J}w?tEELRUR2cgxB*Y{LzA#pxHgf}q?u5idu>077
zd^=p)`nA}6e`|@`p?u}YU66PP_MA}Zqqe!c{nK&z%Jwq1N4e_q<#4g^xaz=ao;u|6
zwpRcW2Lax=ZGbx=Q*HhlJ`Ns#Y*r0*%!T?P*TTiX;rb)$CGLz=rSUum$)3Qyv{BL2
zO*=OI2|%(Yz~`pNEOnLp>+?T@glq-DujlIp?hdJeZ7ctP4_OKx|5@EOps3rr(pWzg
zK4d3&oN-X2qN(d_MkfwB4I)_)!I_6nj2iA9u^pQ{;GckGLxBGrJUM2Wdda!k)Y>lq
zmjws>dVQ*vW9lvEMkiN3wE-__6OWD0txS&Qn0n22cyj4Q*8(nG4!G{6OOwNvsrPIL
zCl-$W9UwkEUVuLwyD%|inbOF*xMODZ4VMEVAq_zUxZ+K#Gdqf!DW$5f)?7UNOFMz!
zrB~tuu=6X2FE(p^iqgxr+?ZK;=yz`e;C$#_@D9Lj-+TDVOrva>(#*PVbaHO>A)mhl
z07OJWCqYC60518$!&c`eNBcBW%GnfaQ*$eazV^2_AW?j)h;J1nUjN(I9=0+!RVx~%
z3@Tf!P0TE<o$!VqowG;KvFtwQM{hV`ZE0qrR<w#Ts%&9+`_$a>+98jA?WceK-}A1%
zW!K)lyKcGqy#M~})315-A#2NXQ`?6NR#Apo=S!oF=JfpX>iR*49ec{7AN$xxpK{D$
z2d%Fz&rdfSqourN$~Y^NFIMV1CZ?J*bMx~H3k&meGtH@q9ra2vZxmA$S(#jaaj-g4
ztJmxG+DLV<*q<|sDXPp$X>E)#S}Vm&sRaO5P&goh2><}FEdZSXDqsL$06sAkh(e+v
zAsBhKSRexgwg6tIy~GFJzaTxXD(}|+0e<LznCgHsU8?@8?t|e+t3NOg)4%V%QT)RG
z#(vWKzWPe^0R3j`BK^Sj0R4vax&4^<HvOypv-k`BiT}~o0m7^OSGJ$@KajnJ{#EwN
zDeve%EWSc^7qvI|&GkIv!CUKJ?z|=S5$~^*hbW!^KEXeU|8)Of{R8r6<YPXsPJiI{
z3I0p{JN=jUpWV;#UH-pt{fqxv+)or1tENbj$yb}h1W}{VuIxcdxr4O$Pk-W+vE;HW
zs>OwFDA%rn`X;MVwDHT9=4=g%OaJ9s%3b9>9EUTnnp0t;2Zpa{*>mk~hZqItE_!dQ
zOtC>8`$l|mV43Jbudf0N6&&X;{=z}Zi}d1`2qmJ}i|0*GsulD3>GgQXHN)pkR6sf1
z?5ZU%&xtL}oH;YiAA)d*^Ndw2T$+Mjuzyzz@-SM`9df7LqTxLuIwC~S0092~+=qYv
z@*ja;?Wt!T!{U?c*Z0YtGe)XbI&y-?B&G2$`JDM)(dIV9G`Sc#6?sI60de6kv+)Qb
zUW~2|WjvJq3TA8`0+sWA3zRhY9a~ow)O~&StBkG2{*{TGiY~S8ep{V&Vo2l<6LWsu
z^#p0-v*t2?3&aA1)ozu|%efSR=XnpX$lvT<i5fh}s=@+>eRdKlvM!@|pM5p2w3u-6
zU>}t2xiYLS+{|%C65AzX+23Mtlq?BS&YdYcYsVjoiE&rT>;Necn6l^K)T^lmE`5u{
zm1i+-a-gc;Z&v-{;8r)z6NYfBUv+=_L}ef}qa9FX01)<p5}9zArZ*5BNNPrYJe?q^
zoGwf&5As9!{7(Mh#&*CXqg;f?QnQ-nlaTt)rSHVHCm`47n7&FR=c_u*_Tb`8rUm3H
z0O9JxAZpoqT#O$8lO#-qLUxwg2QFpWD)MH~tWW!FJ@rL#Z3X@-EA+a_!T&{YBN@VU
z#uLh{fnX}ph>+Aaf+;xj(mL6|JUzGJR1|fnanb%?BPPIp>SCjP|8qE5qJ{=n5<?FH
zyc@=a;51oWzuVAcj6pr}S4=V^1y$yRMekrgPiZC)AMQEB*qQt?gOx<6n-Ze<xOk%8
zJlp{hn2r5lN&v>ZGw?8<T=j<kiK3k}QNf{mJrZ8{h9VJ5mymJ}tharUQVZ+A)q|JA
zP<4CV&CzPUYMZ;!LAXmAxQKNOUhvT9Hs7xDmh*<vTKo#A=V}0C%3}Bd)|`ucui<U}
zkh|*TSU#9=A^@TE$st=m>1z3(k;pzH%1CtlX50{E7h)$h{qGKfzC`e2o`*IqA#tjA
z`Fz&^%$b9F*N`)U-#6>a)Z`55`$Dd0cfcs0$d13^ONrdCu9x<t+8g+uSD5W2zwlhC
z|JVEzcPV$NtS$c<FJ8rBZ;INEx+Q=2(FJ^S342`@#MjKlFl)sF8^7THVLheX^i?L?
z&CPm=G=y+=EKRt@v8Clr<)efd)hKaE^n%ZPKLi%wwD38M$NzP!(WBLE?qcvP01sx9
z&*Nj-pc7`@jq=MVuj=Qp>cv_=n#WQo8stcz3jP9|2EvdI-RhJM3%Q%oM&!OlShM|0
z?gz?<?7WW8=Q%s)y_zh$<gKU|U@-;T<hp_neb-hC9;eMWIi~L=ZQC!2-eBW=SL{}p
z$?;Q@X@<Q+-HdRo>wHZSnm45njLtsz8PVT1S&jAlbKg5kVam$p16=EK@Sj4EP0OtH
zmJDmdc^v)x>56Qg_wmYHz6h)>kl_h$>0@J!ypv%APmjZTAQVLy6Fu50RGY&JAVNhx
zrF_qG6`x9MkT;1S<Ag2tDFJ87lYSIU+mImGAD&|@nwJc#mdqwB-2t$i{E|O|iC+rn
zTx&X1e_l?93I&#?`F=sa9qG87|KIc6S%E@vyQNP?qi0>FWo$)l{M$;3qUDn9JwE}z
zRl#E_bDRJFii61kPgBybIgp8dNW!Cc1b*^YYk-#oWLJvtM_v^hQx~9?8LD4VFFxBF
z3MlrsSC%f9Oupn*ctPL0U1fwfX<e8j*;kJ8_CN6%nCTqo2`3d9Pst}VgQjU)?(M7p
zzxo&&R>?`tRhPD{PSLFPQOmIt$mDy0SgpNVvHS+f#Do>h1Gn?LZU9(KaN>Q_=Y*_T
zvtD7%_u^^+{g`0VGzg(VZrpVQ<iSLmVH#_?Ygs~6CEv!IHC;9@ugl#8Bd(1@U8J`m
zZPR+rwS3E7Io$PJ#u@SZ7*ofWJeNkkZzfy5$#`y(gV@Mrz3MQq!<5HDiA{dy{A6&s
zm;xq~CnA00hNM6ID4qQ25IVwnMQJks`iwc)#g`8-cX!e+83#89|3i9nc;W|OlG5lT
z#`=rnOh~`2$itxg{QZs*tGy>6Ub5M=tI_p7T93R8@3Zulu3|#{iNcu!oiHxZ4Rf*(
zfmiN$$ru(*_Zqn=`Gq#OuHRTSwp7uH_SokR&|)RuW5yo=Z|_4?qU-JU+tpt>!B&Is
z@N(=SG;bpV<x5xb4+$A4;kTvxjvLCmS(Qzk7DoqV?c3gPc^$ajYmd|>c;AO@zbmMM
zScqq1)b-ZQIrs={oD}|?6y{$HNB1U0^LsBh8JI&3!GBZxOXI<}&5-$lgkAaYqhOTb
z?2vEnZ$-kk;*M_17(upJF3%+iH*s0-r{vttXVB2OUwI1s^+G(Ft(U8gYFXC}#P&E^
z>T@C^tS`Z7{6HT4_nF~n>JlZtk5&qDBl6r|^kzQYe`wq!C)n@$c>WOPA61NDFj<<6
zGW71NMMhwAl!U-yqrq2xrSFqRCI8acw7?}3j;ynxo*-b7Co;g5r%^j=H@9({PXXBf
z@r>U>>N;E)81wx`B4f%{PB~MHka_);%kBCb(d|Jy5!MqJ%2p`t&@L)4$T2j&-WHvG
zv3(uyA_gwqNu(k?jQTtv3dgPKRZoH8prxe7>pQBW5L&dpumS&5Ld2?(sCpJjvc4L5
zEnh&?91WVm)ZdTj=fjJ$<vc{WSJ<ii^$T&iG*Yv9jX<?z?mPY(t5t-1^A0*RQs5X?
zYSLjXl&MWC!|=j$?-@JVu!#TF`ZHTW&ulyLWq^6N!VAX2Xmm)BA=Yu5B~k=gi4VJ{
zIG~`oyZBm%<a3bH1xUE^?HI_r5%K8}A8v#m>pPDdgAttLXuke+?KdKxu<Qg!^11Y?
cG7e%GKbg~lPT|05mMxkl$h;o@5^?|l06hZIiU0rr

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..28d4b77f9f036a47549d47db79c16788749dca10
GIT binary patch
literal 2884
zcmV-K3%m4ENk&FI3jhFDMM6+kP&il$0000G0001w0055w06|PpNY()W00EFA*|uso
z=UmW3;Ri7@GcyiBW{ey$jes55b5S`|ZVZ{(x$xch{z?D+^{yErVgleVwa9qvGt40r
z42;MG=7<0QySlzE=Ig6%01!FBK^$Fsxe@Hfe6aCy?Wh2r0~}@_lQAF90oTUi0FhEr
z#(<GhM2CTE5->*;kTC<I6%bkw<P!?WpaDI10CfmFPKu1G=p;%VjNdWOy5JfR^IubB
zmWbY`5VOa4^Co4?lA*0$&a)>(r!tQk6;gxj4h%FdHAt(^M3YvYj(!tOeN)+Hvj6+<
zzyJRG?^lZfWuR#t!tUKP&(?%3v&Zd<cNCc=qEAh>$R2YN>lB(Lq`OInY48%4%yTv2
zYe1{G`3)(PDEio5Y@-I5tUf`c%%O<RR41hl;a9a>CJMtSW56g3iEg%3`$7XSJJHyA
z<|7&N)5Xrlgv~%BO24eFd;Hd;uiK%D`EdK|quUeRZDqbh9l)%j%J#0lfrZumvA<_w
zu&=AVvdChf6}eqh(bUz`(`Ue*p01{fBAcTgKyDYLs_I+YyJEk+rM@avU~>fB$n)HS
zM7pfJydu`i%gfS<LN|{i=ttzza$L{zW8L#y$C4ZoauSg-Za~HmA&1d`@TXE%P&gn!
z2><{PF94kZDv$t>06sAkheDzu40NJ$5CMW%n^Lls?8^p^QGWURbKu3ZduZQZ((s2?
zzE`}<{<HFD`<Klx_&=&1@jStQ!GBWei{=~oAN3#D|91cP9<hA@KTtaAf1`iE_5l7p
z{_oR6_8;Nf_OJGn;4k(%=}+Tjk)AMkcQv~rTZfv5m>;Zt7<$C|9R8A~DJ~@%x>TfP
zF>TX8)@v|t)q4GjRt<}5s6hLHwRel7>V@&r-O|Av(yh;Q1A{E>Ir>p+%dHD|=l+lT
zpr(Dg&>#Nu=!)6bCLr-ZS%|;h)Ij$+e@r8_{qO19QvDe=&1tmpY*0lcA^Cc-#{9fQ
z<~$*<&P$Q<_jy#<$40PMofM7aQ}C=jphI`4kLg}Z7CIN#<BNcF_3BC#1Or(Pa6X(x
zm*>26D{-4v-_CA-LiE@(%{y!BzsU%gG`Q?sjLUf%qFSl0y)2#ae*+EI><i9^|McQ6
z&u-wt#o7y!{eZ1mUVdD*Za<g;gIMqY0RI1A80-K3n!MCY=3j$fj0a3c7yP%@*`6F<
zo`Ip>s|i`d^V$Dn)qmzqRq6VJRY|{4ujsIU%#bnqU6MR&-1I_43=|5(6Jr;Jvert)
zE?S|Tmn}Tv<-??sxV5@9t}3D=>YZ0JrQe$CO~|EY=Lj9RM&4svQHPQL6%pV5fPFiH
zfXDx;l@~et{*{U*#c#Dvzu)<y{p<iKevLm%@24Es{u1>|znDO7$#CRx)Z&yp-}<F^
z`~J$vWM;oQpQO>SrD{&|(MQtfUz~n35@RLfUy=aqrhCX0M}J_r5QsK~NmRCR|Nm&L
z41UdsLjWxSUlL41r^0K&nCCK>fdR-!MYjFg(z9_mF^C|#ZQw?`)f6uVzF^`bRnVY&
zo}@M06J&_+>w9@jpaO4snmU;0t-(zYW1qVBHtuD!d?%?AtN7Plp><-1Y8Rqb20ZaP
zTCgn*-Sri4Q8Xn>=gNaWQ57%!D35UkA@ksOlPB*Dvw}t02ENAqw|kFhn%ZyyW%+t{
zNdM!uqEM^;2}f+tECHbwLmH*!nZVrb$-az%t50Y2pg(HqhvY-^-lb}>^6l{$jOI6}
zo_kBzj%8aX|6H5M0Y<)7pzz_wLkIpRm!;PzY)9+24wk2&TT{w--phDGDCOz{cN_ca
zpnm7`$oDy=HX%0i-`769*0M6(e5j-?(?24%)<)&46y0e&6@HCDZAm9W6Ib#Y#BF6-
z=30crHGg+RRTe%VBC>T00OV6F+gQDAK38<n*vA8r%O6>Ne3N9bm|62tPccBJi)5{B
z4zc^Db72XiBd}v$CF|yU{Z=M|DZ%-(XarYNclODlb1Kz1_EKLy(NSLCN`eUl(rBCL
zT*jx@wNvze0|TSqgE(QArOZU)_?qH(sj#TwzElLs9q)(0u!_P|R%Cy_0JFQxgGV>1
zz4?_uq<8_gM0`c*Hh|;UMz~vrg1gQXp{ufg`hM_qU;U>+zmvc5blCLSq@PrEBSGR#
z&8=2Z4uXN`F3p73ueD1l{s{k$WipAvSh5W7ABe?4)t;r@V?y`bNB5FvBuE|0VRTb<
zM1Hn^?DSsJY+sX@T5xW=#>T9VEV|?<(=6|ge$X6Sb05!LFdjDcoq*gM(Zq<f*af)i
zNrX<tMgmsmg+`)u<gVRy&HOky#ont<pVW|J_-$wrA`xxK6{hhd+PXR8vNn*oM*H0|
z1qYtJ28e684_5Ps?yhMANn+G%uO1h`$vWv3s;1>=t;_)Le&jyt(&9jzR73noru`a#
zN*<`KwGa^gZU3-)MSLF0aFag#f0<>E(bYTeHmtdbns#|I)-$)mJ`q9ctQ8g0=ET?|
zdO}eZ*b_p>ygRTtR^5Ggdam=Zb5wmd{}np+Jn1d_=M`~P=M67jj})fH4ztb5yQqQW
z^C|C&^LHAK-u+ooIK)yM)QM?t;|<{P;;{`p=BclzAN#JzL4jCwXkQB1Dy{=^KR`=~
zTrr)y7eiYBzSNs_DvO=4A6#EgGS-zY%Vi)N*Yb`U;6o}KR}dq{r9pT5wqZ@3NOE8-
z9-(}D|Nc5732CSYQbL)!gPQ#RbD8BhK3dl{sUuPvei0tkvnJBxDE<YT0)IF6ZR)Bk
z@)a0nBbA1w8SkQ(D#i5&8jGNWcVh3%MMH8Vt0#Cqs{7rj9lAfnOxdi%ON~J_Lk4Vr
zr{*Y)igLGP+Xld7jyNiw*|X1cmPqh_jE+%>AYTesU8H$)g(Plra{VH(v3u^CO1~(+
zU0O7#)jaS4{NcwA+LuSm&VBcX2#Im3xg)W}ySNw%->orn1taZ&+d)}8gJTqA!u|5P
z{yv?zol_3|(1(%M(EVU=cp?L`{Pi|ixk<Zz{d_OJ{%(afPiA`kGm0)dQ`ag~77r|Y
z&C+7i1_BU!*UJRd(^@b?4zBGXgdZlcPU8~&SFU-ec*eK#s8l5P4x$+w-ol8WnhVHs
z<8AXv+lumqmDSsBEq_1%nCKJHKDdY<XS%xm_eRL@MHf03BP@ZPs+4efWYQybye<P;
z!YgDeDt`-=e#48=xgFFnb3ip6+;21bca6@PSyeFDq6U)Bi{elQF$F^{M8$^wE9+h9
zp|0OT-Yl*F^H*Gl@RJ6Ygk#_Hwne|c{O*=S8hR2WOY7QEb^oD<fAVQQx1i_#15%F~
zSB12atfnDt>{U)*guFML3P!OSlz;zGA#T+E@8@cgQ_mv1o7RSU=Zo_82F?&&2r;WE
z@wk}JHYEZ9nYUc(Vv~iTCa3u8e4q(yq<29VoNbKk<beKrau_(DO?g1SPxl?tXR8kl
zm@B7yS{4nzYa-BC)B<s3ZV|tCLVRY=S6W|%ltS7#@=YN0E{Q~^h`zp6^Ds5_kY-c@
znjlqvzdNqVg-)ddJh>|`mq%I6u)My=gPIDuUb&lzf4`M<g#L>EA9^g8u<af%@W-r>
z)vp8|$$HE9m_BTV?lOosIGa4jud=jIbw)O2eCMfyw2*S8?hjWw^nqws$O*M$3I1)x
zR0PWFb3$ySOcGTe1dz%N0l;RPc`x%05FtT^f^j{Y<u?Msf@VVK=mBY*;G{h}T6alh
i;_JuyfJ;~Um+rnc{a6{0b-ci|^HsjhJK1mm0001WTfUJ1

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..9287f5083623b375139afb391af71cc533a7dd37
GIT binary patch
literal 5914
zcmV+#7v<<uNk&Ez7XScPMM6+kP&il$0000G0001w0055w06|PpNFxXU009|=ZQHgn
zNOpH4`X7PzLlv+kr;~&u5J=ize1wQdZAZ0jE!n|crW7E6^)EmH0x*}~^%g*95fd;0
zmbPs>CP}*Q=lvp4$ZXrTZQHhO+w%wJn3c8j%+5C3UAFD&%8dBl_qi9D5g8fry}6Ev
z2_Q~)5^N$!IU`BPh1O|=BxQ#*C5*}`lluC515$lxc-vNC)IgW=K|=z7o%cWFpndn=
zX}f{`!VK<lX!FLkOBT(Yy<iOh1h{fQLjvrOQ%*nbfL+k;)$>02_kU+Q5a3m37J;c}
zTzbxteE{GNf?yLt5X=Bzc-mio^Up0nunMCgp*ZJ;%MJvPM3QK)BryP(_v@ei4UvHr
z6<pZ&&fAXylxW0w%M}QG@yfnPZdDXqIP=jPCIit7o$6lTs5icHCgh=N0unBI<zgT*
zptDoN<h$R!7(%ELhKSSOx&nxS==cyKaPf0zAd!_(MC|v$-B2pfoh+ho#bf&+hH7Al
zeEH1*q<}6i9Juxt0<3_LaK_h8L<~fCTg1Svrz2X|nM!={Hli82o)&S!Wq@^;;Sc+N
z{~W{um1qFY-5<^_GW?+R&A}Lmstggwo`S@#F#weeVu1=JFn9vNE-}DWg2+&<6b|VD
zyTX820Y3(!YAs>+sbCifQaOkL6-;5fL8$W($zZ_;CZp305C;~$hhRquZr-r)jjd1z
z31%ZK{-(`P#|Um_Sivn@p$-vz46uqT>QG0B1w9znfS9A8PB2LaHdzA|_)yjXVR*l{
zkcu3@vEf7bxH0nkh`q?8FmoO_Ucui*>_a~P?qQrl<J8L#kWCf%Wh%yn(Y}gU%LfuZ
zDk8@t_|u4e$m`1t<6z}J_rs7?FJ3+<S^J2$5qt6i-|juKZJ|8nXar<dxa}M-+9i7p
zv6dS|d)2&6A)dFrYRGP(%Pv>Z9@+D7%MTpSnztpylXrt5!-k8_QPB?YL8Kx_On8WD
zgT+111d(Op$^$&KLAN5+@?>f7F4~wFi(8TL8+szgVmcMDTp5l&k6~=rA{Dt}!gb^r
zSWY<)M7D|Z2P0cEodj6E42PV>&>DFmQpgt)E-|#sSUU@uKed+F680H@<;-x{p|n<l
zp8yXzqfa@7p%sObAm$9h$>uH4!_mn85rx>wz;0mPi2ZkL#k6;sznu?cXh!T0S>{w6
zL^gvR05NY64l*<+_L>On$rjx9!US;l;LX6@z}yi#2XHh)F@Oo+l)h%fq$v}DNmF2>
zfs^_t0)3N-W<9-N?uedVv{)-J0W5mh#29QM5R5h&KuiRM=0Zvnf#lF=K#WlCgc#9c
zS;qvh(P$<N4^0H>!_a8JwyhI^ZJV2k+B6Z^64?w|1?5gyo6y{}923CRZfYVe1#?F%
z7h2SUiNO3;T#JUOyovSs@@C1GtwipycA=*x5{BpIZ_#GCMuV8XK=x;qCNy{d7?wA~
zC+=vjls;ci&zW=6$H~<UuGL>4^K%v{p}Ab?U%C6Z4p%eC<3ExqU$XR<<U%Vkem)I3
z!`%PIvLz&ze?Zp%vCR@%m16n3hACIF^0#G_T7epA#z)8(rvE?Hg_ap6_uYP)Tb`)h
z2GK)|(Rz!WpFyU@LzjyfQ_<i5^lr&=M4!BSy(W%@)>}LLF67A$Sr20DR_pJ3yeBa~
z^sw{V0FI5;UpwXsScYuhbqGQ`YQ25;6p6W^+tgL&;Ml;>S3CGpSZ>VrTn0m1$y$HU
z&65)I!c?oREz};c=nLCliriqQX->4uivHTgd${GqeAlf*!P^B|jkU|*IdNP(&6C>4
zqOW$)Nw9nvjy^&`?E|gotDV{JmJ9Q~vuhy<<G@fboDAhcI5Dsk#+9Mh1`k6v58TQ6
zTpAxMdW*w$2XjE|dQ{O{2;)nJq6kNrSbdZo9zqeu3!nwqzHn9@9s3%Bu@kHycSZ(x
z2&|bA;|GSCg#oDAgn`0}Ky&~=v%vnTsQAhK3}!@Ul4k5Hs;%f_FcO_g5=04B6;XmB
ziOvB@G`0e&A^~64K@uGVfFy@D!BjmmY#Jg-bQD1l@^zr9M#Q=#5Jcz8rNs%Ao0hm-
z=t_Aq%~%e4bvUtd2FypO;{-_wnmGsNRpExY)16Tgh|VXVky}6f62Ys$1HSxdlSZOT
zCCO8y{_zPY?=~0l+26%7xde3w04W9Q!Is)V1LkBGNde1$zrfW<NfNr*%|ZxRzT^JA
zmcTCY6tLybe{jSY-O=SD&Deu-#rFFZ=3p1N3e^Al)6K5on3B|W0L{#5+PrG&o;8D$
z9VJJ=wm<!FVPYf3<lcP%LC|1@)-Aw}12hTj5D5k>`^C4XIUDt|j4o6rK^e8_(=YqC
zuaR<q<0Od&Y@7Cjug_237^*j7a#aQ~lCq#UYtH7{U_qlC0^1@%Gyuc1|NWO)TNBEm
zMx%@_RIC6AfxhnJIcv(^$)J&PfGr82VdUyLpZ<4xbZB^JxZa4#RXG^p?q<@OkN-Dg
z>6TRVf@tUFHB079o4MBIh{M~4>WwnGgesQH<tZ7VZHqtr^FKfQeBMqw3{0x^1cRqW
zV`%gGwJVk_Syh(=#d=wm^?D<@gsPV0$vs99^0;ZqG|-B^-cjnq$sLjec-e@tEK@9#
zOQ>*3?w(RA%hCZ*7)b!aNV=yOQ%o_Y=<Y6|>Lt0Sl*(9^jfRnC210Om$=y>*o|3z}
zAR&vAdrB#mWoaB0fJSw9xw|Am$fzK>rx-~R#7IFSAwdu_EI|SRfB*yl0w8oX09H^q
zAjl2?0I)v*odGJ40FVGaF&2qJq9Gv`>V>2r0|c`GX8h>CX8eHcOy>S0@<;M3<_6UM
z7yCEpug5NZL!H_0>Hg_HasQ<COXdOkdH!pu@0dU6f8zgSJ>GxR`rY&Z{geOy?N92Z
z{lER^um|$*?*G63*njwc(R?NT)Bei*3jVzR>FWUDb^gKhtL4A=kE_1p-%Fo2`!8M}
z(0AjuCiS;G{?*^1tB-uY%=)SRx&D)pK4u@>f6@KPe3}2j_har$>HqzH;UCR^ssFD0
z<L=e_ckEC4xZ1K8&yCddum>7h+VLO4o@_Yt>>AeaZKUxqyvxWCAjKB>qjQ30UA)#w
z&=RmdwlT`7a8J8Yae=7*c8XL|{@%wA8uvCqfsNX^?UZsS>wX}QD{K}ad4y~iO*p%4
z_cS{u7Ek%?WV6em2(U9#d8(&JDirb^u~7wK4+xP$iiI6IlD|a&S)6o=kG;59N|>K1
zn(0mUqbG3YIY7dQd+*4~)`!S9m7H6HP6YcKHhBc#b%1<GJDT!^vq^Fhq9+GQ)rw<7
zX>L}VIisp%;TckEkcu0>lo@u995$<*Em;XNodjTiCdC%R+TX|_ZR#|1`RR|`^@Teh
zl#w@8fI1FTx2Dy+{blUT{`^kY*V-AZUd?ZZqCS4gW(kY5?retkLbF=>p=59Nl|=sf
zo1Pc|{{N4>5nt#627ylGF`3n>X%`w%bw-Y~zWM_{Si$dc82|=YhISal{N7OY?O`C4
zD|qb}6nLWJ`hUyL+E>-;ricg9J@ZNYP(x(Sct&OI$Y!QWr*=^VN;G3#i>^1n4e#Je
zOVhbFbLpXVu*16enDM+ic;97@R~u&kh__kgP#!R`*rQEnA+_dLkNP~L`0alC|J;c;
zeiK=s8;BsLE)KbG3BD&Br@(Ha@SBT&$?xX`=$;eeel=|R_dIr6-Ro?=HEjnsJ_b`1
zK6Yg^-6;^2aW!xeTK)A~3Rm|L^FCHB_I>jIju7ZGo&N_1*QHkxH2!<tj^laH{Fyx_
z{&=J_f2vo<Z;k$M1Ir~ug1#5Ga52L4CpFf22cxv6fws^ma=KG?212=Y!jNISS!|Lb
z8x-AF@-9``d4}WvRUse6F;u?af>!%@o4iZ?vntS;&zJdPe1dH#04YD93A44o-MpfD
zP{rn_aq>U%RDvC2+bp;xPlsOzauIi3*Lf42`jVKK<K1>ZCRuKdYhi>FDuL<yU&41y
zW;YPPNe&8L>2l=v{$BCN#<T4EqS^BZve&iW4$t~r2^LU29B#Olvb3z==K0V~Xm$T^
zTEZQM|D{j?1st_dU8g^<gdhmv$NdVXjg~&|9i!p3%#sZ~>Q6796s%r-AG$Q^t(3c@
zD?w0UhYr11@feiyl9kY_@H8~|xlmO<8PfQmj1!$@WieW@VxR@Psx<u&LsG06B}sH+
zIY;3Fh+6GQ0@)pP#J1>fe-v9WCi1+f>F4VL?0O~K7T?m4-u|pSkBpUJZZe*16_wAp
zSYZ@;k`3;W3UHKUWc8QeI}0jH5Ly=cGWQPw(Kr2fm=-5L(d`lcXofy8tJY3@Tuadz
zYWXR{mW7XT!RF#RVCe%}=tM*O6!AD3^(!8un~opNI%Uko7$5t@<8+?<JTSi(aPRTt
z&Ml{N#KaBO+?nu~`4Q07^34=s`MzQHq<x4YOM_H9N_$hsJ2<doMH*MCk}b~+4UINa
zTwL7@3kg)_0*#Q$wrCkv#2-q6kYzsssFc?p^mKPeVprz0gBMiUOMbNTyj3-qRER>;
zTxDys(MyyGsUjtSu9$+|_-t!U3fVb1dkK?l`17<+jfl=hrBHnDSV>^R1=TnQeyqbW
z>ov#l%!1|S!1>8UUxIdhQq`_klcHVx0{?#>K3#$4GlXncwldt!g17TcvKq-jo_996
z>oA=tH9CqRl6Yw?Uc`am!V?lHJbizOJaVaScf1U<ZufBT_PTSYy1Kz}Ee^oj{1{Jb
zPK-`C@vPnz@)Il&x&GuPat%?B<3q8e7{hr^F{nmmxEn(YMpk7=cxlRqBY%WZC*EF)
zEGiQ`?WYSV^pF##Wu_nvJYxLapR?xP7cc)>P5e7Dbgabq=b!B~T&_F6?ooU>w%x0A
zH~&MHJ=q`fCH{U<7MDXE4SD32cDZA)WJeWkllJ`UspWaS#eDe^kg^oU_A14UE9<xI
zdNHNo$`_W}M5oe9+e37~{LsytH8U$kdU4k6Wd+jywdyHFMcdx1=~?-!S7R)G4c`N&
zcWK7v+_<;uHA$qDdzdA<PssWlx07Z!S%-(-yIKQguM<#>zG-a^g{xaXf$})Wik>gT
zl#dkzGr(;h0JZDuFn(+k8wNq?PZ5grQ<+sM?wBGt@JnH6v0#or-5wBQWKU~(S_&GT
zkE!tc*ZJ1Y&*p(xX84POb3cClR<n^{&58_5a3*@tLK%RDE@eA8<N0urSMl|?a*z{{
z|I<QGAb!#~MTWAsI&lS{s3^f%*Cq>Md!^qJ#CAZfIepEj-<`VURS_yCz0(?*Ixcj4
z-!zV1_QZhpm=0<;*(nm+F>T=)o?ep@CK5I%g^VAA+RB25ab?7)A~z~egru=I1S|@v
zH7tXV!0wmGS^qj#e+MY;C5eUjEAp$Y?LDkS^QPZ}8WN85?r$u<-Epi;yZ1|J2J`se
z$D6DpH~2F=eI0B&=UFAUnJvZAmClJlK)sutJ?M>xpZiWV&0=G4MZP+x+p>EX=HbCz
zxls%Mw?*u^;LbHWIWCyq+yi)`GmFn9J112CZda_u@YIP%i;srFg_paU02Ifij*7}l
z&CF-<n3E$I?p-QiuZUl*H!IK>(3|>*a|+vbNR`^RP=9G?ymEJ0Z~)d&c*UE$UMepZ
zcITr{0WqhxkjUnM15js_gW=e3Uh|y6ZReaXHIz-=p`x5VvB&rH9y>Amv@^WmXFEw)
zQXYrk3feir=a{jM<dzdx8r#aUNr$FCI&rFeKEw-pN(>Q+wDIkkFnZ$k{sJakHn*?u
za%4b!00ev8NVLM1TY=cl?KB&55BY_MU-sg?c>=Dbz_W{(Z~c?HJi*XpYL)C6Bd8WH
zt+v-#0&o~@t4qESi*)+eW%@VD0|o^yF)n0hM<gQ+PU=IAxwIpk4S6|%K*&)iTbk{k
z!-s&ZD6V|2;CyHbFJb|~GXX8L?gXzy++xyz?IYV^U-~qRctg`i7PNG+E%K%rj1#lA
zgkh1rUqL7W9)e)2c+*k8wKU)vO;P^cyrum5UZ1j=T*)ids=e&KO0D*dbQW0q)Dh%0
zC(oDisK58>E$UtXF$*Lvh}7sso{`|pn*JDIy5^Fm3s$5*zEE=?u5<=l8FJc3r%+H}
zdfoNl2J0^~!-*mOL5o-x32|e0Im*E!yY7F7E5N)W3>+v_LBydlEx?4$RL5f2oYRD#
zaR0wv(-p~wO0eLDl3K=%`{5+0Gd$ktO=W)gWlGZJ0`K<b1^|j#Ha<G@=c9EaeXzf;
z&txt4&rY=X-H_Lvj5eR1K_rweFrPjPyTDZN(Ek|onY%1(4Crv_P7LnIj49do8Wd;~
zCSHo|+D@^-(re?Y49f$%^lZgf&fe1}InrGRWcnc_=giCTrGmDRo?m;MF<+&2oxsg>
z$_RNA=ckrfa;H0KA~dR^p&#0(p-{x$&=IACIfoAR<Nvvu>!za)F-^da-t3#0Dycnp
zwO~NVXwXCl;jE<}>%@<pC<8PLRXJuvO4y>xz|=8fIJAB?>+E{7)|4l${4ngA3G|=r
z2Dyv;VVWSgZx9Wj>qUjleGl3Ei9K4>h!(lPS%8VOG>Xu0%6VDz^O=bjJmuP7>DeUv
zrbI}MlHB^^d?{zv6d=@_ZD2lg1&G7UjnVN{1}9WkaM3H~btX0GtSzB+tZ^qRgWo4m
z!GmimlG$=wgXCnr6j@m<1gAL46#T~5<Pf8lV3fF#Ppu>Bnm=2{^@>|t&`9mkEPddj
zAvG~@Tv~TAm2i%VW}R-g(Z0)z-Y|szHr@rk>4MAyG*Ma*7Yh#H7(!-5>DZ@8r;_dx
z{prSe<>~099F8vsYd2xff7uAS%7{S)f(|@me3t2$iy&NEc7OUEchp@<t56P}GC(ea
zcuDo~%vU?vTqAIp#flPNMI2`5;t8&98^HQwmsAsoPNK_fx^Bggb1WVe*N(RqH8#4x
zCQN_rN*!W39u4A)O3%B(eX4-oO~1PlFj1j%A~@Wj>9A|X;;IA>8!oX+y(BKJ$EzV*
znR$z;!L$s7uy@{OT~nG#B!NRraT8(X##Ho!0r_o@gg0CA-9H^;-uE&?$2$nHv_00o
z%cbuUc-tCx$Uh&EZ4Nf4Zgqv)Y6>usG3>GeQnxx_Z6+PcbX-+ysbt1hQ`K1LDpOE?
zrAhIZhSN9yVIAOa22gn577tbc&i3|3V8NWy&!tw##`}9*x}gtI^h1DzZRA>UuaJG)
zaZ7j)dq!O}{?#8Y7~7i6fHh4{`<bqO>pL?>-18|p!S75Y#^DM>-S3)vuZG+Q7l@ek
zQP~#cBpWgg#mApc_sPYjpw8odQuRokmTkzcNl`^CcKB7e&;zViV;{Y{o^Y$%7i0m#
z62%#1Lq!RC?}lK>%mp}T!3Xv;L*0v*>USLm``N%>w>@fwC+#T&Tx2bN4w(20JB}oU
zuSa6v^kXi0x<PZ)*(STjdw~o@Rw-Or2Ax?U!if0^R7qyS!JDYN`R9pBUrvi5{Lgqu
za0I&Zd*A54UPC}|lJz-2f!1VYM+A-ElFU{V`W)8LtCk5Mx|)Il)$QH*gA63%JZFt}
zGq@eFEWXJ~R>Ps?pbaOHnyiqq6By1EZY9OZ^^QA>{q-Hsd&m`pbQ%8121aWG-F5xf
zlZ%;B{;C>X19|`^_?dVyCq>n+41w7|!tUS!{9rHlbhX=SZO5CQ^;!Du_E7*`GiR^Q
w)2!4MKjfSAeN<ek%udeE*tim(T?PyxRjZ^lIknLzS6Fbvpe_110000002i;2E&u=k

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.webp
new file mode 100644
index 0000000000000000000000000000000000000000..aa7d6427e6fa1074b79ccd52ef67ac15c5637e85
GIT binary patch
literal 3844
zcmV+f5Bu;^Nk&He4gdgGMM6+kP&il$0000G0002L006%L06|PpNQVLd01cqCZJQ!l
zdEc+9kGs3OD-bz^9uc|AA8?1rA#x4f-93WH-QAt;uJ6U6Yp<>o!9>IaV6aUZ*?W>}
zs4%E?srLW`CJh0GCIK@hTkrW7A15Iu<z{NI>%N<!nR<>&?Q^$0+!{Tv&|t^Y@u%!L
zglTg&?Q5q#ijZ;&HBQ?FNPp;k3J5!&{^+SGq<pNwB|u%pA^-t3!%mrgTx*^S#Zw_4
ziE?C>?AX~SiOM9jJMRpyP?RCr@z38AQyy&WRMaC;n4una$~nJKSp?q|s8F00c9?Q!
zY_ovvjTFm+DeQM^LXJ#v0}6HRt3R1%5PT*}W!k8BEM;Jrj8dIceFo2fhzTqaB3KKk
zGlCLI)gU25(#u6ch6GeB1k@eHq7l{EHXv0n6xE#ws#ri}08kkCf8hUt{|Ejb`2YW*
zvg}0nSSX1m=76s?sZhRY$K=3dpJ+y*eDULGnL2}4>4nvW^7_<~wIM_5fjvwt4h1|g
z)g0Z6ZFq9j<~9~b8((~TN{Z?ZQfw|is&Xp~AC61sj;xItKyCHdI|tCMC_LbXF>~vR
z=w6V3^H=W4CbAgR4#xw}ETTwu2guW~=Crl@SMXv85jQ=%y!s^?m4PI0My7MWICO;-
z175jm%&PcPWh8QdOU(#8bp4!N7ET-+)N}N2zk2)8ch|4Q&lPFNQgT-thu053`r*h3
z_8dI@G;`zn;lH$zX3RzIk`E8~`J=BBdR}qD%n@vVG1834)!pS1Y?zVkJGtsa(sB~y
zNfMYKsOJb%5J(0ivK8d+l2D2y&5X!cg3BG!AJ}910|_${nF}sC1QF^nLIhzXk-Y#x
z0)&1iK!O;Og0Ky!;`<M509H^qAWjSb01!F=odGJq0Kfn~F&2tLA|W9aSuDUH0|chv
z><Y$E!fyEcTj8Iz{FnTW`OLS!v-~mg2YDxGX7|*O>b~v%b$`S4E&fB)1NB4v@8wr(
z&+NX4e^&o)ecb=)dd~C!{(1e6t?&9j{l8%U*k4)?`(L<U4S(5x*nimdWB<W>3;Qjw
z#w7FS+U(94MaJKS!J9O8^$)36_J8;thW#2$y9i{bB{?M{QS_inZIJ!jwqAbfXYVd$
zQ5fC$6Nc9hFi8m^;oI-%C#BS|c8vy+@{jx6hFcf^_;2VRgkoN(0h!_VSGmgNPRsxI
z8$rTo0LaYq-H5i&<W~DEeA0J5Ejcr=NoEuyBL(OVLAcB}X_8cB_uJ!s7cp0dRqVBe
zsUE`ZT_vw`#PhJ3GZL&MgceBX?CZld6L?=CALkxMG)wd*K}0qB5G);flh~+*<#sdk
zHVpiyxmjf=)gVwD(Othch%-?7mJ-JFN@GgN5H*j<vXzv;;EgH@{<`xp`bGWxdTuF9
zVfPw2|Mb0|{SR@<coJRz*Ldo7C8_WV2F~CA|MCG$;<8+wMv2K&bEOiLe$h{|mYTns
zmq|q&A*1?q+ixKWAASoVH!ZEVh`i*LG6iiJkbnUG@aX^m02AN;)E{3iDq9o+QQz{^
zE>gtj81=&xU?H-Y2==G@uQV7E`@+2E9XQW@{&j`?EOktk|Ho{HU>ZqDzvgjwBmdex
z&uZNd2C1h{{}2k6Ys9$*nFP3;K%u!MhW`uZy7Sn`1M1zs@Es&;z*Z>Gsh@-3Fe6pE
zQD2@cqF((NrRevgvLsvM_8;;iNyJ5nyPyy?e!kvKjGj`6diRFBEe49Oa7wwkJFV7Z
z$YT&DWloYu-H?3<0BKn9L&JYDT-SK~*6c5pi18P26$JESKRYj{T7Zk6KiRJcbv<El
z9J+CwC&)JZ>OO*{P56Q6s8msbeI3>|j>K9}Q9UBeq*inXKemCm`-<5|-$ZyN4u$(3
z&HcvqehFD%5Yrmykg-^d`=BSa8(i=>ZoC77^mWY{evp(km@aHqhUECBz76YiR+VYK
zY_avFC~V3$=`6C4JhfHAQ@DZtUOwH`L;oYX6zK0-uI^?hS$ALfq}A7evR;ohJHij}
zHSZdW?<e{2-WHa_?U=it9}&7kqMpjq1mSDIef>EKv9U1s4oD*<(0oQ*;MaQ6@cvGL
zuHCPgm_NhVsgp^sfr*ia^Db}swo1?O(_Q2)y+S$CBm+g=9wCOUPbz(x)_GbaKa@A7
zuI&!ynLiZRT#V%_y_-D`0Z5lT*auoe{(U5NylTzFSJW()W-#F6*&A`LNO1bV#Y;QJ
z<awv-I3PIiWGHhTy$}zF2Y)1sqQ<os%Ovgx8Kp1IIYp8yKG??*Ss|3D&_gso#&bcG
zAOx0jE$6M4Ta>SbLBnp|B^dtK|KIWC|No>JjWBWE@n7O)x{&^E(WMeMvp57#qA8m*
zeTow*U@_86B#Fm*rxyYu5<KF&LxRTn#b#-=V+wrM90aLp;^z%k__(dWQ)AGshK?G2
zG_7TEuE}qQ1p|pu9cXTCVY1=}eY&5#0^oi_6WJzXND#Il2{P2*Glja>PRWaWHx8y>
z*qmHEp(AMDl0v)ij(AY8fnH=~ZwwjVAbu*m5;xPf<qJX_d*%rb0I5H47@IVnb7S0o
zz2PY$`9p9<?MI}^fsvg}<5vnkl@iWSyJE|RKd<CD3n(U@+9y@s<I(?>idh@ov6d8g
zfJsi&!QyK53Es%sC39ts;54V68koALD4b|%tNHW0bIkZAJKa=W&FomJSEDT>W1xIX
z1x%Z>AvNIsSPLcn3RTcHXb@KB?cuM)=x6fcIx>&(GxqZ8w3p#jJ(GVgc*`c0HG}dv
zIop&Qim!K1NFwic%07KcjWgHBPUkq7f~lj;TPqVGTiT#cUeim>;nY`>h@a*S{qQex
zQ`z62WK|Mj)Y{tfF{;T4<U2X{`x?}US~MrE1C|_1&};NNy=Xd=->P;c8$Q|KU?Joh
zIk<oAxu7<8J8_((U}1AcLhLHd#;6?=ujo!ltdCtw#~hyreNq0TmvSJC6kvD&I97fd
znpE<a3v3nA{>A^z%X7z|r>4aTh@|StTi!-r1D!g=zb#3d#{{&K3CqE$Iz-UH<%37c
zRfkO`&uM%#AD3PHv`g5t0e^O%nVL0d{Xlx^EjEC3#skF@`zl-7PF^0oxW)1!C!JxR
zWvuAHH?)61FKA1QeT*_sY7;_Id#!GmV4n`<w=^Ck{Y6qCCnK=crd>MO{~sv}VLSK`
zXRw=Y=Clz*00B(5y^K;gCZMAzjT5+c3IC=)l(9VIDdatpxj3y89WwI|bH&$!ZEvp`
zPR!T@#!(|KfI-w?!&+7$N3F6>tD{YO4Qg$d_`nNEdfVCha9vaPn0jI0`)`@*72hq!
zp<q2y@kKfVrSfb}8vmw$SopDtXNL>U5ND^P*RoEkbD5o#az(-g=Y)L>HH>O<qeopz
zUN9W@%YIO|oPuhw|3vc#<KCMY=x6o1bq4B(<v$M-V#@J4x8rW0u2vp3d;J)Q>c%}$
zT3Rs_ih0;4+Lv4Y;@Iv(;fUbQ=i-G(#>vghec~*j(I#r|5mqFiJBpzi&hzEcD{u$<
zRsm0BVYn=pT;0>R(itW|*D&;O%bOc7et9ACaH#J>z3A<mlHC6`?wC3cPj=a+0L!KJ
z29dbN4hGxn(vG|*nDvH_Gu%A>1A~6fdP>pmbM%xzm4>|;c_?B+%sl;Qs2{t!60$^u
zH1t@9^6>;?!FuusnISi$f5CL&;z?EqJN$FBuWDA#D5`cy_UvCFIVvf{c?4N0teh;d
zET$7aVbj08KTQS!x?Nd1Is8q8qFzs}a=!@nJ;7FSfCY^T@D-gpw`w<6e#X3+;O}1h
z$%I!M)0bg|EKUA04Qjn@+x{Rj8vt6Wn!R|3A92z}^$KfF5(#CWr4y#~re1CN4i4w0
z#GsypBR<e;sgowNDv$gUgnDd>{xA3Er7sgAi(|}1-W?s~n$7?K|9WL8kpVfw-;#b9
z+mn;=e<xV2z&$aXbbB^9!5xN=DIomsyx0q9u03Cg{>p!162U5R>_t}fOt~tE?s#m(
zO-S$7>Ay6*hHdZ)7_oU915WYYCIX;hFI-U2EWYX!pllONr@Q--2o~`!<G<U!Wm!i6
zcOe$Xm6I0E(yJ$r-ME}i2`)znbXd1p52N%TOsuKK&9}G3_UznkOzVC5f5D;nCf)Z+
zj#uVX)+?#DL<kaNRk~0wN>isi6vTPLJ4@(|o=<RrQ3C!v$5WYUUCW7tGYI}Ga=@S6
z#oVDLA^DrRJ><U3UOnQXJ$?>%NHYjo0_S&q*UQIROw@*N-By@P<Aa>aQ&;YxFZ0aR
zX&}LeOEz);#m~Hwm^VAY8DK}b$F4bo{jMN?d!lxKPhNklzr^Cd`0f4oJr^z=I|l`*
zm8AHm*fPV`0=lF3Pnnp}&J0N1X@}-D94YvmUabFrLGSnTz7Mu^21F#O5tN#CuY9Vh
zUZBH=ez%h*wkf0hBtXJh1SN3d+IF{gzT7lp)j}n?03lt;XSQRAh7qd&v;RwTYDuQ#
zbI2*r<>?x-G0@hM{;%{VBD7nLKt~D`T~-HAt5;h%i0_=Ifs=yHma5dhJ+QMG?Ux(a
z|E?1CMy1!~oA`FP!k~iG=t&5#>bVdz=peT8HMB6Y)#7PpETtNryT^+Rv3vpJaF^zP
z{H}0-LyV9Fu21ID%wO9f1IKlFr1p4c{o-?03vyB-tr5duk^&L$;m_|f$vs`^Sl{j2
z95}oY{LlY+=ZS%J+tZoXCd0*sSU7w^gjovXn+g7uyra5{cU49@yHf#Z^Jl-$9cIfo
z+AJuxH$VLb=#+uBbVmUjn<pB8s2*J`I5CyYgqeYUoxo|zGhX;tyDo1a#27aF@cZj$
zgh*)qH$l}mt);}{RwPfX7p=vEVccsmWhYwNX6Is75w5D@Tj;I~X$WiCH;n&HX9}>x
zxb1pZ@-O9=AIk4@S)m6fJ2?{HrNYwwnL3a45muuNjr;6$O`bGEM0T4A2_S$t=86*-
zcO+0mywg*j<MP8}9*qyfJ7GqMnvW0dCHIXpIOyq&xVwY1Hj?9}nQ4)L0000000000
G0001O&w8c+

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp b/examples/llama.android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.webp
new file mode 100644
index 0000000000000000000000000000000000000000..9126ae37cbc3587421d6889eadd1d91fbf1994d4
GIT binary patch
literal 7778
zcmV-o9-ZM*Nk&Fm9smGWMM6+kP&il$0000G0002L006%L06|PpNM;KF009|=ZQC}G
z?WFVnhub3}`X3k)f7gJdHv?Xy!R81AlJ*B*AtF+%2T777MNUTbu9%sbnHg^^{r@jg
z*GbiFHdh@YCSU?QVcWL6ZMJROew>#A4mU}enR_!cGmIYQ;qwfchWtFEXL)AK%*;=j
znYne+hS4EMy3S)C*mZ1KI>!+)0V@9!N<vFAw%bSx)5&s%!VB9)5>6H$Y}~MJ{rYuf
zz^KljIWvFi<cP&X*lv%IdKPZD;Oa}RxZ=WXTQ_f5SBivP>-?#?V@LPR&c6Nn{!=XM
z>}-h$S76;$H{E{Y%@^zlmOl^efBwa%UU+jJD9UVukQ3ti_kH-?H*RC0?M1W%FCvMB
zM_+v6fk$6X2sx)-p~B3&Kl{nscK}pNLM*qjtpaf9>AU{-iPKQZR8yCg!TY}Qg*(;)
z)gdvCcB%kppZc$VdvsK@)3l1{&DG!d_6OHOS`y=ITLEVu`unSKA2E%JD*DVX{LJ}K
z9l>hMRDqxQh0lnpGHpVYneX}eA3Pt|2v%=q;rt)``R|#bDyB)OXY&vI_@|*}h}G?^
z@aZ4_!7cQPX`!fW_?{oT1NTwHs#l5L-0`E|y@48<3Q^HFf8=Idi<poq)!h6e-w-t>
zpJYD%1MkII!~|7I^WGo)IF=?{>ACnjJ_WUi39C}!Q{QnheVJqeKKqq5^o5CBde(g9
zvw$X6^jz_^E2$wSw4!q5*RG(C2_^XO$HBn_55vbl44OnTTRwRaePP0vo{K)U1#99&
z<>rq7V&V(<&@I%MFoN5zrY}sz=(*-L&}1QQ*a%`u25h{cFj===17eB_uGuzG&byQ<
zrm8BJZl4r_E$3k|Wo6FW0-6M7>qac5uFQsQcmkLWGfeH74S3Z_rJ!jgN++!@i=HW8
zkyjI(oPH-+-N#Qc^-mpNO`bc6r=2-<%&Wy5K1vfFJB(L_IkpS6fY^NmuL8qsgj>MD
zn~BHH9WM~32_3vd=W&B)k7F9q%stJx+b_L_X-4zr^LVUMCmyCTA3sWtkvsmME?Xiy
z?xOSfB=_$oY06~J-HcCq&)qcW{j;uP;?Dm}=hkq?zh&n!;m((-G-u_t|6x399Q;>A
zgNpxoJNj{u|MFDH7Rhq@FCAl0dE|ddnl!oh9{Lq?@JDoR6L<VshF8r0_5hVetvvR3
zUa9QP{tlg6#T|cqYLF{a{Z~(rG;8wQAGxkbcBg-f;&yT2caC>;C941IK`ISfdE$4S
zE0AUQ8+2|Ncl_q5QkSp#AODp~(^mfP&%Au@@|TBQwoP`UU+V{6u8|)6ZA{~uKmQ*M
zmrMTDU8S~8Eqi{^v0Ug&5Upcm#y7Z1(RbgZAG8jB$eRwCspQ)>5;U)oGZ&E5aeR*K
z8Yt`Y0$G))Yd(Y3KH}tA4`-_QmNke5hU_|nq=xtyjwW(_o<J(bXz&TLG*KqE+J2b|
zzGMf@yloAVGVyLu8$qUB0*aL7J!IELCX-VpLrK)~9;`MJCx<$?q(odYLqjiF1(aQ#
zL@ODYw5>?itz>B>WM&^63bNdQ)k@-IgDHW*RW$Xo9#R<IvZbwNj6)I=m!3rJ1R1ab
z2r2SX+N#$AB#3}6!qHGpW<lbPOR(BWoXkKL%kIL~nqp#++Ky;w$go6AM8rlKdq5Y2
z(2QEE+W<&V$_+GEA2Ij~w6?iAbps?Q2F=yh2@>zrTrCn7L2H{9Amq|qNg@#eZY=|P
zCoI?2s+L)zsM%WX(NbVEY^`C>lFjIBYmJ6@DKJ0ZT4&F&WHW!dwa%QzOG!?jY_2(S
zDcEzZbz*2Q!43|z))9yOP9X1Xt%DXzwY(3tl-TR=Qb_MbZYRrooh;dYYmS!U<ZgRO
zPVYNRQ_syhy#$k<o5k&9_8xKKcLFP4qp4@lDp|7eON3j=!K=ngvNK;As+}}?A#E=O
zoNvBGL+^hj&C*@-@GH2L%&xby`W!OyNy2U9;JIO(gR%4JUah41RARgoaLwm;(ad|F
z%xacy_j&lKc6#Zp9NA05srmrn7IN_DDAJr$pN|}*jW~LL_UB~W*EgSRr5B&8BjcrE
zSL&UF+sDEEL#oZWI%~cEfLcgL?yQ;STy6LL>_as1(=YVB?Q_A|tNu5Ut&_q3jbfDM
zoFxT^uEuH`nX3*sB%K?GuHUkweYReBwnHqh3P)~`+s3+Tj!rDA1e)8vuBv5J*IsxC
zkd^~b(aGzArj08{>cnzOuy04C+C`}gb|Yz-1avxeWzev3NzcHb<pG3uZzt6%N_M`H
z63Z^ZKqoGZc8Lo{3_x10h0fhGO0|hnn_f$^(nSX^2^uxdKSsxjo4qPli^yf&E(~ZT
z1mV|r$Sq=R+vNgcB?V-eJF|f%of#c231}t2Hhy-ks#-%;>z_&4W@QCr$z3~w=8Ua-
z`;vfG1~BP8CyLb=F7t1am~ph_#|O%$khSJ9%Vtcn)YmpgQxF?xM<vI^;GRAEI=6(o
z!@KAW9tUBYeDbWUR*=;{nzD_?0kAXj(FnJKLyxD@W^C=OI{Dn1XoVQOdR%qPoISf=
z9>^_Vb+5fnpB^W0I`f%X8gb9#X{Q-yJG0{Z56aWeI&zPxnf5pdJA38bM`cYnS#x)%
z`n1tFf$i)W-hGm(f9mde^=X@NcV_lFb=P`4&CI&H=IArijGwdCk&X@uQ$5xmj!~^?
z#$ROCI)V-~t%L%GS#wo@U27ddR`4`3)WoB{R-4snfNrfee|kI8^bu#yDgYqOwas9#
zmcb`3!kRJ`Cr=_tq)8aMt{aGtUZsqwVlj6DgCGre>AEt&x8H_in!x@uwgExIh|-mA
zjdaC(29~CTVSaaF7HPbql&*9Uo8P@f)>LqCXclr}peS7_1BQ28u9PO8Eq1@`l3q9o
zkfKCaO2?T?ZyA6loW<#9_c^O=m<&h}CA!ineAD@=(gbq`vyT|tiJ6#^B1$P;;qax`
z55k&Q?wEh#87niLo*+n4L@65J(Nz~=Ya%7^(miLb(E>A3B@|Jjl;FU&D>o|9#7PJH
z?|a<zSu;Ip07(%g)WPBHm#+z16D28}dg#ALW>go!o;WC^h=|T7PVBg(DAB}72cyUS
zb(f>Bwbr!F1eTCO5fpj<{PqhY5>143p?~5ZA5H40);=@M#MYvrB6gqHbU_!GSY??i
z%s=>-ciA4*zOOZHds0a(kWewZ4h(k8h(ua7HX)Au&mY~H8KY6(_cb$_<O0w_RIGh(
zj5b~uP$jJb+Xd>&fA@QjIW-*heP3%$d!m5^AdnT}`12qA^c@!g3DOwZ5WwE2?)-yU
z!)Vx#Mtxt?FzFTwK!77sy7)sMzUd->w4^bxtpM2j!b1<f<x~!bqtR&8*R*Y>pjgyk
zGKwWGeb4)^zjy{9Es&PU1}gwg?|J#L$KJB7ett9@4M%-nGtIQr0>Fl@8-yh`-+1ed
zS6r}(MeSvgSoFmH*_WPu@i?}!AB~2?;i&IxrkNg~cQ9Som98tcq)k^|eeER|Zl77t
za-TVUc;DNvzVXJ%w52+#weN?+;i#{f#!Oc&z?81*N>^e~ltRS%ZI@lR{rs()HmqG!
zx*}ZrI-EZ}ckJMiy>A^oofwDfC~IH)z8{VHKGT@#E5I(Ll&+MnMCl>~AV7+>Gi%mF
zkU1QlKASdR0B80!YhP<$Ywi0?W2Ux45oPfxv9QolWzJPD^weBfvo4<Lv~8xkBt=At
z1tlUBk`xLcfCSQM+v&`#3$kXW7iH=TEsRjnVxh%BfWeFBVy@2gLQEqHp@pGPNU;b4
zVK9rNold70VoXyCgwUc$LP9JwHn#Di7=vk2fj|g>SONxP3<lG-Vxd@6fLYWmG!qwA
zP&gpY5&!^@QvjU-D!>5106sAmh(e+vAs0GboFD@PvNs)jNPvarhW}0YliZEg{Gazv
z+JDIpoojRVPr<*C|BTq<`6ga{5q^8^!|0cxe=rZ!zxH3%f5ZO0cQ*Z<^$Yt2{|Ek0
zyT|*F+CO@K;(owBKtGg!S^xj-Z~rga2m6nxKl9J=fBSuNKW_dLKWhJKeg^-Xe`^1?
z`TyJj)8E!#>_3Y?uKrwqq3LJ#SGU>AzUO|6`nR^u&3FNN_j<GeeqH_3zoS&&2>GOc
zw)Nw`wr3yIKhgcee6IaN=ws>M{6677%)hPwx&HzC(f&u~&)6@b2kNRzBDQAP0*H73
zq%McOmRk{B3i47qRe=DA*$&odrbEJZ*pV9XXa&p@wlW~@Yfs>V{yiTtplMhgM*-Bz
zsSnlq&pG;z0OUN%$~$<ZO!D9T#`!1$`I`)uEDsTp3AbG(+{8$XAm|$7F$y3bNSK&o
zhMQ9>3=g1UF+G*>+17eRbBf3=y79J}KR8owon@$1Z7MIrvvWWH)34nK2SD)GsrJ{l
z1Cl#oVo3A8qY3e=aF)qzms~FG#2$LzT=gs&aVMOj>(%{y<&O0cG!nCiESl~x=^dF{
zKvj8F1K8Ng171wwM5Fh4KoQw`_c6#y$(5cAm7e}~nJ#A*fx+c9;y#&W!#VukR)ugk
zK<lHF5iU?+a7q%LIY(gu+6HC@fZla2JM0Ile!_1KZv9N%EWfH8UHOSr(*_6U#b-Cb
zai)>p3=+;Ut+IYn%m+r4d*<`L2h%aDnX5}^!5R|H;(34AoVWjRx(msBZvk;rCI*|~
zdOijqI@9Z{Vu!~jvHW{lBa$rnl4+!s_5sfK3bCGk-B%iDe&@-}<f8H?NUz%;&9H88
zKeI&VsF;x;0RI0CWD-A=n<aDIbr2zA<Y!3Wi(DHhnBH?R)$`P~*0>+%fOKU|(9?V1
zHE8&@<R$bW%n4d_;X)D(J`BN4--OoA!GW*A7BtPjaSmp`zgPw*Oe`>4z)Kx!RAvAs
z!Wic9=o#(bg?kc-G68-m(jZ`^=XGUXb)}t(%&~sjFnV^sEX%hSy6UKC4iOhgV=BHV
z2w`4g7Y=s#Vu2B_?#VQ|hP39@eArgfX>-0S+dd&^mx0*wp}>)x;c4RUgxz%;oNe?&
z-7-lJ@Y^2^C;=qJsxx5|xF)*pTGhch2B&kxtn;f!7=gznk}I3}Dh}(CoMX<eGe%cp
z=v9i^xLO*DOYAZWh--Ne8Y1JFpkNLk|K_#vEpqOoMnt%@<hp8sD_<1p5We4-TpTv=
z@dBVR@NqKZ79EWW+IW3m@25-^MwFGYc|3Iaf{t{r;5BIY87t(~JYkd-!RZM95t^|g
z07?EzPs4Z1gIL&LXZM}_wC~D}fm!$9AF#Z|NLd2|?&*W35Smz$R&Hh=C8hAKESEx;
z7UL1wsQ2@>gA5-p&kS2<sXj@I%7<}I553&2vzZWIw);>02!l?!fT3t|HG*rIP~mS*
z$Wjo}jq3}z$Qq!9yrtd3fM0N629ZM?<L02(oRsk|cKnS1tXi7sM+ObQ;AZLyiGDYy
z1RgK8pSjl}{cQh;nYY)=9K%s6{tG&%9FL;!g~bmGX~a4g!n&7zzE^gC-I1bT&W``}
z66$KuBZCs7b+dQQBIP@BJSdX=5219?|NB>LU$nv@Tv9b7I;D|;0H2dsA~g7Z7zp1|
zB)XmrkMgF6OQr|R)HHD^TE{Y#j!~SR?b`Xt3Qs`B+x<<kW!i9<O`?sx%JHr)b{N_2
zsIq=l(WQUySmI-3X^7>hxexYeAjMUWdZ-*n9%(1)Wb(n2U<><7&9dwGJmrob)4%H?
zlQ%z+L-^$dFhhH|@u$%97Qz?*Ynh2VG@q|?8vY&L74&fs&_b&3$x&Oyjl~LQDRRap
zJU4U*R+(2Dd!G+lh8!V{<r1^+GAeYtGH~*MH@9IPqULc;?zD%ZNz2PCP@GD{4SECK
zPY*^?z2ea0Y)plNuqxlsmeQ^&V)zAS)RXazR|EI17g$lgY~r6eW5A-QFMHbn4F^J8
zK?Z#1jQ&ia6vN5$+;lZLMvOdX!IncZ+^BZpbtA`^!X(k2teqsW>pT_UJn+^1Qg6$`
zqkNm(a#hWyc6SP+p5=C4HL8-m`pO`5o~`-LI?_h5CsH?F_%?nDodmz&pWR20WTpJE
z?N|wSzLjMUK8E)a2tI}Lf<e1!ycmj;OhldY>;+;*M|h3Y(U#>)g1>zk9|Hd}oZAa2
zLYBWBoSW!Ts!RwXr^8h+U*@{9{zqS^iH)O<vJb;bYH<NbE9~U+1jXCB%D6D6++2OF
zC8hT}ItR8a8Ks4QSsg8TAvp2qTg7+tOXd=rH`PP_B@#$Ony(BV|E}YZJ0sKl#WIN9
z;n_@S>p<;r`Uw~nc}<^$V~_i%$GFjaG?X1@E|M`h)nekvFKt`Dh-f>@|0-`Xoq)o`
zx;JmzDfOV9qCx|EVpogEe0LK~tGS?5$$L_i6P$P6wIsCQaP_;d{{N=iV@+8LI}o#(
zvo*Ejy=IIn{rdIQh1&q-{EuohpVOjJ^Q3lD*YTp37$^RRgn8ihpdu5{Ct%5-KO!VL
zcNB6dUajXI9jkm-P|i3~GB-A(X`P1Oqqb$tcku<Vg(&6)R*R}%pmBmf#me#Ed}K@H
z8>)UJw0w3GeUijb__#QT4j%64z%EeB7S?jlWwx_7&+EEvB|6N=kV}DwnyAlX=?j`)
zmU#!$*^@NIu#n_d7;WoJV@*Fbv9|yJO4;n|BNF2xy(54RyB>t~8lUOUW$&2%Nwi1y
zx6JxW88>U2$#qhl^6KUbtmg9}D0o5vYDT7kWJthLGkpGnN4T>{St^_EU>4;DmLF9o
zr|LqsA8_MoNLQ=}w?8u!ziSZ@PC#Y<#9uJFo-ozVo6D;<8j^1$c|qAE3ZTE5i~zmE
z$BU5lw6l=EWsg^y^;8>r9qH{xfL|~PZYK#md$zZ0?o11gV<*WSW~cgy2GYGQir%wf
zt4iW8D+;s*;RGrmd(-T<@2&j(Cb9xhV*l-x`TpK`xq|7p?5R%5*s!69?2c!cC*VY*
z2DE^9pvOPLU!1e}wA8S8opcTJ3`NB>hY=JQnL~QFXR4K8A$BqJnoEB$wn-%u@E6Mh
zCfMF4kusv3N!(aHC}4)Xs^xoOwXd%e^6pi5|DZo=Q25j+6HlJ^7FodH6y1bMROR^q
zGu6)fopS`h%Sw<;ZH%TEPf+#81-#_v+@8nlR0jLcIDKQtLleOC)6yLZgC!D9X3GgS
zohwU{v$jl=quD#Go^hB{`@Qw*a%`(^jyT~=q^bWgGzRj;|12J55HWdCWV}EB|K=%N
z3Nq-qxJJ`>^|1MNN+q}zTB&ooE3j==AgK@^UW<^oSbeALa2peF)Th6{@sj0KyMNHZ
zksk1+MXN2tv+22A%cQOGpS9)77(uP9mh+!5T5ERLvF@b}$+WvXM45Z?-kCa)fb~f1
znVbTD$Gx-0Zxc`0D@YgHakge6SL0H`-vN_x?AP0>iGH0_EE&=v83hMJgaKAI0jJXm
zVxVz;X<$v6WW7}fxROO7vr#YLP;;lij5VrX{;>7kK6TtOH&6|Ar^xo>00%+u$C4@#
z>!jOt6*3><171+WxoZnKDTzJtDRw+T030;yI}~uV@9fCnei^I*j>Bp&mzP2d=FPb_
zCM*l_+$LDR3B*a!A$g#>xsrZvw0lckxmMg>0aQd7tPyN=t{dgXb;Ie+T8{fZH=gdu
zM7Rg9c(kg(Jg0?ARRRl=AONFKrvFj)lTY$KfT%6^6s`mk*ABGhsce*LsoD>K{z_M2
ziPpnu+lw22PfF!CoId^6n*G4H(Ix+#+N{C(da7<o)nCVrQ%K)QqP`yFXo7PsA<-DU
zVMn^-y!SU^P0>t1BYMGEaE#PdpOLxsVD5riQXHp@OX;`S`8VnpM~)I920w~<3|mo0
zf8~Az`*?2?H&gZ&*K&bRkV@qzvMlRHXys8*Ze2+1c?5o!^+$&MHxB@4Ee5cke52R!
zmn7AZtY6ST%ixgU5)%$<dO~q_W%Rzmn(4tRfE<xMHx$P1`u}U6@H!GZ8tEEf&cv?)
z2u#O+2S1%b{)tq(t>%QcwHj7Es-Qu^kLAPwy%7pGBw_4Q9#da^W2$}axNHr03)_nw
z5?yuNmXrI5HgS46)c5&}B)Tts49oU92>3xBLLy}FMUW=84DQbVq^;7_e7|(Sdz|&J
z73N+M`rc2rt*oSWu#7S{*s~nH6HRHJS1SmzeXk|;CA)FI4bat3<%}nkB<VHA4gqfj
zl0c&fw1Dm2e6sUf&4R3pS7y>%;;?=F>B7ms9QSxv#@+69;@>QaR?RE<L$*e~^=r_E
zM6(YEnz4sUr&1M;q>YX4&)=itG>rM{<{A79Rmk)`5ON#GL`*KX%}Ihk3w(RtM-WLt
z?f&FLF}4N^yE!(pZ&Yj&Bc`~K0@4_}*0Om?wN|}4WJ>WL;G^H2*QpgEkGA~OET-Km
zkwz|5{6dnz1U<2Pe9DNL>3g5FEIvp1jzP&2<zv~g6q4yB4PSXe1Yq;eeDSaCI$tYe
zd<>K#z~j%g6!7B;^zF+o95?fV{3mnB8*RMhCDNp>Am-3e@jNfMj?jHV$MWjk!DDKP
zkAz$Y?Sr)!GUOX}qTQ5aMh|wq1uq}~joWyKl=b_LboM#w<m`%Ex?PAOCx}KyqH|0m
zMm>i{CMuz5x6BKlA<Gnnv$B=BB8%!h*H_i-Tweiu!rKyF(6w*ztog$E7?Dn;Fsr}3
zwL`Q@oV!vslT%h4VY@}nshA9|>-<piE(ABvkYO1QD9p$yEigj)f0Cj)(&2(rbxw!V
zM%K+Ek6bSac+S_7S3O;ceo@ZQD*wDR2Tdkd<OJ+c^*EYsqI1UL^Zaq0<O)p`PIMLK
z$1kyCgIO}nO`jTwAU=at!sp{m4~1u%tP8UWy5ibk$HVQF2OM{>qy++cM01D3b7`uD
z#l6M4pI;JCypO8<S|y?OHJ-^u$MQEUXk0j9S7^e0R+yzxu2rgvqnc)8!Jfj(0GJ|#
zfKI96iqjA9&64W)LsvsI)xDh5KN*z0vDJ-~+G=~=<hD=9tEx-(&J83f7aO9jLLwyc
z;)4VHlpQ`2zPH@0X%*RsWbnz+<jsLc$^=v`tAFMl7Ri{#5|T|4UeNV&U@X@+G+gki
zfR-9a$JT8f!5P4x41Tc%J^4K-;T$xK1`JU-Q{7rnzr@AVEUhJG=PT@Pep_x+ESPlz
z0tx?tzq#;5IlYwr`sZ)IA1-}@5w1dCdU(X7bVp3{CgA;vt3_>JZ6?U&wNxR!{4oB_
zlV!x9+-&Qy6{%MQ{~yoZGkKiTSC`YS_j22~G;xUV855g2&C(zm^V!(wpcm@zn{%!g
z4}JGo(s<W9*jHf`0Z`sZNImo*zS9^}e$Hhx6?SOff0@ASakX~#!(k|vo}w9fd(?cy
zwAK`)3tyun^cNZw)rZ*mX~fh|mazC{&Xr^!lQTy`eUQx>GZ1O~to-}le<P>Um<p!Q
z<gGQ5FG|(-vlFWdETkYksRqG0&L`FE-FQ8}8w0Km*&aVL&VPE3Z_R*=0!8ED0m=#v
zHm`a~(XYG#7=I=)B-;aP4B#qGPKdDR=l}rFl{hVhe};PI53gQSx3a&9v!900Va<9R
z={~tB8-KUBmq5Ncp~B2(Z_K}=b7a=UI4je&_uXB0(>Y2RIYtNPVDpE$%vda+HD#3m
z&VuXJ{BK&Qe+rBa7eq}Q(bq|tn(RrJAk|ztj2(i{d>nmQnM?;HF2k&9sA6up5tmjl
z7lySlzMbifH17-m-Lwa_F&e7nO<lMXsPt#CNgKF%HdwG@ztDK#niqC%M#bR!wQc6I
zA52LFM%an*93hR1a$6-Q5Y3MEutAX4S=G&3@BbBIaUu5=j(<^FKOPJ4u~mgGD`9GY
z#;IN>H?ESi3#ckR3tsM+jsck3`oG!uMS}|eAwVXv>}qxwq?QY%QJ0}r@^;fhuUA9W
z*BVl>TGo&N004@xSiwDUXUvp51sVmqO3m)=B55aPwf@0=e}cN+$-BdKxY`YrT_4)0
z_d10#i44Q*rFr<T(^i|y7FsZ?QiUH5fV)rQ^pCDAt`%;DE`N^_wDGgG|9V5D{T+0f
zLdvJGflLYa)DxONTTEv{RtDYn&LmiVPZ7_9xNeE>8MC>*)v$EJvz``(pb{e&*6k+b
zsMz%($|1+8hn8c2?P(l@;Rb&CsZeYoCI3?2!LqjbwPXW3z4G$Qfj=cT5Yb%vY0(AX
oeb?AaKtwrnc|$|zzw9vfv<y6>n^aJJ!zd)XFXqqy0000001=f@-~a#s

literal 0
HcmV?d00001

diff --git a/examples/llama.android/app/src/main/res/values/colors.xml b/examples/llama.android/app/src/main/res/values/colors.xml
new file mode 100644
index 0000000000000..ca1931bca99e3
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/colors.xml
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+    <color name="purple_200">#FFBB86FC</color>
+    <color name="purple_500">#FF6200EE</color>
+    <color name="purple_700">#FF3700B3</color>
+    <color name="teal_200">#FF03DAC5</color>
+    <color name="teal_700">#FF018786</color>
+    <color name="black">#FF000000</color>
+    <color name="white">#FFFFFFFF</color>
+</resources>
diff --git a/examples/llama.android/app/src/main/res/values/strings.xml b/examples/llama.android/app/src/main/res/values/strings.xml
new file mode 100644
index 0000000000000..7a9d314e2969b
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/strings.xml
@@ -0,0 +1,3 @@
+<resources>
+    <string name="app_name">LlamaAndroid</string>
+</resources>
diff --git a/examples/llama.android/app/src/main/res/values/themes.xml b/examples/llama.android/app/src/main/res/values/themes.xml
new file mode 100644
index 0000000000000..8a24fda56602c
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/values/themes.xml
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="utf-8"?>
+<resources>
+
+    <style name="Theme.LlamaAndroid" parent="android:Theme.Material.Light.NoActionBar" />
+</resources>
diff --git a/examples/llama.android/app/src/main/res/xml/backup_rules.xml b/examples/llama.android/app/src/main/res/xml/backup_rules.xml
new file mode 100644
index 0000000000000..148c18b6593d9
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/xml/backup_rules.xml
@@ -0,0 +1,13 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+   Sample backup rules file; uncomment and customize as necessary.
+   See https://developer.android.com/guide/topics/data/autobackup
+   for details.
+   Note: This file is ignored for devices older than API 31
+   See https://developer.android.com/about/versions/12/backup-restore
+-->
+<full-backup-content>
+    <!--
+   <include domain="sharedpref" path="."/>
+   <exclude domain="sharedpref" path="device.xml"/>
+-->
+</full-backup-content>
diff --git a/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml b/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
new file mode 100644
index 0000000000000..0c4f95cab9126
--- /dev/null
+++ b/examples/llama.android/app/src/main/res/xml/data_extraction_rules.xml
@@ -0,0 +1,19 @@
+<?xml version="1.0" encoding="utf-8"?><!--
+   Sample data extraction rules file; uncomment and customize as necessary.
+   See https://developer.android.com/about/versions/12/backup-restore#xml-changes
+   for details.
+-->
+<data-extraction-rules>
+    <cloud-backup>
+        <!-- TODO: Use <include> and <exclude> to control what is backed up.
+        <include .../>
+        <exclude .../>
+        -->
+    </cloud-backup>
+    <!--
+    <device-transfer>
+        <include .../>
+        <exclude .../>
+    </device-transfer>
+    -->
+</data-extraction-rules>
diff --git a/examples/llama.android/build.gradle.kts b/examples/llama.android/build.gradle.kts
new file mode 100644
index 0000000000000..50ebc821122f6
--- /dev/null
+++ b/examples/llama.android/build.gradle.kts
@@ -0,0 +1,5 @@
+// Top-level build file where you can add configuration options common to all sub-projects/modules.
+plugins {
+    id("com.android.application") version "8.2.0" apply false
+    id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+}
diff --git a/examples/llama.android/gradle.properties b/examples/llama.android/gradle.properties
new file mode 100644
index 0000000000000..2cbd6d19d3371
--- /dev/null
+++ b/examples/llama.android/gradle.properties
@@ -0,0 +1,23 @@
+# Project-wide Gradle settings.
+# IDE (e.g. Android Studio) users:
+# Gradle settings configured through the IDE *will override*
+# any settings specified in this file.
+# For more details on how to configure your build environment visit
+# http://www.gradle.org/docs/current/userguide/build_environment.html
+# Specifies the JVM arguments used for the daemon process.
+# The setting is particularly useful for tweaking memory settings.
+org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
+# When configured, Gradle will run in incubating parallel mode.
+# This option should only be used with decoupled projects. More details, visit
+# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
+# org.gradle.parallel=true
+# AndroidX package structure to make it clearer which packages are bundled with the
+# Android operating system, and which are packaged with your app's APK
+# https://developer.android.com/topic/libraries/support-library/androidx-rn
+android.useAndroidX=true
+# Kotlin code style for this project: "official" or "obsolete":
+kotlin.code.style=official
+# Enables namespacing of each library's R class so that its R class includes only the
+# resources declared in the library itself and none from the library's dependencies,
+# thereby reducing the size of the R class for that library
+android.nonTransitiveRClass=true
diff --git a/examples/llama.android/gradle/wrapper/gradle-wrapper.jar b/examples/llama.android/gradle/wrapper/gradle-wrapper.jar
new file mode 100644
index 0000000000000000000000000000000000000000..e708b1c023ec8b20f512888fe07c5bd3ff77bb8f
GIT binary patch
literal 59203
zcma&O1CT9Y(k9%tZQHhO+qUh#ZQHhO+qmuS+qP|E@9xZO?0h@l{(r>DQ>P;GjjD{w
zH}lENr;dU&FbEU?00aa80D$0M0RRB{U*7-#kbjS|qAG&4l5%47zyJ#WrfA#1$1Ctx
zf&Z_d{GW=lf^w2#qRJ|CvSJUi(^E3iv~=^Z(zH}F)3Z%V3`@+rNB7gT<C4E+e^X1+
z079LInxq~UYf-kNla?M9Qw5`wqM;O{-8tPk0sfaO{=LZmzBQ1)zwMpO|F66HKXsu0
zsblVBXkugf|5Qc(cU5;MLk9;_r~hk73h82L2Ot0dCNKa1{eNB}WN+`{?DBWLtf8fy
zvWuaUi>VU{Bb~90p|f+0(v;nz01EG7yDMX9@S~__vVgv%rS$+?IH+oZ03D5zYrv|^
zC1J)SruYHmCki$jLBlTaE5&dFG9-kq3!^i>^UQL`%gn6)jz54$WDmeYdsBE9;PqZ_
zoGd=P4+|(-u4U1dbAVQrFWoNgNd;0nrghPFbQrJctO>nwDdI`Q^i0XJDUYm|T|RWc
zZ3^Qgo_Qk$%Fvjj-G}1NB#ZJqIkh;kX%V{THPqOyiq)d)0+(r9o(qKlSp*hmK#iIY
zA^)Vr$-Hz<#SF=0@tL@;dCQsm`V9s1vYNq}K1B)!XSK?=I1)tX+bUV52$YQu*<ra!
z!v%7ZiKpO7g;NmE(;dSwu}#Qr14TWb<rzbgaS}{2FVDKaeCbmIt`T?_&=oa1ox)Gi
zqwS3lX?Fkmj%*6-JQ8ia`$(tFUJ#ol59+HHQxhli%Jb#vc@r`6ZP-EsfP2S!rwy#d
z;DP`C{cFdu2M4~`pHtE1KsUc1?BTR?&fOjbQh0|PcMiZgx_Kq$bLD%;`Ig2_LE~#`
zt32~lMNxY>0%fnWEukW>mxkz+%3-S!oguE8u#MGzST8_Dy^#U?fA@S#K$<FiPyhgu
zzq^L^|GyXf(+AWxl#$gjesG=F>S@9msUiX!gd_ow>08w5)nX{-KxqMOo7d?k2&?Vf
z&diGDtZr(0cwPe9z9FAUSD9KC)7(n^lMWuayCfxzy8EZsns%OEblHFSzP=cL6}?J|
z0U$H!4S_<U3#TVDkQ!s%Ox_BnFc2H6iNU0q=!|+Z9mk`Nbw?whndl6tI(Fj=$tl!^
zIOq<7BPlTvwG$fSu#@_%M(FvF2tpewu1-c35x~(IN{;#g5`-28n}V56vUKDyHalgc
zVFs4DD7(uszamXg!+b}p?!s)SZXGtIED*Jww1@^#7%op*kD~rw8S#!ebx(C|Oi-ci
zN~c@b8rVJSYHe*Cyn5uEa+-wenYQT6aAn!pd*%?%r>TVjj<`6dy^2j`V`)mC;cB%*
z8{>_%E1^FH!*{>4a7*C1v>~1*@TMcLK{7nEQ!_igZC}ikJ$*<$yHy>7)oy79A~#xE
zWavoJOIOC$5b6*q*F_qN1>2#MY)AXVyr$6x4b=$x^*aqF*L?vmj>Mgv+|ITnw_BoW
zO?jwHvNy^prH{9$rrik1#fhyU^MpFqF2fYEt(;4`Q&XWOGDH8k6<goGkkqwiLLr*g
z5z6x8$sF`?<e^h`j@COy$E+qY=Oj=v!b*KAnIYc*AP7qC0C#dwjl&srabEh<e-#E9
zv&sl%zw~Grb~0?}V1_A9--Q~b&NxawFDL54EJtT<qO4Z~5p7>M=%@fics4ajI;st#
zCU^r1CK&|jzUhRMv;+W~6N;u<;#DI6cCw-<Kad;hI?@=R3TXw!Cr}=BbI5m+uEl-w
zqErRfdJ>otsc@IsN3MoSD^O`eNflIoR~l4*&-%RBYk@gb^|-JXs&~KuSEmMxB}xSb
z@K76cXD=Y|=I&SNC2E+>Zg?R6E%DGCH5J1nU!A|@eX9oS(WPaMm==k2s_ueCqdZw|
z&hqHp)47`c{BgwgvY2{xz%OIkY1xDwkw!<0veB#yF4ZKJyabhyyVS`gZepcFIk%e2
zTcrmt2@-8`7i-@5Nz>oQWFuMC_KlroCl(PLSodswHqJ3fn<;gxg9=}~3x_L3P`9Sn
zChIf}8vCHvTriz~T2~FamRi?rh?>3bX1j}%bLH+uFX+p&+^aXbOK7clZxdU~6Uxgy
z8R=obwO4dL%pmVo*Ktf=lH6hnl<C=NTH%5`gfxBHGJj`iNz(A#<w&(&isR(NdjRau
zf|I4<<<u=|XXZxd2JFGs>z_5k3cG;m8lgaPp~?eD!Yn2kf)tU6PF{kLyn|oI@eQ`F
z3IF7~Blqg8-uwUuWZScRKn%c2_}dXB6Dx_&xR*n9M9LXasJhtZdr$vBY!rP{c@=)&
z#!?L$2UrkvClwQO>U*fSMs67oSj2mxiJ$t;E|>q%Kh_GzzWWO&3;ufU%2z%ucBU8H
z3WIwr$n)cfCXR&>tyB7BcSInK>=ByZA%;cVEJhcg<#6N{aZC4>K41XF>ZgjG`z_u&
zGY?;Ad?-sgiOnI`oppF1o1Gurqbi*;#x2>+SSV6|1^G@ooVy<y93LDm;wIOTj=5c$
zC-QhzvAl0y_;%{)gWRy`;Bf=?TtTe*SY!MP@9W|edu{l86Kr1<Cmr((Tem3gt4{#y
z3D<WVud@WzG8_>@fg?wyf@0Y!UZ4!}nGuLeC^l)6pwkh|oRY`s1Pm$>zZ3u-83T|9
z<QZfPf@8vDvzJ5u4A^A<#$7q&=f7lp3&n5W!oLxA;ja+?=SVAJ?`~&fZ)ozb9P1k`
z3pL1q5VB*z+Ct?<9|-*itS69vS4hVra5Z!lDKSySn;jjmUpRtte+Bax7QXjI?`90S
zA4?c)l!1W6+}k;06I}~wRC@!%R<xI9L>GaKJIV3_x+u1>cRibsaJpJqhcm%?0-L;2
zitBrdRxNmb0OO2J%Y&Ym(6*`_P3&&5Bw157{o7LFguvxC$4&zTy#U=W*l&(Q2MNO}
zfaUwYm{<Z#B+303&8&j|1AYBZQ1ef~@-GzfzfBY|H8XUzarxJ|f|I?ulc}?_jHR=S
zshz3QKN3ud>XtILD$3864IA_nn34oVa_g^FRuHL5wdUd)+W-p-iWCKe8m_cMHk+=?
zeKX)M?Dt(|{r5t7IenkAXo%&EXIb-i^w+0CX0D=xApC=|Xy(`xy+QG^UyF<x+1@YN
z-ZMAue9y-N{P;V-w=mmGh-1)C76Xu!U?t<}ByuZz$q|bl4S<l@37Jh~Glu1WLl}(l
zthb2~6ne4Q83FI<<Ay4c4`8D(It&am2#(!Ony)app0o62Q+)KCL_LQOA)tF@50mMJ
z<<yYk$({rlv3BkbtJ+SuacOk#dTZz@Qrop4gQ~UO=spb=-piAtn0x3U*bMbJdwI>e
z+#J6h_&T5i#sV)hj3D4WN%z;2+jJcZxcI3*CHXGmOF3^)JD5j&wfX)e?-|V0GPuA+
zQFot%aEqGNJJHn$!_}#PaAvQ^{3-Ye7b}rWwrUmX53(|~i0v{}G_sI9uDch_brX&6
zWl5Ndj-AYg(W9CGfQf<6!YmY>Ey)+uYd<oGrQ)76s0B|$EDBcWW*bGf!XzhAcke;%
zrh(E+RSxuWf~{)Nc=B!JS!X`$2DA4*+k72=D%60mnI%QDRarAyE}--rmK`Z^V+Yae
z&vDIckd3^cYT8wuAh(uIoqyu^LYf=&(`lZI>_J<fvopl!qQMd6Z#ZYajSPWAp}`qN
zjM~mJhn3N452gfOT)3x7am8MT`AA+VSSTZ}v@@8Ef-MrsX1vV|`Qx}Yh}~|pD;qlf
zbIL{96T!~st%_2?)f30K4Wp&xR_;627NS?#a?ONO1)C^TbaxNEe^u!aMYPy7;-Hq3
z8kFD}u19!l+a?p#Hl9<7(I=E6f85Gdi%U@U>NXH=>|`OH-CDCmcH(0%iD_aLlNHKH
z7bcW-^5+QV$jK?R*)wZ>r9t}loM@XN&M-Pw=F#xn(;u3!(3SXXY^@=aoj70;_=QE9
zGghsG3e<X)8z!NOUQ-K7Tg*M~$*Lq1YL~>kq#N||u{4We_25U=y#T*S{4I{++Ku)>
zQ!DZW;pVcn>b;&g2;YE#+V`v*Bl&Y-i@X6D*OpNA{G@JAXho&aOk(_j^weW{#3X5Y
z%$q_wpb07EYPdmyH(1^09i$ca<gJ(BN8tBcV!2)N5jxRqNZX(-f?Oe`25b>{O<}7)
zRWncXdSPgBE%BM#by!E>tdnc$8RwUJg1*x($6$}ae$e9Knj8gvVZe#bLi!<+&BkFj
zg@nOpDneyc+hU9P-;jmOSMN|*H#>^Ez#?;%C3hg_65leSUm;iz)UkW)jX#p)e&S&M
z1|a?wDzV5NVnlhRBCd_;F87wp>6c<&nkgvC+!@KGiIqWY4l}=&1w7|r6{oBN8xyzh
zG$b#2=RJp_iq6)#t5%yLkKx(0@D=C3w+oiXtSuaQ%I1WIb-eiE$d~!)b@|4XLy!CZ
z9p=t=%3ad@Ep+<9003D2KZ5VyP~_n$=;~r&YUg5UZ0KVD&tR1DHy9x)qWtKJp#Kq#
zP*8p#W(8JJ_*h_3W}FlvRam?<4Z+<U)YVE|@-H{P0L;lgD=4M+K83d`J=;XZ6cLXJ
z^Pb^7ai96hX*$t^`}X?;F@T^K^_U|s%%#VBR44vL@CQr;#z>-H77^$Lvi+#vmhL9J
zJ<1SV45xi;SrO2f=-OB(7#iNA5)x1uNC-yNxUw|!00vcW2PufRm>e~<GrW@0xg&;k
zL?iysf{CN}V{o=!B5cYHv{+Y1t=R-R>toH;M0Q85MQLWd?3O{i8H+5VkR@l9Dg-ma
ze2fZ%>G(u5(k9EHj2L6!;(KZ8%8|*-1V|B#EagbF(rc+5iL_5;Eu)L4Z-V;0HfK4d
z*{utLse_rvHZeQ>V5H=f78M3Ntg1BPxFCVD{HbNA6?9*^YIq;B-DJd{Ca2L#)qWP?
zvX^NhFmX?CTWw&Ns}lgs;r3i+Bq@y}Ul+U%pzOS0Fcv9~aB(0!>GT0)NO?p=25LjN
z2bh>6RhgqD7bQj#k-KOm@JLgMa6>%-ok1WpOe<hhSU`7#vu9^7%H57&(`pARdXnZM
z;eION-Jf1cVd1-MCh?1mFC&L}n&D)2PEzUHkSiE_t+j1!E~K#qcYI-`hc5a+<MQay
z<1}>)FS^XOU{c?d5shG(lIn3GiVBxmg`u%-j=)^v&pX1JecJics3&jvPI)mDut52?
z3jEA)DM%}BYbxxKrizVYwq?(P&19EXlwD9^-6J+4!}9{ywR9Gk42jjAURAF&EO|~N
z)?s>$Da@ikI4|^z0e{r`J8zIs>SpM~Vn^{3fArRu;?+43>lD+^XtUcY1HidJwnR6+
z!;oG2=B6Z_=M%*{z-RaHc(n|1RTKQdNjjV!Pn9lFt^4w|AeN06*j}Zyhq<HLE#a+A
zJ{@Y`FHfEI5bv;8o$ZEGCqCOp4`6-5eeTUyKzzSKd~k>Z^!-=cyGP_ShV1rGxkx8t
zB;8`h!S{LD%ot``700d0@Grql(DTt4Awgmi+Yr0@#jbe=2#UkK%rv=OLqF)9D7D1j
z!~McAwMYkeaL$~kI~90)5vBhBzWYc3Cj1WI0RS`z000R8-@ET0dA~*r(gS<U)h`aD
zLE6Dn5Q+3O*c2B9elF29G1VrakVK+eBtDr^Gf}-iF&NZnZ$t<sha>iCJmQMN&4%1D
zyVNf0?}sBH8zNbBLn>~(W{d3%@kL_eQ6jEcR{l><V*L*c8i9aN^J3k5@LC4D{kHc*
ztiPGM*MYdYK5-%KXLXp`;*IAihHn*4Pn*cYN8uZA$oHJE-8(VkYwPa2amx3wu)oxf
z;@K?9yEDA%g1*No{aF{@JJ45(#kW@dSB%B?ig4GNH`G^-kpE2q*E={oZNZKA>C|JK
z(R-fA!z|TTRG40|zv}7E@PqCAXP3n`;%|SCQ|ZS%ym$I{`}t3KPL&^l5`3>yah4*6
zifO#{VNz3)?<E9P2g)v<M^QTEwJXCARf%0EK}z-2B8QN$*7-J-mD@w9d6fzs+FVv4
zrJ6y#^xsh;O&ToHBFf~MC54(~Sb2j}sCotWQt($NS~kJz72WH?<eo8;ekt}!o;CT~
zFlF~#@UZt@DeNAn7u||SULov|N<ot`RP2<Gm?2rbIJ;;m_J~Yu)ZL<c+N#@?_iQO`
zRo%izIfJ9RLf%!Awp2T%_jV~S<vv}L?&aO;G&bt(!h60lS`2vX)v@WhGFXzOtRK;c
zyr@kt8OV3(72@wS&Pz;*6SWo#Z2jYhpl6$h+sV9U!&ep>ZL$be;NEaAk9b#{tV?V7
zP|wf5YA*1;s<)9A4~l3BHzG&HH`1xNr#%){4xZ!jq%o=7nN*wMuXlFV{HaiQLJ`5G
zBhDi#D(m`Q1pLh@Tq+L;OwuC52RdW7b8}~60WCOK5iYMUad9}7aWBuILb({5=z~YF
zt<!zJa=c$qO%0sJrsowK;!t%<S=X7Y&Jxn+nGDY&f(=_-CL}c~4K|D7ux0QbLt&^f
zSr24bBRsz?=*UjQu7gZoZ09|ewuhGV`hI~v^swhOT82+78;Tj2xT>?*Jr<A3W3T-3
z<1&Nz$ui_+E7n$IbPK2Rrimn|(;7EPx1h6Fi)-3?>5NG+WadM{mDL>GyiByCuR)hd
zA=HM?J6l1Xv0Dl+LW@w$OTcEoOda^nFCw*Sy^I@$sSuneMl{4ys)|RY#9&NxW4S)9
zq|%83IpslTLoz~&vTo!Ga@?rj_kw{|k{nv+w&Ku?fyk4Ki4I?);M|5Axm)t+BaE)D
zm(`AQ#k^DWrjbuXoJf2{Aj<e%564WZLSS(Vd#ROb?}iRVX%`%#SRY^TSh{u=kI84e
z3Ow&PB(uhOFrXgEsH|%j)I-Ug(1-{oj6dcgiDH#{Xclz$DM?xphY_S^6Z_M-j<Q{M
z3m5`Vg`@HsKN#jbtysG34Af*<`=E}BAe7{&&983YEEQfM{J8#P!otJO%!Ug4=|diY
zavSVyOcEEa#lU9-1SEeUY5cJ=<Ds##5c*EUsRi@fN(`2}Y5bh+k`b9uFL%rwlbF%M
zdSW_2pP9rtReb`cMyUFd*B2fwpi645^+_wI6EzB|mef<{4KFd8`yOM9scKF$vrUM|
zCoNlGyffVj5PnZZrY&Y5n|uW=PHnHX<YXTSdl@8HE*;J7fEIy#D!|gwR}m6V5SwWq
z(a}Z41Prc!Wk&AE!j3s3VBd*3Lw?gbM3E&oKBI9kltN8F1T6wcC{`?P4s@QUU>^KT
zFb1zMSqxq|vceV+Mf-)$oPflsO$@*A0n0Z!R{&(xh8s}=;t(lIy<Pl3uQSpJ4S8Y>
zv$S8x>m;vQNHuRzoaOo?eiWFe{0;$s`Bc+Osz~}Van${u;g(su`3lJ^TEfo~nERfP
z)?aFzpDgnLYiER<rBlW-W;>sKPu<X=38r|I{0&8J8r0^`tKAAKA8jUsvBhopXc14T
zgUW&TYxF3i;mZl;v5E+2qh>|0tq4l2wT)Atr6Qb%m-AUn6HnCue*yWICp7TjW$@sO
zm5rm4aTcPQ(rfi7a`xP7cKCFrJD}*&_~xgLyr^-bmsL}y;A5P|al8J3WUoBSjqu%v
zxC;mK!g(7r6RRJ852Z~feoC&sD3(6}^5-uLK8o)9{8L_%%rItZK9C){UxB|;G>JbP
zsRRtS4-3B*5c+K2kvmgZK8472%l>3cntWUO<l-B0`McQr%h#X$@@JrQb8#@ppQUG9
z)`zI{d#E;id*aB>VHxB|{Ay~aOg5RN<wtviA9uMGSHMzxbdkCR`SfK34yQ^bEpjN~
zPf^Bz$`=daT8!L25oqT&4i=+dRE^=E#-D#x^n<}e%YqXyp_!ut)H)b@E0UPjTon!Z
zEyxi=e>;{PJgeVD*H%ac+y!h#wi%o2bF2Ca8IyMyH{>4#{E_8u^@+l-+n=V}Sq?$O
z{091@v%Bd*<G_U#2l9N@Y9~(ygCl-h&9i8Yj9_3cV?DM!lxf11kY_LYnfWL$a@7F3
zk0$$mP<ka_iUf&lMdUX}1{nJ-EWCBw#je~5Smwg)p`tEv>3pk0^2UtiF9Z+(a@wy6
zUdw8J*ze$K#=$48IBi1U%;hmhO>lu!uU;+RS}p&6@rQila7WftH->*A4=5W|Fmtze
z)7E}jh@cbmr9iup^i%*(uF%LG&!+Fyl@LFA-}Ca#bxRfDJAiR2dt6644TaYw1Ma79
zt8&DYj31j^5WPNf5P&{)J?WlCe@<3u^78wnd(Ja4^a>{^Tw}W>|Cjt^If|7l^l)^Q
zbz|7~CF(k_9~n|h;ysZ+jHzkXf<NcXr7lv4BLC0%brrpzF{-obsvQr%!_>(*O*@5m
zLzUmbHp=x!Q|!9NVXyipZ3)^GuIG$k;D)EK!a5=8MFLI_lpf`HPKl=-Ww%z8H_0$j
ztJ||IfFG1lE9nmQ<au0Ml%cF(=R}3nZqu!gn~ulcq{4`|I}2tuFXsdOn=eq9Qz{LW
zP^cfInuhA7QFIUcH%13E<=O+Vy|WwD8Y7tHV5z9{b#53O<eDk6+LD+O<*lHC*Sh-<
zQQ*d{QylAFdIrjkp%D|@{O_AHu^XIETj$pt=r^Iy${cPPEHt@N92tqR6yn>0+jPQy
zCBdKkjArH@K7jVcMNz);Q(Q^R{d5G?-kk;Uu_IXSyWB)~KGIizZL(^&qF;|1PI7!E
zTP`%l)gpX|OFn&)M%txpQ2F!hdA~hX1Cm5)IrdljqzRg!f{mN%G~H1&oqe`5eJCIF
zHdD7O;AX-{XEV(a`gBFJ9ews#CVS2y!&>Cm_dm3C8*n3MA*e67(WC?uP@8TXuMroq
z{#<dYp{{+xCUwQw6)w9(r^`^0<F6$P{m>w$%z@CBIkRM7?}Xib+>hRjy?%G!fiw8!
z8(gB+8J~KOU}yO7UGm&1g_MDJ$IXS!`+*b*QW2x)9>K<JF7b%^NJe_9Fs0_(5~kP{
zjLlh(Gb|iDD!%2k=};N1Ay-P7$Wu_KP|<Nv<px9DnR*Bhp`@^|=#?&n9#0)w-FDC^
zYYwq=gkfiuN_5CjVN9Ao)scz%pogt!6S`BY&a-LBZ0vR8Hs`uYWwn}-oY<_#H(3U4
z+C^OQlGq~g8s<Q&bZc>~Y*E&bYMnjl6h!{17_8d!%&9D`a7r&LKZjC<&XOvTRaKJ1
zUY@hl5^R&kZl3lU3njk`3dPzxj$2foOL26r(9zsVF3n_F#<NB6ex9*oBW}l?xfbxx
zL%E!#nXs+R%q=x9ZnSy_@kOHMCrx3@5&BuHZd_<OOwDuA?g-PK!m67qWTCZlm0wKU
zv#i7p-7isYt{yv)OR8I5o;!7xop*Zm6Rp~J_EdjQW~sUFuwoyaW1U&NaV|M4U1?nR
z@)EUG9kIA_lYfhiyvI%HWy;Ge%?fKvOXHCrMb%0xeab5(rJo188D&|PUo17GFJ{-P
zs?t#M)7w#0RHw{KaZQP;u<ZWKm`x0&oP9ITEjmgxyDYbPrK*U@Usmmw0c2<WSXv-G
zvb6FvJhqfc@v@g(G}B*mRq{2pTwn0X@yc3sxqx=5S4D&>v)s5vv3<FgSd?p+)!kNm
zTv$3*E<J6vu2~)lp%4nMdtWE@=#AViY%^W&8R2_<v(+Qa%Mf{$Tn~RH;PHc?^uW;B
zl33*<G2>@dgs|lP#eylq62{<-vczqP!RpVBTgI>@O6&sU>W|do17+#OzQ7o5A$ICH
z?GqwqnK^n2%LR;$^oZM;)+>$X3s2n}2jZ7CdWIW0lnGK-<n4*9i9?XprV!b-2yz1y
z82;&r=GhqHm&zBz3e~(LKRa(8ZwPt%3TYt5`IuWWTX+)iO!EB5qiR4W$RDCd`g{-j
zZuyRBvdZnk><gEOlAyZgBQiT7{>b#EG01)P@aU`pg}th&J-TrU`tIpb5t((0eu|!u
zQz+3ZiOQ^?RxxK4;zs=l8q!-n7X{@jSwK(iqNFiRColuEOg}!7cyZi`iBX4g1pNBj
zAPzL?P^Ljhn;1$r8?bc=#n|Ed7wB&oHcw()&*k#SS#h}jO?ZB246EGItsz*;^&tzp
zu^YJ0=lwsi`eP_pU8}6JA7MS<SY@ZXteVGu2D;{e_dFnzL(OQbWI#IL1J(@}vxKG~
z{>;9pfD;DsSsLo~ogzMNP70@@;Fm8f0^;>$Z>~<d_jp#pvX)$*IF^kRoeC(j+t)Ur
zPx*FO4h)ZG=iqY;QK6W@ekh%ILEiXbr05ou^~><kZopq=@-e)e2pdVj4ZJKNAd~~=
zIWD3^k4g6+ly88l1dUP7-<u$%X=hfWc{o8z>}GWRw!W5J3tNX*^2+1f3hz{~rIzJo
z6W%J(H!g-eI_J1>0juX$X4Cl6i+3wbc~k146UIX&G22}WE<iA2ePZPFMn_vlC3@`(
zC;LX5HFTJ@i$J`Hbx0sRaj^uiUmU^s{mchNU>>0ga#WLsn9tY(&29zBvH1$`iWtTe
zG2jYl@P!P)eb<5DsR72BdI7-zP&cZNI{7q3e@?N8IKc4DE#UVr->|-ryuJXk^u^>4
z$3wE~=q390;XuOQP~TNoDR?#|NSPJ%sTMInA6*rJ%go|=YjGe!B>z6u$IhgQSwoV*
zjy3F2#I>uK{42{&IqP59)Y(1*Z>>#W8rCf4_eVsH)`v!P#^;BgzKDR`ARGEZzkNX+
zJUQu=*-ol=Xqqt5=`=pA@BIn@6a9G8C{c&`i^(i+BxQO9?YZ3iu%$$da&Kb?2kCCo
zo7t$UpSFWqmydXf@l3bVJ=%K?SSw)|?srhJ-1ZdFu*5QhL$~-IQS!K1s@XzAtv6*Y
zl8@(5BlWYL<vlUo^g==w+bwzmrMIfe1OGTk=p=b_r!$VPWnqI4+K-Q0Kn&YY_INtV
ze5P(@!H@m0AM%dGQN5FSnZ`He_+j(1bNGu}ee2>t1yAWy?rMD&bwze8bC3-GfNH=p
zynNFCdxyX?K&G(ZZ)afguQ2|<I|ToEQs5dh02L^JB6>r;XoV^=^(;Cku#qYn4<V}b
zA&FlaPRwlltsTXe9~a58?uH)L#hLx*;|`>Lus`UeKt6rAlFo_rU`|Rq<F_mt<XG?>
z&G?~iWMB<P-m~>io<78of-2X(ZYHx~=U0Vz4btyXkctMKdc9UM!vYr~B-(>)(Hc|D
zMzkN4!PBg%tZoh+=Gba!0++d193gbMk2&krfDgcbx0jI92cq?FFESVg0D$>F+bil}
zY~$)|>1HZsX=5sAZ2WgPB5P=8X#TI+NQ(M~GqyVB53c6IdX=k>Wu@A0Svf5#?uHaF
zsYn|koIi3$(%GZ2+G+7Fv^lHTb#5b8sAHSTnL^qWZLM<(1|9|QFw9pnRU{svj}_Al
zL)b9>fN{QiA($8peNEJyy`(a{&uh-T4_kdZFIVsKKVM(?05}76E<BOuTKNZ>Ez?#W
za^fiZOAd14IJ4zLX-n7Lq0qlQ^lW8Cvz4U<X+>KkV9~P}>sq0?xD3vg+$4vLm~C(+
zM{-3Z#qnZ09bJ>}j?6ry^h+@PfaD7*jZxBEY4)UG&daWb??6)TP+|3#Z&?GL?<NWw
zpo!n^_Mnx`#l`Mtp<H71ndq}LH(ZXJlzYuNAHINUglm=WWXJLtuPRY2?rIWUs=h@;
z1r9;ol65y`xO*SzvRa8o!D+_s;~<Bb-h5u71zg5ymFjIboenFkEakiwkF#l0c-sut
znM=(poO3Xq&}pnDV!O6|+_sscTBqqe%{pj$E^ku;b!#-zj>1i+280CFsE|vIXQbm|
zM}Pk!U`U5NsNbyKzkrul-DzwB{X?n3E6?TUHr{M&+R*2%yOiXdW-_2Yd6?38M9Vy^
z*lE%gA{wwoSR~vN0=no}tP2Ul5Gk5M(Xq`<W>$nw#ndFk`tcpd5A=Idue`XZ!FS>Q
zG^0w#>P4pPG<EP*xHW9WCTGn>+*NC9gLP4x2m=cKP}YuS!l^?sHSFf<ssGXChcP9X
za)*XSF8xSAX6O&AkMIyXtB<sR`J2gFTR8ck#bl<tOy9ZU%W%Jooi$>tZy{4CoQrb_
z^20(NnG`wAhMI=eq)SsIE~&Gp9Ne0nD4%Xiu|0Fj1UFk?6avDqjdXz{O1nKao*46y
zT8~iA%Exu=G#{x=KD;_C&M+Zx4+n`sHT>^>=-1YM;H<72k>$py1?F3#T1*ef9mLZw
z5naLQr?n7K;2l+{_uIw*_1nsTn~I|kkCgrn;|G~##hM;9l7Jy$yJfmk+&}W@JeKcF
zx@@Woiz8qdi|D%aH3XTx5*wDlbs?dC1_nrFpm^QbG@wM=i2?Zg;$VK!c^Dp8<}BTI
zyRhAq@#%2pGV49*Y5_mV4+OICP|%I(dQ7x=6Ob}>EjnB_-_18*xrY?b%-yEDT(wrO
z9RY2QT0`_OpGfMObKHV;QLVnrK%mc?$WAdIT`kJQT^n%GuzE7|9@k3ci5fYOh(287
zuIbg!GB3xLg$YN=n)^pHGB0jH+_iIiC=nUcD;G6LuJsjn2VI1cyZx=a?ShCsF==QK
z;q~*m&}L<-cb+mDDXzv<F>vrRsybcgQ;Vg21P(uLv5I+eGc7o7tc6`;OA9{soHFOz
zT~2?>Ts}gprIX$wRBb4yE>ot<8+*Bv`qbSDv*VtRi|cyWS>)Fjs>fkNOH-+PX&4(~
z&)T8Zam2L6puQl?;5zg9h<}k4#|yH9czHw;1jw-pwBM*O2hUR6yvHATrI%^mvs9q_
z&ccT0>f#eDG<^WG^q@oVqlJrhxH)dcq2cty@l3~|5#UDdExyXUmLQ}f4#;6fI{f^t
zDCsgIJ~0`af%YR%Ma<z>5VQq-p21k`vaBu6WE?66+5=XUd%Ay%D$irN>5LhluRWt7
zov-=f>QbMk*G##&DTQyou$s7UqjjW@k6=!I@!k+S{pP8R(2=e@io;N8E`EOB;OGoI
zw6Q+{X1_I{OO0HPpBz!X!@`5YQ2)t{+!?M_iH25X(d~-Zx~cXnS9z>u?+If|iNJbx
zyFU2d1!ITX64D|lE0Z{dLRqL1Ajj=CCMfC4lD3&mYR_R_VZ>_7_~|<^o*%_&jevU+
zQ4|qzci=0}Jydw|LXLCrO<tY^5#SIaz%jR-X`&*7)+bTVSc1BE2BLn%vVH`Mbz1R-
z>l1_P6Xf@c0$ieK2^7@A9UbF{@V_0p%lqW|L?5k>bVM8|p5v&2g;~r>B8uo<4N+`B
zH{J)h;SYiIVx@#jI&p-v3dwL5QNV1oxPr8J%ooezTnLW>i*3Isb49%5i!&ac_dEXv
zvXmVUck^QHmyrF8>CGXijC_R-y(Qr{3Zt~EmW)-nC!tiH`wlw5D*W7Pip;T?&j%kX
z6DkZX4&}iw>hE(boLyjOoupf6JpvBG8}jIh!!VhnD0>}KSMMo{1#uU6kiFcA04~|7
zVO8eI&x1`g4CZ<2cYUI(n#wz2MtVFHx47yE5eL~8bot~>EHbevSt}LLMQX?odD{Ux
zJMnam{d)W4da{l7&y-JrgiU~qY3$~}_F#G7|MxT)e;G{U`In&?`j<5D->}cb{}<s=
z)!EcZ!rs<Y#@^oA)#0D)Gvy7r4FLpRXcD*RfTd(<C=@A5lICI1^#y3rLIVLJon(Px
z9#Jw)(4Yq5v4TSV<tUJH3ExZMzKTk&i(qL2_(Map=flfs&WkPnAHQ!Ph9FQ-#b`+n
zGGm<qkbNX1D53P^JDqBMk-0!hNJ&trQIk`mzGOz)`{-cJ&~H;?Q%CleBz;+Wy0Yj`
zyHSagKdo#qV6?6Vcv+pcT%^1Q-l@v({R}QyX*+WEw%BJmI(}Q@j4m`CawF`x{MPHL
za$h&MF~4nMuyWgQrt}RgV#pg|Y^CiI*j!3z!tB-Jp4;1uuh(==9iU5dSb3$ZFATE!
z>{T(4DF0BOk<QYr>-=1195KB-E*o@c?`>y#4=dMtYtSY=&L{!TAjFVcq0y@AH`vH!
z$41+u!Ld&}F^COPgL(EE{0X7LY&%D7-(?!kjFF7=qw<;`V{nwWBq<)1QiGJgUc^Vz
ztMUlq1bZqKn17|6x6iAHbWc~l1HcmAxr%$Puv!znW)!JiukwIrqQ00|H$Z)OmGG@=
zv%A8*4cq}(?qn4rN6o`$Y))(MyXr8R<2S^J+v(wmFmtac!%VOfN?&(8Nr!T@kV`N;
z*Q33V3t`^rN&aBiHet)18wy{*wi1=W!B%B-Q6}SCrUl$~Hl{@!95ydml@FK8P=u4s
z4e*7gV2s=YxEvskw2Ju!2%{8h01<bQ)d%wXmu1Zpj%~-&jsiWxq+->rx-3`NCPc(O
zH&J0VH5etNB2KY6k4R@2Wvl^Ck$MoR3=)|SEclT2ccJ!RI9Nuter7u9@;<u*AgOz(
zn=C13+36e?Wp?2OIME<hmtjrXzJ<Zd*?$>sWf-%um;GfI!=eEIQ2l2p_YWUd{|6EG
ze{yO6;lMc>;2tPrsNdi@&1K6(1;|$xe8vLgiouj%QD%gYk`4p{Ktv9|j+!OF-P?@p
z;}SV|<w!@0*CQ5@xp9@`8c_*)IC@^rAGd{(#wPf?$`(^V&!%1qI&#?UztvBAF!4M;
z_oxBXB0!;X3yhd^D}+Xx4sUHZH*0n|si;UgfM!*1c|d1h4nY076_94CJP`FR$D}_!
zDgwP#mZV0tbmF7vmG7Log$Afqr(GuMl<urHsSR(EhO7^7wNPIUT%rDwjj%sGil746
zDLtAZLp-7)K|QJh+bT3@0I$b@q3|9LuBZk*!Xn-Gb?+~>oIK)iwlBs+`ROXkhd&NK
zzo__r!B>tOXpBJMDcv!Mq54P+n4(@dijL^EpO1wdg~q+!DT3lB<>9AA<tf}r`cy*Y
zjhdtI5OMNT6H0#L@X?3Sm%kGA7Vl5JMh4bZuEy3uPM@!CETCEPH`bN;-XzRi=Uj<*
zy1%%&-XKAU$eorwmA2>NSe!T1XgC=J^)IP0XEZ()_vpu!!3HQyJhwh?r`Ae%Yr~b%
zO*NY9t9#qWa@GCPYOF9aron7thfWT`eujS4`t2uG6)~JRTI;f(ZuoRQwjZjp5Pg34
z)rp$)Kr?R+KdJ;IO;pM{$6|2y<ydqUT>=k_siqvp%)2||cHTe|b5Ht8&A{wazGNca
zX$Ol?H)E_R@SDi~4{d-|8nGFhZPW;Cts1;08TwUvLLv&_2$O6Vt=M)X;g%HUr$&06
zISZb(6)Q3%?;3r~*3~USIg=HcJhFtHhIV(siOwV&QkQe#J%H9&E21!C*d@ln3E@J*
zVqRO^<)V^ky-R|%{(9`l-(JXq9J)1r$`uQ8a}$vr9E^nNiI*thK8=&UZ0dsFN_eSl
z(q~lnD?EymWLsNa3|1{CRPW60>DSkY9YQ;$4o3W7Ms&@&lv9eH!tk~N&dhqX&>K@}
zi1g~GqglxkZ5pEFkllJ)Ta<O6(=j@N_YA4+m(2{`eegY!ZQ2^vALR<RgI6}@oJ;#Q
znz8Q+-c~%`4rP2NC%pl75Va8USgVLiZL*d#F+vsZ>1I^c&Bt6#r(QLQ02yHTaJB~-
zCcE=5tmi`UA>@P=1LBfBiqk)HB4t8D?02;9eXj~kVPwv?m{5&!&TFY<knKM4n)!Ph
zud#tQR<C%y^0~@DM`a6)LueXb{y5yQ{QdB(pAh_Nx5%(@`(@LGcfv~*Wnh>hu>3=_
zsGmYZ^mo*-j69-42y&Jj0cBLLEulNRZ9vXE)8~mt9C#;tZs;=#M=1*hebkS;7(aGf
zcs7zH(I8Eui9UU4L--))yy<O?Y9%&u#eL}xwSxY`c_7W7JcW|iGpGkga)aNR1NtnV
zsQ!z$?wFhYyP2W>`&d&$In&VA2?DAEss4LAPCLd>-$i?lpXvn!gu^JJ$(DoUlc<UX
zkfN^{^6VUfk4qkMm$Kwn8iV7sNct*PLMd9WvC5vZDv+`U&Q-ZGnQrAf`3aDr3@9ll
zcH!Spxa>6wE98VLZ*z`QGQov5l4Fm_h?V-;mHLYDVOwKz7>e4+%AzeO>P6v}ndPW|
zM>m#6Tnp7K?0mbK=>gV}=@k*0Mr_PVAgGMu$j+pWxzq4MAa&jpCDU&-5eH27Iz>m^
zax1?*HhG%pJ((tkR(V(O(L%7v7L%!_X->IjS3H5kuXQT2!ow(;%FDE>16&3r){!ex
zhf==oJ!}YU89C9@mfDq!P3S4yx$aGB?rbtVH?sHpg?J5C->!_FHM%Hl3#D4eplxzQ
zRA+<@LD%LKSkTk2NyWCg7u=$%F#;SIL44~S_OGR}JqX}X+=bc@swpiClB`Zbz|f!4
z7Ysah7OkR8liXfI`}IIwtEoL}(URrGe;IM8%{>b1SsqXh)~w}P>yiFRaE>}rEnNkT
z!HXZUtxUp1NmFm)Dm@-{FI^aRQqpSkz}ZSyKR%Y}Y<Q&WCZ#r6E1BX>HNzBk)ZIp}
zMtS=aMvkgWKm9&oTcU0?S|L~CDqA+sHpOxwnswF-fEG)cXCzUR?ps@tZa$=O)=L+5
zf%m58cq8g_o}3?Bhh+c!w4(7AjxwQ3>WnVi<{{38g7yFboo>q|+7qs<$8CPXUFAN<
zG&}BHbbyQ5n|qqSr?U~GY{@GJ{(Jny{bMaOG{|IkUj7tj^9pa9|FB_<+KHLxSxR;@
zHpS$4V)PP+tx}22fWx(Ku9y+}Ap;VZqD0AZW4gCDTPCG=zgJmF{|x;(rvdM<f^n;1
zK8nxqYR0vKH0j_lDlpPKZ&MGwB{uMn`Y?-pX_!Y3B)-jRTo&O^8pthSIE0&!0lK-e
z!icB)w$6c%`*wHe1Fz?U<?TZjyK8JuG0UC$ZCke^hB6%|%?F1X$1uXP^O6o2PCxRi
zb_5>|2|9a}cex6xrMkERnkE;}jvU-kmzd%_J50$M`lIPCKf+^*zL=@LW`1SaEc%=m
zQ+lT06Gw+wVwvQ9fZ~#qd430v2HndFsBa9WjD0P}K(rZYdAt^5WQIvb%D^Q|pkVE^
z<umx#lsJRB+%c;<9rnPd8EZLPtdXF8y%S2H@ouSr1qOSZk)i$^f3$XJg9S<yWHU+7
zyJJ%uaeqUu9^B#_n2Ir_nIYz}G3SZ_C|5(`c6Umw#_yBgS{UuH<_&oA!pzsBV%g@=
zbOzA1`OA)5@fUbFgE?}SVsAWLGVW?bLByY#LXc_UE*PeNYf={+;y-vKh@$+ue#tsu
zvCKGNIefnaDUw~m>te$&#~zmULFACGfS#g=2OLOnIf2Of-k!(BIHjs77nr!5Q1*I9
z1%?=~#Oss!rV~?-6Gm~BWJiA4mJ5TY&iPm_$)H1_rTltuU1F3<q3^BkZ&_=uH2O^K
z_@P51{S(`LSn2c$B(%yIo=USrBfbZd`d}}bPg&0tq#f2!iBZc9Z>I(qTQ^U$S>%$l
z)Wx1}R?ij0idp@8w-p!Oz{&*W;v*IA;JFHA9%nUvVDy7Q8woheC#|8QuDZb-L_5@R
zOqHwrh|mVL9b=+$nJxM`3eE{O$sCt$UK<z2Kci>^2@L$R(r^-_+z?lOo+me-VW=Zw
z-Bn>$4ovfWd%S<yVaGCiuR4}{b|0tZrdZYdj>PY`ab-u9{INc*k2h+yH%toDHIyqQ
zO68=u`N}RIIs7lsn1D){)~%>ByF<>i@qFb<-axvu(Z+6t7v<^z&gm9McRB~BIaDn$
z#xSGT!rzgad8o>~kyj#h1?7g96tOcCJniQ+*#=b7wPio>|6a1Z?_(TS{)KrPe}(8j
z!#&A=k(&Pj^F;r)CI=Z{LVu>uj!_W1q4b`N1}<u;U_b=ty|*S;DXz*F-)!)F0Pv+Q
zRm=!T^zTn*A6)$bH1cl>E(i%;BWjbEcnD=mv$FL$l?zS6bW!{$7j1GR5ocn94P2u{
z70tAAcpqtQo<@cXw~@i-@6B23;317|l~S>CB?hR5qJ%J3EFgyBdJd^fH<ao26B*v)
zGUaiB1_W^rk+d9W+h~_tj2D}FfPY~B-BL~)lzp|oFVck~{r8sIIlCCz*!+vHo}=OE
zgW`_*^W8W`lLWY+AcSs_rDfwxzeg23BqYRWi$p*e3{sqP3719K#C&l{6X2y_TO;0c
zk>Zu7AzHF(BQ!tyAz<BOKd)9J&U=CXtSstlZ^pj1MMKG$H~T%~{<Zzl`|=?>^L0`X
z23S4Fe{2X$W0$zu9gm%rg~A>ijaE#GlYlrF9$ds^QtaszE#4M(OLVP2O-;XdT(XIC
zatwzF*)1c+t~c{L=fMG8Z=k5lv>U0;C{caN1NItnuSMp)6G3mbahu>E#sj&oy94KC
zpH}8oEw{G@N3pvHhp{^-YaZeH;K+T_1AUv;IKD<=mv^&Ueegrb!yf`4VlRl$M?wsl
zZyFol(2|_QM`e_2lYSABpKR{{NlxlDSYQNkS;J66aT#MSiTx~;tUmvs-b*CrR4w=f
z8+0;*th6kfZ3|5!Icx3RV11sp=?`0Jy3Fs0N4GZQMN=8HmT6%x9@{Dza)k}UwL6JT
zHRDh;%!XwXr6yuuy`4;Xsn0zlR$k%r%9abS1;_v?`HX_hI|+EibVnlyE@3aL5vhQq
zlIG?tN^w@0(v9M*&L+{_+RQZw=o|&BRPGB>e5=ys7H`nc8nx)|-g;s7mRc7hg{GJC
zAe^vCIJhaj<rU9d-Ny$?&}6qolDp7MDNB|ftJI`wi&EhaNhHyV!qQfb_Q>mm7C6g!
zL&!WAQ~5d_5)00?w_*|*H>3$loHrvFbitw#WvLB!JASO?#5Ig5$Ys10n>e4|3d;tS
zELJ0|R4n3Az(Fl3-r^QiV_C;)lQ1_CW{5bKS15U|E9?ZgLec@%kXr84>5jV2a5v=w
z?pB1GPdxD$IQL4)G||B_lI+A=08MUFFR4MxfGOu07vfIm+j=z9tp~5i_6jb`tR>qV
z$#`=BQ*jpCjm$F<t%->0+F)L%xRlnS%#&gro6PiRfu^l!EVan|r3y}AHJQOORGx4~
z&<)3=K-tx518DZyp%|!EqpU!+X3Et7n2AaC5(AtrkW>_57i}$eqs$rupubg0a1+WO
zGHZ<ibB|u&Jk?tUGE|?~tl5Wk^jlF-{lPR;A5i_2TUJp0F;38(es)rx!d-0-m4P-!
z$~|tV-l!W$kj%u&D~eY>KLN2L0D;ab%{_S1Pl<uJj0^JDir_rTS5CizT^_%RU3Cwc
zfrHnUz@7T<9U{4O%SD*qhHiuSo|}zv3Hju=+>m|hx8R?O14*w*f&2&bB050n!R2by
zw!@XOQx$SqZ5I<(Qu$V6g>o#A!JVwErWv#(Pjx=KeS0@hxr4?13zj#oWwPS(7Ro|v
z>Mp@Kmxo79q|}!5qtX2-O@U&&@6s~!I&)1WQIl?lTnh6UdKT_1R640S4~f=_xoN3-
zI+O)$R@RjV$F=>Ti7BlnG1-cFKCC(<Cu6MUX_IBo_X35UX_<48O%CsD25V#~38R@v
zrtYIkXvf1CPBiwCyX%=srhV;xX%`i(ICDDA6?ULc$>t|Qjm{SalS~V-t<tWc<BV1;
zl_jTzT9WFjJ~QeHdyK~l8BJDMnzlnWR?wVpES%e_)7o6AC7w~n8DW>X#+2ekRhwmN
zZr`8{QF6y~Z!D|{=1*2D-JUa<(1Z=;!Ei!KiRNH?o{p5o3crFF=_pX9O-YyJchr$~
zRC`+G+8kx~fD2k*ZIiiIGR<8r&M@3H?%JVOfE>)})7ScOd&?OjgAGT@WVNSCZ8N(p
zuQ<bOx+QY5+%zcISi3vNjYInP2!bhID0f~o9vf+-rhLL{z1^Ck(#`qsgx*#5swyXT
zW>G~76GE3%(%h1*vUXg$vH{ua0b`sQ4f0*y=u~lgyb^!#CcPJa2mkSEHGLsnO^kb$
zru5_l#nu=Y{rSMWiYx?nO{8I!gH+?wEj~UM?IrG}E|bRIBUM>UlY<`T1EHpRr36vv
zBi&dG8oxS|J$!zoaq{+JpJy+O^W(nt*|#g32bd&K^w-t>!Vu9N!k9eA8r!Xc{utY>
zg9aZ(D2E0gL#W0MdjwES-7~Wa8iubPrd?8-$C4BP?*wok&O8+ykOx{P=Izx+G~hM8
z*9?BYz!T8~dzcZr#ux8kS7u7r@A#DogBH8km8Ry4slyie^n|GrTbO|cLhpqgMdsjX
zJ_LdmM<k-VT(PJj^o5EU9N}k~N8WtTC-IGZ`@y!j$sb^<J0;G@Qw2FyCS!3I_NA$4
z1f*^zNnJ-2I0{osRtQts^V?d(XVo8U_KxuKrC4_(t;qw3T5EIb1gaTiFo3yT&HyFL
z)1@c>#I&4LqqsOUIXK8gW;V0B(7^$y#h3h>J0k^WJfAMeYek%Y-Dcb_+0zPJez!GM
zAmJ1u;*rK=FNM0Nf}Y!!P9c4)HIkMnq^b;JFd!S3?_Qi2G#LIQ)TF|iHl~WKK6JmK
zbv7rPE6VkYr_%_BT}CK8h=?%pk@3cz(UrZ{@h40%XgT<XHMp$hv;h9Ymj#=odkk`i
zIQae`_=X#DiyU&N400<DsTZGknpd*i59`VZ(hlvyF={mwt^rqfpTI2&<T<g_&ag;)
zGmdv3$7_Umm3*d1pC<tD4&^z@fB#qeFKt~J{6LxaG(+}hgg|*Exu--o{)yT#JuFbk
z1;57xTR<^av=7#O$H&Fm{_yrRCu{yYJoJ}c0L{yx#X=<de!)a1JlNVp083z$!k1?|
zV)^ps1(-LWXGsXjQ$H~1;~fb%PNBhFP?}VP6a;UunlMt9FKF=Z7zjl{Vm5wAE|rFq
z1-k1JTp~NHKqro%r!pzCx_v|wc?QzZb;2~gF~c_e(7kI?gTf@Sl;&2zZGHBdXMu10
ze@7*&Fo`5Se_O_wf2XJaP0LvFpQz-YM_~(7W9xrFv6N(8d1OJ9&>hP*-Oeo`T0eq9
zA8BnWZKzCy5e&&_GEsU4*;_k}(8l_&al5K-V*B<vk;3_5Y5GsaQRXITZ7En5+1cr}
zoEOfUwTvx(fB#P)g+XFq$3s&MIR%RGTY*w)u1F^x)_4KQF{~iPr`KL;JvOUA!)Bl1
z9p@=a4SF1Po>FM=O~;MgRkYsOs%9eOY6s6AtE*<7GQAR2ulC3RAJrG_P1iQK5Z~&B
z&f8X<>yJV6)oDGIlS$Y*D^Rj(cszTy5c81a5IwBr`BtnC6_e`ArI8CaTX_%rx7;cn
zR-0?J_LFg*?(#n~G8cXut(1nV<GD8u>F0Oka$A<Xrs+39Fcy_UX4+wJsyL;Ad#|W_
zoZzvm=HX_}HyBFXw08LR4`!>$1FGcERU<^ggx;p@CZc?3UB41RY+wLS`LWFNSs~YP
zuw1@DNN3lTd|jDL7gjBsd9}wIw}4xT2+8dBQzI00m<@?c2L%>}QLfK5%r!a-iII`p
zX@`VEUH)uj^$;7jVUYdADQ2k*!1O3WdfgF?OMtUXNpQ1}QINamBTKDuv19^{$`8A1
zeq%q*O0mi@(%sZU>Xdb0Ru96CFqk9-L3pzLVsMQ`Xpa~N6CR{9Rm2)A|CI21L(%GW
zh&)Y$BNHa=FD+=mBw3{qTgw)j0b!Eahs!rZnpu)z!!E$*eXE~##yaXz`KE5(nQM`s
zD<pRzTt9Ga7_+7V*(vgeNjPLq#T#Hzh4oMyk4m^&mDHa-;LXM`BMlpNPVXZiWB!7-
zsrLYk0v?{Pinwui>!$vW9XH)iMxu9R>r$VlLk9oIR%HxpUiW=BK@4U)|1WNQ=mz9a
z^!KkO=>GaJ!GBXm{KJj^;kh-MkUlEQ%lza`-G&}C5y1>La1sR6hT=d*NeCnuK%_LV
zOXt$}iP6(YJKc<sy3IAHEj2Y-R)3MS?rC66If(_;`nr~Onw70}P1hEBm+!itUy1C`
zNpHpki6_MC$7{&PcGt_M^XxtUNv`)v*iXj|1|scVAGjs`iL^4oZ_EXmgi;5b%!&n+
ziIZl66eo#;Grax0|H0Th2FKohS;Mhy+qQFJ+qP}z#I|iaIk9cqPEKq)Z~pVlJTr68
zJXP<9U-g%+tE>9j-Fxq~*ItVUqljQ8?oaysB-EYtFQp9oxZ|5m0^Hq(qV!S+hq#g(
z?|i*H2MIr^Kxgz+3vIljQ*Feejy6S4v~jKEPTF~Qhq!(ms5>NGtRgO5vfPPc4Z^AM
zTj!`5xEreIN)vaNxa|q6qWdg>+T`Ol0Uz)ckXBXEGvPNEL3R8hB3=C5`@=SYgAju1
z!)UBr{2~=~xa{b8>x2@C7weRAEuatC)3pkRhT#pMPTpSbA|tan%U7NGMvzmF?c!V8
z=pEWxbdXbTAGtWTyI?Fml%lEr-^AE}w#l(<7OIw;ctw}imYax&vR4UYNJZK6P7ZOd
zP87XfhnUHxCUHhM@b*NbTi#<F%^Z|eYk;z2_lk&RVh7s<F5xO2t#&+J(_Y0MwZqrY
zicy(G9z6&?{09>(-8|wcv%3BGNs#zRCVV(W?1Qj6^PPQa<{yaBwZ`+<`w|;rqUY_C
z&AeyKwwf*q#OW-F()lir=T^<^wjK65Lif$puuU5+tk$;e_EJ;Lu+pH>=-8=PDhkBg
z8cWt%@$Sc#C6F$Vd+0507;{OOyT7Hs%nKS88q-W!$f~9*WGBpHGgNp}=C*7!RiZ5s
zn1L_DbKF@B8kwhDiLKRB@lsXVVLK|ph=w%_`#owlf@s@V(pa`GY$8h%;-#h@<I6M_
zmQBeLGoxDbjVm1w<bC@g^^f(C!^JL|q^~JGpu4s;6B8qK+5^Y5Qq9@Z$yi1AO_ivY
zEj2e_U?RYgY($+y4MYyPw@#Znh;BBeSCVn{gx>TsO|Y8V=n@*!Rog7<7Cid%apR|x
zOjhHCy<N%=9>fbIt%+*PCveTEcuiDi%Wx;O;+K=W?OF<?k#V0K7HZbJn*d3I+eoH#
zh7<l$XDBh2e3X@S)0xR1RL*X3v}anZNmTRLP100Vjg6lZsj8M4WV<xI)QGou52W;l
zuhA|fVCliO=-1Jr275Sx<Y+M^NR?|nFbw_$)30P*nj@^6+|ZdgJZ18l!}qu2UJUSH
z1R0|V6_~e{(m?E}wM3qUG_Q$t&XB<F<iQ0B?7yghy}e3`PL?Jnn_FH7zs7FvV}ZNC
zg-k1nj;~OaAd0bVj&A17jIs@8j;@`1Da87%aS+QW)3w-0g!{V<Z8Jk0K;t-e-m_}H
z#$DpTS=ZJ!079fqU)_vsqe{Hxz{t*PFrYaR_X{Idl~pteGXU-!HXH+n-Nm@~!Nn|q
zjK^|6Bi^u9r}}mzbV)k27Kxg2T1S^j^-m&I%;>UV%)%~6;gl?<0%)?snDDqIvkHF{
zyI02)+lI9ov42^hL>ZRrh*HhjF9B$A@=H94iaBESBF=eC_KT$8A@uB^6$~o?3W<f9
zYtSca#$B|92$2MdaDc&Gb6a^9xjGKojcdPg>m5t1OIaqF^~><2?4e3c&)@wKn9bD?
zoeCs;H>b8DL^F&>Xw-xjZEUFFTv>JD^O#1E#)CMBaG4DX9bD(Wtc8Rzq}9soQ8`jf
zeSnHOL}<+WVSKp4kkq&?SbETjq6yr@4%SAqOG=9E(3YeLG9dtV+8vmzq+6PFPk{L;
z(&d++iu=^F%b+ea$i2UeTC{R*0Isk;vFK!no<;L+(`y`3&H-~VTdKROkdyowo1iqR
zbVW(3`+(PQ2>TKY>N!jGm<sD6AnGF=)0Re*7@0HIc_nG=Iw;>Go7oeoB8O|P_!Ic@
zZ^;3dnuXo;WJ?S+)%P>{Hcg!Jz#2SI(s&dY4QAy_vRlmOh)QHvs_7c&zkJCmJGVvV
zX;Mtb>QE+xp`KyciG$Cn*0?AK%-a|=o!+7x&&yzHQOS>8=B*R=niSnta^Pxp1`=md
z#;$pS$4WCT?mbiCYU?FcHGZ#)kHVJTTBt^%XE(Q};aaO=Zik0UgLcc0I(tUpt(>|&
zcxB_|fxCF7>&~5eJ=Dpn&5Aj{A^cV^^}(7w#p;HG&Q)EaN~~EqrE1qKrMAc&W<U9$
z+^)SZeHD%7BgO}J?hdzGer@n(wj6EYe>XIE;>@<&)5;gD2?={Xf@Mvn@OJKw=8Mgn
z!JUFMwD+s==JpjhroT&d{$kQAy%+d`a*XxDEVxy3`NHzmITrE`o!;5ClXNPb4t*8P
zzAivdr{j_v!=9!^?T3y?gzmqDWX6mkzhIzJ-3S{T5bcCFMr&RPDryMcdwbBuZbsgN
zGrp@^i?rcfN7v0NKGzDPGE#4yszxu=I_`MI%Z|10nFjU-UjQXXA?k8Pk|OE<(?ae)
zE%vG#eZAlj*E7_3dx#Zz4kMLj>H^;}33UAankJiDy5ZvEhrjr`!9eMD8COp}U*hP+
zF}KIYx@pkccIgyxFm#LNw~<U{`Via^B{gMVX)!#|jCh_xM6ikcrvnXbWa+dwBKVSS
zK-4G%y><Bux_?U*2>G&`;o&5)2`5aogs`1~7cMZQ7zj!%L4E`2yzlQN6REX20&O<9
zKV6fyr)TSc<e)zOh9v0_vMgh>JPPzNTC2gL+0x#=u>(({{D7j)c-%tvqls3#Y?Z1m
zV5WUE)zdJ{$p>yX;^P!UcXP?UD~YM;IRa#Rs5~l+*$&nO(;Ers`G=0D!twR(0GF@c
zHl9E5DQI}Oz7<JMjFG_&ureTdMV@FLw-eP(lgkhig$ECHD*ei9Xc()#hw||V#@>4n
zfKP>&$q0($T4y$6w(p=ERAFh+>n%iaeRA%!T%<^+p<H-<uJNd-sy)}Y@~CN#3u5W*
zjI&ROZ_Rx}0(a!_QCcJ9_!&~Uc}E0JQd?M`bT;-C9-Z?5E@UMa&Fe8b7GLT8{XPb5
zyf%i|Q0Xl+SI;QD#Yg>g?M)@ucY<&59$x9M#n+V&>}=nO9wCV{O~lg&v#+jcUj(tQ
z`0u1YH)-`U$15a{pBkGyPL0THv1P|4e@pf@3IBZS4dVJPo#H>pWq%Lr0YS-SeWash
z8R7=jb28KPMI|_lo#GEO|5B?N_e``H*23{~a!AmUJ+fb4HX-%QI@lSEUxKlGV7z7Q
zSKw@-TR>@1RL%w{x}dW#k1NgW+q4yt2Xf1J62Bx*O^WG8OJ|FqI4&@d3_o8Id@*)4
zYrk=>@!wv~mh7YWv*bZhxqSmFh2Xq)o=m;%n$I?GSz49l1$xRpPu_^N(vZ>*>Z<04
z2+rP70oM=NDysd!@fQdM2OcyT?3T^Eb@lIC-UG=Bw{BjQ&P`KCv$AcJ;?`vdZ4){d
z&gkoUK{$!$$K`3*O-j<Zi{jTPO?eI9$VNEqKns5`DQpeK-2u;%g&U>yM1~<ZC)5l@
zIkr@87e@uhA6MH$K<@Z>p-7T*qb)Ys>Myt^;<CgA4AYJg%~h2T>#1&a%O@x8A+E>!
zY<A2w=yT8AF-EkBw>8=eD`ZG)LVagDLBeHg>=atOG?Kr%h4B%E6m@J^C+U|y)XX@f
z8oyJDW|9g=<#f<{JRr{y#~euMnv)`7j=%cHWLc}ngjq~7k**6%4u>Px&W%4D94(r*
z+akunK}O0DC2A%Xo9jyF;DobX?!1I(7%}@7F>i%&nk*LMO)bMGg2N+1iqtg+r(70q
zF5{Msgsm5GS7DT`kBsjMvOrkx&|EU!{{~gL4d2MWrAT=KBQ-^zQCUq{5PD1orxlIL
zq;CvlWx#f1NWvh`hg011I%?T_s!e38l*lWVt|~z-PO4~~1g)SrJ|>*tXh=QfXT)%(
z+ex+inPvD&O4Ur;JGz>$sUOnWdpSLcm1X%aQDw4{dB!cnj`^muI$CJ2%p&-kULVCE
z>$eMR36kN$wCPR+OFDM3-U(VOrp9k3)lI&YVFqd;Kpz~K)@Fa<T;^H>&FRw}L(SoD
z9B4a+hQzZT-BnVltst&=kq6Y(f^S4hIGNKYBgMxGJ^;2yrO}P3;r)(-I-CZ)26Y6?
z&rzHI_1GCvGkgy-t1E;r^3Le30|%$ebDRu2+gdLG)r=A~Qz`}~&L@aGJ{}vVs_GE*
zVUjFnzHiXfKQbpv&bR&}l2bzIjAooB)=-XNcYmrGmBh(&iu@o!^hn0^#}m2yZZUK8
zufVm7Gq0y`Mj;9b>`c?&PZkU0j4>IL=UL&-Lp3j<Z6B(#r?G=5OTIVqppQV7;$oUZ
z_*bfNYVfkU+>&47B5pAW4JceG{!XCA)kT<%2nqCxj<)uy6XR_uws~>_MEKPOpAQ!H
zkn>FKh)<9DwwS*|Y(q?$^N!6<o$@lIm`~9<Ur$vhpA3&|(gF`!3*mR7y8%R>(51O0
z^JM~Ax{AI1Oj$fs-S5d4T7Z_i1?{%0SsIuQ&r8#(JA=2iLcTN+?>wOL532%&dMYkT
z*T5xepC+V6zxhS@vNbMoi|i)=rpli@R9~P!39tWbSSb904ekv7D#quKbgFEMTb48P
zuq(VJ+&L8aWU(_FCD$3^uD!YM%O^K(dvy~Wm2hUuh6bD|#(I39Xt>N1Y{ZqXL`Fg6
zKQ?T2htHN!(Bx;tV2bfTtIj7e)liN-29s1kew>v(D^@)#v;}C4-G=7x#;-dM4yRWm
zyY`cS21ulzMK{PoaQ6xChEZ}o_#}X-o}<&0)$1#3we?+QeLt;aVCjeA)hn!}UaKt<
zat1fHEx13y-rXNMvpUUmCVzocPmN~-Y4(YJvQ#db)4|%B!rBsgAe+*yor~}FrNH08
z3V!97S}D7d$zbSD{$z;@IYMxM6aHdypIuS*pr_U6;#Y!_?0i<Th%7C5rAWptzT;+8
z^q+L14xti*_qZw!_5cZk%7|55dKPFjM~R1_=~fye=uVS6nDqhi)PszECfuz7L#WMP
z!r+BR+k)}Z4nk;GJ*^~}T(1)9OIq4cSY?CPrYrC|{k3|*qwa%HsQZRC>|&yU*@16l
z*dcMqDQgfNBf}?quiu4e>H)yTVfsp#f+Du0@=Kc41QockXkCkvu>FBd6Q+@FL!(Yx
z2`YuX#eMEiLEDhp+9uFqME_E^faV&~9qjBHJkIp~%$x^bN=N)K@kvSVEMdDuzA0sn
z88CBG?`RX1@#hQNd`o^V{37)!w|nA)QfiYBE^m=yQKv-fQF+UCMcuEe1d4BH7$?>b
zJl-r9@0^Ie=)guO1vOd=i$_4sz>y3x^R7n4ED!5o<f!284U3o`_8>XL3@5**h<Hcp
z@=-jpH@#|<>(xr%Hv)_gILarO46q+MaDOF%ChaymKoI6JU5Pg;7#2n9-18|S1;AK+
zgsn6;k6-%!QD>D?cFy}8F;r@z8H9xN1jsOBw2vQONVqBVEbkiNUqgw~*!^##ht>w0
zUOykwH=$LwX<q*Afk5R&Mctbiz>2j&nLy=@{hr)2O&-wm-NyjW7n~Zs9UlH;P7iP3
zI}S(r0YFVYacnKH(+{*)Tbw)@;6>%=&Th=+Z6NHo_tR|JCI8TJiXv2N7ei7M^Q+RM
z?9o`meH$5Yi;@9XaNR#jIK^&{N|DYNNbtdb)XW1Lv2k{E>;?F`#Pq|&_;gm~&~Zc9
zf+6ZE%{x4|{YdtE?a^gKyzr}dA>OxQv+pq|@IXL%WS0CiX!V<r8QLM#*MRM`MfrL^
z$oozeKu(Nq#u9g*IQ%|9o}%#$^xUC(KN(HGggqLu?YF|WP9UQsZcqciD65u58u(o>
zm$fCePA%lU{%pTKD7|5NJHeXg=I0jL@$tOF@K*MI$)f?om)D63K*M|r`gb9edD1~Y
zc|w7N)Y%do7=0{RC|AziW7#am$)9jciRJ?IWl9PE{G3U+$%FcyKs_0Cgq`=K3@ttV
z9g;M!3z~f_?P%y3-ph%vBM<NK?r_G3UHy@sF~h!`(w;SXKRHdJ%HcvAvAAT4Qk5qo
zMgavqbI6bm$F8YQRbu-Q60TKKDjBrG#E!{k)-_O9;f|9i`Bf)1frR1W)S8zwM~^w3
zgTV=3ki=7ABU#jjo~7zOCe&Y02bU{vhUnu>eS@p7P&Ea8M@97+%XEj*(1E6vHj==d
zjsoviB>j^$_^OI_DEPvFkVo(BGRo%cJeD){6Uckei=~1}>sp299|IRjhXe)%?uP0I
zF5+>?0#Ye}T^Y$u_rc4=lPcq4K^D(TZG-w30-YiEM=dcK+4#o*>lJ8&JLi+3UcpZk
z!^?95S^C0ja^jwP`|{<+3cBVo<?ZqT?*63G^-kLfa79`_!t|3XN?#4oHPT!4lKDlS
z&|`dJ%anWz)cD{Izk)|S%W;K%rEUv+1C{y)t{{-Ax56UAAwT>g$(mRdQmadS+Vh~z
zS@|P}=|z3P6uS+&@QsMp0no9Od&27O&14zHXGAOEy<!&qoA;BoYe5Gd6XaoKZz3O>
zh~OKpymK5C%;LLb467@KgIiVwYbYd6wFxI{0-~MOGf<ue$0Kq0FKpj=H%&N{0wz|#
zv3H@VVff^)dR)T~U^4qB#5GIMnxWTKM_`%1WhbW1RX5wx$kbFST`O+(C#(QYZ1qYr
zH@an#>Tq$nBTB!{SrW<fHPfZ<Irz+G=%+OcFrK=FGzu_OrmZYz+$?F}3ABX2OV4Ts
zsD|%iq->mL9H<Tr^2xz>s}C&l&l#m?s*{tA?BHS4mVKHAVMqm63H<|c5n0~k)-kbg
zXidai&9ZUy0~WFYYKT;oe~rytRk?)r8bptITsWj(@HLI;@=v5|XUnSls7$uaxFRL+
zRVMGuL3w}NbV1`^=Pw*0?>bm8+xfeY(1PikW*PB>>Tq(FR`91N0c2&>lL2sZo5=VD
zQY{>7dh_TX98L2)n{2OV=T10~*YzX27i2Q7W<Ntl33%|cp7@_XQ%`!?JFZGy3th+(
z9`69=1diKn`u=0X>86M4$?gZIXZaBq#sA*{PH8){|GUi;oM>e?ua7eF4WFuFYZSG|
zze?srg|5Ti8Og{<A|-y4P^Vw-HZz`B)6iIj60J8RmoI9z56$O?KklBRh#A8wwXksP
zIQ{KLcY4jo=L=|_hbacrV%Qq6!Kf@BT^Q2N;#Sl~J=F}P(ian;6LK=Pia-a<t*`EG
zsvh#5MX1PG_H?pP{~kcN#Yl5&$-{v%{fm#~)VR&aYV*QjB+l!bVIL2ZIK*(o)mg>O
zeFx<XJQO~(UxB;12D)?%fz2kHQ0)}w{#l!~t+-NWg^{L<(tn&kN>uw9!U+zhyk?@w
zjsA6(oKD=Ka;A>Ca)oPORxK+kxH#O@<!C?nm{!PyX#FIJ5Fce^7Glx51z4q!IF-9T
z(}u{s9F?k?->zhC!!XS4@=swnuMk>t+JmLmFiE^1aX3f<)D@`%K0FGK^gg1a1j>zi
z2KhV>sjU7AX3F$SEqrXSC}fRx64GDoc%!u2Yag68Lw@w9v;xOONf@o)Lc|Uh3<2<E
zaTlN?EK~WL>1ctTYu-mFZuHk*+R{GjXHIGq3p)tFtQp%TYqD=j1&y)>@zxoxUJ!G@
zgI0XKmP6MNzw>n<q>RxK$-Gbzs}dyfFzt>#5;f6oR27ql!%+{tr+(`(>%51|k`ML}
zY4eE)Lxq|JMas(;JibNQds1bUB&r}ydMQXBY4x(^&fY_&LlQC)3hylc$~8&~|06-D
z#T+%66rYbHX%^KuqJED_wuGB+=h`nWA!>1n0)3wZrBG3%`b^Ozv6__dNa@%V14|!D
zQ?o$z5u0^8`giv%qE!B<jmp7|q6krbi_G>zZ!3j;BlDlJDk)h@9{nSQeEk!z9RGW)
z${RSF3phEM*ce*>Xdp}585vj$|40=&S{S-GTiE?Op*vY&Lvr9}BO$XWy80IF+6@%n
z5*2ueT_g@ofP#u5pxb7n*fv^Xtt7&?SRc{*2Ka-*!BuOpf}neHGCiHy$@Ka1^Dint
z;DkmIL$-e)rj4o2WQV%Gy;Xg(_Bh#qeOsTM2f@KEe~4kJ8kNLQ+;(!j^b<yk)2Ga*
z`Da!i<uGd>gJMcNhvklP5Z6I+9Fq@c&D~8Fb-4rmDT!MB5QC{Dsb;BharP*O;SF4&
zc$wj-7Oep7#$WZN!1nznc@Vb<_Dn%ga-O#J(l=OGB`dy=Sy&$(5<R7(thvp%XRU@s
zvD!RD|Ap4-iUR15Zc?p`W5PfYcwl?pqP=!?Ly8YQJV-_=LA<5F14$ueWH9bhJ%_)w
z7V5=v(sMJKH%(v>-n3zzu%d7E#^8`T@}V+5B;PP8J14#4cCPw-SQTdGa2gWL0*zKM
z#DfSXs_iWOMt<QB?xsXtvjwXo?shW#x+{vW<m&Lb0}Ev3ll-+Cv)H2yx72H2vfbao
z)&}#dlu-#QrCe#mI_dwQZ8aV{yR-kuQpUpp0FeFvxZ;1G7pk86VlScc8cAV!@aPt-
zLbAEaXYm_LG-m+FWTqvpGKKfn>)0*+Y>Lkd=LlyoHjublNLefhKBv@JoC>P7N1_#>
zv=mLWe96%EY;!ZGSQDbZ<UL<`wO?;KrMew|zF)HM0NF!*C)o2Xb2C86^s8tmic{Un
z9|@ov+&qEN=O7wL#??)8qe;19z|E|XhPC!>Wb#;tzqAGgx~uk+-$+2_8U`!ypbwXl
z^2E-FkM1?lY@yt8=J3%QK+xaZ6ok=-y%=KXCD^0r!5vUneW>95PzCkOPO*t}p$;->
ze5j-BLT_;)cZQzR2CEsm@rU7GZfFtdp*a|g4wDr%8?2QkIGasRfDWT-Dvy*U{?IHT
z*}wGnzdlSptl#ZF^sf)KT|BJs&kLG91^A6ls{CzFprZ6-Y!V0Xysh%9p%iMd7HLsS
zN+^Un$<a&ma`v<=_o?VGrwUGJw=O>tDV)T@i!v?3o0Fsx2qI(AX_$dDkBzQ@fRM%n
zRXk6hb9Py#JXUs+7)w@eo;g%QQ95Yq!K_<hHRNS7N*hWtqjDKgW@A%kX02U4KOU=O
zvdAz#;kT}}XkJ!QzU;bHk8JAq+{_%?pw^sYL|x|PeydT9?IGU9Y51^30i-q1#6m%s
zrcB|gv?mw<4R{=sU0@i6UC;{u0I&8wO=u?6r+b>d=z{0dGS+pToEI6=Bo8+{k$7&Z
zo4>PH(`ce8E-Ps&uv`NQ;U$%t;w~|@E3WVOCi~R4oj5wP?%<*1C%}Jq%a^q~T7u>K
zML5AKfQDv6>PuT`{SrKHRAF+^&edg6+5R_#H?Lz3iGoWo#PCEd0DS;)2U({{X#zU^
zw_xv{4x7|t!S)>44J;KfA|DC?;uQ($l+5Vp7oeqf7{GBF9356nx|&B~gs+@N^gSdd
zvb*>&W)|u#F{Z_b`f#GV<S^Bd2~Pyg*5%eHlTUz}@u5PV*7l^~{G24{Qqrx`@++o~
ztboMW3urCbjTB~&;i*a|(eC0qz31>tQ`pYv3#||N{xj1NgB<#=Odt6{eB%#9RLt5v
zIi|0u70`#ai}9fJjKv7dE!9ZrOIX!3{$z_K5FBd-Kp-&e4(J$LD-)NMTp^_pB`RT;
zftVVlK2g@+1Ahv2$D){@Y#cL#dUj<nH(n~Edj4!|Skt}P+Crb$TxBzD_v2O|_fXDf
zyOhDlptC3q?wEW8BD4DYDy}&OXN4=Gin3Clovb(?fZbl&a4@HKR2*IMYDM~#=eox}
zr@@VgW<o#eD*<OYTh-Dp*Y9Q~ulW+qlsH#Z{CRf%{8T(JHO1!sg=2L#MeC>9*&%#6
zd2m9{1NYp>)6=oAvqdCn5#cx{AJ%S8skUgMglu2*IAtd+z1>B&`MuEAS(D(<6X#Lj
z?f4CFx$)M&$=7*>9v1ER4b6!SIz-m0e{o0BfkySREchp?WdVPpQCh!q$t>?rL!&Jg
zd#heM;&~A}VEm8Dvy&P|J*eAV&w!&Nx6HFV&B8jJFVTmgLaswn!cx$&%JbTsloz!3
zMEz1d`k==`Ueub_JAy_&`!ogbwx27^ZXgFNAbx=g_I~5nO^r)}&myw~+yY*cJl4$I
znNJ32M&K=0(2Dj_>@39`3=FX!v3nZHno_@q^!y}%(yw0PqOo=);6Y@&ylVe>nMOZ~
zd>j#QQSBn3oaWd;qy<AH3+eYXyF}9Xqs&`j)Js;7`deezzO@(if{@wtUbaD%FKTU}
z5w@ZwEG`RDwK>$&5(5H$Ayi)0haAYO6TH<z?6tyk$+6-5SvI&DI1q3_)(4fdZ5Se9
zBbX2Qt9$@_(SU-uL9$`(89L9b^PH`sRc(S;t{a+9w+wmDT>>FR?rhqHmNOO+(})NB
zLI@B@v0)eq!ug`>G<@htRlp3n!EpU|n+G+AvXFrWSUsLMBfL*ZB`CRsIVHNTR&b?K
zxBg<HRwQH-Pd9_*;+0`Ns@&d6=NP#dv8(oIe;Q1jESZF+juys7ECR&=R-98_-67uO
z3$~aVQBexDk*n5GG0IbgeygBsp6J9Ak`^z@J>sN0BjfB>UVcJ|x%=-zb%OV7lmZc&
zxiupadZVF7)6QuhoY;;FK2b*qL0J-Rn-8!X4ZY$-ZSUXV5DFd7`T41c(#lAeLMoeT
z4%g655v@7AqT!i@)Edt5JMbN(=Q-6{=L4iG8RA%}w;&pKmtWvI4?G9pVRp|RTw`g0
zD5c12B&A2&P6Ng~8WM2eIW<Jfbv|J{U>=wxd?r7A*N+&!Be7PX3s|7~z=APxm=A?5
z<Ct?-npDR`F>t>xB4WG|*Td@VX{Rs)PV0|yK`oI3^xn(4c_j&vgxk_Y3o(-`_5o`V
zRTghg6%l@(qodXN;dB#+OKJEEvhfcnc#BeO2|E(5df-!fKDZ!%9!^BJ_4)9P+9Dq5
zK1=(v?KmIp34r?z{NEWnLB3Px{XYwy-akun4F7xTRr2^zeYW{gcK9)>aJDdU5;w5@
zak=<+-PLH-|04pelTb%ULpuuuJC7DgyT@D|p{!V!0v3KpDnRjANN12q6SUR3mb9<-
z>2r~IApQGhst<D{{Vf5G$!J9FyJ-E-*A*EoNl_C#=MT@p=Xle}_jo(=^L>Z!3*?5V
z8#)hJ0TdZ<kpQa~V_Tj9(@ze|2#~^ENp?G7Jt@Eflo`qt*qnbcmXouacBm8O7C@Os
z{Du2beNd5?Za)HL!ZE%zpwg!kyVuKIF9N=(m23`w*$5>g0M-BK#nGFP>$i=qk82DO
z7h;Ft!D5E15OgW)&%lej*?^1~2=*Z5$2VX>V{x8SC+{i10BbtUk9@I#Vi&hX)q<DD
zB<)Sw)<~oUy)n6wE>Q!LwySI{Bnv%Sm)yh{^sSVJ8&h_D-BJ_YZe5eCaAWU9b$O2c
z$T|{vWVRtOL!xC0DTc(Qbe`ItNtt5hr<)VijD0{U;T#bUEp381_y`%ZIav?kuY<v6
zX67v2C6cbIOG)5k!|qC-WsWgOfACm-q$QibFt!z9&vuM4Ra<@t5?aicE~iV+3AvjG
zIg1RVRV7=ipka+$YiU^iebt34R#X_ZP(5j+8#<1x%dt8dn8LWW*;+`tjJbED5GP+(
z;WzH$-`*O}!b)z&?E7%4&3EtV-<sTj?CD<=L+<P?A5!5CSPGi)w4A<JPgzq;6rR*O
zXNdyY$eTaKl$~z~drmdabQ@3giDtf2>G{iyYdEBPW=*xNSc;Rlt6~F4M`5G+VtOjc
z*0qGzCb@gME5udTjJA-9O<&TWd~}ysBd(eVT1-H82-doyH9RST)|+Pb{o*;$j9Tjs
zhU!IlsPsj8=(x3bAKJTopW3^6AKROHR^7wZ185wJGVhA~hEc|LP;k7NEz-@4p5o}F
z`AD6naG3(n=NF9HTH81=F+Q|JOz$7wm9I<+#BSmB@o_cLt2GkW9|?7mM;r!JZp89l
zbo!Hp8=n!XH1{GwaDU+k)pGp`C|cXkCU5%vcH)+v@0eK>%7gWxmuMu9YLlChA|_D@
zi#5zovN_!a-0?~pUV-Rj*1P)KwdU-LguR>YM&*Nen+ln8Q$?WFCJg%DY%K}2!!1FE
zDv-A%Cbwo^p(lzac&_TZ-l#9kq`mhLcY3h9ZTUVCM(Ad&=EriQY5{jJv<5K&g|*Lk
zgV%ILnf1%8V2B0E&;Sp4sYbYOvvMebLwYwzkRQ#F8GpTQq#uv=J`uaSJ34OW<hi?`
zP14gpnLmL&7H`p4L%8;nC1<^Sv9p;w<Wt?|_#g6*20U1pM3nHRfS>ITeSGo6+-8Xw
znCk*n{kdDEi)Hi&u^)~cs@iyCkFWB2SWZU|Uc%^43ZIZQ-vWNExCCtDW<BUzywAz*
z=(vxHv4d&65KOllAq-#$@U{@M^Tie>jqHs;;tWf$v{}0{p0Rvxkq``)*>+Akq%|Na
zA`@~-Vfe|+(AIlqru+7Ceh4nsVmO9p9jc8}HX^W&ViBDXT+uXbT#R#idPn&L>+#b6
zflC-4C5-X;kUnR~L>PSLh*gvL68}RBsu#2l`s_9KjUWRhiqF`j)`y`2`YU(>3bdBj
z?>iyjEhe-~$^I5!nn%B6Wh+I`FvLNv<KrLHuL|CB_`07?lkcY;`F}7N|9w?h$j;W(
z!pz0d;6Gc;=tP?z1|!0VS^mTNfuvL}h&K?b1^iwS6ciDpxQaBY5Gc}49BtNL@wSAH
zN-`fR84|MY8{n7xC}ub4B$LcEGUf*6``pjVtH+rgy&k|kpb4%YlJ~9w&{2Xuzeu1M
zq`UMUPdX@*+$axeLs?$}*bD{+cnrR~Y#}m-O=_R~Wti_#iWT_s(=ymH^VTEl)dozx
zf?Q;WOl1sf3*~dyaUWrzpj(B{DD^$`<1{1iWu+66OaaANG(C3>auve~eX<+Ipl&04
zT}};W&1a3%W?dJ2=N#0t?e+aK+%t}5q%jSLvp3jZ%?&F}nOOWr>+{GFIa%wO_2`et
z=JzoRR~}iKuuR+azPI8;Gf9)z3kyA4EIOSl!sRR$DlW}0>&?Gb<F6G*f~S_Xmd?&<
zcTp7FM31gIFJU_kMTKVvYfdLV&qNi%10o_cXO}T8c0JA~S7IvYJW{2Mc@QfYQPg<B
zM^$g#zoTZ3_m+CrHFauLo}15?s=HP)?JB{H_0#&&1l#xC0PUJEil&(!l3&4$4Oabl
z(t*+m#j)cGDV4roRR~6gh^^UTE<ORsZfX<yztwQw%!x#Q2cf-H-Ei*R_oEBu0*M}r
zaR*fDOfo;PPv!PbT7OdxPFjDUG&0BrIox?s;EQMl0XBMs85AAF!I(xegz)j_*&}F&
zVxM4&whWS?c~-!t@$rAV{-U7+LEraxy0yVT-$%W;DztTaLwSHk;hmJtJCfzd$8ZUU
zM49WF9Y$VtKqVmyl%^86>gPojmj<IO9KHY)gcrh=qY_}jG!}())PpS;BXBA!e*hSR
zilZbQ&4Xd81)(e#05gdbS)_Rc7=w(fM<O8%<WUPqvy2OZsgKBL!XxkiWU2;{7$;C6
z9R+3;R|H$*pUT7|00m@1wlw|z2Pi1^Q3j9tQwHGtFdr%Y_fp{BLtn(*#K`48rPtM-
zeUXnbzjJ6`4-eFtz^q{qhyCKLVL%|Li&oS2mxY?F!w9Q6rOe*>mnln;cTqCt=ADbE
zZ8GAnoM+S1(5$i8^O4t`ue;vO4i}z0wz-QEIVe5_u03;}-!G1NyY8;h^}y;tzY}i5
zqQr#Ur3Fy8sSa$Q0ys+f`!`+>9WbvU_I`Sj;$4{S>O3?#inLHCrtLy~!s#WXV=oVP
zeE93*Nc`PBi4q@%Ao$x4lw9vLHM!6mn3-b_cebF|n-2vt-zYVF_&sDE--J-P;2WHo
z+@n2areE0o$LjvjlV2X7ZU@j+`{*8zq`JR3gKF#EW|#+{nMyo-a>nFFTg&vhyT=b}
zDa8+v0(Dgx0yRL@ZXOYIlV<CBxH&myw-{Mgw|3Z>SZ0|MFizy0VPW8;AfA5|pe!#j
zX}Py^8fl5SyS4g1WSKKtnyP+_PoOwMMw<ZOz9;D51vwfT?(7P{J9uC+df4xbrr2zV
zB?xsPrQn{*Mv;KOgSS&5@+LuXkea1)Zq>u`(i@Z)diJp~U54*-miOchy7Z35eL>^M
z4p<-aIxH4VUZgS783@H%M7P9hX>t{|<kmOhY+3PLOlAj^>RU7$n4T(brCG#h9e9p!
z+o`i;EGGq3&pF;~5V~eBD}lC)>if$w%Vf}AFxGqO88|ApfHf&Bvu+xdG)@vuF}Yvk
z)o;~k-%+0K0g+L`Wal<EEr#pT9&cB!85zTR)doGHYQA;Y4iG{j=~O<Gh^(9@<v8&o
z)RDtDdQW7hx3ZX>a!$=ZV|z$e%>f0%XoLib%)!R^RoS+{!#X?h-<kGFCJU+*P>6uu
zF&&KxORdZU&EwQFITIRLo(7TA3W}y6X{?Y%y2j0It!ekU#<)$qghZtpcS>L3uh`Uj
z7GY;6f$9qKynP#oS3$$a{p^{D+0oJQ71`1?OAn_m8)UGZmj3l*ZI)`V-a>MKGGFG<
z&^jg#Ok%(hhm>hSrZ5;Qga4u(?^i>GiW_j9%_7M>j(^|Om$#{k+^*UL<NK}BVp#gd
zBD%KcbS)b{|CXjYx_P3_eGPFMMz1_Nk540?Us_HPBMy;cERLy0SxfOpX5xzV&cl#)
zt&&6pjk~F}sDOyA9s+XeyTf%9%I=S3nCcDp2Af;QpT#R>nEgzW_1gCICtAD^WpC`A
z{9&DXkG#01Xo)U$OC(L5Y$DQ|Q4C6CjUKk1UkPj$nXH##J{c8e#K|&{mA*;b$r0E4
zUNo0jthwA(c&N1l=PEe8Rw_8cEl|-eya9z&H3#n`B$t#+aJ03RFMzrV@gowbe8v(c
zIFM60^0&lCFO10NU4w@|61xiZ4CVXeaKjd;d?sv<gv6;u6elr3xr~p;CLrK1=+ZBw
z$Y<W|q#kt(G{*D&Qkw-trQSG}>52XM*lS8XiVjgWpRB;&U_C0g+`6B5V&w|O6B*_q
zsATxL!M}+$He)1eOWECce#eS@2n^xhlB4<_Nn?yCVEQWDs(r`|@2GqLe<#(|&P0U?
z$7V5IgpWf09uIf_RazRwC?qEqRaHyL?iiS05UiGesJy%^>-C{{ypTBI&B0-iUYhk>
zIk<5xpsuV@g|z(AZD+C-;A!fTG=df1=<%nxy(a(IS+U{ME4ZbDEBtcD_3V=icT6*_
z)>|J<Y(kIYsG`z6gyX&}qOiN*?9OL086m5X{hq6Ig4#W@iNENBRsw2^%kU7#L2|pp
zfdZ~o%7N|AM$3+2Q82nCBw2dK>?>&6%nvHhZERBtjK+s4xnut*@>G<VN!iYC0JC0F
z7C4lQZ~9^GNSctWumcZ#1X037wz24bZkA{*jo3r`%UXDS^K_+8hV8{gw8t#vVSJ;%
z9c?Ez0!rvl#uwpap|4<*d9UuKSjWn24OFIi&rE}!=sVnu59lQ~@4HIl&h;AmLia?2
zUL-?{m1bz0QDnL&$JD)qE4*iTL#%6dSw8A41Gt4doXWr-`$rleCkVKi0#mfRM!)mY
zIRoubkmw30NIQhtCJYLo7$LOA<y9EL8N*i`%g_PIUD8SHni$er{_LTTN*Wqfx<buS
zb$(_9;wX-c*`xTSvy6C@PDNyy2YoKV;`dWQKiLwa26g^kXh`=t6PlPNT~Q9So~U*X
z3nx6eVXazyi3=nJQw6@jVna;b;RTqKnF0f_=;<}6<0^@)YUgQHPBXXoP$FjaOB^=t
zP%99lNT=pyPA&53Wj%hQT>AmA5m*OTp$!^CHTr}vM4n(X1Q*;{e-Rd2BCF-u@1ZGm
z!S8hJ6L=Gl4T_SDa7Xx|-{4mxveJg=ctf`BJ*fy!yF6Dz&?w(Q_6B}WQVtNI!BVBC
zKfX<>7vd6C96}XAQmF-Jd?1Q4eTfRB3q7hCh0f!(JkdWT5<{iAE#dKy*Jxq&3a1@~
z8C||Dn2mFNyrUV|<-)C^_y7@8c2Fz+2jrae9deBDu;U}tJ{^xAdxCD248(k;dCJ%o
z`y3sADe>U%s<JWQeHbE;y_*tD>uxwwv~8A1+R$VB=Q?%U?4joI$um;aH+eCrBqpn-
z%79D_7rb;R-;-9RTrwi9dPlg8&@tfWhhZ(Vx<DUpdj8XcHW`q{6yqiIzcmGmhsxE(
z@-zez1_>&1PQ+6(huX`;M9x~LrW~~#3{j0Bh2kDU$}@!fFQej4VGkJv?M4rU^x!RU
zEwhu$!CA_iDjFjrJa`aocySDX16?~;+wgav;}Zut6Mg%C4><PTsD!{kIiIRy9=1Aw
zY;M8Eqj=FH#bx<r<6@cBrQ0UMw6e8^v|*8L6vZKlp_PoKF*td|-B1-bh(bXl;KGoy
ze|2jEKeeu5MGO@KpjsEha>}8FL?8)Kgwc(Qlj{@#2Pt0?G`$h7<q&K~4OS{ORv@L7
z>P#M+qoXtlV@d}%c&OzO+QYKK`kyXaK{U(O^2DyIXCZlNQjt0^8~8JzNGrIxhj}}M
z&~QZlbx%t;MJ(Vux;2tgNKGlAqphLq%pd}JG9uoVHUo?|hN{pLQ6Em%r*+7t^<);X
zm~6=qChlNAVXNN*Sow->*4;}T;l;D1I-5T{Bif@4_}=>l`tK;qqDdt5zvisCKhMAH
z#r}`)7VW?LZqfdmXQ%zo5bJ00{Xb9^YKrk0Nf|oIW*K@(=`o2Vndz}ZDyk{!u}PVx
zzd--+_WC*U{~DH3{?GI64IB+@On&@9X>EUAo&L+G{L^dozaI4C3G#2wr~hseW@K&g
zKWs{uHu-9Je!3;4pE>eBltKUXb^*hG8I&413)$J&{D4N%7PcloU6bn%jPxJyQL?g*
z9g+YFFEDiE`8rW^laCNzQmi7CTnPfwyg3VDHRAl>h=In6jeaVOP@!-CP60j3+#vpL
zEYmh_oP0{-gTe7Or`L6x)6w?77QVi~<VDx`fw;L`VTJ3A_($%0y%60sa5-^3a6v=d
zmmak>jD8lWN@3RHcm80iV%M1A!+Y6iHM)05iC64tb$X2lV_%Txk@0l^hZqi^%Z?#-
zE;LE0uFx)R08_S-#(wC=dS&}vj6P4>5ZWjhthP=*Hht&TdLtKDR;rXEX4*z0h74FA
zMCINqrh3Vq;s%3MC1YL`{WjIAPk<IG3Wg4CtfDQ*vr2eaPwC^O8I68!!&nV4u+r_F
z?Axith|qN&KZ99L>VL#3rj^9Pj9Ss7>7duy!9H0vYF%>1jh)EPqvlr6h%R%CxDsk|
z!BACz7E%j?bm=pH6Eaw{+suniuY7C9Ut~1cWfOX9KW9=H><&kQlinPV3h9R>3nJvK
z4L9(DRM=x;R&d#a@oFY7mB|m8h4692U5eYfcw|QKwqRsshN(q^v$4$)HgPpAJDJ`I
zkqj<O$?%~S@(l(Nhxp_sGA`$A%xJ7kG%+@Mg|5Rt6m|$*ZUs_O&tJj;*GHA8ccUky
ztdHVq5%mDvVz0+qo6-yO?&N>q(8Cd!K!+wCd=d@w%~e$=gdUgD&wj$LQ1r>-E=O@c
ze+Z$x{>6(JA-fNVr)X;*)40Eym1TtUZI1Pwwx1hUi+G1Jlk~vCYeXMNYtr)1?qwyg
zsX_e*$h?380O0<gk^3K^_@6N5|2rtFRkhT9jP1T;nZ|k<{vtt&=cCpw1{DoTNi|6*
zNR~&3M3w}sVsT4{Fd&%jPlD{py<BN_zia9hI9Ip7U3}jPq_+H)r9(in7-!PpJihq-
zw0+F|?s|8=Bj*cv!`7|95cO>0ou?0R@7-Fc59o$UvyVs4cUbujHUA>sH!}L54>`e`
zHUx#Q+Hn&Og#YVOuo*niy*GU3rH;%f``nk#NN5-xrZ34NeH$l`4@t);4(+0|Z#I>Y
z)~Kzs#exIAaf--65L0U<!Yaeav3)qH(`Lb>HT_SvV8O2WYeD>Mq^Y6L!Xu8%vnp<f
zF-ML?EA-QLj_KdFHPDT9@{&CgOfV=>ofG@w!}R7M28?i1*T&zp3X4^OMCY6(Dg<-!
zXmcGQrRgHXGYre7GfTJ)rhl|rs%abKT_Nt24_Q``XH{88NVPW+`x4ZdrMuO0iZ0g`
z%p}y};~T5gbb9SeL8BSc`SO#ixC$@QhXxZ=B}L`tP}&k?1oSPS=4%{UOHe0<_XWln
zwbl5cn(j-qK`)vGHY5B5C|QZd5)W7c@{bNVXqJ!<MH|d3P$j9&VqkPfJu|wcu=-=7
zLO~2}`PnP-On3_NOs4{hJ&4M7x*M^%g^_s4zus|ykus<=xkIi<3#(~DK73KtIuaza
zq3Ok{j4$g8v7e>!n$^ufc?N9C-BF2QK1(kv++h!>$QbAjq)_b$$PcJdV+F7hz0Hu@
zqj+}m0qn{t^tD3DfBb~0B36|Q`bs*xs|$i^G4uNUEBl4g;op-;Wl~iThgga?+dL7s
zUP(8lMO?g{GcYpDS{NM!UA8Hco?#}eNEioRBHy4`mq!Pd-9@-97|k$hpEX>xoX+dY
zDr$wfm^P&}Wu{!%?)U_(%Mn79$(ywvu*kJ9r4u|MyYLI_67U7%6Gd_vb##Nerf@>&
z8W11z$$~xEZt$dPG}+*IZky+os5Ju2eRi;1=rUEeIn>t-AzC_IGM-<V%6qUamagiO
zojXi&7L8*IWlJ}@8}`JxyIBN$u|oRzm@w!)u;`JJ^U^Jm{ZDgyeTstN3t7NGnWGqQ
z+EIa8gC_h)Zo3JN!q<7m%o03s)htD&NG?lXEop9Gx?#VaLBB3SgcFmh3F=U430hIe
z6>IXWK3^6QNU+2pe=MBn4I*R@A%-iLD<B3fBYdI`-^xCP#WQb@1Qb6Wc1|7$rrI$0
zS^}=1l87G|X!N)P#&2-ZEFApzIK5kn5Gfb61a_<ma9;3$9f;mz8FokB;4w(Y@!^ya
zu;TjxPaQyO7n{FcIlqEr<3P?P4i)3rOGc5I4};xC;qQa&0^@jzv_HUBAQxZ^AEFm5
zBTlS@?>COHTE-O^wo$sL_h{dcPl=^muAQb`_BRm};=cy{qSkui;`WSsj9%c^+bIDQ
z0`_?KX0<-=o!t{u(Ln)v><gH1Bl$-3LHEdi^mWSpxtdQaHDvah(~iGD|9egFyL#F~
z?1$GP^^-sUm)!YZ{+<LJ&0K7L{O*24r%mkt6Kzb6pOOS-L<yFAV=ioJ*0iBM>%VGL
z0pC=GB7*AQ?N7N{ut*a%MH-tdtNmNC+Yf$|KS)BW(g<Pdkm%)j?It(+K5~xj$IK!{
zp+6C1gYJS33v`?Xo$~~77W&2eyGU19<m6FlNMw~43kikpZyKWVZ@N_*wAeJPS%sqH
z40>QJ*z$d{+{j?(e&hgTy^2|AR9vx1Xre2fagGv0YXWqtNkg*v%40v?BJBt|f9wX5
z{QTlCM}b-0{mV?IG>TW_BdviUKhtosrBqdfq&Frdz>cF~yK{P@(w{Vr7z2qKFwLhc
zQuogKO@~YwyS9%+d-zD7mJG~@?EFJLSn!a&mhE5$_4xBl&6QHMzL?CdzEnC~C3$X@
zvY!{_GR06ep5;<#cKCSJ%srxX=+pn?ywDwtJ2{TV;0DKBO2t++B(tIO4)Wh`rD13P
z4fE$#%zkd=UzOB<aY_AGMe|>7<sX$yw1SM(554I3!t(H9Z3X!Z5et+k4_h&Thynx!
ziTh5;c|GaYsH@W32G0xdhr&KvFp@Nsj=Jm7RQGd8dhSlG9l$e(+pLSdq<w-kZ7W-l
z4}DfW=)F=&(6?c$=|K`W>4gi=-*CuID&Z3zI^-`4<B}t|X$V`AU_-K%EPFL_9twl_
zImK{&j6sLpH7WP2;B>U^S?dHxK8fP*;fE|a(KYMgMUo`T<LMf=xb>HIS1f!*6dOI2
zFjC3<eBkTsS?s~jirYF@Nrja2AilB1Grgok(Q{J!-yMdCbS>O=-AL`<v=HnY;qaP=
z%r%pyN;W|G{!13k5HgL{&1MZLLmT$ii@iPy4pvmSTAaNGh_&&=IOx}7|G?LkOre&5
ze$)ZX{wLMvAK$Ose+l($RV_DU6^yOKl5CcA-LOWBeC<Ob3qkCr#=PoHO}Zi7*=4i{
zhy`P5@RD?fbbG{Ww-4~nmy*sK0y=*cjfuaB@ZGQIADDEhtuw}4>6=9pp;`CYPTdVX
z8(*?V&%QoipuH0>WKlL8<OkjufLT@|_$itWfOgQQ1sL1my9jh5H&Aqu0?B`H1h6nM
zGosuE%;~qrp)5qiJt5TkvuY!P5&J7R#=`qhgmgE%$-tyU1wv>A*zTKckD!paN@~hh
zmXzm~qZhMGV<pYA7aBO1=rl8ewvtPYii`7)>dQGd=AG8&20HW0RGV8X{$9LldFZYm
zE?}`Q3i?xJRz43S?VFMmqRyvWaS#(~Lempg9nTM$EFDP(Gzx#$r)W&lpFKqc<y(GB
zQ)iBAS`N=DS_sKSXI55P(lHkwH)@Q89oTUc{-vd@Ue^>AoJh-AxEw$-bjW>`_+gEi
z2w`99#UbFZGiQjS8kj~@PGqpsPX`T{YO<m!OQBo2H4HfqK(14BF?XJ?$*LTUG|=XE
z%mXZ(W0fCU1)eodty#XEHD<Wgj8lxa(h;^9fW-UTt2UUeP&QC*>j`CaEqTFag;$jY
z8_{Wzz>HXx&G*Dx<5skhpETxIdhKH?DtY@b9l8$l?UkM#J-Snmts7bd7xayKTFJ(u
zyAT&@6cAYcs{PBfpqZa%sxhJ5nSZBPji?Zlf&}#L?t)vC4X5VLp%~fz2Sx<*oN<7`
z?ge=k<=X7r<~F7Tvp9#HB{!mA!QWBOf%EiSJ6KIF8QZNjg&x~-%e*tflL(ji_S^sO
ztmib1rp09uon}RcsFi#k)oLs@$?vs(i>5k3YN%$T(5Or(TZ5JW9mA6mIMD08=749$
z!d+l*iu{Il7^Yu}H;lgw=En1sJpCKPSqTCHy4(f&NPelr31^*l%KHq^QE>z>Ks_bH
zjbD?({~8Din7IvZeJ>8Ey=e;I?thpzD=zE5UHeO|neioJwG;IyLk?xOz(yO&0DTU~
z^#)xcs|s>Flgmp;SmYJ4g(|HMu3v7#<SE)AQQ4Qptn#EiMmi^p+Ei-7DkP+HtEnTf
z=ihEK*Qz1Ztg&vmf~iSIJvy;UM*1)`@_lXyy`^7!LwVESw8;X~PDDY`x##uAQg`Ks
z=exD#_B;<%p^vKWmo{iCKEmMLveyeSGEVy6BOKJM`)TA4((F&!tIH99l|b!Je=-ox
zTnV0#B=>;c*Aa8iF#UZo7CvDq4>8#qLJ|YdZ!AsH%^_7N1IQjCro<P>K7UpUK$>l@
zw`1S}(D?mUXu_C{wupRS-jiX~w=Uqqhf|Vb3Cm9L=<ElBA0am3@5~_j4>T+w91Cu^
z*&Ty%sN?x*h~mJc4g~k{xD4ZmF%FXZNC;oVDwLZ_WvrnzY|{v8hc1nmx4^<U$9gKZ
zXD;2On@pVz(zumYHw-cDICc(jj*0Lh@n}B_NB#b3S3Y`joU*Hg4D^tTlnR`X)wKiE
z<1(Wo3^4=KAZ|hOq%hItOAv2Yuv6FZW5>}Z;yriXsAf+Lp+OFLbR!&Ox?x<j>ABwl
zu8w&|5pCxmu#$?Cv2_-Vghl2LZ6m7}VLEfR5o2Ou$x02uA-%QB2$c(c1rH3R9hesc
zfpn#oqpbKuVsdfV#cv@5pV4^f_!WS+F>SV6N0JQ9E!T90EX((_{bSSFv9ld%I0&}9
zH&Jd4MEX1e0iqDtq~h?DBrxQX1iI0lIs<|kB$Yrh&cpeK0-^K%=FBsCBT46@h<vzQ
zR*F0_2h6Z11Y9Zzju}{_b;!1TT<IfdPup&BfzJrm?bo16`)8Tm1upGj;JDjh>#<Oy
z8t{(&!50W|bIsS-1avP5+*F7gID<ta6$a=EvLN_Gn#i|E*V;GvV|e9HOQ9zf{;k4c
z6E1|)o(`{GoSr=usmv~(8ivQq9h)k@J&LDI;_p0b!?ZsmFktoKIgcIE+yq!s*%$%)
z#ay6sWeCZZ>yi!AyDq1V(#V}^;{{V<B^hU0(%wtueKZDT`@J-_3mSLDBmaq~d|XD<
z7XM`Dzdz4EMpggFBJu`~KkDTFp@A11CGaoLc9C!MrY0!YzwKdGfx7t=5D@|7VVCk+
z^B@Uld)n>*@T4WJ&U-NTq43w=|K>z8%pr_nC>%C(Wa_l78Ufib$r8Od)IIN=u>417
z`Hl{9A$mI5A(;+-Q&$F&h-@;NR>Z<2U;Y21>>Z;s@0V@SbkMQQj%_;~+qTuQ?c|AV
zcWm3XZQHhP&R%QWarS%mJ!9R^&!_)*s(v+VR@I#QrAT}`17Y+l<`b-nvmDNW`De%y
zrwTZ9EJrj1AFA>B`1jYDow}~*dfPs}IZMO3=a{Fy#IOILc8F0;JS4x(k-NSpbN@qM
z`@aE_e}5{!$v3+qVs7u?sOV(y@1Os*Fgu`fCW9=G@F_#VQ%xf$hj0~wnnP0$hFI+@
zkQj~v#V>xn)u??YutKsX><O0LtJ*CTAYt%cbdwrkH)-u>pxKCl^p!C-o?+9;!Nug^
z{rP!|+KsP5%uF;ZCa5F;O^9TGac=<e#17!Ngq|n{jbexb$Js+_Pf5VvAs8ZxqWBs8
zB0&S13$r1<F)VBk$e5stmC}hMzOSHBA0t(kR!@rbH|QU6*IH0=h|_OeT6wpDmdv23
z34f|!XhwEd(Kkc%FUYK@H{gz+bBn7dd5Fl+jfYh}4;N_qJ7#NlT2O+)R4Pxe>M|=V
z_H(PfkV1rz4jl?gJ(ArXMyWT4y(86d3`$iI4^l9`vLdZkzpznSd5Ikfrs8qcSy&>z
zTIZgWZGXw0n9ibQxYWE@gI0(3#KA-dAdPcsL_|hg2@~C!VZDM}5;v_Nykfq!*@*Zf
zE_wVgx82GMDryKO{U{D>vSzSc%B~|cjDQrt5BN=Ugpsf8H8f1lR4SGo#hCuXPL;QQ
z#~<Fi-k$qL|7RVJ1Mm1Ag<$gB0(}Grl3S(KpYI)5f?u3vC7#V;7XHhGB(-?~L6q0H
z3n(8fq$!!dl{5-8`7Oj@C;&S?r8OD`wLCHxqX2ux!ard>b?C4MoepT3X`qdW2dNn&
z<Us;@%o6IHN`m_LE5@ycr7sZ5iN)2(z?4TB28{O!&JjxPf+H!xrjo)qWAjZifsc&}
zRUE<g)K;7gXjcN#UR5gfzb2nN2z_KhXjz-ToEUcM{^o{zAJg2QnR<7bsn(YUNIKF@
z%f>o8)K}%Lpu>0tQei+{<z4bzXqy$)s2?v=pRVAvTjJpopW)-K>>*VGErz|qjbK#9
zvtd8rcHplw%YyQCKR{kyo6fgg!)6tHUYT(L>B7er5)41iG`j$qe*kSh$fY!PehLcD
zWeKZHn<492B34*JUQh=CY1R~jT9Jt=k=jCU2=SL&&y5QI2uAG2?L8qd2U(^AW#{(x
zThSy=C#>k+QMo^7caQcpU?Qn}j-`s?1vXuzG#j8(A+RUAY})F@=r&F(8nI&HspAy4
z4>(M>hI9c7?DCW8rw6|23?qQMSq?*Vx?v30U%luBo)B-k2mkL)Ljk5xUha3pK>EEj
z@(;tH|M@xkuN?gsz;*bygizwYR!6=(Xgcg^>WlGtRYCozY<<HB2ZgQ)h*70uL|pZ2
z>rFX2E>kaZo)O<^J7a`MX8Pf`gBd4vrtD|qKn&B)C&wp0O-x*@-|m*0egT=-t@%dD
zgP2D+#WPptnc;_ugD6%zN}Z+X4=c61XNLb7L1gWd8;NHrBXwJ7s0ce#lWnnFUMTR&
z1_R9Fin4!d17d4jpKcfh?MKRxxQk$@)*hradH2$3)nyXep<KqaI3?gC#eK%i>5Z;B
z?yX+-Bd=TqO2!11?MDtG0n(*T^!CIiF@ZQymqq1wPM_X$Iu9-P=^}v7npvvPBu!d$
z7K?@CsA8H38+zjA@{;{kG)#AHME>Ix<711_iQ@WWMObXyVO)a&^qE1GqpP47Q|_AG
zP`(AD&r!V^MXQ^e+*n5~Lp9!B+#y3#f8J^5!iC@3Y@P`;FoUH{G*pj*q7MVV)29+j
z>BC`a|1@U_v%%o9VH_HsSnM`jZ-&CDvbiqDg)tQEnV>b%Ptm)T|1?TrpIl)Y$LnG_
zzKi5j2Fx^K^PG1=*?GhK;$(UCF-tM~^=Z*+Wp{FSuy7iHt9#4n(sUuH<I4zjuT^*g
zTOj)T0YzMUS}8lWAh=#M-<;W0su6sG+MC}XSqZj-`H_&HD?2Z|qE8QdIM(X#NffsX
zuwQ<4gm=;OFM%U76&;OT!|+5x?B#F>K??@v+6*|10Csdnyg9hAsC5_OrSL;jVkLlf
zHXIPukLqbhs~-*oa^gqgvtpgTk_7Gy<tuXYyQ=x)UZpuI%V@LXq4|I6hehHuRD*Qk
z+@3~62V_$Rc&ax*Fe`-_r5e~NV8Dh7^Qkc47_lIobqdmfrCP}{2YGKf1@Bh!U!R!l
zVh{a+_vh}x%?yACZT@@6M}$iwzkin{Cukrb?*D6_{O__PU~FPz|81A`Ki4JuH6?`4
zsEYmO+F8y*acunUpGD%Hp$-=5%jS}CI-%I;>pwH><53riYYL*M=Q@F-yEPLqQ&1Sc
zZB%w}T~RO|#jFjMWcKMZccxm-SL)s_ig?OC?y_~gLFj{n8D$J_Kw%{r0oB8?@dWzn
zB528d-wUBQzrrS<hDznyNCgL-qOb|p_nS%fSb5_Ze4V1&Bgk3Vp>SL<Uo&kY+&uyM
z)HW-LQUfYYW$q$nSg9;!DvA>q?fR!K%59Zv9J4yCQhhDGwhptpA5O5U?Hjqt>8nOD
zi{)0CI|&Gu%zunGI*XFZh(ix)q${jT8wnnzbBMPYVJc4HX*9d^mz|21$=R$J$(y7V
zo0dxdbX3N#=F$zjstTf*t8vL)2*{XH!+<2IJ1VVFa67|{?LP<VPiuN~9#0c!YR12a
zo;Lj{EKN(4nyG=Ui#tn@dJ(*;md5Ze)kd6mTF$dEZFv=nYr2Lr(==N%ad&FBajgz2
z6_^{6U)eu_=!Ix#bP)AA_tuIc?3=_Mq548F*;t3>&P41h$2i2;?N~RA30LV`BsUcj
zfO9#Pg1$t}7zpv#&)8`mis3~o+P(DxOMgz-V*(?wWaxi?R=NhtW}<#^Z?(BhSwyar
zG|A#Q7wh4OfK<|DAcl9THc-W4*>J4nTevsD%dkj`U~wSUCh15?_N@uMdF^Kw+{agk
zJ`im^wDqj`Ev)W3k3stasP`88-M0ZBs7;B6{-tSm3>I@_e-QfT?7|n0D~0RRqDb^G
zyHb=is;IwuQ&ITzL4KsP@Z`b$d%B0W<OzS}HhQ;U8}SDa=0^@=1zh?Ebwi(h3u<mG
zLDfZ57#t!A)c-~Y!O{*PdxzDz>uhioo1CWttW8yhsER1ZUZzA{F*K=wmi-sb#Ju+j
z-l@In^IKnb{bQG}Ps>+Vu_W#grNKNGto+yjA)<V{IjYd_d3*YH={C)J_}OyeI6i_H
zu|j9WcfZFU?Sgyac7umALud#l=-0F(fRl)O28qU<%HpKBh#im&8;uWW953cFj~Iy&
zlr`ZkkzG;HtUSI4CRZRj#8C;N*{hU?ezHzcM|gV*@iklj`$SU@402&viLPBGiWFL2
zQ6uyKx<>?>0?~X`4I3T@5G1)RqGUZuP^NJCq&^HykuYtMDD8qq+l8RcZNJsvN(10{
zQ1$XcGt}QH-U^WU!-wRR1d--{B$%vY{JLWIV%P<Wi12|;({s?oRC80?)@wj6UxV~n
zk2qyqDc_<v8oEujhH&m^Jq#Lrp)Z$Oj#rxlpUicHLpt@6dZF=U?wW~T_o?IN;ie!I
zV-bY1+kei0nc9KpW>4-KQuxxDeJaF#{eu&&r!3Qu{w}0f--8^H|KwE>)ORrcR+2Qf
zb})DRcH>k0zWK8@{RX}NYvTF;E~phK{+F;MkIP$)T$93Ba2R2TvKc>`D??#mv9wg$
zd~|-`Qx5LwwsZ2hb*Rt4S9dsF%Cny5<1fscy~)d;0m2r$f=83<->c~!GNyb!U)PA;
zq^!`@@)UaG)Ew(9V?5ZBq#c%dCWZrplmuM`o~TyHjAIMh0*#1{B>K4po-dx$Tk-Cq
z=WZDkP5x2W&Os`N8KiYHRH#UY*n|nvd<ovggige3u`1qGi1+Y8X!3s{W#*m=tX&CV
zNWQ(*z*>(U>yO=MFI-2BEp?x@=N<~CbLJBf6P)}vLS?xJXYJ2^<3KJUdrwKnJnTp{
zjIi|R=L7rn9b*D#Xxr4*R<3T5AuOS+#U8hNlfo&^9JO{VbH!v9^JbK=TCGR-5EWR@
zN8T-_I|&@<P6ysJp1u%bVccl?q?sU4Onn?IFII0`6;jp*_+1Vcjf$mX{%JA^!$Gkf
z>A}(hKeL4_*eb!1G8p~&_Im8|wc>Cdir+gg90n1dw?QaXcx6Op_W1r=axR<Rt0$d-
z&gdORS`9;Z%6j=d$PU%VL0xT-jF-dHo&#w}>w>4;rM*UOpT#Eb9xU1IiWo@h?|5uP
zka>-XW0Ikp@dIe;MN8B01a7+5V@h3WN{J=HJ*pe0uwQ3S&MyWFni47X32Q7SyCTNQ
z+sR!_9IZa5!&GTf&V$`q!%H8ci!a|RMx5}5MA_kr+bhtQy{-^)(hCVa@I!^TV4RBi
zAFa!Nsi3y37I5EK;0cqu|9MR<Vh>j<^r&h1lF}u0KpKQD^5Y+LvFEwM<n%Y4Ns0&r
z#Pgp7tfaM#i}k;d-@gi@qNBc}@xL(OgxbkB%Zc*U!8(yY_d_z4QrJ%DIL^_}pG(C;
zxV&Dt0*#6mW+VnKpUKH&)*t(_EhJ1#-d4~Kom-)N+kGAW3vl$z=E{EB!4#iw1#JGZ
zpZv7B?(+0N;`4s@&;+D$6BOaTPLlV-MY35`gn~5zS!mCgh|W$2sr@*jRa}74{|6)>
zLU@@v4_Na#Axy6tn3P%sD^5P#<7F;sd$f4a7L<t5V42bo`*JV+&3HWm9OI@30?%Oh
z5o+B(*v(C-H_!6}Lzhp-kE~j|H(u&BA@KX~k?60QV5NR)N2OJYIOG(f(FG`kmvdU7
zwM#zp&<w6$6785wBe4}t?5yT4MP5N47S8;*P_q6hn|Wj2S~%IPE(O9P2?RAKY>BMk
zGU^RZHBcxSA%kCx*eH&wgA?Qwazm8>9SCSz_!;MqY-QX<1@p$*T8lc?@`ikEqJ>#w
zcG``^CoFMAhdEXT9qt47g0IZkaU)4R7wkGs^Ax}usqJ5HfDYAV$!=6?>J6+Ha1I<5
z|6=9<InPN?WUE(MBZF{EDJ@lH!7KCH_7xC@JvfZL@&nc!7K2v>soU4>E))tW$<#>F
ziZ$6>K<f#VmS*<VCLk5Snrr-`d{Bp+A{=r<v#~0tw_zC-WYWg-s*<dPsHVYZm|7R#
z>Jf0bPfbx_)7-}tMINlc=}|H+$uX)mhC6-Hz+XZxsKd^b?RFB6et}O#+>Wmw9Ec9)
z{q}XFWp{3@qmyK*Jvzpyqv57LIR;hPXKsrh{G?&dRjF%Zt5<eso5q2Qq7|ChevXoo
zUUuocw%BMFdbc16MLS>&m20Ll?Oy<ul%w2Zua&zkQjQpssgWs#RKAL}6i`eHUzx7p
zSoNx{P@%ayUjixVqBLi(th$z4mR4dC*OaQENb9y_y<R>fUYC3WRn{cgQ?^V~UAv+5
z&_m#&nIwffgX1*Z2#5^Kl4DbE#NrD&Hi4|7SPqZ}(>_+JMz=s|k77aEL}<=0Zfb)a
z%F(*L3zCA<=xO)2U3B|pcTqDbBoFp>QyAEU(jMu8(jLA61-H!ucI804+B!$E^cQQa
z)_ERrW3g!B9iLb3nn3dlkvD7KsY?sRvls3QC0qPi>o<)GHx%4Xb$5a3GBTJ(k@`e@
z$RUa^%S15^1oLEmA=sayrP5;9qtf!Z1*?e$ORVPsXpL{j<cf(aoOxSwSjBR1mea0e
z^c3Q=wn8)%*koW31D%}j9dO0gR7Y@b_00J@vMv~iauOuPVkQGKG^LpkGueXj*z<(H
zv6PsTl$Bd^g^4yr-LK|PFHGhgs2bdrT$8)TR7nc(mnkPw6vrbaA_vA0JdMNVsoX7X
zzI)+QrS1A&rLp4hm_yCc#tP)G{!hBfUV@*{)uh;EQVyvIbs@16JE!9Psq3UaXMGQ^
zJ^Y2zHl3Zl1idw?=a56!^Y{sB%B-k(>L<6E)0sj&swP3}NPmR%FM?O>SQgN5XfHE<
zo(4#Cv11(%Nnw_{_Ro}r6=gKd{k?NebJ~<~Kv0r(r0qe4n3LFx$5%x(BKvrz$m?LG
zjLIc;hbj0FMdb9aH9Lpsof#yG$(0sG2%RL;d(n>;#jb!R_+dad+K;Ccw!|RY?uS(a
zj~?=&M!4C(5LnlH6k%aYvz@7?xRa^2gml%vn&eKl$R_lJ+e|xsNfXzr#xuh(>`}9g
zLHSyiFwK^-p!;p$yt7$F|3*IfO3Mlu9e>Dpx8O`37?fA`cj`C0B-m9uRhJjs^mRp#
zWB;Aj6|G^1V6`jg7#7V9UFvnB4((nIwG?k%c7h`?0tS8J3Bn0t#pb#SA}N-|45$-j
z$R>%7cc2ebAClXc(&0UtHX<>pd)akR3Kx_cK+n<}FhzmTx!8e9^u2e4%x{>T6pQ`6
zO182bh$-W5A3^wos0SV_TgPmF4WUP-+D25KjbC{y_6W_9I2_vNKwU(^qSdn&>^=*t
z&uvp*@c8#2*paD!ZMCi3;K<prH4a&u_&GPH7Wp*~+CzWZpzTbL&)4k=HC|E1=uT#O
zPuYcj5Jbb}c9+j=4foN!N-fSV`(c<30`iukl213!u4qI^cD3Ytg!~P3N1S(`-3^yF
zlFzoPUGJT0f@f{`ZDabbR@mNt3*M;G<P@?W{8}}Lg`Hvy7&Jmu=04fd1erYTPEwb=
z15%g^dl!<DS<}y*RUs*g6?{>{Na;I4Q35zw$YrW5U@Kk~)&rw;G?d7Q&c9|x<<Maw
zgBLn8*CoGPVj_^rpPn~ENvB}-`?vqbUaC%e+l73U+D^-Cb&Pj13W#6g!+x^$_tX)*
z@cs9Wk+{Dzx8NJw-G7(M|EOb>Hg|CNMsxovmfth*|E*GHezPTWa^Hd^F4!B3sF;)?
z(NaPyAhocu1jUe(!5Cy|dh|W2=!@fNmuNOzxi^tE_jAtzNJ0JR-avc_H|ve#KO}#S
z#a(8secu|^Tx553d4r@3#6^MHbH)vmiBpn0X^29xEv!Vuh1n(Sr5I0V&`jA2;WS|Y
zbf0e}X|)wA-Pf5gBZ>r4YX3Mav1kKY(ulAJ0Q*jB)YhviHK)w!TJsi3^dMa$L@^{`
z_De`fF4;M87vM3Ph9SzCoCi$#Fsd38u!^0#*sPful^p5oI(xGU?yeYjn;Hq1!wzFk
zG&2w}W3`AX4bxoVm03y>ts{KaDf!}b&7$(P4KAMP=vK5?1In^-YYNtx1f#}+2QK@h
zeSeAI@E6<gTYtZud5<DeMJ>Z8a?)>sZ`fbq9_snl6LCu6g>o)rO;ijp3|$vig+4t}
zylEo7$SEW<_U+qgVcaVhk+4k+C9THI5V10qV*dOV<w;)q>6pPtAI$)QN{!JRBKh-D
zk2^{j@bZ}yqW?<#VVuI_27*cI-V~sJiqQv&m07+10XF+#ZnIJdr8t`9s_EE;T2V;B
z4UnQUH9EdX%zwh-5&wflY#ve!IWt0UE-My3?L#^Bh%kcgP1q{<CvcqqQ2utk7s%sV
zoGl}#Zl)W@RNSAAf;w-DBO+*e0HO2%x-G=Z;*Pl$zHy^xW)%na$gbyTIw>&26eXLn
zTkjJ*w+(|_>Pq0v8{%nX$QZbf)tbJaLY$03;MO=Ic-uqYUmUCuXD>J>o6BCRF=xa%
z3R4SK9#t1!K4I_d>tZgE>&+kZ?Q}1qo4&h%U$GfY058s%*=!kac{0Z+4Hwm!)pFLR
zJ+5*OpgWUrm0FPI2ib4NPJ+Sk07j(`diti^i#kh&f}i>P4~|d?RFb#!JN)~<!~Z`*
zivNy#7kzu<{{tyjD6d)1{g;-B-EK2+0;|?2Nj`=2hUDsRiVj-}RAJN{d@x~38|)#_
zx&F#UxFFdbXxE(|#84p;-%dtBDbgEpl>D@)beox}bw?4VCf^y*`2{4`-@%SFTry2h
z>9VBc9#JxEs1+0i2^LR@B1J`B9Ac=#FW=(?2;5;#U$0E0UNag_!jY$&2diQk_n)bT
zl5Me_SUvqUjwCqmVcyb`igygB_4YUB*m$h5oeKv3uIF0s<pxU7XrV;DR{UhyjHRs5
zb+8Qf7A65FfQ?d1ZT2w}F_l*Eb)?ah<8c%Yy;Eal4{xBsX^nN@Pe5CxcymxUwL?eh
zv9_Z0XXBqZl6EhcKDo~Ou&%?PpG{{$wPe(7oy?yZ1mnWmr0b~pN$igR!(Rx*QN$iy
z=-Re}qI2g(ku?t~HgBj3V=|H$hiN2{j!P%zCB+1x34pnjx#?&{ENcU`o_2tynp}0U
zKI9mTgI{WS`?XY!3FH!0Q>k}~es!{D>4r%PC*F~FN3owq5e0|Y<Du-bB4EU)q{6=q
z#<0gBE8S|!ZrmQeH3JgM^AxLU0k8cAwCY-9?0w8gxwWKqzGP>eUTSG#Vq%&Gk7uwW
z0lDo#_wvflqHeRm*}l?}o;EILszBt|EW*zNPmq#?4A<L0#2f-Fpgzo6i9m?Cv{^Fe
z9>+&i0xx^?9obLyY4xx=Y9&^G;xYXYPxG)DOpPg!i_Ccl#3L}6xAAZzNhPK1XaC_~
z!A|mlo?Be*8Nn=a+FhgpOj@G7yYs(Qk(8&|h@_>w8Y^r<TLc~8#)=w@0;xlrL@mM3
zg*K(X)@-O)lt;P?5e(;WTL%O;a;rQNAE5;DqERSyAXc1biP%NUWXy?=-B^)wQ=+I4
zU%qA-ghSXXn27E3w8NMG!5XHJY>&5nCqe0V60rRz?b5%J;GYeBqSAjo|K692GxD4`
zRZyM2FdI+-jK2}WAZTZ()w_)V{n5tEb@>+JYluDozCb$fA4H)$bzg(Ux{*hXurjO^
zwAxc+UXu=&JV*E59}h3kzQPG4M)X8E*<G*nR0cBsK+3(q5`<{N)Z$_eT#;miD(s%h
z{fdYdgo~K&tWs<DY?yIi#?k!bT;M<ZDoV|<xhf7jcRFXDXl`LtGFz=LPAW$(hAEz}
zq@oGhJoeM0w4KvLTg%>}#_&}w*KEg<F5P|-B$Y<3$zfM|>tX)cU{vm9b$atHa;s>|
z+L6&cn8xUL*OSjx4YGjf6{Eq+Q3{!ZyhrL&^6Vz@jGbI%cAM9GkmFlamTbcQGvOlL
zmJ?(FI)c86=JEs|*;?h~o)88>12nXlpMR4@yh%qdwFNpct;vMlc=;{FSo*apJ;p}!
zAX~t;3tb~VuP|ZW;<Q&V!o{~5>z$=IHf->F@Ml)&-&Bnb{iQyE#;GZ@C$PzE<QuI)
zv_|vm%I_n1Dpq6lr--l%%tq7K!<v~?55k`WhA?y(q<f`<c6%@dUw9u~aA?j^`pueW
zW?_3n{u)d4@AQyf;UHiIRxp16RoWg&F+uwIJYB{!Spu!Z6TFEXau<!8UfawC4vbZv
zJTpZLC-RhzHO9xSd6HqYzkfjT+8e>f6~q}4D>9jic@mTO5x76ulDz@+XAcm35!VSu
zT*Gs>;f0b2TNpjU_BjHZ&S6Sqk6V1370+!eppV2H+FY!q*n=GHQ!9Rn6MjY!Jc77A
zG7Y!lFp8?TIHN!LXO?gCnsYM-gQxsm=Ek**V<u>mZu7vnuufD7K~GIxfxbsQ@qv2T
zPa`tvHB$fFCyZl>3oYg?_wW)C>^_iDOc^B7klnTOoytQH18WkOk)L2BSD0r%xgRSW
zQS9elF^?O=_@|58zKLK;(f77l-Zzu}4{fXed2saq!5k#UZAoDBqYQS{sn@j@Vtp|$
zG%gnZ$U|9@u#w1@11Sjl8ze^Co=)7yS(}=;68a3~g;NDe_X^}yJj;~s8xq9ahQ5_r
zxAlTMnep*)w1e(TG%tWsjo3RR;yVGPEO4V{Zp?=a_0R#=V^ioQu4YL=BO4r0$$XTX
zZfnw#_$V}sDAIDrezGQ+h?q24St0QNug_?{s-pI(^jg`#JRxM1YBV;a@@JQvH8*>>
zIJvku74E0NlXkYe_624>znU0J@L<-c=G#F3k4A_)*;ky!C(^uZfj%WB3-*{*B$?9+
zDm$WFp=0(xnt6`vDQV3Jl5f&R(Mp};;q8d3I%Kn>Kx=^;uSVCw0L=gw53%Bp==8Sw
zxtx=cs!^-_+i{2OK`Q;913+AXc_&Z5$@z3<)So0CU3;JAv=H?@Zpi~riQ{z-zLtVL
z!oF<}@IgJp)Iyz1zVJ42!SPHSkjYNS4%ulVVIXdRuiZ@5Mx8LJS}J#qD^Zi_xQ<pa
zK0_C<`%bp5M~CVCk7hV^j*M;Wzcj7kCsCfgg5CJ~2`y3|66=yp|GC7FJNP7A_Wc+(
zejiW#M^nRp{rUgmIRC{MB`ST%d>@>DKDr-_e#>5h3dtje*NcwH_h;i{Sx7}dkdpuW
z(yUCjckQsagv*QGMSi9u1`Z|V^}Wjf7B@q%j2DQXyd0nOyqg%m{CK_lAoKlJ7#8M}
z%IvR?Vh$6aDWK2W!=i?*<77q&B8O&3?zP(Cs@kapc)&p7En?J;t-TX9abGT#H?TW?
ztO5(lPKRuC7fs}zwcUKbRh=7E8wzTsa#Z{a`WR}?UZ%!HohN}d&xJ=JQhpO1PI#>X
zHkb>pW04pU%Bj_mf~U}1F1=wxdBZu1790>3Dm44bQ#F=T4V3&HlOLsGH)+AK$cHk6
zia$=$kog?)07HCL*PI6}DRhpM^*%I*kHM<#1Se+AQ!!xyhcy6j7`iDX7Z-2i73_n#
zas*?7LkxS<P{ZTB#tR|&N^U;Moy2#JwwW4RFPddYtD_bw0R1|Eo=5;j>-XSqv;YBa
zW_n*32D(HTYQ0$feV_Fru1ZxW0g&iwqixPX3=9t4o)o|kOo79V$?$uh?#8<F`E$a_
zutly1{7L1J@Y@6Vp*~KB!yXMF2QHqby@+ZG8+ND)X+s9is!(NOe)h&%h+bxjPFhwq
z$60~SJQ<aykcGl3;BUCZ>Q8e>4e)V6;_(x&ViUVxma+i25qea;d-oK7ouuDsB^ab{
zu1qjQ%`n56VtxBE#0qAzb7lph`Eb-}TYpXB!H-}3Ykqyp`otprp<BVE0iW(NxIg}T
zHQz%!Kt^1I=QqGS(r32Y=&DuF_0#yaLgW`I>7{VEuW*^IR2n$Fb99*nAtqT&oOFIf
z@w*6>YvOGw@Ja?Pp1=whZqydzx@9X4n^2!n83C5{C?G@|E?&$?p*g68)kNvUTJ)I6
z1Q|(#UuP6p<iHaJ>j78GUxq11m-GSszc+)X{C2eo-?8ud9sB=3(D47v?`JAa{V(IF
zPZQ_0AY*9M97>Jf<<kmA6!?J&2x7=_q{>o%#O_%Wq}8>YM=q0|tGY+hlXcpE=Z4Od
z`NT7Hu2hnvRoqOw@g1f=bv`+nba{GwA$Ak0INlqI1k<9!x_!sL()h?hEWoWrdU3w`
zZ%%)VR+Bc@_v!C#koM1p-3v_^L6)_Ktj4HE>aUh%2XZE@JFMOn)J~c`_7VWNb9c-N
z2b|SZMR4Z@E7j&q&9(6H3yjEu6HV7{2!1t0lgizD;mZ9$r(r7W5G$ky@w(T_dFnOD
z*p#+z$@pKE+>o@%eT(2-p_C}wbQ5s(%Sn_{$HDN@MB+Ev?t@3dPy`%TZ!z}AThZSu
zN<1i$s<Qod+!u+1TpqzHMAR;(P|C33h|NdU1+@toT{?QhAJAzzUDj;ch>iJhXFdjV
zP*y|V<`V8t=h#XTRUR~5`c`Z9^-`*BZf?WAehGdg)E2Je)hqFa!k{V(u+(hTf^Yq&
zoruUh2(^3pe)2{bvt4&4Y9CY3js<F>)PUHtd4rVG57}uFJL)D(JfSIo^{P=7liFXG
zq5yqgof0V8paQcP!gy+;^pp-DA5pj=gbMN0eW=-eY+N8~y+G>t+x}oa!5r>tW$xhI
zPQSv=pi;~653Gvf6~*JcQ%t1xOrH2l3Zy@8AoJ+wz@daW@m7?%LXkr!bw9GY@ns3e
zSfuWF_gkWnesv?s3I`@}NgE2xwgs&rj?k<VL?gG5MC{Nmj1vZX?3e8O$&f`#KcfCT
zD|dGfAH<9vQYUE_U}e#K2epdwK03De5{_327SI@sw~J+|<wi@;rZX!9Y2MH7_L7?E
z^an@e4GxaY9F(p#Ot=#L(YG%x=Gq!vNbxtM=IXzPyPmYSGU#`>H-FEy82=O8`+szN
ziHch`vvS`zNfap14!&#i9H@wF7}yIPm=UB%(o(}F{wsZ(wA0nJ2aD^@B41>>o-_U6
zUqD~vdo48S8~FTb^+%#zcbQiiYoDKYcj&$#^;Smmb+Ljp(L=1Kt_J!;0s%1|JK}Wi
z;={~oL!foo5n8=}rs6MmUW~R&;SIJO3TL4Ky?kh+b2rT9B1Jl4<n8E`j>>#Uh-Bec
z`Hsp<==#UEW6pGPhNk8H!!DUQR~#F9jEMI6T*OWfN^Ze&X(4nV$wa8QUJ>oTkruH#
zm~O<`J7Wxseo@FqaZMl#Y(mrFW9AHM9Kb|XBMqaZ2a)DvJgYipkDD_VUF_PKd~dT7
z#02}bBfPn9a!X!O#83=lbJSK#E}K&yx-HI#T6ua)6o0{|={*HFusCkHzs|Fn&|C3H
zBck1cmfcWVUN&i>X$YU^Sn6k2H;r3zuXbJFz)r5~3$d$tUj(l1?o={MM){kjgqXRO
zc5R*#{;V7AQh|G|)jLM@wGAK&rm2~@{Pewv#06pHbKn#wL0P6F1!^qw9g&cW3Z=9}
zj)POhOlwsh@eF=>z?#sIs*C-Nl(yU!#DaiaxhEs#iJqQ8w%(?+6lU02MYSeDkr!B-
zPjMv+on6OLXgGnAtl(ao>|X2Y8*Hb}GRW5}-IzXnoo-d0!m4Vy$GS!XOLy>3_+UGs
z2D|YcQx@M#M|}TDOetGi{9lGo9m-=0-^+nKE^*?$^uHkxZh}I{#UTQd;X!L+W@jm(
zDg@N4+lUqI92o_rNk{3P>1gxAL=&O;x)ZT=q1mk0kLlE$WeWuY<p^lk9k<q+rcOGG
zY)G$sy1c;kpqg0vV-}_XHLMzubt5&Y+X3Q{3Wa&iSOY9S8qUS1LUsYa--u3<U|kYH
zfL}q@Sl3A;lg32U^*mSX!dr5wpp#<9G)=5WC=&Cv)mW|a!muj?rXU0JHBrQ<`Qqt}
zCgYqLnoe5^we#~H-+!p+9n;rMDHiXMi5YFyOWW{wi{Tnu%1mqAov`>_$0`0jY-Kkt
zP*|m3AF}Ubd=`<>(Xg0har*_@x2YH}bn0Wk*OZz3*e5;Zc;2<o%?~}~ck-Gk+}B1o
z7H(amz-SpgFI})ialV|e<4!f0)HG}_n?GAIDeiC%vdRTJZ<WeGYT+p)vyz_FBWtO$
zz2K9gb9XN8(>uBdnl8?&XjupbkOeNZsNh6pvsq_ydmJI+*z**{<vBKXP!vVq<!p_e
z&dE_6Lim}RGRF|DDL<W_^>I{0K)-;p1~k8cpJXL$^t!-`E}=*4G^-E8>H!LjTPxSx
zcF<xkt%17w#Nu8CTO5_pkM9Mxb}Wf0;14cP1{yv5#7uIFNY2eGhq{_*hEBSbjQA$U
znAZkjT-yd=I_R8ZJ?H07yh5z>+cS`ommfKMhNSbas^<U&=a>@YbTpH1*RFrBuATUR
zt{oFWSk^$xU&kbFQ;MCX22RAN5F6eq9UfR$ut`Jw--p2YX)A*J69m^!oYfj2y7<PW
z+v>NYcH6&r+0~_sH^c^nzeN1AU4Ga7=FlR{S|Mm~MpzY0$Z+p2W(a={b-pR9EO1Rs
zB%KY|@wLcAA@)KXi!d2_Bxr<pdgvLNQgZmCJz&c*%>khDn`DT1=Dec}V!okd{$+wK
z4E{n8R*xKyci1(CnNdhf$Dp2(Jpof0-0%-38X=Dd9PQgT+w%Lshx9+loPS~MOm%ZT
zt%2B2iL_KU_ita<m0BJ>%N>xjB!<T714UjSOi9+HnppU8HTO6Xys3~>#71_3=3c}o
zgeW~^U_ZTJQ2!PqXulQd=3b=XOQhwATK$y(9$#1jOQ4}4?~l#&nek)H(04f(Sr=s|
zWv7Lu1=%WGk4FSw^;;!8&YPM)pQ<cfOEqZOX3qg|bi6Y+6raTouS~FCMLV)D&#=5p
zIeXB!ZqEvR`rjuFN1ix6GoI%|3=23*fhels&m^Kl7$Xb)d4}z-AG?ZOEDO)5!dX$;
zxo;%86BwnKIVK{n#wbp)z+DlG`Eo;!p1qmI5u}Dr3ERkBC^gA;rI=Ohq{XCvK{N9A
zv&z##C1k^|5<I!-;+MCJN#mX7#cD{4PE)89nNv$gm~ronTcb2Mq~#~4^M!^C-1#M(
z=1UT>DCY9DhU`hMty1@sq1=Tj7bFsOOBZOFlpR`W>-J$-(kezWJj;`?x-v>ev{*8V
z8p|KXJPV$HyQr1A(9LVrM47<GkZ=yfPF+Dq%S|w=MijcJ6`g68YLr!qXVkqpG~JGH
zv`>u-XpcrIyO`yWvx1pVYc&?154aneRpLqg<bQ>x)EMvRaa#|9?Wwqs2+W8n5~79G
z(}iCiLk;?enn}ew`HzhG+tu+Ru@T+K5juvZN)wY;x6HjvqD!&!)$$;1VAh~7fg0K|
zEha#aN=Yv|3^~YFH}cc38ovVb%L|g@9W6fo(JtT6$fa?zf@Ct88e}m?i)b*Jgc{fl
zExfdvw-BYDmH6>(4QMt#p0;FUIQqkhD}aH?a7)_%JtA~soqj{ppP_82yi9kaxuK>~
ze_)Zt>1?q=ZH*kF{1iq9sr*tVuy=u>Zev}!gEZx@O6-fjyu9X00gpIl-fS_pzjpqJ
z1yqBmf9NF!jaF<+YxgH6oXBdK)sH(>VZ)1siyA$P<#KDt;8NT*l_0{xit~5j1P)FN
zI8hhYK<Ctos5a2iSB6E;K{q?&ab{FphO?zxIqqf}%h}LQupBY+8nGBng;9rl>hQ)i
z37^aP<qpGiOwA$)u%NPBnx5-dJ?eaKuJ|k=(U7<z&!E=Ex{~rH$w*MsBugwy8}mlc
z@Cp`bIFkJ)nD=}RT7wXAD(8ljk?9q}6wNph<*7S`FnAlTK859*a=phG5gBZhql8^1
zqKXJBJSm<4{Im$>13B~u65?sg+_@2Kr^iWHN=U;EDSZ@2W2!5ALhGNWXnFBY%7W?1
z=HI9JzQ-pLKZDYTv<0-lt|6c-RwhxZ)mU2Os{bsX_i^@*fKUj8*aDO5pks=qn3Dv6
zwggpKLuyRCTVPwmw1r}B#AS}?X7b837UlXwp~E2|PJw2SGVueL7){Y&z!jL!XN=0i
zU^Eig`S2`{+<wl==2w=5|F9xM-02N5`F*r(R=yBH;xL${JnA$s&UywB#{_0RS^+hX
z*x9l+ftTA)(qqfU_K+4UB=w=A|0J9m!!ePjqBV0F7$&R=n`+yR@2tk2NwHhL{|&cH
z$fXb3^<M+ME~B<-d*&_qT19T_sRLwwvu;hmu)Vt+WHH@$?DQ6m2&aLvtcbK<MgTBu
zp(SQPX~F@0CgG?+)(jWG3Oc!+oEMIunjIDK0w|m6;WzWCMpr#dq3I<GeN7c@71Fqq
zf&ADs4Jz}_R;T_`?S9~(wB(9e>gU$68aRdWx?BZ{sU_f=8sn~>s~M?GU~`fH5kCc;
z8ICp+INM3(3{#k32RZdv6b9MQYdZXNuk<tNnM*Zj3!<Rmy`SQx3idGTVVW@OzSywq
z<sw@>7ed8;G?S2nT+NZBG=Tar^KFl2SvhW$bGW#kdWL-I)s_IqVnCDDM9fm8g;P;8
z7t4yZn3^*NQfx7SwmkzP$=fwdC}bafQSEF@pd&P8@H#`swGy_rz;Z?Ty5mkS%>m#%
zp_!m9e<()sfKiY(nF<1zBz&&`ZlJf6QLvLhl`_``%RW&{+O>Xhp;lwSsyRqGf=RWd
zpftiR`={2(siiPAS|p}@q=NhVc0ELprt%=fMXO3B)4ryC2LT(o=sLM7hJC!}T1@)E
zA3^J$3&1*M6Xq>03FX`R&w*NkrZE?FwU+Muut;>qNhj@bX17ZJxnOlPSZ=Zeiz~T_
zOu#yc3t6ONHB;?|r4w+pI)~KGN;HOGC)txxiUN8#m<I|Y6ES5NY<qwYy-|}EoiBmM
zzK&og-IJMpwjbL8IAA?{APd++5KnIBFcmKwZzO~h`v>exj+W(cz%9a4sx|IRG=}ia
zuEBuba3AHsV2feqw-3MvuL`I+2|`Ud4~7ZkN=JZ;L20|Oxna5vx1qbIh#k2O4$RQF
zo`tL()zxaqibg^GbB+BS5#U{@K;WWQj~GcB1zb}zJkPwH|5hZ9iH2308!>_;%msji
zJHSL~s)YHBR=Koa1mLEOHos*`gp=<n)R&-+66xg`pqUXP#0Zm{sf^MKJnR>s8KA-C
zu0aE+W!#iJ*0xqKm3A`fUGy#O+X+5W36myS>Uh2!R*s$aCU^`K&KKLCCDkejX2p=5
z%o7-fl03x`gaSNyr?3_JLv?2RLS3F*8ub>Jd@^Cc17)v8vYEK4aqo?OS@W9mt%ITJ
z9=S2%R8M){CugT@k~~0x`}Vl!svYqX=E)c_oU6o}#Hb^%G1l3BudxA{F*tbjG<UG_
z<Vo0tRnGPd)K5|7jd0}T-wQdZZG$TREj=4*<4bR;^};Y7orbT=`ITOz*@g<Y=x;1T
zesE`pc}OxPz?01Be<R|c+WT7B<35I5;~ham;Y6hY?F{PCSdwf@9qXBZn%JBWfW6@H
zxqGR{o+TW^4T&@lZ$GVT-8s0{R3MmXsMGGDxgNTZ8|S`+<Qxcf#Ei_^nsS}Yt*PB*
z5#h+HDAU+!GmbOJ#d(J8|6Rt?+U5gR1>;W_>=xV73pKY53v%>I)@D36I_@&p$h|Aw
zonQS`07z_F#@T-%@-Tb|)7;;anoD_WH>9ewFy(ZcEOM$#Y)8>qi7rCnsH9GO-_7zF
zu*C87{Df1P4TEOsnzZ@H%&lvV(3V@;Q!%+OYRp`g05PjY^gL$^$-t<lOaPfbWQ29U
ziQLZAEqJX}(BXW*YUz0#v0~iKqbJ35-`aw1m+YA~k)TQVyq!wOKDE%}gHJ%Woa=@J
z_G3Z9VfhbWs>0Y>H*CDDs?FZly*oZ&dxvsxaUWF!{em4{A>n@vpXg$dwvt@_rgmHF
z-MER`ABa8R-t_H*kv>}CzOpz;!>p^^9ztHMsHL|SRnS<-y5Z*r(_}c4=fXF`l<Xkw
zEy6TsVDdDoV_TNG(<WvkK8TcvFZ^8LA2wsz%Sl;DbLS}my*l(?>^-i}>e7v!qs_jv
zqvWhX^F=2sDNWA9c@P0?lUlr6ecrTKM%pNQ^?*Lq?p-0~?_j50xV%^(+H>sMul#Tw
zeciF*1=?a7cI(}352%>LO96pD+?9!fNyl^9v3^v&Y4L)mNGK0FN43&Xf8jUlxW1Bw
zyiu2;qW-aGNhs=zbuoxnxiwZ3{PFZM#Kw)9H@(hgX23h(`Wm~m4&TvoZoYp{plb^>
z_#?vXcxd>r7K+1HKJvhed>gtK`TAbJUazUWQY6T~t2af%#<+Veyr%7-#<!`u7=6z<
z@9(d@bkwuJK`;AXMIasQ=4>*A#@&*;@g58{i|E%6yC_InGXCOd{L0;$)z#?n7M`re
zh!kO{6=>7I?*}czyF7_frt#)s1CFJ_XE&VrDA?Dp3XbvF{qsEJgb&OLSNz_5g?HpK
z9)8rsr4JN!Af3G9!#Qn(6zaUDqLN(g2g8*M)Djap?WMK9NKlkC)E2|-g|#-rp%!Gz
zAHd%`iq|81efi93m3yTBw3g0j#;Yb2X{mhRAI?&KDmbGqou(2xiRNb^sV}%%Wu0?<
z?($L>(#BO*)^)<q1<y(%o-D>rSgyNRni$i`R4v;GhlCZ8$@e^ROX(p=2_v6Y!%^As
zu022)fHdv_-~Yu_H6WVPLpHQx!W%^6j)cBhS`O3QBW#x(eX54d&I22op(N59b*&$v
zFiSRY6r<ogkg`lB4Kzk}79-1@4WexUMjw+78PVxKkpJ}l$#uYSKRni~wAs(LFQ`S~
z)6H^=SsBl_=gtAZcCb%LqI~={V0L=Y2XD)_)<%!{eY=Ch6CuBtAu<8uk=VNIh~<**
z4}OE-7;4DiEfmH5r(r_l!C{NrR4GiA;)KB7^Z5GJe5Gi4jN{kozWVvza9J%ysu#}1
zV05Y-@;(u_e>Oc^(dgSV1<S;M;fP~XvG?H2UWU4_zajZjOwrCC*tclCC;-DiooNK0
zKT=dwZdL_ATeNsapPF;r>>a7-5C;(5S5MvKcM2Jm-LD9TGqDpP097%52V+0>Xqq!!
zq4e3vj53SE6i8J`XcQB|MZPP8j;PAOnpGnllH6#Ku~vS42xP*Nz@~y%db7Xi8s09P
z1)e%8ys6&M8D=Dt6&t`iKG_4X=!kgRQoh%Z`dc&mlOUqXk-k`jKv9@(a^2-Upw>?<
zt5*^DV~6Zedbec4NVl(<J^HBK@2E4@wD3MtdV(xxnq%Vbc}WuHo}5QftrZiThWddC
z`SqHbMkXtVXKo79381*cF)4R%emsAj0XSNL+PDEL#FejmgEu5hh1&sJBWank<?5||
zFa70^J_fIcXV;YE%H5TE-LegY#;Q{2+X5ec9PUkBc$GBoPD$f3kMxcc{GCuyqMnT@
zoFpa`18N8o#cD|?wHX2`2uRGD5<2)(v9&spIF?JOuSy<x-3h*1uo<tmXh9c!`+f(x
zJ!3;pKi`8)fAw>$2T{&b)zA@b#dUyd>`2JC0=xa_fIm8<d`*pjaOx{pV5Pe7q{O1W
zAT5c_%T(D{?pE<afMwAc3^p*w8J|rwP_Jm84rW1XYj_z!4LWkm8d^QnYM3TAJ00M}
zVy+@IpL$^~2r+Bvv<<LIU<+tp*%PbBjNTe&Z{H48g3%crRya-=a)qRylf)#ctNvw?
zlFff;8hFwWk7>{5u<t#%K`^*tgQ@!!adiD7YiKBgV_c5|kR2YSiy*0+0NhSocT-E<
zLsbh?CBfxQ!F7|i4p=%MX(y=Km3GT?bty_*CZh1*xF8v|8xjqfoni!cQE1EFLyy>m
zr-!ApXZhC8@=vC2WyxO|!@0Km)h8ep*`^he92$@YwP>VcdoS5OC^s38e#7RPsg4j+
zbVGG}WRSET&ZfrcR(x~k8n1rTP%CnfUNKUonD$P?FtNFF#cn!wEIab-;jU=B1dHK@
z(;(yAQJ`O$sMn>h;pf^8{JISW%d+@v6@CnXh9n5TXGC}?FI9i-D0OMaIg&mAg=0Kn
zNJ7oz5*ReJukD55fU<geid?Ih<u$lD<ZaBChRKMH%^g0f>sMuaP+H4tDN&V9zfqF@
zr=#ecUk9wu{0;!+gl;3Bw=Vn^)z$ahVhhw)io!na&9}LmWurLb0zubxK=UEnU*{5P
z+SP}&*(iBKSO4{alBHaY^)5Q=mZ+2OwIooJ7*Q5XJ+2|q`9#f?6myq!&oz?klihLq
z4C)$XP!BNS0G_Z1&TM>?Jk{S~{F3n83ioli=IO6f%wkvCl(RFFw~j0tb{Gv<R+ryT
zRX%I@y>XTx>*sB0McY0s&SNvj4+^h`9nJ_wM>F!Uc>X}9PifQekn0sKI2SAJP!a4h
z5cyGTuCj3ZBM^&{dRelIlT^9zcfaAuL5Y~bl!ppSf`wZbK$z#6U~rdclk``e+!qhe
z6Qspo*%<)eu6?C;Bp<^VuW6JI|Ncvyn+LlSl;Mp22Bl7ARQ0Xc24%29(<s4_N=u??
z$t{gru-VXd6Kxbxv5Bt?Z$yc1w+D)_3NsUN5!&<3)ieB&Nu@~@oJ<S*(@c3&Pewi0
z%9!QRq|vABdor|x%wu0~=1Z5hOOpQi`0!iyJhFLc128BQmq$$|x}WJU?iN}7Kl;!3
z&fH)Q*P121qPNW?v_ooy8Sm4CFA(z37NUr<`Xy>ZrdsIPw&-=yHQ7_Vle|5h>AST0
zUGX2Zk34vp?U~IHT|;$U86T+UUHl_NE4m|}>E~6q``7hccCaT^#y+?wD##Q%HwPd8
zV3x4L4|qqu`B<QDcIkg$mi~*?H~RNY_Wx4D`}Y)A)!YT!6xCZ~ZTPl7{FKBwUP;<e
zGwA>$4(LXqDJngNy-{&@aFBvVsywt@X^}iH7P%>bR?ciC$I^U-4Foa<B~H&?t+Qu`
zYwLM1`)@`@%8{2TvE*VigXq!t`qQLW_S5DOEz|2z2j3@V_m6j4f9Dj+Tf}F*MS{?9
z%5n(*$y|Ke#xMg=rzn#aJmhAj==f%8S}Mvc)f)j&X?h~FezLP0LfVpNB_5sLG5z-3
z-35pyFHV0nPmuu&M8{s3y}I4c7J41@C$_+Wqk&1f`MvOF;v`)pAYGN4MEXuFe)LY2
z=&C(w{X1B@m%`@ul3h(Gtch>`YKI^qDyGK7k%E%c_P=yzAi`YnxGA%DeNd++j3*h^
z=rn>oBd0|~lZ<6YvmkKY*ZJlJ;Im0tqgWu&E92eqt;+NYdxx`eS(4Hw_Jb5|yVvBg
z*tbdY^!AN;luEyN4VRhS@-_DC{({ziH{&Z}iGElSV~qvT>L<HMHzj7J-Kpa!p!c29
zr8L!oM5$VXc*@gwRrKuo*h${B7BEkfln@c=6oRf(*M;GdM=)m;H&XnLvs%-2b*+hr
z7OLt;wv$AG{1d5A_3iuPkRtq^Z3(lga~pLocX)JXz8xtPuFH$tb1E#7^Np@GfB#l+
z&L?_OtoPwP5<R)d%5@nIE}~}{W{Q)eZmHbhg7xXREu-ZQD=)K37HlG2J>-8G%+yEL
zX#MFOhj{InyKG=mvW-<1B@c-}x$vA(nU?>S>0*eN#!SLzQ)Ex7fvQ)S4D<8|I#N$3
zT5Ei`Z?cxBODHX8(Xp73v`IsAYC@9b;t}z0wxVuQSY1J^GRwDPN@qbM-ZF48T$GZ<
z8WU+;Pqo?{ghI-KZ-i*ydXu`Ep0Xw^McH_KE9J0S7G;x8Fe`DVG?j3Pv=0YzJ}yZR
z%2=oqHiUjvuk0~Ca>Kol4CFi0_xQT~;_F?=u+!kIDl-9g`#ZNZ9HCy17Ga1v^Jv9#
z{T4Kb1-AzUxq*MutfOWWZgD*HnFfyYg0&e9f(5tZ>krPF6{VikNeHoc{linPPt#Si
z&*g>(c54V8rT_AX!J&bNm-!umPvOR}vDai#`CX___J#=zeB*{4<&2WpaDncZsOkp*
zsg<%@@rbrMkR_ux9?LsQxzoBa1s%$BBn6<mos5IdgH=dhp$~J!{0^n6tTjg;U5Diz
zxn*0gDh2*1rZJV&&gt&fW9znj@{DqJLV4eBUx+J-d4%cpr)wD1@Vx?X)?B*tGi&A=
zu5vL>vk#{&&zUwcfzeCBJUwFYSF$08qDsB;gWQN*g!p8pxjofWbqNSZOEKOaTx@+*
zwdt5*Q47@EOZ~EZL9s?1o?A%9TJT=Ob_13yyugvPg*e&ZU(r6^k4=2+D-@n=Hv5vu
zSXG|hM(>h9^zn=eQ=$6`JO&70&2|%V5Lsx>)(%#;pcOfu>*nk_3HB_BNaH$`jM<^S
zcSftDU<V-inKL>1?nL;jy)+sfonQN}(}gUW?d_ikr*3=^{G)=tjBtEPe>TO|0ddVB
zTklrSHiW+!#26frPXQQ(YN8DG$PZo?(<o9TOVxrss2Tawf48g5GCBiDSDE3#UTp|k
z0pKVwU|jx{+EQY()yCur&pg1mIZ@qM8XqZ|+FwhjuUu?KF@3ju{(}}|suk~@HhJ@t
zQ#Y9~FZ`d#%R!jufJh!!RQMg1{%cNgteq&qcX<80AJGOV1RA*AhOQap$U?Y%h(E_|
z8KTI5C~yZDX7BVzv96*H*$a>po(QUCCf_OJC`pw*uey00%gmH!`WJkrKXj2!<hVw7
zGRWHF)G~HffGg-$(O!{Pk3Jwp$~FflQ(t@$&I&M44Aeq=eh)c85(&68I*-`m6uoK%
zdl?LKwDh&L2fOllfoB@M_mp%O$0z3aZY$k-iHXr}^=#M>#6?`T25mTu9OJp2L8z3!
z=arrL$ZqxuE{%yV)14Kd>k}j7pxZ6#$Dz8$@WV5p8kTqN<-7W)Q7Gt2{KoOPK_tZ|
zf2WG~O5@{qPI+W<4f_;reuFVdO^5`ADC1!JQE|N`s3cq@(0WB!n0uh@*c{=LAd;~}
zyGK@hbF-Oo+!nN)@i*O(`@FA#u?o=~e{`4O#5}z&=UkU*50fOrzi11D^&FOqe>wii
z?*k+2|EcUs;Gx{!@KBT~>PAwLrIDT7Th=Utu?~?np@t^gFs?zgX=D${RwOY^WGh-+
z+#4$066I<rp}NtcF5!lhZvD^1^l^-7{>Sh8eYW#FXWp~S`<*%O^ZuItL1Tyqt8#tZ
zY120E;^VG`!lZn&3sPd$RkdHpU#|w+bYV)pJC|SH9g%|5IkxVTQcBA4CL0}$&}ef@
zW^Vtj%M;;_1xxP9x#ex17&4N*{ksO*_4O}xYu(p*JkL#yr}@7b)t5X?%CY<+s5_MJ
zuiqt+N_;A(_)%lumoyRFixWa-M7qK_9s6<1X?JDa9fP!+_6u~~M$5L=ipB=7(j#f<
zZ<k8E`Rwou;l+s_T|L^56YmSu4%^`QFTOq==C$v}z48r)V)i|8HET~j$njp}R%Tx6
z6z1$DcJ|+-rK;8O+M4D2OI&h9Z=KBOt2kjcN}lMJQdr(sa_4xJa;t^b;+K^1J9dTU
z7E3oumY>34J%=bs549%~_mA(|={uZNs_0?o7;-LBP(ZRnkd{-^|2|=4vUTmtByHL8
zEph`(LSEzQj68a+`d$V<45J7cyv^#|^|%fD#si1Nx!4NW*`l*{->HEWNh6-|g>-=r
zXmQ|-i}Ku$ndUeHQ^&ieT!Lf}vf6GaqW9$DJ2NWrqwPY%%4nip$@vK$nRp*_C-v<|
zuKz~ZyN&<%!NS26&x?jhy+@awJipMQ-8(X4#Ae5??U<1QMt1l9R=w9fAnEF}NYu$2
z>6<Remz*?TALWUbsLm^V8JpOI8Qrv^I{Uf`UiQ}=bwlT?m7XzzYl?d75*MLY>}Vkc
zIb*A?G*z8^IvibmBKn_u^5&T_1oey0gZS2~obf(#xk=erZGTEdQnt3DMGM+0oPwss
zj5zXD;(oWhB_T@~Ig#9@v)AKtXu3>Inmgf@A|-lD-1U>cNyl3h?ADD9)GG4}zUGPk
zZzaXe!~Kf?<~@$G?Uql3t8jy9{2!doq4=J}j9ktTxss{p6!9Ud<L0{JvuM8zT}59}
zy?q<PQ`(o7I;|2vYuA@mlwTC2VWpG0`?f+&<cL+^MokC9tZR;`s@;M`H_iw&h5ar5
zCVhp4@v?m@Lrzz?2Z>jyDERlA*xZ!=Q)KDs5O)phz>Vq3BNGoM(H|=1*Q4$^2fTZw
z(%nq1P|5Rt81}SYJpEEzMPl5VJsV5&4e)ZWKDyo<HF@#^TWD2$%#hJqrPwhn_I`~h
zrl9TX+98FE%rR>Z>1EwpkHx-AQVQc8%JMz;{H~p{=FXV>jIxvm4X*qv52e?Y-f%DJ
zxEA165GikEASQ^fH6K#d!Tpu2HP{sFs%E=e$gYd$aj$+xue6N+Wc(rAz~wUsk2`(b
z8Kvmyz%bKQxpP}~baG-rwYcYCv<u@FIev&XxIE!%Ztbxbt-Y%ng;He-ByR+Lk{8!A
z+z_?%ukrIgzuh|^o09LLIM$E(jD2g{U~B*3O~s^?p1O(7xg6;i3B@l87#4eO$$oiS
znHnuA9vGDnw~@NpSw1YYZjoo+HWTzC%$-J_E}3wK-(9hvP0Ct}242U82Wv+8>kHOi
zlkR<=>ZBTU*8RF_d#Bl@zZsRIhx<%~Z@Z<pYsoy05ZE@HkOT*OyM_H}bO0QHT>=ik
z>adw3!DK(8R|q$vy{FTxw%#xliD~6qXmY^7_9kthVPTF~Xy1CfBqbU~?1QmxmU=+k
z(ggxvEuA;0e&+ci-zQR{-f7aO{O(Pz_OsEjLh_K>MbvoZ4nxtk5u{g@nPv)cgW_R}
z9}EA4K4@z0?7ue}Z(o~R(X&FjejUI2g~08PH1E4w>9o{)S(?1>Z0XMvTb|;&EuyOE
z<Uf?|k(eh=|5A0k%+DyPOynyZIUEys#os=5y;pE3-eqjU+vfn|kx^`PvS0aFgK5k7
z_Ws8b<tF{v{r8Xc%Ra(S^vGriKjcqYF_IFKBCZ&>GvWNpYX)Nv<8|a^;<iw`qCF!m
zJCer*R!Lka?@SE&ch5f_idG-0J0#H0t`-WqI6fU-<8)0T>1>bh#&znEcl-r!T#pn=
z4$?Yudha6F%4b>*8@=BdtXXY4N+`U4Dmx$}>HeVJk-QdTG@t!tVT#0(LeV0gvqyyw
z2sEp^9eY0N`u10Tm4n8No&A=)IeEC|gnmEXoNSzu!1<4R<%-9kY_8~5Ej?zReg<ow
z_S`;&)5*<v9D*b@Pp375jJ+bUt(~O-SF0ZSb>Mn78wuMs#;i&eUA0Zk_RXQ3b&TT}
z;SCI<r<Tr9OC%fOR>=7-FUB@*&;8|n>(_g^HGf3@QODE3LpmX~ELnymQm{Sx9xrKS
zK29p~?v@R$0=v6Dr5aW>-!{+h@?Q58|Kz8{{W`%J+lDAdb&M5VHrX_m<yIVSusgG)
z`wE|gu8*tGFN+9LZP$c*%%oTIyT7Y#Uu$+WtVYvbCQ+Do^ID=OntZUS+AP2A;pDLp
zQ|q{WRobah5mlF}4!P`?SD*ypcWDw;D;8QgDGN?GxP&*+636N@_L%TF<fCxBbhCW%
zM%&xYBGf(}%YX|$hc1=>DY;1-JLnf)ezmPau$)1;=`-FU=-r-83tX=C`S#}GZufju
zQ>sXNT0Ny=k@nc%cFnvA_i4SC)?_ORXHq8B4D%el1uPX`c~uG#S1M7C+*MMqLw78E
zhY2dI8@+N^qrM<pEkuRyo-$3h*azHIX|CuJeg6U1uTYr#f1L)jm7pty!Nt_EpVqGj
zXVf`s*T3XSv0ipIVh~5QdLCYp1wi6^@nTR33ob<X9tfmOm*uFzZ-27JIbc;RY)sW`
zt!7sXsMOap1pmxDHwCgCeJ^w>I1+;TUd<Y7V_}N3cTlx9g%Oxl!1(@Fz@fhB!X!Qg
zzGte&a;gDDb7s`2pRO@2w9ly`vje6Ilj<$_c7{OhbUJ}K%$c%`!Wu9kSA#`Xja47q
z=~M~uTQ&ejWO4`vDE9|h6MO?f*d<f1pCzFOx6YSGxl|H(I2h0=%!FyILZ_|=bX<Z@
zIXy76kKLDpG=TR2T<cl5;7+F|gWhKYZ346vNMyq9`PzDJfsBPN=&eq$MKd*arcmkB
zATW410h$o^5JE{IECTR`<Q+_;Q1BqjI5Ciw%m8kGTdmE2GB82|h<4gEtNm!8e#?mt
z;D<+GLqmLk>a(vGqGSRyU{Fnm`aqrr7bz4<dKN=G5TxM1z~0~yR6MN!z}y8FQ<9oh
z=+qK`&H;`)dq`^^SX>2c5xsOO-~oZpkzorD1g}Y<6rk&3>PsSGy}W?MtqFky@A(X#
zIuNZK0cK?^=;PUAu>j0#HtjbHCV*6?jzA&OoE$*Jlga*}LF`SF?WLhv1O|zqC<>*>
zYB;#lsYKx0&kH@BFpW8n*yDcc6?;_zaJs<-jPSkCsSX-!aV=P5kUgF@Nu<{a%#K*F
z134Q{9|YX7X(v$62_cY3^G%t~rD>Q0z@)1|zs)vjJ6Jq9;7#Ki`w+eS**En?7;n&7
zu==V3T&eFboN3ZiMx3D8qYc;VjFUk_H-WWCau(VFXSQf~viH0L$<p$oTrv;PAp(HR
zxGYwoQ{`RRg4u%PEm#VL7~lyq<6`h+F9OK~tPR1wo}q|fJ3M>gwD$<h61N|-us1_A
zl+PEzjP+uR3A<ynvirg<hO=WL=vh8&(Wl2haO~h*nJ_e*eGq{@?#~H&b}m8~6;8Z^
zpbmt5cVjrE;ei2}0ANsjD@<R3Rp?ZF=p4ZLJsD20%)C2Oh4~(z31DYTGiF#ITp7;7
z!>UfFHqNcgN`x}M+YQ6RnN<+@t>JUp#)9YOkqst-Ga?{FsDpEeX0(5v{0J~SEbWiL
zXC2}M4?UH@u&|;%0y`eb33ldo4~z-x8zY!oVmV=c+f$m?RfDC35mdQ2E>Pze7KWP-
z>!Bh<&57I+O_^s}9Tg^k)h7{xx@0a0IA~GAOt2yy!X%Q$1rt~LbTB6@Du!_0%HV>N
zlf)QI1&gvERKwso23mJ!Ou6ZS#zCS5W`gxE5T>C#E|{i<1D35C222I33?Njaz`On7
zi<+VWFP6D{e-{yiN#M|Jgk<44u1TiMI78S5W`Sdb5f+{zu34s{CfWN7a3Cf^@L%!&
zN$?|!!9j2c)j$~+R6n#891w-z8(!oBpL2K=+%a$r2|~8-(vQj5_XT`<0Ksf;oP+tz
z9CObS!0m)Tgg`K#xBM8B(|Z)Wb&DYL{WTYv`;A=q6~Nnx2+!lTIXtj8J7dZE!P_{z
z#f8w6F}^!?^KE#+ZDv+xd5O&3EmomZzsv?>E-~ygGum45fk!SBN&|eo1rKw^?aZJ4
E2O(~oYXATM

literal 0
HcmV?d00001

diff --git a/examples/llama.android/gradle/wrapper/gradle-wrapper.properties b/examples/llama.android/gradle/wrapper/gradle-wrapper.properties
new file mode 100644
index 0000000000000..a3958c140b4e1
--- /dev/null
+++ b/examples/llama.android/gradle/wrapper/gradle-wrapper.properties
@@ -0,0 +1,6 @@
+#Thu Dec 21 14:31:09 AEDT 2023
+distributionBase=GRADLE_USER_HOME
+distributionPath=wrapper/dists
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip
+zipStoreBase=GRADLE_USER_HOME
+zipStorePath=wrapper/dists
diff --git a/examples/llama.android/gradlew b/examples/llama.android/gradlew
new file mode 100755
index 0000000000000..4f906e0c811fc
--- /dev/null
+++ b/examples/llama.android/gradlew
@@ -0,0 +1,185 @@
+#!/usr/bin/env sh
+
+#
+# Copyright 2015 the original author or authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+##############################################################################
+##
+##  Gradle start up script for UN*X
+##
+##############################################################################
+
+# Attempt to set APP_HOME
+# Resolve links: $0 may be a link
+PRG="$0"
+# Need this for relative symlinks.
+while [ -h "$PRG" ] ; do
+    ls=`ls -ld "$PRG"`
+    link=`expr "$ls" : '.*-> \(.*\)$'`
+    if expr "$link" : '/.*' > /dev/null; then
+        PRG="$link"
+    else
+        PRG=`dirname "$PRG"`"/$link"
+    fi
+done
+SAVED="`pwd`"
+cd "`dirname \"$PRG\"`/" >/dev/null
+APP_HOME="`pwd -P`"
+cd "$SAVED" >/dev/null
+
+APP_NAME="Gradle"
+APP_BASE_NAME=`basename "$0"`
+
+# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'
+
+# Use the maximum available, or set MAX_FD != -1 to use that value.
+MAX_FD="maximum"
+
+warn () {
+    echo "$*"
+}
+
+die () {
+    echo
+    echo "$*"
+    echo
+    exit 1
+}
+
+# OS specific support (must be 'true' or 'false').
+cygwin=false
+msys=false
+darwin=false
+nonstop=false
+case "`uname`" in
+  CYGWIN* )
+    cygwin=true
+    ;;
+  Darwin* )
+    darwin=true
+    ;;
+  MINGW* )
+    msys=true
+    ;;
+  NONSTOP* )
+    nonstop=true
+    ;;
+esac
+
+CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar
+
+
+# Determine the Java command to use to start the JVM.
+if [ -n "$JAVA_HOME" ] ; then
+    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
+        # IBM's JDK on AIX uses strange locations for the executables
+        JAVACMD="$JAVA_HOME/jre/sh/java"
+    else
+        JAVACMD="$JAVA_HOME/bin/java"
+    fi
+    if [ ! -x "$JAVACMD" ] ; then
+        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+    fi
+else
+    JAVACMD="java"
+    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
+
+Please set the JAVA_HOME variable in your environment to match the
+location of your Java installation."
+fi
+
+# Increase the maximum file descriptors if we can.
+if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
+    MAX_FD_LIMIT=`ulimit -H -n`
+    if [ $? -eq 0 ] ; then
+        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
+            MAX_FD="$MAX_FD_LIMIT"
+        fi
+        ulimit -n $MAX_FD
+        if [ $? -ne 0 ] ; then
+            warn "Could not set maximum file descriptor limit: $MAX_FD"
+        fi
+    else
+        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
+    fi
+fi
+
+# For Darwin, add options to specify how the application appears in the dock
+if $darwin; then
+    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
+fi
+
+# For Cygwin or MSYS, switch paths to Windows format before running java
+if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
+    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
+    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`
+
+    JAVACMD=`cygpath --unix "$JAVACMD"`
+
+    # We build the pattern for arguments to be converted via cygpath
+    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
+    SEP=""
+    for dir in $ROOTDIRSRAW ; do
+        ROOTDIRS="$ROOTDIRS$SEP$dir"
+        SEP="|"
+    done
+    OURCYGPATTERN="(^($ROOTDIRS))"
+    # Add a user-defined pattern to the cygpath arguments
+    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
+        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
+    fi
+    # Now convert the arguments - kludge to limit ourselves to /bin/sh
+    i=0
+    for arg in "$@" ; do
+        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
+        CHECK2=`echo "$arg"|egrep -c "^-"`                                 ### Determine if an option
+
+        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then                    ### Added a condition
+            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
+        else
+            eval `echo args$i`="\"$arg\""
+        fi
+        i=`expr $i + 1`
+    done
+    case $i in
+        0) set -- ;;
+        1) set -- "$args0" ;;
+        2) set -- "$args0" "$args1" ;;
+        3) set -- "$args0" "$args1" "$args2" ;;
+        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
+        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
+        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
+        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
+        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
+        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
+    esac
+fi
+
+# Escape application args
+save () {
+    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
+    echo " "
+}
+APP_ARGS=`save "$@"`
+
+# Collect all arguments for the java command, following the shell quoting and substitution rules
+eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"
+
+exec "$JAVACMD" "$@"
diff --git a/examples/llama.android/settings.gradle.kts b/examples/llama.android/settings.gradle.kts
new file mode 100644
index 0000000000000..2ba32c4fafc5c
--- /dev/null
+++ b/examples/llama.android/settings.gradle.kts
@@ -0,0 +1,17 @@
+pluginManagement {
+    repositories {
+        google()
+        mavenCentral()
+        gradlePluginPortal()
+    }
+}
+dependencyResolutionManagement {
+    repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
+    repositories {
+        google()
+        mavenCentral()
+    }
+}
+
+rootProject.name = "LlamaAndroid"
+include(":app")

From 158f8c9e21302114bac3c646f80ea85b52ffa0bd Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Tue, 16 Jan 2024 17:05:19 +0000
Subject: [PATCH 08/25] metal : localized logic in `ggml_metal_graph_compute`
 (#4924)

* Metal: Localized logic in `ggml_metal_graph_compute`, minor performance improvement

* Whitespace

* Collecting command buffer completions on single thread

* Whitespace

* Reduce diff noise
---
 ggml-metal.h |  1 -
 ggml-metal.m | 37 +++++++++++++++++--------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/ggml-metal.h b/ggml-metal.h
index 8b0bfc5f10329..df83a1807c6b2 100644
--- a/ggml-metal.h
+++ b/ggml-metal.h
@@ -27,7 +27,6 @@
 
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 64
-#define GGML_METAL_MAX_COMMAND_BUFFERS 32
 
 struct ggml_tensor;
 struct ggml_cgraph;
diff --git a/ggml-metal.m b/ggml-metal.m
index c21dc465ae50c..a549e6713e9bc 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -170,9 +170,6 @@
     id<MTLCommandQueue> queue;
     id<MTLLibrary>      library;
 
-    id<MTLCommandBuffer>         command_buffers [GGML_METAL_MAX_COMMAND_BUFFERS];
-    id<MTLComputeCommandEncoder> command_encoders[GGML_METAL_MAX_COMMAND_BUFFERS];
-
     dispatch_queue_t d_queue;
 
     int n_buffers;
@@ -719,25 +716,25 @@ static bool ggml_metal_graph_compute(
     @autoreleasepool {
 
     MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
-
-    const int n_nodes  = gf->n_nodes;
     edesc.dispatchType = MTLDispatchTypeSerial;
 
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
+    const int n_nodes  = gf->n_nodes;
     const int n_cb = ctx->n_cb;
+    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
 
-    for (int i = 0; i < n_cb; ++i) {
-        ctx->command_buffers[i] = [ctx->queue commandBuffer];
+    id<MTLCommandBuffer> command_buffer_builder[n_cb];
+    for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
+        id<MTLCommandBuffer> command_buffer  = [ctx->queue commandBufferWithUnretainedReferences];
+        command_buffer_builder[cb_idx] = command_buffer;
 
         // enqueue the command buffers in order to specify their execution order
-        [ctx->command_buffers[i] enqueue];
-
-        ctx->command_encoders[i] = [ctx->command_buffers[i] computeCommandEncoderWithDescriptor: edesc];
+        [command_buffer enqueue];
     }
+    const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
 
-    const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
     dispatch_apply(n_cb, ctx->d_queue, ^(size_t iter) {
         const int cb_idx = iter;
 
@@ -745,15 +742,13 @@ static bool ggml_metal_graph_compute(
         size_t offs_src1 = 0;
         size_t offs_dst  = 0;
 
-        id<MTLCommandBuffer> command_buffer  = ctx->command_buffers[cb_idx];
-        id<MTLComputeCommandEncoder> encoder = ctx->command_encoders[cb_idx];
+        id<MTLCommandBuffer> command_buffer  = command_buffers[cb_idx];
+        id<MTLComputeCommandEncoder> encoder = [command_buffer computeCommandEncoderWithDescriptor: edesc];
 
         const int node_start =                                      (cb_idx + 0) * n_nodes_per_cb;
         const int node_end   = MIN((cb_idx == n_cb - 1) ? n_nodes : (cb_idx + 1) * n_nodes_per_cb, n_nodes);
 
-        for (int ind = node_start; ind < node_end; ++ind) {
-            const int i = ind;
-
+        for (int i = node_start; i < node_end; ++i) {
             if (i == -1) {
                 [encoder memoryBarrierWithScope:MTLBarrierScopeBuffers];
                 continue;
@@ -2249,12 +2244,14 @@ static bool ggml_metal_graph_compute(
         [command_buffer commit];
     });
 
-    // check status of command buffers
+    // Wait for completion and check status of each command buffer
     // needed to detect if the device ran out-of-memory for example (#1881)
-    for (int i = 0; i < n_cb; i++) {
-        [ctx->command_buffers[i] waitUntilCompleted];
 
-        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
+    for (int i = 0; i < n_cb; ++i) {
+        id<MTLCommandBuffer> command_buffer = command_buffers[i];
+        [command_buffer waitUntilCompleted];
+
+        MTLCommandBufferStatus status = [command_buffer status];
         if (status != MTLCommandBufferStatusCompleted) {
             GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
             return false;

From c37b3474e61d609d43cccc3bde5d559e80e4f5d1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 16 Jan 2024 19:13:54 +0200
Subject: [PATCH 09/25] flake.lock: update flake-parts,
 flake-parts/nixpkgs-lib, and nixpkgs (#4920)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Flake lock file updates:

• Updated input 'flake-parts':
    'github:hercules-ci/flake-parts/34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5' (2023-12-01)
  → 'github:hercules-ci/flake-parts/07f6395285469419cf9d078f59b5b49993198c00' (2024-01-11)
• Updated input 'flake-parts/nixpkgs-lib':
    'github:NixOS/nixpkgs/e92039b55bcd58469325ded85d4f58dd5a4eaf58?dir=lib' (2023-11-29)
  → 'github:NixOS/nixpkgs/b0d36bd0a420ecee3bc916c91886caca87c894e9?dir=lib' (2023-12-30)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/cfc3698c31b1fb9cdcf10f36c9643460264d0ca8' (2023-12-27)
  → 'github:NixOS/nixpkgs/317484b1ead87b9c1b8ac5261a8d2dd748a0492d' (2024-01-08)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 flake.lock | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/flake.lock b/flake.lock
index 15a0a1a8e10fa..cd532ef4fc035 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
         "nixpkgs-lib": "nixpkgs-lib"
       },
       "locked": {
-        "lastModified": 1701473968,
-        "narHash": "sha256-YcVE5emp1qQ8ieHUnxt1wCZCC3ZfAS+SRRWZ2TMda7E=",
+        "lastModified": 1704982712,
+        "narHash": "sha256-2Ptt+9h8dczgle2Oo6z5ni5rt/uLMG47UFTR1ry/wgg=",
         "owner": "hercules-ci",
         "repo": "flake-parts",
-        "rev": "34fed993f1674c8d06d58b37ce1e0fe5eebcb9f5",
+        "rev": "07f6395285469419cf9d078f59b5b49993198c00",
         "type": "github"
       },
       "original": {
@@ -20,11 +20,11 @@
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1703637592,
-        "narHash": "sha256-8MXjxU0RfFfzl57Zy3OfXCITS0qWDNLzlBAdwxGZwfY=",
+        "lastModified": 1705133751,
+        "narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "cfc3698c31b1fb9cdcf10f36c9643460264d0ca8",
+        "rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
         "type": "github"
       },
       "original": {
@@ -37,11 +37,11 @@
     "nixpkgs-lib": {
       "locked": {
         "dir": "lib",
-        "lastModified": 1701253981,
-        "narHash": "sha256-ztaDIyZ7HrTAfEEUt9AtTDNoCYxUdSd6NrRHaYOIxtk=",
+        "lastModified": 1703961334,
+        "narHash": "sha256-M1mV/Cq+pgjk0rt6VxoyyD+O8cOUiai8t9Q6Yyq4noY=",
         "owner": "NixOS",
         "repo": "nixpkgs",
-        "rev": "e92039b55bcd58469325ded85d4f58dd5a4eaf58",
+        "rev": "b0d36bd0a420ecee3bc916c91886caca87c894e9",
         "type": "github"
       },
       "original": {

From 959ef0c0df725c013c7f712eaa7790b8e38a8e20 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 16 Jan 2024 19:34:54 +0200
Subject: [PATCH 10/25] perplexity : fix kv cache handling for hellaswag
 (#4981)

ggml-ci
---
 examples/perplexity/perplexity.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index 9a77beca6df32..b4fedf8039c78 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -428,6 +428,7 @@ static std::vector<float> hellaswag_evaluate_tokens(
     for (size_t i_chunk = 0; i_chunk < n_chunk; ++i_chunk) {
         size_t n_tokens = tokens.size() - i_chunk * n_batch;
         n_tokens = std::min(n_tokens, size_t(n_batch));
+        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
         if (llama_decode(ctx, llama_batch_get_one(tokens.data() + i_chunk * n_batch, n_tokens, n_past, 0))) {
             fprintf(stderr, "%s : failed to eval\n", __func__);
             return {};

From 4feb4b33eeb1756e46084a4db9230b279af1a480 Mon Sep 17 00:00:00 2001
From: Maximilian Winter <maximilian.winter.91@gmail.com>
Date: Tue, 16 Jan 2024 18:41:42 +0100
Subject: [PATCH 11/25] examples : add complete parallel function calling
 example (#4974)

---
 .../pydantic-models-to-grammar-examples.py    | 123 +++++++++++++++++-
 1 file changed, 121 insertions(+), 2 deletions(-)

diff --git a/examples/pydantic-models-to-grammar-examples.py b/examples/pydantic-models-to-grammar-examples.py
index a8a4919cff243..cbf3766526ff7 100644
--- a/examples/pydantic-models-to-grammar-examples.py
+++ b/examples/pydantic-models-to-grammar-examples.py
@@ -1,5 +1,5 @@
 # Function calling example using pydantic models.
-
+import datetime
 import json
 from enum import Enum
 from typing import Union, Optional
@@ -8,7 +8,8 @@
 from pydantic import BaseModel, Field
 
 import importlib
-from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation
+from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
+
 
 # Function to get completion on the llama.cpp server with grammar.
 def create_completion(prompt, grammar):
@@ -134,3 +135,121 @@ class Book(BaseModel):
 json_data = json.loads(text)
 
 print(Book(**json_data))
+# An example for parallel function calling with a Python function, a pydantic function model and an OpenAI like function definition.
+
+def get_current_datetime(output_format: Optional[str] = None):
+    """
+    Get the current date and time in the given format.
+    Args:
+         output_format: formatting string for the date and time, defaults to '%Y-%m-%d %H:%M:%S'
+    """
+    if output_format is None:
+        output_format = '%Y-%m-%d %H:%M:%S'
+    return datetime.datetime.now().strftime(output_format)
+
+
+# Enum for the calculator tool.
+class MathOperation(Enum):
+    ADD = "add"
+    SUBTRACT = "subtract"
+    MULTIPLY = "multiply"
+    DIVIDE = "divide"
+
+
+
+# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
+class Calculator(BaseModel):
+    """
+    Perform a math operation on two numbers.
+    """
+    number_one: Union[int, float] = Field(..., description="First number.")
+    operation: MathOperation = Field(..., description="Math operation to perform.")
+    number_two: Union[int, float] = Field(..., description="Second number.")
+
+    def run(self):
+        if self.operation == MathOperation.ADD:
+            return self.number_one + self.number_two
+        elif self.operation == MathOperation.SUBTRACT:
+            return self.number_one - self.number_two
+        elif self.operation == MathOperation.MULTIPLY:
+            return self.number_one * self.number_two
+        elif self.operation == MathOperation.DIVIDE:
+            return self.number_one / self.number_two
+        else:
+            raise ValueError("Unknown operation.")
+
+
+# Example function to get the weather
+def get_current_weather(location, unit):
+    """Get the current weather in a given location"""
+    if "London" in location:
+        return json.dumps({"location": "London", "temperature": "42", "unit": unit.value})
+    elif "New York" in location:
+        return json.dumps({"location": "New York", "temperature": "24", "unit": unit.value})
+    elif "North Pole" in location:
+        return json.dumps({"location": "North Pole", "temperature": "-42", "unit": unit.value})
+    else:
+        return json.dumps({"location": location, "temperature": "unknown"})
+
+
+# Here is a function definition in OpenAI style
+current_weather_tool = {
+    "type": "function",
+    "function": {
+        "name": "get_current_weather",
+        "description": "Get the current weather in a given location",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "location": {
+                    "type": "string",
+                    "description": "The city and state, e.g. San Francisco, CA",
+                },
+                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
+            },
+            "required": ["location"],
+        },
+    },
+}
+
+# Convert OpenAI function definition into pydantic model
+current_weather_tool_model = convert_dictionary_to_pydantic_model(current_weather_tool)
+# Add the actual function to a pydantic model
+current_weather_tool_model = add_run_method_to_dynamic_model(current_weather_tool_model, get_current_weather)
+
+# Convert normal Python function to a pydantic model
+current_datetime_model = create_dynamic_model_from_function(get_current_datetime)
+
+tool_list = [SendMessageToUser, Calculator, current_datetime_model, current_weather_tool_model]
+
+
+gbnf_grammar, documentation = generate_gbnf_grammar_and_documentation(
+    pydantic_model_list=tool_list, outer_object_name="function",
+    outer_object_content="params", model_prefix="Function", fields_prefix="Parameters", list_of_outputs=True)
+
+system_message = "You are an advanced AI assistant. You are interacting with the user and with your environment by calling functions. You call functions by writing JSON objects, which represent specific function calls.\nBelow is a list of your available function calls:\n\n" + documentation
+
+
+text = """Get the date and time, get the current weather in celsius in London and solve the following calculation: 42 * 42"""
+prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant"
+
+text = create_completion(prompt=prompt, grammar=gbnf_grammar)
+
+json_data = json.loads(text)
+
+print(json_data)
+# Should output something like this:
+# [{'function': 'get_current_datetime', 'params': {'output_format': '%Y-%m-%d %H:%M:%S'}}, {'function': 'get_current_weather', 'params': {'location': 'London', 'unit': 'celsius'}}, {'function': 'Calculator', 'params': {'number_one': 42, 'operation': 'multiply', 'number_two': 42}}]
+
+
+for call in json_data:
+    if call["function"] == "Calculator":
+        print(Calculator(**call["params"]).run())
+    elif call["function"] == "get_current_datetime":
+        print(current_datetime_model(**call["params"]).run())
+    elif call["function"] == "get_current_weather":
+        print(current_weather_tool_model(**call["params"]).run())
+# Should output something like this:
+# 2024-01-14 13:36:06
+# {"location": "London", "temperature": "42", "unit": "celsius"}
+# 1764

From 334a835a1ccc8106a5fa355683a965efb1bfa24b Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Tue, 16 Jan 2024 19:51:26 +0200
Subject: [PATCH 12/25] ggml : importance matrix support for legacy quants
 (#4969)

* imatrix: adding support for legacy quants

* imatrix: guard Q4_0/Q5_0 against ffn_down craziness

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 ggml-quants.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++++++
 ggml-quants.h |   4 ++
 ggml.c        |  28 +++++---
 llama.cpp     |  10 +++
 4 files changed, 226 insertions(+), 8 deletions(-)

diff --git a/ggml-quants.c b/ggml-quants.c
index 0750fe1bb27f1..31b053e335787 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -515,6 +515,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {
     quantize_row_q4_0_reference(x, y, k);
 }
 
+
 void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) {
     const int qk = QK4_1;
 
@@ -3039,6 +3040,197 @@ size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int
     return nrow * row_size;
 }
 
+static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+    // Quantize one row of n_per_row floats from x into Q4_0 blocks at y, weighting the fit by quant_weights.
+    if (!quant_weights) {
+        quantize_row_q4_0_reference(x, y, n_per_row); // no importance weights: defer to the reference quantizer
+        return;
+    }
+
+    float weight[QK4_0];
+    int8_t L[QK4_0]; // per-block quantized levels, handed to make_qx_quants and read back below
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row; // mean square of the whole row
+
+    const int nb = n_per_row/QK4_0; // whole blocks only; a remainder of n_per_row % QK4_0 values is not processed
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_0 * ib;
+        const float * qw = quant_weights + QK4_0 * ib; // weights are indexed by position within the row
+        for (int j = 0; j < QK4_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK4_0, 8, xb, L, 1, weight); // NOTE(review): presumably returns the block scale - confirm against make_qx_quants
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4); // two levels per byte: L[j] in the low bits, L[j+16] shifted up by 4
+        }
+    }
+}
+
+size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { // quantize nrow rows to Q4_0
+    if (!quant_weights) { // no importance weights: use the existing ggml_quantize_q4_0 path (the only path that touches hist)
+        return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); // bytes per quantized row; size_t so nrow * row_size is not computed in int
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights); // the same n_per_row weights are reused for every row
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size; // total number of bytes written to dst
+}
+
+static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK4_1 == 32, "QK4_1 must be 32");
+    // Quantize one row of n_per_row floats from x into Q4_1 blocks at y, weighting the fit by quant_weights.
+    if (!quant_weights) {
+        quantize_row_q4_1_reference(x, y, n_per_row); // no importance weights: defer to the reference quantizer
+        return;
+    }
+
+    float weight[QK4_1];
+    uint8_t L[QK4_1], Laux[QK4_1]; // L is read back below; Laux is only passed to make_qkx3_quants
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row; // mean square of the whole row
+
+    const int nb = n_per_row/QK4_1; // whole blocks only; a remainder of n_per_row % QK4_1 values is not processed
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK4_1 * ib;
+        const float * qw = quant_weights + QK4_1 * ib; // weights are indexed by position within the row
+        for (int j = 0; j < QK4_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float min; // written by make_qkx3_quants through &min
+        float d = make_qkx3_quants(QK4_1, 15, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false); // NOTE(review): presumably returns the block scale - confirm
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        y[ib].m = GGML_FP32_TO_FP16(-min); // the stored value is the negation of what make_qkx3_quants reported
+        for (int j = 0; j < 16; ++j) {
+            y[ib].qs[j] = L[j] | (L[j+16] << 4); // two levels per byte: L[j] in the low bits, L[j+16] shifted up by 4
+        }
+    }
+}
+
+size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { // quantize nrow rows to Q4_1
+    if (!quant_weights) { // no importance weights: use the existing ggml_quantize_q4_1 path (the only path that touches hist)
+        return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row); // bytes per quantized row; size_t so nrow * row_size is not computed in int
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights); // the same n_per_row weights are reused for every row
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size; // total number of bytes written to dst
+}
+
+static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK5_0 == 32, "QK5_0 must be 32");
+    // Quantize one row of n_per_row floats from x into Q5_0 blocks at y, weighting the fit by quant_weights.
+    if (!quant_weights) {
+        quantize_row_q5_0_reference(x, y, n_per_row); // no importance weights: defer to the reference quantizer
+        return;
+    }
+
+    float weight[QK5_0];
+    int8_t L[QK5_0]; // per-block quantized levels, handed to make_qx_quants and read back below
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row; // mean square of the whole row
+
+    const int nb = n_per_row/QK5_0; // whole blocks only; a remainder of n_per_row % QK5_0 values is not processed
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK5_0 * ib;
+        const float * qw = quant_weights + QK5_0 * ib; // weights are indexed by position within the row
+        for (int j = 0; j < QK5_0; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float d = make_qx_quants(QK5_0, 16, xb, L, 1, weight); // NOTE(review): presumably returns the block scale - confirm against make_qx_quants
+        y[ib].d = GGML_FP32_TO_FP16(d);
+
+        uint32_t qh = 0; // collects bit 4 of each of the 32 levels, one bit per value
+
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // low 4 bits of both levels share one byte
+
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2);
+        }
+
+        memcpy(&y[ib].qh, &qh, sizeof(qh)); // byte copy of the 32 collected bits into the block's qh field
+    }
+}
+
+size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { // quantize nrow rows to Q5_0
+    if (!quant_weights) { // no importance weights: use the existing ggml_quantize_q5_0 path (the only path that touches hist)
+        return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row); // bytes per quantized row; size_t so nrow * row_size is not computed in int
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights); // the same n_per_row weights are reused for every row
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size; // total number of bytes written to dst
+}
+
+static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) {
+    static_assert(QK5_1 == 32, "QK5_1 must be 32");
+    // Quantize one row of n_per_row floats from x into Q5_1 blocks at y, weighting the fit by quant_weights.
+    if (!quant_weights) {
+        quantize_row_q5_1_reference(x, y, n_per_row); // no importance weights: defer to the reference quantizer
+        return;
+    }
+
+    float weight[QK5_1];
+    uint8_t L[QK5_1], Laux[QK5_1]; // L is read back below; Laux is only passed to make_qkx3_quants
+
+    float sum_x2 = 0;
+    for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j];
+    float sigma2 = sum_x2/n_per_row; // mean square of the whole row
+
+    const int nb = n_per_row/QK5_1; // whole blocks only; a remainder of n_per_row % QK5_1 values is not processed
+    for (int ib = 0; ib < nb; ++ib) {
+        const float * xb = x + QK5_1 * ib;
+        const float * qw = quant_weights + QK5_1 * ib; // weights are indexed by position within the row
+        for (int j = 0; j < QK5_1; ++j) weight[j] = qw[j] * sqrtf(sigma2 + xb[j]*xb[j]);
+        float min; // written by make_qkx3_quants through &min
+        float d = make_qkx3_quants(QK5_1, 31, xb, weight, L, &min, Laux, -0.9f, 0.05f, 36, false); // NOTE(review): presumably returns the block scale - confirm
+        y[ib].d = GGML_FP32_TO_FP16(d);
+        y[ib].m = GGML_FP32_TO_FP16(-min); // the stored value is the negation of what make_qkx3_quants reported
+
+        uint32_t qh = 0; // collects bit 4 of each of the 32 levels, one bit per value
+        for (int j = 0; j < 16; ++j) {
+            const uint8_t xi0 = L[j];
+            const uint8_t xi1 = L[j+16];
+            y[ib].qs[j] = (xi0 & 0x0F) | ((xi1 & 0x0F) << 4); // low 4 bits of both levels share one byte
+            // get the 5-th bit and store it in qh at the right position
+            qh |= ((xi0 & 0x10u) >> 4) << (j + 0);
+            qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); // use this format's own block size (was QK5_0/2, same value today)
+        }
+        memcpy(&y[ib].qh, &qh, sizeof(qh)); // byte copy of the 32 collected bits into the block's qh field
+    }
+}
+
+size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) { // quantize nrow rows to Q5_1
+    if (!quant_weights) { // no importance weights: use the existing ggml_quantize_q5_1 path (the only path that touches hist)
+        return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
+    }
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row); // bytes per quantized row; size_t so nrow * row_size is not computed in int
+    char * qrow = (char *)dst;
+    for (int row = 0; row < nrow; ++row) {
+        quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights); // the same n_per_row weights are reused for every row
+        src += n_per_row;
+        qrow += row_size;
+    }
+    return nrow * row_size; // total number of bytes written to dst
+}
+
 // ====================== "True" 2-bit (de)-quantization
 
 static const  uint64_t iq2xxs_grid[256] = {
diff --git a/ggml-quants.h b/ggml-quants.h
index 99467936aa724..d7fefdb547911 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -253,3 +253,7 @@ size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
diff --git a/ggml.c b/ggml.c
index 5779f32d297e3..d7e01b81f0179 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18674,26 +18674,38 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
         case GGML_TYPE_Q4_0:
             {
                 GGML_ASSERT(start % QK4_0 == 0);
-                block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = ggml_quantize_q4_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q4_1:
             {
                 GGML_ASSERT(start % QK4_1 == 0);
-                block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = ggml_quantize_q4_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_0:
             {
                 GGML_ASSERT(start % QK5_0 == 0);
-                block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = ggml_quantize_q5_0(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q5_1:
             {
                 GGML_ASSERT(start % QK5_1 == 0);
-                block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = ggml_quantize_q5_1(src + start, block, n, n, hist);
+                GGML_ASSERT(start % n_per_row == 0);
+                size_t start_row = start / n_per_row;
+                size_t row_size = ggml_row_size(type, n_per_row);
+                result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
+                GGML_ASSERT(result == row_size * nrows);
             } break;
         case GGML_TYPE_Q8_0:
             {
diff --git a/llama.cpp b/llama.cpp
index 46c4d11c88873..765d20ddb639a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8374,6 +8374,8 @@ struct quantize_state_internal {
     int n_k_quantized     = 0;
     int n_fallback        = 0;
 
+    bool has_imatrix      = false;
+
     quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
         : model(model)
         , params(params)
@@ -8546,6 +8548,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
             new_type = GGML_TYPE_Q5_K;
         }
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
+                && qs.has_imatrix && i_layer < n_layer/8) {
+            // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
+            // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
+            // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
+            new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
+        }
         ++qs.i_feed_forward_w2;
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
@@ -8669,6 +8678,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
         if (imatrix_data) {
             LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
+            qs.has_imatrix = true;
         }
     }
 

From cec8a4847062fbd76253e3b085683f39d91e80d3 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius <daniel.bevenius@gmail.com>
Date: Tue, 16 Jan 2024 18:54:24 +0100
Subject: [PATCH 13/25] finetune : add training data file to log message
 (#4979)

This commit adds the name of the training data file to the log message
printed when the training data is tokenized.

The motivation for this change is that it can be useful to show which
file is being tokenized when running the finetune example.

Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
---
 examples/finetune/finetune.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index a6620fd73ca18..11fcbf443326e 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -1799,7 +1799,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> train_tokens;
     std::vector<size_t> train_samples_begin;
     std::vector<size_t> train_samples_size;
-    printf("%s: tokenize training data\n", __func__);
+    printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
     tokenize_file(lctx,
             params.common.fn_train_data,
             params.common.sample_start,

From bee938da74c33f42242c3a1058ac0a0a6eeef531 Mon Sep 17 00:00:00 2001
From: Philip Taron <philip.taron@gmail.com>
Date: Tue, 16 Jan 2024 09:56:21 -0800
Subject: [PATCH 14/25] nix: remove nixConfig from flake.nix (#4984)

---
 flake.nix | 57 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

diff --git a/flake.nix b/flake.nix
index 488ed6c59d963..ec62c773a644f 100644
--- a/flake.nix
+++ b/flake.nix
@@ -6,28 +6,41 @@
     flake-parts.url = "github:hercules-ci/flake-parts";
   };
 
-  # Optional binary cache
-  nixConfig = {
-    extra-substituters = [
-      # Populated by the CI in ggerganov/llama.cpp
-      "https://llama-cpp.cachix.org"
-
-      # A development cache for nixpkgs imported with `config.cudaSupport = true`.
-      # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
-      # This lets one skip building e.g. the CUDA-enabled openmpi.
-      # TODO: Replace once nix-community obtains an official one.
-      "https://cuda-maintainers.cachix.org"
-    ];
-
-    # Verify these are the same keys as published on
-    # - https://app.cachix.org/cache/llama-cpp
-    # - https://app.cachix.org/cache/cuda-maintainers
-    extra-trusted-public-keys = [
-      "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
-      "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
-    ];
-  };
-
+  # There's an optional binary cache available. The details are below, but they're commented out.
+  #
+  # Why? The terrible experience of being prompted to accept them on every single Nix command run.
+  # Plus, there are warnings shown about not being a trusted user on a default Nix install
+  # if you *do* say yes to the prompts.
+  #
+  # This experience makes having `nixConfig` in a flake a persistent UX problem.
+  #
+  # To make use of the binary cache, please add the relevant settings to your `nix.conf`.
+  # It's located at `/etc/nix/nix.conf` on non-NixOS systems. On NixOS, adjust the `nix.settings`
+  # option in your NixOS configuration to add `extra-substituters` and `extra-trusted-public-keys`,
+  # as shown below.
+  #
+  # ```
+  # nixConfig = {
+  #   extra-substituters = [
+  #     # Populated by the CI in ggerganov/llama.cpp
+  #     "https://llama-cpp.cachix.org"
+  #
+  #     # A development cache for nixpkgs imported with `config.cudaSupport = true`.
+  #     # Populated by https://hercules-ci.com/github/SomeoneSerge/nixpkgs-cuda-ci.
+  #     # This lets one skip building e.g. the CUDA-enabled openmpi.
+  #     # TODO: Replace once nix-community obtains an official one.
+  #     "https://cuda-maintainers.cachix.org"
+  #   ];
+  #
+  #   # Verify these are the same keys as published on
+  #   # - https://app.cachix.org/cache/llama-cpp
+  #   # - https://app.cachix.org/cache/cuda-maintainers
+  #   extra-trusted-public-keys = [
+  #     "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc="
+  #     "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E="
+  #   ];
+  # };
+  # ```
 
   # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
   #

From 5c999609013a30c06e6fd28be8db5c2074bcc196 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Tue, 16 Jan 2024 20:59:31 +0200
Subject: [PATCH 15/25] py : remove unnecessary hasattr (#4903)

---
 convert-hf-to-gguf.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index b133f3b49f719..1178d63a231fa 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -266,11 +266,10 @@ def _set_vocab_gpt2(self):
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
-                if hasattr(tokenizer, "added_tokens_decoder"):
-                    if tokenizer.added_tokens_decoder[i].special:
-                        toktypes.append(gguf.TokenType.CONTROL)
-                    else:
-                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                if tokenizer.added_tokens_decoder[i].special:
+                    toktypes.append(gguf.TokenType.CONTROL)
+                else:
+                    toktypes.append(gguf.TokenType.USER_DEFINED)
             else:
                 tokens.append(reverse_vocab[i])
                 toktypes.append(gguf.TokenType.NORMAL)

From f46c0c1b0ea0bc67e24e4bf026a7e898c1af22a9 Mon Sep 17 00:00:00 2001
From: David Renshaw <dwrenshaw@gmail.com>
Date: Wed, 17 Jan 2024 02:17:50 -0500
Subject: [PATCH 16/25] llama : fix copy/paste error in llama_sampling_params
 comment (#4994)

---
 common/sampling.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common/sampling.h b/common/sampling.h
index f16ef97e34a10..2ee1803761aa2 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -17,7 +17,7 @@ typedef struct llama_sampling_params {
     float       min_p                 = 0.05f;    // 0.0 = disabled
     float       tfs_z                 = 1.00f;    // 1.0 = disabled
     float       typical_p             = 1.00f;    // 1.0 = disabled
-    float       temp                  = 0.80f;    // 1.0 = disabled
+    float       temp                  = 0.80f;    // <= 0.0 to sample greedily, 0.0 to not output probabilities
     int32_t     penalty_last_n        = 64;       // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float       penalty_repeat        = 1.10f;    // 1.0 = disabled
     float       penalty_freq          = 0.00f;    // 0.0 = disabled

From 75632936659772d5b2ce54b0b65319fecbaac2e6 Mon Sep 17 00:00:00 2001
From: Paul Tsochantaris <ptsochantaris@icloud.com>
Date: Wed, 17 Jan 2024 08:07:24 +0000
Subject: [PATCH 17/25] metal : remove unnecessary nil check (#4986)

---
 ggml-metal.m | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/ggml-metal.m b/ggml-metal.m
index a549e6713e9bc..8bb4edd64db2e 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -2236,10 +2236,7 @@ static bool ggml_metal_graph_compute(
 #endif
         }
 
-        if (encoder != nil) {
-            [encoder endEncoding];
-            encoder = nil;
-        }
+        [encoder endEncoding];
 
         [command_buffer commit];
     });

From 2b3a665d3917edf393761a24c4835447894df74a Mon Sep 17 00:00:00 2001
From: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Date: Wed, 17 Jan 2024 12:36:37 +0200
Subject: [PATCH 18/25] llama : use Q4_K for attn_v for Q2_K_S when n_gqa >= 4
 (#4996)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
---
 llama.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 765d20ddb639a..2c5983c67f671 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8477,7 +8477,12 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
     } else if (name.find("attn_v.weight") != std::string::npos) {
-        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }

From 4f4bf35f46600441dec2f941e667291eeb9a18d8 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 15:45:03 +0200
Subject: [PATCH 19/25] py : fix missing added_tokens_dict for SPM and BPE
 vocabs (#4971)

* py : fix missing added_tokens_dict for SPM vocab

* py : pad with unknown tokens when data is missing

ggml-ci

* py : fix BPE vocab conversion

ggml-ci

* py : fix padded dummy tokens (I hope)
---
 convert.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/convert.py b/convert.py
index 3b613eefc6c2c..b47bb618513be 100755
--- a/convert.py
+++ b/convert.py
@@ -387,6 +387,7 @@ def __init__(
         self.bpe_tokenizer = json.loads(
             open(str(fname_tokenizer), encoding="utf-8").read()
         )
+        self.vocab = self.bpe_tokenizer["model"]["vocab"]
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
@@ -405,7 +406,7 @@ def __init__(
                     if item["content"] not in self.bpe_tokenizer
                 )
 
-        vocab_size: int = len(self.bpe_tokenizer)
+        vocab_size: int = len(self.vocab)
         expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
         actual_ids = sorted(added_tokens.values())
         if expected_ids != actual_ids:
@@ -415,6 +416,7 @@ def __init__(
             )
 
         items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [text for (text, idx) in items]
         self.vocab_size_base: int = vocab_size
         self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list)
@@ -422,10 +424,9 @@ def __init__(
         self.fname_added_tokens = fname_added_tokens
 
     def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.bpe_tokenizer
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
 
-        for i, _ in enumerate(tokenizer):
+        for i, _ in enumerate(self.vocab):
             yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
 
     def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
@@ -466,6 +467,7 @@ def __init__(
             )
 
         # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict = added_tokens
         self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
         self.vocab_size_base = vocab_size
         self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
@@ -1006,6 +1008,7 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N
         )
         for i in range(1, pad_count + 1):
             vocab.added_tokens_dict[f"<dummy{i:05}>"] = -1
+            vocab.added_tokens_list.append(f"<dummy{i:05}>")
         vocab.vocab_size = params.n_vocab
         return
 
@@ -1097,6 +1100,8 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]
             scores.append(score)
             toktypes.append(toktype)
 
+        assert(len(tokens) == vocab.vocab_size)
+
         return tokens, scores, toktypes
 
     def add_meta_vocab(self, vocab: Vocab) -> None:
@@ -1373,15 +1378,14 @@ def _detect_files(self):
                 self.files[file] = file_path
             elif parent_file_path.exists():
                 self.files[file] = parent_file_path
+        print(f"Found vocab files: {self.files}")
 
     def _select_file(self, vocabtype: Optional[str]) -> Path:
         if vocabtype in ["spm", "bpe"]:
-            # For SentencePiece and BPE, return specific files as before
-            file_key = "tokenizer.model" if vocabtype == "spm" else "vocab.json"
-            if self.files[file_key]:
-                return self.files[file_key]
-            else:
-                raise FileNotFoundError(f"{vocabtype} {file_key} not found.")
+            for file_key in self.files.keys():
+                if self.files[file_key]:
+                    return self.files[file_key]
+            raise FileNotFoundError(f"{vocabtype} vocab not found.")
         elif vocabtype == "hfft":
             # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file
             return self.path

From 0f83e727af0a7cadf90b7ecc1f8e35de1d0880bc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:37:36 +0200
Subject: [PATCH 20/25] py : fix whitespace

---
 convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert.py b/convert.py
index b47bb618513be..e38ee5315af30 100755
--- a/convert.py
+++ b/convert.py
@@ -1100,7 +1100,7 @@ def extract_vocabulary_from_model(self, vocab: Vocab) -> Tuple[list, list, list]
             scores.append(score)
             toktypes.append(toktype)
 
-        assert(len(tokens) == vocab.vocab_size)
+        assert len(tokens) == vocab.vocab_size
 
         return tokens, scores, toktypes
 

From c918fe8dca8fa1c4602427e0a4b88e20046f6c34 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:38:39 +0200
Subject: [PATCH 21/25] metal : create autorelease pool during library build
 (#4970)

* metal : create autorelease pool during library build

ggml-ci

* test : simplify

ggml-ci
---
 .gitignore                 |  1 +
 Makefile                   |  5 ++++-
 ci/run.sh                  |  2 ++
 ggml-metal.m               | 19 +++++++++----------
 tests/CMakeLists.txt       |  1 +
 tests/test-autorelease.cpp | 28 ++++++++++++++++++++++++++++
 6 files changed, 45 insertions(+), 11 deletions(-)
 create mode 100644 tests/test-autorelease.cpp

diff --git a/.gitignore b/.gitignore
index fba207045344c..5ab81445d98f1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -105,3 +105,4 @@ poetry.toml
 /tests/test-tokenizer-1-bpe
 /tests/test-rope
 /tests/test-backend-ops
+/tests/test-autorelease
diff --git a/Makefile b/Makefile
index 995b89f7adac9..a8658a596eee0 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
 	tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope      \
-	tests/test-backend-ops
+	tests/test-backend-ops tests/test-autorelease
 
 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -747,3 +747,6 @@ tests/test-c.o: tests/test-c.c llama.h
 
 tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
diff --git a/ci/run.sh b/ci/run.sh
index 47a254f4cf1e8..86293f0dbdfd6 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -179,6 +179,8 @@ function gg_run_open_llama_3b_v2 {
 
     wiki_test_60="${path_wiki}/wiki.test-60.raw"
 
+    ./bin/test-autorelease ${model_f16}
+
     ./bin/quantize ${model_f16} ${model_q8_0} q8_0
     ./bin/quantize ${model_f16} ${model_q4_0} q4_0
     ./bin/quantize ${model_f16} ${model_q4_1} q4_1
diff --git a/ggml-metal.m b/ggml-metal.m
index 8bb4edd64db2e..66d4d675eb32f 100644
--- a/ggml-metal.m
+++ b/ggml-metal.m
@@ -303,22 +303,21 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
                 return NULL;
             }
 
-            // dictionary of preprocessor macros
-            NSMutableDictionary * prep = [NSMutableDictionary dictionary];
+            @autoreleasepool {
+                // dictionary of preprocessor macros
+                NSMutableDictionary * prep = [NSMutableDictionary dictionary];
 
 #ifdef GGML_QKK_64
-            prep[@"QK_K"] = @(64);
+                prep[@"QK_K"] = @(64);
 #endif
 
-            MTLCompileOptions* options = [MTLCompileOptions new];
-            options.preprocessorMacros = prep;
+                MTLCompileOptions* options = [MTLCompileOptions new];
+                options.preprocessorMacros = prep;
 
-            //[options setFastMathEnabled:false];
+                //[options setFastMathEnabled:false];
 
-            ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
-
-            [options release];
-            [prep release];
+                ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+            }
         }
 
         if (error) {
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 7c932240de82d..d7aaab8430faf 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -49,6 +49,7 @@ llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp)
 # llama_build_and_test_executable(test-opt.cpp) # SLOW
 llama_build_and_test_executable(test-backend-ops.cpp)
+llama_build_and_test_executable(test-autorelease.cpp)
 
 llama_build_and_test_executable(test-rope.cpp)
 
diff --git a/tests/test-autorelease.cpp b/tests/test-autorelease.cpp
new file mode 100644
index 0000000000000..289c6ba6c6da3
--- /dev/null
+++ b/tests/test-autorelease.cpp
@@ -0,0 +1,28 @@
+// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
+
+#include <cstdio>
+#include <string>
+#include <thread>
+
+#include "llama.h"
+
+// Regression test: creates and frees a llama context inside a secondary thread (std::thread) and then tries to exit cleanly.
+int main(int argc, char ** argv) {
+    if (argc < 2) {
+        printf("Usage: %s model.gguf\n", argv[0]);
+        return 0; // intentionally return success, so the test passes when it is run without a model argument
+    }
+
+    const std::string fname = argv[1];
+
+    std::thread([&fname]() { // fname is captured by reference; the thread is joined below, before fname goes out of scope
+        llama_backend_init(false);
+        auto * model = llama_load_model_from_file(fname.c_str(), llama_model_default_params()); // NOTE(review): result is not checked for NULL
+        auto * ctx = llama_new_context_with_model(model, llama_context_default_params()); // NOTE(review): result is not checked for NULL
+        llama_free(ctx);
+        llama_free_model(model);
+        llama_backend_free();
+    }).join(); // wait for the worker thread so that all init/free calls complete before main returns
+
+    return 0;
+}

From 44a1a4a41a4c0b03afaa7d9e06bcbc7cf95aa1e6 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:39:41 +0200
Subject: [PATCH 22/25] backend : add eval callback (#4935)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when the user doesn't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* simple : no need for ggml_is_contiguous + fix bool parse

* llama : fix callback placement in llama_context_params

* backend : avoid double-ask callback calls

* simple : restore examples, imatrix will serve as a demo
---
 ggml-backend.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 ggml-backend.h | 11 +++++++++++
 llama.cpp      |  9 +++++++++
 llama.h        |  4 ++++
 4 files changed, 64 insertions(+), 2 deletions(-)

diff --git a/ggml-backend.c b/ggml-backend.c
index f5424fb904117..4266250f926ee 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -802,6 +802,9 @@ struct ggml_backend_sched {
     __attribute__((aligned(GGML_MEM_ALIGN)))
     #endif
     char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
+
+    ggml_backend_sched_eval_callback callback_eval;
+    void * callback_eval_user_data;
 };
 
 #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
@@ -1324,9 +1327,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
         ggml_graph_dump_dot(split->graph, NULL, split_filename);
 #endif
 
+
         uint64_t compute_start_us = ggml_time_us();
-        ggml_backend_graph_compute(split_backend, &split->graph);
-        //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        if (!sched->callback_eval) {
+            ggml_backend_graph_compute(split_backend, &split->graph);
+          //ggml_backend_synchronize(split_backend); // necessary to measure compute time
+        } else {
+            // similar to ggml_backend_compare_graph_backend
+            for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
+                struct ggml_tensor * t = split->graph.nodes[j0];
+
+                // check if the user needs data from this node
+                bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+
+                int j1 = j0;
+
+                // determine the range [j0, j1] of nodes that can be computed together
+                while (!need && j1 < split->graph.n_nodes - 1) {
+                    t = split->graph.nodes[++j1];
+                    need = sched->callback_eval(t, true, sched->callback_eval_user_data);
+                }
+
+                struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
+
+                ggml_backend_graph_compute(split_backend, &gv);
+
+                if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
+                    break;
+                }
+
+                j0 = j1;
+            }
+        }
         uint64_t compute_end_us = ggml_time_us();
         compute_us[split_backend_id] += compute_end_us - compute_start_us;
     }
@@ -1431,6 +1463,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
     sched_reset(sched);
 }
 
+// Store the eval callback and its user data in the scheduler; with a NULL callback each split is computed in a single call
+void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
+    sched->callback_eval = callback;
+    sched->callback_eval_user_data = user_data;
+}
+
 int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
     return sched->n_splits;
 }
diff --git a/ggml-backend.h b/ggml-backend.h
index 12b4b4ab74935..ab4ad773ffbce 100644
--- a/ggml-backend.h
+++ b/ggml-backend.h
@@ -148,6 +148,14 @@ extern "C" {
     struct ggml_backend_sched;
     typedef struct ggml_backend_sched * ggml_backend_sched_t;
 
+    // Evaluation callback type; the scheduler calls it in two phases while computing the graph nodes:
+    // - ask == true : the scheduler wants to know if the user wants to observe node t (return true to observe it);
+    //   this allows the scheduler to batch unobserved nodes together in order to evaluate them in a single call
+    // - ask == false: the scheduler is passing the computed node tensor t to the user for observation;
+    //   if the user returns false, the scheduler will cancel the graph compute
+    //   (NOTE(review): the compute loop breaks out of the current node range - confirm that later splits are skipped too)
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
     // Initialize a backend scheduler
     GGML_API ggml_backend_sched_t  ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void                  ggml_backend_sched_free(ggml_backend_sched_t sched);
@@ -168,6 +176,9 @@ extern "C" {
     // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
     GGML_API void                  ggml_backend_sched_reset(ggml_backend_sched_t sched);
 
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void                  ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
     //
     // Utils
     //
diff --git a/llama.cpp b/llama.cpp
index 2c5983c67f671..81829b13e4e94 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -1393,6 +1393,9 @@ struct llama_cparams {
 
     bool mul_mat_q;
     bool offload_kqv;
+
+    ggml_backend_sched_eval_callback cb_eval;
+    void * cb_eval_user_data;
 };
 
 struct llama_layer {
@@ -6254,6 +6257,7 @@ static int llama_decode_internal(
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
     ggml_backend_sched_reset(lctx.sched);
+    ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
     ggml_cgraph * gf = llama_build_graph(lctx, batch);
 
@@ -9276,6 +9280,8 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
         /*.yarn_orig_ctx               =*/ 0,
+        /*.cb_eval                     =*/ nullptr,
+        /*.cb_eval_user_data           =*/ nullptr,
         /*.type_k                      =*/ GGML_TYPE_F16,
         /*.type_v                      =*/ GGML_TYPE_F16,
         /*.mul_mat_q                   =*/ true,
@@ -9416,6 +9422,9 @@ struct llama_context * llama_new_context_with_model(
                                hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx :
                                                               hparams.n_ctx_train;
 
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
+
     auto rope_scaling_type = params.rope_scaling_type;
     if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
diff --git a/llama.h b/llama.h
index a570b0d6968fb..e268d7a1d0cc9 100644
--- a/llama.h
+++ b/llama.h
@@ -2,6 +2,7 @@
 #define LLAMA_H
 
 #include "ggml.h"
+#include "ggml-backend.h"
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
@@ -231,6 +232,9 @@ extern "C" {
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
 
+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
         enum ggml_type type_k; // data type for K cache
         enum ggml_type type_v; // data type for V cache
 

From ba69bbc84ced580fe4fdb0713ca2d95634325b7a Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:46:30 +0200
Subject: [PATCH 23/25] imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when the user doesn't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id handling

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
---
 ci/run.sh                    |  11 ++-
 examples/imatrix/imatrix.cpp | 150 +++++++++++++++++++++++++++--------
 ggml.c                       |  14 ----
 ggml.h                       |   6 --
 4 files changed, 128 insertions(+), 53 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index 86293f0dbdfd6..f3a8ff774afbc 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -216,6 +216,8 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
     (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
@@ -243,6 +245,8 @@ function gg_run_open_llama_3b_v2 {
     check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
     # lora
     function compare_ppl {
         qnt="$1"
@@ -284,7 +288,6 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
     compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
 
-
     set +e
 }
 
@@ -294,6 +297,7 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf 'OpenLLaMA 3B-v2:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
     gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
@@ -393,6 +397,8 @@ function gg_run_open_llama_7b_v2 {
     (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
     (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
 
+    (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
     (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
 
     function check_ppl {
@@ -420,6 +426,8 @@ function gg_run_open_llama_7b_v2 {
     check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
     check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
 
+    cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
     # lora
     function compare_ppl {
         qnt="$1"
@@ -471,6 +479,7 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf 'OpenLLaMA 7B-v2:\n'
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+    gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
     gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 1461bc96376a7..af78711c5ab66 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -33,43 +33,120 @@ class IMatrixCollector {
 public:
     IMatrixCollector() = default;
     void set_parameters(StatParams&& params) { m_params = std::move(params); }
-    void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
+    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
     void save_imatrix() const;
 private:
     std::unordered_map<std::string, Stats> m_stats;
     StatParams                             m_params;
     std::mutex                             m_mutex;
     int                                    m_last_call = 0;
+    std::vector<float>                     m_src1_data;
+    std::vector<int>                       m_ids; // the expert ids from ggml_mul_mat_id
 };
 
-void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
-    if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto& e = m_stats[src0->name];
-    if (e.values.empty()) {
-        e.values.resize(src1->ne[0], 0);
-    }
-    else if (e.values.size() != (size_t)src1->ne[0]) {
-        fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
-        exit(1); //GGML_ASSERT(false);
+bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
+    GGML_UNUSED(user_data);
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
+    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
+    if (ask) {
+        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
+        if (t->op != GGML_OP_MUL_MAT) return false; // every other op is ignored
+        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; // need at least 16 rows of F32 data in src1
+        if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; // only "blk.*" weights and, optionally, "output.weight"
+        return true;
     }
-    ++e.ncall;
-    if (m_params.verbosity > 1) {
-        printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
+
+    std::lock_guard<std::mutex> lock(m_mutex); // held for the rest of the call, while m_stats and the scratch buffers are modified
+
+    // copy the data from the GPU memory if needed
+    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
+
+    if (!is_host) {
+        m_src1_data.resize(ggml_nelements(src1)); // one float per element - NOTE(review): assumes src1 is F32, which the ask branch only enforces for MUL_MAT
+        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
     }
-    for (int row = 0; row < (int)src1->ne[1]; ++row) {
-        const float * x = (const float *)src1->data + row * src1->ne[0];
-        for (int j = 0; j < (int)src1->ne[0]; ++j) {
-            e.values[j] += x[j]*x[j];
+
+    const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
+
+    if (t->op == GGML_OP_MUL_MAT_ID) {
+        const int idx  = ((int32_t *) t->op_params)[0]; // position within each row's list of selected expert ids (see the m_ids lookup below)
+        const int n_as = ((int32_t *) t->op_params)[1]; // number of expert tensors, stored in t->src[2 .. 2 + n_as - 1]
+
+        // the top-k selected expert ids are stored in the src0 tensor
+        // for simplicity, always copy src0 to host, because it is small
+        // take into account that src0 is not contiguous!
+        GGML_ASSERT(src0->ne[1] == src1->ne[1]);
+        GGML_ASSERT(n_as*ggml_nrows(src0)); // NOTE(review): only checks that the product is non-zero - a comparison against the ids buffer size may have been intended
+        m_ids.resize(ggml_nbytes(src0)/sizeof(int));
+        ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
+
+        // loop over all possible experts, regardless if they are used or not in the batch
+        // this is necessary to guarantee equal number of "ncall" for each tensor
+        for (int ex = 0; ex < n_as; ++ex) {
+            src0 = t->src[2 + ex]; // from here on src0 is the weight tensor of expert ex
+            auto& e = m_stats[src0->name];
+            if (e.values.empty()) {
+                e.values.resize(src1->ne[0], 0);
+            }
+            else if (e.values.size() != (size_t)src1->ne[0]) {
+                fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+                exit(1); //GGML_ASSERT(false);
+            }
+            // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
+            //       using the following line, we can correct for that if needed
+            //if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
+            ++e.ncall;
+            if (m_params.verbosity > 1) {
+                printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+            }
+            for (int row = 0; row < (int)src1->ne[1]; ++row) {
+                const int excur = m_ids[row*n_as + idx];
+                GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
+                if (excur != ex) continue; // this row selected a different expert
+                const float * x = data + row * src1->ne[0];
+                for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                    e.values[j] += x[j]*x[j]; // accumulate the squared values per column
+                }
+            }
+            if (e.ncall > m_last_call) {
+                m_last_call = e.ncall;
+                if (m_last_call % m_params.n_output_frequency == 0) { // save every n_output_frequency calls
+                    save_imatrix();
+                }
+            }
         }
-    }
-    if (e.ncall > m_last_call) {
-        m_last_call = e.ncall;
-        if (m_last_call % m_params.n_output_frequency == 0) {
-            save_imatrix();
+    } else {
+        auto& e = m_stats[src0->name];
+        if (e.values.empty()) {
+            e.values.resize(src1->ne[0], 0);
+        }
+        else if (e.values.size() != (size_t)src1->ne[0]) {
+            fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
+            exit(1); //GGML_ASSERT(false);
+        }
+        ++e.ncall;
+        if (m_params.verbosity > 1) {
+            printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
+        }
+        for (int row = 0; row < (int)src1->ne[1]; ++row) {
+            const float * x = data + row * src1->ne[0];
+            for (int j = 0; j < (int)src1->ne[0]; ++j) {
+                e.values[j] += x[j]*x[j]; // accumulate the squared values per column
+            }
+        }
+        if (e.ncall > m_last_call) {
+            m_last_call = e.ncall;
+            if (m_last_call % m_params.n_output_frequency == 0) { // save every n_output_frequency calls
+                save_imatrix();
+            }
         }
     }
+
+    return true; // the collection itself never asks the scheduler to stop
 }
 
 void IMatrixCollector::save_imatrix() const {
@@ -93,8 +170,8 @@ void IMatrixCollector::save_imatrix() const {
 
 static IMatrixCollector g_collector;
 
-static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
-    g_collector.collect_imatrix(src0, src1);
+static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { // plain-function adapter that can be assigned to cparams.cb_eval
+    return g_collector.collect_imatrix(t, ask, user_data); // forward to the global collector and pass its answer back to the scheduler
 }
 
 
@@ -320,8 +397,6 @@ int main(int argc, char ** argv) {
 
     g_collector.set_parameters(std::move(sparams));
 
-    ggml_set_imatrix_collection(ik_collect_imatrix);
-
     params.logits_all = true;
     params.n_batch = std::min(params.n_batch, params.n_ctx);
 
@@ -340,16 +415,27 @@ int main(int argc, char ** argv) {
 
     llama_backend_init(params.numa);
 
-    llama_model * model;
-    llama_context * ctx;
+    llama_model_params mparams = llama_model_params_from_gpt_params(params);
 
-    // load the model and apply lora adapter, if any
-    std::tie(model, ctx) = llama_init_from_gpt_params(params);
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
     if (model == NULL) {
         fprintf(stderr, "%s: error: unable to load model\n", __func__);
         return 1;
     }
 
+    llama_context_params cparams = llama_context_params_from_gpt_params(params);
+
+    // pass the callback to the backend scheduler
+    // it will be executed for each node during the graph computation
+    cparams.cb_eval = ik_collect_imatrix;
+    cparams.cb_eval_user_data = NULL;
+
+    llama_context * ctx = llama_new_context_with_model(model, cparams);
+    if (ctx == NULL) {
+        fprintf(stderr, "%s: error: unable to create context\n", __func__);
+        return 1;
+    }
+
     const int n_ctx_train = llama_n_ctx_train(model);
     if (params.n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
diff --git a/ggml.c b/ggml.c
index d7e01b81f0179..35fd29a9ec2dc 100644
--- a/ggml.c
+++ b/ggml.c
@@ -394,12 +394,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
 static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
 
-ggml_collect_imatrix_t g_imatrix_collect = NULL;
-
-void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
-    g_imatrix_collect = imatrix_collect;
-}
-
 static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
     [GGML_TYPE_I8] = {
         .type_name                = "i8",
@@ -9790,10 +9784,6 @@ static void ggml_compute_forward_mul_mat(
     const int ith = params->ith;
     const int nth = params->nth;
 
-    if (ith == 1 && g_imatrix_collect) {
-        g_imatrix_collect(src0, src1);
-    }
-
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -10097,10 +10087,6 @@ static void ggml_compute_forward_mul_mat_id(
 
         const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
 
-        if (ith == 1 && g_imatrix_collect) {
-            g_imatrix_collect(src0_cur, src1);
-        }
-
         const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
diff --git a/ggml.h b/ggml.h
index 837c52e68c90c..27daf6fd1e12b 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2085,12 +2085,6 @@ extern "C" {
     GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
     GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
 
-    //
-    // Importance matrix
-    //
-    typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
-    GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
-
     //
     // gguf
     //

From 38566680cdfe982a495562332c25b9227de9cf8d Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 18:54:56 +0200
Subject: [PATCH 24/25] ggml : add IQ2 to test-backend-ops + refactoring
 (#4990)

* ggml : add IQ2 to test-backend-ops + refactoring

ggml-ci

* cuda : update supports_op for IQ2

ggml-ci

* ci : enable LLAMA_CUBLAS=1 for CUDA nodes

ggml-ci

* cuda : fix out-of-bounds-access in `mul_mat_vec_q`

ggml-ci

* tests : avoid creating RNGs for each Q tensor

ggml-ci

* tests : avoid creating RNGs for each tensor

ggml-ci
---
 ci/run.sh                  | 12 ++++---
 ggml-backend.c             |  2 ++
 ggml-cuda.cu               | 12 +++++--
 ggml-quants.c              | 74 ++++++++++++++++----------------------
 ggml-quants.h              |  3 ++
 ggml.c                     | 34 ++++++++++++++++--
 ggml.h                     | 20 ++++++++---
 llama.cpp                  | 12 +------
 tests/test-backend-ops.cpp | 46 ++++++++++++++----------
 9 files changed, 128 insertions(+), 87 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index f3a8ff774afbc..791b17a191a2b 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -36,6 +36,10 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
 fi
 
+if [ ! -z ${GG_BUILD_CUDA} ]; then
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
+fi
+
 ## helpers
 
 # download a file if it does not exist or if it is outdated
@@ -160,8 +164,8 @@ function gg_run_open_llama_3b_v2 {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert.py ${path_models}
 
@@ -343,8 +347,8 @@ function gg_run_open_llama_7b_v2 {
 
     set -e
 
-    (time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j                                              ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j                                                             ) 2>&1 | tee -a $OUT/${ci}-make.log
 
     python3 ../convert.py ${path_models}
 
diff --git a/ggml-backend.c b/ggml-backend.c
index 4266250f926ee..ef518dae0909b 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -692,6 +692,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     switch (op->op) {
+        case GGML_OP_CPY:
+            return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
             return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
         default:
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 568c411afd3ee..b2211d858c23a 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -5131,10 +5131,10 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
     const block_q_t  * x = (const block_q_t  *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
+    for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row*blocks_per_row + i; // x block index
 
-        const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
 
         const int iqs  = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
 
@@ -10918,6 +10918,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
                 if (a->ne[3] != b->ne[3]) {
                     return false;
                 }
+                ggml_type a_type = a->type;
+                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
+                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                        return false;
+                    }
+                }
                 return true;
             } break;
         case GGML_OP_GET_ROWS:
diff --git a/ggml-quants.c b/ggml-quants.c
index 31b053e335787..7d2f033e9a0fe 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -1274,7 +1274,12 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
     }
     float sumlx = 0;
     float suml2 = 0;
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 0; i < n; ++i) {
+#else
     for (int i = 0; i < n; ++i) {
+#endif
         int l = nearest_int(iscale * x[i]);
         l = MAX(-nmax, MIN(nmax-1, l));
         L[i] = l + nmax;
@@ -1649,7 +1654,12 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
     float max = x[0];
     float sum_w = weights ? weights[0] : x[0]*x[0];
     float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
     for (int i = 1; i < n; ++i) {
+#endif
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
         float w = weights ? weights[i] : x[i]*x[i];
@@ -1660,7 +1670,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
         min = 0;
     }
     if (max <= min) {
-        for (int i = 0; i < n; ++i) L[i] = 0;
+        memset(L, 0, n);
         *the_min = -min;
         return 0.f;
     }
@@ -1862,7 +1872,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
 
 size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
     }
@@ -2181,7 +2191,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
 
 size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
     }
@@ -2448,7 +2458,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
 
 size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
     }
@@ -2771,7 +2781,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
 
 size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
     }
@@ -3025,7 +3035,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
 
 size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
     (void)hist;
-    int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
     if (!quant_weights) {
         quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
     }
@@ -3072,7 +3082,7 @@ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
@@ -3116,7 +3126,7 @@ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
@@ -3169,7 +3179,7 @@ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
@@ -3221,7 +3231,7 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
     if (!quant_weights) {
         return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
     }
-    int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
+    size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
         quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
@@ -8565,7 +8575,7 @@ static int iq2_compare_func(const void * left, const void * right) {
     return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
 }
 
-static void q2xs_init_impl(int grid_size) {
+void iq2xs_init_impl(int grid_size) {
     const int gindex = iq2_data_index(grid_size);
     if (iq2_data[gindex].grid) {
         return;
@@ -8720,19 +8730,7 @@ static void q2xs_init_impl(int grid_size) {
     free(dist2);
 }
 
-void ggml_init_iq2_quantization(enum ggml_type type) {
-    if (type == GGML_TYPE_IQ2_XXS) {
-        q2xs_init_impl(256);
-    }
-    else if (type == GGML_TYPE_IQ2_XS) {
-        q2xs_init_impl(512);
-    }
-    else {
-        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
-    }
-}
-
-static void q2xs_deinit_impl(int grid_size) {
+void iq2xs_free_impl(int grid_size) {
     GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
     const int gindex = iq2_data_index(grid_size);
     if (iq2_data[gindex].grid) {
@@ -8742,18 +8740,6 @@ static void q2xs_deinit_impl(int grid_size) {
     }
 }
 
-void ggml_deinit_iq2_quantization(enum ggml_type type) {
-    if (type == GGML_TYPE_IQ2_XXS) {
-        q2xs_deinit_impl(256);
-    }
-    else if (type == GGML_TYPE_IQ2_XS) {
-        q2xs_deinit_impl(512);
-    }
-    else {
-        fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
-    }
-}
-
 static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
         const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
     int num_neighbors = neighbours[0];
@@ -8786,10 +8772,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
     const int      * kmap_q2xs       = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights);
-    GGML_ASSERT(kgrid_q2xs);
-    GGML_ASSERT(kmap_q2xs);
-    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
     const int kMaxQ = 3;
@@ -9005,10 +8991,10 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
     const int      * kmap_q2xs       = iq2_data[gindex].map;
     const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
 
-    GGML_ASSERT(quant_weights);
-    GGML_ASSERT(kmap_q2xs);
-    GGML_ASSERT(kgrid_q2xs);
-    GGML_ASSERT(kneighbors_q2xs);
+    GGML_ASSERT(quant_weights   && "missing quantization weights");
+    GGML_ASSERT(kmap_q2xs       && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kgrid_q2xs      && "forgot to call ggml_quantize_init()?");
+    GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
     const int kMaxQ = 3;
diff --git a/ggml-quants.h b/ggml-quants.h
index d7fefdb547911..7d7cf9178f76e 100644
--- a/ggml-quants.h
+++ b/ggml-quants.h
@@ -257,3 +257,6 @@ size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row,
 size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+void iq2xs_init_impl(int grid_size);
+void iq2xs_free_impl(int grid_size);
diff --git a/ggml.c b/ggml.c
index 35fd29a9ec2dc..cbf2d4bddddb8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -18524,6 +18524,28 @@ enum ggml_opt_result ggml_opt_resume_g(
 
 ////////////////////////////////////////////////////////////////////////////////
 
+void ggml_quantize_init(enum ggml_type type) {
+    ggml_critical_section_start();
+
+    switch (type) {
+        case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
+        case GGML_TYPE_IQ2_XS:  iq2xs_init_impl(512); break;
+        default: // nothing
+            break;
+    }
+
+    ggml_critical_section_end();
+}
+
+void ggml_quantize_free(void) {
+    ggml_critical_section_start();
+
+    iq2xs_free_impl(256);
+    iq2xs_free_impl(512);
+
+    ggml_critical_section_end();
+}
+
 size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
@@ -18651,9 +18673,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
     return (n/QK8_0*sizeof(block_q8_0));
 }
 
+bool ggml_quantize_requires_imatrix(enum ggml_type type) {
+    return
+        type == GGML_TYPE_IQ2_XXS ||
+        type == GGML_TYPE_IQ2_XS;
+}
+
 size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
         int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
-    (void)imatrix;
+    ggml_quantize_init(type); // this is noop if already initialized
     size_t result = 0;
     int n = nrows * n_per_row;
     switch (type) {
@@ -18766,13 +18794,13 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
             } break;
         case GGML_TYPE_F16:
             {
-                int elemsize = sizeof(ggml_fp16_t);
+                size_t elemsize = sizeof(ggml_fp16_t);
                 ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case GGML_TYPE_F32:
             {
-                int elemsize = sizeof(float);
+                size_t elemsize = sizeof(float);
                 result = n * elemsize;
                 memcpy((uint8_t *)dst + start * elemsize, src + start, result);
             } break;
diff --git a/ggml.h b/ggml.h
index 27daf6fd1e12b..de8162b8135f3 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2065,6 +2065,18 @@ extern "C" {
     // quantization
     //
 
+    // - ggml_quantize_init can be called multiple times with the same type
+    //   it will only initialize the quantization tables for the first call or after ggml_quantize_free
+    //   automatically called by ggml_quantize_chunk for convenience
+    //
+    // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
+    //   call this at the end of the program to avoid memory leaks
+    //
+    // note: these are thread-safe
+    //
+    GGML_API void ggml_quantize_init(enum ggml_type type);
+    GGML_API void ggml_quantize_free(void);
+
     // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
     GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@@ -2078,13 +2090,13 @@ extern "C" {
     GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
     GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
 
+    // some quantization type cannot be used without an importance matrix
+    GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
+
+    // calls ggml_quantize_init internally (i.e. can allocate memory)
     GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
             int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
-    // These are needed for IQ2_XS and IQ2_XXS quantizations
-    GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
-    GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
-
     //
     // gguf
     //
diff --git a/llama.cpp b/llama.cpp
index 81829b13e4e94..d28382f7d47b7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8747,8 +8747,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     // placeholder for the meta data
     ::zeros(fout, meta_size);
 
-    std::set<ggml_type> used_iq2;
-
     for (int i = 0; i < ml.n_tensors; ++i) {
         struct ggml_tensor * tensor = ml.get_tensor_meta(i);
 
@@ -8801,11 +8799,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             const size_t nelements = ggml_nelements(tensor);
 
-            if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
-                ggml_init_iq2_quantization(new_type);
-                used_iq2.insert(new_type);
-            }
-
             const float * imatrix = nullptr;
             if (imatrix_data) {
                 auto it = imatrix_data->find(tensor->name);
@@ -8931,10 +8924,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
     fout.close();
 
-    for (auto type : used_iq2) {
-        ggml_deinit_iq2_quantization(type);
-    }
-
     gguf_free(ctx_out);
 
     LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9342,6 +9331,7 @@ void llama_backend_free(void) {
 #ifdef GGML_USE_MPI
     ggml_mpi_backend_free();
 #endif
+    ggml_quantize_free();
 }
 
 int64_t llama_time_us(void) {
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 22a7856d46f41..55ce14e0d902c 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -16,39 +16,37 @@
 #include <vector>
 
 static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
+    // static RNG initialization (revisit if n_threads stops being constant)
+    static const size_t n_threads = std::thread::hardware_concurrency();
+    static std::vector<std::default_random_engine> generators = []() {
+        std::random_device rd;
+        std::vector<std::default_random_engine> vec;
+        vec.reserve(n_threads);
+        //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
+        for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
+        return vec;
+    }();
+
     size_t size = ggml_nelements(tensor);
     std::vector<float> data(size);
 
-#if 0
-    static std::default_random_engine generator(1234);
-    std::uniform_real_distribution<float> distribution(min, max);
-
-    for (size_t i = 0; i < size; i++) {
-        data[i] = distribution(generator);
-    }
-#else
-    auto init_thread = [&](size_t start, size_t end) {
-        std::random_device rd;
-        std::default_random_engine generator(rd());
+    auto init_thread = [&](size_t ith, size_t start, size_t end) {
         std::uniform_real_distribution<float> distribution(min, max);
-
         for (size_t i = start; i < end; i++) {
-            data[i] = distribution(generator);
+            data[i] = distribution(generators[ith]);
         }
     };
 
-    size_t n_threads = std::thread::hardware_concurrency();
     std::vector<std::thread> threads;
     threads.reserve(n_threads);
     for (size_t i = 0; i < n_threads; i++) {
         size_t start =     i*size/n_threads;
         size_t end   = (i+1)*size/n_threads;
-        threads.emplace_back(init_thread, start, end);
+        threads.emplace_back(init_thread, i, start, end);
     }
     for (auto & t : threads) {
         t.join();
     }
-#endif
 
     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
@@ -56,7 +54,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         int64_t hist[16];
-        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, nullptr);
+        std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
+        const float * im = imatrix.data();
+        if (!ggml_quantize_requires_imatrix(tensor->type)) {
+            // when the imatrix is optional, we want to test both quantization with and without imatrix
+            // use one of the random numbers to decide
+            if (data[0] > 0.5f*(min + max)) {
+                im = nullptr;
+            }
+        }
+        ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
         ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
     } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
         // This is going to create some weird integers though.
@@ -1472,7 +1479,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         GGML_TYPE_Q8_0,
         GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
         GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
-        GGML_TYPE_Q6_K
+        GGML_TYPE_Q6_K,
+        GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
     };
 
     // unary ops
@@ -1752,6 +1760,8 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
+    ggml_quantize_free();
+
     printf("\033[1;32mOK\033[0m\n");
     return 0;
 }

From 6b6916b215251e09bd57cdbf870dc8a73345edc2 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 17 Jan 2024 20:54:50 +0200
Subject: [PATCH 25/25] sync : ggml

---
 scripts/sync-ggml.last | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last
index be9e408fbeb39..4d52d946bbf9d 100644
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-b306d6e996ec0ace77118fa5098822cdc7f9c88f
+6c1ce0bd591a430c1d3f6797d905194581c878c1