diff --git a/.gitignore b/.gitignore
index 50ae0973ae3b3..8202bdb201ed7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,6 +40,7 @@ cmake-build-*
 out/
 tmp/
+loras/*
 models/*
 models-mnt
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 477c5b57c20e7..b802edd3194ba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -291,6 +291,8 @@ if (LLAMA_METAL)
 endif()
 
 if (LLAMA_BLAS)
+    message(STATUS "Building with OpenBLAS")
+
     if (LLAMA_STATIC)
         set(BLA_STATIC ON)
     endif()
@@ -299,77 +301,14 @@ if (LLAMA_BLAS)
     endif()
     set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
-    find_package(BLAS)
-
-    if (BLAS_FOUND)
-        message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
-
-        if ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
-            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
-            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
-            find_package(PkgConfig REQUIRED)
-            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
-                pkg_check_modules(DepBLAS REQUIRED blas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
-                # As of openblas v0.3.22, the 64-bit is named openblas64.pc
-                pkg_check_modules(DepBLAS openblas64)
-                if (NOT DepBLAS_FOUND)
-                    pkg_check_modules(DepBLAS REQUIRED openblas)
-                endif()
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
-                pkg_check_modules(DepBLAS REQUIRED blis)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
-                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
-                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
-                # all Intel* libraries share the same include path
-                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
-            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
-                # this doesn't provide pkg-config
-                # suggest to assign BLAS_INCLUDE_DIRS on your own
-                if ("${NVHPC_VERSION}" STREQUAL "")
-                    message(WARNING "Better to set NVHPC_VERSION")
-                else()
-                    set(DepBLAS_FOUND ON)
-                    set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
-                endif()
-            endif()
-            if (DepBLAS_FOUND)
-                set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
-            else()
-                message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
-                " detected by pkgconfig, trying to find cblas.h from possible paths...")
-                find_path(BLAS_INCLUDE_DIRS
-                    NAMES cblas.h
-                    HINTS
-                        /usr/include
-                        /usr/local/include
-                        /usr/include/openblas
-                        /opt/homebrew/opt/openblas/include
-                        /usr/local/opt/openblas/include
-                        /usr/include/x86_64-linux-gnu/openblas/include
-                )
-            endif()
-        endif()
+    add_compile_options(${BLAS_LINKER_FLAGS})
-        message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
+    add_compile_definitions(GGML_USE_OPENBLAS)
-        add_compile_options(${BLAS_LINKER_FLAGS})
+    add_subdirectory(../OpenBLAS ${CMAKE_CURRENT_BINARY_DIR}/OpenBLAS)
-        add_compile_definitions(GGML_USE_OPENBLAS)
-
-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
-            add_compile_definitions(GGML_BLAS_USE_MKL)
-        endif()
-
-        set(LLAMA_EXTRA_LIBS     ${LLAMA_EXTRA_LIBS}     ${BLAS_LIBRARIES})
-        set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
-    else()
-        message(WARNING "BLAS not found, please refer to "
-                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-                        " to set correct LLAMA_BLAS_VENDOR")
-    endif()
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} openblas_shared)
+    set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ../OpenBLAS ${CMAKE_CURRENT_BINARY_DIR}/OpenBLAS)
 endif()
 
 if (LLAMA_LLAMAFILE)
@@ -485,19 +424,24 @@ if (LLAMA_MPI)
 endif()
 
 if (LLAMA_CLBLAST)
-    find_package(CLBlast)
-    if (CLBlast_FOUND)
-        message(STATUS "CLBlast found")
+    message(STATUS "Building with CLBlast")
 
-        set(GGML_HEADERS_OPENCL ggml-opencl.h)
-        set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
+    set(GGML_HEADERS_OPENCL ggml-opencl.h)
+    set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
 
-        add_compile_definitions(GGML_USE_CLBLAST)
+    add_compile_definitions(GGML_USE_CLBLAST)
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
-    else()
-        message(WARNING "CLBlast not found")
-    endif()
+    # link our libOpenCL.so (this is only used during compile time)
+    add_library(OpenCL SHARED IMPORTED)
+    set_target_properties(OpenCL PROPERTIES IMPORTED_LOCATION ${PROJECT_SOURCE_DIR}/../OpenCL/lib/libOpenCL.so)
+
+    # add our prebuilt clblast library
+    add_library(clblast SHARED IMPORTED)
+    set_target_properties(clblast PROPERTIES IMPORTED_LOCATION ${PROJECT_SOURCE_DIR}/../../android/app/src/main/jniLibs/${ANDROID_ABI}/libclblast.so)
+
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast OpenCL)
+    set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ../CLBlast/include)
+    set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ../OpenCL/include)
 endif()
 
 if (LLAMA_VULKAN)
diff --git a/common/common.cpp b/common/common.cpp
index 243b88abf1aab..aa854677e1420 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -629,6 +629,30 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         sparams.penalty_present = std::stof(argv[i]);
         return true;
     }
+    if (arg == "--dry-multiplier") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        sparams.dry_multiplier = std::stof(argv[i]);
+        return true;
+    }
+    if (arg == "--dry-base") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        sparams.dry_base = std::stof(argv[i]);
+        return true;
+    }
+    if (arg == "--dry-allowed-length") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        sparams.dry_allowed_length = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "--dynatemp-range") {
         if (++i >= argc) {
             invalid_param = true;
@@ -1464,6 +1488,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("    --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
     printf("    --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
     printf("    --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
+    printf("    --dry-multiplier N    DRY sampler multiplier (default: %.1f, 0.0 = disabled)\n", (double)sparams.dry_multiplier);
+    printf("    --dry-base N          DRY sampler base (default: %.1f)\n", (double)sparams.dry_base);
+    printf("    --dry-allowed-length N\n");
+    printf("                          DRY sampler allowed length (default: %d)\n", sparams.dry_allowed_length);
     printf("    --dynatemp-range N    dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
     printf("    --dynatemp-exp N      dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
     printf("    --mirostat N          use Mirostat sampling.\n");
diff --git a/common/sampling.cpp b/common/sampling.cpp
index cc83600d9926e..38e0d0a507a44 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -48,10 +48,10 @@ void llama_sampling_free(struct llama_sampling_context * ctx) {
     delete ctx;
 }
 
-void llama_sampling_reset(llama_sampling_context * ctx) {
+void llama_sampling_reset_grammar(struct llama_sampling_context * ctx) {
     if (ctx->grammar != NULL) {
         llama_grammar_free(ctx->grammar);
-        ctx->grammar = NULL;
+        ctx->grammar = nullptr;
     }
 
     if (!ctx->parsed_grammar.rules.empty()) {
@@ -61,6 +61,10 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
             grammar_rules.data(), grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
     }
+}
+
+void llama_sampling_reset(llama_sampling_context * ctx) {
+    llama_sampling_reset_grammar(ctx);
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
@@ -110,10 +114,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
     snprintf(result, sizeof(result),
             "\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
             "\ttop_k = %d, tfs_z = %.3f, top_p = %.3f, min_p = %.3f, typical_p = %.3f, temp = %.3f\n"
-            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
+            "\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f, dry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d",
             params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present,
             params.top_k, params.tfs_z, params.top_p, params.min_p, params.typical_p, params.temp,
-            params.mirostat, params.mirostat_eta, params.mirostat_tau);
+            params.mirostat, params.mirostat_eta, params.mirostat_tau, params.dry_multiplier, params.dry_base, params.dry_allowed_length);
 
     return std::string(result);
 }
@@ -267,13 +271,19 @@ static llama_token_data_array llama_sampling_prepare_impl(
 
     const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
 
+    // repetition penalties
     const int32_t penalty_last_n  = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n;
     const float   penalty_repeat  = params.penalty_repeat;
     const float   penalty_freq    = params.penalty_freq;
     const float   penalty_present = params.penalty_present;
 
     const bool    penalize_nl     = params.penalize_nl;
 
+    // DRY sampler parameters
+    const float    dry_multiplier     = params.dry_multiplier;
+    const float    dry_base           = params.dry_base;
+    const uint32_t dry_allowed_length = params.dry_allowed_length;
+    const uint32_t dry_penalty_last_n = params.dry_penalty_last_n;
+
     auto & prev = ctx_sampling->prev;
     auto & cur  = ctx_sampling->cur;
@@ -303,26 +313,41 @@ static llama_token_data_array llama_sampling_prepare_impl(
 
     llama_token_data_array cur_p = { cur.data(), cur.size(), false };
 
-    // apply penalties
     const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev;
-    const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
-    if (penalty_tokens_used_size) {
-        const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
-
-        llama_sample_repetition_penalties(ctx_main, &cur_p,
-                penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
-                penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
-
-        if (!penalize_nl) {
-            for (size_t idx = 0; idx < cur_p.size; idx++) {
-                if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
-                    cur_p.data[idx].logit = nl_logit;
-                    break;
+
+    // apply repetition penalties
+    {
+        const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n);
+        if (penalty_tokens_used_size) {
+            const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))];
+
+            // repetition penalties
+            llama_sample_repetition_penalties(ctx_main, &cur_p,
+                    penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                    penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present);
+
+            if (!penalize_nl) {
+                for (size_t idx = 0; idx < cur_p.size; idx++) {
+                    if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) {
+                        cur_p.data[idx].logit = nl_logit;
+                        break;
+                    }
                 }
             }
         }
     }
 
+    // apply DRY penalties
+    {
+        const int penalty_tokens_used_size = std::min(penalty_tokens.size(), (size_t)dry_penalty_last_n);
+        if (penalty_tokens_used_size) {
+            llama_sample_dry(&cur_p,
+                    penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size,
+                    penalty_tokens_used_size, dry_base, dry_multiplier, dry_allowed_length,
+                    params.dry_seq_breakers.data(), params.dry_seq_breakers.size());
+        }
+    }
+
     // apply grammar checks before sampling logic
     if (apply_grammar && ctx_sampling->grammar != NULL) {
         llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar);
@@ -362,3 +387,14 @@ void llama_sampling_accept(
         llama_grammar_accept_token(ctx_main, ctx_sampling->grammar, id);
     }
 }
+
+
+void llama_sampling_rollback(
+        struct llama_sampling_context * ctx_sampling,
+        int rollback_num) {
+    if (rollback_num > (int) ctx_sampling->prev.size()) {
+        rollback_num = ctx_sampling->prev.size();
+    }
+
+    ctx_sampling->prev.erase(ctx_sampling->prev.end() - rollback_num, ctx_sampling->prev.end());
+}
\ No newline at end of file
diff --git a/common/sampling.h b/common/sampling.h
index cf7081e3674f1..8d00178cee5ac 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -41,6 +41,10 @@ typedef struct llama_sampling_params {
     float       mirostat_eta      = 0.10f; // learning rate
     bool        penalize_nl       = false; // consider newlines as a repeatable token
     uint32_t    seed              = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context
+    float       dry_multiplier     = 0.0f;  // 0.0f = disabled, recommended value: 0.8f
+    float       dry_base           = 1.75f;
+    uint32_t    dry_allowed_length = 2;
+    uint32_t    dry_penalty_last_n = -1;    // DRY last n tokens to penalize (0 = disable penalty, -1 = context size)
 
     std::vector<llama_sampler_type> samplers_sequence = {
         llama_sampler_type::TOP_K,
@@ -61,6 +65,7 @@ typedef struct llama_sampling_params {
     std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
 
     std::vector<llama_token> penalty_prompt_tokens;
+    std::vector<llama_token> dry_seq_breakers; // sequence breakers for the DRY sampler
     bool                     use_penalty_prompt_tokens = false;
 } llama_sampling_params;
 
@@ -92,6 +97,9 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
 void llama_sampling_free(struct llama_sampling_context * ctx);
 
+// Reset the sampler grammar without resetting the context
+void llama_sampling_reset_grammar(struct llama_sampling_context * ctx);
+
 // Reset the sampler context
 // - clear prev tokens
 // - reset grammar
@@ -152,3 +160,7 @@ void llama_sampling_accept(
         struct llama_context * ctx_main,
         llama_token id,
         bool apply_grammar);
+
+void llama_sampling_rollback(
+        struct llama_sampling_context * ctx_sampling,
+        int rollback_num);
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index eabbc2db38286..14bc03a1ff301 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -126,6 +126,8 @@ int main(int argc, char ** argv) {
     }
 
     llama_sampling_params & sparams = params.sparams;
+    sparams.dry_multiplier = 0.8f;
+
 #ifndef LOG_DISABLE_LOGS
     log_set_target(log_filename_generator("main", "log"));
     LOG_TEE("Log start\n");
@@ -735,6 +737,7 @@ int main(int argc, char ** argv) {
             for (auto id : embd) {
                 const std::string token_str = llama_token_to_piece(ctx, id);
                 printf("%s", token_str.c_str());
+                //printf("(%d)", id);
 
                 if (embd.size() > 1) {
                     input_tokens.push_back(id);
diff --git a/grammars/chinese.gbnf b/grammars/chinese.gbnf
new file mode 100644
index 0000000000000..e9653f02c3d5f
--- /dev/null
+++ b/grammars/chinese.gbnf
@@ -0,0 +1,4 @@
+root ::= cn-char+ ([ \t\n] cn-char+)*
+cn-char ::= cjk | punctuation
+cjk ::= [一-鿿] | [𠀀-𯿽]
+punctuation ::= [、-〾]
diff --git a/grammars/korean.gbnf b/grammars/korean.gbnf
new file mode 100644
index 0000000000000..87e9a439b6dad
--- /dev/null
+++ b/grammars/korean.gbnf
@@ -0,0 +1,8 @@
+root ::= conversation+
+conversation ::= assistant-line "\nUSER: "
+assistant-line ::= kr-string "\n"
+kr-string ::= kr-char*
+kr-char ::= hangul | punctuation | whitespace
+hangul ::= [가-힣]
+punctuation ::= [、-〾]
+whitespace ::= [ \t]
\ No newline at end of file
diff --git a/grammars/schedule.gbnf b/grammars/schedule.gbnf
new file mode 100644
index 0000000000000..4a04c771a1851
--- /dev/null
+++ b/grammars/schedule.gbnf
@@ -0,0 +1,3 @@
+root ::= record
+record ::= "Event: " string "\n" "Date: " string "\n" "Time: " string "\n"
+string ::= "" [ -~]* ""
\ No newline at end of file
diff --git a/ios.toolchain.cmake b/ios.toolchain.cmake
new file mode 100644
index 0000000000000..10f3676dfb5e8
--- /dev/null
+++ b/ios.toolchain.cmake
@@ -0,0 +1,17 @@
+set(CMAKE_SYSTEM_NAME iOS)
+
+# specify the cross compiler
+set(CMAKE_XCODE_ATTRIBUTE_ONLY_ACTIVE_ARCH NO)
+
+# specify which architectures to build for
+set(CMAKE_OSX_ARCHITECTURES "$(ARCHS_STANDARD)")
+
+# you can also choose to build for a specific device
+# set(CMAKE_OSX_ARCHITECTURES "arm64")
+# or for the simulator
+# set(CMAKE_OSX_ARCHITECTURES "x86_64")
+
+set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos;-iphonesimulator")
+
+# you might also want to set the deployment target
+# set(CMAKE_XCODE_ATTRIBUTE_IPHONEOS_DEPLOYMENT_TARGET "10.0")
diff --git a/llama.cpp b/llama.cpp
index 18b49ec20909e..d473c5dbc3752 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6524,7 +6524,6 @@ static struct ggml_tensor * llm_build_kqv(
                 ggml_row_size(kv.v_l[il]->type, n_embd_head_k),
                 0);
         cb(v, "v", il);
 
-        cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
 
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
@@ -6535,7 +6534,7 @@ static struct ggml_tensor * llm_build_kqv(
     } else {
         struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
         cb(kq, "kq", il);
-
+
         if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
             // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs
             // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847
@@ -13354,6 +13353,96 @@ void llama_sample_min_p(struct llama_context * ctx, llama_token_data_array * can
     }
 }
 
+void llama_sample_dry(llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float dry_base, float dry_multiplier, int dry_allowed_length, const llama_token * dry_seq_breakers, size_t dry_seq_breakers_size) {
+    // skip dry sampler if we don't have a previous token
+    if (last_tokens_size < 1) return;
+
+    // get the last token
+    auto last_token = last_tokens[last_tokens_size - 1];
+
+    // if last token is part of the sequence breakers, skip whole sampler
+    if (std::find(dry_seq_breakers, dry_seq_breakers + dry_seq_breakers_size, last_token) != dry_seq_breakers + dry_seq_breakers_size) {
+        return;
+    }
+
+    // create an unordered map of "next tokens" <-> max match length
+    std::unordered_map<llama_token, size_t> match_lengths;
+
+    // loop through each previous token (exclude the last token)
+    for (size_t i = 0; i < last_tokens_size - 1; ++i) {
+        // skip if the compare token is not the same as the last token
+        if (last_tokens[i] != last_token) {
+            continue;
+        }
+
+        // get the next token (i + 1 is always less than last_tokens_size)
+        auto next_token = last_tokens[i + 1];
+
+        // if next token is part of the sequence breakers, skip
+        if (std::find(dry_seq_breakers, dry_seq_breakers + dry_seq_breakers_size, next_token) != dry_seq_breakers + dry_seq_breakers_size) {
+            continue;
+        }
+
+        // try to extend the match backwards (match length starts at 1 because last token is already matched)
+        size_t match_length = 1;
+
+        // loop through the previous tokens
+        for (;; match_length++) {
+            // if we have reached the start of our last tokens, break
+            if (i < match_length) break;
+
+            // compare token starts at our prev index, going backwards by match length
+            auto compare_token = last_tokens[i - match_length];
+
+            // head token starts at the end of last tokens, going backwards by match length, minus 1 because we start at the last token itself
+            auto head_token = last_tokens[last_tokens_size - 1 - match_length];
+
+            // break out of the match if any tokens don't match
+            if (compare_token != head_token) {
+                break;
+            }
+
+            // if compare token is part of the sequence breakers, break out of the match
+            if (std::find(dry_seq_breakers, dry_seq_breakers + dry_seq_breakers_size, compare_token) != dry_seq_breakers + dry_seq_breakers_size) {
+                break;
+            }
+        }
+
+        // Check if the next token exists in the map
+        auto it = match_lengths.find(next_token);
+
+        if (it == match_lengths.end()) {
+            // Key does not exist, insert the new value
+            match_lengths[next_token] = match_length;
+        } else {
+            // Key exists, update it with the max of the new value or the existing value
+            it->second = std::max(it->second, match_length);
+        }
+    }
+
+    // apply penalties
+    for (const auto& pair : match_lengths) {
+        auto next_token = pair.first;
+        auto match_length = pair.second;
+
+        // if the match length is greater than or equal to our allowed length in config, we apply penalties
+        if (match_length >= (size_t) dry_allowed_length) {
+
+            // find our next token in the candidates->data
+            for (size_t i = 0; i < candidates->size; ++i) {
+                if (candidates->data[i].id == next_token) {
+                    // calculate the penalty
+                    float penalty = dry_multiplier * pow(dry_base, match_length - dry_allowed_length);
+
+                    // apply the dry penalty
+                    candidates->data[i].logit -= penalty;
+                    break;
+                }
+            }
+        }
+    }
+}
+
 void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep) {
     if (z >= 1.0f || candidates->size <= 2) {
         return;
diff --git a/llama.h b/llama.h
index 059d78f115c6d..3b87a523581d9 100644
--- a/llama.h
+++ b/llama.h
@@ -943,6 +943,17 @@ extern "C" {
             float   p,
             size_t  min_keep);
 
+    /// @details DRY sampler as described in: https://github.com/oobabooga/text-generation-webui/pull/5677
+    LLAMA_API void llama_sample_dry(
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens,
+            size_t last_tokens_size,
+            float dry_base,
+            float dry_multiplier,
+            int dry_allowed_length,
+            const llama_token * dry_seq_breakers,
+            size_t dry_seq_breakers_size);
+
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
     LLAMA_API void llama_sample_tail_free(
             struct llama_context * ctx,
diff --git a/prompts/chat-with-layla.txt b/prompts/chat-with-layla.txt
new file mode 100644
index 0000000000000..fc500ed8b4414
--- /dev/null
+++ b/prompts/chat-with-layla.txt
@@ -0,0 +1,5 @@
+Layla is an AI created by Layla Network that is helpful, polite, and to the point. She is here to help User with everyday tasks. Layla's favourite animal is the butterfly because it represents transformation, growth, and beauty.
+
+Conversation between User and an Assistant named "Layla":
+
+User:
\ No newline at end of file
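
Usage note (illustrative; not part of the patch). With the flags added in common/common.cpp, the DRY sampler can be enabled from the command line, for example (the model path is a placeholder):

    ./main -m model.gguf --dry-multiplier 0.8 --dry-base 1.75 --dry-allowed-length 2

A minimal sketch of calling the new llama.h entry point directly, assuming a candidate array and a buffer of recent tokens are already populated; the helper name and the two vectors are hypothetical, and the values mirror the defaults in common/sampling.h plus its recommended multiplier of 0.8:

    #include <vector>
    #include "llama.h"

    // Penalize candidates that would extend a token sequence already present
    // in recent_tokens, using the DRY sampler introduced by this patch.
    static void apply_dry(llama_token_data_array * candidates,
                          const std::vector<llama_token> & recent_tokens,
                          const std::vector<llama_token> & seq_breakers) {
        llama_sample_dry(candidates,
                recent_tokens.data(), recent_tokens.size(),
                /* dry_base           */ 1.75f,
                /* dry_multiplier     */ 0.8f,
                /* dry_allowed_length */ 2,
                seq_breakers.data(), seq_breakers.size());
    }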