From e9b0d1984cb0bc42ca65ba8cf37c56cf135ffb05 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 21 Aug 2024 16:28:07 -0400 Subject: [PATCH] llama : fix Mamba-2 conv state saving * ggml : make the ggml_mul fast broadcast path more consistently formatted --- ggml/src/ggml.c | 4 ++-- src/llama.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f8e708088b3574..415fa6901304a3 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -10226,11 +10226,11 @@ static void ggml_compute_forward_mul_f32( if (scale == 0.0f) { // NOTE: this also sets NANs to zero, which is not compliant with IEEE754, // but it is useful when resetting the state of recurrent models. - memset((char *)dst->data + ir*nb1, 0, nb1); + memset((char *) dst->data + ir*nb1, 0, ne0 * sizeof(float)); } else { if (dst->data != src0->data) { // src0 is same shape as dst => same indices - memcpy((char *)dst->data + ir*nb1, (char *)src0->data + ir*nb01, ne0 * sizeof(float)); + memcpy((char *) dst->data + ir*nb1, (char *) src0->data + ir*nb01, ne0 * sizeof(float)); } if (scale != 1.0f) { ggml_vec_scale_f32(ne0, (float *) ((char *) dst->data + ir*nb1), scale); diff --git a/src/llama.cpp b/src/llama.cpp index fd80361bd76050..03f93164a89e8c 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -9335,7 +9335,7 @@ static struct ggml_tensor * llm_build_mamba2( ggml_cpy(ctx, last_conv, ggml_view_1d(ctx, conv_states_all, (d_conv - 1)*(d_inner + 2*n_group*d_state)*(n_seqs), - kv_head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_states_all)))); + kv_head*(d_conv - 1)*(d_inner + 2*n_group*d_state)*ggml_element_size(conv_states_all)))); // 1D convolution // The equivalent is to make a self-overlapping view of conv_x