diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index a413df35750b1e..cdf55d5a3a84a0 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2479,6 +2479,9 @@ extern "C" {
     GGML_API int ggml_cpu_has_cann      (void);
     GGML_API int ggml_cpu_has_llamafile (void);
 
+    // get the sve vector length in bytes
+    GGML_API int ggml_cpu_get_sve_cnt(void);
+
     //
     // Internal types and functions exposed for tests and benchmarks
     //
diff --git a/ggml/src/ggml-aarch64.c b/ggml/src/ggml-aarch64.c
index 72cb83c9bb0c68..b3a0ef37f7cb1b 100644
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
@@ -546,8 +546,8 @@ void ggml_gemv_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -658,8 +658,8 @@ void ggml_gemv_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -776,7 +776,7 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -842,12 +842,12 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
@@ -997,8 +997,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1518,8 +1518,8 @@ void ggml_gemm_q4_0_4x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8)
-    if (ggml_sve_cnt_b == QK8_0) {
-        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
+        GGML_ASSERT(!(ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE defined, use the Q4_0_8_8 quantization format for optimal performance");
     }
 #endif
@@ -1980,7 +1980,7 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
     UNUSED(blocklen);
 
 #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_MATMUL_INT8) && ! ((defined(_MSC_VER)) && ! defined(__clang__))
-    if (ggml_sve_cnt_b == QK8_0) {
+    if (ggml_cpu_get_sve_cnt() == QK8_0) {
         const void * b_ptr = vx;
         const void * a_ptr = vy;
         float * res_ptr = s;
@@ -2391,12 +2391,12 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
         return;
     }
     else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
-        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) &&
+        GGML_ASSERT((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits not defined, use the Q4_0_4_8 quantization format for optimal "
                     "performance");
     }
     else if (ggml_cpu_has_neon()) {
-        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_sve_cnt_b == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
+        GGML_ASSERT(((ggml_cpu_has_sve() && (ggml_cpu_get_sve_cnt() == QK8_0)) || ggml_cpu_has_matmul_int8()) &&
                     "__ARM_FEATURE_SVE for vector size of 256-bits and __ARM_FEATURE_MATMUL_INT8 not defined, use the Q4_0_4_4 "
                     "quantization format for optimal performance");
     }
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7c1ec8d54a7763..05d7cb9869743a 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4012,7 +4012,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-        const int vector_length = ggml_sve_cnt_b*8;
+        const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
         // VLA Implementation using switch case
         switch (vector_length) {
@@ -5596,7 +5596,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
-        const int vector_length = ggml_sve_cnt_b*8;
+        const int vector_length = ggml_cpu_get_sve_cnt()*8;
 
         //VLA Implemenation for SVE
         switch (vector_length) {
diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h
index e96ce2b5e5c4e7..df9c4b24ae74fa 100644
--- a/ggml/src/ggml-quants.h
+++ b/ggml/src/ggml-quants.h
@@ -142,10 +142,6 @@ void iq2xs_free_impl(enum ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
-#if defined(__ARM_FEATURE_SVE)
-extern int ggml_sve_cnt_b;
-#endif
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 490c8d602853bd..51c9e4c2aaa190 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -37,9 +37,15 @@
 #include <unistd.h>
 #endif
 
-#if defined(__ARM_FEATURE_SVE)
-int ggml_sve_cnt_b = 0;
+#if defined(__aarch64__)
+struct ggml_aarch64_features_type {
+    int has_neon;
+    int has_i8mm;
+    int has_sve;
+    int sve_cnt;
+} ggml_aarch64_features = {-1, -1, -1, 0};
 #endif
+
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef GGML_USE_LLAMAFILE
 #endif
@@ -3643,6 +3649,65 @@ static inline int ggml_up(int n, int m) {
 
 ////////////////////////////////////////////////////////////////////////////////
 
+#if defined(__aarch64__)
+
+#if defined(__linux__)
+#include <sys/auxv.h>
+#elif defined(__APPLE__)
+#include <sys/sysctl.h>
+#endif
+
+static void ggml_init_aarch64_features(void) {
+#if defined(__linux__)
+    uint32_t hwcap = getauxval(AT_HWCAP);
+    uint32_t hwcap2 = getauxval(AT_HWCAP2);
+
+    ggml_aarch64_features.has_neon = !!(hwcap & HWCAP_ASIMD);
+    ggml_aarch64_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
+    ggml_aarch64_features.has_sve = !!(hwcap & HWCAP_SVE);
+#if defined(__ARM_FEATURE_SVE)
+    ggml_aarch64_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+#endif
+#elif defined(__APPLE__)
+    int oldp = 0;
+    size_t size = sizeof(oldp);
+    if (sysctlbyname("hw.optional.AdvSIMD", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_aarch64_features.has_neon = oldp;
+
+    if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
+        oldp = 0;
+    }
+    ggml_aarch64_features.has_i8mm = oldp;
+
+    ggml_aarch64_features.has_sve = 0;
+    ggml_aarch64_features.sve_cnt = 0;
+#else
+// Run-time CPU feature detection not implemented for this platform, fallback to compile time
+#if defined(__ARM_NEON)
+    ggml_aarch64_features.has_neon = 1;
+#else
+    ggml_aarch64_features.has_neon = 0;
+#endif
+
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+    ggml_aarch64_features.has_i8mm = 1;
+#else
+    ggml_aarch64_features.has_i8mm = 0;
+#endif
+
+#if defined(__ARM_FEATURE_SVE)
+    ggml_aarch64_features.has_sve = 1;
+    ggml_aarch64_features.sve_cnt = 16;
+#else
+    ggml_aarch64_features.has_sve = 0;
+    ggml_aarch64_features.sve_cnt = 0;
+#endif
+#endif
+}
+#endif
+
 struct ggml_context * ggml_init(struct ggml_init_params params) {
     // make this function thread safe
     ggml_critical_section_start();
@@ -3693,6 +3758,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
             GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
         }
 
+#if defined(__aarch64__)
+        ggml_init_aarch64_features();
+#endif
+
         is_first_call = false;
     }
@@ -3741,12 +3810,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
     GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
-#if defined(__ARM_FEATURE_SVE)
-    if (!ggml_sve_cnt_b) {
-        ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
-    }
-#endif
-
     GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     ggml_critical_section_end();
@@ -23265,16 +23328,16 @@ int ggml_cpu_has_fma(void) {
 }
 
 int ggml_cpu_has_neon(void) {
-#if defined(__ARM_NEON)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_neon;
 #else
     return 0;
 #endif
 }
 
 int ggml_cpu_has_sve(void) {
-#if defined(__ARM_FEATURE_SVE)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_sve;
 #else
     return 0;
 #endif
 }
@@ -23421,11 +23484,18 @@ int ggml_cpu_has_vsx(void) {
 }
 
 int ggml_cpu_has_matmul_int8(void) {
-#if defined(__ARM_FEATURE_MATMUL_INT8)
-    return 1;
+#if defined(__aarch64__)
+    return ggml_aarch64_features.has_i8mm;
 #else
     return 0;
 #endif
 }
 
+int ggml_cpu_get_sve_cnt(void) {
+#if defined(__aarch64__)
+    return ggml_aarch64_features.sve_cnt;
+#else
+    return 0;
+#endif
+}
 ////////////////////////////////////////////////////////////////////////////////
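
To make the new API concrete, here is a minimal standalone sketch (not part of the patch) of how a caller could combine `ggml_cpu_has_sve()`, `ggml_cpu_get_sve_cnt()` and the other detectors to pick a Q4_0 repacking variant, mirroring the `GGML_ASSERT` hints in ggml-aarch64.c above. The helper `pick_q4_0_variant()` is hypothetical, and `QK8_0` (32) is restated locally because it lives in ggml-common.h. Note that with this patch the detectors only return meaningful values after the first `ggml_init()` call, since that is where `ggml_init_aarch64_features()` runs.

```c
#include <stdio.h>
#include "ggml.h"

// QK8_0 is defined as 32 in ggml-common.h; restated here so the example
// is self-contained. An SVE vector length of 32 bytes == 256 bits.
#define EXAMPLE_QK8_0 32

// Hypothetical helper: same selection order as the assertions above.
static const char * pick_q4_0_variant(void) {
    if (ggml_cpu_has_sve() && ggml_cpu_get_sve_cnt() == EXAMPLE_QK8_0) {
        return "Q4_0_8_8"; // 256-bit SVE path
    }
    if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
        return "Q4_0_4_8"; // NEON + i8mm path
    }
    if (ggml_cpu_has_neon()) {
        return "Q4_0_4_4"; // plain NEON path
    }
    return "Q4_0";         // generic fallback
}

int main(void) {
    // Run-time detection happens on the first ggml_init() call in this
    // patch, so create (and later free) a small context first.
    struct ggml_init_params params = { 16*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    // ggml_cpu_get_sve_cnt() reports the SVE vector length in bytes
    // (0 when SVE is unavailable); *8 gives bits, as in ggml-quants.c.
    printf("SVE vector length: %d bits\n", ggml_cpu_get_sve_cnt() * 8);
    printf("suggested Q4_0 format: %s\n", pick_q4_0_variant());

    ggml_free(ctx);
    return 0;
}
```

One consequence of moving the detection to run time: binaries built without `__ARM_FEATURE_SVE` can now report SVE presence via `ggml_cpu_has_sve()`, while `ggml_cpu_get_sve_cnt()` still depends on the compile-time guard around the `prctl()` call on Linux.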