Skip to content

Commit

Permalink
[CPU] [ARM] SVE FP16 functions for MHASingleToken kernel (#28182)
Browse files Browse the repository at this point in the history
### Details
In continuation with
#27273, adds SVE FP16
implementations for functions called during execution of MHASingleToken
for SVE-128, SVE-256 and SVE-512 platforms. SVE implementations are
compiled only if runtime support for SVE is detected on the hardware,
otherwise it falls back to Neon.

### Benchmarking results
Below are the benchmarking results of execution time of each ported
function. Measurements were performed by running each function
individually on dummy inputs (128 fp16 elements) for 1,000,000
iterations and computing average time (in micro-seconds).


![image](https://github.com/user-attachments/assets/85efd5d9-da91-4d46-a1c3-82a440d17470)
  • Loading branch information
NishantPrabhuFujitsu authored Jan 2, 2025
1 parent 638f3cb commit 66e1ec7
Show file tree
Hide file tree
Showing 6 changed files with 343 additions and 64 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ set(_ACCEPTED_ARCHS_AVX "^(ANY|SSE42|AVX)$")
set(_ACCEPTED_ARCHS_AVX2 "^(ANY|SSE42|AVX|AVX2)$")
set(_ACCEPTED_ARCHS_AVX512F "^(ANY|SSE42|AVX|AVX2|AVX512F)$")
set(_ACCEPTED_ARCHS_NEON_FP16 "^(ANY|NEON_FP16)$")
set(_ACCEPTED_ARCHS_SVE "^(ANY|SVE)$")
set(_ACCEPTED_ARCHS_SVE "^(ANY|NEON_FP16|SVE)$")

## Arch specific definitions
set(_DEFINE_ANY "")
Expand Down Expand Up @@ -186,10 +186,10 @@ endfunction()
# Return currently requested ARCH id
#
function(_currently_requested_top_arch VAR)
if(ENABLE_NEON_FP16)
set(RES NEON_FP16)
elseif(ENABLE_SVE)
if(ENABLE_SVE)
set(RES SVE)
elseif(ENABLE_NEON_FP16)
set(RES NEON_FP16)
elseif(ENABLE_AVX512F)
set(RES AVX512F)
elseif(ENABLE_AVX2)
Expand Down
13 changes: 11 additions & 2 deletions cmake/developer_package/features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

include(options)
include(target_flags)
include(compile_flags/os_flags)

if(WIN32)
set (CPACK_GENERATOR "ZIP" CACHE STRING "Cpack generator for OpenVINO")
Expand Down Expand Up @@ -49,9 +50,9 @@ ov_dependent_option (ENABLE_AVX2 "Enable AVX2 optimizations" ON "X86_64 OR (X86

ov_dependent_option (ENABLE_AVX512F "Enable AVX512 optimizations" ON "X86_64 OR (X86 AND NOT EMSCRIPTEN)" OFF)

ov_dependent_option(ENABLE_NEON_FP16 "Enable ARM FP16 optimizations" ON "AARCH64" OFF)
ov_dependent_option (ENABLE_NEON_FP16 "Enable ARM FP16 optimizations" ON "AARCH64" OFF)

ov_dependent_option(ENABLE_SVE "Enable SVE optimizations" ON "AARCH64" OFF)
ov_dependent_option (ENABLE_SVE "Enable SVE optimizations" ON "AARCH64" OFF)

# Type of build, we add this as an explicit option to default it to ON
get_property(BUILD_SHARED_LIBS_DEFAULT GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS)
Expand Down Expand Up @@ -106,3 +107,11 @@ if(ENABLE_AVX512F)
set(ENABLE_AVX512F OFF CACHE BOOL "" FORCE)
endif()
endif()

if(ENABLE_SVE)
ov_check_compiler_supports_sve("-march=armv8-a+sve")

if(NOT CXX_HAS_SVE)
set(ENABLE_SVE OFF CACHE BOOL "" FORCE)
endif()
endif()
10 changes: 5 additions & 5 deletions src/plugins/intel_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -283,16 +283,16 @@ target_include_directories(${TARGET_NAME} PRIVATE $<TARGET_PROPERTY:openvino::re
set(SOFTMAX_ARCH_LIST AVX512F AVX2)
set(MHA_SINGLE_TOKEN_ARCH_LIST AVX512F AVX2)

if(ENABLE_NEON_FP16)
list(APPEND SOFTMAX_ARCH_LIST NEON_FP16)
list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST NEON_FP16)
endif()

if(ENABLE_SVE)
list(APPEND SOFTMAX_ARCH_LIST SVE)
list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST SVE)
endif()

if(ENABLE_NEON_FP16)
list(APPEND SOFTMAX_ARCH_LIST NEON_FP16)
list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST NEON_FP16)
endif()

list(APPEND SOFTMAX_ARCH_LIST ANY)
list(APPEND MHA_SINGLE_TOKEN_ARCH_LIST ANY)

Expand Down
32 changes: 31 additions & 1 deletion src/plugins/intel_cpu/src/nodes/kernels/scaled_attn/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,14 @@ static constexpr size_t vec_len_f32_neon = vec_len_neon / sizeof(float);
static constexpr size_t vec_len_f16_neon = vec_len_neon / sizeof(ov::float16);

#if defined(HAVE_SVE)
static constexpr size_t vec_len_f32_sve = svcntw();
inline size_t vec_len_f32_sve() {
static size_t len = svcntw();
return len;
}
inline size_t vec_len_f16_sve() {
static size_t len = svcnth();
return len;
}
#endif

#ifdef HAVE_AVX512F
Expand Down Expand Up @@ -403,6 +410,28 @@ inline void __vst1q_f32(ov::bfloat16* a, float32x4_t b) {
#endif

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
# if defined(HAVE_SVE)
inline svfloat16_t exp_ps_sve_f16(svbool_t& pg, svfloat16_t& src) {
svbool_t pg_f32 = svtrn1_b16(pg, svpfalse());

// Extract lower and upper halves of src into two separate vecs and convert
svfloat16_t zero = svdup_n_f16(0.0);
svfloat16_t low_f16 = svtrn1_f16(src, zero);
svfloat16_t high_f16 = svtrn2_f16(src, zero);
svfloat32_t low_f32 = svcvt_f32_f16_z(pg, low_f16);
svfloat32_t high_f32 = svcvt_f32_f16_z(pg, high_f16);

// Perform exp and convert back to f16
svfloat32_t low_exp_f32 = exp_ps_sve(pg_f32, low_f32);
svfloat32_t high_exp_f32 = exp_ps_sve(pg_f32, high_f32);
svfloat16_t low_exp_f16 = svcvt_f16_f32_z(pg_f32, low_exp_f32);
svfloat16_t high_exp_f16 = svcvt_f16_f32_z(pg_f32, high_exp_f32);

// Interleave both to get final result
svfloat16_t res = svtrn1_f16(low_exp_f16, high_exp_f16);
return res;
}
# else
inline float16x8_t exp_ps_neon_f16(float16x8_t x) {
const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x));
Expand All @@ -411,6 +440,7 @@ inline float16x8_t exp_ps_neon_f16(float16x8_t x) {
const float16x8_t res = vcombine_f16(vcvt_f16_f32(exp_ps_neon_f32(x_low)), vcvt_f16_f32(exp_ps_neon_f32(x_high)));
return res;
}
# endif
inline float16_t hsum(float16x8_t vec) {
float16x4_t sum1 = vpadd_f16(vget_low_f16(vec), vget_high_f16(vec));
float16x4_t sum2 = vpadd_f16(sum1, sum1);
Expand Down
Loading

0 comments on commit 66e1ec7

Please sign in to comment.