From 06ce4783c4d50e209544e0459dd3ff798eb81ac3 Mon Sep 17 00:00:00 2001 From: Andrula Song Date: Thu, 25 Jan 2024 19:19:30 +0800 Subject: [PATCH] Audio: Mixin_mixout: Add HiFi5 implementation. Add HiFi5 implementation of mix functions, compared with HiFi3 version, can reduce about 27% cycles. Signed-off-by: Andrula Song --- src/audio/mixin_mixout/CMakeLists.txt | 2 +- src/audio/mixin_mixout/Kconfig | 37 +++ src/audio/mixin_mixout/mixin_mixout.h | 12 - src/audio/mixin_mixout/mixin_mixout_generic.c | 2 +- src/audio/mixin_mixout/mixin_mixout_hifi3.c | 2 +- src/audio/mixin_mixout/mixin_mixout_hifi5.c | 285 ++++++++++++++++++ zephyr/CMakeLists.txt | 1 + 7 files changed, 326 insertions(+), 15 deletions(-) create mode 100644 src/audio/mixin_mixout/mixin_mixout_hifi5.c diff --git a/src/audio/mixin_mixout/CMakeLists.txt b/src/audio/mixin_mixout/CMakeLists.txt index b1723cd114ae..2e6dc7456ae5 100644 --- a/src/audio/mixin_mixout/CMakeLists.txt +++ b/src/audio/mixin_mixout/CMakeLists.txt @@ -1 +1 @@ -add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c) +add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c mixin_mixout_hifi5.c) diff --git a/src/audio/mixin_mixout/Kconfig b/src/audio/mixin_mixout/Kconfig index 449d6617fce7..2315023d61d1 100644 --- a/src/audio/mixin_mixout/Kconfig +++ b/src/audio/mixin_mixout/Kconfig @@ -6,3 +6,40 @@ config COMP_MIXIN_MIXOUT default y help Select for Mixin_mixout component + +choice "MIXIN_MIXOUT_SIMD_LEVEL_SELECT" + prompt "choose which SIMD level used for MIXIN_MIXOUT module" + depends on COMP_MIXIN_MIXOUT + default MIXIN_MIXOUT_HIFI_MAX + + config MIXIN_MIXOUT_HIFI_MAX + prompt "Max HiFi level available in the toolchain" + bool + help + When this was selected, optimization level will be determined + by toolchain. 
+ + config MIXIN_MIXOUT_HIFI_5 + prompt "choose HIFI5 intrinsic optimized MIXIN_MIXOUT module" + bool + help + This option used to build HIFI5 optimized MIXIN_MIXOUT code + + config MIXIN_MIXOUT_HIFI_4 + prompt "choose HIFI4 intrinsic optimized MIXIN_MIXOUT module" + bool + help + This option used to build HIFI4 optimized MIXIN_MIXOUT code + + config MIXIN_MIXOUT_HIFI_3 + prompt "choose HIFI3 intrinsic optimized MIXIN_MIXOUT module" + bool + help + This option used to build HIFI3 intrinsic optimized MIXIN_MIXOUT code + + config MIXIN_MIXOUT_HIFI_NONE + prompt "choose generic C MIXIN_MIXOUT module, no HIFI SIMD involved" + bool + help + This option used to build MIXIN_MIXOUT generic code. +endchoice diff --git a/src/audio/mixin_mixout/mixin_mixout.h b/src/audio/mixin_mixout/mixin_mixout.h index 8ac844383917..085187e0e743 100644 --- a/src/audio/mixin_mixout/mixin_mixout.h +++ b/src/audio/mixin_mixout/mixin_mixout.h @@ -31,18 +31,6 @@ #include #include -#define MIXIN_MIXOUT_GENERIC - -#if defined(__XCC__) - -#include -#if XCHAL_HAVE_HIFI3 || XCHAL_HAVE_HIFI4 -#undef MIXIN_MIXOUT_GENERIC -#define MIXIN_MIXOUT_HIFI3 -#endif - -#endif - enum ipc4_mixin_config_param { /* large_config_set param id for ipc4_mixer_mode_config */ IPC4_MIXER_MODE = 1 diff --git a/src/audio/mixin_mixout/mixin_mixout_generic.c b/src/audio/mixin_mixout/mixin_mixout_generic.c index c69305073660..60a9f8211b05 100644 --- a/src/audio/mixin_mixout/mixin_mixout_generic.c +++ b/src/audio/mixin_mixout/mixin_mixout_generic.c @@ -9,7 +9,7 @@ #include "mixin_mixout.h" -#ifdef MIXIN_MIXOUT_GENERIC +#if SOF_USE_HIFI(NONE, MIXIN_MIXOUT) #if CONFIG_FORMAT_S16LE static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples, diff --git a/src/audio/mixin_mixout/mixin_mixout_hifi3.c b/src/audio/mixin_mixout/mixin_mixout_hifi3.c index 369ede78d36a..568cbed5202b 100644 --- a/src/audio/mixin_mixout/mixin_mixout_hifi3.c +++ b/src/audio/mixin_mixout/mixin_mixout_hifi3.c @@ -8,7 +8,7 @@
#include "mixin_mixout.h" -#ifdef MIXIN_MIXOUT_HIFI3 +#if SOF_USE_HIFI(3, MIXIN_MIXOUT) || SOF_USE_HIFI(4, MIXIN_MIXOUT) #if CONFIG_FORMAT_S16LE static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
diff --git a/src/audio/mixin_mixout/mixin_mixout_hifi5.c b/src/audio/mixin_mixout/mixin_mixout_hifi5.c
new file mode 100644
index 000000000000..6d82ec54d50d
--- /dev/null
+++ b/src/audio/mixin_mixout/mixin_mixout_hifi5.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: BSD-3-Clause
+//
+// Copyright(c) 2024 Intel Corporation. All rights reserved.
+//
+// Author: Andrula Song
+
+#include <sof/common.h>
+
+#include "mixin_mixout.h"
+
+#if SOF_USE_HIFI(5, MIXIN_MIXOUT)
+
+#if CONFIG_FORMAT_S16LE
+/*
+ * Mix 16-bit samples from a source circular buffer into a sink circular
+ * buffer, using 128-bit HiFi5 loads and stores (8 samples per iteration).
+ *
+ * sink:          destination buffer; processing starts start_sample samples
+ *                after sink->ptr (wrapped with cir_buf_wrap())
+ * start_sample:  sample offset from sink->ptr at which to start
+ * mixed_samples: sample offset from sink->ptr below which the sink content
+ *                is added to instead of overwritten; asserted >= start_sample
+ * source:        source buffer, read starting at source->ptr
+ * sample_count:  number of samples to process
+ * gain:          not used by this implementation
+ *
+ * The first min(mixed_samples - start_sample, sample_count) samples are
+ * added to the existing sink samples with AE_ADD16S; the remaining samples
+ * are copied from source to sink unchanged.
+ */
+static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
+		    const struct cir_buf_ptr *source,
+		    int32_t sample_count, uint16_t gain)
+{
+	int samples_to_mix, samples_to_copy, left_samples;
+	int n, nmax, i, m, left;
+	ae_int16x4 in_sample, in_sample1;
+	ae_int16x4 out_sample, out_sample1;
+	ae_int16x8 *in;
+	ae_int16x8 *out;
+	ae_valignx2 inu = AE_ZALIGN128();
+	ae_valignx2 outu1 = AE_ZALIGN128();
+	ae_valignx2 outu2 = AE_ZALIGN128();
+	/* cir_buf_wrap() is required and is done below in a loop */
+	ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample;
+	ae_int16 *src = source->ptr;
+
+	assert(mixed_samples >= start_sample);
+	samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
+	samples_to_copy = sample_count - samples_to_mix;
+	n = 0;
+
+	for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
+		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
+		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
+		/* samples available before either buffer reaches its end */
+		nmax = (ae_int16 *)source->buf_end - src;
+		n = AE_MIN32(left_samples, nmax);
+		nmax = (ae_int16 *)sink->buf_end - dst;
+		n = AE_MIN32(n, nmax);
+		in = (ae_int16x8 *)src;
+		out = (ae_int16x8 *)dst;
+		inu = AE_LA128_PP(in);
+		outu1 = AE_LA128_PP(out);
+		m = n >> 3;
+		left = n & 0x07;
+		/* process 8 samples per loop */
+		for (i = 0; i < m; i++) {
+			AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
+			AE_LA16X4X2_IP(out_sample, out_sample1, outu1, out);
+			out--;
+			out_sample = AE_ADD16S(in_sample, out_sample);
+			out_sample1 = AE_ADD16S(in_sample1, out_sample1);
+			AE_SA16X4X2_IP(out_sample, out_sample1, outu2, out);
+		}
+		AE_SA128POS_FP(outu2, out);
+
+		/* process the left samples (fewer than 8)
+		 * one by one to avoid memory access overrun
+		 */
+		for (i = 0; i < left; i++) {
+			AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
+			AE_L16_IP(out_sample, (ae_int16 *)out, 0);
+			out_sample = AE_ADD16S(in_sample, out_sample);
+			AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16));
+		}
+	}
+
+	for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
+		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
+		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
+		/* samples available before either buffer reaches its end */
+		nmax = (ae_int16 *)source->buf_end - src;
+		n = AE_MIN32(left_samples, nmax);
+		nmax = (ae_int16 *)sink->buf_end - dst;
+		n = AE_MIN32(n, nmax);
+		in = (ae_int16x8 *)src;
+		out = (ae_int16x8 *)dst;
+		inu = AE_LA128_PP(in);
+		m = n >> 3;
+		left = n & 0x07;
+		/* process 8 samples per loop */
+		for (i = 0; i < m; i++) {
+			AE_LA16X4X2_IP(in_sample, in_sample1, inu, in);
+			AE_SA16X4X2_IP(in_sample, in_sample1, outu2, out);
+		}
+		AE_SA128POS_FP(outu2, out);
+
+		/* process the left samples (fewer than 8)
+		 * one by one to avoid memory access overrun
+		 */
+		for (i = 0; i < left; i++) {
+			AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16));
+			AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16));
+		}
+	}
+}
+#endif /* CONFIG_FORMAT_S16LE */
+
+#if CONFIG_FORMAT_S24LE
+/*
+ * Mix 24-bit samples held in 32-bit words from a source circular buffer into
+ * a sink circular buffer, using 128-bit HiFi5 accesses (4 samples per loop).
+ *
+ * sink:          destination buffer; processing starts start_sample samples
+ *                after sink->ptr (wrapped with cir_buf_wrap())
+ * start_sample:  sample offset from sink->ptr at which to start
+ * mixed_samples: sample offset from sink->ptr below which the sink content
+ *                is added to instead of overwritten; asserted >= start_sample
+ * source:        source buffer, read starting at source->ptr
+ * sample_count:  number of samples to process
+ * gain:          not used by this implementation
+ *
+ * The first min(mixed_samples - start_sample, sample_count) samples are
+ * added to the existing sink samples with AE_ADD24S; the remaining samples
+ * are copied from source to sink unchanged.
+ */
+static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
+		    const struct cir_buf_ptr *source,
+		    int32_t sample_count, uint16_t gain)
+{
+	int samples_to_mix, samples_to_copy, left_samples;
+	int n, nmax, i, m, left;
+	ae_int32x2 in_sample, in_sample1;
+	ae_int32x2 out_sample, out_sample1;
+	ae_int32x4 *in;
+	ae_int32x4 *out;
+	ae_valignx2 inu = AE_ZALIGN128();
+	ae_valignx2 outu1 = AE_ZALIGN128();
+	ae_valignx2 outu2 = AE_ZALIGN128();
+	/* cir_buf_wrap() is required and is done below in a loop */
+	int32_t *dst = (int32_t *)sink->ptr + start_sample;
+	int32_t *src = source->ptr;
+
+	assert(mixed_samples >= start_sample);
+	samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
+	samples_to_copy = sample_count - samples_to_mix;
+	n = 0;
+
+	for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
+		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
+		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
+		/* samples available before either buffer reaches its end */
+		nmax = (int32_t *)source->buf_end - src;
+		n = AE_MIN32(left_samples, nmax);
+		nmax = (int32_t *)sink->buf_end - dst;
+		n = AE_MIN32(n, nmax);
+		in = (ae_int32x4 *)src;
+		out = (ae_int32x4 *)dst;
+		inu = AE_LA128_PP(in);
+		outu1 = AE_LA128_PP(out);
+		m = n >> 2;
+		left = n & 3;
+		/* process 4 samples per loop */
+		for (i = 0; i < m; i++) {
+			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
+			AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
+			out--;
+			out_sample = AE_ADD24S(in_sample, out_sample);
+			out_sample1 = AE_ADD24S(in_sample1, out_sample1);
+			AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
+		}
+		AE_SA128POS_FP(outu2, out);
+
+		/* process the 1-3 left samples one by one to avoid memory access overrun */
+		for (i = 0; i < left; i++) {
+			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
+			AE_L32_IP(out_sample, (ae_int32 *)out, 0);
+			out_sample = AE_ADD24S(in_sample, out_sample);
+			AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
+		}
+	}
+
+	for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
+		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
+		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
+		nmax = (int32_t *)source->buf_end - src;
+		n = AE_MIN32(left_samples, nmax);
+		nmax = (int32_t *)sink->buf_end - dst;
+		n = AE_MIN32(n, nmax);
+		in = (ae_int32x4 *)src;
+		out = (ae_int32x4 *)dst;
+		inu = AE_LA128_PP(in);
+		m = n >> 2;
+		left = n & 3;
+		for (i = 0; i < m; i++) {
+			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
+			AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
+		}
+		AE_SA128POS_FP(outu2, out);
+		/* process the 1-3 left samples one by one to avoid memory access overrun */
+		for (i = 0; i < left; i++) {
+			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
+			AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
+		}
+	}
+}
+
+#endif /* CONFIG_FORMAT_S24LE */
+
+#if CONFIG_FORMAT_S32LE
+/*
+ * Mix 32-bit samples from a source circular buffer into a sink circular
+ * buffer, using 128-bit HiFi5 loads and stores (4 samples per iteration).
+ *
+ * sink:          destination buffer; processing starts start_sample samples
+ *                after sink->ptr (wrapped with cir_buf_wrap())
+ * start_sample:  sample offset from sink->ptr at which to start
+ * mixed_samples: sample offset from sink->ptr below which the sink content
+ *                is added to instead of overwritten; asserted >= start_sample
+ * source:        source buffer, read starting at source->ptr
+ * sample_count:  number of samples to process
+ * gain:          not used by this implementation
+ *
+ * The first min(mixed_samples - start_sample, sample_count) samples are
+ * added to the existing sink samples with AE_ADD32S; the remaining samples
+ * are copied from source to sink unchanged.
+ */
+static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples,
+		    const struct cir_buf_ptr *source,
+		    int32_t sample_count, uint16_t gain)
+{
+	int samples_to_mix, samples_to_copy, left_samples;
+	int n, nmax, i, m, left;
+	ae_int32x2 in_sample, in_sample1;
+	ae_int32x2 out_sample, out_sample1;
+	ae_int32x4 *in;
+	ae_int32x4 *out;
+	ae_valignx2 inu = AE_ZALIGN128();
+	ae_valignx2 outu1 = AE_ZALIGN128();
+	ae_valignx2 outu2 = AE_ZALIGN128();
+	/* cir_buf_wrap() is required and is done below in a loop */
+	int32_t *dst = (int32_t *)sink->ptr + start_sample;
+	int32_t *src = source->ptr;
+
+	assert(mixed_samples >= start_sample);
+	samples_to_mix = AE_MIN32(mixed_samples - start_sample, sample_count);
+	samples_to_copy = sample_count - samples_to_mix;
+	n = 0;
+
+	for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) {
+		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
+		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
+		/* samples available before either buffer reaches its end */
+		nmax = (int32_t *)source->buf_end - src;
+		n = AE_MIN32(left_samples, nmax);
+		nmax = (int32_t *)sink->buf_end - dst;
+		n = AE_MIN32(n, nmax);
+		in = (ae_int32x4 *)src;
+		out = (ae_int32x4 *)dst;
+		inu = AE_LA128_PP(in);
+		outu1 = AE_LA128_PP(out);
+		m = n >> 2;
+		left = n & 3;
+		for (i = 0; i < m; i++) {
+			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
+			AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out);
+			out--;
+			out_sample = AE_ADD32S(in_sample, out_sample);
+			out_sample1 = AE_ADD32S(in_sample1, out_sample1);
+			AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out);
+		}
+		AE_SA128POS_FP(outu2, out);
+
+		/* process the 1-3 left samples one by one to avoid memory access overrun */
+		for (i = 0; i < left; i++) {
+			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
+			AE_L32_IP(out_sample, (ae_int32 *)out, 0);
+			out_sample = AE_ADD32S(in_sample, out_sample);
+			AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32));
+		}
+	}
+
+	for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) {
+		src = cir_buf_wrap(src + n, source->buf_start, source->buf_end);
+		dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end);
+		/* samples available before either buffer reaches its end */
+		nmax = (int32_t *)source->buf_end - src;
+		n = AE_MIN32(left_samples, nmax);
+		nmax = (int32_t *)sink->buf_end - dst;
+		n = AE_MIN32(n, nmax);
+		in = (ae_int32x4 *)src;
+		out = (ae_int32x4 *)dst;
+		inu = AE_LA128_PP(in);
+		m = n >> 2;
+		left = n & 3;
+		for (i = 0; i < m; i++) {
+			AE_LA32X2X2_IP(in_sample, in_sample1, inu, in);
+			AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out);
+		}
+		AE_SA128POS_FP(outu2, out);
+		/* process the 1-3 left samples one by one to avoid memory access overrun */
+		for (i = 0; i < left; i++) {
+			AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32));
+			AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32));
+		}
+	}
+}
+
+#endif /* CONFIG_FORMAT_S32LE */
+
+const struct mix_func_map mix_func_map[] = {
+#if CONFIG_FORMAT_S16LE
+	{ SOF_IPC_FRAME_S16_LE, mix_s16 },
+#endif
+#if CONFIG_FORMAT_S24LE
+	{ SOF_IPC_FRAME_S24_4LE, mix_s24 },
+#endif
+#if CONFIG_FORMAT_S32LE
+	{ SOF_IPC_FRAME_S32_LE, mix_s32 }
+#endif
+};
+
+const size_t mix_count = ARRAY_SIZE(mix_func_map);
+
+#endif
diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt index c2cf0bcfffb1..5a871a1ab027 100644
--- a/zephyr/CMakeLists.txt +++ b/zephyr/CMakeLists.txt @@ -561,6 +561,7 @@ zephyr_library_sources_ifdef(CONFIG_COMP_MIXIN_MIXOUT ${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout.c ${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_generic.c ${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_hifi3.c + ${SOF_AUDIO_PATH}/mixin_mixout/mixin_mixout_hifi5.c ) zephyr_library_sources_ifdef(CONFIG_COMP_TONE