-
Notifications
You must be signed in to change notification settings - Fork 321
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Audio: Mixin_mixout: Add HiFi5 implementation.
Add HiFi5 implementation of mix functions, compared with HiFi3 version, can reduce about 27% cycles. Signed-off-by: Andrula Song <[email protected]>
- Loading branch information
1 parent
ff9343a
commit 61dfc3c
Showing
4 changed files
with
296 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c) | ||
add_local_sources(sof mixin_mixout.c mixin_mixout_generic.c mixin_mixout_hifi3.c mixin_mixout_hifi5.c) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,285 @@ | ||
// SPDX-License-Identifier: BSD-3-Clause | ||
// | ||
// Copyright(c) 2024 Intel Corporation. All rights reserved. | ||
// | ||
// Author: Andrula Song <[email protected]> | ||
|
||
#include <sof/common.h> | ||
|
||
#include "mixin_mixout.h" | ||
|
||
#ifdef MIXIN_MIXOUT_HIFI5 | ||
|
||
#if CONFIG_FORMAT_S16LE | ||
static void mix_s16(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples, | ||
const struct cir_buf_ptr *source, | ||
int32_t sample_count, uint16_t gain) | ||
{ | ||
int samples_to_mix, samples_to_copy, left_samples; | ||
int n, nmax, i, m, left; | ||
ae_int16x4 in_sample, in_sample1; | ||
ae_int16x4 out_sample, out_sample1; | ||
ae_int16x8 *in; | ||
ae_int16x8 *out; | ||
ae_valignx2 inu = AE_ZALIGN128(); | ||
ae_valignx2 outu1 = AE_ZALIGN128(); | ||
ae_valignx2 outu2 = AE_ZALIGN128(); | ||
/* cir_buf_wrap() is required and is done below in a loop */ | ||
ae_int16 *dst = (ae_int16 *)sink->ptr + start_sample; | ||
ae_int16 *src = source->ptr; | ||
|
||
assert(mixed_samples >= start_sample); | ||
samples_to_mix = AE_MIN_32_signed(mixed_samples - start_sample, sample_count); | ||
samples_to_copy = sample_count - samples_to_mix; | ||
n = 0; | ||
|
||
for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) { | ||
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); | ||
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); | ||
/* calculate the remaining samples*/ | ||
nmax = (ae_int16 *)source->buf_end - src; | ||
n = AE_MIN_32_signed(left_samples, nmax); | ||
nmax = (ae_int16 *)sink->buf_end - dst; | ||
n = AE_MIN_32_signed(n, nmax); | ||
in = (ae_int16x8 *)src; | ||
out = (ae_int16x8 *)dst; | ||
inu = AE_LA128_PP(in); | ||
outu1 = AE_LA128_PP(out); | ||
m = n >> 3; | ||
left = n & 0x07; | ||
/* process 8 samples per loop */ | ||
for (i = 0; i < m; i++) { | ||
AE_LA16X4X2_IP(in_sample, in_sample1, inu, in); | ||
AE_LA16X4X2_IP(out_sample, out_sample1, outu1, out); | ||
out--; | ||
out_sample = AE_ADD16S(in_sample, out_sample); | ||
out_sample1 = AE_ADD16S(in_sample1, out_sample1); | ||
AE_SA16X4X2_IP(out_sample, out_sample1, outu2, out); | ||
} | ||
AE_SA128POS_FP(outu2, out); | ||
|
||
/* process the left samples that less than 8 | ||
* one by one to avoid memory access overrun | ||
*/ | ||
for (i = 0; i < left ; i++) { | ||
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16)); | ||
AE_L16_IP(out_sample, (ae_int16 *)out, 0); | ||
out_sample = AE_ADD16S(in_sample, out_sample); | ||
AE_S16_0_IP(out_sample, (ae_int16 *)out, sizeof(ae_int16)); | ||
} | ||
} | ||
|
||
for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) { | ||
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); | ||
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); | ||
/* calculate the remaining samples*/ | ||
nmax = (ae_int16 *)source->buf_end - src; | ||
n = AE_MIN_32_signed(left_samples, nmax); | ||
nmax = (ae_int16 *)sink->buf_end - dst; | ||
n = AE_MIN_32_signed(n, nmax); | ||
in = (ae_int16x8 *)src; | ||
out = (ae_int16x8 *)dst; | ||
inu = AE_LA128_PP(in); | ||
m = n >> 3; | ||
left = n & 0x07; | ||
/* process 8 frames per loop */ | ||
for (i = 0; i < m; i++) { | ||
AE_LA16X4X2_IP(in_sample, in_sample1, inu, in); | ||
AE_SA16X4X2_IP(in_sample, in_sample1, outu2, out); | ||
} | ||
AE_SA128POS_FP(outu2, out); | ||
|
||
/* process the left samples that less than 8 | ||
* one by one to avoid memory access overrun | ||
*/ | ||
for (i = 0; i < left ; i++) { | ||
AE_L16_IP(in_sample, (ae_int16 *)in, sizeof(ae_int16)); | ||
AE_S16_0_IP(in_sample, (ae_int16 *)out, sizeof(ae_int16)); | ||
} | ||
} | ||
} | ||
#endif /* CONFIG_FORMAT_S16LE */ | ||
|
||
#if CONFIG_FORMAT_S24LE | ||
static void mix_s24(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples, | ||
const struct cir_buf_ptr *source, | ||
int32_t sample_count, uint16_t gain) | ||
{ | ||
int samples_to_mix, samples_to_copy, left_samples; | ||
int n, nmax, i, m, left; | ||
ae_int32x2 in_sample, in_sample1; | ||
ae_int32x2 out_sample, out_sample1; | ||
ae_int32x4 *in; | ||
ae_int32x4 *out; | ||
ae_valignx2 inu = AE_ZALIGN128(); | ||
ae_valignx2 outu1 = AE_ZALIGN128(); | ||
ae_valignx2 outu2 = AE_ZALIGN128(); | ||
/* cir_buf_wrap() is required and is done below in a loop */ | ||
int32_t *dst = (int32_t *)sink->ptr + start_sample; | ||
int32_t *src = source->ptr; | ||
|
||
assert(mixed_samples >= start_sample); | ||
samples_to_mix = AE_MIN_32_signed(mixed_samples - start_sample, sample_count); | ||
samples_to_copy = sample_count - samples_to_mix; | ||
n = 0; | ||
|
||
for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) { | ||
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); | ||
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); | ||
/* calculate the remaining samples*/ | ||
nmax = (int32_t *)source->buf_end - src; | ||
n = AE_MIN_32_signed(left_samples, nmax); | ||
nmax = (int32_t *)sink->buf_end - dst; | ||
n = AE_MIN_32_signed(n, nmax); | ||
in = (ae_int32x4 *)src; | ||
out = (ae_int32x4 *)dst; | ||
inu = AE_LA128_PP(in); | ||
outu1 = AE_LA128_PP(out); | ||
m = n >> 2; | ||
left = n & 3; | ||
/* process 2 samples per time */ | ||
for (i = 0; i < m; i++) { | ||
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); | ||
AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out); | ||
out--; | ||
out_sample = AE_ADD24S(in_sample, out_sample); | ||
out_sample1 = AE_ADD24S(in_sample1, out_sample1); | ||
AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out); | ||
} | ||
AE_SA128POS_FP(outu2, out); | ||
|
||
/* process the left sample to avoid memory access overrun */ | ||
if (left) { | ||
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); | ||
AE_L32_IP(out_sample, (ae_int32 *)out, 0); | ||
out_sample = AE_ADD24S(in_sample, out_sample); | ||
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32)); | ||
} | ||
} | ||
|
||
for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) { | ||
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); | ||
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); | ||
nmax = (int32_t *)source->buf_end - src; | ||
n = AE_MIN_32_signed(left_samples, nmax); | ||
nmax = (int32_t *)sink->buf_end - dst; | ||
n = AE_MIN_32_signed(n, nmax); | ||
in = (ae_int32x4 *)src; | ||
out = (ae_int32x4 *)dst; | ||
inu = AE_LA128_PP(in); | ||
m = n >> 2; | ||
left = n & 3; | ||
for (i = 0; i < m; i++) { | ||
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); | ||
AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out); | ||
} | ||
AE_SA128POS_FP(outu2, out); | ||
/* process the left sample to avoid memory access overrun */ | ||
if (left) { | ||
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); | ||
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32)); | ||
} | ||
} | ||
} | ||
|
||
#endif /* CONFIG_FORMAT_S24LE */ | ||
|
||
#if CONFIG_FORMAT_S32LE | ||
static void mix_s32(struct cir_buf_ptr *sink, int32_t start_sample, int32_t mixed_samples, | ||
const struct cir_buf_ptr *source, | ||
int32_t sample_count, uint16_t gain) | ||
{ | ||
int samples_to_mix, samples_to_copy, left_samples; | ||
int n, nmax, i, m, left; | ||
ae_int32x2 in_sample, in_sample1; | ||
ae_int32x2 out_sample, out_sample1; | ||
ae_int32x4 *in; | ||
ae_int32x4 *out; | ||
ae_valignx2 inu = AE_ZALIGN128(); | ||
ae_valignx2 outu1 = AE_ZALIGN128(); | ||
ae_valignx2 outu2 = AE_ZALIGN128(); | ||
/* cir_buf_wrap() is required and is done below in a loop */ | ||
int32_t *dst = (int32_t *)sink->ptr + start_sample; | ||
int32_t *src = source->ptr; | ||
|
||
assert(mixed_samples >= start_sample); | ||
samples_to_mix = AE_MIN_32_signed(mixed_samples - start_sample, sample_count); | ||
samples_to_copy = sample_count - samples_to_mix; | ||
n = 0; | ||
|
||
for (left_samples = samples_to_mix; left_samples > 0; left_samples -= n) { | ||
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); | ||
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); | ||
/* calculate the remaining samples*/ | ||
nmax = (int32_t *)source->buf_end - src; | ||
n = AE_MIN_32_signed(left_samples, nmax); | ||
nmax = (int32_t *)sink->buf_end - dst; | ||
n = AE_MIN_32_signed(n, nmax); | ||
in = (ae_int32x4 *)src; | ||
out = (ae_int32x4 *)dst; | ||
inu = AE_LA128_PP(in); | ||
outu1 = AE_LA128_PP(out); | ||
m = n >> 2; | ||
left = n & 3; | ||
for (i = 0; i < m; i++) { | ||
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); | ||
AE_LA32X2X2_IP(out_sample, out_sample1, outu1, out); | ||
out--; | ||
out_sample = AE_ADD32S(in_sample, out_sample); | ||
out_sample1 = AE_ADD32S(in_sample1, out_sample1); | ||
AE_SA32X2X2_IP(out_sample, out_sample1, outu2, out); | ||
} | ||
AE_SA128POS_FP(outu2, out); | ||
|
||
/* process the left sample to avoid memory access overrun */ | ||
if (left) { | ||
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); | ||
AE_L32_IP(out_sample, (ae_int32 *)out, 0); | ||
out_sample = AE_ADD32S(in_sample, out_sample); | ||
AE_S32_L_IP(out_sample, (ae_int32 *)out, sizeof(ae_int32)); | ||
} | ||
} | ||
|
||
for (left_samples = samples_to_copy; left_samples > 0; left_samples -= n) { | ||
src = cir_buf_wrap(src + n, source->buf_start, source->buf_end); | ||
dst = cir_buf_wrap(dst + n, sink->buf_start, sink->buf_end); | ||
/* calculate the remaining samples*/ | ||
nmax = (int32_t *)source->buf_end - src; | ||
n = AE_MIN_32_signed(left_samples, nmax); | ||
nmax = (int32_t *)sink->buf_end - dst; | ||
n = AE_MIN_32_signed(n, nmax); | ||
in = (ae_int32x4 *)src; | ||
out = (ae_int32x4 *)dst; | ||
inu = AE_LA128_PP(in); | ||
m = n >> 2; | ||
left = n & 3; | ||
for (i = 0; i < m; i++) { | ||
AE_LA32X2X2_IP(in_sample, in_sample1, inu, in); | ||
AE_SA32X2X2_IP(in_sample, in_sample1, outu2, out); | ||
} | ||
AE_SA128POS_FP(outu2, out); | ||
/* process the left sample to avoid memory access overrun */ | ||
if (left) { | ||
AE_L32_IP(in_sample, (ae_int32 *)in, sizeof(ae_int32)); | ||
AE_S32_L_IP(in_sample, (ae_int32 *)out, sizeof(ae_int32)); | ||
} | ||
} | ||
} | ||
|
||
#endif /* CONFIG_FORMAT_S32LE */ | ||
|
||
const struct mix_func_map mix_func_map[] = { | ||
#if CONFIG_FORMAT_S16LE | ||
{ SOF_IPC_FRAME_S16_LE, mix_s16 }, | ||
#endif | ||
#if CONFIG_FORMAT_S24LE | ||
{ SOF_IPC_FRAME_S24_4LE, mix_s24 }, | ||
#endif | ||
#if CONFIG_FORMAT_S32LE | ||
{ SOF_IPC_FRAME_S32_LE, mix_s32 } | ||
#endif | ||
}; | ||
|
||
const size_t mix_count = ARRAY_SIZE(mix_func_map); | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters