From 17ff6006f2943b863e110dc326a020592da2d074 Mon Sep 17 00:00:00 2001 From: Radkesvat <134321679+radkesvat@users.noreply.github.com> Date: Sat, 23 Nov 2024 14:34:45 +0000 Subject: [PATCH] tweaks --- core/main.c | 2 +- ww/buffer_pool.c | 2 +- ww/shiftbuffer.c | 8 +++--- ww/shiftbuffer.h | 64 +++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 64 insertions(+), 12 deletions(-) diff --git a/core/main.c b/core/main.c index 4b740dc..7865fbd 100644 --- a/core/main.c +++ b/core/main.c @@ -65,7 +65,7 @@ int main(void) config_file_t *cfile = parseConfigFile(*k.ref); /* - in case of error in config file, the details is already printed out and the + in case of error in config file, the details are already printed out and the program will not reach this line. */ diff --git a/ww/buffer_pool.c b/ww/buffer_pool.c index c4f3f87..837c2db 100644 --- a/ww/buffer_pool.c +++ b/ww/buffer_pool.c @@ -253,7 +253,7 @@ shift_buffer_t *duplicateBufferP(buffer_pool_t *pool, shift_buffer_t *b) bnew = popSmallBuffer(pool); } setLen(bnew, bufLen(b)); - memcpy(rawBufMut(bnew), rawBuf(b), bufLen(b)); + memCopy128(rawBufMut(bnew), rawBuf(b), bufLen(b)); return bnew; } diff --git a/ww/shiftbuffer.c b/ww/shiftbuffer.c index f69158c..7ad0b08 100644 --- a/ww/shiftbuffer.c +++ b/ww/shiftbuffer.c @@ -6,7 +6,7 @@ #include #include -#define LEFTPADDING (RAM_PROFILE >= kRamProfileS2Memory ? (1U << 10) : (1U << 8)) +#define LEFTPADDING ((RAM_PROFILE >= kRamProfileS2Memory ? (1U << 10) : (1U << 8)) - (sizeof(uint32_t) * 3)) #define RIGHTPADDING ((RAM_PROFILE >= kRamProfileS2Memory ? (1U << 9) : (1U << 7))) #define TOTALPADDING ((uint32_t) (sizeof(shift_buffer_t) + (LEFTPADDING + RIGHTPADDING))) @@ -24,7 +24,7 @@ shift_buffer_t *newShiftBuffer(uint32_t pre_cap) // NOLINT } uint32_t real_cap = pre_cap + TOTALPADDING; - shift_buffer_t *self = globalMalloc(real_cap); + shift_buffer_t *self = globalMalloc(real_cap + EXTRA_ALLOC); self->len = 0; self->curpos = LEFTPADDING; @@ -61,7 +61,7 @@ shift_buffer_t *duplicateBuffer(shift_buffer_t *b) uint32_t pre_cap = bufCap(b) - TOTALPADDING; shift_buffer_t *newbuf = newShiftBuffer(pre_cap); setLen(newbuf, bufLen(b)); - memcpy(rawBufMut(newbuf), rawBuf(b), bufLen(b)); + memCopy128(rawBufMut(newbuf), rawBuf(b), bufLen(b)); return newbuf; } @@ -87,7 +87,7 @@ shift_buffer_t *sliceBufferTo(shift_buffer_t *restrict dest, shift_buffer_t *res dest = bigger_buf; } setLen(dest, bytes); - memcpy(rawBufMut(dest), rawBuf(source), bytes); + memCopy128(rawBufMut(dest), rawBuf(source), bytes); shiftr(source, bytes); return dest; diff --git a/ww/shiftbuffer.h b/ww/shiftbuffer.h index fa192e4..3c0b0b2 100644 --- a/ww/shiftbuffer.h +++ b/ww/shiftbuffer.h @@ -18,13 +18,68 @@ */ +#if defined(WW_AVX) && defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) + +#define EXTRA_ALLOC 128 +#define BUF_USES_AVX 1 + +#include +static inline void memCopy128(void *dest, const void *src, long int n) +{ + __m256i *d_vec = (__m256i *) (dest); + const __m256i *s_vec = (const __m256i *) (src); + + if ((uintptr_t) dest % 128 != 0 || (uintptr_t) src % 128 != 0) + { + + while (n > 0) + { + _mm256_storeu_si256(d_vec, _mm256_loadu_si256(s_vec)); + _mm256_storeu_si256(d_vec + 1, _mm256_loadu_si256(s_vec + 1)); + _mm256_storeu_si256(d_vec + 2, _mm256_loadu_si256(s_vec + 2)); + _mm256_storeu_si256(d_vec + 3, _mm256_loadu_si256(s_vec + 3)); + + n -= 128; + d_vec += 4; + s_vec += 4; + } + + return; + } + + while (n > 0) + { + _mm256_store_si256(d_vec, _mm256_load_si256(s_vec)); + _mm256_store_si256(d_vec + 1, _mm256_load_si256(s_vec + 1)); + _mm256_store_si256(d_vec + 2, _mm256_load_si256(s_vec + 2)); + _mm256_store_si256(d_vec + 3, _mm256_load_si256(s_vec + 3)); + + n -= 128; + d_vec += 4; + s_vec += 4; + } +} +#elif + +#define EXTRA_ALLOC 0 +#define BUF_USES_AVX 0 + +static inline void memCopy128(void *__restrict __dest, const void *__restrict __src, size_t __n) +{ + memcpy(__dest, __src, __n); +} + +#endif struct shift_buffer_s { uint32_t len; uint32_t curpos; - uint64_t capacity; - uint8_t buf[]; + uint32_t capacity; +#if BUF_USES_AVX + uint8_t _pad_[EXTRA_ALLOC]; +#endif + uint8_t buf[]; }; typedef struct shift_buffer_s shift_buffer_t; @@ -37,8 +92,6 @@ shift_buffer_t *sliceBufferTo(shift_buffer_t *restrict dest, shift_buffer_t *res shift_buffer_t *sliceBuffer(shift_buffer_t *self, uint32_t bytes); shift_buffer_t *duplicateBuffer(shift_buffer_t *b); - - static inline unsigned int bufCap(shift_buffer_t *const self) { return self->capacity; @@ -167,7 +220,7 @@ static inline shift_buffer_t *reserveBufSpace(shift_buffer_t *const self, const { shift_buffer_t *bigger_buf = newShiftBuffer(bytes); setLen(bigger_buf, bufLen(self)); - memcpy(rawBufMut(bigger_buf), rawBuf(self), bufLen(self)); + memCopy128(rawBufMut(bigger_buf), rawBuf(self), bufLen(self)); destroyShiftBuffer(self); return bigger_buf; } @@ -182,7 +235,6 @@ static inline void concatBufferNoCheck(shift_buffer_t *restrict root, const shif memcpy(rawBufMut(root) + root_length, rawBuf(buf), append_length); } - #ifdef DEBUG // free and re create the buffer so in case of use after free we catch it static shift_buffer_t *debugBufferWontBeReused(shift_buffer_t *b)