From b26d5e0c63216e7050783b97abde2c8668436786 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 12 Dec 2023 18:16:13 +0200 Subject: [PATCH 1/4] WIP: remove the use of macros for critical loops, easier to debug, same performance --- src/fdr/fdr.c | 191 +++++++++++++++++--------------------------------- 1 file changed, 64 insertions(+), 127 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 561e8f986..d67e27199 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -82,44 +82,6 @@ struct zone { const u8 *floodPtr; }; -static -const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { - { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, - { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } -}; - /* generates an initial state mask based on the last byte-ish of history rather * than being all accepting. 
If there is no history to consider, the state is * generated based on the minimum length of each bucket in order to prevent @@ -141,13 +103,13 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } + static really_inline void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + UNUSED const u8 *end_ptr, u32 domain_mask, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a domain_mask = ~domain_mask_flipped; u64a it_hi = *(const u64a *)itPtr; u64a it_lo = *(const u64a *)(itPtr + 8); @@ -212,25 +174,25 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, static really_inline void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + UNUSED const u8 *end_ptr, u32 domain_mask, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach2 = andn(domain_mask_flipped, itPtr + 2); - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + u64a it_hi = *(const u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach2 = domain_mask & (it_hi >> 16); + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach6 = domain_mask & (it_hi >> 48); + u64a reach8 = domain_mask & it_lo; + u64a reach10 = domain_mask & (it_lo >> 16); + u64a reach12 = domain_mask & (it_lo >> 32); + u64a reach14 = domain_mask & (it_lo >> 48); m128 st0 = load_m128_from_u64a(ft + reach0); m128 st2 = load_m128_from_u64a(ft + reach2); m128 st4 = load_m128_from_u64a(ft + reach4); m128 st6 = load_m128_from_u64a(ft + reach6); - - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach10 = andn(domain_mask_flipped, itPtr + 10); - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); - u64a reach14 = andn(domain_mask_flipped, itPtr + 14); - m128 st8 = load_m128_from_u64a(ft + reach8); m128 st10 = load_m128_from_u64a(ft + reach10); m128 st12 = load_m128_from_u64a(ft + reach12); @@ -239,6 +201,9 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, st2 = lshiftbyte_m128(st2, 2); st4 = lshiftbyte_m128(st4, 4); st6 = lshiftbyte_m128(st6, 6); + st10 = lshiftbyte_m128(st10, 2); + st12 = lshiftbyte_m128(st12, 4); + st14 = lshiftbyte_m128(st14, 6); *s = or128(*s, st0); *s = or128(*s, st2); @@ -249,10 +214,6 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; - st10 = lshiftbyte_m128(st10, 2); - st12 = lshiftbyte_m128(st12, 4); - st14 = lshiftbyte_m128(st14, 6); - *s = or128(*s, st8); *s = or128(*s, st10); *s = or128(*s, st12); @@ -265,14 +226,16 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, static really_inline void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + UNUSED const u8 *end_ptr, u32 domain_mask, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a it_hi = *(const 
u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach8 = domain_mask & it_lo; + u64a reach12 = domain_mask & (it_lo >> 32); m128 st0 = load_m128_from_u64a(ft + reach0); m128 st4 = load_m128_from_u64a(ft + reach4); @@ -660,41 +623,6 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, #define INVALID_MATCH_ID (~0U) -#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ - do { \ - const u8 *tryFloodDetect = zz->floodPtr; \ - const u8 *start_ptr = zz->start; \ - const u8 *end_ptr = zz->end; \ - for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \ - itPtr += 4*ITER_BYTES) { \ - __builtin_prefetch(itPtr); \ - } \ - \ - for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ - itPtr += ITER_BYTES) { \ - if (unlikely(itPtr > tryFloodDetect)) { \ - tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ - &floodBackoff, &control, \ - ITER_BYTES); \ - if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - return HWLM_TERMINATED; \ - } \ - } \ - __builtin_prefetch(itPtr + ITER_BYTES); \ - u64a conf0; \ - u64a conf8; \ - get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ - ft, &conf0, &conf8, &s); \ - do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ - &last_match_id, zz); \ - do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ - &last_match_id, zz); \ - if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - return HWLM_TERMINATED; \ - } \ - } /* end for loop */ \ - } while (0) \ - static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, const struct FDR_Runtime_Args *a, @@ -703,7 +631,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; - u32 domain_mask_flipped = ~fdr->domainMask; + u32 domain_mask = fdr->domainMask; u8 stride = fdr->stride; const u64a *ft = (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); @@ -722,42 +650,51 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, for (size_t curZone = 0; curZone < numZone; curZone++) { struct zone *z = &zones[curZone]; - dumpZoneInfo(z, curZone); - - /* When a zone contains less data than is processed in an iteration - * of FDR_MAIN_LOOP(), we need to scan over some extra data. - * - * We have chosen to scan this extra data at the start of the - * iteration. The extra data is either data we have already scanned or - * garbage (if it is earlier than offset 0), - * - * As a result we need to shift the incoming state back so that it will - * properly line up with the data being scanned. - * - * We also need to forbid reporting any matches in the data being - * rescanned as they have already been reported (or are over garbage but - * later stages should also provide that safety guarantee). 
- */ u8 shift = z->shift; - state = variable_byte_shift_m128(state, shift); + state = or128(state, variable_byte_shift_m128(ones128(), shift-16)); - state = or128(state, load128(zone_or_mask[shift])); - - switch (stride) { - case 1: - FDR_MAIN_LOOP(z, state, get_conf_stride_1); - break; - case 2: - FDR_MAIN_LOOP(z, state, get_conf_stride_2); - break; - case 4: - FDR_MAIN_LOOP(z, state, get_conf_stride_4); - break; - default: - break; + const u8 *tryFloodDetect = z->floodPtr; + const u8 *start_ptr = z->start; + const u8 *end_ptr = z->end; + for (const u8 *itPtr = ROUNDDOWN_PTR(z->start, 64); itPtr + 4*ITER_BYTES <= z->end; itPtr += 4*ITER_BYTES) { + __builtin_prefetch(itPtr + 16*ITER_BYTES); } + + for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; + itPtr += ITER_BYTES) { + if (unlikely(itPtr > tryFloodDetect)) { + tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect, + &floodBackoff, &control, + ITER_BYTES); + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { + return HWLM_TERMINATED; + } + } + u64a conf0; + u64a conf8; + __builtin_prefetch(itPtr + 16*ITER_BYTES); + switch (stride) { + case 1: + get_conf_stride_1(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state); + break; + case 2: + get_conf_stride_2(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state); + break; + case 4: + get_conf_stride_4(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state); + break; + default: + break; + } + + do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, &last_match_id, z); + do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, &last_match_id, z); + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { + return HWLM_TERMINATED; + } + } /* end for loop */ } return HWLM_SUCCESS; From 9643bb4636145c26d30c37d6e046d2bd3776d7f6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 19 Feb 2024 13:09:02 +0200 Subject: [PATCH 2/4] WIP: rework fdr to use fewer instructions, gives about 10% performance increase on SSE/AVX2 --- src/fdr/fdr.c | 128 ++++++++++++++++++++++-------- src/util/arch/arm/simd_utils.h | 10 +++ src/util/arch/common/simd_utils.h | 8 ++ src/util/arch/x86/simd_utils.h | 33 ++++++++ 4 files changed, 144 insertions(+), 35 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d67e27199..62a08e4e4 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2024, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -103,6 +104,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } +#include "../print_simd.h" static really_inline void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, @@ -111,41 +113,97 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a it_hi = *(const u64a *)itPtr; - u64a it_lo = *(const u64a *)(itPtr + 8); - u64a reach0 = domain_mask & it_hi; - u64a reach1 = domain_mask & (it_hi >> 8); - u64a reach2 = domain_mask & (it_hi >> 16); - u64a reach3 = domain_mask & (it_hi >> 24); - u64a reach4 = domain_mask & (it_hi >> 32); - u64a reach5 = domain_mask & (it_hi >> 40); - u64a reach6 = domain_mask & (it_hi >> 48); - u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); - u64a reach8 = domain_mask & it_lo; - u64a reach9 = 
domain_mask & (it_lo >> 8); - u64a reach10 = domain_mask & (it_lo >> 16); - u64a reach11 = domain_mask & (it_lo >> 24); - u64a reach12 = domain_mask & (it_lo >> 32); - u64a reach13 = domain_mask & (it_lo >> 40); - u64a reach14 = domain_mask & (it_lo >> 48); - u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); - - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1); - m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2); - m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3); - m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4); - m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5); - m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6); - m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7); - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1); - m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2); - m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3); - m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4); - m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5); - m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6); - m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7); + // u64a ALIGN_ATTR(16) reach[16]; + u32 ALIGN_ATTR(16) reach[16]; + + m128 domain_mask_v = set1_4x32(domain_mask); + // m256 ft_v = set1_4x64((ptrdiff_t)ft); + + m128 it_v = loadu128(itPtr); + m128 it_shifted8_v = rshiftbyte_m128(it_v, 1); + m128 it_shifted16_v = rshiftbyte_m128(it_v, 2); + m128 it_shifted24_v = rshiftbyte_m128(it_v, 3); + it_shifted24_v = insert32_m128(it_shifted24_v, unaligned_load_u32(itPtr + 15), 3); + + m128 reach_v[4]; + // m256 reach64_v[4]; + + reach_v[0] = and128(domain_mask_v, it_v); + reach_v[1] = and128(domain_mask_v, it_shifted8_v); + reach_v[2] = and128(domain_mask_v, it_shifted16_v); + reach_v[3] = and128(domain_mask_v, it_shifted24_v); + + // reach_v[0] = lshift32_m128(reach_v[0], 3); + // reach_v[1] = lshift32_m128(reach_v[1], 3); + // reach_v[2] = lshift32_m128(reach_v[2], 3); + // reach_v[3] = lshift32_m128(reach_v[3], 3); + + // reach64_v[0] = widen128(reach_v[0]); + // reach64_v[1] = widen128(reach_v[1]); + // reach64_v[2] = widen128(reach_v[2]); + // reach64_v[3] = widen128(reach_v[3]); + + // reach64_v[0] = add256(reach64_v[0], ft_v); + // reach64_v[1] = add256(reach64_v[1], ft_v); + // reach64_v[2] = add256(reach64_v[2], ft_v); + // reach64_v[3] = add256(reach64_v[3], ft_v); + + // store256(&reach[0], reach64_v[0]); + // store256(&reach[4], reach64_v[1]); + // store256(&reach[8], reach64_v[2]); + // store256(&reach[12], reach64_v[3]); + store128(&reach[0], reach_v[0]); + store128(&reach[4], reach_v[1]); + store128(&reach[8], reach_v[2]); + store128(&reach[12], reach_v[3]); + + m128 st0 = load_m128_from_u64a(ft + reach[0]); + m128 st4 = load_m128_from_u64a(ft + reach[1]); + m128 st8 = load_m128_from_u64a(ft + reach[2]); + m128 st12 = load_m128_from_u64a(ft + reach[3]); + m128 st1 = load_m128_from_u64a(ft + reach[4]); + m128 st5 = load_m128_from_u64a(ft + reach[5]); + m128 st9 = load_m128_from_u64a(ft + reach[6]); + m128 st13 = load_m128_from_u64a(ft + reach[7]); + m128 st2 = load_m128_from_u64a(ft + reach[8]); + m128 st6 = load_m128_from_u64a(ft + reach[9]); + m128 st10 = load_m128_from_u64a(ft + reach[10]); + m128 st14 = load_m128_from_u64a(ft + reach[11]); + m128 st3 = load_m128_from_u64a(ft + reach[12]); 
+ m128 st7 = load_m128_from_u64a(ft + reach[13]); + m128 st11 = load_m128_from_u64a(ft + reach[14]); + m128 st15 = load_m128_from_u64a(ft + reach[15]); + // m128 st0 = load_m128_from_u64a((u64a *)reach[0]); + // m128 st4 = load_m128_from_u64a((u64a *)reach[1]); + // m128 st8 = load_m128_from_u64a((u64a *)reach[2]); + // m128 st12 = load_m128_from_u64a((u64a *)reach[3]); + // m128 st1 = load_m128_from_u64a((u64a *)reach[4]); + // m128 st5 = load_m128_from_u64a((u64a *)reach[5]); + // m128 st9 = load_m128_from_u64a((u64a *)reach[6]); + // m128 st13 = load_m128_from_u64a((u64a *)reach[7]); + // m128 st2 = load_m128_from_u64a((u64a *)reach[8]); + // m128 st6 = load_m128_from_u64a((u64a *)reach[9]); + // m128 st10 = load_m128_from_u64a((u64a *)reach[10]); + // m128 st14 = load_m128_from_u64a((u64a *)reach[11]); + // m128 st3 = load_m128_from_u64a((u64a *)reach[12]); + // m128 st7 = load_m128_from_u64a((u64a *)reach[13]); + // m128 st11 = load_m128_from_u64a((u64a *)reach[14]); + // m128 st15 = load_m128_from_u64a((u64a *)reach[15]); + + st1 = lshiftbyte_m128(st1, 1); + st2 = lshiftbyte_m128(st2, 2); + st3 = lshiftbyte_m128(st3, 3); + st4 = lshiftbyte_m128(st4, 4); + st5 = lshiftbyte_m128(st5, 5); + st6 = lshiftbyte_m128(st6, 6); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st10 = lshiftbyte_m128(st10, 2); + st11 = lshiftbyte_m128(st11, 3); + st12 = lshiftbyte_m128(st12, 4); + st13 = lshiftbyte_m128(st13, 5); + st14 = lshiftbyte_m128(st14, 6); + st15 = lshiftbyte_m128(st15, 7); st0 = or128(st0, st1); st2 = or128(st2, st3); diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 7f8539b09..858866d77 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -181,6 +181,10 @@ static really_inline m128 set1_2x64(u64a c) { return (m128) vdupq_n_u64(c); } +static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { + return vsetq_lane_u32((uint32x4_t)in, val, imm); +} + static really_inline u32 movd(const m128 in) { return vgetq_lane_u32((uint32x4_t) in, 0); } @@ -195,6 +199,12 @@ m128 load_m128_from_u64a(const u64a *p) { return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); } +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); +} + static really_inline u32 extract32from128(const m128 in, unsigned imm) { #if defined(HAVE__BUILTIN_CONSTANT_P) if (__builtin_constant_p(imm)) { diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 24331b103..4ac92ab3a 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -384,6 +384,14 @@ m256 pshufb_m256(m256 a, m256 b) { return rv; } +static really_inline +m256 widen128(m128 x) { + m256 rv; + rv.lo = widenlo128(x); + rv.hi = widenhi128(x); + return rv; +} + #endif // HAVE_SIMD_256_BITS /**** diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 49797abab..9c2984c0e 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -122,6 +122,17 @@ m128 sub_2x64(m128 a, m128 b) { return (m128) _mm_sub_epi64(a, b); } +static really_really_inline +m128 lshift32_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi32(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi32(a, x); +} + static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { #if 
defined(HAVE__BUILTIN_CONSTANT_P) @@ -156,6 +167,10 @@ static really_inline m128 set1_2x64(u64a c) { return _mm_set1_epi64x(c); } +static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { + return _mm_insert_epi32(in, val, imm); +} + static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } @@ -451,6 +466,18 @@ m128 set2x64(u64a hi, u64a lo) { return _mm_set_epi64x(hi, lo); } +#include "../print_simd.h" + +static really_inline +m128 widenlo128(m128 x) { + return _mm_unpacklo_epi32(x, zeroes128()); +} + +static really_inline +m128 widenhi128(m128 x) { + return _mm_unpackhi_epi32(x, zeroes128()); +} + /**** **** 256-bit Primitives ****/ @@ -677,6 +704,12 @@ m256 combine2x128(m128 hi, m128 lo) { return insert128to256(cast128to256(lo), hi, 1); #endif } + +static really_inline +m256 widen128(m128 x) { + return (m256) _mm256_cvtepu32_epi64(x); +} + #endif //AVX2 /**** From 880d6bcbf0f013054557f3d4bc97e3ea359361a9 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 19 Feb 2024 19:42:14 +0800 Subject: [PATCH 3/4] fix arm build --- src/util/arch/arm/simd_utils.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 858866d77..b097e66a8 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -182,7 +182,7 @@ static really_inline m128 set1_2x64(u64a c) { } static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { - return vsetq_lane_u32((uint32x4_t)in, val, imm); + return (m128) vsetq_lane_u32(val, (uint32x4_t)in, imm); } static really_inline u32 movd(const m128 in) { @@ -199,12 +199,6 @@ m128 load_m128_from_u64a(const u64a *p) { return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); } -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); -} - static really_inline u32 extract32from128(const m128 in, unsigned imm) { #if defined(HAVE__BUILTIN_CONSTANT_P) if (__builtin_constant_p(imm)) { @@ -449,4 +443,14 @@ m128 set2x64(u64a hi, u64a lo) { return (m128) vld1q_u64((uint64_t *) data); } +static really_inline +m128 widenlo128(m128 x) { + return (m128) vmovl_u32(vget_low_u32((uint32x4_t)x)); +} + +static really_inline +m128 widenhi128(m128 x) { + return (m128) vmovl_u32(vget_high_u32((uint32x4_t)x)); +} + #endif // ARCH_ARM_SIMD_UTILS_H From a5fdbcb873d414d5f14305d924e2c2267de0e0cb Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 19 Feb 2024 20:37:46 +0800 Subject: [PATCH 4/4] reorder instructions, for some reason it's faster on x86 but slower on arm, needs investigation --- src/fdr/fdr.c | 80 +++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 62a08e4e4..8c94d0d04 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -104,7 +104,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } -#include "../print_simd.h" +//#include "../print_simd.h" static really_inline void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, @@ -158,21 +158,50 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, store128(&reach[12], reach_v[3]); m128 st0 = load_m128_from_u64a(ft + reach[0]); - m128 st4 = load_m128_from_u64a(ft + reach[1]); - m128 st8 = load_m128_from_u64a(ft + reach[2]); - m128 st12 = load_m128_from_u64a(ft + reach[3]); m128 st1 = 
load_m128_from_u64a(ft + reach[4]); - m128 st5 = load_m128_from_u64a(ft + reach[5]); - m128 st9 = load_m128_from_u64a(ft + reach[6]); - m128 st13 = load_m128_from_u64a(ft + reach[7]); + st1 = lshiftbyte_m128(st1, 1); + st0 = or128(st0, st1); + m128 st2 = load_m128_from_u64a(ft + reach[8]); - m128 st6 = load_m128_from_u64a(ft + reach[9]); - m128 st10 = load_m128_from_u64a(ft + reach[10]); - m128 st14 = load_m128_from_u64a(ft + reach[11]); + st2 = lshiftbyte_m128(st2, 2); m128 st3 = load_m128_from_u64a(ft + reach[12]); + st3 = lshiftbyte_m128(st3, 3); + st2 = or128(st2, st3); + + m128 st4 = load_m128_from_u64a(ft + reach[1]); + st4 = lshiftbyte_m128(st4, 4); + m128 st5 = load_m128_from_u64a(ft + reach[5]); + st5 = lshiftbyte_m128(st5, 5); + st4 = or128(st4, st5); + + m128 st6 = load_m128_from_u64a(ft + reach[9]); + st6 = lshiftbyte_m128(st6, 6); m128 st7 = load_m128_from_u64a(ft + reach[13]); + st7 = lshiftbyte_m128(st7, 7); + st6 = or128(st6, st7); + + m128 st8 = load_m128_from_u64a(ft + reach[2]); + m128 st9 = load_m128_from_u64a(ft + reach[6]); + st9 = lshiftbyte_m128(st9, 1); + st8 = or128(st8, st9); + + m128 st10 = load_m128_from_u64a(ft + reach[10]); + st10 = lshiftbyte_m128(st10, 2); m128 st11 = load_m128_from_u64a(ft + reach[14]); + st11 = lshiftbyte_m128(st11, 3); + st10 = or128(st10, st11); + + m128 st12 = load_m128_from_u64a(ft + reach[3]); + st12 = lshiftbyte_m128(st12, 4); + m128 st13 = load_m128_from_u64a(ft + reach[7]); + st13 = lshiftbyte_m128(st13, 5); + st12 = or128(st12, st13); + + m128 st14 = load_m128_from_u64a(ft + reach[11]); + st14 = lshiftbyte_m128(st14, 6); m128 st15 = load_m128_from_u64a(ft + reach[15]); + st15 = lshiftbyte_m128(st15, 7); + st14 = or128(st14, st15); // m128 st0 = load_m128_from_u64a((u64a *)reach[0]); // m128 st4 = load_m128_from_u64a((u64a *)reach[1]); // m128 st8 = load_m128_from_u64a((u64a *)reach[2]); @@ -190,42 +219,19 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, // m128 st11 = load_m128_from_u64a((u64a *)reach[14]); // m128 st15 = load_m128_from_u64a((u64a *)reach[15]); - st1 = lshiftbyte_m128(st1, 1); - st2 = lshiftbyte_m128(st2, 2); - st3 = lshiftbyte_m128(st3, 3); - st4 = lshiftbyte_m128(st4, 4); - st5 = lshiftbyte_m128(st5, 5); - st6 = lshiftbyte_m128(st6, 6); - st7 = lshiftbyte_m128(st7, 7); - st9 = lshiftbyte_m128(st9, 1); - st10 = lshiftbyte_m128(st10, 2); - st11 = lshiftbyte_m128(st11, 3); - st12 = lshiftbyte_m128(st12, 4); - st13 = lshiftbyte_m128(st13, 5); - st14 = lshiftbyte_m128(st14, 6); - st15 = lshiftbyte_m128(st15, 7); - st0 = or128(st0, st1); - st2 = or128(st2, st3); - st4 = or128(st4, st5); - st6 = or128(st6, st7); st0 = or128(st0, st2); st4 = or128(st4, st6); st0 = or128(st0, st4); - - st8 = or128(st8, st9); - st10 = or128(st10, st11); - st12 = or128(st12, st13); - st14 = or128(st14, st15); + m128 st = or128(*s, st0); + *conf0 = movq(st) ^ ~0ULL; + st = rshiftbyte_m128(st, 8); + st8 = or128(st8, st10); st12 = or128(st12, st14); st8 = or128(st8, st12); - m128 st = or128(*s, st0); - *conf0 = movq(st) ^ ~0ULL; - st = rshiftbyte_m128(st, 8); st = or128(st, st8); - *conf8 = movq(st) ^ ~0ULL; *s = rshiftbyte_m128(st, 8); }
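
Reference sketch of the stride-1 data flow:

The computation that get_conf_stride_1() performs with SIMD in the patches above boils down to the scalar model below. This is an illustrative sketch only, not the implementation: the helper name stride1_confirm_model() is hypothetical, standard integer types stand in for the u8/u32/u64a typedefs, and it assumes a little-endian host, an 8-byte-per-entry reach table ft[], a domain_mask (fdr->domainMask) with at most 16 significant bits, and a zone layout that makes itPtr[16] readable (the "+1" guarantee noted in the code).

#include <stdint.h>
#include <string.h>

/* Scalar model of the stride-1 confirm computation; sketch for reference
 * only.  stride1_confirm_model() is a hypothetical name and uint8_t/
 * uint32_t/uint64_t stand in for the u8/u32/u64a typedefs. */
static void stride1_confirm_model(const uint8_t *itPtr, uint32_t domain_mask,
                                  const uint64_t *ft,
                                  uint64_t *conf0, uint64_t *conf8,
                                  uint8_t state[16]) {
    uint8_t st[16];
    memcpy(st, state, sizeof(st));               /* carried-in state (*s) */

    for (int half = 0; half < 2; half++) {
        for (int i = 0; i < 8; i++) {
            int pos = half * 8 + i;
            /* 16-bit window at byte offset pos, masked down to the domain */
            uint32_t idx = domain_mask &
                           (itPtr[pos] | ((uint32_t)itPtr[pos + 1] << 8));
            const uint8_t *entry = (const uint8_t *)(ft + idx);
            /* OR the 8-byte table entry in, moved up by i bytes: the scalar
             * equivalent of load_m128_from_u64a() + lshiftbyte_m128(x, i) */
            for (int b = 0; b < 8; b++) {
                st[i + b] |= entry[b];
            }
        }
        /* movq(st) ^ ~0ULL: the complemented low 64 bits become the confirm
         * word handed to do_confirm_fdr() */
        uint64_t low;
        memcpy(&low, st, sizeof(low));
        *(half ? conf8 : conf0) = ~low;
        /* rshiftbyte_m128(st, 8): discard the half that was just confirmed */
        memmove(st, st + 8, 8);
        memset(st + 8, 0, 8);
    }
    memcpy(state, st, sizeof(st));               /* state for next iteration */
}

Each byte offset contributes a 16-bit window masked down to the domain, the matching 8-byte table entry is OR'd into the running state at that offset within its half, and the complemented low 64 bits form the conf0/conf8 words passed to do_confirm_fdr(). Patch 4 performs this same computation but interleaves the loads, byte shifts and ORs, presumably to shorten dependency chains; as its commit message notes, that ordering helps on x86 but hurts on ARM and still needs investigation.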