From b26d5e0c63216e7050783b97abde2c8668436786 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Tue, 12 Dec 2023 18:16:13 +0200 Subject: [PATCH 1/4] WIP: remove the use of macros for critical loops, easier to debug, same performance --- src/fdr/fdr.c | 191 +++++++++++++++++--------------------------------- 1 file changed, 64 insertions(+), 127 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 561e8f986..d67e27199 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -82,44 +82,6 @@ struct zone { const u8 *floodPtr; }; -static -const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { - { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, - { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, - { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } -}; - /* generates an initial state mask based on the last byte-ish of history rather * than being all accepting. 
If there is no history to consider, the state is * generated based on the minimum length of each bucket in order to prevent @@ -141,13 +103,13 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } + static really_inline void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + UNUSED const u8 *end_ptr, u32 domain_mask, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a domain_mask = ~domain_mask_flipped; u64a it_hi = *(const u64a *)itPtr; u64a it_lo = *(const u64a *)(itPtr + 8); @@ -212,25 +174,25 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, static really_inline void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + UNUSED const u8 *end_ptr, u32 domain_mask, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach2 = andn(domain_mask_flipped, itPtr + 2); - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + u64a it_hi = *(const u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach2 = domain_mask & (it_hi >> 16); + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach6 = domain_mask & (it_hi >> 48); + u64a reach8 = domain_mask & it_lo; + u64a reach10 = domain_mask & (it_lo >> 16); + u64a reach12 = domain_mask & (it_lo >> 32); + u64a reach14 = domain_mask & (it_lo >> 48); m128 st0 = load_m128_from_u64a(ft + reach0); m128 st2 = load_m128_from_u64a(ft + reach2); m128 st4 = load_m128_from_u64a(ft + reach4); m128 st6 = load_m128_from_u64a(ft + reach6); - - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach10 = andn(domain_mask_flipped, itPtr + 10); - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); - u64a reach14 = andn(domain_mask_flipped, itPtr + 14); - m128 st8 = load_m128_from_u64a(ft + reach8); m128 st10 = load_m128_from_u64a(ft + reach10); m128 st12 = load_m128_from_u64a(ft + reach12); @@ -239,6 +201,9 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, st2 = lshiftbyte_m128(st2, 2); st4 = lshiftbyte_m128(st4, 4); st6 = lshiftbyte_m128(st6, 6); + st10 = lshiftbyte_m128(st10, 2); + st12 = lshiftbyte_m128(st12, 4); + st14 = lshiftbyte_m128(st14, 6); *s = or128(*s, st0); *s = or128(*s, st2); @@ -249,10 +214,6 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, *s = rshiftbyte_m128(*s, 8); *conf0 ^= ~0ULL; - st10 = lshiftbyte_m128(st10, 2); - st12 = lshiftbyte_m128(st12, 4); - st14 = lshiftbyte_m128(st14, 6); - *s = or128(*s, st8); *s = or128(*s, st10); *s = or128(*s, st12); @@ -265,14 +226,16 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, static really_inline void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, - UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + UNUSED const u8 *end_ptr, u32 domain_mask, const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a reach0 = andn(domain_mask_flipped, itPtr); - u64a reach4 = andn(domain_mask_flipped, itPtr + 4); - u64a reach8 = andn(domain_mask_flipped, itPtr + 8); - u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a it_hi = *(const 
u64a *)itPtr; + u64a it_lo = *(const u64a *)(itPtr + 8); + u64a reach0 = domain_mask & it_hi; + u64a reach4 = domain_mask & (it_hi >> 32); + u64a reach8 = domain_mask & it_lo; + u64a reach12 = domain_mask & (it_lo >> 32); m128 st0 = load_m128_from_u64a(ft + reach0); m128 st4 = load_m128_from_u64a(ft + reach4); @@ -660,41 +623,6 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, #define INVALID_MATCH_ID (~0U) -#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ - do { \ - const u8 *tryFloodDetect = zz->floodPtr; \ - const u8 *start_ptr = zz->start; \ - const u8 *end_ptr = zz->end; \ - for (const u8 *itPtr = ROUNDDOWN_PTR(start_ptr, 64); itPtr + 4*ITER_BYTES <= end_ptr; \ - itPtr += 4*ITER_BYTES) { \ - __builtin_prefetch(itPtr); \ - } \ - \ - for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ - itPtr += ITER_BYTES) { \ - if (unlikely(itPtr > tryFloodDetect)) { \ - tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ - &floodBackoff, &control, \ - ITER_BYTES); \ - if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - return HWLM_TERMINATED; \ - } \ - } \ - __builtin_prefetch(itPtr + ITER_BYTES); \ - u64a conf0; \ - u64a conf8; \ - get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ - ft, &conf0, &conf8, &s); \ - do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ - &last_match_id, zz); \ - do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ - &last_match_id, zz); \ - if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ - return HWLM_TERMINATED; \ - } \ - } /* end for loop */ \ - } while (0) \ - static never_inline hwlm_error_t fdr_engine_exec(const struct FDR *fdr, const struct FDR_Runtime_Args *a, @@ -703,7 +631,7 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, u32 floodBackoff = FLOOD_BACKOFF_START; u32 last_match_id = INVALID_MATCH_ID; - u32 domain_mask_flipped = ~fdr->domainMask; + u32 domain_mask = fdr->domainMask; u8 stride = fdr->stride; const u64a *ft = (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); @@ -722,42 +650,51 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr, for (size_t curZone = 0; curZone < numZone; curZone++) { struct zone *z = &zones[curZone]; - dumpZoneInfo(z, curZone); - - /* When a zone contains less data than is processed in an iteration - * of FDR_MAIN_LOOP(), we need to scan over some extra data. - * - * We have chosen to scan this extra data at the start of the - * iteration. The extra data is either data we have already scanned or - * garbage (if it is earlier than offset 0), - * - * As a result we need to shift the incoming state back so that it will - * properly line up with the data being scanned. - * - * We also need to forbid reporting any matches in the data being - * rescanned as they have already been reported (or are over garbage but - * later stages should also provide that safety guarantee). 
- */ u8 shift = z->shift; - state = variable_byte_shift_m128(state, shift); + state = or128(state, variable_byte_shift_m128(ones128(), shift-16)); - state = or128(state, load128(zone_or_mask[shift])); - - switch (stride) { - case 1: - FDR_MAIN_LOOP(z, state, get_conf_stride_1); - break; - case 2: - FDR_MAIN_LOOP(z, state, get_conf_stride_2); - break; - case 4: - FDR_MAIN_LOOP(z, state, get_conf_stride_4); - break; - default: - break; + const u8 *tryFloodDetect = z->floodPtr; + const u8 *start_ptr = z->start; + const u8 *end_ptr = z->end; + for (const u8 *itPtr = ROUNDDOWN_PTR(z->start, 64); itPtr + 4*ITER_BYTES <= z->end; itPtr += 4*ITER_BYTES) { + __builtin_prefetch(itPtr + 16*ITER_BYTES); } + + for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; + itPtr += ITER_BYTES) { + if (unlikely(itPtr > tryFloodDetect)) { + tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect, + &floodBackoff, &control, + ITER_BYTES); + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { + return HWLM_TERMINATED; + } + } + u64a conf0; + u64a conf8; + __builtin_prefetch(itPtr + 16*ITER_BYTES); + switch (stride) { + case 1: + get_conf_stride_1(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state); + break; + case 2: + get_conf_stride_2(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state); + break; + case 4: + get_conf_stride_4(itPtr, start_ptr, end_ptr, domain_mask, ft, &conf0, &conf8, &state); + break; + default: + break; + } + + do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, &last_match_id, z); + do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, &last_match_id, z); + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { + return HWLM_TERMINATED; + } + } /* end for loop */ } return HWLM_SUCCESS; From 9643bb4636145c26d30c37d6e046d2bd3776d7f6 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 19 Feb 2024 13:09:02 +0200 Subject: [PATCH 2/4] WIP: rework fdr to use fewer instructions, gives about 10% performance increase on SSE/AVX2 --- src/fdr/fdr.c | 128 ++++++++++++++++++++++-------- src/util/arch/arm/simd_utils.h | 10 +++ src/util/arch/common/simd_utils.h | 8 ++ src/util/arch/x86/simd_utils.h | 33 ++++++++ 4 files changed, 144 insertions(+), 35 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index d67e27199..62a08e4e4 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2024, VectorCamp PC * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -103,6 +104,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } +#include "../print_simd.h" static really_inline void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, @@ -111,41 +113,97 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, /* +1: the zones ensure that we can read the byte at z->end */ assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); - u64a it_hi = *(const u64a *)itPtr; - u64a it_lo = *(const u64a *)(itPtr + 8); - u64a reach0 = domain_mask & it_hi; - u64a reach1 = domain_mask & (it_hi >> 8); - u64a reach2 = domain_mask & (it_hi >> 16); - u64a reach3 = domain_mask & (it_hi >> 24); - u64a reach4 = domain_mask & (it_hi >> 32); - u64a reach5 = domain_mask & (it_hi >> 40); - u64a reach6 = domain_mask & (it_hi >> 48); - u64a reach7 = domain_mask & ((it_hi >> 56) | (it_lo << 8)); - u64a reach8 = domain_mask & it_lo; - u64a reach9 = 
domain_mask & (it_lo >> 8); - u64a reach10 = domain_mask & (it_lo >> 16); - u64a reach11 = domain_mask & (it_lo >> 24); - u64a reach12 = domain_mask & (it_lo >> 32); - u64a reach13 = domain_mask & (it_lo >> 40); - u64a reach14 = domain_mask & (it_lo >> 48); - u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15); - - m128 st0 = load_m128_from_u64a(ft + reach0); - m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1); - m128 st2 = lshiftbyte_m128(load_m128_from_u64a(ft + reach2), 2); - m128 st3 = lshiftbyte_m128(load_m128_from_u64a(ft + reach3), 3); - m128 st4 = lshiftbyte_m128(load_m128_from_u64a(ft + reach4), 4); - m128 st5 = lshiftbyte_m128(load_m128_from_u64a(ft + reach5), 5); - m128 st6 = lshiftbyte_m128(load_m128_from_u64a(ft + reach6), 6); - m128 st7 = lshiftbyte_m128(load_m128_from_u64a(ft + reach7), 7); - m128 st8 = load_m128_from_u64a(ft + reach8); - m128 st9 = lshiftbyte_m128(load_m128_from_u64a(ft + reach9), 1); - m128 st10 = lshiftbyte_m128(load_m128_from_u64a(ft + reach10), 2); - m128 st11 = lshiftbyte_m128(load_m128_from_u64a(ft + reach11), 3); - m128 st12 = lshiftbyte_m128(load_m128_from_u64a(ft + reach12), 4); - m128 st13 = lshiftbyte_m128(load_m128_from_u64a(ft + reach13), 5); - m128 st14 = lshiftbyte_m128(load_m128_from_u64a(ft + reach14), 6); - m128 st15 = lshiftbyte_m128(load_m128_from_u64a(ft + reach15), 7); + // u64a ALIGN_ATTR(16) reach[16]; + u32 ALIGN_ATTR(16) reach[16]; + + m128 domain_mask_v = set1_4x32(domain_mask); + // m256 ft_v = set1_4x64((ptrdiff_t)ft); + + m128 it_v = loadu128(itPtr); + m128 it_shifted8_v = rshiftbyte_m128(it_v, 1); + m128 it_shifted16_v = rshiftbyte_m128(it_v, 2); + m128 it_shifted24_v = rshiftbyte_m128(it_v, 3); + it_shifted24_v = insert32_m128(it_shifted24_v, unaligned_load_u32(itPtr + 15), 3); + + m128 reach_v[4]; + // m256 reach64_v[4]; + + reach_v[0] = and128(domain_mask_v, it_v); + reach_v[1] = and128(domain_mask_v, it_shifted8_v); + reach_v[2] = and128(domain_mask_v, it_shifted16_v); + reach_v[3] = and128(domain_mask_v, it_shifted24_v); + + // reach_v[0] = lshift32_m128(reach_v[0], 3); + // reach_v[1] = lshift32_m128(reach_v[1], 3); + // reach_v[2] = lshift32_m128(reach_v[2], 3); + // reach_v[3] = lshift32_m128(reach_v[3], 3); + + // reach64_v[0] = widen128(reach_v[0]); + // reach64_v[1] = widen128(reach_v[1]); + // reach64_v[2] = widen128(reach_v[2]); + // reach64_v[3] = widen128(reach_v[3]); + + // reach64_v[0] = add256(reach64_v[0], ft_v); + // reach64_v[1] = add256(reach64_v[1], ft_v); + // reach64_v[2] = add256(reach64_v[2], ft_v); + // reach64_v[3] = add256(reach64_v[3], ft_v); + + // store256(&reach[0], reach64_v[0]); + // store256(&reach[4], reach64_v[1]); + // store256(&reach[8], reach64_v[2]); + // store256(&reach[12], reach64_v[3]); + store128(&reach[0], reach_v[0]); + store128(&reach[4], reach_v[1]); + store128(&reach[8], reach_v[2]); + store128(&reach[12], reach_v[3]); + + m128 st0 = load_m128_from_u64a(ft + reach[0]); + m128 st4 = load_m128_from_u64a(ft + reach[1]); + m128 st8 = load_m128_from_u64a(ft + reach[2]); + m128 st12 = load_m128_from_u64a(ft + reach[3]); + m128 st1 = load_m128_from_u64a(ft + reach[4]); + m128 st5 = load_m128_from_u64a(ft + reach[5]); + m128 st9 = load_m128_from_u64a(ft + reach[6]); + m128 st13 = load_m128_from_u64a(ft + reach[7]); + m128 st2 = load_m128_from_u64a(ft + reach[8]); + m128 st6 = load_m128_from_u64a(ft + reach[9]); + m128 st10 = load_m128_from_u64a(ft + reach[10]); + m128 st14 = load_m128_from_u64a(ft + reach[11]); + m128 st3 = load_m128_from_u64a(ft + reach[12]); 
+ m128 st7 = load_m128_from_u64a(ft + reach[13]); + m128 st11 = load_m128_from_u64a(ft + reach[14]); + m128 st15 = load_m128_from_u64a(ft + reach[15]); + // m128 st0 = load_m128_from_u64a((u64a *)reach[0]); + // m128 st4 = load_m128_from_u64a((u64a *)reach[1]); + // m128 st8 = load_m128_from_u64a((u64a *)reach[2]); + // m128 st12 = load_m128_from_u64a((u64a *)reach[3]); + // m128 st1 = load_m128_from_u64a((u64a *)reach[4]); + // m128 st5 = load_m128_from_u64a((u64a *)reach[5]); + // m128 st9 = load_m128_from_u64a((u64a *)reach[6]); + // m128 st13 = load_m128_from_u64a((u64a *)reach[7]); + // m128 st2 = load_m128_from_u64a((u64a *)reach[8]); + // m128 st6 = load_m128_from_u64a((u64a *)reach[9]); + // m128 st10 = load_m128_from_u64a((u64a *)reach[10]); + // m128 st14 = load_m128_from_u64a((u64a *)reach[11]); + // m128 st3 = load_m128_from_u64a((u64a *)reach[12]); + // m128 st7 = load_m128_from_u64a((u64a *)reach[13]); + // m128 st11 = load_m128_from_u64a((u64a *)reach[14]); + // m128 st15 = load_m128_from_u64a((u64a *)reach[15]); + + st1 = lshiftbyte_m128(st1, 1); + st2 = lshiftbyte_m128(st2, 2); + st3 = lshiftbyte_m128(st3, 3); + st4 = lshiftbyte_m128(st4, 4); + st5 = lshiftbyte_m128(st5, 5); + st6 = lshiftbyte_m128(st6, 6); + st7 = lshiftbyte_m128(st7, 7); + st9 = lshiftbyte_m128(st9, 1); + st10 = lshiftbyte_m128(st10, 2); + st11 = lshiftbyte_m128(st11, 3); + st12 = lshiftbyte_m128(st12, 4); + st13 = lshiftbyte_m128(st13, 5); + st14 = lshiftbyte_m128(st14, 6); + st15 = lshiftbyte_m128(st15, 7); st0 = or128(st0, st1); st2 = or128(st2, st3); diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 7f8539b09..858866d77 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -181,6 +181,10 @@ static really_inline m128 set1_2x64(u64a c) { return (m128) vdupq_n_u64(c); } +static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { + return vsetq_lane_u32((uint32x4_t)in, val, imm); +} + static really_inline u32 movd(const m128 in) { return vgetq_lane_u32((uint32x4_t) in, 0); } @@ -195,6 +199,12 @@ m128 load_m128_from_u64a(const u64a *p) { return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); } +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); +} + static really_inline u32 extract32from128(const m128 in, unsigned imm) { #if defined(HAVE__BUILTIN_CONSTANT_P) if (__builtin_constant_p(imm)) { diff --git a/src/util/arch/common/simd_utils.h b/src/util/arch/common/simd_utils.h index 24331b103..4ac92ab3a 100644 --- a/src/util/arch/common/simd_utils.h +++ b/src/util/arch/common/simd_utils.h @@ -384,6 +384,14 @@ m256 pshufb_m256(m256 a, m256 b) { return rv; } +static really_inline +m256 widen128(m128 x) { + m256 rv; + rv.lo = widenlo128(x); + rv.hi = widenhi128(x); + return rv; +} + #endif // HAVE_SIMD_256_BITS /**** diff --git a/src/util/arch/x86/simd_utils.h b/src/util/arch/x86/simd_utils.h index 49797abab..9c2984c0e 100644 --- a/src/util/arch/x86/simd_utils.h +++ b/src/util/arch/x86/simd_utils.h @@ -122,6 +122,17 @@ m128 sub_2x64(m128 a, m128 b) { return (m128) _mm_sub_epi64(a, b); } +static really_really_inline +m128 lshift32_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi32(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi32(a, x); +} + static really_really_inline m128 lshift64_m128(m128 a, unsigned b) { #if 
defined(HAVE__BUILTIN_CONSTANT_P) @@ -156,6 +167,10 @@ static really_inline m128 set1_2x64(u64a c) { return _mm_set1_epi64x(c); } +static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { + return _mm_insert_epi32(in, val, imm); +} + static really_inline u32 movd(const m128 in) { return _mm_cvtsi128_si32(in); } @@ -451,6 +466,18 @@ m128 set2x64(u64a hi, u64a lo) { return _mm_set_epi64x(hi, lo); } +#include "../print_simd.h" + +static really_inline +m128 widenlo128(m128 x) { + return _mm_unpacklo_epi32(x, zeroes128()); +} + +static really_inline +m128 widenhi128(m128 x) { + return _mm_unpackhi_epi32(x, zeroes128()); +} + /**** **** 256-bit Primitives ****/ @@ -677,6 +704,12 @@ m256 combine2x128(m128 hi, m128 lo) { return insert128to256(cast128to256(lo), hi, 1); #endif } + +static really_inline +m256 widen128(m128 x) { + return (m256) _mm256_cvtepu32_epi64(x); +} + #endif //AVX2 /**** From 880d6bcbf0f013054557f3d4bc97e3ea359361a9 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 19 Feb 2024 19:42:14 +0800 Subject: [PATCH 3/4] fix arm build --- src/util/arch/arm/simd_utils.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/util/arch/arm/simd_utils.h b/src/util/arch/arm/simd_utils.h index 858866d77..b097e66a8 100644 --- a/src/util/arch/arm/simd_utils.h +++ b/src/util/arch/arm/simd_utils.h @@ -182,7 +182,7 @@ static really_inline m128 set1_2x64(u64a c) { } static really_inline m128 insert32_m128(m128 in, u32 val, const int imm) { - return vsetq_lane_u32((uint32x4_t)in, val, imm); + return (m128) vsetq_lane_u32(val, (uint32x4_t)in, imm); } static really_inline u32 movd(const m128 in) { @@ -199,12 +199,6 @@ m128 load_m128_from_u64a(const u64a *p) { return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); } -/* another form of movq */ -static really_inline -m128 load_m128_from_u64a(const u64a *p) { - return (m128) vsetq_lane_u64(*p, (uint64x2_t) zeroes128(), 0); -} - static really_inline u32 extract32from128(const m128 in, unsigned imm) { #if defined(HAVE__BUILTIN_CONSTANT_P) if (__builtin_constant_p(imm)) { @@ -449,4 +443,14 @@ m128 set2x64(u64a hi, u64a lo) { return (m128) vld1q_u64((uint64_t *) data); } +static really_inline +m128 widenlo128(m128 x) { + return (m128) vmovl_u32(vget_low_u32((uint32x4_t)x)); +} + +static really_inline +m128 widenhi128(m128 x) { + return (m128) vmovl_u32(vget_high_u32((uint32x4_t)x)); +} + #endif // ARCH_ARM_SIMD_UTILS_H From a5fdbcb873d414d5f14305d924e2c2267de0e0cb Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Mon, 19 Feb 2024 20:37:46 +0800 Subject: [PATCH 4/4] reorder instructions, for some reason it's faster on x86 but slower on arm, needs investigation --- src/fdr/fdr.c | 80 +++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/src/fdr/fdr.c b/src/fdr/fdr.c index 62a08e4e4..8c94d0d04 100644 --- a/src/fdr/fdr.c +++ b/src/fdr/fdr.c @@ -104,7 +104,7 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, return s; } -#include "../print_simd.h" +//#include "../print_simd.h" static really_inline void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, @@ -158,21 +158,50 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, store128(&reach[12], reach_v[3]); m128 st0 = load_m128_from_u64a(ft + reach[0]); - m128 st4 = load_m128_from_u64a(ft + reach[1]); - m128 st8 = load_m128_from_u64a(ft + reach[2]); - m128 st12 = load_m128_from_u64a(ft + reach[3]); m128 st1 = 
load_m128_from_u64a(ft + reach[4]); - m128 st5 = load_m128_from_u64a(ft + reach[5]); - m128 st9 = load_m128_from_u64a(ft + reach[6]); - m128 st13 = load_m128_from_u64a(ft + reach[7]); + st1 = lshiftbyte_m128(st1, 1); + st0 = or128(st0, st1); + m128 st2 = load_m128_from_u64a(ft + reach[8]); - m128 st6 = load_m128_from_u64a(ft + reach[9]); - m128 st10 = load_m128_from_u64a(ft + reach[10]); - m128 st14 = load_m128_from_u64a(ft + reach[11]); + st2 = lshiftbyte_m128(st2, 2); m128 st3 = load_m128_from_u64a(ft + reach[12]); + st3 = lshiftbyte_m128(st3, 3); + st2 = or128(st2, st3); + + m128 st4 = load_m128_from_u64a(ft + reach[1]); + st4 = lshiftbyte_m128(st4, 4); + m128 st5 = load_m128_from_u64a(ft + reach[5]); + st5 = lshiftbyte_m128(st5, 5); + st4 = or128(st4, st5); + + m128 st6 = load_m128_from_u64a(ft + reach[9]); + st6 = lshiftbyte_m128(st6, 6); m128 st7 = load_m128_from_u64a(ft + reach[13]); + st7 = lshiftbyte_m128(st7, 7); + st6 = or128(st6, st7); + + m128 st8 = load_m128_from_u64a(ft + reach[2]); + m128 st9 = load_m128_from_u64a(ft + reach[6]); + st9 = lshiftbyte_m128(st9, 1); + st8 = or128(st8, st9); + + m128 st10 = load_m128_from_u64a(ft + reach[10]); + st10 = lshiftbyte_m128(st10, 2); m128 st11 = load_m128_from_u64a(ft + reach[14]); + st11 = lshiftbyte_m128(st11, 3); + st10 = or128(st10, st11); + + m128 st12 = load_m128_from_u64a(ft + reach[3]); + st12 = lshiftbyte_m128(st12, 4); + m128 st13 = load_m128_from_u64a(ft + reach[7]); + st13 = lshiftbyte_m128(st13, 5); + st12 = or128(st12, st13); + + m128 st14 = load_m128_from_u64a(ft + reach[11]); + st14 = lshiftbyte_m128(st14, 6); m128 st15 = load_m128_from_u64a(ft + reach[15]); + st15 = lshiftbyte_m128(st15, 7); + st14 = or128(st14, st15); // m128 st0 = load_m128_from_u64a((u64a *)reach[0]); // m128 st4 = load_m128_from_u64a((u64a *)reach[1]); // m128 st8 = load_m128_from_u64a((u64a *)reach[2]); @@ -190,42 +219,19 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, // m128 st11 = load_m128_from_u64a((u64a *)reach[14]); // m128 st15 = load_m128_from_u64a((u64a *)reach[15]); - st1 = lshiftbyte_m128(st1, 1); - st2 = lshiftbyte_m128(st2, 2); - st3 = lshiftbyte_m128(st3, 3); - st4 = lshiftbyte_m128(st4, 4); - st5 = lshiftbyte_m128(st5, 5); - st6 = lshiftbyte_m128(st6, 6); - st7 = lshiftbyte_m128(st7, 7); - st9 = lshiftbyte_m128(st9, 1); - st10 = lshiftbyte_m128(st10, 2); - st11 = lshiftbyte_m128(st11, 3); - st12 = lshiftbyte_m128(st12, 4); - st13 = lshiftbyte_m128(st13, 5); - st14 = lshiftbyte_m128(st14, 6); - st15 = lshiftbyte_m128(st15, 7); - st0 = or128(st0, st1); - st2 = or128(st2, st3); - st4 = or128(st4, st5); - st6 = or128(st6, st7); st0 = or128(st0, st2); st4 = or128(st4, st6); st0 = or128(st0, st4); - - st8 = or128(st8, st9); - st10 = or128(st10, st11); - st12 = or128(st12, st13); - st14 = or128(st14, st15); + m128 st = or128(*s, st0); + *conf0 = movq(st) ^ ~0ULL; + st = rshiftbyte_m128(st, 8); + st8 = or128(st8, st10); st12 = or128(st12, st14); st8 = or128(st8, st12); - m128 st = or128(*s, st0); - *conf0 = movq(st) ^ ~0ULL; - st = rshiftbyte_m128(st, 8); st = or128(st, st8); - *conf8 = movq(st) ^ ~0ULL; *s = rshiftbyte_m128(st, 8); }
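
Reference sketch of the stride-1 data flow:

The computation that get_conf_stride_1() performs with SIMD in the patches above boils down to the scalar model below. This is an illustrative sketch only, not the implementation: the helper name stride1_confirm_model() is hypothetical, standard integer types stand in for the u8/u32/u64a typedefs, and it assumes a little-endian host, an 8-byte-per-entry reach table ft[], a domain_mask (fdr->domainMask) with at most 16 significant bits, and a zone layout that makes itPtr[16] readable (the "+1" guarantee noted in the code).

#include <stdint.h>
#include <string.h>

/* Scalar model of the stride-1 confirm computation; sketch for reference
 * only.  stride1_confirm_model() is a hypothetical name and uint8_t/
 * uint32_t/uint64_t stand in for the u8/u32/u64a typedefs. */
static void stride1_confirm_model(const uint8_t *itPtr, uint32_t domain_mask,
                                  const uint64_t *ft,
                                  uint64_t *conf0, uint64_t *conf8,
                                  uint8_t state[16]) {
    uint8_t st[16];
    memcpy(st, state, sizeof(st));               /* carried-in state (*s) */

    for (int half = 0; half < 2; half++) {
        for (int i = 0; i < 8; i++) {
            int pos = half * 8 + i;
            /* 16-bit window at byte offset pos, masked down to the domain */
            uint32_t idx = domain_mask &
                           (itPtr[pos] | ((uint32_t)itPtr[pos + 1] << 8));
            const uint8_t *entry = (const uint8_t *)(ft + idx);
            /* OR the 8-byte table entry in, moved up by i bytes: the scalar
             * equivalent of load_m128_from_u64a() + lshiftbyte_m128(x, i) */
            for (int b = 0; b < 8; b++) {
                st[i + b] |= entry[b];
            }
        }
        /* movq(st) ^ ~0ULL: the complemented low 64 bits become the confirm
         * word handed to do_confirm_fdr() */
        uint64_t low;
        memcpy(&low, st, sizeof(low));
        *(half ? conf8 : conf0) = ~low;
        /* rshiftbyte_m128(st, 8): discard the half that was just confirmed */
        memmove(st, st + 8, 8);
        memset(st + 8, 0, 8);
    }
    memcpy(state, st, sizeof(st));               /* state for next iteration */
}

Each byte offset contributes a 16-bit window masked down to the domain, the matching 8-byte table entry is OR'd into the running state at that offset within its half, and the complemented low 64 bits form the conf0/conf8 words passed to do_confirm_fdr(). Patch 4 performs this same computation but interleaves the loads, byte shifts and ORs, presumably to shorten dependency chains; as its commit message notes, that ordering helps on x86 but hurts on ARM and still needs investigation.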