Fdr get conf optimization #311

Open
wants to merge 2 commits into base: develop
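As the diff shows, the get_conf_stride_* helpers now take the 16-bit fdr->domainMask directly instead of a flipped 32-bit mask, and get_conf_stride_1 / get_conf_stride_2 gain a NEON path (HAVE_NEON) that extracts all of the masked reach values from a single 16-byte load. The snippet below is a minimal standalone sketch of the even/odd lane trick used in the stride-1 path, not code from this PR; it assumes a little-endian NEON target, and the block contents, mask value, and names are invented for illustration.

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    // Made-up test block and domain mask; the real code reads these from the
    // scan buffer and from fdr->domainMask.
    uint8_t block[17] = {1, 2, 3, 4, 5, 6, 7, 8, 9,
                         10, 11, 12, 13, 14, 15, 16, 17};
    uint16_t domain_mask = 0x0fff; // e.g. a 12-bit domain

    uint8x16_t input = vld1q_u8(block);
    // bytes 1..15 of the block followed by a zero byte
    uint8x16_t shifted = vextq_u8(input, vdupq_n_u8(0), 1);
    uint16x8_t mask = vdupq_n_u16(domain_mask);
    uint16x8_t even = vandq_u16(mask, vreinterpretq_u16_u8(input));   // offsets 0,2,..,14
    uint16x8_t odd = vandq_u16(mask, vreinterpretq_u16_u8(shifted));  // offsets 1,3,..,13

    uint16_t even_lanes[8], odd_lanes[8];
    vst1q_u16(even_lanes, even);
    vst1q_u16(odd_lanes, odd);

    // Offsets 0..14 come out of the two vectors; offset 15 needs a scalar
    // load because its high byte lies outside the 16-byte register.
    for (int i = 0; i < 15; i++) {
        uint16_t v;
        memcpy(&v, block + i, sizeof(v)); // scalar unaligned u16 load
        uint16_t expect = (uint16_t)(v & domain_mask);
        uint16_t got = (i & 1) ? odd_lanes[i / 2] : even_lanes[i / 2];
        printf("offset %2d: %s\n", i, got == expect ? "ok" : "MISMATCH");
    }
    uint16_t tail;
    memcpy(&tail, block + 15, sizeof(tail));
    printf("offset 15 (scalar): 0x%04x\n", (unsigned)(tail & domain_mask));
    return 0;
}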
77 changes: 66 additions & 11 deletions src/fdr/fdr.c
@@ -143,12 +143,39 @@ m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft,

static really_inline
void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
UNUSED const u8 *end_ptr, uint16_t domain_mask,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
/* +1: the zones ensure that we can read the byte at z->end */
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);
u64a domain_mask = ~domain_mask_flipped;

#if defined(HAVE_NEON)
uint8x16_t input = vld1q_u8(itPtr);
uint8x16_t shifted_input = vextq_u8(input, vdupq_n_u8(0), 1);
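// shifted_input holds bytes 1..15 of the block followed by a zero byte, so its u16 lanes sit at the odd byte offsets.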

uint16x8_t even = vreinterpretq_u16_u8(input);
uint16x8_t odd = vreinterpretq_u16_u8(shifted_input);
// Between the even and odd u16 views these cover reach0..reach14; reach15 still needs the scalar load below.

uint16x8_t vect_domain_mask = vdupq_n_u16(domain_mask);
even = vandq_u16(vect_domain_mask, even);
odd = vandq_u16(vect_domain_mask, odd);

uint16_t reach0 = vgetq_lane_u16(even, 0);
uint16_t reach1 = vgetq_lane_u16(odd, 0);
uint16_t reach2 = vgetq_lane_u16(even, 1);
uint16_t reach3 = vgetq_lane_u16(odd, 1);
uint16_t reach4 = vgetq_lane_u16(even, 2);
uint16_t reach5 = vgetq_lane_u16(odd, 2);
uint16_t reach6 = vgetq_lane_u16(even, 3);
uint16_t reach7 = vgetq_lane_u16(odd, 3);
uint16_t reach8 = vgetq_lane_u16(even, 4);
uint16_t reach9 = vgetq_lane_u16(odd, 4);
uint16_t reach10 = vgetq_lane_u16(even, 5);
uint16_t reach11 = vgetq_lane_u16(odd, 5);
uint16_t reach12 = vgetq_lane_u16(even, 6);
uint16_t reach13 = vgetq_lane_u16(odd, 6);
uint16_t reach14 = vgetq_lane_u16(even, 7);
uint16_t reach15 = domain_mask & unaligned_load_u16(itPtr + 15);
#else
u64a it_hi = *(const u64a *)itPtr;
u64a it_lo = *(const u64a *)(itPtr + 8);
u64a reach0 = domain_mask & it_hi;
@@ -167,6 +194,7 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,
u64a reach13 = domain_mask & (it_lo >> 40);
u64a reach14 = domain_mask & (it_lo >> 48);
u64a reach15 = domain_mask & unaligned_load_u32(itPtr + 15);
#endif

m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st1 = lshiftbyte_m128(load_m128_from_u64a(ft + reach1), 1);
@@ -212,10 +240,37 @@ void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr,

static really_inline
void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
UNUSED const u8 *end_ptr, uint16_t domain_mask,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);

#if defined(HAVE_NEON)
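// With a stride of 2 only the u16 values at even byte offsets are needed, so a single reinterpret of the 16-byte block covers reach0, reach2, ..., reach14.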
uint8x16_t input = vld1q_u8(itPtr);
uint16x8_t even = vreinterpretq_u16_u8(input);

uint16x8_t vect_domain_mask = vdupq_n_u16(domain_mask);
even = vandq_u16(vect_domain_mask, even);

uint16_t reach0 = vgetq_lane_u16(even, 0);
uint16_t reach2 = vgetq_lane_u16(even, 1);
uint16_t reach4 = vgetq_lane_u16(even, 2);
uint16_t reach6 = vgetq_lane_u16(even, 3);

m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st2 = load_m128_from_u64a(ft + reach2);
m128 st4 = load_m128_from_u64a(ft + reach4);
m128 st6 = load_m128_from_u64a(ft + reach6);

uint16_t reach8 = vgetq_lane_u16(even, 4);
uint16_t reach10 = vgetq_lane_u16(even, 5);
uint16_t reach12 = vgetq_lane_u16(even, 6);
uint16_t reach14 = vgetq_lane_u16(even, 7);

m128 st8 = load_m128_from_u64a(ft + reach8);
m128 st10 = load_m128_from_u64a(ft + reach10);
m128 st12 = load_m128_from_u64a(ft + reach12);
m128 st14 = load_m128_from_u64a(ft + reach14);
#else
u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach2 = andn(domain_mask_flipped, itPtr + 2);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
@@ -235,6 +290,7 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,
m128 st10 = load_m128_from_u64a(ft + reach10);
m128 st12 = load_m128_from_u64a(ft + reach12);
m128 st14 = load_m128_from_u64a(ft + reach14);
#endif

st2 = lshiftbyte_m128(st2, 2);
st4 = lshiftbyte_m128(st4, 4);
@@ -265,14 +321,14 @@ void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr,

static really_inline
void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr,
UNUSED const u8 *end_ptr, u32 domain_mask_flipped,
UNUSED const u8 *end_ptr, uint16_t domain_mask,
const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) {
assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr);

u64a reach0 = andn(domain_mask_flipped, itPtr);
u64a reach4 = andn(domain_mask_flipped, itPtr + 4);
u64a reach8 = andn(domain_mask_flipped, itPtr + 8);
u64a reach12 = andn(domain_mask_flipped, itPtr + 12);
u64a reach0 = domain_mask & unaligned_load_u16(itPtr);
u64a reach4 = domain_mask & unaligned_load_u16(itPtr + 4);
u64a reach8 = domain_mask & unaligned_load_u16(itPtr + 8);
u64a reach12 = domain_mask & unaligned_load_u16(itPtr + 12);

m128 st0 = load_m128_from_u64a(ft + reach0);
m128 st4 = load_m128_from_u64a(ft + reach4);
@@ -683,7 +739,7 @@ size_t prepareZones(const u8 *buf, size_t len, const u8 *hend,
__builtin_prefetch(itPtr + ITER_BYTES); \
u64a conf0; \
u64a conf8; \
get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \
get_conf_fn(itPtr, start_ptr, end_ptr, fdr->domainMask, \
ft, &conf0, &conf8, &s); \
do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \
&last_match_id, zz); \
@@ -703,7 +759,6 @@ hwlm_error_t fdr_engine_exec(const struct FDR *fdr,

u32 floodBackoff = FLOOD_BACKOFF_START;
u32 last_match_id = INVALID_MATCH_ID;
u32 domain_mask_flipped = ~fdr->domainMask;
u8 stride = fdr->stride;
const u64a *ft =
(const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR)));
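For reference on the engine change at the end: fdr_engine_exec now passes fdr->domainMask straight into get_conf_fn instead of precomputing domain_mask_flipped = ~fdr->domainMask. The two formulations select the same bits as long as the load being masked covers the full domain width, which is why the stride-4 reach lines above mask a 16-bit load rather than a single dereferenced byte. Below is a tiny self-contained check of that identity; the helper names (andn_u32, mask_u16), the buffer, and the mask value are made up, and a little-endian target is assumed.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Old formulation: mask a 32-bit load with the complement of the flipped mask.
static uint64_t andn_u32(uint32_t flipped, const uint8_t *p) {
    uint32_t v;
    memcpy(&v, p, sizeof(v));
    return ~flipped & v;
}

// New formulation: mask a 16-bit load directly with the (at most 16-bit) domain mask.
static uint64_t mask_u16(uint16_t mask, const uint8_t *p) {
    uint16_t v;
    memcpy(&v, p, sizeof(v));
    return (uint64_t)(mask & v);
}

int main(void) {
    const uint8_t buf[8] = {0xde, 0xad, 0xbe, 0xef, 0x12, 0x34, 0x56, 0x78};
    uint16_t domain_mask = 0x1fff;             // e.g. a 13-bit domain
    uint32_t flipped = ~(uint32_t)domain_mask; // what the engine used to pass down

    for (int i = 0; i + 4 <= (int)sizeof(buf); i++) {
        assert(andn_u32(flipped, buf + i) == mask_u16(domain_mask, buf + i));
    }
    printf("flipped-mask and direct-mask loads agree\n");
    return 0;
}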