forked from WojciechMula/sse-popcount
-
Notifications
You must be signed in to change notification settings - Fork 0
/
popcnt-avx512vbmi-lookup.cpp
46 lines (31 loc) · 1.74 KB
/
popcnt-avx512vbmi-lookup.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
/// Counts the set bits in the byte range `data[0 .. n)`.
///
/// Full 64-byte blocks are processed with an AVX-512 VBMI byte permute that
/// acts as a 128-entry lookup table; the remaining tail (fewer than 64 bytes)
/// is handled one byte at a time through `lookup8bit`.
///
/// @param data  pointer to the first input byte; loaded unaligned
/// @param n     number of bytes to process
/// @return      total number of set bits in the input
std::uint64_t popcnt_AVX512VBMI_lookup(const uint8_t* data, const size_t n) {
    size_t i = 0;

    // Byte k of lookup_lo holds popcount(k) for k = 0..63; byte k of lookup_hi
    // holds popcount(64 + k). Together they form a 128-entry table indexed by
    // the low 7 bits of an input byte.
    const __m512i lookup_lo = _mm512_setr_epi64(0x0302020102010100, 0x0403030203020201,
                                                0x0403030203020201, 0x0504040304030302,
                                                0x0403030203020201, 0x0504040304030302,
                                                0x0504040304030302, 0x0605050405040403);
    const __m512i lookup_hi = _mm512_setr_epi64(0x0403030203020201, 0x0504040304030302,
                                                0x0504040304030302, 0x0605050405040403,
                                                0x0504040304030302, 0x0605050405040403,
                                                0x0605050405040403, 0x0706060506050504);
    const __m512i lsb_mask = _mm512_set1_epi8(0x01);

    // One block adds at most 8 to each byte lane (7 from the table plus 1 for
    // the top bit), so 255/8 = 31 blocks can be summed before an 8-bit lane
    // could overflow.
    constexpr int blocks_per_flush = 255 / 8;

    __m512i acc = _mm512_setzero_si512();

    // `n - i >= 64` (rather than `i + 64 < n`) also vectorises a final block of
    // exactly 64 bytes and cannot wrap around for very large `n`; `i` never
    // exceeds `n`, so the subtraction is safe.
    while (n - i >= 64) {
        __m512i local = _mm512_setzero_si512();

        for (int k = 0; k < blocks_per_flush && n - i >= 64; k++, i += 64) {
            const __m512i vec = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(data + i));

            // get popcount from bits 6:0 -- bit 6 of each index byte selects
            // between lookup_lo and lookup_hi, bits 5:0 select the entry
            const __m512i lo = _mm512_permutex2var_epi8(lookup_lo, vec, lookup_hi);

            // and move 7th bit onto position 0 -- i.e. (x & 0x80 ? 1 : 0);
            // the 32-bit shift drags in bits from neighbouring bytes, which
            // the mask then discards
            const __m512i hi = _mm512_and_si512(_mm512_srli_epi32(vec, 7), lsb_mask);

            local = _mm512_add_epi8(local, lo);
            local = _mm512_add_epi8(local, hi);
        }

        // SAD against zero sums each group of eight byte counters into a
        // 64-bit lane, which is then added to the wide accumulator.
        acc = _mm512_add_epi64(acc, _mm512_sad_epu8(local, _mm512_setzero_si512()));
    }

    // NOTE(review): custom::_mm512_hsum_epi64 is defined elsewhere; presumably
    // it returns the horizontal sum of the eight 64-bit lanes -- confirm.
    uint64_t result = custom::_mm512_hsum_epi64(acc);

    // Scalar tail. NOTE(review): lookup8bit is defined elsewhere; presumably a
    // 256-entry per-byte popcount table -- confirm.
    for (/**/; i < n; i++) {
        result += lookup8bit[data[i]];
    }

    return result;
}