diff --git a/DFTTest/DFTTest.cpp b/DFTTest/DFTTest.cpp
index 51d7bc2..c9e9b98 100644
--- a/DFTTest/DFTTest.cpp
+++ b/DFTTest/DFTTest.cpp
@@ -7,9 +7,9 @@
 **
 ** Copyright (C) 2007-2010 Kevin Stone
 **
-** This program is free software; you can redistribute it and/or modify
+** This program is free software: you can redistribute it and/or modify
 ** it under the terms of the GNU General Public License as published by
-** the Free Software Foundation; either version 2 of the License, or
+** the Free Software Foundation, either version 3 of the License, or
 ** (at your option) any later version.
 **
 ** This program is distributed in the hope that it will be useful,
@@ -18,8 +18,7 @@
 ** GNU General Public License for more details.
 **
 ** You should have received a copy of the GNU General Public License
-** along with this program; if not, write to the Free Software
-** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+** along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
 
 #define _USE_MATH_DEFINES
@@ -27,249 +26,73 @@
 #include 
 #include 
-#include 
 #include 
-#include "DFTTest.hpp"
+#include "DFTTest.h"
 
-#ifdef VS_TARGET_CPU_X86
-#include "vectorclass/vectorclass.h"
+using namespace std::literals;
 
-template<int type> extern void filter_sse2(float *, const float *, const int, const float *, const float *, const float *) noexcept;
-template<int type> extern void filter_avx2(float *, const float *, const int, const float *, const float *, const float *) noexcept;
+#ifdef DFTTEST_X86
+#include "VCL2/vectorclass.h"
 
-template<typename T> extern void func_0_sse2(VSFrameRef * [3], VSFrameRef *, const DFTTestData *, const VSAPI *) noexcept;
-template<typename T> extern void func_0_avx2(VSFrameRef * [3], VSFrameRef *, const DFTTestData *, const VSAPI *) noexcept;
+template<int type> extern void filter_sse2(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
+template<int type> extern void filter_avx2(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
+template<int type> extern void filter_avx512(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
 
-template<typename T> extern void func_1_sse2(VSFrameRef * [15][3], VSFrameRef *, const int, const DFTTestData *, const VSAPI *) noexcept;
-template<typename T> extern void func_1_avx2(VSFrameRef * [15][3], VSFrameRef *, const int, const DFTTestData *, const VSAPI *) noexcept;
+template<typename pixel_t> extern void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template<typename pixel_t> extern void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template<typename pixel_t> extern void func_0_avx512(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+
+template<typename pixel_t> extern void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template<typename pixel_t> extern void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template<typename pixel_t> extern void func_1_avx512(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
 #endif
 
 #define EXTRA(a,b) (((a) % (b)) ?
((b) - ((a) % (b))) : 0) -struct NPInfo { - int fn, b, y, x; -}; - -static double besselI0(double p) noexcept { - p /= 2.; - double n = 1., t = 1., d = 1.; - int k = 1; - double v; - - do { - n *= p; - d *= k; - v = n / d; - t += v * v; - } while (++k < 15 && v > 1e-8); - - return t; -} - -static double getWinValue(const double n, const double size, const int win, const double beta) noexcept { - switch (win) { - case 0: // hanning - return 0.5 - 0.5 * std::cos(2. * M_PI * n / size); - case 1: // hamming - return 0.53836 - 0.46164 * std::cos(2. * M_PI * n / size); - case 2: // blackman - return 0.42 - 0.5 * std::cos(2. * M_PI * n / size) + 0.08 * std::cos(4. * M_PI * n / size); - case 3: // 4 term blackman-harris - return 0.35875 - 0.48829 * std::cos(2. * M_PI * n / size) + 0.14128 * std::cos(4. * M_PI * n / size) - 0.01168 * std::cos(6. * M_PI * n / size); - case 4: // kaiser-bessel - { - const double v = 2. * n / size - 1.; - return besselI0(M_PI * beta * std::sqrt(1. - v * v)) / besselI0(M_PI * beta); - } - case 5: // 7 term blackman-harris - return 0.27105140069342415 - - 0.433297939234486060 * std::cos(2. * M_PI * n / size) + - 0.218122999543110620 * std::cos(4. * M_PI * n / size) - - 0.065925446388030898 * std::cos(6. * M_PI * n / size) + - 0.010811742098372268 * std::cos(8. * M_PI * n / size) - - 7.7658482522509342E-4 * std::cos(10. * M_PI * n / size) + - 1.3887217350903198E-5 * std::cos(12. * M_PI * n / size); - case 6: // flat top - return 0.2810639 - 0.5208972 * std::cos(2. * M_PI * n / size) + 0.1980399 * std::cos(4. * M_PI * n / size); - case 7: // rectangular - return 1.; - case 8: // Bartlett - return 2. / size * (size / 2. - std::abs(n - size / 2.)); - case 9: // Bartlett-Hann - return 0.62 - 0.48 * (n / size - 0.5) - 0.38 * std::cos(2. * M_PI * n / size); - case 10: // Nuttall - return 0.355768 - 0.487396 * std::cos(2. * M_PI * n / size) + 0.144232 * std::cos(4. * M_PI * n / size) - 0.012604 * std::cos(6. * M_PI * n / size); - case 11: // Blackman-Nuttall - return 0.3635819 - 0.4891775 * std::cos(2. * M_PI * n / size) + 0.1365995 * std::cos(4. * M_PI * n / size) - 0.0106411 * std::cos(6. * M_PI * n / size); - default: - return 0.; - } -} - -static void normalizeForOverlapAdd(double * VS_RESTRICT hw, const int bsize, const int osize) noexcept { - double * VS_RESTRICT nw = new double[bsize](); - const int inc = bsize - osize; - - for (int q = 0; q < bsize; q++) { - for (int h = q; h >= 0; h -= inc) - nw[q] += hw[h] * hw[h]; - for (int h = q + inc; h < bsize; h += inc) - nw[q] += hw[h] * hw[h]; - } - - for (int q = 0; q < bsize; q++) - hw[q] /= std::sqrt(nw[q]); - - delete[] nw; -} - -static void createWindow(float * VS_RESTRICT hw, const int tmode, const int smode, const DFTTestData * d) noexcept { - double * VS_RESTRICT tw = new double[d->tbsize]; - for (int j = 0; j < d->tbsize; j++) - tw[j] = getWinValue(j + 0.5, d->tbsize, d->twin, d->tbeta); - if (tmode == 1) - normalizeForOverlapAdd(tw, d->tbsize, d->tosize); - - double * VS_RESTRICT sw = new double[d->sbsize]; - for (int j = 0; j < d->sbsize; j++) - sw[j] = getWinValue(j + 0.5, d->sbsize, d->swin, d->sbeta); - if (smode == 1) - normalizeForOverlapAdd(sw, d->sbsize, d->sosize); - - const double nscale = 1. 
/ std::sqrt(d->bvolume); - for (int j = 0; j < d->tbsize; j++) - for (int k = 0; k < d->sbsize; k++) - for (int q = 0; q < d->sbsize; q++) - hw[(j * d->sbsize + k) * d->sbsize + q] = static_cast(tw[j] * sw[k] * sw[q] * nscale); - - delete[] tw; - delete[] sw; +template +static auto getArg(const VSAPI * vsapi, const VSMap * map, const char * key, const arg_t defaultValue) noexcept { + arg_t arg{}; + int err{}; + + if constexpr (std::is_same_v) + arg = !!vsapi->propGetInt(map, key, 0, &err); + else if constexpr (std::is_same_v) + arg = int64ToIntS(vsapi->propGetInt(map, key, 0, &err)); + else if constexpr (std::is_same_v) + arg = vsapi->propGetInt(map, key, 0, &err); + else if constexpr (std::is_same_v) + arg = static_cast(vsapi->propGetFloat(map, key, 0, &err)); + else if constexpr (std::is_same_v) + arg = vsapi->propGetFloat(map, key, 0, &err); + + if (err) + arg = defaultValue; + + return arg; } -static float * parseSigmaLocation(const double * s, const int num, int & poscnt, const float sigma, const float pfact) { - float * parray = nullptr; - - if (!s) { - parray = new float[4]; - parray[0] = 0.0f; - parray[2] = 1.0f; - parray[1] = parray[3] = std::pow(sigma, pfact); - poscnt = 2; - } else { - const double * sT = s; - bool found[2] = { false, false }; - poscnt = 0; - - for (int i = 0; i < num; i += 2) { - const float pos = static_cast(sT[i]); - - if (pos < 0.0f || pos > 1.0f) - throw std::string{ "sigma location - invalid pos (" } + std::to_string(pos) + ")"; - - if (pos == 0.0f) - found[0] = true; - else if (pos == 1.0f) - found[1] = true; - - poscnt++; - } - - if (!found[0] || !found[1]) - throw std::string{ "sigma location - one or more end points not provided" }; - - parray = new float[poscnt * 2]; - sT = s; - poscnt = 0; - - for (int i = 0; i < num; i += 2) { - parray[poscnt * 2 + 0] = static_cast(sT[i + 0]); - parray[poscnt * 2 + 1] = std::pow(static_cast(sT[i + 1]), pfact); - - poscnt++; - } - - for (int i = 1; i < poscnt; i++) { - int j = i; - const float t0 = parray[j * 2 + 0]; - const float t1 = parray[j * 2 + 1]; - - while (j > 0 && parray[(j - 1) * 2] > t0) { - parray[j * 2 + 0] = parray[(j - 1) * 2 + 0]; - parray[j * 2 + 1] = parray[(j - 1) * 2 + 1]; - j--; - } - - parray[j * 2 + 0] = t0; - parray[j * 2 + 1] = t1; - } - } - - return parray; -} - -static float interp(const float pf, const float * pv, const int cnt) noexcept { - int lidx = 0; - for (int i = cnt - 1; i >= 0; i--) { - if (pv[i * 2] <= pf) { - lidx = i; - break; - } - } - - int hidx = cnt - 1; - for (int i = 0; i < cnt; i++) { - if (pv[i * 2] >= pf) { - hidx = i; - break; - } - } - - const float d0 = pf - pv[lidx * 2]; - const float d1 = pv[hidx * 2] - pf; - - if (hidx == lidx || d0 <= 0.0f) - return pv[lidx * 2 + 1]; - if (d1 <= 0.0f) - return pv[hidx * 2 + 1]; - - const float tf = d0 / (d0 + d1); - return pv[lidx * 2 + 1] * (1.0f - tf) + pv[hidx * 2 + 1] * tf; -} - -static float getSVal(const int pos, const int len, const float * pv, const int cnt, float & pf) noexcept { - if (len == 1) { - pf = 0.0f; - return 1.0f; - } - - const int ld2 = len / 2; - if (pos > ld2) - pf = (len - pos) / static_cast(ld2); - else - pf = pos / static_cast(ld2); - - return interp(pf, pv, cnt); -} - -template -static void copyPad(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestData * d, const VSAPI * vsapi) noexcept { +template +static auto copyPad(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { for (int plane = 0; plane < d->vi->format->numPlanes; 
plane++) { if (d->process[plane]) { const int srcWidth = vsapi->getFrameWidth(src, plane); const int dstWidth = vsapi->getFrameWidth(dst[plane], 0); const int srcHeight = vsapi->getFrameHeight(src, plane); const int dstHeight = vsapi->getFrameHeight(dst[plane], 0); - const int dstStride = vsapi->getStride(dst[plane], 0) / sizeof(T); + const int dstStride = vsapi->getStride(dst[plane], 0) / sizeof(pixel_t); const int offy = (dstHeight - srcHeight) / 2; const int offx = (dstWidth - srcWidth) / 2; - vs_bitblt(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * offy + offx * sizeof(T), vsapi->getStride(dst[plane], 0), - vsapi->getReadPtr(src, plane), vsapi->getStride(src, plane), - srcWidth * sizeof(T), srcHeight); + vs_bitblt(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * offy + offx * sizeof(pixel_t), + vsapi->getStride(dst[plane], 0), + vsapi->getReadPtr(src, plane), + vsapi->getStride(src, plane), + srcWidth * sizeof(pixel_t), + srcHeight); - T * VS_RESTRICT dstp = reinterpret_cast(vsapi->getWritePtr(dst[plane], 0)) + dstStride * offy; + pixel_t * VS_RESTRICT dstp = reinterpret_cast(vsapi->getWritePtr(dst[plane], 0)) + dstStride * offy; for (int y = offy; y < srcHeight + offy; y++) { int w = offx * 2; @@ -287,25 +110,22 @@ static void copyPad(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestDa for (int y = 0; y < offy; y++, w--) memcpy(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * y, vsapi->getReadPtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * w, - dstWidth * sizeof(T)); + dstWidth * sizeof(pixel_t)); w = offy + srcHeight - 2; for (int y = offy + srcHeight; y < dstHeight; y++, w--) memcpy(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * y, vsapi->getReadPtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * w, - dstWidth * sizeof(T)); + dstWidth * sizeof(pixel_t)); } } } -template -static inline void proc0(const T * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept; - -template<> -inline void proc0(const uint8_t * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept { +template +static inline auto proc0(const pixel_t * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float srcScale) noexcept { for (int u = 0; u < p1; u++) { for (int v = 0; v < p1; v++) - d[v] = s0[v] * s1[v]; + d[v] = s0[v] * srcScale * s1[v]; s0 += p0; s1 += p1; @@ -313,31 +133,7 @@ inline void proc0(const uint8_t * s0, const float * s1, float * VS_RESTRICT d, c } } -template<> -inline void proc0(const uint16_t * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept { - for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v++) - d[v] = s0[v] * divisor * s1[v]; - - s0 += p0; - s1 += p1; - d += p1; - } -} - -template<> -inline void proc0(const float * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept { - for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v++) - d[v] = s0[v] * 255.0f * s1[v]; - - s0 += p0; - s1 += p1; - d += p1; - } -} - -static inline void proc1(const float * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1) noexcept { +static inline auto proc1(const float * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1) noexcept { for (int u = 0; u < p0; u++) { for (int v = 0; v < p0; v++) d[v] += s0[v] * s1[v]; @@ -348,179 +144,128 
@@ static inline void proc1(const float * s0, const float * s1, float * VS_RESTRICT } } -static inline void removeMean(float * VS_RESTRICT dftc, const float * dftgc, const int ccnt, float * VS_RESTRICT dftc2) noexcept { +static inline auto removeMean(float * VS_RESTRICT dftc, const float * dftgc, const int ccnt, float * VS_RESTRICT dftc2) noexcept { const float gf = dftc[0] / dftgc[0]; for (int h = 0; h < ccnt; h += 2) { - dftc2[h] = gf * dftgc[h]; + dftc2[h + 0] = gf * dftgc[h + 0]; dftc2[h + 1] = gf * dftgc[h + 1]; - dftc[h] -= dftc2[h]; + dftc[h + 0] -= dftc2[h + 0]; dftc[h + 1] -= dftc2[h + 1]; } } -static inline void addMean(float * VS_RESTRICT dftc, const int ccnt, const float * dftc2) noexcept { +static inline auto addMean(float * VS_RESTRICT dftc, const int ccnt, const float * dftc2) noexcept { for (int h = 0; h < ccnt; h += 2) { - dftc[h] += dftc2[h]; + dftc[h + 0] += dftc2[h + 0]; dftc[h + 1] += dftc2[h + 1]; } } template -static inline void filter_c(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; - -template<> -inline void filter_c<0>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 2) { - const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1]; - const float mult = std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f); - dftc[h] *= mult; - dftc[h + 1] *= mult; - } -} - -template<> -inline void filter_c<1>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 2) { - const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1]; - if (psd < sigmas[h]) - dftc[h] = dftc[h + 1] = 0.0f; - } -} - -template<> -inline void filter_c<2>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 2) { - dftc[h] *= sigmas[h]; - dftc[h + 1] *= sigmas[h]; - } -} +static inline void filter_c(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { + const float beta = pmin[0]; -template<> -inline void filter_c<3>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { for (int h = 0; h < ccnt; h += 2) { - const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1]; - - if (psd >= pmin[h] && psd <= pmax[h]) { - dftc[h] *= sigmas[h]; + float psd, mult; + + if constexpr (type != 2) + psd = dftc[h + 0] * dftc[h + 0] + dftc[h + 1] * dftc[h + 1]; + + if constexpr (type == 0) { + mult = std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f); + } else if constexpr (type == 1) { + if (psd < sigmas[h]) + dftc[h + 0] = dftc[h + 1] = 0.0f; + } else if constexpr (type == 2) { + dftc[h + 0] *= sigmas[h]; dftc[h + 1] *= sigmas[h]; + } else if constexpr (type == 3) { + if (psd >= pmin[h] && psd <= pmax[h]) { + dftc[h + 0] *= sigmas[h]; + dftc[h + 1] *= sigmas[h]; + } else { + dftc[h + 0] *= sigmas2[h]; + dftc[h + 1] *= sigmas2[h]; + } + } else if constexpr (type == 4) { + mult = sigmas[h] * std::sqrt(psd * pmax[h] / ((psd + pmin[h]) * (psd + pmax[h]) + 1e-15f)); + } else if constexpr (type == 5) { + mult = std::pow(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f), beta); } else { - dftc[h] *= sigmas2[h]; - 
dftc[h + 1] *= sigmas2[h]; + mult = std::sqrt(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f)); } - } -} -template<> -inline void filter_c<4>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 2) { - const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1] + 1e-15f; - const float mult = sigmas[h] * std::sqrt(psd * pmax[h] / ((psd + pmin[h]) * (psd + pmax[h]))); - dftc[h] *= mult; - dftc[h + 1] *= mult; - } -} - -template<> -inline void filter_c<5>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - const float beta = pmin[0]; - - for (int h = 0; h < ccnt; h += 2) { - const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1]; - const float mult = std::pow(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f), beta); - dftc[h] *= mult; - dftc[h + 1] *= mult; - } -} - -template<> -inline void filter_c<6>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 2) { - const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1]; - const float mult = std::sqrt(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f)); - dftc[h] *= mult; - dftc[h + 1] *= mult; - } -} - -template -static void cast(const float * ebp, T * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, - const float multiplier, const int peak) noexcept; - -template<> -void cast(const float * ebp, uint8_t * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, - const float multiplier, const int peak) noexcept { - for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x++) - dstp[x] = std::min(std::max(static_cast(ebp[x] + 0.5f), 0), 255); - - ebp += ebpStride; - dstp += dstStride; + if constexpr (type == 0 || type > 3) { + dftc[h + 0] *= mult; + dftc[h + 1] *= mult; + } } } -template<> -void cast(const float * ebp, uint16_t * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, - const float multiplier, const int peak) noexcept { +template +static auto cast(const float * ebp, pixel_t * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, + const float dstScale, const int peak) noexcept { for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x++) - dstp[x] = std::min(std::max(static_cast(ebp[x] * multiplier + 0.5f), 0), peak); + for (int x = 0; x < dstWidth; x++) { + if constexpr (std::is_integral_v) + dstp[x] = std::clamp(static_cast(ebp[x] * dstScale + 0.5f), 0, peak); + else + dstp[x] = ebp[x] * dstScale; + } ebp += ebpStride; dstp += dstStride; } } -template<> -void cast(const float * ebp, float * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, - const float multiplier, const int peak) noexcept { - for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x++) - dstp[x] = ebp[x] * (1.0f / 255.0f); +template +static void func_0_c(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs 
= d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); - ebp += ebpStride; - dstp += dstStride; - } -} - -template -static void func_0_c(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept { const auto threadId = std::this_thread::get_id(); - float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0)); - float * dftr = d->dftr.at(threadId); - fftwf_complex * dftc = d->dftc.at(threadId); - fftwf_complex * dftc2 = d->dftc2.at(threadId); + float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = d->dftc2.at(threadId).get(); for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->process[plane]) { const int width = d->padWidth[plane]; const int height = d->padHeight[plane]; const int eheight = d->eheight[plane]; - const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(T); - const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float); - const T * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0)); + const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0)); float * ebpSaved = ebuff; memset(ebuff, 0, ebpStride * height * sizeof(float)); for (int y = 0; y < eheight; y += d->inc) { for (int x = 0; x <= width - d->sbsize; x += d->inc) { - proc0(srcp + x, d->hw, dftr, srcStride, d->sbsize, d->divisor); + proc0(srcp + x, hw, dftr, srcStride, d->sbsize, d->srcScale); - fftwf_execute_dft_r2c(d->ft, dftr, dftc); + fftwf_execute_dft_r2c(ft, dftr, dftc); if (d->zmean) - removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2)); + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); - d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2); + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? 
&d->f0beta : pmins, pmaxs, sigmas2); if (d->zmean) addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); - fftwf_execute_dft_c2r(d->fti, dftc, dftr); + fftwf_execute_dft_c2r(fti, dftc, dftr); if (d->type & 1) // spatial overlapping - proc1(dftr, d->hw, ebpSaved + x, d->sbsize, ebpStride); + proc1(dftr, hw, ebpSaved + x, d->sbsize, ebpStride); else - ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * d->hw[d->sbd1 * d->sbsize + d->sbd1]; + ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * hw[d->sbd1 * d->sbsize + d->sbd1]; } srcp += srcStride * d->inc; @@ -529,54 +274,64 @@ static void func_0_c(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const int dstWidth = vsapi->getFrameWidth(dst, plane); const int dstHeight = vsapi->getFrameHeight(dst, plane); - const int dstStride = vsapi->getStride(dst, plane) / sizeof(T); - T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; - cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak); + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); } } } -template -static void func_1_c(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept { +template +static void func_1_c(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs = d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); + const auto threadId = std::this_thread::get_id(); - float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0)); - float * dftr = d->dftr.at(threadId); - fftwf_complex * dftc = d->dftc.at(threadId); - fftwf_complex * dftc2 = d->dftc2.at(threadId); + float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = d->dftc2.at(threadId).get(); for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->process[plane]) { const int width = d->padWidth[plane]; const int height = d->padHeight[plane]; const int eheight = d->eheight[plane]; - const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(T); - const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float); - const T * srcp[15] = {}; + const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + + const pixel_t * srcp[15] = {}; for (int i = 0; i < d->tbsize; i++) - srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0)); + srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0)); memset(ebuff, 0, ebpStride * height * sizeof(float)); for (int y = 0; y < eheight; y += d->inc) { for (int x = 0; x <= width - d->sbsize; x += d->inc) { for (int z = 0; z < d->tbsize; z++) - proc0(srcp[z] + x, d->hw + d->barea * z, dftr + d->barea * z, srcStride, 
d->sbsize, d->divisor); + proc0(srcp[z] + x, hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->srcScale); - fftwf_execute_dft_r2c(d->ft, dftr, dftc); + fftwf_execute_dft_r2c(ft, dftr, dftc); if (d->zmean) - removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2)); + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); - d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2); + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? &d->f0beta : pmins, pmaxs, sigmas2); if (d->zmean) addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); - fftwf_execute_dft_c2r(d->fti, dftc, dftr); + fftwf_execute_dft_c2r(fti, dftc, dftr); if (d->type & 1) // spatial overlapping - proc1(dftr + pos * d->barea, d->hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); + proc1(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); else - ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * d->hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1]; + ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1]; } for (int q = 0; q < d->tbsize; q++) @@ -585,109 +340,14 @@ static void func_1_c(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, c const int dstWidth = vsapi->getFrameWidth(dst, plane); const int dstHeight = vsapi->getFrameHeight(dst, plane); - const int dstStride = vsapi->getStride(dst, plane) / sizeof(T); - T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; - cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak); + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); } } } -static void selectFunctions(const unsigned ftype, const unsigned opt, DFTTestData * d) noexcept { - if (ftype == 0) { - if (std::abs(d->f0beta - 1.0f) < 0.00005f) - d->filterCoeffs = filter_c<0>; - else if (std::abs(d->f0beta - 0.5f) < 0.00005f) - d->filterCoeffs = filter_c<6>; - else - d->filterCoeffs = filter_c<5>; - } else if (ftype == 1) { - d->filterCoeffs = filter_c<1>; - } else if (ftype == 2) { - d->filterCoeffs = filter_c<2>; - } else if (ftype == 3) { - d->filterCoeffs = filter_c<3>; - } else { - d->filterCoeffs = filter_c<4>; - } - - if (d->vi->format->bytesPerSample == 1) { - d->copyPad = copyPad; - d->func_0 = func_0_c; - d->func_1 = func_1_c; - } else if (d->vi->format->bytesPerSample == 2) { - d->copyPad = copyPad; - d->func_0 = func_0_c; - d->func_1 = func_1_c; - } else { - d->copyPad = copyPad; - d->func_0 = func_0_c; - d->func_1 = func_1_c; - } - -#ifdef VS_TARGET_CPU_X86 - const int iset = instrset_detect(); - - if ((opt == 0 && iset >= 8) || opt == 3) { - if (ftype == 0) { - if (std::abs(d->f0beta - 1.0f) < 0.00005f) - d->filterCoeffs = filter_avx2<0>; - else if (std::abs(d->f0beta - 0.5f) < 0.00005f) - d->filterCoeffs = filter_avx2<6>; - else - d->filterCoeffs = filter_avx2<5>; - } else if (ftype == 1) { - d->filterCoeffs = filter_avx2<1>; - } else if (ftype == 2) { - d->filterCoeffs = filter_avx2<2>; - } else if (ftype == 3) { - 
d->filterCoeffs = filter_avx2<3>; - } else { - d->filterCoeffs = filter_avx2<4>; - } - - if (d->vi->format->bytesPerSample == 1) { - d->func_0 = func_0_avx2; - d->func_1 = func_1_avx2; - } else if (d->vi->format->bytesPerSample == 2) { - d->func_0 = func_0_avx2; - d->func_1 = func_1_avx2; - } else { - d->func_0 = func_0_avx2; - d->func_1 = func_1_avx2; - } - } else if ((opt == 0 && iset >= 2) || opt == 2) { - if (ftype == 0) { - if (std::abs(d->f0beta - 1.0f) < 0.00005f) - d->filterCoeffs = filter_sse2<0>; - else if (std::abs(d->f0beta - 0.5f) < 0.00005f) - d->filterCoeffs = filter_sse2<6>; - else - d->filterCoeffs = filter_sse2<5>; - } else if (ftype == 1) { - d->filterCoeffs = filter_sse2<1>; - } else if (ftype == 2) { - d->filterCoeffs = filter_sse2<2>; - } else if (ftype == 3) { - d->filterCoeffs = filter_sse2<3>; - } else { - d->filterCoeffs = filter_sse2<4>; - } - - if (d->vi->format->bytesPerSample == 1) { - d->func_0 = func_0_sse2; - d->func_1 = func_1_sse2; - } else if (d->vi->format->bytesPerSample == 2) { - d->func_0 = func_0_sse2; - d->func_1 = func_1_sse2; - } else { - d->func_0 = func_0_sse2; - d->func_1 = func_1_sse2; - } - } -#endif -} - static void VS_CC dfttestInit(VSMap * in, VSMap * out, void ** instanceData, VSNode * node, VSCore * core, const VSAPI * vsapi) { DFTTestData * d = static_cast(*instanceData); vsapi->setVideoInfo(d->vi, 1, node); @@ -710,25 +370,27 @@ static const VSFrameRef * VS_CC dfttestGetFrame(int n, int activationReason, voi auto threadId = std::this_thread::get_id(); if (!d->ebuff.count(threadId)) { - d->ebuff.emplace(threadId, vsapi->newVideoFrame(vsapi->registerFormat(cmGray, stFloat, 32, 0, 0, core), d->padWidth[0], d->padHeight[0], nullptr, core)); + d->ebuff.emplace(threadId, + unique_VSFrameRef{ vsapi->newVideoFrame(vsapi->registerFormat(cmGray, stFloat, 32, 0, 0, core), d->padWidth[0], d->padHeight[0], nullptr, core), + vsapi->freeFrame }); - float * dftr = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32); + float * dftr = vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64); if (!dftr) - throw std::string{ "malloc failure (dftr)" }; - d->dftr.emplace(threadId, dftr); + throw "malloc failure (dftr)"; + d->dftr.emplace(threadId, unique_float{ dftr, vs_aligned_free }); - fftwf_complex * dftc = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32); + fftwf_complex * dftc = vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64); if (!dftc) - throw std::string{ "malloc failure (dftc)" }; - d->dftc.emplace(threadId, dftc); + throw "malloc failure (dftc)"; + d->dftc.emplace(threadId, unique_fftwf_complex{ dftc, vs_aligned_free }); - fftwf_complex * dftc2 = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32); + fftwf_complex * dftc2 = vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64); if (!dftc2) - throw std::string{ "malloc failure (dftc2)" }; - d->dftc2.emplace(threadId, dftc2); + throw "malloc failure (dftc2)"; + d->dftc2.emplace(threadId, unique_fftwf_complex{ dftc2, vs_aligned_free }); } - } catch (const std::string & error) { - vsapi->setFilterError(("DFTTest: " + error).c_str(), frameCtx); + } catch (const char * error) { + vsapi->setFilterError(("DFTTest: "s + error).c_str(), frameCtx); return nullptr; } @@ -760,7 +422,7 @@ static const VSFrameRef * VS_CC dfttestGetFrame(int n, int activationReason, voi const int pos = d->tbsize / 2; for (int i = n - pos; i <= n + pos; i++) { - src[i - n + pos] = vsapi->getFrameFilter(std::min(std::max(i, 0), d->vi->numFrames - 1), d->node, frameCtx); + src[i - n + 
pos] = vsapi->getFrameFilter(std::clamp(i, 0, d->vi->numFrames - 1), d->node, frameCtx); for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->process[plane]) @@ -788,31 +450,7 @@ static const VSFrameRef * VS_CC dfttestGetFrame(int n, int activationReason, voi static void VS_CC dfttestFree(void * instanceData, VSCore * core, const VSAPI * vsapi) { DFTTestData * d = static_cast(instanceData); - vsapi->freeNode(d->node); - - vs_aligned_free(d->hw); - vs_aligned_free(d->dftgc); - vs_aligned_free(d->sigmas); - vs_aligned_free(d->sigmas2); - vs_aligned_free(d->pmins); - vs_aligned_free(d->pmaxs); - - fftwf_destroy_plan(d->ft); - fftwf_destroy_plan(d->fti); - - for (auto & iter : d->ebuff) - vsapi->freeFrame(iter.second); - - for (auto & iter : d->dftr) - vs_aligned_free(iter.second); - - for (auto & iter : d->dftc) - vs_aligned_free(iter.second); - - for (auto & iter : d->dftc2) - vs_aligned_free(iter.second); - delete d; } @@ -820,192 +458,399 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, std::unique_ptr d = std::make_unique(); int err; - d->node = vsapi->propGetNode(in, "clip", 0, nullptr); - d->vi = vsapi->getVideoInfo(d->node); - - try { - if (!isConstantFormat(d->vi) || - (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) || - (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32)) - throw std::string{ "only constant format 8-16 bit integer and 32 bit float input supported" }; - - const int ftype = int64ToIntS(vsapi->propGetInt(in, "ftype", 0, &err)); - - float sigma = static_cast(vsapi->propGetFloat(in, "sigma", 0, &err)); - if (err) - sigma = 8.0f; - - float sigma2 = static_cast(vsapi->propGetFloat(in, "sigma2", 0, &err)); - if (err) - sigma2 = 8.0f; - - const float pmin = static_cast(vsapi->propGetFloat(in, "pmin", 0, &err)); - - float pmax = static_cast(vsapi->propGetFloat(in, "pmax", 0, &err)); - if (err) - pmax = 500.0f; - - d->sbsize = int64ToIntS(vsapi->propGetInt(in, "sbsize", 0, &err)); - if (err) - d->sbsize = 16; + auto createWindow = [&](unique_float & hw, const int tmode, const int smode) noexcept { + auto getWinValue = [](const double n, const double size, const int win, const double beta) noexcept { + auto besselI0 = [](double p) noexcept { + p /= 2.0; + double n = 1.0, t = 1.0, d = 1.0; + int k = 1; + double v; + + do { + n *= p; + d *= k; + v = n / d; + t += v * v; + } while (++k < 15 && v > 1e-8); + + return t; + }; + + switch (win) { + case 0: // hanning + return 0.5 - 0.5 * std::cos(2.0 * M_PI * n / size); + case 1: // hamming + return 0.53836 - 0.46164 * std::cos(2.0 * M_PI * n / size); + case 2: // blackman + return 0.42 - 0.5 * std::cos(2.0 * M_PI * n / size) + 0.08 * std::cos(4.0 * M_PI * n / size); + case 3: // 4 term blackman-harris + return 0.35875 - 0.48829 * std::cos(2.0 * M_PI * n / size) + 0.14128 * std::cos(4.0 * M_PI * n / size) - 0.01168 * std::cos(6.0 * M_PI * n / size); + case 4: // kaiser-bessel + { + const double v = 2.0 * n / size - 1.0; + return besselI0(M_PI * beta * std::sqrt(1.0 - v * v)) / besselI0(M_PI * beta); + } + case 5: // 7 term blackman-harris + return 0.27105140069342415 - + 0.433297939234486060 * std::cos(2.0 * M_PI * n / size) + + 0.218122999543110620 * std::cos(4.0 * M_PI * n / size) - + 0.065925446388030898 * std::cos(6.0 * M_PI * n / size) + + 0.010811742098372268 * std::cos(8.0 * M_PI * n / size) - + 7.7658482522509342e-4 * std::cos(10.0 * M_PI * n / size) + + 1.3887217350903198e-5 * std::cos(12.0 * M_PI * n / size); + case 6: 
// flat top + return 0.2810639 - 0.5208972 * std::cos(2.0 * M_PI * n / size) + 0.1980399 * std::cos(4.0 * M_PI * n / size); + case 7: // rectangular + return 1.0; + case 8: // Bartlett + return 2.0 / size * (size / 2.0 - std::abs(n - size / 2.0)); + case 9: // Bartlett-Hann + return 0.62 - 0.48 * (n / size - 0.5) - 0.38 * std::cos(2.0 * M_PI * n / size); + case 10: // Nuttall + return 0.355768 - 0.487396 * std::cos(2.0 * M_PI * n / size) + 0.144232 * std::cos(4.0 * M_PI * n / size) - 0.012604 * std::cos(6.0 * M_PI * n / size); + case 11: // Blackman-Nuttall + return 0.3635819 - 0.4891775 * std::cos(2.0 * M_PI * n / size) + 0.1365995 * std::cos(4.0 * M_PI * n / size) - 0.0106411 * std::cos(6.0 * M_PI * n / size); + default: + return 0.0; + } + }; - int smode = int64ToIntS(vsapi->propGetInt(in, "smode", 0, &err)); - if (err) - smode = 1; + auto normalizeForOverlapAdd = [](std::unique_ptr & hw, const int bsize, const int osize) noexcept { + std::unique_ptr nw = std::make_unique(bsize); + const int inc = bsize - osize; - d->sosize = int64ToIntS(vsapi->propGetInt(in, "sosize", 0, &err)); - if (err) - d->sosize = 12; + for (int q = 0; q < bsize; q++) { + for (int h = q; h >= 0; h -= inc) + nw[q] += hw[h] * hw[h]; + for (int h = q + inc; h < bsize; h += inc) + nw[q] += hw[h] * hw[h]; + } - d->tbsize = int64ToIntS(vsapi->propGetInt(in, "tbsize", 0, &err)); - if (err) - d->tbsize = 3; + for (int q = 0; q < bsize; q++) + hw[q] /= std::sqrt(nw[q]); + }; + + std::unique_ptr tw = std::make_unique(d->tbsize); + for (int j = 0; j < d->tbsize; j++) + tw[j] = getWinValue(j + 0.5, d->tbsize, d->twin, d->tbeta); + if (tmode == 1) + normalizeForOverlapAdd(tw, d->tbsize, d->tosize); + + std::unique_ptr sw = std::make_unique(d->sbsize); + for (int j = 0; j < d->sbsize; j++) + sw[j] = getWinValue(j + 0.5, d->sbsize, d->swin, d->sbeta); + if (smode == 1) + normalizeForOverlapAdd(sw, d->sbsize, d->sosize); + + const double nscale = 1.0 / std::sqrt(d->bvolume); + for (int j = 0; j < d->tbsize; j++) + for (int k = 0; k < d->sbsize; k++) + for (int q = 0; q < d->sbsize; q++) + hw[(j * d->sbsize + k) * d->sbsize + q] = static_cast(tw[j] * sw[k] * sw[q] * nscale); + }; + + auto interp = [](const float pf, const std::unique_ptr & pv, const int cnt) noexcept { + int lidx = 0; + for (int i = cnt - 1; i >= 0; i--) { + if (pv[i * 2] <= pf) { + lidx = i; + break; + } + } - const int tmode = int64ToIntS(vsapi->propGetInt(in, "tmode", 0, &err)); + int hidx = cnt - 1; + for (int i = 0; i < cnt; i++) { + if (pv[i * 2] >= pf) { + hidx = i; + break; + } + } - d->tosize = int64ToIntS(vsapi->propGetInt(in, "tosize", 0, &err)); + const float d0 = pf - pv[lidx * 2]; + const float d1 = pv[hidx * 2] - pf; - d->swin = int64ToIntS(vsapi->propGetInt(in, "swin", 0, &err)); + if (hidx == lidx || d0 <= 0.0f) + return pv[lidx * 2 + 1]; + if (d1 <= 0.0f) + return pv[hidx * 2 + 1]; - d->twin = int64ToIntS(vsapi->propGetInt(in, "twin", 0, &err)); - if (err) - d->twin = 7; + const float tf = d0 / (d0 + d1); + return pv[lidx * 2 + 1] * (1.0f - tf) + pv[hidx * 2 + 1] * tf; + }; - d->sbeta = static_cast(vsapi->propGetFloat(in, "sbeta", 0, &err)); - if (err) - d->sbeta = 2.5f; + auto getSVal = [&](const int pos, const int len, const std::unique_ptr & pv, const int cnt, float & pf) noexcept { + if (len == 1) { + pf = 0.0f; + return 1.0f; + } - d->tbeta = static_cast(vsapi->propGetFloat(in, "tbeta", 0, &err)); - if (err) - d->tbeta = 2.5f; + const int ld2 = len / 2; + pf = (pos > ld2 ? 
len - pos : pos) / static_cast(ld2); + return interp(pf, pv, cnt); + }; - d->zmean = !!vsapi->propGetInt(in, "zmean", 0, &err); - if (err) - d->zmean = true; + try { + d->node = vsapi->propGetNode(in, "clip", 0, nullptr); + d->vi = vsapi->getVideoInfo(d->node); - d->f0beta = static_cast(vsapi->propGetFloat(in, "f0beta", 0, &err)); - if (err) - d->f0beta = 1.0f; + if (!isConstantFormat(d->vi) || + (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) || + (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32)) + throw "only constant format 8-16 bit integer and 32 bit float input supported"; + + const int ftype = getArg(vsapi, in, "ftype", 0); + const float sigma = getArg(vsapi, in, "sigma", 8.0f); + const float sigma2 = getArg(vsapi, in, "sigma2", 8.0f); + const float pmin = getArg(vsapi, in, "pmin", 0.0f); + const float pmax = getArg(vsapi, in, "pmax", 500.0f); + d->sbsize = getArg(vsapi, in, "sbsize", 16); + const int smode = getArg(vsapi, in, "smode", 1); + d->sosize = getArg(vsapi, in, "sosize", 12); + d->tbsize = getArg(vsapi, in, "tbsize", 3); + const int tmode = getArg(vsapi, in, "tmode", 0); + d->tosize = getArg(vsapi, in, "tosize", 0); + d->swin = getArg(vsapi, in, "swin", 0); + d->twin = getArg(vsapi, in, "twin", 7); + d->sbeta = getArg(vsapi, in, "sbeta", 2.5); + d->tbeta = getArg(vsapi, in, "tbeta", 2.5); + d->zmean = getArg(vsapi, in, "zmean", true); + d->f0beta = getArg(vsapi, in, "f0beta", 1.0f); + const float alpha = getArg(vsapi, in, "alpha", ftype == 0 ? 5.0f : 7.0f); + const int ssystem = getArg(vsapi, in, "ssystem", 0); + const int opt = getArg(vsapi, in, "opt", 0); const int64_t * nlocation = vsapi->propGetIntArray(in, "nlocation", &err); - - float alpha = static_cast(vsapi->propGetFloat(in, "alpha", 0, &err)); - if (err) - alpha = (ftype == 0) ? 
5.0f : 7.0f; - const double * slocation = vsapi->propGetFloatArray(in, "slocation", &err); - const double * ssx = vsapi->propGetFloatArray(in, "ssx", &err); - const double * ssy = vsapi->propGetFloatArray(in, "ssy", &err); - const double * sst = vsapi->propGetFloatArray(in, "sst", &err); - const int ssystem = int64ToIntS(vsapi->propGetInt(in, "ssystem", 0, &err)); + const int numNlocation = vsapi->propNumElements(in, "nlocation"); + const int numSlocation = vsapi->propNumElements(in, "slocation"); + const int numSsx = vsapi->propNumElements(in, "ssx"); + const int numSsy = vsapi->propNumElements(in, "ssy"); + const int numSst = vsapi->propNumElements(in, "sst"); - const int m = vsapi->propNumElements(in, "planes"); + { + const int m = vsapi->propNumElements(in, "planes"); - for (int i = 0; i < 3; i++) - d->process[i] = (m <= 0); + for (int i = 0; i < 3; i++) + d->process[i] = (m <= 0); - for (int i = 0; i < m; i++) { - const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr)); + for (int i = 0; i < m; i++) { + const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr)); - if (n < 0 || n >= d->vi->format->numPlanes) - throw std::string{ "plane index out of range" }; + if (n < 0 || n >= d->vi->format->numPlanes) + throw "plane index out of range"; - if (d->process[n]) - throw std::string{ "plane specified twice" }; + if (d->process[n]) + throw "plane specified twice"; - d->process[n] = true; + d->process[n] = true; + } } - const int opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &err)); - if (ftype < 0 || ftype > 4) - throw std::string{ "ftype must be 0, 1, 2, 3, or 4" }; + throw "ftype must be 0, 1, 2, 3, or 4"; if (d->sbsize < 1) - throw std::string{ "sbsize must be greater than or equal to 1" }; + throw "sbsize must be greater than or equal to 1"; if (smode < 0 || smode > 1) - throw std::string{ "smode must be 0 or 1" }; + throw "smode must be 0 or 1"; if (smode == 0 && !(d->sbsize & 1)) - throw std::string{ "sbsize must be odd when using smode=0" }; + throw "sbsize must be odd when using smode=0"; if (smode == 0) d->sosize = 0; if (d->sosize < 0 || d->sosize >= d->sbsize) - throw std::string{ "sosize must be between 0 and sbsize-1 (inclusive)" }; + throw "sosize must be between 0 and sbsize-1 (inclusive)"; if (d->sosize > d->sbsize / 2 && d->sbsize % (d->sbsize - d->sosize) != 0) - throw std::string{ "spatial overlap greater than 50% requires that sbsize-sosize is a divisor of sbsize" }; + throw "spatial overlap greater than 50% requires that sbsize-sosize is a divisor of sbsize"; if (d->tbsize < 1 || d->tbsize > 15) - throw std::string{ "tbsize must be between 1 and 15 (inclusive)" }; + throw "tbsize must be between 1 and 15 (inclusive)"; if (tmode != 0) - throw std::string{ "tmode must be 0. tmode=1 is not implemented" }; + throw "tmode must be 0. 
tmode=1 is not implemented"; if (tmode == 0 && !(d->tbsize & 1)) - throw std::string{ "tbsize must be odd when using tmode=0" }; + throw "tbsize must be odd when using tmode=0"; if (tmode == 0) d->tosize = 0; if (d->tosize < 0 || d->tosize >= d->tbsize) - throw std::string{ "tosize must be between 0 and tbsize-1 (inclusive)" }; + throw "tosize must be between 0 and tbsize-1 (inclusive)"; if (d->tosize > d->tbsize / 2 && d->tbsize % (d->tbsize - d->tosize) != 0) - throw std::string{ "temporal overlap greater than 50% requires that tbsize-tosize is a divisor of tbsize" }; + throw "temporal overlap greater than 50% requires that tbsize-tosize is a divisor of tbsize"; if (d->tbsize > d->vi->numFrames) - throw std::string{ "tbsize must be less than or equal to the number of frames in the clip" }; + throw "tbsize must be less than or equal to the number of frames in the clip"; if (d->swin < 0 || d->swin > 11) - throw std::string{ "swin must be between 0 and 11 (inclusive)" }; + throw "swin must be between 0 and 11 (inclusive)"; if (d->twin < 0 || d->twin > 11) - throw std::string{ "twin must be between 0 and 11 (inclusive)" }; + throw "twin must be between 0 and 11 (inclusive)"; - if (nlocation && (vsapi->propNumElements(in, "nlocation") & 3)) - throw std::string{ "the number of elements in nlocation must be a multiple of 4" }; + if (nlocation && (numNlocation & 3)) + throw "number of elements in nlocation must be a multiple of 4"; if (alpha <= 0.0f) - throw std::string{ "alpha must be greater than 0.0" }; + throw "alpha must be greater than 0.0"; - if (slocation && (vsapi->propNumElements(in, "slocation") & 1)) - throw std::string{ "the number of elements in slocation must be a multiple of 2" }; + if (slocation && (numSlocation & 1)) + throw "number of elements in slocation must be a multiple of 2"; - if (ssx && (vsapi->propNumElements(in, "ssx") & 1)) - throw std::string{ "the number of elements in ssx must be a multiple of 2" }; + if (ssx && (numSsx & 1)) + throw "number of elements in ssx must be a multiple of 2"; - if (ssy && (vsapi->propNumElements(in, "ssy") & 1)) - throw std::string{ "the number of elements in ssy must be a multiple of 2" }; + if (ssy && (numSsy & 1)) + throw "number of elements in ssy must be a multiple of 2"; - if (sst && (vsapi->propNumElements(in, "sst") & 1)) - throw std::string{ "the number of elements in sst must be a multiple of 2" }; + if (sst && (numSst & 1)) + throw "number of elements in sst must be a multiple of 2"; if (ssystem < 0 || ssystem > 1) - throw std::string{ "ssystem must be 0 or 1" }; + throw "ssystem must be 0 or 1"; + + if (opt < 0 || opt > 4) + throw "opt must be 0, 1, 2, 3, or 4"; + + { + if (ftype == 0) { + if (std::abs(d->f0beta - 1.0f) < 0.00005f) + d->filterCoeffs = filter_c<0>; + else if (std::abs(d->f0beta - 0.5f) < 0.00005f) + d->filterCoeffs = filter_c<6>; + else + d->filterCoeffs = filter_c<5>; + } else if (ftype == 1) { + d->filterCoeffs = filter_c<1>; + } else if (ftype == 2) { + d->filterCoeffs = filter_c<2>; + } else if (ftype == 3) { + d->filterCoeffs = filter_c<3>; + } else { + d->filterCoeffs = filter_c<4>; + } - if (opt < 0 || opt > 3) - throw std::string{ "opt must be 0, 1, 2, or 3" }; + if (d->vi->format->bytesPerSample == 1) { + d->copyPad = copyPad; + d->func_0 = func_0_c; + d->func_1 = func_1_c; + } else if (d->vi->format->bytesPerSample == 2) { + d->copyPad = copyPad; + d->func_0 = func_0_c; + d->func_1 = func_1_c; + } else { + d->copyPad = copyPad; + d->func_0 = func_0_c; + d->func_1 = func_1_c; + } - const unsigned 
numThreads = vsapi->getCoreInfo(core)->numThreads; - d->ebuff.reserve(numThreads); - d->dftr.reserve(numThreads); - d->dftc.reserve(numThreads); - d->dftc2.reserve(numThreads); +#ifdef DFTTEST_X86 + const int iset = instrset_detect(); + if ((opt == 0 && iset >= 10) || opt == 4) { + if (ftype == 0) { + if (std::abs(d->f0beta - 1.0f) < 0.00005f) + d->filterCoeffs = filter_avx512<0>; + else if (std::abs(d->f0beta - 0.5f) < 0.00005f) + d->filterCoeffs = filter_avx512<6>; + else + d->filterCoeffs = filter_avx512<5>; + } else if (ftype == 1) { + d->filterCoeffs = filter_avx512<1>; + } else if (ftype == 2) { + d->filterCoeffs = filter_avx512<2>; + } else if (ftype == 3) { + d->filterCoeffs = filter_avx512<3>; + } else { + d->filterCoeffs = filter_avx512<4>; + } - selectFunctions(ftype, opt, d.get()); + if (d->vi->format->bytesPerSample == 1) { + d->func_0 = func_0_avx512; + d->func_1 = func_1_avx512; + } else if (d->vi->format->bytesPerSample == 2) { + d->func_0 = func_0_avx512; + d->func_1 = func_1_avx512; + } else { + d->func_0 = func_0_avx512; + d->func_1 = func_1_avx512; + } + } else if ((opt == 0 && iset >= 8) || opt == 3) { + if (ftype == 0) { + if (std::abs(d->f0beta - 1.0f) < 0.00005f) + d->filterCoeffs = filter_avx2<0>; + else if (std::abs(d->f0beta - 0.5f) < 0.00005f) + d->filterCoeffs = filter_avx2<6>; + else + d->filterCoeffs = filter_avx2<5>; + } else if (ftype == 1) { + d->filterCoeffs = filter_avx2<1>; + } else if (ftype == 2) { + d->filterCoeffs = filter_avx2<2>; + } else if (ftype == 3) { + d->filterCoeffs = filter_avx2<3>; + } else { + d->filterCoeffs = filter_avx2<4>; + } + + if (d->vi->format->bytesPerSample == 1) { + d->func_0 = func_0_avx2; + d->func_1 = func_1_avx2; + } else if (d->vi->format->bytesPerSample == 2) { + d->func_0 = func_0_avx2; + d->func_1 = func_1_avx2; + } else { + d->func_0 = func_0_avx2; + d->func_1 = func_1_avx2; + } + } else if ((opt == 0 && iset >= 2) || opt == 2) { + if (ftype == 0) { + if (std::abs(d->f0beta - 1.0f) < 0.00005f) + d->filterCoeffs = filter_sse2<0>; + else if (std::abs(d->f0beta - 0.5f) < 0.00005f) + d->filterCoeffs = filter_sse2<6>; + else + d->filterCoeffs = filter_sse2<5>; + } else if (ftype == 1) { + d->filterCoeffs = filter_sse2<1>; + } else if (ftype == 2) { + d->filterCoeffs = filter_sse2<2>; + } else if (ftype == 3) { + d->filterCoeffs = filter_sse2<3>; + } else { + d->filterCoeffs = filter_sse2<4>; + } + + if (d->vi->format->bytesPerSample == 1) { + d->func_0 = func_0_sse2; + d->func_1 = func_1_sse2; + } else if (d->vi->format->bytesPerSample == 2) { + d->func_0 = func_0_sse2; + d->func_1 = func_1_sse2; + } else { + d->func_0 = func_0_sse2; + d->func_1 = func_1_sse2; + } + } +#endif + } if (d->vi->format->sampleType == stInteger) { - d->multiplier = static_cast(1 << (d->vi->format->bitsPerSample - 8)); - d->divisor = 1.0f / d->multiplier; + d->dstScale = static_cast(1 << (d->vi->format->bitsPerSample - 8)); + d->srcScale = 1.0f / d->dstScale; d->peak = (1 << d->vi->format->bitsPerSample) - 1; + } else { + d->srcScale = 255.0f; + d->dstScale = 1.0f / 255.0f; } if (ftype != 0) @@ -1038,28 +883,31 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, } } - d->hw = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32); + d->hw = { vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free }; if (!d->hw) - throw std::string{ "malloc failure (hw)" }; - createWindow(d->hw, tmode, smode, d.get()); + throw "malloc failure (hw)"; + + createWindow(d->hw, tmode, smode); - float * dftgr = 
vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32); - d->dftgc = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32); + unique_float dftgr{ vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free }; + d->dftgc = { vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free }; if (!dftgr || !d->dftgc) - throw std::string{ "malloc failure (dftgr/dftgc)" }; + throw "malloc failure (dftgr/dftgc)"; + + fftwf_make_planner_thread_safe(); if (d->tbsize > 1) { - d->ft = fftwf_plan_dft_r2c_3d(d->tbsize, d->sbsize, d->sbsize, dftgr, d->dftgc, FFTW_PATIENT | FFTW_DESTROY_INPUT); - d->fti = fftwf_plan_dft_c2r_3d(d->tbsize, d->sbsize, d->sbsize, d->dftgc, dftgr, FFTW_PATIENT | FFTW_DESTROY_INPUT); + d->ft = { fftwf_plan_dft_r2c_3d(d->tbsize, d->sbsize, d->sbsize, dftgr.get(), d->dftgc.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan }; + d->fti = { fftwf_plan_dft_c2r_3d(d->tbsize, d->sbsize, d->sbsize, d->dftgc.get(), dftgr.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan }; } else { - d->ft = fftwf_plan_dft_r2c_2d(d->sbsize, d->sbsize, dftgr, d->dftgc, FFTW_PATIENT | FFTW_DESTROY_INPUT); - d->fti = fftwf_plan_dft_c2r_2d(d->sbsize, d->sbsize, d->dftgc, dftgr, FFTW_PATIENT | FFTW_DESTROY_INPUT); + d->ft = { fftwf_plan_dft_r2c_2d(d->sbsize, d->sbsize, dftgr.get(), d->dftgc.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan }; + d->fti = { fftwf_plan_dft_c2r_2d(d->sbsize, d->sbsize, d->dftgc.get(), dftgr.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan }; } float wscale = 0.0f; - const float * hwT = d->hw; - float * VS_RESTRICT dftgrT = dftgr; + const float * hwT = d->hw.get(); + float * VS_RESTRICT dftgrT = dftgr.get(); for (int s = 0; s < d->tbsize; s++) { for (int i = 0; i < d->sbsize; i++) { for (int k = 0; k < d->sbsize; k++) { @@ -1070,20 +918,81 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, dftgrT += d->sbsize; } } - fftwf_execute_dft_r2c(d->ft, dftgr, d->dftgc); - vs_aligned_free(dftgr); wscale = 1.0f / wscale; const float wscalef = (ftype < 2) ? 
wscale : 1.0f; - d->sigmas = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32); - d->sigmas2 = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32); - d->pmins = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32); - d->pmaxs = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32); + fftwf_execute_dft_r2c(d->ft.get(), dftgr.get(), d->dftgc.get()); + + d->sigmas = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free }; + d->sigmas2 = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free }; + d->pmins = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free }; + d->pmaxs = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free }; if (!d->sigmas || !d->sigmas2 || !d->pmins || !d->pmaxs) - throw std::string{ "malloc failure (sigmas/sigmas2/pmins/pmaxs)" }; + throw "malloc failure (sigmas/sigmas2/pmins/pmaxs)"; if (slocation || ssx || ssy || sst) { + auto parseSigmaLocation = [&](const double * s, const int num, int & poscnt, const float pfact) { + float * parray = nullptr; + + if (!s) { + parray = new float[4]; + parray[0] = 0.0f; + parray[2] = 1.0f; + parray[1] = parray[3] = std::pow(sigma, pfact); + poscnt = 2; + } else { + const double * sT = s; + bool found[] = { false, false }; + poscnt = 0; + + for (int i = 0; i < num; i += 2) { + const float pos = static_cast(sT[i]); + + if (pos < 0.0f || pos > 1.0f) + throw "sigma location - invalid pos (" + std::to_string(pos) + ")"; + + if (pos == 0.0f) + found[0] = true; + else if (pos == 1.0f) + found[1] = true; + + poscnt++; + } + + if (!found[0] || !found[1]) + throw "sigma location - one or more end points not provided"; + + parray = new float[poscnt * 2]; + sT = s; + poscnt = 0; + + for (int i = 0; i < num; i += 2) { + parray[poscnt * 2 + 0] = static_cast(sT[i + 0]); + parray[poscnt * 2 + 1] = std::pow(static_cast(sT[i + 1]), pfact); + + poscnt++; + } + + for (int i = 1; i < poscnt; i++) { + int j = i; + const float t0 = parray[j * 2 + 0]; + const float t1 = parray[j * 2 + 1]; + + while (j > 0 && parray[(j - 1) * 2] > t0) { + parray[j * 2 + 0] = parray[(j - 1) * 2 + 0]; + parray[j * 2 + 1] = parray[(j - 1) * 2 + 1]; + j--; + } + + parray[j * 2 + 0] = t0; + parray[j * 2 + 1] = t1; + } + } + + return parray; + }; + int ndim = 3; if (d->tbsize == 1) ndim -= 1; @@ -1092,16 +1001,16 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, const float ndiv = 1.0f / ndim; int tcnt = 0, sycnt = 0, sxcnt = 0; - float * tdata, * sydata, * sxdata; + std::unique_ptr tdata, sydata, sxdata; if (slocation) { - tdata = parseSigmaLocation(slocation, vsapi->propNumElements(in, "slocation"), tcnt, sigma, ssystem ? 1.0f : ndiv); - sydata = parseSigmaLocation(slocation, vsapi->propNumElements(in, "slocation"), sycnt, sigma, ssystem ? 1.0f : ndiv); - sxdata = parseSigmaLocation(slocation, vsapi->propNumElements(in, "slocation"), sxcnt, sigma, ssystem ? 1.0f : ndiv); + tdata = std::unique_ptr{ parseSigmaLocation(slocation, numSlocation, tcnt, ssystem ? 1.0f : ndiv) }; + sydata = std::unique_ptr{ parseSigmaLocation(slocation, numSlocation, sycnt, ssystem ? 1.0f : ndiv) }; + sxdata = std::unique_ptr{ parseSigmaLocation(slocation, numSlocation, sxcnt, ssystem ? 
1.0f : ndiv) }; } else { - tdata = parseSigmaLocation(sst, vsapi->propNumElements(in, "sst"), tcnt, sigma, ndiv); - sydata = parseSigmaLocation(ssy, vsapi->propNumElements(in, "ssy"), sycnt, sigma, ndiv); - sxdata = parseSigmaLocation(ssx, vsapi->propNumElements(in, "ssx"), sxcnt, sigma, ndiv); + tdata = std::unique_ptr{ parseSigmaLocation(sst, numSst, tcnt, ndiv) }; + sydata = std::unique_ptr{ parseSigmaLocation(ssy, numSsy, sycnt, ndiv) }; + sxdata = std::unique_ptr{ parseSigmaLocation(ssx, numSsx, sxcnt, ndiv) }; } const int cpx = d->sbsize / 2 + 1; @@ -1125,14 +1034,10 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, } const int pos = ((z * d->sbsize + y) * cpx + x) * 2; - d->sigmas[pos] = d->sigmas[pos + 1] = val / wscalef; + d->sigmas[pos + 0] = d->sigmas[pos + 1] = val / wscalef; } } } - - delete[] tdata; - delete[] sydata; - delete[] sxdata; } else { for (int i = 0; i < d->ccnt2; i++) d->sigmas[i] = sigma / wscalef; @@ -1145,17 +1050,22 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, } if (nlocation && ftype < 2) { - memset(d->sigmas, 0, d->ccnt2 * sizeof(float)); + struct NPInfo final { + int fn, b, y, x; + }; + + memset(d->sigmas.get(), 0, d->ccnt2 * sizeof(float)); - float * VS_RESTRICT hw2 = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32); + unique_float hw2{ vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free }; if (!hw2) - throw std::string{ "malloc failure (hw2)" }; - createWindow(hw2, 0, 0, d.get()); + throw "malloc failure (hw2)"; - float * VS_RESTRICT dftr = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32); - fftwf_complex * dftgc2 = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32); + createWindow(hw2, 0, 0); + + unique_float dftr{ vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free }; + unique_fftwf_complex dftgc2{ vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free }; if (!dftr || !dftgc2) - throw std::string{ "malloc failure (dftr/dftgc2)" }; + throw "malloc failure (dftr/dftgc2)"; float wscale2 = 0.0f; int w = 0; @@ -1168,33 +1078,33 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, } } wscale2 = 1.0f / wscale2; - fftwf_execute_dft_r2c(d->ft, dftr, dftgc2); + fftwf_execute_dft_r2c(d->ft.get(), dftr.get(), dftgc2.get()); int nnpoints = 0; - NPInfo * npts = new NPInfo[500]; + std::unique_ptr npts = std::make_unique(500); - for (int i = 0; i < vsapi->propNumElements(in, "nlocation"); i += 4) { + for (int i = 0; i < numNlocation; i += 4) { const int fn = int64ToIntS(nlocation[i + 0]); const int b = int64ToIntS(nlocation[i + 1]); const int y = int64ToIntS(nlocation[i + 2]); const int x = int64ToIntS(nlocation[i + 3]); if (fn < 0 || fn > d->vi->numFrames - d->tbsize) - throw std::string{ "invalid frame number in nlocation (" } + std::to_string(fn) + ")"; + throw "invalid frame number in nlocation (" + std::to_string(fn) + ")"; if (b < 0 || b >= d->vi->format->numPlanes) - throw std::string{ "invalid plane number in nlocation (" } + std::to_string(b) + ")"; + throw "invalid plane number in nlocation (" + std::to_string(b) + ")"; const int height = d->vi->height >> (b ? d->vi->format->subSamplingH : 0); if (y < 0 || y > height - d->sbsize) - throw std::string{ "invalid y pos in nlocation (" } + std::to_string(y) + ")"; + throw "invalid y pos in nlocation (" + std::to_string(y) + ")"; const int width = d->vi->width >> (b ? 
d->vi->format->subSamplingW : 0); if (x < 0 || x > width - d->sbsize) - throw std::string{ "invalid x pos in nlocation (" } + std::to_string(x) + ")"; + throw "invalid x pos in nlocation (" + std::to_string(x) + ")"; if (nnpoints >= 500) - throw std::string{ "maximum number of entries in nlocation is 500" }; + throw "maximum number of entries in nlocation is 500"; npts[nnpoints].fn = fn; npts[nnpoints].b = b; @@ -1204,10 +1114,12 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, } for (int ct = 0; ct < nnpoints; ct++) { - fftwf_complex * dftc = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32); - fftwf_complex * dftc2 = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32); - if (!dftc || !dftc2) - throw std::string{ "malloc failure (dftc/dftc2)" }; + unique_fftwf_complex _dftc{ vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free }; + unique_fftwf_complex dftc2{ vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free }; + if (!_dftc || !dftc2) + throw "malloc failure (dftc/dftc2)"; + + float * dftc = reinterpret_cast(_dftc.get()); for (int z = 0; z < d->tbsize; z++) { const VSFrameRef * src = vsapi->getFrame(npts[ct].fn + z, d->node, nullptr, 0); @@ -1215,42 +1127,44 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData, if (d->vi->format->bytesPerSample == 1) { const uint8_t * srcp = vsapi->getReadPtr(src, npts[ct].b) + stride * npts[ct].y + npts[ct].x; - proc0(srcp, hw2 + d->barea * z, dftr + d->barea * z, stride, d->sbsize, d->divisor); + proc0(srcp, hw2.get() + d->barea * z, dftr.get() + d->barea * z, stride, d->sbsize, d->srcScale); } else if (d->vi->format->bytesPerSample == 2) { const uint16_t * srcp = reinterpret_cast(vsapi->getReadPtr(src, npts[ct].b)) + stride * npts[ct].y + npts[ct].x; - proc0(srcp, hw2 + d->barea * z, dftr + d->barea * z, stride, d->sbsize, d->divisor); + proc0(srcp, hw2.get() + d->barea * z, dftr.get() + d->barea * z, stride, d->sbsize, d->srcScale); } else { const float * srcp = reinterpret_cast(vsapi->getReadPtr(src, npts[ct].b)) + stride * npts[ct].y + npts[ct].x; - proc0(srcp, hw2 + d->barea * z, dftr + d->barea * z, stride, d->sbsize, d->divisor); + proc0(srcp, hw2.get() + d->barea * z, dftr.get() + d->barea * z, stride, d->sbsize, d->srcScale); } vsapi->freeFrame(src); } - fftwf_execute_dft_r2c(d->ft, dftr, dftc); + fftwf_execute_dft_r2c(d->ft.get(), dftr.get(), reinterpret_cast(dftc)); if (d->zmean) - removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc2), d->ccnt2, reinterpret_cast(dftc2)); + removeMean(dftc, reinterpret_cast(dftgc2.get()), d->ccnt2, reinterpret_cast(dftc2.get())); for (int h = 0; h < d->ccnt2; h += 2) { - const float psd = reinterpret_cast(dftc)[h] * reinterpret_cast(dftc)[h] + reinterpret_cast(dftc)[h + 1] * reinterpret_cast(dftc)[h + 1]; - d->sigmas[h] += psd; + const float psd = dftc[h + 0] * dftc[h + 0] + dftc[h + 1] * dftc[h + 1]; + d->sigmas[h + 0] += psd; d->sigmas[h + 1] += psd; } - - vs_aligned_free(dftc); - vs_aligned_free(dftc2); } - vs_aligned_free(hw2); - vs_aligned_free(dftr); - vs_aligned_free(dftgc2); - delete[] npts; - const float scale = 1.0f / nnpoints; for (int h = 0; h < d->ccnt2; h++) - d->sigmas[h] *= scale * (wscale2 / wscale) * alpha; + d->sigmas[h] = d->sigmas[h] * scale * (wscale2 / wscale) * alpha; } + + const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads; + d->ebuff.reserve(numThreads); + d->dftr.reserve(numThreads); + d->dftc.reserve(numThreads); + 
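// Editorial sketch (not part of the patch): with ftype < 2 and nlocation given,
// the noise spectrum is measured rather than assumed — every listed block is
// windowed, transformed, its power spectral density summed into sigmas, and the
// sum is finally scaled by alpha * (wscale2 / wscale) / numberOfBlocks. The
// per-block accumulation step in plain scalar C++ (dftc holds interleaved re/im floats):
#include <cstddef>
#include <vector>

static void accumulatePSD(const std::vector<float> & dftc, std::vector<float> & sigmas) {
    for (std::size_t h = 0; h + 1 < dftc.size(); h += 2) {
        const float psd = dftc[h + 0] * dftc[h + 0] + dftc[h + 1] * dftc[h + 1];
        sigmas[h + 0] += psd;        // the same value is stored for the real and
        sigmas[h + 1] += psd;        // imaginary slot of each bin, matching the layout above
    }
}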
d->dftc2.reserve(numThreads); + } catch (const char * error) { + vsapi->setError(out, ("DFTTest: "s + error).c_str()); + vsapi->freeNode(d->node); + return; } catch (const std::string & error) { vsapi->setError(out, ("DFTTest: " + error).c_str()); vsapi->freeNode(d->node); diff --git a/DFTTest/DFTTest.h b/DFTTest/DFTTest.h new file mode 100644 index 0000000..8cc99aa --- /dev/null +++ b/DFTTest/DFTTest.h @@ -0,0 +1,39 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include + +using unique_float = std::unique_ptr; +using unique_fftwf_complex = std::unique_ptr; +using unique_VSFrameRef = std::unique_ptr; + +struct DFTTestData final { + VSNodeRef * node; + const VSVideoInfo * vi; + int sbsize, sosize, tbsize, tosize, swin, twin; + double sbeta, tbeta; + float f0beta; + bool zmean, process[3]; + float srcScale, dstScale; + int barea, bvolume, ccnt, ccnt2, type, sbd1, inc, peak; + bool uf0b; + const VSFormat * padFormat; + int padWidth[3], padHeight[3], eheight[3]; + unique_float hw{ nullptr, nullptr }, sigmas{ nullptr, nullptr }, sigmas2{ nullptr, nullptr }, pmins{ nullptr, nullptr }, pmaxs{ nullptr, nullptr }; + unique_fftwf_complex dftgc{ nullptr, nullptr }; + std::unique_ptr ft{ nullptr, nullptr }, fti{ nullptr, nullptr }; + std::unordered_map ebuff; + std::unordered_map dftr; + std::unordered_map dftc, dftc2; + void (*copyPad)(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; + void (*filterCoeffs)(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; + void (*func_0)(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; + void (*func_1)(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +}; diff --git a/DFTTest/DFTTest.hpp b/DFTTest/DFTTest.hpp deleted file mode 100644 index 6682fa1..0000000 --- a/DFTTest/DFTTest.hpp +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include -#include - -#include -#include - -#include - -struct DFTTestData { - VSNodeRef * node; - const VSVideoInfo * vi; - int sbsize, sosize, tbsize, tosize, swin, twin; - float sbeta, tbeta, f0beta; - bool zmean, process[3]; - float divisor, multiplier; - int peak, barea, bvolume, ccnt, type, sbd1, ccnt2, inc; - bool uf0b; - const VSFormat * padFormat; - int padWidth[3], padHeight[3], eheight[3]; - float * hw, * sigmas, * sigmas2, * pmins, * pmaxs; - fftwf_complex * dftgc; - fftwf_plan ft, fti; - std::unordered_map ebuff; - std::unordered_map dftr; - std::unordered_map dftc, dftc2; - void (*copyPad)(const VSFrameRef *, VSFrameRef * [3], const DFTTestData *, const VSAPI *) noexcept; - void (*filterCoeffs)(float *, const float *, const int, const float *, const float *, const float *) noexcept; - void (*func_0)(VSFrameRef * [3], VSFrameRef *, const DFTTestData *, const VSAPI *) noexcept; - void (*func_1)(VSFrameRef * [15][3], VSFrameRef *, const int, const DFTTestData *, const VSAPI *) noexcept; -}; diff --git a/DFTTest/DFTTest.vcxproj b/DFTTest/DFTTest.vcxproj index dd8aee8..20e849a 100644 --- a/DFTTest/DFTTest.vcxproj +++ b/DFTTest/DFTTest.vcxproj @@ -7,7 +7,6 @@ - true 16.0 {23B9F54B-763D-491C-A201-657918CD1377} Win32Proj @@ -32,16 +31,17 @@ - C:\Program Files\VapourSynth\sdk\include\vapoursynth;C:\fftw-3.3.8-dll64;$(IncludePath) - C:\fftw-3.3.8-dll64;$(LibraryPath) + C:\Program 
Files\VapourSynth\sdk\include\vapoursynth;$(IncludePath) - VS_TARGET_CPU_X86;_CRT_SECURE_NO_WARNINGS;NDEBUG;%(PreprocessorDefinitions) + DFTTEST_X86;_CRT_SECURE_NO_WARNINGS;NDEBUG;%(PreprocessorDefinitions) Level3 true false + false true + stdcpp17 Windows @@ -55,11 +55,14 @@ AdvancedVectorExtensions2 + + AdvancedVectorExtensions512 + - + - + diff --git a/DFTTest/DFTTest.vcxproj.filters b/DFTTest/DFTTest.vcxproj.filters index b8ccc71..011a3d5 100644 --- a/DFTTest/DFTTest.vcxproj.filters +++ b/DFTTest/DFTTest.vcxproj.filters @@ -24,12 +24,15 @@ Source Files - + + Source Files + + Source Files - + Header Files diff --git a/DFTTest/DFTTest_AVX2.cpp b/DFTTest/DFTTest_AVX2.cpp index b9b1211..ddd859d 100644 --- a/DFTTest/DFTTest_AVX2.cpp +++ b/DFTTest/DFTTest_AVX2.cpp @@ -1,52 +1,27 @@ -#ifdef VS_TARGET_CPU_X86 -#ifndef __AVX2__ -#define __AVX2__ -#endif - -#include "DFTTest.hpp" +#ifdef DFTTEST_X86 +#include "DFTTest.h" -#include "vectorclass/vectormath_exp.h" +#include "VCL2/vectormath_exp.h" -template -static inline void proc0(const T * s0, const float * s1, float * d, const int p0, const int p1, const float divisor) noexcept; - -template<> -inline void proc0(const uint8_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept { +template +static inline auto proc0(const pixel_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float srcScale) noexcept { for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v += 8) { - const Vec8f s0 = to_float(Vec8i().load_8uc(_s0 + v)); - const Vec8f s1 = Vec8f().load(_s1 + v); - (s0 * s1).store(d + v); - } + for (int v = 0; v < p1; v += Vec8f().size()) { + Vec8f s0; - _s0 += p0; - _s1 += p1; - d += p1; - } -} + if constexpr (std::is_same_v) + s0 = to_float(Vec8i().load_8uc(_s0 + v)); + else if constexpr (std::is_same_v) + s0 = to_float(Vec8i().load_8us(_s0 + v)); + else + s0 = Vec8f().load(_s0 + v); -template<> -inline void proc0(const uint16_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept { - for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v += 8) { - const Vec8f s0 = to_float(Vec8i().load_8us(_s0 + v)); const Vec8f s1 = Vec8f().load(_s1 + v); - (s0 * divisor * s1).store(d + v); - } - _s0 += p0; - _s1 += p1; - d += p1; - } -} - -template<> -inline void proc0(const float * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept { - for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v += 8) { - const Vec8f s0 = Vec8f().load(_s0 + v); - const Vec8f s1 = Vec8f().load(_s1 + v); - (s0 * 255.0f * s1).store(d + v); + if constexpr (std::is_same_v) + (s0 * s1).store(d + v); + else + (s0 * srcScale * s1).store(d + v); } _s0 += p0; @@ -55,9 +30,9 @@ inline void proc0(const float * _s0, const float * _s1, float * d, const int p0, } } -static inline void proc1(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { +static inline auto proc1(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { for (int u = 0; u < p0; u++) { - for (int v = 0; v < p0; v += 8) { + for (int v = 0; v < p0; v += Vec8f().size()) { const Vec8f s0 = Vec8f().load(_s0 + v); const Vec8f s1 = Vec8f().load(_s1 + v); const Vec8f d = Vec8f().load(_d + v); @@ -70,13 +45,13 @@ static inline void proc1(const float * _s0, const float * _s1, float * _d, const } } -static inline void proc1Partial(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) 
noexcept { - const int regularPart = p0 & -8; +static inline auto proc1Partial(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { + const int regularPart = p0 & ~(Vec8f().size() - 1); for (int u = 0; u < p0; u++) { int v; - for (v = 0; v < regularPart; v += 8) { + for (v = 0; v < regularPart; v += Vec8f().size()) { const Vec8f s0 = Vec8f().load(_s0 + v); const Vec8f s1 = Vec8f().load(_s1 + v); const Vec8f d = Vec8f().load(_d + v); @@ -94,10 +69,10 @@ static inline void proc1Partial(const float * _s0, const float * _s1, float * _d } } -static inline void removeMean(float * _dftc, const float * _dftgc, const int ccnt, float * _dftc2) noexcept { +static inline auto removeMean(float * _dftc, const float * _dftgc, const int ccnt, float * _dftc2) noexcept { const Vec8f gf = _dftc[0] / _dftgc[0]; - for (int h = 0; h < ccnt; h += 8) { + for (int h = 0; h < ccnt; h += Vec8f().size()) { const Vec8f dftgc = Vec8f().load_a(_dftgc + h); const Vec8f dftc = Vec8f().load_a(_dftc + h); const Vec8f dftc2 = gf * dftgc; @@ -106,8 +81,8 @@ static inline void removeMean(float * _dftc, const float * _dftgc, const int ccn } } -static inline void addMean(float * _dftc, const int ccnt, const float * _dftc2) noexcept { - for (int h = 0; h < ccnt; h += 8) { +static inline auto addMean(float * _dftc, const int ccnt, const float * _dftc2) noexcept { + for (int h = 0; h < ccnt; h += Vec8f().size()) { const Vec8f dftc = Vec8f().load_a(_dftc + h); const Vec8f dftc2 = Vec8f().load_a(_dftc2 + h); (dftc + dftc2).store_a(_dftc + h); @@ -115,133 +90,65 @@ static inline void addMean(float * _dftc, const int ccnt, const float * _dftc2) } template -void filter_avx2(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; - -template<> -void filter_avx2<0>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - const Vec8f dftc = Vec8f().load_a(_dftc + h); - const Vec8f sigmas = Vec8f().load_a(_sigmas + h); - const Vec8f dftcSquare = dftc * dftc; - const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare); - const Vec8f mult = max((psd - sigmas) * approx_recipr(psd + 1e-15f), zero_8f()); - (dftc * mult).store_a(_dftc + h); - } -} - -template<> -void filter_avx2<1>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - const Vec8f dftc = Vec8f().load_a(_dftc + h); - const Vec8f sigmas = Vec8f().load_a(_sigmas + h); - const Vec8f dftcSquare = dftc * dftc; - const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare); - select(psd < sigmas, zero_8f(), dftc).store_a(_dftc + h); - } -} - -template<> -void filter_avx2<2>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - const Vec8f dftc = Vec8f().load_a(_dftc + h); - const Vec8f sigmas = Vec8f().load_a(_sigmas + h); - (dftc * sigmas).store_a(_dftc + h); - } -} - -template<> -void filter_avx2<3>(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * _sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - const Vec8f dftc = Vec8f().load_a(_dftc + h); - const Vec8f sigmas = Vec8f().load_a(_sigmas + h); - const Vec8f sigmas2 = Vec8f().load_a(_sigmas2 + h); 
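// Editorial sketch (not part of the patch): proc1Partial above exists for block
// sizes that are not a multiple of the SIMD width. `p0 & ~(Vec8f().size() - 1)`
// rounds p0 down to a multiple of the vector width (valid because the width is a
// power of two), the main loop runs full vectors, and one final partial store
// covers the remainder. The same loop structure in scalar form, with W standing
// in for the vector width:
static void fusedMulAddRow(const float * s0, const float * s1, float * d, const int n, const int W = 8) {
    const int regularPart = n & ~(W - 1);   // largest multiple of W not exceeding n
    int v = 0;
    for (; v < regularPart; v++)            // corresponds to one full-width SIMD iteration per W elements
        d[v] += s0[v] * s1[v];
    for (; v < n; v++)                      // tail of fewer than W elements
        d[v] += s0[v] * s1[v];
}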
- const Vec8f pmin = Vec8f().load_a(_pmin + h); - const Vec8f pmax = Vec8f().load_a(_pmax + h); - const Vec8f dftcSquare = dftc * dftc; - const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare); - select(psd >= pmin && psd <= pmax, dftc * sigmas, dftc * sigmas2).store_a(_dftc + h); - } -} - -template<> -void filter_avx2<4>(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - const Vec8f dftc = Vec8f().load_a(_dftc + h); - const Vec8f sigmas = Vec8f().load_a(_sigmas + h); - const Vec8f pmin = Vec8f().load_a(_pmin + h); - const Vec8f pmax = Vec8f().load_a(_pmax + h); - const Vec8f dftcSquare = dftc * dftc; - const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare) + 1e-15f; - const Vec8f mult = sigmas * sqrt(psd * pmax * approx_recipr((psd + pmin) * (psd + pmax))); - (dftc * mult).store_a(_dftc + h); - } -} +inline void filter_avx2(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * _sigmas2) noexcept { + const Vec8f beta = _pmin[0]; -template<> -void filter_avx2<5>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - const Vec8f beta = pmin[0]; + for (int h = 0; h < ccnt; h += Vec8f().size()) { + Vec8f dftc, psd, sigmas, pmin, pmax, mult; - for (int h = 0; h < ccnt; h += 8) { - const Vec8f dftc = Vec8f().load_a(_dftc + h); - const Vec8f sigmas = Vec8f().load_a(_sigmas + h); - const Vec8f dftcSquare = dftc * dftc; - const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare); - const Vec8f mult = pow(max((psd - sigmas) * approx_recipr(psd + 1e-15f), zero_8f()), beta); - (dftc * mult).store_a(_dftc + h); - } -} + dftc = Vec8f().load_a(_dftc + h); + sigmas = Vec8f().load_a(_sigmas + h); -template<> -void filter_avx2<6>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - const Vec8f dftc = Vec8f().load_a(_dftc + h); - const Vec8f sigmas = Vec8f().load_a(_sigmas + h); - const Vec8f dftcSquare = dftc * dftc; - const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare); - const Vec8f mult = sqrt(max((psd - sigmas) * approx_recipr(psd + 1e-15f), zero_8f())); - (dftc * mult).store_a(_dftc + h); - } -} - -template -static void cast(const float * ebp, T * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept; - -template<> -void cast(const float * ebp, uint8_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept { - for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x += 16) { - const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load(ebp + x) + 0.5f); - const Vec8i srcp_8i_2 = truncate_to_int(Vec8f().load(ebp + x + 8) + 0.5f); - const Vec16s srcp_16s = compress_saturated(srcp_8i_1, srcp_8i_2); - const Vec16uc srcp = compress_saturated_s2u(srcp_16s.get_low(), srcp_16s.get_high()); - srcp.stream(dstp + x); + if constexpr (type != 2) { + const Vec8f dftcSquare = dftc * dftc; + psd = dftcSquare + permute8<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare); } - ebp += ebpStride; - dstp += dstStride; - } -} + if constexpr (type == 3 || type == 4) { + pmin = Vec8f().load_a(_pmin + h); + pmax = 
Vec8f().load_a(_pmax + h); + } -template<> -void cast(const float * ebp, uint16_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept { - for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x += 8) { - const Vec8i srcp_8i = truncate_to_int(mul_add(Vec8f().load(ebp + x), multiplier, 0.5f)); - const Vec8us srcp = compress_saturated_s2u(srcp_8i.get_low(), srcp_8i.get_high()); - min(srcp, peak).stream(dstp + x); + if constexpr (type == 0) { + mult = max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_8f()); + } else if constexpr (type == 1) { + dftc = select(psd < sigmas, zero_8f(), dftc); + } else if constexpr (type == 2) { + dftc *= sigmas; + } else if constexpr (type == 3) { + const Vec8f sigmas2 = Vec8f().load_a(_sigmas2 + h); + dftc = select(psd >= pmin && psd <= pmax, dftc * sigmas, dftc * sigmas2); + } else if constexpr (type == 4) { + mult = sigmas * sqrt(psd * pmax * rcp_nr(mul_add(psd + pmin, psd + pmax, 1e-15f))); + } else if constexpr (type == 5) { + mult = pow(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_8f()), beta); + } else { + mult = sqrt(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_8f())); } - ebp += ebpStride; - dstp += dstStride; + if constexpr (type == 0 || type > 3) + dftc *= mult; + + dftc.store_a(_dftc + h); } } -template<> -void cast(const float * ebp, float * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept { +template +static auto cast(const float * ebp, pixel_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float dstScale, const int peak) noexcept { for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x += 8) { - const Vec8f srcp = Vec8f().load(ebp + x); - (srcp * (1.0f / 255.0f)).stream(dstp + x); + for (int x = 0; x < dstWidth; x += Vec8f().size()) { + if constexpr (std::is_same_v) { + const Vec8i srcp = truncatei(Vec8f().load(ebp + x) + 0.5f); + const auto result = compress_saturated_s2u(compress_saturated(srcp, zero_si256()), zero_si256()).get_low(); + result.storel(dstp + x); + } else if constexpr (std::is_same_v) { + const Vec8i srcp = truncatei(mul_add(Vec8f().load(ebp + x), dstScale, 0.5f)); + const auto result = compress_saturated_s2u(srcp, zero_si256()).get_low(); + min(result, peak).store_nt(dstp + x); + } else { + const Vec8f srcp = Vec8f().load(ebp + x) * dstScale; + srcp.store_nt(dstp + x); + } } ebp += ebpStride; @@ -249,47 +156,56 @@ void cast(const float * ebp, float * dstp, const int dstWidth, const int dstHeig } } -template -void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept { +template +void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs = d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); + const auto threadId = std::this_thread::get_id(); - float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0)); - float * dftr = d->dftr.at(threadId); - fftwf_complex * dftc = d->dftc.at(threadId); - fftwf_complex * dftc2 = d->dftc2.at(threadId); + float * ebuff = 
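// Editorial sketch (not part of the patch): the single templated filter_avx2
// above replaces seven separate specializations — `if constexpr` discards every
// branch except the one for the instantiated type, so each instantiation still
// compiles to straight-line code. Scalar meaning of the two most common modes,
// per interleaved re/im pair (type 0 = generalized Wiener gain, type 1 = hard threshold):
#include <algorithm>

template<int type>
static void filterBin(float & re, float & im, const float sigma) {
    const float psd = re * re + im * im;
    if constexpr (type == 0) {
        const float mult = std::max((psd - sigma) / (psd + 1e-15f), 0.0f);
        re *= mult;
        im *= mult;
    } else if constexpr (type == 1) {
        if (psd < sigma)
            re = im = 0.0f;
    }
}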
reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = d->dftc2.at(threadId).get(); for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->process[plane]) { const int width = d->padWidth[plane]; const int height = d->padHeight[plane]; const int eheight = d->eheight[plane]; - const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(T); - const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float); - const T * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0)); + const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0)); float * ebpSaved = ebuff; memset(ebuff, 0, ebpStride * height * sizeof(float)); for (int y = 0; y < eheight; y += d->inc) { for (int x = 0; x <= width - d->sbsize; x += d->inc) { - proc0(srcp + x, d->hw, dftr, srcStride, d->sbsize, d->divisor); + proc0(srcp + x, hw, dftr, srcStride, d->sbsize, d->srcScale); - fftwf_execute_dft_r2c(d->ft, dftr, dftc); + fftwf_execute_dft_r2c(ft, dftr, dftc); if (d->zmean) - removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2)); + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); - d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2); + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? &d->f0beta : pmins, pmaxs, sigmas2); if (d->zmean) addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); - fftwf_execute_dft_c2r(d->fti, dftc, dftr); + fftwf_execute_dft_c2r(fti, dftc, dftr); if (d->type & 1) { // spatial overlapping - if (!(d->sbsize & 7)) - proc1(dftr, d->hw, ebpSaved + x, d->sbsize, ebpStride); + if (!(d->sbsize & (Vec8f().size() - 1))) + proc1(dftr, hw, ebpSaved + x, d->sbsize, ebpStride); else - proc1Partial(dftr, d->hw, ebpSaved + x, d->sbsize, ebpStride); + proc1Partial(dftr, hw, ebpSaved + x, d->sbsize, ebpStride); } else { - ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * d->hw[d->sbd1 * d->sbsize + d->sbd1]; + ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * hw[d->sbd1 * d->sbsize + d->sbd1]; } } @@ -299,61 +215,67 @@ void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, c const int dstWidth = vsapi->getFrameWidth(dst, plane); const int dstHeight = vsapi->getFrameHeight(dst, plane); - const int dstStride = vsapi->getStride(dst, plane) / sizeof(T); - T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; - cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak); + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); } } } -template void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_0_avx2(VSFrameRef * src[3], 
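// Editorial sketch (not part of the patch): cast() converts the accumulated
// float plane back to the destination format — integer output is rounded,
// saturated and clamped to the format peak, float output is only rescaled by
// dstScale. Scalar equivalent of the 16-bit path (helper name is illustrative):
#include <algorithm>
#include <cstdint>

static inline std::uint16_t castPixel16(const float v, const float dstScale, const int peak) {
    const int i = static_cast<int>(v * dstScale + 0.5f);            // round to nearest
    return static_cast<std::uint16_t>(std::clamp(i, 0, peak));      // saturate into [0, peak]
}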
VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept; +template +void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs = d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); -template -void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept { const auto threadId = std::this_thread::get_id(); - float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0)); - float * dftr = d->dftr.at(threadId); - fftwf_complex * dftc = d->dftc.at(threadId); - fftwf_complex * dftc2 = d->dftc2.at(threadId); + float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = d->dftc2.at(threadId).get(); for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->process[plane]) { const int width = d->padWidth[plane]; const int height = d->padHeight[plane]; const int eheight = d->eheight[plane]; - const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(T); - const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float); - const T * srcp[15] = {}; + const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + + const pixel_t * srcp[15] = {}; for (int i = 0; i < d->tbsize; i++) - srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0)); + srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0)); memset(ebuff, 0, ebpStride * height * sizeof(float)); for (int y = 0; y < eheight; y += d->inc) { for (int x = 0; x <= width - d->sbsize; x += d->inc) { for (int z = 0; z < d->tbsize; z++) - proc0(srcp[z] + x, d->hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->divisor); + proc0(srcp[z] + x, hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->srcScale); - fftwf_execute_dft_r2c(d->ft, dftr, dftc); + fftwf_execute_dft_r2c(ft, dftr, dftc); if (d->zmean) - removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2)); + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); - d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2); + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? 
&d->f0beta : pmins, pmaxs, sigmas2); if (d->zmean) addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); - fftwf_execute_dft_c2r(d->fti, dftc, dftr); + fftwf_execute_dft_c2r(fti, dftc, dftr); if (d->type & 1) { // spatial overlapping - if (!(d->sbsize & 7)) - proc1(dftr + pos * d->barea, d->hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); + if (!(d->sbsize & (Vec8f().size() - 1))) + proc1(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); else - proc1Partial(dftr + pos * d->barea, d->hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); + proc1Partial(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); } else { - ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * d->hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1]; + ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1]; } } @@ -363,15 +285,27 @@ void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const const int dstWidth = vsapi->getFrameWidth(dst, plane); const int dstHeight = vsapi->getFrameHeight(dst, plane); - const int dstStride = vsapi->getStride(dst, plane) / sizeof(T); - T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; - cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak); + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); } } } -template void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept; +template void filter_avx2<0>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx2<1>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx2<2>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx2<3>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx2<4>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx2<5>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx2<6>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; + +template void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const 
DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; + +template void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; #endif diff --git a/DFTTest/DFTTest_AVX512.cpp b/DFTTest/DFTTest_AVX512.cpp new file mode 100644 index 0000000..c15c691 --- /dev/null +++ b/DFTTest/DFTTest_AVX512.cpp @@ -0,0 +1,311 @@ +#ifdef DFTTEST_X86 +#include "DFTTest.h" + +#include "VCL2/vectormath_exp.h" + +template +static inline auto proc0(const pixel_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float srcScale) noexcept { + for (int u = 0; u < p1; u++) { + for (int v = 0; v < p1; v += Vec16f().size()) { + Vec16f s0; + + if constexpr (std::is_same_v) + s0 = to_float(Vec16i().load_16uc(_s0 + v)); + else if constexpr (std::is_same_v) + s0 = to_float(Vec16i().load_16us(_s0 + v)); + else + s0 = Vec16f().load(_s0 + v); + + const Vec16f s1 = Vec16f().load(_s1 + v); + + if constexpr (std::is_same_v) + (s0 * s1).store(d + v); + else + (s0 * srcScale * s1).store(d + v); + } + + _s0 += p0; + _s1 += p1; + d += p1; + } +} + +static inline auto proc1(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { + for (int u = 0; u < p0; u++) { + for (int v = 0; v < p0; v += Vec16f().size()) { + const Vec16f s0 = Vec16f().load(_s0 + v); + const Vec16f s1 = Vec16f().load(_s1 + v); + const Vec16f d = Vec16f().load(_d + v); + mul_add(s0, s1, d).store(_d + v); + } + + _s0 += p0; + _s1 += p0; + _d += p1; + } +} + +static inline auto proc1Partial(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { + const int regularPart = p0 & ~(Vec16f().size() - 1); + + for (int u = 0; u < p0; u++) { + int v; + + for (v = 0; v < regularPart; v += Vec16f().size()) { + const Vec16f s0 = Vec16f().load(_s0 + v); + const Vec16f s1 = Vec16f().load(_s1 + v); + const Vec16f d = Vec16f().load(_d + v); + mul_add(s0, s1, d).store(_d + v); + } + + const Vec16f s0 = Vec16f().load(_s0 + v); + const Vec16f s1 = Vec16f().load(_s1 + v); + const Vec16f d = Vec16f().load(_d + v); + mul_add(s0, s1, d).store_partial(p0 - v, _d + v); + + _s0 += p0; + _s1 += p0; + _d += p1; + } +} + +static inline auto removeMean(float * _dftc, const float * _dftgc, const int ccnt, float * _dftc2) noexcept { + const Vec16f gf = _dftc[0] / _dftgc[0]; + + for (int h = 0; h < ccnt; h += Vec16f().size()) { + const Vec16f dftgc = Vec16f().load_a(_dftgc + h); + const Vec16f dftc = Vec16f().load_a(_dftc + h); + const Vec16f dftc2 = gf * dftgc; + dftc2.store_a(_dftc2 + h); + (dftc - dftc2).store_a(_dftc + h); + } +} + +static inline auto addMean(float * _dftc, const int ccnt, const float * _dftc2) noexcept { + for (int h = 0; h < ccnt; h += Vec16f().size()) { + const Vec16f dftc = Vec16f().load_a(_dftc + h); + const Vec16f dftc2 = Vec16f().load_a(_dftc2 + h); + (dftc + dftc2).store_a(_dftc + h); + } +} + +template +inline void filter_avx512(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * _sigmas2) 
noexcept { + const Vec16f beta = _pmin[0]; + + for (int h = 0; h < ccnt; h += Vec16f().size()) { + Vec16f dftc, psd, sigmas, pmin, pmax, mult; + + dftc = Vec16f().load_a(_dftc + h); + sigmas = Vec16f().load_a(_sigmas + h); + + if constexpr (type != 2) { + const Vec16f dftcSquare = dftc * dftc; + psd = dftcSquare + permute16<1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14>(dftcSquare); + } + + if constexpr (type == 3 || type == 4) { + pmin = Vec16f().load_a(_pmin + h); + pmax = Vec16f().load_a(_pmax + h); + } + + if constexpr (type == 0) { + mult = max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_16f()); + } else if constexpr (type == 1) { + dftc = select(psd < sigmas, zero_16f(), dftc); + } else if constexpr (type == 2) { + dftc *= sigmas; + } else if constexpr (type == 3) { + const Vec16f sigmas2 = Vec16f().load_a(_sigmas2 + h); + dftc = select(psd >= pmin && psd <= pmax, dftc * sigmas, dftc * sigmas2); + } else if constexpr (type == 4) { + mult = sigmas * sqrt(psd * pmax * rcp_nr(mul_add(psd + pmin, psd + pmax, 1e-15f))); + } else if constexpr (type == 5) { + mult = pow(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_16f()), beta); + } else { + mult = sqrt(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_16f())); + } + + if constexpr (type == 0 || type > 3) + dftc *= mult; + + dftc.store_a(_dftc + h); + } +} + +template +static auto cast(const float * ebp, pixel_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float dstScale, const int peak) noexcept { + for (int y = 0; y < dstHeight; y++) { + for (int x = 0; x < dstWidth; x += Vec16f().size()) { + if constexpr (std::is_same_v) { + const Vec16i srcp = truncatei(Vec16f().load(ebp + x) + 0.5f); + const auto result = compress_saturated_s2u(compress_saturated(srcp, zero_si512()), zero_si512()).get_low().get_low(); + result.store_nt(dstp + x); + } else if constexpr (std::is_same_v) { + const Vec16i srcp = truncatei(mul_add(Vec16f().load(ebp + x), dstScale, 0.5f)); + const auto result = compress_saturated_s2u(srcp, zero_si512()).get_low(); + min(result, peak).store_nt(dstp + x); + } else { + const Vec16f srcp = Vec16f().load(ebp + x) * dstScale; + srcp.store_nt(dstp + x); + } + } + + ebp += ebpStride; + dstp += dstStride; + } +} + +template +void func_0_avx512(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs = d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); + + const auto threadId = std::this_thread::get_id(); + float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = d->dftc2.at(threadId).get(); + + for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { + if (d->process[plane]) { + const int width = d->padWidth[plane]; + const int height = d->padHeight[plane]; + const int eheight = d->eheight[plane]; + const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0)); + float * ebpSaved = ebuff; + + memset(ebuff, 0, ebpStride * height * 
sizeof(float)); + + for (int y = 0; y < eheight; y += d->inc) { + for (int x = 0; x <= width - d->sbsize; x += d->inc) { + proc0(srcp + x, hw, dftr, srcStride, d->sbsize, d->srcScale); + + fftwf_execute_dft_r2c(ft, dftr, dftc); + if (d->zmean) + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); + + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? &d->f0beta : pmins, pmaxs, sigmas2); + + if (d->zmean) + addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); + fftwf_execute_dft_c2r(fti, dftc, dftr); + + if (d->type & 1) { // spatial overlapping + if (!(d->sbsize & (Vec16f().size() - 1))) + proc1(dftr, hw, ebpSaved + x, d->sbsize, ebpStride); + else + proc1Partial(dftr, hw, ebpSaved + x, d->sbsize, ebpStride); + } else { + ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * hw[d->sbd1 * d->sbsize + d->sbd1]; + } + } + + srcp += srcStride * d->inc; + ebpSaved += ebpStride * d->inc; + } + + const int dstWidth = vsapi->getFrameWidth(dst, plane); + const int dstHeight = vsapi->getFrameHeight(dst, plane); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); + } + } +} + +template +void func_1_avx512(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs = d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); + + const auto threadId = std::this_thread::get_id(); + float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = d->dftc2.at(threadId).get(); + + for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { + if (d->process[plane]) { + const int width = d->padWidth[plane]; + const int height = d->padHeight[plane]; + const int eheight = d->eheight[plane]; + const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + + const pixel_t * srcp[15] = {}; + for (int i = 0; i < d->tbsize; i++) + srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0)); + + memset(ebuff, 0, ebpStride * height * sizeof(float)); + + for (int y = 0; y < eheight; y += d->inc) { + for (int x = 0; x <= width - d->sbsize; x += d->inc) { + for (int z = 0; z < d->tbsize; z++) + proc0(srcp[z] + x, hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->srcScale); + + fftwf_execute_dft_r2c(ft, dftr, dftc); + if (d->zmean) + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); + + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? 
&d->f0beta : pmins, pmaxs, sigmas2); + + if (d->zmean) + addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); + fftwf_execute_dft_c2r(fti, dftc, dftr); + + if (d->type & 1) { // spatial overlapping + if (!(d->sbsize & (Vec16f().size() - 1))) + proc1(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); + else + proc1Partial(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); + } else { + ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1]; + } + } + + for (int q = 0; q < d->tbsize; q++) + srcp[q] += srcStride * d->inc; + } + + const int dstWidth = vsapi->getFrameWidth(dst, plane); + const int dstHeight = vsapi->getFrameHeight(dst, plane); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); + } + } +} + +template void filter_avx512<0>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx512<1>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx512<2>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx512<3>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx512<4>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx512<5>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_avx512<6>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; + +template void func_0_avx512(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_0_avx512(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_0_avx512(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; + +template void func_1_avx512(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_1_avx512(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_1_avx512(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +#endif diff --git a/DFTTest/DFTTest_SSE2.cpp b/DFTTest/DFTTest_SSE2.cpp index 6c9788c..5380edd 100644 --- a/DFTTest/DFTTest_SSE2.cpp +++ b/DFTTest/DFTTest_SSE2.cpp @@ -1,48 +1,27 @@ -#ifdef VS_TARGET_CPU_X86 -#include "DFTTest.hpp" +#ifdef DFTTEST_X86 +#include "DFTTest.h" -#include "vectorclass/vectormath_exp.h" +#include "VCL2/vectormath_exp.h" -template -static inline 
void proc0(const T * s0, const float * s1, float * d, const int p0, const int p1, const float divisor) noexcept; - -template<> -inline void proc0(const uint8_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept { +template +static inline auto proc0(const pixel_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float srcScale) noexcept { for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v += 4) { - const Vec4f s0 = to_float(Vec4i().load_4uc(_s0 + v)); - const Vec4f s1 = Vec4f().load(_s1 + v); - (s0 * s1).store(d + v); - } + for (int v = 0; v < p1; v += Vec4f().size()) { + Vec4f s0; - _s0 += p0; - _s1 += p1; - d += p1; - } -} + if constexpr (std::is_same_v) + s0 = to_float(Vec4i().load_4uc(_s0 + v)); + else if constexpr (std::is_same_v) + s0 = to_float(Vec4i().load_4us(_s0 + v)); + else + s0 = Vec4f().load(_s0 + v); -template<> -inline void proc0(const uint16_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept { - for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v += 4) { - const Vec4f s0 = to_float(Vec4i().load_4us(_s0 + v)); const Vec4f s1 = Vec4f().load(_s1 + v); - (s0 * divisor * s1).store(d + v); - } - _s0 += p0; - _s1 += p1; - d += p1; - } -} - -template<> -inline void proc0(const float * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept { - for (int u = 0; u < p1; u++) { - for (int v = 0; v < p1; v += 4) { - const Vec4f s0 = Vec4f().load(_s0 + v); - const Vec4f s1 = Vec4f().load(_s1 + v); - (s0 * 255.0f * s1).store(d + v); + if constexpr (std::is_same_v) + (s0 * s1).store(d + v); + else + (s0 * srcScale * s1).store(d + v); } _s0 += p0; @@ -51,9 +30,9 @@ inline void proc0(const float * _s0, const float * _s1, float * d, const int p0, } } -static inline void proc1(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { +static inline auto proc1(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { for (int u = 0; u < p0; u++) { - for (int v = 0; v < p0; v += 4) { + for (int v = 0; v < p0; v += Vec4f().size()) { const Vec4f s0 = Vec4f().load(_s0 + v); const Vec4f s1 = Vec4f().load(_s1 + v); const Vec4f d = Vec4f().load(_d + v); @@ -66,13 +45,13 @@ static inline void proc1(const float * _s0, const float * _s1, float * _d, const } } -static inline void proc1Partial(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { - const int regularPart = p0 & -4; +static inline auto proc1Partial(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept { + const int regularPart = p0 & ~(Vec4f().size() - 1); for (int u = 0; u < p0; u++) { int v; - for (v = 0; v < regularPart; v += 4) { + for (v = 0; v < regularPart; v += Vec4f().size()) { const Vec4f s0 = Vec4f().load(_s0 + v); const Vec4f s1 = Vec4f().load(_s1 + v); const Vec4f d = Vec4f().load(_d + v); @@ -90,10 +69,10 @@ static inline void proc1Partial(const float * _s0, const float * _s1, float * _d } } -static inline void removeMean(float * _dftc, const float * _dftgc, const int ccnt, float * _dftc2) noexcept { +static inline auto removeMean(float * _dftc, const float * _dftgc, const int ccnt, float * _dftc2) noexcept { const Vec4f gf = _dftc[0] / _dftgc[0]; - for (int h = 0; h < ccnt; h += 4) { + for (int h = 0; h < ccnt; h += Vec4f().size()) { const Vec4f dftgc = Vec4f().load_a(_dftgc + h); const Vec4f dftc = Vec4f().load_a(_dftc + h); const Vec4f 
dftc2 = gf * dftgc; @@ -102,8 +81,8 @@ static inline void removeMean(float * _dftc, const float * _dftgc, const int ccn } } -static inline void addMean(float * _dftc, const int ccnt, const float * _dftc2) noexcept { - for (int h = 0; h < ccnt; h += 4) { +static inline auto addMean(float * _dftc, const int ccnt, const float * _dftc2) noexcept { + for (int h = 0; h < ccnt; h += Vec4f().size()) { const Vec4f dftc = Vec4f().load_a(_dftc + h); const Vec4f dftc2 = Vec4f().load_a(_dftc2 + h); (dftc + dftc2).store_a(_dftc + h); @@ -111,226 +90,91 @@ static inline void addMean(float * _dftc, const int ccnt, const float * _dftc2) } template -void filter_sse2(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; - -template<> -void filter_sse2<0>(float * dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - Vec4f dftcLow = Vec4f().load_a(dftc + h); - Vec4f dftcHigh = Vec4f().load_a(dftc + h + 4); - Vec4f real = blend4f<0, 2, 4, 6>(dftcLow, dftcHigh); - Vec4f imag = blend4f<1, 3, 5, 7>(dftcLow, dftcHigh); - const Vec4f psd = mul_add(real, real, imag * imag); - - const Vec4f sigmasLow = Vec4f().load_a(_sigmas + h); - const Vec4f sigmasHigh = Vec4f().load_a(_sigmas + h + 4); - const Vec4f sigmas = blend4f<0, 2, 4, 6>(sigmasLow, sigmasHigh); - - const Vec4f mult = max((psd - sigmas) / (psd + 1e-15f), zero_4f()); - real *= mult; - imag *= mult; - - dftcLow = blend4f<0, 4, 1, 5>(real, imag); - dftcHigh = blend4f<2, 6, 3, 7>(real, imag); - dftcLow.store_a(dftc + h); - dftcHigh.store_a(dftc + h + 4); - } -} - -template<> -void filter_sse2<1>(float * dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - Vec4f dftcLow = Vec4f().load_a(dftc + h); - Vec4f dftcHigh = Vec4f().load_a(dftc + h + 4); - Vec4f real = blend4f<0, 2, 4, 6>(dftcLow, dftcHigh); - Vec4f imag = blend4f<1, 3, 5, 7>(dftcLow, dftcHigh); - const Vec4f psd = mul_add(real, real, imag * imag); - - const Vec4f sigmasLow = Vec4f().load_a(_sigmas + h); - const Vec4f sigmasHigh = Vec4f().load_a(_sigmas + h + 4); - const Vec4f sigmas = blend4f<0, 2, 4, 6>(sigmasLow, sigmasHigh); - - const Vec4fb flag = (psd < sigmas); - real = select(flag, zero_4f(), real); - imag = select(flag, zero_4f(), imag); - - dftcLow = blend4f<0, 4, 1, 5>(real, imag); - dftcHigh = blend4f<2, 6, 3, 7>(real, imag); - dftcLow.store_a(dftc + h); - dftcHigh.store_a(dftc + h + 4); - } -} - -template<> -void filter_sse2<2>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 4) { - const Vec4f dftc = Vec4f().load_a(_dftc + h); - const Vec4f sigmas = Vec4f().load_a(_sigmas + h); - (dftc * sigmas).store_a(_dftc + h); - } -} - -template<> -void filter_sse2<3>(float * dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * _sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - Vec4f dftcLow = Vec4f().load_a(dftc + h); - Vec4f dftcHigh = Vec4f().load_a(dftc + h + 4); - Vec4f real = blend4f<0, 2, 4, 6>(dftcLow, dftcHigh); - Vec4f imag = blend4f<1, 3, 5, 7>(dftcLow, dftcHigh); - const Vec4f psd = mul_add(real, real, imag * imag); - - const Vec4f sigmasLow = Vec4f().load_a(_sigmas + h); - const Vec4f sigmasHigh = Vec4f().load_a(_sigmas + h + 4); - const Vec4f 
sigmas = blend4f<0, 2, 4, 6>(sigmasLow, sigmasHigh); - - const Vec4f sigmas2Low = Vec4f().load_a(_sigmas2 + h); - const Vec4f sigmas2High = Vec4f().load_a(_sigmas2 + h + 4); - const Vec4f sigmas2 = blend4f<0, 2, 4, 6>(sigmas2Low, sigmas2High); - - const Vec4f pminLow = Vec4f().load_a(_pmin + h); - const Vec4f pminHigh = Vec4f().load_a(_pmin + h + 4); - const Vec4f pmin = blend4f<0, 2, 4, 6>(pminLow, pminHigh); - - const Vec4f pmaxLow = Vec4f().load_a(_pmax + h); - const Vec4f pmaxHigh = Vec4f().load_a(_pmax + h + 4); - const Vec4f pmax = blend4f<0, 2, 4, 6>(pmaxLow, pmaxHigh); - - const Vec4fb flag = (psd >= pmin && psd <= pmax); - real = select(flag, real * sigmas, real * sigmas2); - imag = select(flag, imag * sigmas, imag * sigmas2); - - dftcLow = blend4f<0, 4, 1, 5>(real, imag); - dftcHigh = blend4f<2, 6, 3, 7>(real, imag); - dftcLow.store_a(dftc + h); - dftcHigh.store_a(dftc + h + 4); - } -} - -template<> -void filter_sse2<4>(float * dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - Vec4f dftcLow = Vec4f().load_a(dftc + h); - Vec4f dftcHigh = Vec4f().load_a(dftc + h + 4); - Vec4f real = blend4f<0, 2, 4, 6>(dftcLow, dftcHigh); - Vec4f imag = blend4f<1, 3, 5, 7>(dftcLow, dftcHigh); - const Vec4f psd = mul_add(real, real, imag * imag) + 1e-15f; - - const Vec4f sigmasLow = Vec4f().load_a(_sigmas + h); - const Vec4f sigmasHigh = Vec4f().load_a(_sigmas + h + 4); - const Vec4f sigmas = blend4f<0, 2, 4, 6>(sigmasLow, sigmasHigh); - - const Vec4f pminLow = Vec4f().load_a(_pmin + h); - const Vec4f pminHigh = Vec4f().load_a(_pmin + h + 4); - const Vec4f pmin = blend4f<0, 2, 4, 6>(pminLow, pminHigh); - - const Vec4f pmaxLow = Vec4f().load_a(_pmax + h); - const Vec4f pmaxHigh = Vec4f().load_a(_pmax + h + 4); - const Vec4f pmax = blend4f<0, 2, 4, 6>(pmaxLow, pmaxHigh); - - const Vec4f mult = sigmas * sqrt(psd * pmax / ((psd + pmin) * (psd + pmax))); - real *= mult; - imag *= mult; - - dftcLow = blend4f<0, 4, 1, 5>(real, imag); - dftcHigh = blend4f<2, 6, 3, 7>(real, imag); - dftcLow.store_a(dftc + h); - dftcHigh.store_a(dftc + h + 4); - } -} - -template<> -void filter_sse2<5>(float * dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - const Vec4f beta = pmin[0]; - - for (int h = 0; h < ccnt; h += 8) { - Vec4f dftcLow = Vec4f().load_a(dftc + h); - Vec4f dftcHigh = Vec4f().load_a(dftc + h + 4); - Vec4f real = blend4f<0, 2, 4, 6>(dftcLow, dftcHigh); - Vec4f imag = blend4f<1, 3, 5, 7>(dftcLow, dftcHigh); - const Vec4f psd = mul_add(real, real, imag * imag); - - const Vec4f sigmasLow = Vec4f().load_a(_sigmas + h); - const Vec4f sigmasHigh = Vec4f().load_a(_sigmas + h + 4); - const Vec4f sigmas = blend4f<0, 2, 4, 6>(sigmasLow, sigmasHigh); - - const Vec4f mult = pow(max((psd - sigmas) / (psd + 1e-15f), zero_4f()), beta); - real *= mult; - imag *= mult; - - dftcLow = blend4f<0, 4, 1, 5>(real, imag); - dftcHigh = blend4f<2, 6, 3, 7>(real, imag); - dftcLow.store_a(dftc + h); - dftcHigh.store_a(dftc + h + 4); - } -} - -template<> -void filter_sse2<6>(float * dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept { - for (int h = 0; h < ccnt; h += 8) { - Vec4f dftcLow = Vec4f().load_a(dftc + h); - Vec4f dftcHigh = Vec4f().load_a(dftc + h + 4); - Vec4f real = blend4f<0, 2, 4, 6>(dftcLow, dftcHigh); - Vec4f imag = blend4f<1, 3, 5, 7>(dftcLow, dftcHigh); - const Vec4f psd = 
mul_add(real, real, imag * imag); - - const Vec4f sigmasLow = Vec4f().load_a(_sigmas + h); - const Vec4f sigmasHigh = Vec4f().load_a(_sigmas + h + 4); - const Vec4f sigmas = blend4f<0, 2, 4, 6>(sigmasLow, sigmasHigh); - - const Vec4f mult = sqrt(max((psd - sigmas) / (psd + 1e-15f), zero_4f())); - real *= mult; - imag *= mult; - - dftcLow = blend4f<0, 4, 1, 5>(real, imag); - dftcHigh = blend4f<2, 6, 3, 7>(real, imag); - dftcLow.store_a(dftc + h); - dftcHigh.store_a(dftc + h + 4); - } -} +inline void filter_sse2(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * _sigmas2) noexcept { + const int step = Vec4f().size() * 2; + const Vec4f beta = _pmin[0]; + + for (int h = 0; h < ccnt; h += step) { + Vec4f dftcLow, dftcHigh, real, imag, psd, sigmas, pmin, pmax, mult; + + if constexpr (type != 2) { + dftcLow = Vec4f().load_a(_dftc + h + 0); + dftcHigh = Vec4f().load_a(_dftc + h + Vec4f().size()); + real = blend4<0, 2, 4, 6>(dftcLow, dftcHigh); + imag = blend4<1, 3, 5, 7>(dftcLow, dftcHigh); + psd = mul_add(real, real, imag * imag); + + const Vec4f sigmasLow = Vec4f().load_a(_sigmas + h + 0); + const Vec4f sigmasHigh = Vec4f().load_a(_sigmas + h + Vec4f().size()); + sigmas = blend4<0, 2, 4, 6>(sigmasLow, sigmasHigh); + } -template -static void cast(const float * ebp, T * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept; + if constexpr (type == 3 || type == 4) { + const Vec4f pminLow = Vec4f().load_a(_pmin + h + 0); + const Vec4f pminHigh = Vec4f().load_a(_pmin + h + Vec4f().size()); + pmin = blend4<0, 2, 4, 6>(pminLow, pminHigh); -template<> -void cast(const float * ebp, uint8_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept { - for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x += 16) { - const Vec4i srcp_4i_1 = truncate_to_int(Vec4f().load(ebp + x) + 0.5f); - const Vec4i srcp_4i_2 = truncate_to_int(Vec4f().load(ebp + x + 4) + 0.5f); - const Vec4i srcp_4i_3 = truncate_to_int(Vec4f().load(ebp + x + 8) + 0.5f); - const Vec4i srcp_4i_4 = truncate_to_int(Vec4f().load(ebp + x + 12) + 0.5f); - const Vec8s srcp_8s_1 = compress_saturated(srcp_4i_1, srcp_4i_2); - const Vec8s srcp_8s_2 = compress_saturated(srcp_4i_3, srcp_4i_4); - const Vec16uc srcp = compress_saturated_s2u(srcp_8s_1, srcp_8s_2); - srcp.stream(dstp + x); + const Vec4f pmaxLow = Vec4f().load_a(_pmax + h + 0); + const Vec4f pmaxHigh = Vec4f().load_a(_pmax + h + Vec4f().size()); + pmax = blend4<0, 2, 4, 6>(pmaxLow, pmaxHigh); } - ebp += ebpStride; - dstp += dstStride; - } -} + if constexpr (type == 0) { + mult = max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_4f()); + } else if constexpr (type == 1) { + const Vec4fb flag = (psd < sigmas); + real = select(flag, zero_4f(), real); + imag = select(flag, zero_4f(), imag); + } else if constexpr (type == 2) { + const Vec4f dftc = Vec4f().load_a(_dftc + h); + sigmas = Vec4f().load_a(_sigmas + h); + (dftc * sigmas).store_a(_dftc + h); + } else if constexpr (type == 3) { + const Vec4f sigmas2Low = Vec4f().load_a(_sigmas2 + h + 0); + const Vec4f sigmas2High = Vec4f().load_a(_sigmas2 + h + Vec4f().size()); + const Vec4f sigmas2 = blend4<0, 2, 4, 6>(sigmas2Low, sigmas2High); + + const Vec4fb flag = (psd >= pmin && psd <= pmax); + real = select(flag, real * sigmas, real * sigmas2); + imag = select(flag, imag * sigmas, imag * sigmas2); + } else if 
constexpr (type == 4) { + mult = sigmas * sqrt(psd * pmax * rcp_nr(mul_add(psd + pmin, psd + pmax, 1e-15f))); + } else if constexpr (type == 5) { + mult = pow(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_4f()), beta); + } else { + mult = sqrt(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_4f())); + } -template<> -void cast(const float * ebp, uint16_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept { - for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x += 8) { - const Vec4i srcp_4i_1 = truncate_to_int(mul_add(Vec4f().load(ebp + x), multiplier, 0.5f)); - const Vec4i srcp_4i_2 = truncate_to_int(mul_add(Vec4f().load(ebp + x + 4), multiplier, 0.5f)); - const Vec8us srcp = compress_saturated_s2u(srcp_4i_1, srcp_4i_2); - min(srcp, peak).stream(dstp + x); + if constexpr (type == 0 || type > 3) { + real *= mult; + imag *= mult; } - ebp += ebpStride; - dstp += dstStride; + if constexpr (type != 2) { + dftcLow = blend4<0, 4, 1, 5>(real, imag); + dftcHigh = blend4<2, 6, 3, 7>(real, imag); + dftcLow.store_a(_dftc + h + 0); + dftcHigh.store_a(_dftc + h + Vec4f().size()); + } } } -template<> -void cast(const float * ebp, float * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept { +template +static auto cast(const float * ebp, pixel_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float dstScale, const int peak) noexcept { for (int y = 0; y < dstHeight; y++) { - for (int x = 0; x < dstWidth; x += 4) { - const Vec4f srcp = Vec4f().load(ebp + x); - (srcp * (1.0f / 255.0f)).stream(dstp + x); + for (int x = 0; x < dstWidth; x += Vec4f().size()) { + if constexpr (std::is_same_v) { + const Vec4i srcp = truncatei(Vec4f().load(ebp + x) + 0.5f); + const auto result = compress_saturated_s2u(compress_saturated(srcp, zero_si128()), zero_si128()); + result.store_si32(dstp + x); + } else if constexpr (std::is_same_v) { + const Vec4i srcp = truncatei(mul_add(Vec4f().load(ebp + x), dstScale, 0.5f)); + const auto result = compress_saturated_s2u(srcp, zero_si128()); + min(result, peak).storel(dstp + x); + } else { + const Vec4f srcp = Vec4f().load(ebp + x) * dstScale; + srcp.store_nt(dstp + x); + } } ebp += ebpStride; @@ -338,47 +182,56 @@ void cast(const float * ebp, float * dstp, const int dstWidth, const int dstHeig } } -template -void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept { +template +void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs = d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); + const auto threadId = std::this_thread::get_id(); - float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0)); - float * dftr = d->dftr.at(threadId); - fftwf_complex * dftc = d->dftc.at(threadId); - fftwf_complex * dftc2 = d->dftc2.at(threadId); + float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = 
d->dftc2.at(threadId).get(); for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->process[plane]) { const int width = d->padWidth[plane]; const int height = d->padHeight[plane]; const int eheight = d->eheight[plane]; - const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(T); - const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float); - const T * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0)); + const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0)); float * ebpSaved = ebuff; memset(ebuff, 0, ebpStride * height * sizeof(float)); for (int y = 0; y < eheight; y += d->inc) { for (int x = 0; x <= width - d->sbsize; x += d->inc) { - proc0(srcp + x, d->hw, dftr, srcStride, d->sbsize, d->divisor); + proc0(srcp + x, hw, dftr, srcStride, d->sbsize, d->srcScale); - fftwf_execute_dft_r2c(d->ft, dftr, dftc); + fftwf_execute_dft_r2c(ft, dftr, dftc); if (d->zmean) - removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2)); + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); - d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2); + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? &d->f0beta : pmins, pmaxs, sigmas2); if (d->zmean) addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); - fftwf_execute_dft_c2r(d->fti, dftc, dftr); + fftwf_execute_dft_c2r(fti, dftc, dftr); if (d->type & 1) { // spatial overlapping - if (!(d->sbsize & 3)) - proc1(dftr, d->hw, ebpSaved + x, d->sbsize, ebpStride); + if (!(d->sbsize & (Vec4f().size() - 1))) + proc1(dftr, hw, ebpSaved + x, d->sbsize, ebpStride); else - proc1Partial(dftr, d->hw, ebpSaved + x, d->sbsize, ebpStride); + proc1Partial(dftr, hw, ebpSaved + x, d->sbsize, ebpStride); } else { - ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * d->hw[d->sbd1 * d->sbsize + d->sbd1]; + ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * hw[d->sbd1 * d->sbsize + d->sbd1]; } } @@ -388,61 +241,67 @@ void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, c const int dstWidth = vsapi->getFrameWidth(dst, plane); const int dstHeight = vsapi->getFrameHeight(dst, plane); - const int dstStride = vsapi->getStride(dst, plane) / sizeof(T); - T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; - cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak); + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); } } } -template void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept; +template +void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT 
d, const VSAPI * vsapi) noexcept { + const float * hw = d->hw.get(); + const float * sigmas = d->sigmas.get(); + const float * sigmas2 = d->sigmas2.get(); + const float * pmins = d->pmins.get(); + const float * pmaxs = d->pmaxs.get(); + const fftwf_complex * dftgc = d->dftgc.get(); + fftwf_plan ft = d->ft.get(); + fftwf_plan fti = d->fti.get(); -template -void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept { const auto threadId = std::this_thread::get_id(); - float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0)); - float * dftr = d->dftr.at(threadId); - fftwf_complex * dftc = d->dftc.at(threadId); - fftwf_complex * dftc2 = d->dftc2.at(threadId); + float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0)); + float * dftr = d->dftr.at(threadId).get(); + fftwf_complex * dftc = d->dftc.at(threadId).get(); + fftwf_complex * dftc2 = d->dftc2.at(threadId).get(); for (int plane = 0; plane < d->vi->format->numPlanes; plane++) { if (d->process[plane]) { const int width = d->padWidth[plane]; const int height = d->padHeight[plane]; const int eheight = d->eheight[plane]; - const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(T); - const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float); - const T * srcp[15] = {}; + const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(pixel_t); + const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float); + + const pixel_t * srcp[15] = {}; for (int i = 0; i < d->tbsize; i++) - srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0)); + srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0)); memset(ebuff, 0, ebpStride * height * sizeof(float)); for (int y = 0; y < eheight; y += d->inc) { for (int x = 0; x <= width - d->sbsize; x += d->inc) { for (int z = 0; z < d->tbsize; z++) - proc0(srcp[z] + x, d->hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->divisor); + proc0(srcp[z] + x, hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->srcScale); - fftwf_execute_dft_r2c(d->ft, dftr, dftc); + fftwf_execute_dft_r2c(ft, dftr, dftc); if (d->zmean) - removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2)); + removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2)); - d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2); + d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? 
&d->f0beta : pmins, pmaxs, sigmas2); if (d->zmean) addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2)); - fftwf_execute_dft_c2r(d->fti, dftc, dftr); + fftwf_execute_dft_c2r(fti, dftc, dftr); if (d->type & 1) { // spatial overlapping - if (!(d->sbsize & 3)) - proc1(dftr + pos * d->barea, d->hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); + if (!(d->sbsize & (Vec4f().size() - 1))) + proc1(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); else - proc1Partial(dftr + pos * d->barea, d->hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); + proc1Partial(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride); } else { - ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * d->hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1]; + ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1]; } } @@ -452,15 +311,27 @@ void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const const int dstWidth = vsapi->getFrameWidth(dst, plane); const int dstHeight = vsapi->getFrameHeight(dst, plane); - const int dstStride = vsapi->getStride(dst, plane) / sizeof(T); - T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); + const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t); + pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane)); const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2; - cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak); + cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak); } } } -template void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept; -template void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept; +template void filter_sse2<0>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_sse2<1>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_sse2<2>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_sse2<3>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_sse2<4>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_sse2<5>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; +template void filter_sse2<6>(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept; + +template void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const 
DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; + +template void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; +template void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept; #endif diff --git a/DFTTest/VCL2/LICENSE b/DFTTest/VCL2/LICENSE new file mode 100644 index 0000000..fd2deda --- /dev/null +++ b/DFTTest/VCL2/LICENSE @@ -0,0 +1,191 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + + Copyright 2012-2019 Agner Fog. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/DFTTest/VCL2/instrset.h b/DFTTest/VCL2/instrset.h new file mode 100644 index 0000000..34e34b7 --- /dev/null +++ b/DFTTest/VCL2/instrset.h @@ -0,0 +1,1485 @@ +/**************************** instrset.h ********************************** +* Author: Agner Fog +* Date created: 2012-05-30 +* Last modified: 2020-06-08 +* Version: 2.01.03 +* Project: vector class library +* Description: +* Header file for various compiler-specific tasks as well as common +* macros and templates. This file contains: +* +* > Selection of the supported instruction set +* > Defines compiler version macros +* > Undefines certain macros that prevent function overloading +* > Helper functions that depend on instruction set, compiler, or platform +* > Common templates for permute, blend, etc. +* +* For instructions, see vcl_manual.pdf +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +******************************************************************************/ + +#ifndef INSTRSET_H +#define INSTRSET_H 20102 + + +// Allow the use of floating point permute instructions on integer vectors. +// Some CPU's have an extra latency of 1 or 2 clock cycles for this, but +// it may still be faster than alternative implementations: +#define ALLOW_FP_PERMUTE true + + +// Macro to indicate 64 bit mode +#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! defined(__x86_64__) +#define __x86_64__ 1 // There are many different macros for this, decide on only one +#endif + +// The following values of INSTRSET are currently defined: +// 2: SSE2 +// 3: SSE3 +// 4: SSSE3 +// 5: SSE4.1 +// 6: SSE4.2 +// 7: AVX +// 8: AVX2 +// 9: AVX512F +// 10: AVX512BW/DQ/VL +// In the future, INSTRSET = 11 may include AVX512VBMI and AVX512VBMI2, but this +// decision cannot be made before the market situation for CPUs with these +// instruction sets is known (these future instruction set extensions are already +// used in some VCL functions and tested with an emulator) + +// Find instruction set from compiler macros if INSTRSET is not defined. +// Note: Most of these macros are not defined in Microsoft compilers +#ifndef INSTRSET +#if defined ( __AVX512VL__ ) && defined ( __AVX512BW__ ) && defined ( __AVX512DQ__ ) +#define INSTRSET 10 +#elif defined ( __AVX512F__ ) || defined ( __AVX512__ ) +#define INSTRSET 9 +#elif defined ( __AVX2__ ) +#define INSTRSET 8 +#elif defined ( __AVX__ ) +#define INSTRSET 7 +#elif defined ( __SSE4_2__ ) +#define INSTRSET 6 +#elif defined ( __SSE4_1__ ) +#define INSTRSET 5 +#elif defined ( __SSSE3__ ) +#define INSTRSET 4 +#elif defined ( __SSE3__ ) +#define INSTRSET 3 +#elif defined ( __SSE2__ ) || defined ( __x86_64__ ) +#define INSTRSET 2 +#elif defined ( __SSE__ ) +#define INSTRSET 1 +#elif defined ( _M_IX86_FP ) // Defined in MS compiler. 
1: SSE, 2: SSE2 +#define INSTRSET _M_IX86_FP +#else +#define INSTRSET 0 +#endif // instruction set defines +#endif // INSTRSET + +// Include the appropriate header file for intrinsic functions +#if INSTRSET > 7 // AVX2 and later +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) +#include // x86intrin.h includes header files for whatever instruction + // sets are specified on the compiler command line, such as: + // xopintrin.h, fma4intrin.h +#else +#include // MS/Intel version of immintrin.h covers AVX and later +#endif // __GNUC__ +#elif INSTRSET == 7 +#include // AVX +#elif INSTRSET == 6 +#include // SSE4.2 +#elif INSTRSET == 5 +#include // SSE4.1 +#elif INSTRSET == 4 +#include // SSSE3 +#elif INSTRSET == 3 +#include // SSE3 +#elif INSTRSET == 2 +#include // SSE2 +#elif INSTRSET == 1 +#include // SSE +#endif // INSTRSET + +#if INSTRSET >= 8 && !defined(__FMA__) +// Assume that all processors that have AVX2 also have FMA3 +#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) +// Prevent error message in g++ and Clang when using FMA intrinsics with avx2: +#if !defined(DISABLE_WARNING_AVX2_WITHOUT_FMA) +#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher" +#endif +#elif ! defined (__clang__) +#define __FMA__ 1 +#endif +#endif + +// AMD instruction sets +#if defined (__XOP__) || defined (__FMA4__) +#ifdef __GNUC__ +#include // AMD XOP (Gnu) +#else +#include // AMD XOP (Microsoft) +#endif // __GNUC__ +#elif defined (__SSE4A__) // AMD SSE4A +#include +#endif // __XOP__ + +// FMA3 instruction set +#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__)) && ! defined (__INTEL_COMPILER) +#include +#endif // __FMA__ + +// FMA4 instruction set +#if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__)) +#include // must have both x86intrin.h and fma4intrin.h, don't know why +#endif // __FMA4__ + + +#include // Define integer types with known size +#include // define abs(int) + +#ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler +#include // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int) +#endif // _MSC_VER + + +// functions in instrset_detect.cpp: +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + int instrset_detect(void); // tells which instruction sets are supported + bool hasFMA3(void); // true if FMA3 instructions supported + bool hasFMA4(void); // true if FMA4 instructions supported + bool hasXOP(void); // true if XOP instructions supported + bool hasAVX512ER(void); // true if AVX512ER instructions supported + bool hasAVX512VBMI(void); // true if AVX512VBMI instructions supported + bool hasAVX512VBMI2(void); // true if AVX512VBMI2 instructions supported +#ifdef VCL_NAMESPACE +} +#endif + +// functions in physical_processors.cpp: +int physicalProcessors(int * logical_processors = 0); + + +// GCC version +#if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__) +#define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__)) +#endif + +// Clang version +#if defined (__clang__) +#define CLANG_VERSION ((__clang_major__) * 10000 + (__clang_minor__) * 100 + (__clang_patchlevel__)) +// Problem: The version number is not consistent across platforms +// http://llvm.org/bugs/show_bug.cgi?id=12643 +// Apple bug 18746972 +#endif + +// Fix problem with non-overloadable macros named min and max in WinDef.h +#ifdef _MSC_VER +#if defined (_WINDEF_) && defined(min) && defined(max) +#undef min +#undef max +#endif +#ifndef NOMINMAX +#define NOMINMAX +#endif + +// 
warning for poor support for AVX512F in MS compiler +#ifndef __INTEL_COMPILER +#if INSTRSET == 9 +#pragma message("Warning: MS compiler cannot generate code for AVX512F without AVX512DQ") +#endif +#if _MSC_VER < 1920 && INSTRSET > 8 +#pragma message("Warning: Your compiler has poor support for AVX512. Code may be erroneous.\nPlease use a newer compiler version or a different compiler!") +#endif +#endif // __INTEL_COMPILER +#endif // _MSC_VER + +/* Intel compiler problem: +The Intel compiler currently cannot compile version 2.00 of VCL. It seems to have +a problem with constexpr function returns not being constant enough. +*/ +#if defined(__INTEL_COMPILER) && __INTEL_COMPILER < 9999 +#error The Intel compiler version 19.00 cannot compile VCL version 2. Use Version 1.xx of VCL instead +#endif + +/* Clang problem: +The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical. +See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164 +Additional problem: The version number is not consistent across platforms. The Apple build has +different version numbers. We have to rely on __apple_build_version__ on the Mac platform: +http://llvm.org/bugs/show_bug.cgi?id=12643 +We have to make switches here when - hopefully - the error some day has been fixed. +We need different version checks with and whithout __apple_build_version__ +*/ +#if (defined (__clang__) || defined(__apple_build_version__)) && !defined(__INTEL_COMPILER) +#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY +#endif + +#if defined (GCC_VERSION) && GCC_VERSION < 99999 && !defined(__clang__) +#define ZEXT_MISSING // Gcc 7.4.0 does not have _mm256_zextsi128_si256 and similar functions +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + +// Constant for indicating don't care in permute and blend functions. +// V_DC is -256 in Vector class library version 1.xx +// V_DC can be any value less than -1 in Vector class library version 2.00 +constexpr int V_DC = -256; + + +/***************************************************************************** +* +* Helper functions that depend on instruction set, compiler, or platform +* +*****************************************************************************/ + +// Define interface to cpuid instruction. +// input: functionnumber = leaf (eax), ecxleaf = subleaf(ecx) +// output: output[0] = eax, output[1] = ebx, output[2] = ecx, output[3] = edx +static inline void cpuid(int output[4], int functionnumber, int ecxleaf = 0) { +#if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax + int a, b, c, d; + __asm("cpuid" : "=a"(a), "=b"(b), "=c"(c), "=d"(d) : "a"(functionnumber), "c"(ecxleaf) : ); + output[0] = a; + output[1] = b; + output[2] = c; + output[3] = d; + +#elif defined (_MSC_VER) // Microsoft compiler, intrin.h included + __cpuidex(output, functionnumber, ecxleaf); // intrinsic function for CPUID + +#else // unknown platform. try inline assembly with masm/intel syntax + __asm { + mov eax, functionnumber + mov ecx, ecxleaf + cpuid; + mov esi, output + mov[esi], eax + mov[esi + 4], ebx + mov[esi + 8], ecx + mov[esi + 12], edx + } +#endif +} + + +// Define popcount function. Gives sum of bits +#if INSTRSET >= 6 // SSE4.2 +// popcnt instruction is not officially part of the SSE4.2 instruction set, +// but available in all known processors with SSE4.2 +static inline uint32_t vml_popcnt(uint32_t a) { + return (uint32_t)_mm_popcnt_u32(a); // Intel intrinsic. 
Supported by gcc and clang +} +#ifdef __x86_64__ +static inline int64_t vml_popcnt(uint64_t a) { + return _mm_popcnt_u64(a); // Intel intrinsic. +} +#else // 32 bit mode +static inline int64_t vml_popcnt(uint64_t a) { + return _mm_popcnt_u32(uint32_t(a >> 32)) + _mm_popcnt_u32(uint32_t(a)); +} +#endif +#else // no SSE4.2 +static inline uint32_t vml_popcnt(uint32_t a) { + // popcnt instruction not available + uint32_t b = a - ((a >> 1) & 0x55555555); + uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333); + uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F; + uint32_t e = d * 0x01010101; + return e >> 24; +} + +static inline int32_t vml_popcnt(uint64_t a) { + return vml_popcnt(uint32_t(a >> 32)) + vml_popcnt(uint32_t(a)); +} + +#endif + +// Define bit-scan-forward function. Gives index to lowest set bit +#if defined (__GNUC__) || defined(__clang__) + // gcc and Clang have no bit_scan_forward intrinsic +#if defined(__clang__) // fix clang bug + // Clang uses a k register as parameter a when inlined from horizontal_find_first +__attribute__((noinline)) +#endif +static uint32_t bit_scan_forward(uint32_t a) { + uint32_t r; + __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : ); + return r; +} +static inline uint32_t bit_scan_forward(uint64_t a) { + uint32_t lo = uint32_t(a); + if (lo) return bit_scan_forward(lo); + uint32_t hi = uint32_t(a >> 32); + return bit_scan_forward(hi) + 32; +} + +#else // other compilers +static inline uint32_t bit_scan_forward(uint32_t a) { + unsigned long r; + _BitScanForward(&r, a); // defined in intrin.h for MS and Intel compilers + return r; +} +#ifdef __x86_64__ +static inline uint32_t bit_scan_forward(uint64_t a) { + unsigned long r; + _BitScanForward64(&r, a); // defined in intrin.h for MS and Intel compilers + return (uint32_t)r; +} +#else +static inline uint32_t bit_scan_forward(uint64_t a) { + uint32_t lo = uint32_t(a); + if (lo) return bit_scan_forward(lo); + uint32_t hi = uint32_t(a >> 32); + return bit_scan_forward(hi) + 32; +} +#endif +#endif + + +// Define bit-scan-reverse function. 
Gives index to highest set bit = floor(log2(a)) +#if defined (__GNUC__) || defined(__clang__) +static inline uint32_t bit_scan_reverse(uint32_t a) __attribute__((pure)); +static inline uint32_t bit_scan_reverse(uint32_t a) { + uint32_t r; + __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : ); + return r; +} +#ifdef __x86_64__ +static inline uint32_t bit_scan_reverse(uint64_t a) { + uint64_t r; + __asm("bsrq %1, %0" : "=r"(r) : "r"(a) : ); + return r; +} +#else // 32 bit mode +static inline uint32_t bit_scan_reverse(uint64_t a) { + uint64_t ahi = a >> 32; + if (ahi == 0) return bit_scan_reverse(uint32_t(a)); + else return bit_scan_reverse(uint32_t(ahi)) + 32; +} +#endif +#else +static inline uint32_t bit_scan_reverse(uint32_t a) { + unsigned long r; + _BitScanReverse(&r, a); // defined in intrin.h for MS and Intel compilers + return r; +} +#ifdef __x86_64__ +static inline uint32_t bit_scan_reverse(uint64_t a) { + unsigned long r; + _BitScanReverse64(&r, a); // defined in intrin.h for MS and Intel compilers + return r; +} +#else // 32 bit mode +static inline uint32_t bit_scan_reverse(uint64_t a) { + uint64_t ahi = a >> 32; + if (ahi == 0) return bit_scan_reverse(uint32_t(a)); + else return bit_scan_reverse(uint32_t(ahi)) + 32; +} +#endif +#endif + +// Same function, for compile-time constants +constexpr int bit_scan_reverse_const(uint64_t const n) { + if (n == 0) return -1; + uint64_t a = n, b = 0, j = 64, k = 0; + do { + j >>= 1; + k = (uint64_t)1 << j; + if (a >= k) { + a >>= j; + b += j; + } + } while (j > 0); + return int(b); +} + + +/***************************************************************************** +* +* Common templates +* +*****************************************************************************/ + +// Template class to represent compile-time integer constant +template class Const_int_t {}; // represent compile-time signed integer constant +template class Const_uint_t {}; // represent compile-time unsigned integer constant +#define const_int(n) (Const_int_t ()) // n must be compile-time integer constant +#define const_uint(n) (Const_uint_t()) // n must be compile-time unsigned integer constant + + +// template for producing quiet NAN +template +static inline VTYPE nan_vec(uint32_t payload = 0x100) { + if constexpr ((VTYPE::elementtype() & 1) != 0) { // double + union { + uint64_t q; + double f; + } ud; + // n is left justified to avoid loss of NAN payload when converting to float + ud.q = 0x7FF8000000000000 | uint64_t(payload) << 29; + return VTYPE(ud.f); + } + // float will be converted to double if necessary + union { + uint32_t i; + float f; + } uf; + uf.i = 0x7FC00000 | (payload & 0x003FFFFF); + return VTYPE(uf.f); +} + + +// Test if a parameter is a compile-time constant +/* Unfortunately, this works only for macro parameters, not for inline function parameters. + I hope that some solution will appear in the future, but for now it appears to be + impossible to check if a function parameter is a compile-time constant. 
+ This would be useful in operator / and in function pow: + #if defined(__GNUC__) || defined (__clang__) + #define is_constant(a) __builtin_constant_p(a) + #else + #define is_constant(a) false + #endif +*/ + + +/***************************************************************************** +* +* Helper functions for permute and blend functions +* +****************************************************************************** +Rules for constexpr functions: + +> All variable declarations must include initialization + +> Do not put variable declarations inside a for-clause, e.g. avoid: for (int i=0; .. + Instead, you have to declare the loop counter before the for-loop. + +> Do not make constexpr functions that return vector types. This requires type + punning with a union, which is not allowed in constexpr functions under C++17. + It may be possible under C++20 + +*****************************************************************************/ + +// Define type for Encapsulated array to use as return type: +template +struct EList { + T a[N]; +}; + + +// get_inttype: get an integer of a size that matches the element size +// of vector class V with the value -1 +template +constexpr auto get_inttype() { + constexpr int elementsize = sizeof(V) / V::size(); // size of vector elements + + if constexpr (elementsize >= 8) { + return -int64_t(1); + } + else if constexpr (elementsize >= 4) { + return int32_t(-1); + } + else if constexpr (elementsize >= 2) { + return int16_t(-1); + } + else { + return int8_t(-1); + } +} + + +// zero_mask: return a compact bit mask mask for zeroing using AVX512 mask. +// Parameter a is a reference to a constexpr int array of permutation indexes +template +constexpr auto zero_mask(int const (&a)[N]) { + uint64_t mask = 0; + int i = 0; + + for (i = 0; i < N; i++) { + if (a[i] >= 0) mask |= uint64_t(1) << i; + } + if constexpr (N <= 8 ) return uint8_t(mask); + else if constexpr (N <= 16) return uint16_t(mask); + else if constexpr (N <= 32) return uint32_t(mask); + else return mask; +} + + +// zero_mask_broad: return a broad byte mask for zeroing. +// Parameter a is a reference to a constexpr int array of permutation indexes +template +constexpr auto zero_mask_broad(int const (&A)[V::size()]) { + constexpr int N = V::size(); // number of vector elements + typedef decltype(get_inttype()) Etype; // element type + EList u = {{0}}; // list for return + int i = 0; + for (i = 0; i < N; i++) { + u.a[i] = A[i] >= 0 ? get_inttype() : 0; + } + return u; // return encapsulated array +} + + +// make_bit_mask: return a compact mask of bits from a list of N indexes: +// B contains options indicating how to gather the mask +// bit 0-7 in B indicates which bit in each index to collect +// bit 8 = 0x100: set 1 in the lower half of the bit mask if the indicated bit is 1. +// bit 8 = 0 : set 1 in the lower half of the bit mask if the indicated bit is 0. +// bit 9 = 0x200: set 1 in the upper half of the bit mask if the indicated bit is 1. +// bit 9 = 0 : set 1 in the upper half of the bit mask if the indicated bit is 0. 
+// bit 10 = 0x400: set 1 in the bit mask if the corresponding index is -1 or V_DC +// Parameter a is a reference to a constexpr int array of permutation indexes +template +constexpr uint64_t make_bit_mask(int const (&a)[N]) { + uint64_t r = 0; // return value + uint8_t j = uint8_t(B & 0xFF); // index to selected bit + uint64_t s = 0; // bit number i in r + uint64_t f = 0; // 1 if bit not flipped + int i = 0; + for (i = 0; i < N; i++) { + int ix = a[i]; + if (ix < 0) { // -1 or V_DC + s = (B >> 10) & 1; + } + else { + s = ((uint32_t)ix >> j) & 1; // extract selected bit + if (i < N/2) { + f = (B >> 8) & 1; // lower half + } + else { + f = (B >> 9) & 1; // upper half + } + s ^= f ^ 1; // flip bit if needed + } + r |= uint64_t(s) << i; // set bit in return value + } + return r; +} + + +// make_broad_mask: Convert a bit mask m to a broad mask +// The return value will be a broad boolean mask with elementsize matching vector class V +template +constexpr auto make_broad_mask(uint64_t const m) { + constexpr int N = V::size(); // number of vector elements + typedef decltype(get_inttype()) Etype; // element type + EList u = {{0}}; // list for returning + int i = 0; + for (i = 0; i < N; i++) { + u.a[i] = ((m >> i) & 1) != 0 ? get_inttype() : 0; + } + return u; // return encapsulated array +} + + +// perm_mask_broad: return a mask for permutation by a vector register index. +// Parameter A is a reference to a constexpr int array of permutation indexes +template +constexpr auto perm_mask_broad(int const (&A)[V::size()]) { + constexpr int N = V::size(); // number of vector elements + typedef decltype(get_inttype()) Etype; // vector element type + EList u = {{0}}; // list for returning + int i = 0; + for (i = 0; i < N; i++) { + u.a[i] = Etype(A[i]); + } + return u; // return encapsulated array +} + + +// perm_flags: returns information about how a permute can be implemented. +// The return value is composed of these flag bits: +const int perm_zeroing = 1; // needs zeroing +const int perm_perm = 2; // permutation needed +const int perm_allzero = 4; // all is zero or don't care +const int perm_largeblock = 8; // fits permute with a larger block size (e.g permute Vec2q instead of Vec4i) +const int perm_addz = 0x10; // additional zeroing needed after permute with larger block size or shift +const int perm_addz2 = 0x20; // additional zeroing needed after perm_zext, perm_compress, or perm_expand +const int perm_cross_lane = 0x40; // permutation crossing 128-bit lanes +const int perm_same_pattern = 0x80; // same permute pattern in all 128-bit lanes +const int perm_punpckh = 0x100; // permutation pattern fits punpckh instruction +const int perm_punpckl = 0x200; // permutation pattern fits punpckl instruction +const int perm_rotate = 0x400; // permutation pattern fits rotation within lanes. 4 bit count returned in bit perm_rot_count +const int perm_shright = 0x1000; // permutation pattern fits shift right within lanes. 4 bit count returned in bit perm_rot_count +const int perm_shleft = 0x2000; // permutation pattern fits shift left within lanes. negative count returned in bit perm_rot_count +const int perm_rotate_big = 0x4000; // permutation pattern fits rotation across lanes. 6 bit count returned in bit perm_rot_count +const int perm_broadcast = 0x8000; // permutation pattern fits broadcast of a single element. 
+const int perm_zext = 0x10000; // permutation pattern fits zero extension +const int perm_compress = 0x20000; // permutation pattern fits vpcompress instruction +const int perm_expand = 0x40000; // permutation pattern fits vpexpand instruction +const int perm_outofrange = 0x10000000; // index out of range +const int perm_rot_count = 32; // rotate or shift count is in bits perm_rot_count to perm_rot_count+3 +const int perm_ipattern = 40; // pattern for pshufd is in bit perm_ipattern to perm_ipattern + 7 if perm_same_pattern and elementsize >= 4 + +template +constexpr uint64_t perm_flags(int const (&a)[V::size()]) { + // a is a reference to a constexpr array of permutation indexes + // V is a vector class + constexpr int N = V::size(); // number of elements + uint64_t r = perm_largeblock | perm_same_pattern | perm_allzero; // return value + uint32_t i = 0; // loop counter + int j = 0; // loop counter + int ix = 0; // index number i + const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes + const uint32_t lanesize = N / nlanes; // elements per lane + const uint32_t elementsize = sizeof(V) / N; // size of each vector element + uint32_t lane = 0; // current lane + uint32_t rot = 999; // rotate left count + int32_t broadc = 999; // index to broadcasted element + uint32_t patfail = 0; // remember certain patterns that do not fit + uint32_t addz2 = 0; // remember certain patterns need extra zeroing + int32_t compresslasti = -1; // last index in perm_compress fit + int32_t compresslastp = -1; // last position in perm_compress fit + int32_t expandlasti = -1; // last index in perm_expand fit + int32_t expandlastp = -1; // last position in perm_expand fit + + int lanepattern[lanesize] = {0}; // pattern in each lane + + for (i = 0; i < N; i++) { // loop through indexes + ix = a[i]; // current index + // meaning of ix: -1 = set to zero, V_DC = don't care, non-negative value = permute. + if (ix == -1) { + r |= perm_zeroing; // zeroing requested + } + else if (ix != V_DC && uint32_t(ix) >= N) { + r |= perm_outofrange; // index out of range + } + if (ix >= 0) { + r &= ~ perm_allzero; // not all zero + if (ix != (int)i) r |= perm_perm; // needs permutation + if (broadc == 999) broadc = ix; // remember broadcast index + else if (broadc != ix) broadc = 1000; // does not fit broadcast + } + // check if pattern fits a larger block size: + // even indexes must be even, odd indexes must fit the preceding even index + 1 + if ((i & 1) == 0) { // even index + if (ix >= 0 && (ix & 1)) r &= ~perm_largeblock;// not even. does not fit larger block size + int iy = a[i + 1]; // next odd index + if (iy >= 0 && (iy & 1) == 0) r &= ~ perm_largeblock; // not odd. 
does not fit larger block size + if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ perm_largeblock; // does not fit preceding index + 1 + if (ix == -1 && iy >= 0) r |= perm_addz; // needs additional zeroing at current block size + if (iy == -1 && ix >= 0) r |= perm_addz; // needs additional zeroing at current block size + } + lane = i / lanesize; // current lane + if (lane == 0) { // first lane, or no pattern yet + lanepattern[i] = ix; // save pattern + } + // check if crossing lanes + if (ix >= 0) { + uint32_t lanei = (uint32_t)ix / lanesize; // source lane + if (lanei != lane) r |= perm_cross_lane; // crossing lane + } + // check if same pattern in all lanes + if (lane != 0 && ix >= 0) { // not first lane + int j1 = i - int(lane * lanesize); // index into lanepattern + int jx = ix - int(lane * lanesize); // pattern within lane + if (jx < 0 || jx >= (int)lanesize) r &= ~perm_same_pattern; // source is in another lane + if (lanepattern[j1] < 0) { + lanepattern[j1] = jx; // pattern not known from previous lane + } + else { + if (lanepattern[j1] != jx) r &= ~perm_same_pattern; // not same pattern + } + } + if (ix >= 0) { + // check if pattern fits zero extension (perm_zext) + if (uint32_t(ix*2) != i) { + patfail |= 1; // does not fit zero extension + } + // check if pattern fits compress (perm_compress) + if (ix > compresslasti && ix - compresslasti >= (int)i - compresslastp) { + if ((int)i - compresslastp > 1) addz2 |= 2;// perm_compress may need additional zeroing + compresslasti = ix; compresslastp = i; + } + else { + patfail |= 2; // does not fit perm_compress + } + // check if pattern fits expand (perm_expand) + if (ix > expandlasti && ix - expandlasti <= (int)i - expandlastp) { + if (ix - expandlasti > 1) addz2 |= 4; // perm_expand may need additional zeroing + expandlasti = ix; expandlastp = i; + } + else { + patfail |= 4; // does not fit perm_compress + } + } + else if (ix == -1) { + if ((i & 1) == 0) addz2 |= 1; // zero extension needs additional zeroing + } + } + if (!(r & perm_perm)) return r; // more checks are superfluous + + if (!(r & perm_largeblock)) r &= ~ perm_addz; // remove irrelevant flag + if (r & perm_cross_lane) r &= ~ perm_same_pattern; // remove irrelevant flag + if ((patfail & 1) == 0) { + r |= perm_zext; // fits zero extension + if ((addz2 & 1) != 0) r |= perm_addz2; + } + else if ((patfail & 2) == 0) { + r |= perm_compress; // fits compression + if ((addz2 & 2) != 0) { // check if additional zeroing needed + for (j = 0; j < compresslastp; j++) { + if (a[j] == -1) r |= perm_addz2; + } + } + } + else if ((patfail & 4) == 0) { + r |= perm_expand; // fits expansion + if ((addz2 & 4) != 0) { // check if additional zeroing needed + for (j = 0; j < expandlastp; j++) { + if (a[j] == -1) r |= perm_addz2; + } + } + } + + if (r & perm_same_pattern) { + // same pattern in all lanes. 
check if it fits specific patterns + bool fit = true; + // fit shift or rotate + for (i = 0; i < lanesize; i++) { + if (lanepattern[i] >= 0) { + uint32_t rot1 = uint32_t(lanepattern[i] + lanesize - i) % lanesize; + if (rot == 999) { + rot = rot1; + } + else { // check if fit + if (rot != rot1) fit = false; + } + } + } + rot &= lanesize-1; // prevent out of range values + if (fit) { // fits rotate, and possibly shift + uint64_t rot2 = (rot * elementsize) & 0xF; // rotate right count in bytes + r |= rot2 << perm_rot_count; // put shift/rotate count in output bit 16-19 +#if INSTRSET >= 4 // SSSE3 + r |= perm_rotate; // allow palignr +#endif + // fit shift left + fit = true; + for (i = 0; i < lanesize-rot; i++) { // check if first rot elements are zero or don't care + if (lanepattern[i] >= 0) fit = false; + } + if (fit) { + r |= perm_shleft; + for (; i < lanesize; i++) if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed + } + // fit shift right + fit = true; + for (i = lanesize-(uint32_t)rot; i < lanesize; i++) { // check if last (lanesize-rot) elements are zero or don't care + if (lanepattern[i] >= 0) fit = false; + } + if (fit) { + r |= perm_shright; + for (i = 0; i < lanesize-rot; i++) { + if (lanepattern[i] == -1) r |= perm_addz; // additional zeroing needed + } + } + } + // fit punpckhi + fit = true; + uint32_t j2 = lanesize / 2; + for (i = 0; i < lanesize; i++) { + if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false; + if ((i & 1) != 0) j2++; + } + if (fit) r |= perm_punpckh; + // fit punpcklo + fit = true; + j2 = 0; + for (i = 0; i < lanesize; i++) { + if (lanepattern[i] >= 0 && lanepattern[i] != (int)j2) fit = false; + if ((i & 1) != 0) j2++; + } + if (fit) r |= perm_punpckl; + // fit pshufd + if (elementsize >= 4) { + uint64_t p = 0; + for (i = 0; i < lanesize; i++) { + if (lanesize == 4) { + p |= (lanepattern[i] & 3) << 2 * i; + } + else { // lanesize = 2 + p |= ((lanepattern[i] & 1) * 10 + 4) << 4 * i; + } + } + r |= p << perm_ipattern; + } + } +#if INSTRSET >= 7 + else { // not same pattern in all lanes + if constexpr (nlanes > 1) { // Try if it fits big rotate + for (i = 0; i < N; i++) { + ix = a[i]; + if (ix >= 0) { + uint32_t rot2 = (ix + N - i) % N; // rotate count + if (rot == 999) { + rot = rot2; // save rotate count + } + else if (rot != rot2) { + rot = 1000; break; // does not fit big rotate + } + } + } + if (rot < N) { // fits big rotate + r |= perm_rotate_big | (uint64_t)rot << perm_rot_count; + } + } + } +#endif + if (broadc < 999 && (r & (perm_rotate|perm_shright|perm_shleft|perm_rotate_big)) == 0) { + r |= perm_broadcast | (uint64_t)broadc << perm_rot_count; // fits broadcast + } + return r; +} + + +// compress_mask: returns a bit mask to use for compression instruction. +// It is presupposed that perm_flags indicates perm_compress. +// Additional zeroing is needed if perm_flags indicates perm_addz2 +template +constexpr uint64_t compress_mask(int const (&a)[N]) { + // a is a reference to a constexpr array of permutation indexes + int ix = 0, lasti = -1, lastp = -1; + uint64_t m = 0; + int i = 0; int j = 1; // loop counters + for (i = 0; i < N; i++) { + ix = a[i]; // permutation index + if (ix >= 0) { + m |= (uint64_t)1 << ix; // mask for compression source + for (j = 1; j < i - lastp; j++) { + m |= (uint64_t)1 << (lasti + j); // dummy filling source + } + lastp = i; lasti = ix; + } + } + return m; +} + +// expand_mask: returns a bit mask to use for expansion instruction. +// It is presupposed that perm_flags indicates perm_expand. 
+// Additional zeroing is needed if perm_flags indicates perm_addz2 +template +constexpr uint64_t expand_mask(int const (&a)[N]) { + // a is a reference to a constexpr array of permutation indexes + int ix = 0, lasti = -1, lastp = -1; + uint64_t m = 0; + int i = 0; int j = 1; + for (i = 0; i < N; i++) { + ix = a[i]; // permutation index + if (ix >= 0) { + m |= (uint64_t)1 << i; // mask for expansion destination + for (j = 1; j < ix - lasti; j++) { + m |= (uint64_t)1 << (lastp + j); // dummy filling destination + } + lastp = i; lasti = ix; + } + } + return m; +} + +// perm16_flags: returns information about how to permute a vector of 16-bit integers +// Note: It is presupposed that perm_flags reports perm_same_pattern +// The return value is composed of these bits: +// 1: data from low 64 bits to low 64 bits. pattern in bit 32-39 +// 2: data from high 64 bits to high 64 bits. pattern in bit 40-47 +// 4: data from high 64 bits to low 64 bits. pattern in bit 48-55 +// 8: data from low 64 bits to high 64 bits. pattern in bit 56-63 +template +constexpr uint64_t perm16_flags(int const (&a)[V::size()]) { + // a is a reference to a constexpr array of permutation indexes + // V is a vector class + constexpr int N = V::size(); // number of elements + + uint64_t retval = 0; // return value + uint32_t pat[4] = {0,0,0,0}; // permute patterns + uint32_t i = 0; // loop counter + int ix = 0; // index number i + const uint32_t lanesize = 8; // elements per lane + uint32_t lane = 0; // current lane + int lanepattern[lanesize] = {0}; // pattern in each lane + + for (i = 0; i < N; i++) { + ix = a[i]; + lane = i / lanesize; // current lane + if (lane == 0) { + lanepattern[i] = ix; // save pattern + } + else if (ix >= 0) { // not first lane + uint32_t j = i - lane * lanesize; // index into lanepattern + int jx = ix - lane * lanesize; // pattern within lane + if (lanepattern[j] < 0) { + lanepattern[j] = jx; // pattern not known from previous lane + } + } + } + // four patterns: low2low, high2high, high2low, low2high + for (i = 0; i < 4; i++) { + // loop through low pattern + if (lanepattern[i] >= 0) { + if (lanepattern[i] < 4) { // low2low + retval |= 1; + pat[0] |= uint32_t(lanepattern[i] & 3) << (2 * i); + } + else { // high2low + retval |= 4; + pat[2] |= uint32_t(lanepattern[i] & 3) << (2 * i); + } + } + // loop through high pattern + if (lanepattern[i+4] >= 0) { + if (lanepattern[i+4] < 4) { // low2high + retval |= 8; + pat[3] |= uint32_t(lanepattern[i+4] & 3) << (2 * i); + } + else { // high2high + retval |= 2; + pat[1] |= uint32_t(lanepattern[i+4] & 3) << (2 * i); + } + } + } + // join return data + for (i = 0; i < 4; i++) { + retval |= (uint64_t)pat[i] << (32 + i*8); + } + return retval; +} + + +// pshufb_mask: return a broad byte mask for permutation within lanes +// for use with the pshufb instruction (_mm..._shuffle_epi8). 
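// --- Editor's illustration (not part of the VCL source) --------------------------------
// Companion example for expand_mask above, under the same signature assumption.
// The pattern {0, -1, 1, -1} satisfies perm_expand: destination slots 0 and 2 receive
// consecutive source elements 0 and 1, and the zero-masked expand leaves slots 1 and 3
// cleared.
constexpr int example_expand[4] = {0, -1, 1, -1};
static_assert(expand_mask(example_expand) == 0x5, "destination bits 0 and 2 expected");
// ----------------------------------------------------------------------------------------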
+// The pshufb instruction provides fast permutation and zeroing, +// allowing different patterns in each lane but no crossing of lane boundaries +template +constexpr auto pshufb_mask(int const (&A)[V::size()]) { + // Parameter a is a reference to a constexpr array of permutation indexes + // V is a vector class + // oppos = 1 for data from the opposite 128-bit lane in 256-bit vectors + constexpr uint32_t N = V::size(); // number of vector elements + constexpr uint32_t elementsize = sizeof(V) / N; // size of each vector element + constexpr uint32_t nlanes = sizeof(V) / 16; // number of 128 bit lanes in vector + constexpr uint32_t elements_per_lane = N / nlanes; // number of vector elements per lane + + EList u = {{0}}; // list for returning + + uint32_t i = 0; // loop counters + uint32_t j = 0; + int m = 0; + int k = 0; + uint32_t lane = 0; + + for (lane = 0; lane < nlanes; lane++) { // loop through lanes + for (i = 0; i < elements_per_lane; i++) { // loop through elements in lane + // permutation index for element within lane + int8_t p = -1; + int ix = A[m]; + if (ix >= 0) { + ix ^= oppos * elements_per_lane; // flip bit if opposite lane + } + ix -= int(lane * elements_per_lane); // index relative to lane + if (ix >= 0 && ix < (int)elements_per_lane) { // index points to desired lane + p = ix * elementsize; + } + for (j = 0; j < elementsize; j++) { // loop through bytes in element + u.a[k++] = p < 0 ? -1 : p + j; // store byte permutation index + } + m++; + } + } + return u; // return encapsulated array +} + + +// largeblock_perm: return indexes for replacing a permute or blend with +// a certain block size by a permute or blend with the double block size. +// Note: it is presupposed that perm_flags() indicates perm_largeblock +// It is required that additional zeroing is added if perm_flags() indicates perm_addz +template +constexpr EList largeblock_perm(int const (&a)[N]) { + // Parameter a is a reference to a constexpr array of permutation indexes + EList list = {{0}}; // result indexes + int ix = 0; // even index + int iy = 0; // odd index + int iz = 0; // combined index + bool fit_addz = false; // additional zeroing needed at the lower block level + int i = 0; // loop counter + + // check if additional zeroing is needed at current block size + for (i = 0; i < N; i += 2) { + ix = a[i]; // even index + iy = a[i+1]; // odd index + if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) { + fit_addz = true; + } + } + + // loop through indexes + for (i = 0; i < N; i += 2) { + ix = a[i]; // even index + iy = a[i+1]; // odd index + if (ix >= 0) { + iz = ix / 2; // half index + } + else if (iy >= 0) { + iz = iy / 2; + } + else { + iz = ix | iy; // -1 or V_DC. 
-1 takes precedence + if (fit_addz) iz = V_DC; // V_DC, because result will be zeroed later + } + list.a[i/2] = iz; // save to list + } + return list; +} + + +// blend_flags: returns information about how a blend function can be implemented +// The return value is composed of these flag bits: +const int blend_zeroing = 1; // needs zeroing +const int blend_allzero = 2; // all is zero or don't care +const int blend_largeblock = 4; // fits blend with a larger block size (e.g permute Vec2q instead of Vec4i) +const int blend_addz = 8; // additional zeroing needed after blend with larger block size or shift +const int blend_a = 0x10; // has data from a +const int blend_b = 0x20; // has data from b +const int blend_perma = 0x40; // permutation of a needed +const int blend_permb = 0x80; // permutation of b needed +const int blend_cross_lane = 0x100; // permutation crossing 128-bit lanes +const int blend_same_pattern = 0x200; // same permute/blend pattern in all 128-bit lanes +const int blend_punpckhab = 0x1000; // pattern fits punpckh(a,b) +const int blend_punpckhba = 0x2000; // pattern fits punpckh(b,a) +const int blend_punpcklab = 0x4000; // pattern fits punpckl(a,b) +const int blend_punpcklba = 0x8000; // pattern fits punpckl(b,a) +const int blend_rotateab = 0x10000; // pattern fits palignr(a,b) +const int blend_rotateba = 0x20000; // pattern fits palignr(b,a) +const int blend_shufab = 0x40000; // pattern fits shufps/shufpd(a,b) +const int blend_shufba = 0x80000; // pattern fits shufps/shufpd(b,a) +const int blend_rotate_big = 0x100000; // pattern fits rotation across lanes. count returned in bits blend_rotpattern +const int blend_outofrange= 0x10000000; // index out of range +const int blend_shufpattern = 32; // pattern for shufps/shufpd is in bit blend_shufpattern to blend_shufpattern + 7 +const int blend_rotpattern = 40; // pattern for palignr is in bit blend_rotpattern to blend_rotpattern + 7 + +template +constexpr uint64_t blend_flags(int const (&a)[V::size()]) { + // a is a reference to a constexpr array of permutation indexes + // V is a vector class + constexpr int N = V::size(); // number of elements + uint64_t r = blend_largeblock | blend_same_pattern | blend_allzero; // return value + uint32_t iu = 0; // loop counter + int32_t ii = 0; // loop counter + int ix = 0; // index number i + const uint32_t nlanes = sizeof(V) / 16; // number of 128-bit lanes + const uint32_t lanesize = N / nlanes; // elements per lane + uint32_t lane = 0; // current lane + uint32_t rot = 999; // rotate left count + int lanepattern[lanesize] = {0}; // pattern in each lane + if (lanesize == 2 && N <= 8) { + r |= blend_shufab | blend_shufba; // check if it fits shufpd + } + + for (ii = 0; ii < N; ii++) { // loop through indexes + ix = a[ii]; // index + if (ix < 0) { + if (ix == -1) r |= blend_zeroing; // set to zero + else if (ix != V_DC) { + r = blend_outofrange; break; // illegal index + } + } + else { // ix >= 0 + r &= ~ blend_allzero; + if (ix < N) { + r |= blend_a; // data from a + if (ix != ii) r |= blend_perma; // permutation of a + } + else if (ix < 2*N) { + r |= blend_b; // data from b + if (ix != ii + N) r |= blend_permb; // permutation of b + } + else { + r = blend_outofrange; break; // illegal index + } + } + // check if pattern fits a larger block size: + // even indexes must be even, odd indexes must fit the preceding even index + 1 + if ((ii & 1) == 0) { // even index + if (ix >= 0 && (ix&1)) r &= ~blend_largeblock; // not even. 
does not fit larger block size + int iy = a[ii+1]; // next odd index + if (iy >= 0 && (iy & 1) == 0) r &= ~ blend_largeblock; // not odd. does not fit larger block size + if (ix >= 0 && iy >= 0 && iy != ix+1) r &= ~ blend_largeblock; // does not fit preceding index + 1 + if (ix == -1 && iy >= 0) r |= blend_addz; // needs additional zeroing at current block size + if (iy == -1 && ix >= 0) r |= blend_addz; // needs additional zeroing at current block size + } + lane = (uint32_t)ii / lanesize; // current lane + if (lane == 0) { // first lane, or no pattern yet + lanepattern[ii] = ix; // save pattern + } + // check if crossing lanes + if (ix >= 0) { + uint32_t lanei = uint32_t(ix & ~N) / lanesize; // source lane + if (lanei != lane) { + r |= blend_cross_lane; // crossing lane + } + if (lanesize == 2) { // check if it fits pshufd + if (lanei != lane) r &= ~(blend_shufab | blend_shufba); + if ((((ix & N) != 0) ^ ii) & 1) r &= ~blend_shufab; + else r &= ~blend_shufba; + } + } + // check if same pattern in all lanes + if (lane != 0 && ix >= 0) { // not first lane + int j = ii - int(lane * lanesize); // index into lanepattern + int jx = ix - int(lane * lanesize); // pattern within lane + if (jx < 0 || (jx & ~N) >= (int)lanesize) r &= ~blend_same_pattern; // source is in another lane + if (lanepattern[j] < 0) { + lanepattern[j] = jx; // pattern not known from previous lane + } + else { + if (lanepattern[j] != jx) r &= ~blend_same_pattern; // not same pattern + } + } + } + if (!(r & blend_largeblock)) r &= ~ blend_addz; // remove irrelevant flag + if (r & blend_cross_lane) r &= ~ blend_same_pattern; // remove irrelevant flag + if (!(r & (blend_perma | blend_permb))) { + return r; // no permutation. more checks are superfluous + } + if (r & blend_same_pattern) { + // same pattern in all lanes. check if it fits unpack patterns + r |= blend_punpckhab | blend_punpckhba | blend_punpcklab | blend_punpcklba; + for (iu = 0; iu < lanesize; iu++) { // loop through lanepattern + ix = lanepattern[iu]; + if (ix >= 0) { + if ((uint32_t)ix != iu / 2 + (iu & 1) * N) r &= ~ blend_punpcklab; + if ((uint32_t)ix != iu / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpcklba; + if ((uint32_t)ix != (iu + lanesize) / 2 + (iu & 1) * N) r &= ~ blend_punpckhab; + if ((uint32_t)ix != (iu + lanesize) / 2 + ((iu & 1) ^ 1) * N) r &= ~ blend_punpckhba; + } + } +#if INSTRSET >= 4 // SSSE3. 
check if it fits palignr + for (iu = 0; iu < lanesize; iu++) { + ix = lanepattern[iu]; + if (ix >= 0) { + uint32_t t = ix & ~N; + if (ix & N) t += lanesize; + uint32_t tb = (t + 2*lanesize - iu) % (lanesize * 2); + if (rot == 999) { + rot = tb; + } + else { // check if fit + if (rot != tb) rot = 1000; + } + } + } + if (rot < 999) { // firs palignr + if (rot < lanesize) { + r |= blend_rotateba; + } + else { + r |= blend_rotateab; + } + const uint32_t elementsize = sizeof(V) / N; + r |= uint64_t((rot & (lanesize - 1)) * elementsize) << blend_rotpattern; + } +#endif + if (lanesize == 4) { + // check if it fits shufps + r |= blend_shufab | blend_shufba; + for (ii = 0; ii < 2; ii++) { + ix = lanepattern[ii]; + if (ix >= 0) { + if (ix & N) r &= ~ blend_shufab; + else r &= ~ blend_shufba; + } + } + for (; ii < 4; ii++) { + ix = lanepattern[ii]; + if (ix >= 0) { + if (ix & N) r &= ~ blend_shufba; + else r &= ~ blend_shufab; + } + } + if (r & (blend_shufab | blend_shufba)) { // fits shufps/shufpd + uint8_t shufpattern = 0; // get pattern + for (iu = 0; iu < lanesize; iu++) { + shufpattern |= (lanepattern[iu] & 3) << iu * 2; + } + r |= (uint64_t)shufpattern << blend_shufpattern; // return pattern + } + } + } + else if (nlanes > 1) { // not same pattern in all lanes + rot = 999; // check if it fits big rotate + for (ii = 0; ii < N; ii++) { + ix = a[ii]; + if (ix >= 0) { + uint32_t rot2 = (ix + 2 * N - ii) % (2 * N);// rotate count + if (rot == 999) { + rot = rot2; // save rotate count + } + else if (rot != rot2) { + rot = 1000; break; // does not fit big rotate + } + } + } + if (rot < 2 * N) { // fits big rotate + r |= blend_rotate_big | (uint64_t)rot << blend_rotpattern; + } + } + if (lanesize == 2 && (r & (blend_shufab | blend_shufba))) { // fits shufpd. Get pattern + for (ii = 0; ii < N; ii++) { + r |= uint64_t(a[ii] & 1) << (blend_shufpattern + ii); + } + } + return r; +} + +// blend_perm_indexes: return an Indexlist for implementing a blend function as +// two permutations. N = vector size. +// dozero = 0: let unused elements be don't care. The two permutation results must be blended +// dozero = 1: zero unused elements in each permuation. The two permutation results can be OR'ed +// dozero = 2: indexes that are -1 or V_DC are preserved +template +constexpr EList blend_perm_indexes(int const (&a)[N]) { + // a is a reference to a constexpr array of permutation indexes + EList list = {{0}}; // list to return + int u = dozero ? -1 : V_DC; // value to use for unused entries + int j = 0; + + for (j = 0; j < N; j++) { // loop through indexes + int ix = a[j]; // current index + if (ix < 0) { // zero or don't care + if (dozero == 2) { + // list.a[j] = list.a[j + N] = ix; // fails in gcc in complicated cases + list.a[j] = ix; + list.a[j + N] = ix; + } + else { + // list.a[j] = list.a[j + N] = u; + list.a[j] = u; + list.a[j + N] = u; + } + } + else if (ix < N) { // value from a + list.a[j] = ix; + list.a[j+N] = u; + } + else { + list.a[j] = u; // value from b + list.a[j+N] = ix - N; + } + } + return list; +} + +// largeblock_indexes: return indexes for replacing a permute or blend with a +// certain block size by a permute or blend with the double block size. 
+// Note: it is presupposed that perm_flags or blend_flags indicates _largeblock +// It is required that additional zeroing is added if perm_flags or blend_flags +// indicates _addz +template +constexpr EList largeblock_indexes(int const (&a)[N]) { + // Parameter a is a reference to a constexpr array of N permutation indexes + EList list = {{0}}; // list to return + + bool fit_addz = false; // additional zeroing needed at the lower block level + int ix = 0; // even index + int iy = 0; // odd index + int iz = 0; // combined index + int i = 0; // loop counter + + for (i = 0; i < N; i += 2) { + ix = a[i]; // even index + iy = a[i+1]; // odd index + if (ix >= 0) { + iz = ix / 2; // half index + } + else if (iy >= 0) { + iz = iy / 2; // half index + } + else iz = ix | iy; // -1 or V_DC. -1 takes precedence + list.a[i/2] = iz; // save to list + // check if additional zeroing is needed at current block size + if ((ix == -1 && iy >= 0) || (iy == -1 && ix >= 0)) { + fit_addz = true; + } + } + // replace -1 by V_DC if fit_addz + if (fit_addz) { + for (i = 0; i < N/2; i++) { + if (list.a[i] < 0) list.a[i] = V_DC; + } + } + return list; +} + + +/**************************************************************************************** +* +* Vector blend helper function templates +* +* These templates are for emulating a blend with a vector size that is not supported by +* the instruction set, using multiple blends or permutations of half the vector size +* +****************************************************************************************/ + +// Make dummy blend function templates to avoid error messages when the blend funtions are not yet defined +template void blend2(){} +template void blend4(){} +template void blend8(){} +template void blend16(){} +template void blend32(){} + +// blend_half_indexes: return an Indexlist for emulating a blend function as +// blends or permutations from multiple sources +// dozero = 0: let unused elements be don't care. Multiple permutation results must be blended +// dozero = 1: zero unused elements in each permuation. Multiple permutation results can be OR'ed +// dozero = 2: indexes that are -1 or V_DC are preserved +// src1, src2: sources to blend in a partial implementation +template +constexpr EList blend_half_indexes(int const (&a)[N]) { + // a is a reference to a constexpr array of permutation indexes + EList list = {{0}}; // list to return + int u = dozero ? -1 : V_DC; // value to use for unused entries + int j = 0; // loop counter + + for (j = 0; j < N; j++) { // loop through indexes + int ix = a[j]; // current index + if (ix < 0) { // zero or don't care + list.a[j] = (dozero == 2) ? ix : u; + } + else { + int src = ix / N; // source + if (src == src1) { + list.a[j] = ix & (N - 1); + } + else if (src == src2) { + list.a[j] = (ix & (N - 1)) + N; + } + else list.a[j] = u; + } + } + return list; +} + +// selectblend: select one of four sources for blending +template +static inline auto selectblend(W const a, W const b) { + if constexpr (s == 0) return a.get_low(); + else if constexpr (s == 1) return a.get_high(); + else if constexpr (s == 2) return b.get_low(); + else return b.get_high(); +} + +// blend_half: Emulate a blend with a vector size that is not supported +// by multiple blends with half the vector size. 
+// blend_half is called twice, to give the low and high half of the result +// Parameters: W: type of full-size vector +// i0...: indexes for low or high half +// a, b: full size input vectors +// return value: half-size vector for lower or upper part +template +auto blend_half(W const& a, W const& b) { + typedef decltype(a.get_low()) V; // type for half-size vector + constexpr int N = V::size(); // size of half-size vector + static_assert(sizeof...(i0) == N, "wrong number of indexes in blend_half"); + constexpr int ind[N] = { i0... }; // array of indexes + + // lambda to find which of the four possible sources are used + // return: EList containing a list of up to 4 sources. The last element is the number of sources used + auto listsources = [](int const n, int const (&ind)[N]) constexpr { + bool source_used[4] = { false,false,false,false }; // list of sources used + int i = 0; + for (i = 0; i < n; i++) { + int ix = ind[i]; // index + if (ix >= 0) { + int src = ix / n; // source used + source_used[src & 3] = true; + } + } + // return a list of sources used. The last element is the number of sources used + EList sources = {{0}}; + int nsrc = 0; // number of sources + for (i = 0; i < 4; i++) { + if (source_used[i]) { + sources.a[nsrc++] = i; + } + } + sources.a[4] = nsrc; + return sources; + }; + // list of sources used + constexpr EList sources = listsources(N, ind); + constexpr int nsrc = sources.a[4]; // number of sources used + + if constexpr (nsrc == 0) { // no sources + return V(0); + } + // get indexes for the first one or two sources + constexpr int uindex = (nsrc > 2) ? 1 : 2; // unused elements set to zero if two blends are combined + constexpr EList L = blend_half_indexes(ind); + V x0; + V src0 = selectblend(a, b); // first source + V src1 = selectblend(a, b); // second source + if constexpr (N == 2) { + x0 = blend2 (src0, src1); + } + else if constexpr (N == 4) { + x0 = blend4 (src0, src1); + } + else if constexpr (N == 8) { + x0 = blend8 (src0, src1); + } + else if constexpr (N == 16) { + x0 = blend16 (src0, src1); + } + else if constexpr (N == 32) { + x0 = blend32 (src0, src1); + } + if constexpr (nsrc > 2) { // get last one or two sources + constexpr EList M = blend_half_indexes(ind); + V x1; + V src2 = selectblend(a, b); // third source + V src3 = selectblend(a, b); // fourth source + if constexpr (N == 2) { + x1 = blend2 (src0, src1); + } + else if constexpr (N == 4) { + x1 = blend4 (src2, src3); + } + else if constexpr (N == 8) { + x1 = blend8 (src2, src3); + } + else if constexpr (N == 16) { + x1 = blend16 (src2, src3); + } + else if constexpr (N == 32) { + x1 = blend32 (src2, src3); + } + x0 |= x1; // combine result of two blends. Unused elements are zero + } + return x0; +} + + +#ifdef VCL_NAMESPACE +} +#endif + + +#endif // INSTRSET_H diff --git a/DFTTest/vectorclass/instrset_detect.cpp b/DFTTest/VCL2/instrset_detect.cpp similarity index 75% rename from DFTTest/vectorclass/instrset_detect.cpp rename to DFTTest/VCL2/instrset_detect.cpp index 600b9e1..119661b 100644 --- a/DFTTest/vectorclass/instrset_detect.cpp +++ b/DFTTest/VCL2/instrset_detect.cpp @@ -1,14 +1,15 @@ /************************** instrset_detect.cpp **************************** * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2017-05-02 -* Version: 1.28 -* Project: vector classes +* Last modified: 2019-08-01 +* Version: 2.00.00 +* Project: vector class library * Description: * Functions for checking which instruction sets are supported. 
* -* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses -\*****************************************************************************/ +* (c) Copyright 2012-2019 Agner Fog. +* Apache License version 2.0 or later. +******************************************************************************/ #include "instrset.h" @@ -16,53 +17,21 @@ namespace VCL_NAMESPACE { #endif -// Define interface to cpuid instruction. -// input: eax = functionnumber, ecx = 0 -// output: eax = output[0], ebx = output[1], ecx = output[2], edx = output[3] -static inline void cpuid (int output[4], int functionnumber) { -#if defined(__GNUC__) || defined(__clang__) // use inline assembly, Gnu/AT&T syntax - - int a, b, c, d; - __asm("cpuid" : "=a"(a),"=b"(b),"=c"(c),"=d"(d) : "a"(functionnumber),"c"(0) : ); - output[0] = a; - output[1] = b; - output[2] = c; - output[3] = d; - -#elif defined (_MSC_VER) || defined (__INTEL_COMPILER) // Microsoft or Intel compiler, intrin.h included - - __cpuidex(output, functionnumber, 0); // intrinsic function for CPUID - -#else // unknown platform. try inline assembly with masm/intel syntax - - __asm { - mov eax, functionnumber - xor ecx, ecx - cpuid; - mov esi, output - mov [esi], eax - mov [esi+4], ebx - mov [esi+8], ecx - mov [esi+12], edx - } - -#endif -} // Define interface to xgetbv instruction -static inline int64_t xgetbv (int ctr) { -#if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) // Microsoft or Intel compiler supporting _xgetbv intrinsic +static inline uint64_t xgetbv (int ctr) { +#if (defined (_MSC_FULL_VER) && _MSC_FULL_VER >= 160040000) || (defined (__INTEL_COMPILER) && __INTEL_COMPILER >= 1200) + // Microsoft or Intel compiler supporting _xgetbv intrinsic - return _xgetbv(ctr); // intrinsic function for XGETBV + return uint64_t(_xgetbv(ctr)); // intrinsic function for XGETBV -#elif defined(__GNUC__) // use inline assembly, Gnu/AT&T syntax +#elif defined(__GNUC__) || defined (__clang__) // use inline assembly, Gnu/AT&T syntax uint32_t a, d; __asm("xgetbv" : "=a"(a),"=d"(d) : "c"(ctr) : ); return a | (uint64_t(d) << 32); -#else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax - +#else // #elif defined (_WIN32) // other compiler. try inline assembly with masm/intel/MS syntax uint32_t a, d; __asm { mov ecx, ctr @@ -77,11 +46,10 @@ static inline int64_t xgetbv (int ctr) { #endif } - /* find supported instruction set return value: 0 = 80386 instruction set - 1 or above = SSE (XMM) supported by CPU (not testing for O.S. 
support) + 1 or above = SSE (XMM) supported by CPU (not testing for OS support) 2 or above = SSE2 3 or above = SSE3 4 or above = Supplementary SSE3 (SSSE3) @@ -90,8 +58,7 @@ static inline int64_t xgetbv (int ctr) { 7 or above = AVX supported by CPU and operating system 8 or above = AVX2 9 or above = AVX512F - 10 or above = AVX512VL - 11 or above = AVX512BW, AVX512DQ + 10 or above = AVX512VL, AVX512BW, AVX512DQ */ int instrset_detect(void) { @@ -131,12 +98,11 @@ int instrset_detect(void) { if ((abcd[1] & (1 << 16)) == 0) return iset; // no AVX512 cpuid(abcd, 0xD); // call cpuid leaf 0xD for feature flags if ((abcd[0] & 0x60) != 0x60) return iset; // no AVX512 - iset = 9; + iset = 9; cpuid(abcd, 7); // call cpuid leaf 7 for feature flags if ((abcd[1] & (1 << 31)) == 0) return iset; // no AVX512VL - iset = 10; if ((abcd[1] & 0x40020000) != 0x40020000) return iset; // no AVX512BW, AVX512DQ - iset = 11; + iset = 10; return iset; } @@ -180,6 +146,21 @@ bool hasAVX512ER(void) { return ((abcd[1] & (1 << 27)) != 0); // ebx bit 27 indicates AVX512ER } +// detect if CPU supports the AVX512VBMI instruction set +bool hasAVX512VBMI(void) { + if (instrset_detect() < 10) return false; // must have AVX512BW + int abcd[4]; // cpuid results + cpuid(abcd, 7); // call cpuid function 7 + return ((abcd[2] & (1 << 1)) != 0); // ecx bit 1 indicates AVX512VBMI +} + +// detect if CPU supports the AVX512VBMI2 instruction set +bool hasAVX512VBMI2(void) { + if (instrset_detect() < 10) return false; // must have AVX512BW + int abcd[4]; // cpuid results + cpuid(abcd, 7); // call cpuid function 7 + return ((abcd[2] & (1 << 6)) != 0); // ecx bit 6 indicates AVX512VBMI2 +} #ifdef VCL_NAMESPACE } diff --git a/DFTTest/VCL2/vector_convert.h b/DFTTest/VCL2/vector_convert.h new file mode 100644 index 0000000..a7aa2b6 --- /dev/null +++ b/DFTTest/VCL2/vector_convert.h @@ -0,0 +1,574 @@ +/************************** vector_convert.h ******************************* +* Author: Agner Fog +* Date created: 2014-07-23 +* Last modified: 2019-11-17 +* Version: 2.01.00 +* Project: vector class library +* Description: +* Header file for conversion between different vector classes with different +* sizes. Also includes verious generic template functions. +* +* (c) Copyright 2012-2019 Agner Fog. +* Apache License version 2.0 or later. +*****************************************************************************/ + +#ifndef VECTOR_CONVERT_H +#define VECTOR_CONVERT_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + +#if MAX_VECTOR_SIZE >= 256 + +/***************************************************************************** +* +* Extend from 128 to 256 bit vectors +* +*****************************************************************************/ + +#if INSTRSET >= 8 // AVX2. 
256 bit integer vectors + +// sign extend +static inline Vec16s extend (Vec16c const a) { + return _mm256_cvtepi8_epi16(a); +} + +// zero extend +static inline Vec16us extend (Vec16uc const a) { + return _mm256_cvtepu8_epi16(a); +} + +// sign extend +static inline Vec8i extend (Vec8s const a) { + return _mm256_cvtepi16_epi32(a); +} + +// zero extend +static inline Vec8ui extend (Vec8us const a) { + return _mm256_cvtepu16_epi32(a); +} + +// sign extend +static inline Vec4q extend (Vec4i const a) { + return _mm256_cvtepi32_epi64(a); +} + +// zero extend +static inline Vec4uq extend (Vec4ui const a) { + return _mm256_cvtepu32_epi64(a); +} + + +#else // no AVX2. 256 bit integer vectors are emulated + +// sign extend and zero extend functions: +static inline Vec16s extend (Vec16c const a) { + return Vec16s(extend_low(a), extend_high(a)); +} + +static inline Vec16us extend (Vec16uc const a) { + return Vec16us(extend_low(a), extend_high(a)); +} + +static inline Vec8i extend (Vec8s const a) { + return Vec8i(extend_low(a), extend_high(a)); +} + +static inline Vec8ui extend (Vec8us const a) { + return Vec8ui(extend_low(a), extend_high(a)); +} + +static inline Vec4q extend (Vec4i const a) { + return Vec4q(extend_low(a), extend_high(a)); +} + +static inline Vec4uq extend (Vec4ui const a) { + return Vec4uq(extend_low(a), extend_high(a)); +} + +#endif // AVX2 + +/***************************************************************************** +* +* Conversions between float and double +* +*****************************************************************************/ +#if INSTRSET >= 7 // AVX. 256 bit float vectors + +// float to double +static inline Vec4d to_double (Vec4f const a) { + return _mm256_cvtps_pd(a); +} + +// double to float +static inline Vec4f to_float (Vec4d const a) { + return _mm256_cvtpd_ps(a); +} + +#else // no AVX2. 256 bit float vectors are emulated + +// float to double +static inline Vec4d to_double (Vec4f const a) { + Vec2d lo = _mm_cvtps_pd(a); + Vec2d hi = _mm_cvtps_pd(_mm_movehl_ps(a, a)); + return Vec4d(lo,hi); +} + +// double to float +static inline Vec4f to_float (Vec4d const a) { + Vec4f lo = _mm_cvtpd_ps(a.get_low()); + Vec4f hi = _mm_cvtpd_ps(a.get_high()); + return _mm_movelh_ps(lo, hi); +} + +#endif + +/***************************************************************************** +* +* Reduce from 256 to 128 bit vectors +* +*****************************************************************************/ +#if INSTRSET >= 10 // AVX512VL + +// compress functions. overflow wraps around +static inline Vec16c compress (Vec16s const a) { + return _mm256_cvtepi16_epi8(a); +} + +static inline Vec16uc compress (Vec16us const a) { + return _mm256_cvtepi16_epi8(a); +} + +static inline Vec8s compress (Vec8i const a) { + return _mm256_cvtepi32_epi16(a); +} + +static inline Vec8us compress (Vec8ui const a) { + return _mm256_cvtepi32_epi16(a); +} + +static inline Vec4i compress (Vec4q const a) { + return _mm256_cvtepi64_epi32(a); +} + +static inline Vec4ui compress (Vec4uq const a) { + return _mm256_cvtepi64_epi32(a); +} + +#else // no AVX512 + +// compress functions. 
overflow wraps around +static inline Vec16c compress (Vec16s const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec16uc compress (Vec16us const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec8s compress (Vec8i const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec8us compress (Vec8ui const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec4i compress (Vec4q const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec4ui compress (Vec4uq const a) { + return compress(a.get_low(), a.get_high()); +} + +#endif // AVX512 + +#endif // MAX_VECTOR_SIZE >= 256 + + +#if MAX_VECTOR_SIZE >= 512 + +/***************************************************************************** +* +* Extend from 256 to 512 bit vectors +* +*****************************************************************************/ + +#if INSTRSET >= 9 // AVX512. 512 bit integer vectors + +// sign extend +static inline Vec32s extend (Vec32c const a) { +#if INSTRSET >= 10 + return _mm512_cvtepi8_epi16(a); +#else + return Vec32s(extend_low(a), extend_high(a)); +#endif +} + +// zero extend +static inline Vec32us extend (Vec32uc const a) { +#if INSTRSET >= 10 + return _mm512_cvtepu8_epi16(a); +#else + return Vec32us(extend_low(a), extend_high(a)); +#endif +} + +// sign extend +static inline Vec16i extend (Vec16s const a) { + return _mm512_cvtepi16_epi32(a); +} + +// zero extend +static inline Vec16ui extend (Vec16us const a) { + return _mm512_cvtepu16_epi32(a); +} + +// sign extend +static inline Vec8q extend (Vec8i const a) { + return _mm512_cvtepi32_epi64(a); +} + +// zero extend +static inline Vec8uq extend (Vec8ui const a) { + return _mm512_cvtepu32_epi64(a); +} + +#else // no AVX512. 512 bit vectors are emulated + + + +// sign extend +static inline Vec32s extend (Vec32c const a) { + return Vec32s(extend_low(a), extend_high(a)); +} + +// zero extend +static inline Vec32us extend (Vec32uc const a) { + return Vec32us(extend_low(a), extend_high(a)); +} + +// sign extend +static inline Vec16i extend (Vec16s const a) { + return Vec16i(extend_low(a), extend_high(a)); +} + +// zero extend +static inline Vec16ui extend (Vec16us const a) { + return Vec16ui(extend_low(a), extend_high(a)); +} + +// sign extend +static inline Vec8q extend (Vec8i const a) { + return Vec8q(extend_low(a), extend_high(a)); +} + +// zero extend +static inline Vec8uq extend (Vec8ui const a) { + return Vec8uq(extend_low(a), extend_high(a)); +} + +#endif // AVX512 + + +/***************************************************************************** +* +* Reduce from 512 to 256 bit vectors +* +*****************************************************************************/ +#if INSTRSET >= 9 // AVX512F + +// compress functions. overflow wraps around +static inline Vec32c compress (Vec32s const a) { +#if INSTRSET >= 10 // AVVX512BW + return _mm512_cvtepi16_epi8(a); +#else + return compress(a.get_low(), a.get_high()); +#endif +} + +static inline Vec32uc compress (Vec32us const a) { + return Vec32uc(compress(Vec32s(a))); +} + +static inline Vec16s compress (Vec16i const a) { + return _mm512_cvtepi32_epi16(a); +} + +static inline Vec16us compress (Vec16ui const a) { + return _mm512_cvtepi32_epi16(a); +} + +static inline Vec8i compress (Vec8q const a) { + return _mm512_cvtepi64_epi32(a); +} + +static inline Vec8ui compress (Vec8uq const a) { + return _mm512_cvtepi64_epi32(a); +} + +#else // no AVX512 + +// compress functions. 
overflow wraps around +static inline Vec32c compress (Vec32s const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec32uc compress (Vec32us const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec16s compress (Vec16i const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec16us compress (Vec16ui const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec8i compress (Vec8q const a) { + return compress(a.get_low(), a.get_high()); +} + +static inline Vec8ui compress (Vec8uq const a) { + return compress(a.get_low(), a.get_high()); +} + +#endif // AVX512 + +/***************************************************************************** +* +* Conversions between float and double +* +*****************************************************************************/ + +#if INSTRSET >= 9 // AVX512. 512 bit float vectors + +// float to double +static inline Vec8d to_double (Vec8f const a) { + return _mm512_cvtps_pd(a); +} + +// double to float +static inline Vec8f to_float (Vec8d const a) { + return _mm512_cvtpd_ps(a); +} + +#else // no AVX512. 512 bit float vectors are emulated + +// float to double +static inline Vec8d to_double (Vec8f const a) { + Vec4d lo = to_double(a.get_low()); + Vec4d hi = to_double(a.get_high()); + return Vec8d(lo,hi); +} + +// double to float +static inline Vec8f to_float (Vec8d const a) { + Vec4f lo = to_float(a.get_low()); + Vec4f hi = to_float(a.get_high()); + return Vec8f(lo, hi); +} + +#endif + +#endif // MAX_VECTOR_SIZE >= 512 + +// double to float +static inline Vec4f to_float (Vec2d const a) { + return _mm_cvtpd_ps(a); +} + + +/***************************************************************************** +* +* Generic template functions +* +* These templates define functions for multiple vector types in one template +* +*****************************************************************************/ + +// horizontal min/max of vector elements +// implemented with universal template, works for all vector types: + +template auto horizontal_min(T const x) { + if constexpr ((T::elementtype() & 16) != 0) { + // T is a float or double vector + if (horizontal_or(is_nan(x))) { + // check for NAN because min does not guarantee NAN propagation + return x[horizontal_find_first(is_nan(x))]; + } + } + return horizontal_min1(x); +} + +template auto horizontal_min1(T const x) { + if constexpr (T::elementtype() <= 3) { // boolean vector type + return horizontal_and(x); + } + else if constexpr (sizeof(T) >= 32) { + // split recursively into smaller vectors + return horizontal_min1(min(x.get_low(), x.get_high())); + } + else if constexpr (T::size() == 2) { + T a = permute2 <1, V_DC>(x); // high half + T b = min(a, x); + return b[0]; + } + else if constexpr (T::size() == 4) { + T a = permute4<2, 3, V_DC, V_DC>(x); // high half + T b = min(a, x); + a = permute4<1, V_DC, V_DC, V_DC>(b); + b = min(a, b); + return b[0]; + } + else if constexpr (T::size() == 8) { + T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half + T b = min(a, x); + a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = min(a, b); + a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = min(a, b); + return b[0]; + } + else { + static_assert(T::size() == 16); // no other size is allowed + T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half + T b = min(a, x); + a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, 
V_DC, V_DC, V_DC>(b); + b = min(a, b); + a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = min(a, b); + a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = min(a, b); + return b[0]; + } +} + +template auto horizontal_max(T const x) { + if constexpr ((T::elementtype() & 16) != 0) { + // T is a float or double vector + if (horizontal_or(is_nan(x))) { + // check for NAN because max does not guarantee NAN propagation + return x[horizontal_find_first(is_nan(x))]; + } + } + return horizontal_max1(x); +} + +template auto horizontal_max1(T const x) { + if constexpr (T::elementtype() <= 3) { // boolean vector type + return horizontal_or(x); + } + else if constexpr (sizeof(T) >= 32) { + // split recursively into smaller vectors + return horizontal_max1(max(x.get_low(), x.get_high())); + } + else if constexpr (T::size() == 2) { + T a = permute2 <1, V_DC>(x); // high half + T b = max(a, x); + return b[0]; + } + else if constexpr (T::size() == 4) { + T a = permute4<2, 3, V_DC, V_DC>(x); // high half + T b = max(a, x); + a = permute4<1, V_DC, V_DC, V_DC>(b); + b = max(a, b); + return b[0]; + } + else if constexpr (T::size() == 8) { + T a = permute8<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC>(x); // high half + T b = max(a, x); + a = permute8<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = max(a, b); + a = permute8<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = max(a, b); + return b[0]; + } + else { + static_assert(T::size() == 16); // no other size is allowed + T a = permute16<8, 9, 10, 11, 12, 13, 14, 15, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC >(x); // high half + T b = max(a, x); + a = permute16<4, 5, 6, 7, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = max(a, b); + a = permute16<2, 3, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = max(a, b); + a = permute16<1, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC, V_DC>(b); + b = max(a, b); + return b[0]; + } +} + +// Find first element that is true in a boolean vector +template +static inline int horizontal_find_first(V const x) { + static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected"); + auto bits = to_bits(x); // convert to bits + if (bits == 0) return -1; + if constexpr (V::size() < 32) { + return bit_scan_forward((uint32_t)bits); + } + else { + return bit_scan_forward(bits); + } +} + +// Count the number of elements that are true in a boolean vector +template +static inline int horizontal_count(V const x) { + static_assert(V::elementtype() == 2 || V::elementtype() == 3, "Boolean vector expected"); + auto bits = to_bits(x); // convert to bits + if constexpr (V::size() < 32) { + return vml_popcnt((uint32_t)bits); + } + else { + return (int)vml_popcnt(bits); + } +} + +// maximum and minimum functions. 
This version is sure to propagate NANs, +// conforming to the new IEEE-754 2019 standard +template +static inline V maximum(V const a, V const b) { + if constexpr (V::elementtype() < 16) { + return max(a, b); // integer type + } + else { // float or double vector + V y = select(is_nan(a), a, max(a, b)); +#ifdef SIGNED_ZERO // pedantic about signed zero + y = select(a == b, a & b, y); // maximum(+0, -0) = +0 +#endif + return y; + } +} + +template +static inline V minimum(V const a, V const b) { + if constexpr (V::elementtype() < 16) { + return min(a, b); // integer type + } + else { // float or double vector + V y = select(is_nan(a), a, min(a, b)); +#ifdef SIGNED_ZERO // pedantic about signed zero + y = select(a == b, a | b, y); // minimum(+0, -0) = -0 +#endif + return y; + } +} + + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTOR_CONVERT_H diff --git a/DFTTest/VCL2/vectorclass.h b/DFTTest/VCL2/vectorclass.h new file mode 100644 index 0000000..b105711 --- /dev/null +++ b/DFTTest/VCL2/vectorclass.h @@ -0,0 +1,86 @@ +/**************************** vectorclass.h ******************************** +* Author: Agner Fog +* Date created: 2012-05-30 +* Last modified: 2020-04-11 +* Version: 2.01.02 +* Project: vector class library +* Home: https://github.com/vectorclass +* Description: +* Header file defining vector classes as interface to intrinsic functions +* in x86 and x86-64 microprocessors with SSE2 and later instruction sets. +* +* Instructions: +* Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired +* instruction set, which must be at least SSE2. Specify the supported +* instruction set by a command line define, e.g. __SSE4_1__ if the +* compiler does not automatically do so. +* For detailed instructions, see vcl_manual.pdf +* +* Each vector object is represented internally in the CPU as a vector +* register with 128, 256 or 512 bits. +* +* This header file includes the appropriate header files depending on the +* selected instruction set. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +******************************************************************************/ +#ifndef VECTORCLASS_H +#define VECTORCLASS_H 20102 + +// Maximum vector size, bits. 
Allowed values are 128, 256, 512 +#ifndef MAX_VECTOR_SIZE +#define MAX_VECTOR_SIZE 512 +#endif + +// Determine instruction set, and define platform-dependent functions +#include "instrset.h" // Select supported instruction set + +#if INSTRSET < 2 // instruction set SSE2 is the minimum +#error Please compile for the SSE2 instruction set or higher +#else + +// Select appropriate .h files depending on instruction set +#include "vectori128.h" // 128-bit integer vectors +#include "vectorf128.h" // 128-bit floating point vectors + +#if MAX_VECTOR_SIZE >= 256 +#if INSTRSET >= 8 +#include "vectori256.h" // 256-bit integer vectors, requires AVX2 instruction set +#else +#include "vectori256e.h" // 256-bit integer vectors, emulated +#endif // INSTRSET >= 8 +#if INSTRSET >= 7 +#include "vectorf256.h" // 256-bit floating point vectors, requires AVX instruction set +#else +#include "vectorf256e.h" // 256-bit floating point vectors, emulated +#endif // INSTRSET >= 7 +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +#if INSTRSET >= 9 +#include "vectori512.h" // 512-bit vectors of 32 and 64 bit integers, requires AVX512F instruction set +#include "vectorf512.h" // 512-bit floating point vectors, requires AVX512F instruction set +#else +#include "vectori512e.h" // 512-bit integer vectors, emulated +#include "vectorf512e.h" // 512-bit floating point vectors, emulated +#endif // INSTRSET >= 9 +#if INSTRSET >= 10 +#include "vectori512s.h" // 512-bit vectors of 8 and 16 bit integers, requires AVX512BW instruction set +#else +#include "vectori512se.h" // 512-bit vectors of 8 and 16 bit integers, emulated +#endif +#endif // MAX_VECTOR_SIZE >= 512 + +#include "vector_convert.h" // conversion between different vector sizes + +#endif // INSTRSET >= 2 + + +#else // VECTORCLASS_H + +#if VECTORCLASS_H < 20000 +#error Mixed versions of vector class library +#endif + +#endif // VECTORCLASS_H diff --git a/DFTTest/VCL2/vectorf128.h b/DFTTest/VCL2/vectorf128.h new file mode 100644 index 0000000..042af7d --- /dev/null +++ b/DFTTest/VCL2/vectorf128.h @@ -0,0 +1,2993 @@ +/**************************** vectorf128.h ******************************* +* Author: Agner Fog +* Date created: 2012-05-30 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library +* Description: +* Header file defining 128-bit floating point vector classes +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec4f Vector of 4 single precision floating point numbers +* Vec4fb Vector of 4 Booleans for use with Vec4f +* Vec2d Vector of 2 double precision floating point numbers +* Vec2db Vector of 2 Booleans for use with Vec2d +* +* Each vector object is represented internally in the CPU as a 128-bit register. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +*****************************************************************************/ + +#ifndef VECTORF128_H +#define VECTORF128_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + +/***************************************************************************** +* +* select functions +* +*****************************************************************************/ +// Select between two __m128 sources, element by element, with broad boolean vector. 
+// Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; +// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). +// No other values are allowed for broad boolean vectors. +// The implementation depends on the instruction set: +// If SSE4.1 is supported then only bit 31 in each dword of s is checked, +// otherwise all bits in s are used. +static inline __m128 selectf(__m128 const s, __m128 const a, __m128 const b) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_blendv_ps(b, a, s); +#else + return _mm_or_ps( + _mm_and_ps(s, a), + _mm_andnot_ps(s, b)); +#endif +} + +// Same, with two __m128d sources. +// and operators. Corresponds to this pseudocode: +// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; +// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other +// No other values are allowed for broad boolean vectors. +// The implementation depends on the instruction set: +// If SSE4.1 is supported then only bit 63 in each dword of s is checked, +// otherwise all bits in s are used. +static inline __m128d selectd(__m128d const s, __m128d const a, __m128d const b) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_blendv_pd(b, a, s); +#else + return _mm_or_pd( + _mm_and_pd(s, a), + _mm_andnot_pd(s, b)); +#endif +} + + +/***************************************************************************** +* +* Vec4fb: Vector of 4 Booleans for use with Vec4f +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +class Vec4fb { +protected: + __m128 xmm; // Float vector +public: + // Default constructor: + Vec4fb() { + } + // Constructor to build from all elements: + Vec4fb(bool b0, bool b1, bool b2, bool b3) { + xmm = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3)); + } + // Constructor to convert from type __m128 used in intrinsics: + Vec4fb(__m128 const x) { + xmm = x; + } + // Assignment operator to convert from type __m128 used in intrinsics: + Vec4fb & operator = (__m128 const x) { + xmm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec4fb(bool b) { + xmm = _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b))); + } + // Assignment operator to broadcast scalar value: + Vec4fb & operator = (bool b) { + *this = Vec4fb(b); + return *this; + } + // Constructor to convert from type Vec4ib used as Boolean for integer vectors + Vec4fb(Vec4ib const x) { + xmm = _mm_castsi128_ps(x); + } + // Assignment operator to convert from type Vec4ib used as Boolean for integer vectors + Vec4fb & operator = (Vec4ib const x) { + xmm = _mm_castsi128_ps(x); + return *this; + } + // Type cast operator to convert to __m128 used in intrinsics + operator __m128() const { + return xmm; + } + /* Clang problem: + The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128f as identical. + I have reported this problem in 2013 but it is still not fixed in 2019! + See the bug report at https://bugs.llvm.org/show_bug.cgi?id=17164 + Additional problem: The version number is not consistent across platforms. The Apple build has + different version numbers. We have to rely on __apple_build_version__ on the Mac platform: + http://llvm.org/bugs/show_bug.cgi?id=12643 + I have received reports that there was no aliasing of vector types on __apple_build_version__ = 6020053 + but apparently the problem has come back. 
The aliasing of vector types has been reported on + __apple_build_version__ = 8000042 + We have to make switches here when - hopefully - the error some day has been fixed. + We need different version checks with and whithout __apple_build_version__ + */ +#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY + // Type cast operator to convert to type Vec4ib used as Boolean for integer vectors + operator Vec4ib() const { + return _mm_castps_si128(xmm); + } +#endif + // Member function to change a single element in vector + Vec4fb const insert(int index, bool value) { + const int32_t maskl[8] = { 0,0,0,0,-1,0,0,0 }; + __m128 mask = _mm_loadu_ps((float const*)(maskl + 4 - (index & 3))); // mask with FFFFFFFF at index position + if (value) { + xmm = _mm_or_ps(xmm, mask); + } + else { + xmm = _mm_andnot_ps(mask, xmm); + } + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec4ib(_mm_castps_si128(xmm)).extract(index); + } + // Extract a single element. Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + + // Member function to change a bitfield to a boolean vector + Vec4fb & load_bits(uint8_t a) { + Vec4ib b; b.load_bits(a); + xmm = _mm_castsi128_ps(b); + return *this; + } + static constexpr int size() { + return 4; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec4fb(int b) = delete; + Vec4fb & operator = (int x) = delete; +}; + +#else + +typedef Vec4b Vec4fb; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Operators for Vec4fb +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec4fb operator & (Vec4fb const a, Vec4fb const b) { + return _mm_and_ps(a, b); +} +static inline Vec4fb operator && (Vec4fb const a, Vec4fb const b) { + return a & b; +} + +// vector operator &= : bitwise and +static inline Vec4fb & operator &= (Vec4fb & a, Vec4fb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec4fb operator | (Vec4fb const a, Vec4fb const b) { + return _mm_or_ps(a, b); +} +static inline Vec4fb operator || (Vec4fb const a, Vec4fb const b) { + return a | b; +} + +// vector operator |= : bitwise or +static inline Vec4fb & operator |= (Vec4fb & a, Vec4fb const b) { + a = a | b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec4fb operator ~ (Vec4fb const a) { + return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1))); +} + +// vector operator ^ : bitwise xor +static inline Vec4fb operator ^ (Vec4fb const a, Vec4fb const b) { + return _mm_xor_ps(a, b); +} + +// vector operator == : xnor +static inline Vec4fb operator == (Vec4fb const a, Vec4fb const b) { + return Vec4fb(a ^ Vec4fb(~b)); +} + +// vector operator != : xor +static inline Vec4fb operator != (Vec4fb const a, Vec4fb const b) { + return Vec4fb(a ^ b); +} + +// vector operator ^= : bitwise xor +static inline Vec4fb & operator ^= (Vec4fb & a, Vec4fb const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not +// (operator ! is less efficient than operator ~. Use only where not all bits in an element are the same) +static inline Vec4fb operator ! 
(Vec4fb const a) { + return Vec4fb(!Vec4ib(a)); +} + +// Functions for Vec4fb + +// andnot: a & ~ b +static inline Vec4fb andnot(Vec4fb const a, Vec4fb const b) { + return _mm_andnot_ps(b, a); +} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and(Vec4fb const a) { + return _mm_movemask_ps(a) == 0x0F; + //return horizontal_and(Vec128b(_mm_castps_si128(a))); +} + +// horizontal_or. Returns true if at least one bit is 1 +static inline bool horizontal_or(Vec4fb const a) { + return _mm_movemask_ps(a) != 0; + //return horizontal_or(Vec128b(_mm_castps_si128(a))); +} + +#endif + + +/***************************************************************************** +* +* Vec2db: Vector of 2 Booleans for use with Vec2d +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +class Vec2db { +protected: + __m128d xmm; // Double vector +public: + // Default constructor: + Vec2db() { + } + // Constructor to broadcast scalar value: + Vec2db(bool b) { + xmm = _mm_castsi128_pd(_mm_set1_epi32(-int32_t(b))); + } + // Constructor to build from all elements: + Vec2db(bool b0, bool b1) { + xmm = _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); + } + // Constructor to convert from type __m128d used in intrinsics: + Vec2db(__m128d const x) { + xmm = x; + } + // Assignment operator to convert from type __m128d used in intrinsics: + Vec2db & operator = (__m128d const x) { + xmm = x; + return *this; + } + // Assignment operator to broadcast scalar value: + Vec2db & operator = (bool b) { + *this = Vec2db(b); + return *this; + } + // Constructor to convert from type Vec2qb used as Boolean for integer vectors + Vec2db(Vec2qb const x) { + xmm = _mm_castsi128_pd(x); + } + // Assignment operator to convert from type Vec2qb used as Boolean for integer vectors + Vec2db & operator = (Vec2qb const x) { + xmm = _mm_castsi128_pd(x); + return *this; + } + // Type cast operator to convert to __m128d used in intrinsics + operator __m128d() const { + return xmm; + } +#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY + // Type cast operator to convert to type Vec2qb used as Boolean for integer vectors + operator Vec2qb() const { + return _mm_castpd_si128(xmm); + } +#endif + // Member function to change a single element in vector + Vec2db const insert(int index, bool value) { + const int32_t maskl[8] = { 0,0,0,0,-1,-1,0,0 }; + __m128 mask = _mm_loadu_ps((float const*)(maskl + 4 - (index & 1) * 2)); // mask with FFFFFFFFFFFFFFFF at index position + if (value) { + xmm = _mm_or_pd(xmm, _mm_castps_pd(mask)); + } + else { + xmm = _mm_andnot_pd(_mm_castps_pd(mask), xmm); + } + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec2qb(_mm_castpd_si128(xmm)).extract(index); + } + // Extract a single element. Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec2db & load_bits(uint8_t a) { + Vec2qb b; b.load_bits(a); + xmm = _mm_castsi128_pd(b); + return *this; + } + static constexpr int size() { + return 2; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
+ Vec2db(int b) = delete; + Vec2db & operator = (int x) = delete; +}; + +#else + +typedef Vec2b Vec2db; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Operators for Vec2db +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec2db operator & (Vec2db const a, Vec2db const b) { + return _mm_and_pd(a, b); +} +static inline Vec2db operator && (Vec2db const a, Vec2db const b) { + return a & b; +} + +// vector operator &= : bitwise and +static inline Vec2db & operator &= (Vec2db & a, Vec2db const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec2db operator | (Vec2db const a, Vec2db const b) { + return _mm_or_pd(a, b); +} +static inline Vec2db operator || (Vec2db const a, Vec2db const b) { + return a | b; +} + +// vector operator |= : bitwise or +static inline Vec2db & operator |= (Vec2db & a, Vec2db const b) { + a = a | b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec2db operator ~ (Vec2db const a) { + return _mm_xor_pd(a, _mm_castsi128_pd(_mm_set1_epi32(-1))); +} + +// vector operator ^ : bitwise xor +static inline Vec2db operator ^ (Vec2db const a, Vec2db const b) { + return _mm_xor_pd(a, b); +} + +// vector operator == : xnor +static inline Vec2db operator == (Vec2db const a, Vec2db const b) { + return Vec2db(a ^ Vec2db(~b)); +} + +// vector operator != : xor +static inline Vec2db operator != (Vec2db const a, Vec2db const b) { + return Vec2db(a ^ b); +} + +// vector operator ^= : bitwise xor +static inline Vec2db & operator ^= (Vec2db & a, Vec2db const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not +// (operator ! is less efficient than operator ~. Use only where not all bits in an element are the same) +static inline Vec2db operator ! (Vec2db const a) { + return Vec2db(!Vec2qb(a)); +} + +// Functions for Vec2db + +// andnot: a & ~ b +static inline Vec2db andnot(Vec2db const a, Vec2db const b) { + return _mm_andnot_pd(b, a); +} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and(Vec2db const a) { + return _mm_movemask_pd(a) == 3; + //return horizontal_and(Vec128b(_mm_castpd_si128(a))); +} + +// horizontal_or. 
Returns true if at least one bit is 1 +static inline bool horizontal_or(Vec2db const a) { + return _mm_movemask_pd(a) != 0; + //return horizontal_or(Vec128b(_mm_castpd_si128(a))); +} + +#endif + + +/***************************************************************************** +* +* Vec4f: Vector of 4 single precision floating point values +* +*****************************************************************************/ + +class Vec4f { +protected: + __m128 xmm; // Float vector +public: + // Default constructor: + Vec4f() { + } + // Constructor to broadcast the same value into all elements: + Vec4f(float f) { + xmm = _mm_set1_ps(f); + } + // Constructor to build from all elements: + Vec4f(float f0, float f1, float f2, float f3) { + xmm = _mm_setr_ps(f0, f1, f2, f3); + } + // Constructor to convert from type __m128 used in intrinsics: + Vec4f(__m128 const x) { + xmm = x; + } + // Assignment operator to convert from type __m128 used in intrinsics: + Vec4f & operator = (__m128 const x) { + xmm = x; + return *this; + } + // Type cast operator to convert to __m128 used in intrinsics + operator __m128() const { + return xmm; + } + // Member function to load from array (unaligned) + Vec4f & load(float const * p) { + xmm = _mm_loadu_ps(p); + return *this; + } + // Member function to load from array, aligned by 16 + // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1, + // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. + // You may use load_a instead of load if you are certain that p points to an address + // divisible by 16. + Vec4f & load_a(float const * p) { + xmm = _mm_load_ps(p); + return *this; + } + // Member function to store into array (unaligned) + void store(float * p) const { + _mm_storeu_ps(p, xmm); + } + // Member function storing into array, aligned by 16 + // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1, + // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. + // You may use store_a instead of store if you are certain that p points to an address + // divisible by 16. + void store_a(float * p) const { + _mm_store_ps(p, xmm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 16 + void store_nt(float * p) const { + _mm_stream_ps(p, xmm); + } + // Partial load. Load n elements and set the rest to 0 + Vec4f & load_partial(int n, float const * p) { +#if INSTRSET >= 10 // AVX512VL + xmm = _mm_maskz_loadu_ps(__mmask8((1u << n) - 1), p); +#else + __m128 t1, t2; + switch (n) { + case 1: + xmm = _mm_load_ss(p); break; + case 2: + xmm = _mm_castpd_ps(_mm_load_sd((double const*)p)); break; + case 3: + t1 = _mm_castpd_ps(_mm_load_sd((double const*)p)); + t2 = _mm_load_ss(p + 2); + xmm = _mm_movelh_ps(t1, t2); break; + case 4: + load(p); break; + default: + xmm = _mm_setzero_ps(); + } +#endif + return *this; + } + // Partial store. 
Store n elements + void store_partial(int n, float * p) const { +#if INSTRSET >= 10 // AVX512VL + _mm_mask_storeu_ps(p, __mmask8((1u << n) - 1), xmm); +#else + __m128 t1; + switch (n) { + case 1: + _mm_store_ss(p, xmm); break; + case 2: + _mm_store_sd((double*)p, _mm_castps_pd(xmm)); break; + case 3: + _mm_store_sd((double*)p, _mm_castps_pd(xmm)); + t1 = _mm_movehl_ps(xmm, xmm); + _mm_store_ss(p + 2, t1); break; + case 4: + store(p); break; + default:; + } +#endif + } + // cut off vector to n elements. The last 4-n elements are set to zero + Vec4f & cutoff(int n) { +#if INSTRSET >= 10 + xmm = _mm_maskz_mov_ps(__mmask8((1u << n) - 1), xmm); +#else + if (uint32_t(n) >= 4) return *this; + const union { + int32_t i[8]; + float f[8]; + } mask = { {1,-1,-1,-1,0,0,0,0} }; + xmm = _mm_and_ps(xmm, Vec4f().load(mask.f + 4 - n)); +#endif + return *this; + } + // Member function to change a single element in vector + Vec4f const insert(int index, float value) { +#if INSTRSET >= 10 // AVX512VL + xmm = _mm_mask_broadcastss_ps(xmm, __mmask8(1u << index), _mm_set_ss(value)); +#elif INSTRSET >= 5 // SSE4.1 + switch (index & 3) { + case 0: + xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 0 << 4); break; + case 1: + xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 1 << 4); break; + case 2: + xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 2 << 4); break; + default: + xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 3 << 4); break; + } +#else + const int32_t maskl[8] = { 0,0,0,0,-1,0,0,0 }; + __m128 broad = _mm_set1_ps(value); // broadcast value into all elements + __m128 mask = _mm_loadu_ps((float const*)(maskl + 4 - (index & 3))); // mask with FFFFFFFF at index position + xmm = selectf(mask, broad, xmm); +#endif + return *this; + } + // Member function extract a single element from vector + float extract(int index) const { +#if INSTRSET >= 10 + __m128 x = _mm_maskz_compress_ps(__mmask8(1u << index), xmm); + return _mm_cvtss_f32(x); +#else + float x[4]; + store(x); + return x[index & 3]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ float operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 4; + } + static constexpr int elementtype() { + return 16; + } + typedef __m128 registertype; +}; + + +/***************************************************************************** +* +* Operators for Vec4f +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec4f operator + (Vec4f const a, Vec4f const b) { + return _mm_add_ps(a, b); +} + +// vector operator + : add vector and scalar +static inline Vec4f operator + (Vec4f const a, float b) { + return a + Vec4f(b); +} +static inline Vec4f operator + (float a, Vec4f const b) { + return Vec4f(a) + b; +} + +// vector operator += : add +static inline Vec4f & operator += (Vec4f & a, Vec4f const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec4f operator ++ (Vec4f & a, int) { + Vec4f a0 = a; + a = a + 1.0f; + return a0; +} + +// prefix operator ++ +static inline Vec4f & operator ++ (Vec4f & a) { + a = a + 1.0f; + return a; +} + +// vector operator - : subtract element by element +static inline Vec4f operator - (Vec4f const a, Vec4f const b) { + return _mm_sub_ps(a, b); +} + +// vector operator - : subtract vector and scalar +static inline Vec4f operator - (Vec4f const a, float b) { + return a - Vec4f(b); +} +static inline Vec4f operator - (float a, Vec4f const b) { + return Vec4f(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec4f operator - (Vec4f const a) { + return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); +} + +// vector operator -= : subtract +static inline Vec4f & operator -= (Vec4f & a, Vec4f const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec4f operator -- (Vec4f & a, int) { + Vec4f a0 = a; + a = a - 1.0f; + return a0; +} + +// prefix operator -- +static inline Vec4f & operator -- (Vec4f & a) { + a = a - 1.0f; + return a; +} + +// vector operator * : multiply element by element +static inline Vec4f operator * (Vec4f const a, Vec4f const b) { + return _mm_mul_ps(a, b); +} + +// vector operator * : multiply vector and scalar +static inline Vec4f operator * (Vec4f const a, float b) { + return a * Vec4f(b); +} +static inline Vec4f operator * (float a, Vec4f const b) { + return Vec4f(a) * b; +} + +// vector operator *= : multiply +static inline Vec4f & operator *= (Vec4f & a, Vec4f const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec4f operator / (Vec4f const a, Vec4f const b) { + return _mm_div_ps(a, b); +} + +// vector operator / : divide vector and scalar +static inline Vec4f operator / (Vec4f const a, float b) { + return a / Vec4f(b); +} +static inline Vec4f operator / (float a, Vec4f const b) { + return Vec4f(a) / b; +} + +// vector operator /= : divide +static inline Vec4f & operator /= (Vec4f & a, Vec4f const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec4fb operator == (Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_ps_mask(a, b, 0); +#else + return _mm_cmpeq_ps(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec4fb operator != (Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_ps_mask(a, b, 4); +#else + 
return _mm_cmpneq_ps(a, b); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec4fb operator < (Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_ps_mask(a, b, 1); +#else + return _mm_cmplt_ps(a, b); +#endif +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec4fb operator <= (Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_ps_mask(a, b, 2); +#else + return _mm_cmple_ps(a, b); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec4fb operator > (Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_ps_mask(a, b, 6); +#else + return b < a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec4fb operator >= (Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_ps_mask(a, b, 5); +#else + return b <= a; +#endif +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec4f operator & (Vec4f const a, Vec4f const b) { + return _mm_and_ps(a, b); +} + +// vector operator &= : bitwise and +static inline Vec4f & operator &= (Vec4f & a, Vec4f const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec4f and Vec4fb +static inline Vec4f operator & (Vec4f const a, Vec4fb const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_maskz_mov_ps(b, a); +#else + return _mm_and_ps(a, b); +#endif +} +static inline Vec4f operator & (Vec4fb const a, Vec4f const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec4f operator | (Vec4f const a, Vec4f const b) { + return _mm_or_ps(a, b); +} + +// vector operator |= : bitwise or +static inline Vec4f & operator |= (Vec4f & a, Vec4f const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec4f operator ^ (Vec4f const a, Vec4f const b) { + return _mm_xor_ps(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec4f & operator ^= (Vec4f & a, Vec4f const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec4fb operator ! (Vec4f const a) { + return a == Vec4f(0.0f); +} + + +/***************************************************************************** +* +* Functions for Vec4f +* +*****************************************************************************/ + +static inline Vec4f zero_4f() { + return _mm_setzero_ps(); +} + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec4f select(Vec4fb const s, Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_ps(b, s, a); +#else + return selectf(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec4f if_add(Vec4fb const f, Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 + return _mm_mask_add_ps (a, f, a, b); +#else + return a + (Vec4f(f) & b); +#endif +} + +// Conditional subtract: For all vector elements i: result[i] = f[i] ? 
(a[i] - b[i]) : a[i] +static inline Vec4f if_sub(Vec4fb const f, Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 + return _mm_mask_sub_ps (a, f, a, b); +#else + return a - (Vec4f(f) & b); +#endif +} + +// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec4f if_mul(Vec4fb const f, Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 + return _mm_mask_mul_ps (a, f, a, b); +#else + return a * select(f, b, 1.f); +#endif +} + +// Conditional divide: For all vector elements i: result[i] = f[i] ? (a[i] / b[i]) : a[i] +static inline Vec4f if_div(Vec4fb const f, Vec4f const a, Vec4f const b) { +#if INSTRSET >= 10 + return _mm_mask_div_ps (a, f, a, b); +#else + return a / select(f, b, 1.f); +#endif +} + +// Sign functions + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0f, -INF and -NAN +// Note that sign_bit(Vec4f(-0.0f)) gives true, while Vec4f(-0.0f) < Vec4f(0.0f) gives false +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec4fb sign_bit(Vec4f const a) { + Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer + Vec4i t2 = t1 >> 31; // extend sign bit +#if INSTRSET >= 10 + return t2 != 0; +#else + return _mm_castsi128_ps(t2); // reinterpret as 32-bit Boolean +#endif +} + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec4f sign_combine(Vec4f const a, Vec4f const b) { +#if INSTRSET < 10 + return a ^ (b & Vec4f(-0.0f)); +#else + return _mm_castsi128_ps (_mm_ternarylogic_epi32( + _mm_castps_si128(a), _mm_castps_si128(b), Vec4i(0x80000000), 0x78)); +#endif +} + +// Categorization functions + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec4fb is_finite(Vec4f const a) { +#if INSTRSET >= 10 + return __mmask8(_mm_fpclass_ps_mask(a, 0x99) ^ 0x0F); +#else + Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer + Vec4i t2 = t1 << 1; // shift out sign bit + Vec4i t3 = Vec4i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s + return Vec4ib(t3); +#endif +} + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec4fb is_inf(Vec4f const a) { +#if INSTRSET >= 10 + return __mmask8(_mm_fpclass_ps_mask(a, 0x18)); +#else + Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer + Vec4i t2 = t1 << 1; // shift out sign bit + return t2 == Vec4i(0xFF000000); // exponent is all 1s, fraction is 0 +#endif +} + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +#if INSTRSET >= 10 +static inline Vec4fb is_nan(Vec4f const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return Vec4fb(_mm_fpclass_ps_mask(a, 0x81)); +} +//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +//__attribute__((optimize("-fno-unsafe-math-optimizations"))) +//static inline Vec4fb is_nan(Vec4f const a) { +// return a != a; // not safe with -ffinite-math-only compiler option +//} +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER) 
+static inline Vec4fb is_nan(Vec4f const a) { + __m128 aa = a; + __m128i unordered; + __asm volatile("vcmpps $3, %1, %1, %0" : "=x" (unordered) : "x" (aa) ); + return Vec4fb(unordered); +} +#else +static inline Vec4fb is_nan(Vec4f const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return _mm_cmp_ps(a, a, 3); // compare unordered + // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option +} +#endif + +// Function is_subnormal: gives true for elements that are denormal (subnormal) +// false for finite numbers, zero, NAN and INF +static inline Vec4fb is_subnormal(Vec4f const a) { +#if INSTRSET >= 10 + return Vec4fb(_mm_fpclass_ps_mask(a, 0x20)); +#else + Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer + Vec4i t2 = t1 << 1; // shift out sign bit + Vec4i t3 = 0xFF000000; // exponent mask + Vec4i t4 = t2 & t3; // exponent + Vec4i t5 = _mm_andnot_si128(t3, t2); // fraction + return Vec4ib((t4 == 0) & (t5 != 0)); // exponent = 0 and fraction != 0 +#endif +} + +// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) +// false for finite numbers, NAN and INF +static inline Vec4fb is_zero_or_subnormal(Vec4f const a) { +#if INSTRSET >= 10 + return Vec4fb(_mm_fpclass_ps_mask(a, 0x26)); +#else + Vec4i t = _mm_castps_si128(a); // reinterpret as 32-bit integer + t &= 0x7F800000; // isolate exponent + return t == 0; // exponent = 0 +#endif +} + +// Function infinite4f: returns a vector where all elements are +INF +static inline Vec4f infinite4f() { + return _mm_castsi128_ps(_mm_set1_epi32(0x7F800000)); +} + +// Function nan4f: returns a vector where all elements are NAN (quiet) +static inline Vec4f nan4f(int n = 0x10) { + return nan_vec(n); +} + +// General arithmetic functions, etc. + +// Horizontal add: Calculates the sum of all vector elements. +static inline float horizontal_add(Vec4f const a) { +#if INSTRSET >= 3 && false // SSE3 + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding + __m128 t1 = _mm_hadd_ps(a, a); + __m128 t2 = _mm_hadd_ps(t1, t1); + return _mm_cvtss_f32(t2); +#else + __m128 t1 = _mm_movehl_ps(a, a); + __m128 t2 = _mm_add_ps(a, t1); + __m128 t3 = _mm_shuffle_ps(t2, t2, 1); + __m128 t4 = _mm_add_ss(t2, t3); + return _mm_cvtss_f32(t4); +#endif +} + +// function max: a > b ? a : b +static inline Vec4f max(Vec4f const a, Vec4f const b) { + return _mm_max_ps(a, b); +} + +// function min: a < b ? a : b +static inline Vec4f min(Vec4f const a, Vec4f const b) { + return _mm_min_ps(a, b); +} +// NAN-safe versions of maximum and minimum are in vector_convert.h + +// function abs: absolute value +static inline Vec4f abs(Vec4f const a) { +#if INSTRSET >= 10 // AVX512VL + return _mm_range_ps(a, a, 8); +#else + __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); + return _mm_and_ps(a, mask); +#endif +} + +// function sqrt: square root +static inline Vec4f sqrt(Vec4f const a) { + return _mm_sqrt_ps(a); +} + +// function square: a * a +static inline Vec4f square(Vec4f const a) { + return a * a; +} + +// pow(vector,int) function template +template +static inline VTYPE pow_template_i(VTYPE const x0, int n) { + VTYPE x = x0; // a^(2^i) + VTYPE y(1.0f); // accumulator + if (n >= 0) { // make sure n is not negative + while (true) { // loop for each bit in n + if (n & 1) y *= x; // multiply if bit = 1 + n >>= 1; // get next bit of n + if (n == 0) return y; // finished + x *= x; // x = a^2, a^4, a^8, etc. 
+ } + } + else { + // n < 0 + if (uint32_t(n) == 0x80000000u) return nan_vec(); // integer overflow + return VTYPE(1.0f) / pow_template_i(x0, -n); // reciprocal + } +} + +// The purpose of this template is to prevent implicit conversion of a float +// exponent to int when calling pow(vector, float) and vectormath_exp.h is not included +template static Vec4f pow(Vec4f const a, TT const n); // = delete + +// Raise floating point numbers to integer power n +template <> +inline Vec4f pow(Vec4f const x0, int const n) { + return pow_template_i(x0, n); +} + +// allow conversion from unsigned int +template <> +inline Vec4f pow(Vec4f const x0, uint32_t const n) { + return pow_template_i(x0, (int)n); +} + +// Raise floating point numbers to integer power n, where n is a compile-time constant + +// gcc can optimize pow_template_i to generate the same as the code below. MS and Clang can not. +// Therefore, this code is kept +// to do: test on Intel compiler +template +static inline V pow_n(V const a) { + if (n == 0x80000000) return nan_vec(); // integer overflow + if (n < 0) return V(1.0f) / pow_n(a); + if (n == 0) return V(1.0f); + if (n >= 256) return pow(a, n); + V x = a; // a^(2^i) + V y; // accumulator + const int lowest = n - (n & (n - 1)); // lowest set bit in n + if (n & 1) y = x; + if (n < 2) return y; + x = x * x; // x^2 + if (n & 2) { + if (lowest == 2) y = x; else y *= x; + } + if (n < 4) return y; + x = x * x; // x^4 + if (n & 4) { + if (lowest == 4) y = x; else y *= x; + } + if (n < 8) return y; + x = x * x; // x^8 + if (n & 8) { + if (lowest == 8) y = x; else y *= x; + } + if (n < 16) return y; + x = x * x; // x^16 + if (n & 16) { + if (lowest == 16) y = x; else y *= x; + } + if (n < 32) return y; + x = x * x; // x^32 + if (n & 32) { + if (lowest == 32) y = x; else y *= x; + } + if (n < 64) return y; + x = x * x; // x^64 + if (n & 64) { + if (lowest == 64) y = x; else y *= x; + } + if (n < 128) return y; + x = x * x; // x^128 + if (n & 128) { + if (lowest == 128) y = x; else y *= x; + } + return y; +} + +// implement as function pow(vector, const_int) +template +static inline Vec4f pow(Vec4f const a, Const_int_t) { + return pow_n(a); +} + +// implement the same as macro pow_const(vector, int) +#ifdef VCL_NAMESPACE +#define pow_const(x,n) pow(x, VCL_NAMESPACE::Const_int_t()) +#else +#define pow_const(x,n) pow(x,Const_int_t()) +#endif + +static inline Vec4f round(Vec4f const a) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_round_ps(a, 8); +#else // SSE2 + Vec4i y1 = _mm_cvtps_epi32(a); // convert to integer + Vec4f y2 = _mm_cvtepi32_ps(y1); // convert back to float +#ifdef SIGNED_ZERO + y2 |= (a & Vec4f(-0.0f)); // sign of zero +#endif + return select(y1 != 0x80000000, y2, a); // use original value if integer overflows +#endif +} + +// function truncate: round towards zero. (result as float vector) +static inline Vec4f truncate(Vec4f const a) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_round_ps(a, 3 + 8); +#else // SSE2 + Vec4i y1 = _mm_cvttps_epi32(a); // truncate to integer + Vec4f y2 = _mm_cvtepi32_ps(y1); // convert back to float +#ifdef SIGNED_ZERO + y2 |= (a & Vec4f(-0.0f)); // sign of zero +#endif + return select(y1 != 0x80000000, y2, a); // use original value if integer overflows +#endif +} + +// function floor: round towards minus infinity. 
(result as float vector) +static inline Vec4f floor(Vec4f const a) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_round_ps(a, 1 + 8); +#else // SSE2 + Vec4f y = round(a); // round + y -= Vec4f(1.f) & (y > a); // subtract 1 if bigger +#ifdef SIGNED_ZERO + y |= (a & Vec4f(-0.0f)); // sign of zero +#endif + return y; +#endif +} + +// function ceil: round towards plus infinity. (result as float vector) +static inline Vec4f ceil(Vec4f const a) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_round_ps(a, 2 + 8); +#else // SSE2 + Vec4f y = round(a); // round + y += Vec4f(1.f) & (y < a); // add 1 if bigger +#ifdef SIGNED_ZERO + y |= (a & Vec4f(-0.0f)); // sign of zero +#endif + return y; +#endif +} + +// function roundi: round to nearest integer (even). (result as integer vector) +static inline Vec4i roundi(Vec4f const a) { + // Note: assume MXCSR control register is set to rounding + return _mm_cvtps_epi32(a); +} +//static inline Vec4i round_to_int(Vec4f const a) { return roundi(a); } // deprecated + +// function truncatei: round towards zero. (result as integer vector) +static inline Vec4i truncatei(Vec4f const a) { + return _mm_cvttps_epi32(a); +} +//static inline Vec4i truncate_to_int(Vec4f const a) { return truncatei(a); } // deprecated + +// function to_float: convert integer vector to float vector +static inline Vec4f to_float(Vec4i const a) { + return _mm_cvtepi32_ps(a); +} + +// function to_float: convert unsigned integer vector to float vector +static inline Vec4f to_float(Vec4ui const a) { +#if INSTRSET >= 10 && (!defined(_MSC_VER) || defined(__INTEL_COMPILER)) // _mm_cvtepu32_ps missing in MS VS2019 + return _mm_cvtepu32_ps(a); +#elif INSTRSET >= 9 // __AVX512F__ + return _mm512_castps512_ps128(_mm512_cvtepu32_ps(_mm512_castsi128_si512(a))); +#else + Vec4f b = to_float(Vec4i(a & 0xFFFFF)); // 20 bits + Vec4f c = to_float(Vec4i(a >> 20)); // remaining bits + Vec4f d = b + c * 1048576.f; // 2^20 + return d; +#endif +} + +// Approximate math functions + +// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11) +static inline Vec4f approx_recipr(Vec4f const a) { +#ifdef __AVX512ER__ // AVX512ER: full precision + // todo: if future processors have both AVX512ER and AVX512VL: _mm128_rcp28_round_ps(a, _MM_FROUND_NO_EXC); + return _mm512_castps512_ps128(_mm512_rcp28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC)); +#elif INSTRSET >= 10 // AVX512VL: 14 bit precision + return _mm_rcp14_ps(a); +#elif INSTRSET >= 9 // AVX512F: 14 bit precision + return _mm512_castps512_ps128(_mm512_rcp14_ps(_mm512_castps128_ps512(a))); +#else // AVX: 11 bit precision + return _mm_rcp_ps(a); +#endif +} + +// Newton-Raphson refined approximate reciprocal (23 bit precision) +static inline Vec4f rcp_nr(Vec4f const a) { + Vec4f nr = _mm_rcp_ps(a); + Vec4f muls = nr * nr * a; + Vec4f dbl = nr + nr; + return dbl - muls; +} + +// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11) +static inline Vec4f approx_rsqrt(Vec4f const a) { + // use more accurate version if available. 
(none of these will raise exceptions on zero) +#ifdef __AVX512ER__ // AVX512ER: full precision + // todo: if future processors have both AVX512ER and AVX521VL: _mm128_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC); + return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC)); +#elif INSTRSET >= 10 && !defined(_MSC_VER) // missing in VS2019 + return _mm_rsqrt14_ps(a); +#elif INSTRSET >= 9 // AVX512F: 14 bit precision + return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a))); +#else // SSE: 11 bit precision + return _mm_rsqrt_ps(a); +#endif +} + +// Fused multiply and add functions + +// Multiply and add +static inline Vec4f mul_add(Vec4f const a, Vec4f const b, Vec4f const c) { +#ifdef __FMA__ + return _mm_fmadd_ps(a, b, c); +#elif defined (__FMA4__) + return _mm_macc_ps(a, b, c); +#else + return a * b + c; +#endif +} + +// Multiply and subtract +static inline Vec4f mul_sub(Vec4f const a, Vec4f const b, Vec4f const c) { +#ifdef __FMA__ + return _mm_fmsub_ps(a, b, c); +#elif defined (__FMA4__) + return _mm_msub_ps(a, b, c); +#else + return a * b - c; +#endif +} + +// Multiply and inverse subtract +static inline Vec4f nmul_add(Vec4f const a, Vec4f const b, Vec4f const c) { +#ifdef __FMA__ + return _mm_fnmadd_ps(a, b, c); +#elif defined (__FMA4__) + return _mm_nmacc_ps(a, b, c); +#else + return c - a * b; +#endif +} + +// Multiply and subtract with extra precision on the intermediate calculations, +// even if FMA instructions not supported, using Veltkamp-Dekker split. +// This is used in mathematical functions. Do not use it in general code +// because it is inaccurate in certain cases +static inline Vec4f mul_sub_x(Vec4f const a, Vec4f const b, Vec4f const c) { +#ifdef __FMA__ + return _mm_fmsub_ps(a, b, c); +#elif defined (__FMA4__) + return _mm_msub_ps(a, b, c); +#else + // calculate a * b - c with extra precision + Vec4i upper_mask = -(1 << 12); // mask to remove lower 12 bits + Vec4f a_high = a & Vec4f(_mm_castsi128_ps(upper_mask));// split into high and low parts + Vec4f b_high = b & Vec4f(_mm_castsi128_ps(upper_mask)); + Vec4f a_low = a - a_high; + Vec4f b_low = b - b_high; + Vec4f r1 = a_high * b_high; // this product is exact + Vec4f r2 = r1 - c; // subtract c from high product + Vec4f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product + return r3; // + ((r2 - r1) + c); +#endif +} + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 +static inline Vec4i exponent(Vec4f const a) { + Vec4ui t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer + Vec4ui t2 = t1 << 1; // shift out sign bit + Vec4ui t3 = t2 >> 24; // shift down logical to position 0 + Vec4i t4 = Vec4i(t3) - 0x7F; // subtract bias from exponent + return t4; +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f +// NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h ! 
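// --------------------------------------------------------------------------
// Editorial usage sketch (not part of the upstream VCL sources or of this
// patch): evaluating a cubic polynomial with Horner's scheme using the
// mul_add() wrapper defined above, so each step maps to a single FMA when the
// target supports it. The coefficient values and the name poly3_example are
// illustrative only.
static inline Vec4f poly3_example(Vec4f const x) {
    const Vec4f c0(1.0f), c1(0.5f), c2(0.25f), c3(0.125f);
    Vec4f y = c3;
    y = mul_add(y, x, c2);   // y = c3*x + c2
    y = mul_add(y, x, c1);   // y = (c3*x + c2)*x + c1
    y = mul_add(y, x, c0);   // y = ((c3*x + c2)*x + c1)*x + c0
    return y;
}
// --------------------------------------------------------------------------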
+static inline Vec4f fraction(Vec4f const a) { +#if INSTRSET >= 10 + return _mm_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); +#else + Vec4ui t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer + Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F800000); // set exponent to 0 + bias + return _mm_castsi128_ps(t2); +#endif +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0f +// n >= 128 gives +INF +// n <= -127 gives 0.0f +// This function will never produce denormals, and never raise exceptions +static inline Vec4f exp2(Vec4i const n) { + Vec4i t1 = max(n, -0x7F); // limit to allowed range + Vec4i t2 = min(t1, 0x80); + Vec4i t3 = t2 + 0x7F; // add bias + Vec4i t4 = t3 << 23; // put exponent into position 23 + return _mm_castsi128_ps(t4); // reinterpret as float +} +//static Vec4f exp2(Vec4f const x); // defined in vectormath_exp.h + + +// Control word manipulaton +// ------------------------ +// The MXCSR control word has the following bits: +// 0: Invalid Operation Flag +// 1: Denormal Flag (=subnormal) +// 2: Divide-by-Zero Flag +// 3: Overflow Flag +// 4: Underflow Flag +// 5: Precision Flag +// 6: Denormals Are Zeros (=subnormals) +// 7: Invalid Operation Mask +// 8: Denormal Operation Mask (=subnormal) +// 9: Divide-by-Zero Mask +// 10: Overflow Mask +// 11: Underflow Mask +// 12: Precision Mask +// 13-14: Rounding control +// 00: round to nearest or even +// 01: round down towards -infinity +// 10: round up towards +infinity +// 11: round towards zero (truncate) +// 15: Flush to Zero + +// Function get_control_word: +// Read the MXCSR control word +static inline uint32_t get_control_word() { + return _mm_getcsr(); +} + +// Function set_control_word: +// Write the MXCSR control word +static inline void set_control_word(uint32_t w) { + _mm_setcsr(w); +} + +// Function no_subnormals: +// Set "Denormals Are Zeros" and "Flush to Zero" mode to avoid the extremely +// time-consuming denormals in case of underflow +static inline void no_subnormals() { + uint32_t t1 = get_control_word(); + t1 |= (1 << 6) | (1 << 15); // set bit 6 and 15 in MXCSR + set_control_word(t1); +} + +// Function reset_control_word: +// Set the MXCSR control word to the default value 0x1F80. +// This will mask floating point exceptions, set rounding mode to nearest (or even), +// and allow denormals. 
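// --------------------------------------------------------------------------
// Editorial usage sketch (not part of the upstream VCL sources or of this
// patch): saving the MXCSR word with get_control_word(), enabling
// flush-to-zero / denormals-are-zero via no_subnormals() around a hot loop,
// and restoring the caller's settings afterwards. The name scaled_sum_example
// and its parameters are illustrative only.
static inline float scaled_sum_example(const float * p, int n, float scale) {
    uint32_t saved = get_control_word();   // remember caller's MXCSR settings
    no_subnormals();                       // avoid slow subnormal handling
    Vec4f sum(0.0f);
    int i = 0;
    for (; i + 4 <= n; i += 4) {
        Vec4f v = Vec4f().load(p + i);             // unaligned load of 4 floats
        sum = mul_add(v, Vec4f(scale), sum);       // sum += v * scale
    }
    Vec4f tail = Vec4f().load_partial(n - i, p + i);  // remaining 0..3 elements
    sum = mul_add(tail, Vec4f(scale), sum);
    set_control_word(saved);               // restore caller's MXCSR settings
    return horizontal_add(sum);
}
// --------------------------------------------------------------------------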
+static inline void reset_control_word() { + set_control_word(0x1F80); +} + + +// change signs on vectors Vec4f +// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change +template +static inline Vec4f change_sign(Vec4f const a) { + if ((i0 | i1 | i2 | i3) == 0) return a; + __m128i mask = constant4ui(); + return _mm_xor_ps(a, _mm_castsi128_ps(mask)); // flip sign bits +} + + +/***************************************************************************** +* +* Vec2d: Vector of 2 double precision floating point values +* +*****************************************************************************/ + +class Vec2d { +protected: + __m128d xmm; // double vector +public: + // Default constructor: + Vec2d() { + } + // Constructor to broadcast the same value into all elements: + Vec2d(double d) { + xmm = _mm_set1_pd(d); + } + // Constructor to build from all elements: + Vec2d(double d0, double d1) { + xmm = _mm_setr_pd(d0, d1); + } + // Constructor to convert from type __m128d used in intrinsics: + Vec2d(__m128d const x) { + xmm = x; + } + // Assignment operator to convert from type __m128d used in intrinsics: + Vec2d & operator = (__m128d const x) { + xmm = x; + return *this; + } + // Type cast operator to convert to __m128d used in intrinsics + operator __m128d() const { + return xmm; + } + // Member function to load from array (unaligned) + Vec2d & load(double const * p) { + xmm = _mm_loadu_pd(p); + return *this; + } + // Member function to load from array, aligned by 16 + // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1, + // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. + // You may use load_a instead of load if you are certain that p points to an address + // divisible by 16. + Vec2d const load_a(double const * p) { + xmm = _mm_load_pd(p); + return *this; + } + // Member function to store into array (unaligned) + void store(double * p) const { + _mm_storeu_pd(p, xmm); + } + // Member function storing into array, aligned by 16 + // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1, + // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. + // You may use store_a instead of store if you are certain that p points to an address + // divisible by 16. + void store_a(double * p) const { + _mm_store_pd(p, xmm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 16 + void store_nt(double * p) const { + _mm_stream_pd(p, xmm); + } + // Partial load. Load n elements and set the rest to 0 + Vec2d & load_partial(int n, double const * p) { +#if INSTRSET >= 10 // AVX512VL + xmm = _mm_maskz_loadu_pd(__mmask8((1u << n) - 1), p); +#else + if (n == 1) { + xmm = _mm_load_sd(p); + } + else if (n == 2) { + load(p); + } + else { + xmm = _mm_setzero_pd(); + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, double * p) const { +#if INSTRSET >= 10 // AVX512VL + _mm_mask_storeu_pd(p, __mmask8((1u << n) - 1), xmm); +#else + if (n == 1) { + _mm_store_sd(p, xmm); + } + else if (n == 2) { + store(p); + } +#endif + } + // cut off vector to n elements. 
The last 4-n elements are set to zero + Vec2d & cutoff(int n) { +#if INSTRSET >= 10 + xmm = _mm_maskz_mov_pd(__mmask8((1u << n) - 1), xmm); +#else + xmm = _mm_castps_pd(Vec4f(_mm_castpd_ps(xmm)).cutoff(n * 2)); +#endif + return *this; + } + // Member function to change a single element in vector + // Note: This function is inefficient. Use load function if changing more than one element + Vec2d const insert(int index, double value) { +#if INSTRSET >= 10 // AVX512VL + xmm = _mm_mask_movedup_pd(xmm, __mmask8(1u << index), _mm_set_sd(value)); +#else + __m128d v2 = _mm_set_sd(value); + if (index == 0) { + xmm = _mm_shuffle_pd(v2, xmm, 2); + } + else { + xmm = _mm_shuffle_pd(xmm, v2, 0); + } +#endif + return *this; + } + // Member function extract a single element from vector + double extract(int index) const { +#if INSTRSET >= 10 // AVX512VL + __m128d x = _mm_mask_unpackhi_pd(xmm, __mmask8(index), xmm, xmm); + return _mm_cvtsd_f64(x); +#else + double x[2]; + store(x); + return x[index & 1]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + double operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 2; + } + static constexpr int elementtype() { + return 17; + } + typedef __m128d registertype; +}; + + +/***************************************************************************** +* +* Operators for Vec2d +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec2d operator + (Vec2d const a, Vec2d const b) { + return _mm_add_pd(a, b); +} + +// vector operator + : add vector and scalar +static inline Vec2d operator + (Vec2d const a, double b) { + return a + Vec2d(b); +} +static inline Vec2d operator + (double a, Vec2d const b) { + return Vec2d(a) + b; +} + +// vector operator += : add +static inline Vec2d & operator += (Vec2d & a, Vec2d const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec2d operator ++ (Vec2d & a, int) { + Vec2d a0 = a; + a = a + 1.0; + return a0; +} + +// prefix operator ++ +static inline Vec2d & operator ++ (Vec2d & a) { + a = a + 1.0; + return a; +} + +// vector operator - : subtract element by element +static inline Vec2d operator - (Vec2d const a, Vec2d const b) { + return _mm_sub_pd(a, b); +} + +// vector operator - : subtract vector and scalar +static inline Vec2d operator - (Vec2d const a, double b) { + return a - Vec2d(b); +} +static inline Vec2d operator - (double a, Vec2d const b) { + return Vec2d(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec2d operator - (Vec2d const a) { + return _mm_xor_pd(a, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); +} + +// vector operator -= : subtract +static inline Vec2d & operator -= (Vec2d & a, Vec2d const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec2d operator -- (Vec2d & a, int) { + Vec2d a0 = a; + a = a - 1.0; + return a0; +} + +// prefix operator -- +static inline Vec2d & operator -- (Vec2d & a) { + a = a - 1.0; + return a; +} + +// vector operator * : multiply element by element +static inline Vec2d operator * (Vec2d const a, Vec2d const b) { + return _mm_mul_pd(a, b); +} + +// vector operator * : multiply vector and scalar +static inline Vec2d operator * (Vec2d const a, double b) { + return a * Vec2d(b); +} +static inline Vec2d operator * 
(double a, Vec2d const b) { + return Vec2d(a) * b; +} + +// vector operator *= : multiply +static inline Vec2d & operator *= (Vec2d & a, Vec2d const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec2d operator / (Vec2d const a, Vec2d const b) { + return _mm_div_pd(a, b); +} + +// vector operator / : divide vector and scalar +static inline Vec2d operator / (Vec2d const a, double b) { + return a / Vec2d(b); +} +static inline Vec2d operator / (double a, Vec2d const b) { + return Vec2d(a) / b; +} + +// vector operator /= : divide +static inline Vec2d & operator /= (Vec2d & a, Vec2d const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec2db operator == (Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_pd_mask(a, b, 0); +#else + return _mm_cmpeq_pd(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec2db operator != (Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_pd_mask(a, b, 4); +#else + return _mm_cmpneq_pd(a, b); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec2db operator < (Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_pd_mask(a, b, 1); +#else + return _mm_cmplt_pd(a, b); +#endif +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec2db operator <= (Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_pd_mask(a, b, 2); +#else + return _mm_cmple_pd(a, b); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec2db operator > (Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_pd_mask(a, b, 6); +#else + return b < a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec2db operator >= (Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_pd_mask(a, b, 5); +#else + return b <= a; +#endif +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec2d operator & (Vec2d const a, Vec2d const b) { + return _mm_and_pd(a, b); +} + +// vector operator &= : bitwise and +static inline Vec2d & operator &= (Vec2d & a, Vec2d const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec2d and Vec2db +static inline Vec2d operator & (Vec2d const a, Vec2db const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_maskz_mov_pd(b, a); +#else + return _mm_and_pd(a, b); +#endif +} +static inline Vec2d operator & (Vec2db const a, Vec2d const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec2d operator | (Vec2d const a, Vec2d const b) { + return _mm_or_pd(a, b); +} + +// vector operator |= : bitwise or +static inline Vec2d & operator |= (Vec2d & a, Vec2d const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec2d operator ^ (Vec2d const a, Vec2d const b) { + return _mm_xor_pd(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec2d & operator ^= (Vec2d & a, Vec2d const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec2db operator ! 
(Vec2d const a) { + return a == Vec2d(0.0); +} + + +/***************************************************************************** +* +* Functions for Vec2d +* +*****************************************************************************/ + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; +// Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). +// No other values are allowed. +static inline Vec2d select(Vec2db const s, Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_pd(b, s, a); +#else + return selectd(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec2d if_add(Vec2db const f, Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 + return _mm_mask_add_pd (a, f, a, b); +#else + return a + (Vec2d(f) & b); +#endif +} + +// Conditional subtract +static inline Vec2d if_sub(Vec2db const f, Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 + return _mm_mask_sub_pd (a, f, a, b); +#else + return a - (Vec2d(f) & b); +#endif +} + +// Conditional multiply +static inline Vec2d if_mul(Vec2db const f, Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 + return _mm_mask_mul_pd (a, f, a, b); +#else + return a * select(f, b, 1.); +#endif +} + +// Conditional divide +static inline Vec2d if_div(Vec2db const f, Vec2d const a, Vec2d const b) { +#if INSTRSET >= 10 + return _mm_mask_div_pd (a, f, a, b); +#else + return a / select(f, b, 1.); +#endif +} + +// Sign functions + +// change signs on vectors Vec2d +// Each index i0 - i1 is 1 for changing sign on the corresponding element, 0 for no change +template +static inline Vec2d change_sign(Vec2d const a) { + if ((i0 | i1) == 0) return a; + __m128i mask = constant4ui<0, i0 ? 0x80000000 : 0, 0, i1 ? 
0x80000000 : 0>(); + return _mm_xor_pd(a, _mm_castsi128_pd(mask)); // flip sign bits +} + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0, -INF and -NAN +// Note that sign_bit(Vec2d(-0.0)) gives true, while Vec2d(-0.0) < Vec2d(0.0) gives false +static inline Vec2db sign_bit(Vec2d const a) { + Vec2q t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer + Vec2q t2 = t1 >> 63; // extend sign bit +#if INSTRSET >= 10 + return t2 != 0; +#else + return _mm_castsi128_pd(t2); // reinterpret as 64-bit Boolean +#endif +} + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec2d sign_combine(Vec2d const a, Vec2d const b) { +#if INSTRSET < 10 + return a ^ (b & Vec2d(-0.0)); +#else + return _mm_castsi128_pd (_mm_ternarylogic_epi64( + _mm_castpd_si128(a), _mm_castpd_si128(b), Vec2q(0x8000000000000000), 0x78)); +#endif +} + +// Categorization functions + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +static inline Vec2db is_finite(Vec2d const a) { +#if INSTRSET >= 10 + return __mmask8(_mm_fpclass_pd_mask(a, 0x99) ^ 0x03); +#else + Vec2q t1 = _mm_castpd_si128(a); // reinterpret as integer + Vec2q t2 = t1 << 1; // shift out sign bit + Vec2q t3 = 0xFFE0000000000000ll; // exponent mask + Vec2qb t4 = Vec2q(t2 & t3) != t3; // exponent field is not all 1s + return t4; +#endif +} + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +static inline Vec2db is_inf(Vec2d const a) { +#if INSTRSET >= 10 + return _mm_fpclass_pd_mask(a, 0x18); +#else + Vec2q t1 = _mm_castpd_si128(a); // reinterpret as integer + Vec2q t2 = t1 << 1; // shift out sign bit + return t2 == 0xFFE0000000000000ll; // exponent is all 1s, fraction is 0 +#endif +} + + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +#if INSTRSET >= 10 +static inline Vec2db is_nan(Vec2d const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return Vec2db(_mm_fpclass_pd_mask(a, 0x81)); +} +//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +//__attribute__((optimize("-fno-unsafe-math-optimizations"))) +//static inline Vec2db is_nan(Vec2d const a) { +// return a != a; // not safe with -ffinite-math-only compiler option +//} +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER) +static inline Vec2db is_nan(Vec2d const a) { + __m128d aa = a; + __m128i unordered; + __asm volatile("vcmppd $3, %1, %1, %0" : "=x" (unordered) : "x" (aa) ); + return Vec2db(unordered); +} +#else +static inline Vec2db is_nan(Vec2d const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return _mm_cmp_pd(a, a, 3); // compare unordered + // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option +} +#endif + + +// Function is_subnormal: gives true for elements that are subnormal (denormal) +// false for finite numbers, zero, NAN and INF +static inline Vec2db is_subnormal(Vec2d const a) { +#if INSTRSET >= 10 + return _mm_fpclass_pd_mask(a, 0x20); +#else + Vec2q t1 = _mm_castpd_si128(a); // reinterpret as 32-bit integer + Vec2q t2 = t1 << 1; // shift out sign bit + Vec2q t3 = 0xFFE0000000000000ll; // exponent mask + Vec2q t4 = t2 & t3; // exponent + 
Vec2q t5 = _mm_andnot_si128(t3, t2);// fraction + return Vec2qb((t4 == 0) & (t5 != 0)); // exponent = 0 and fraction != 0 +#endif +} + +// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) +// false for finite numbers, NAN and INF +static inline Vec2db is_zero_or_subnormal(Vec2d const a) { +#if INSTRSET >= 10 + return _mm_fpclass_pd_mask(a, 0x26); +#else + Vec2q t = _mm_castpd_si128(a); // reinterpret as 32-bit integer + t &= 0x7FF0000000000000ll; // isolate exponent + return t == 0; // exponent = 0 +#endif +} + +// General arithmetic functions, etc. + +// Horizontal add: Calculates the sum of all vector elements. +static inline double horizontal_add(Vec2d const a) { + +#if false && INSTRSET >= 3 // SSE3 + // This version causes errors in Clang version 9.0 (https://bugs.llvm.org/show_bug.cgi?id=44111) + // It is also inefficient on most processors, so we drop it + __m128d t1 = _mm_hadd_pd(a, a); + return _mm_cvtsd_f64(t1); + +#elif true + // This version is OK + __m128d t1 = _mm_unpackhi_pd(a, a); + __m128d t2 = _mm_add_pd(a, t1); + return _mm_cvtsd_f64(t2); + +#else + // This version is also OK + __m128 t0 = _mm_castpd_ps(a); + __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0, t0)); + __m128d t2 = _mm_add_sd(a, t1); + return _mm_cvtsd_f64(t2); + +#endif +} + +// function max: a > b ? a : b +static inline Vec2d max(Vec2d const a, Vec2d const b) { + return _mm_max_pd(a, b); +} + +// function min: a < b ? a : b +static inline Vec2d min(Vec2d const a, Vec2d const b) { + return _mm_min_pd(a, b); +} +// NAN-safe versions of maximum and minimum are in vector_convert.h + +// function abs: absolute value +static inline Vec2d abs(Vec2d const a) { +#if INSTRSET >= 10 // AVX512VL + return _mm_range_pd(a, a, 8); +#else + __m128d mask = _mm_castsi128_pd(_mm_setr_epi32(-1, 0x7FFFFFFF, -1, 0x7FFFFFFF)); + return _mm_and_pd(a, mask); +#endif +} + +// function sqrt: square root +static inline Vec2d sqrt(Vec2d const a) { + return _mm_sqrt_pd(a); +} + +// function square: a * a +static inline Vec2d square(Vec2d const a) { + return a * a; +} + +// pow(Vec2d, int): +// The purpose of this template is to prevent implicit conversion of a float +// exponent to int when calling pow(vector, float) and vectormath_exp.h is not included +template static Vec2d pow(Vec2d const a, TT const n); + +// Raise floating point numbers to integer power n +template <> +inline Vec2d pow(Vec2d const x0, int const n) { + return pow_template_i(x0, n); +} + +// allow conversion from unsigned int +template <> +inline Vec2d pow(Vec2d const x0, uint32_t const n) { + return pow_template_i(x0, (int)n); +} + +// Raise floating point numbers to integer power n, where n is a compile-time constant +template +static inline Vec2d pow(Vec2d const a, Const_int_t) { + return pow_n(a); +} + +// function round: round to nearest integer (even). (result as double vector) +#if INSTRSET >= 5 // SSE4.1 supported +static inline Vec2d round(Vec2d const a) { + return _mm_round_pd(a, 0 + 8); +} +#else + +// avoid unsafe optimization in function round +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5 +static inline Vec2d round(Vec2d const a) __attribute__((optimize("-fno-unsafe-math-optimizations"))); +#elif defined(__clang__) && INSTRSET < 5 +static inline Vec2d round(Vec2d const a) __attribute__((optnone)); +#elif defined (FLOAT_CONTROL_PRECISE_FOR_ROUND) +#pragma float_control(push) +#pragma float_control(precise,on) +#endif +// function round: round to nearest integer (even). 
(result as double vector) +static inline Vec2d round(Vec2d const a) { + // Note: assume MXCSR control register is set to rounding + // (don't use conversion to int, it will limit the value to +/- 2^31) + Vec2d signmask = _mm_castsi128_pd(constant4ui<0, 0x80000000, 0, 0x80000000>()); // -0.0 + Vec2d magic = _mm_castsi128_pd(constant4ui<0, 0x43300000, 0, 0x43300000>()); // magic number = 2^52 + Vec2d sign = _mm_and_pd(a, signmask); // signbit of a + Vec2d signedmagic = _mm_or_pd(magic, sign); // magic number with sign of a + Vec2d y = a + signedmagic - signedmagic; // round by adding magic number +#ifdef SIGNED_ZERO + y |= (a & Vec2d(-0.0)); // sign of zero +#endif + return y; +} +#if defined (FLOAT_CONTROL_PRECISE_FOR_ROUND) +#pragma float_control(pop) +#endif +#endif + +// function truncate: round towards zero. (result as double vector) +static inline Vec2d truncate(Vec2d const a) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_round_pd(a, 3 + 8); +#else // SSE2 + Vec2d a1 = abs(a); // abs + Vec2d y1 = round(a1); // round + Vec2d y2 = y1 - (Vec2d(1.0) & (y1 > a1)); // subtract 1 if bigger + Vec2d y3 = y2 | (a & Vec2d(-0.)); // put the sign back in + return y3; +#endif +} + +// function floor: round towards minus infinity. (result as double vector) +static inline Vec2d floor(Vec2d const a) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_round_pd(a, 1 + 8); +#else // SSE2 + Vec2d y = round(a); // round + y -= Vec2d(1.0) & (y > a); // subtract 1 if bigger +#ifdef SIGNED_ZERO + y |= (a & Vec2d(-0.0)); // sign of zero +#endif + return y; +#endif +} + +// function ceil: round towards plus infinity. (result as double vector) +static inline Vec2d ceil(Vec2d const a) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_round_pd(a, 2 + 8); +#else // SSE2 + Vec2d y = round(a); // round + y += Vec2d(1.0) & (y < a); // add 1 if smaller +#ifdef SIGNED_ZERO + y |= (a & Vec2d(-0.0)); // sign of zero +#endif + return y; +#endif +} + +// function truncate_to_int32: round towards zero. +static inline Vec4i truncate_to_int32(Vec2d const a, Vec2d const b) { + Vec4i t1 = _mm_cvttpd_epi32(a); + Vec4i t2 = _mm_cvttpd_epi32(b); + return _mm_unpacklo_epi64(t1,t2); +} +//static inline Vec4i truncate_to_int(Vec2d const a, Vec2d const b) { // deprecated +// return truncate_to_int32(a, b);} + +// function truncate_to_int32: round towards zero. +static inline Vec4i truncate_to_int32(Vec2d const a) { + return _mm_cvttpd_epi32(a); +} +//static inline Vec4i truncate_to_int(Vec2d const a) { // deprecated +// return truncate_to_int32(a);} + +// function truncatei: round towards zero. (inefficient for lower instruction sets) +static inline Vec2q truncatei(Vec2d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + //return _mm_maskz_cvttpd_epi64( __mmask8(0xFF), a); + return _mm_cvttpd_epi64(a); +#else + double aa[2]; + a.store(aa); + return Vec2q(int64_t(aa[0]), int64_t(aa[1])); +#endif +} +//static inline Vec2q truncate_to_int64(Vec2d const a) { return truncatei(a); } // deprecated + +// function round_to_int: round to nearest integer (even). +// result as 32-bit integer vector +static inline Vec4i round_to_int32(Vec2d const a, Vec2d const b) { + // Note: assume MXCSR control register is set to rounding + Vec4i t1 = _mm_cvtpd_epi32(a); + Vec4i t2 = _mm_cvtpd_epi32(b); + return _mm_unpacklo_epi64(t1,t2); +} +//static inline Vec4i round_to_int(Vec2d const a, Vec2d const b) { // deprecated +// return round_to_int32(a, b);} + +// function round_to_int: round to nearest integer (even). 
+// result as 32-bit integer vector. Upper two values of result are 0 +static inline Vec4i round_to_int32(Vec2d const a) { + Vec4i t1 = _mm_cvtpd_epi32(a); + return t1; +} +//static inline Vec4i round_to_int(Vec2d const a) { return round_to_int32(a); } // deprecated + +// function round_to_int64: round to nearest or even. (inefficient for lower instruction sets) +static inline Vec2q roundi(Vec2d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm_cvtpd_epi64(a); +#else + return truncatei(round(a)); +#endif +} +//static inline Vec2q round_to_int64(Vec2d const a) { return roundi(a); } // deprecated + +// function to_double: convert integer vector elements to double vector (inefficient for lower instruction sets) +static inline Vec2d to_double(Vec2q const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm_maskz_cvtepi64_pd(__mmask8(0xFF), a); +#else + int64_t aa[2]; + a.store(aa); + return Vec2d(double(aa[0]), double(aa[1])); +#endif +} + +static inline Vec2d to_double(Vec2uq const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm_cvtepu64_pd(a); +#else + uint64_t aa[2]; // inefficient + a.store(aa); + return Vec2d(double(aa[0]), double(aa[1])); +#endif +} + +// function to_double_low: convert integer vector elements [0] and [1] to double vector +static inline Vec2d to_double_low(Vec4i const a) { + return _mm_cvtepi32_pd(a); +} + +// function to_double_high: convert integer vector elements [2] and [3] to double vector +static inline Vec2d to_double_high(Vec4i const a) { + return to_double_low(_mm_srli_si128(a, 8)); +} + +// function compress: convert two Vec2d to one Vec4f +static inline Vec4f compress(Vec2d const low, Vec2d const high) { + Vec4f t1 = _mm_cvtpd_ps(low); + Vec4f t2 = _mm_cvtpd_ps(high); + return _mm_shuffle_ps(t1, t2, 0x44); +} + +// Function extend_low : convert Vec4f vector elements [0] and [1] to Vec2d +static inline Vec2d extend_low(Vec4f const a) { + return _mm_cvtps_pd(a); +} + +// Function extend_high : convert Vec4f vector elements [2] and [3] to Vec2d +static inline Vec2d extend_high(Vec4f const a) { + return _mm_cvtps_pd(_mm_movehl_ps(a, a)); +} + +// Fused multiply and add functions + +// Multiply and add +static inline Vec2d mul_add(Vec2d const a, Vec2d const b, Vec2d const c) { +#ifdef __FMA__ + return _mm_fmadd_pd(a, b, c); +#elif defined (__FMA4__) + return _mm_macc_pd(a, b, c); +#else + return a * b + c; +#endif +} + +// Multiply and subtract +static inline Vec2d mul_sub(Vec2d const a, Vec2d const b, Vec2d const c) { +#ifdef __FMA__ + return _mm_fmsub_pd(a, b, c); +#elif defined (__FMA4__) + return _mm_msub_pd(a, b, c); +#else + return a * b - c; +#endif +} + +// Multiply and inverse subtract +static inline Vec2d nmul_add(Vec2d const a, Vec2d const b, Vec2d const c) { +#ifdef __FMA__ + return _mm_fnmadd_pd(a, b, c); +#elif defined (__FMA4__) + return _mm_nmacc_pd(a, b, c); +#else + return c - a * b; +#endif +} + + +// Multiply and subtract with extra precision on the intermediate calculations, +// even if FMA instructions not supported, using Veltkamp-Dekker split. +// This is used in mathematical functions. 
Do not use it in general code +// because it is inaccurate in certain cases +static inline Vec2d mul_sub_x(Vec2d const a, Vec2d const b, Vec2d const c) { +#ifdef __FMA__ + return _mm_fmsub_pd(a, b, c); +#elif defined (__FMA4__) + return _mm_msub_pd(a, b, c); +#else + // calculate a * b - c with extra precision + Vec2q upper_mask = -(1LL << 27); // mask to remove lower 27 bits + Vec2d a_high = a & Vec2d(_mm_castsi128_pd(upper_mask));// split into high and low parts + Vec2d b_high = b & Vec2d(_mm_castsi128_pd(upper_mask)); + Vec2d a_low = a - a_high; + Vec2d b_low = b - b_high; + Vec2d r1 = a_high * b_high; // this product is exact + Vec2d r2 = r1 - c; // subtract c from high product + Vec2d r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product + return r3; // + ((r2 - r1) + c); +#endif +} + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 +static inline Vec2q exponent(Vec2d const a) { + Vec2uq t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer + Vec2uq t2 = t1 << 1; // shift out sign bit + Vec2uq t3 = t2 >> 53; // shift down logical to position 0 + Vec2q t4 = Vec2q(t3) - 0x3FF; // subtract bias from exponent + return t4; +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0) = 1.0, fraction(5.0) = 1.25 +// NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h ! +static inline Vec2d fraction(Vec2d const a) { +#if INSTRSET >= 10 + return _mm_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); +#else + Vec2uq t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer + Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FF0000000000000ll); // set exponent to 0 + bias + return _mm_castsi128_pd(t2); +#endif +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0 +// n >= 1024 gives +INF +// n <= -1023 gives 0.0 +// This function will never produce denormals, and never raise exceptions +static inline Vec2d exp2(Vec2q const n) { + Vec2q t1 = max(n, -0x3FF); // limit to allowed range + Vec2q t2 = min(t1, 0x400); + Vec2q t3 = t2 + 0x3FF; // add bias + Vec2q t4 = t3 << 52; // put exponent into position 52 + return _mm_castsi128_pd(t4); // reinterpret as double +} +//static Vec2d exp2(Vec2d const x); // defined in vectormath_exp.h + + +/***************************************************************************** +* +* Functions for reinterpretation between vector types +* +*****************************************************************************/ + +static inline __m128i reinterpret_i(__m128i const x) { + return x; +} + +static inline __m128i reinterpret_i(__m128 const x) { + return _mm_castps_si128(x); +} + +static inline __m128i reinterpret_i(__m128d const x) { + return _mm_castpd_si128(x); +} + +static inline __m128 reinterpret_f(__m128i const x) { + return _mm_castsi128_ps(x); +} + +static inline __m128 reinterpret_f(__m128 const x) { + return x; +} + +static inline __m128 reinterpret_f(__m128d const x) { + return _mm_castpd_ps(x); +} + +static inline __m128d reinterpret_d(__m128i const x) { + return _mm_castsi128_pd(x); +} + +static inline __m128d reinterpret_d(__m128 const x) { + return _mm_castps_pd(x); +} + +static inline __m128d reinterpret_d(__m128d const x) { + return x; +} + +// Function infinite2d: returns a vector where all elements are +INF +static inline 
Vec2d infinite2d() { + return reinterpret_d(Vec2q(0x7FF0000000000000)); +} + +// Function nan2d: returns a vector where all elements are +NAN (quiet) +static inline Vec2d nan2d(int n = 0x10) { + return nan_vec(n); +} + + +/***************************************************************************** +* +* Vector permute and blend functions +* +****************************************************************************** +* +* The permute function can reorder the elements of a vector and optionally +* set some elements to zero. +* +* See vectori128.h for details +* +*****************************************************************************/ + +// permute vector Vec2d +template +static inline Vec2d permute2(Vec2d const a) { + int constexpr indexs[2] = { i0, i1 }; // indexes as array + __m128d y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + if constexpr ((flags & perm_allzero) != 0) return _mm_setzero_pd(); // just return zero + + constexpr bool fit_shleft = (flags & perm_shleft) != 0; + constexpr bool fit_shright = (flags & perm_shright) != 0; + constexpr bool fit_punpckh = (flags & perm_punpckh) != 0; + constexpr bool fit_punpckl = (flags & perm_punpckl) != 0; + constexpr bool fit_zeroing = (flags & perm_zeroing) != 0; + if constexpr ((flags & perm_perm) != 0) { // permutation needed + // try to fit various instructions + if constexpr (fit_shleft && fit_zeroing) { + // pslldq does both permutation and zeroing. if zeroing not needed use punpckl instead + return _mm_castsi128_pd(_mm_bslli_si128(_mm_castpd_si128(a), 8)); + } + if constexpr (fit_shright && fit_zeroing) { + // psrldq does both permutation and zeroing. 
if zeroing not needed use punpckh instead + return _mm_castsi128_pd(_mm_bsrli_si128(_mm_castpd_si128(a), 8)); + } + if constexpr (fit_punpckh) { // fits punpckhi + y = _mm_unpackhi_pd(a, a); + } + else if constexpr (fit_punpckl) { // fits punpcklo + y = _mm_unpacklo_pd(a, a); + } + else { // needs general permute + y = _mm_shuffle_pd(a, a, (i0 & 1) | (i1 & 1) * 2); + } + } + if constexpr (fit_zeroing) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_pd(zero_mask<2>(indexs), y); +#else // use unpack to avoid using data cache + if constexpr (i0 == -1) { + y = _mm_unpackhi_pd(_mm_setzero_pd(), y); + } + else if constexpr (i1 == -1) { + y = _mm_unpacklo_pd(y, _mm_setzero_pd()); + } +#endif + } + return y; +} + + +// permute vector Vec4f +template +static inline Vec4f permute4(Vec4f const a) { + constexpr int indexs[4] = {i0, i1, i2, i3}; // indexes as array + __m128 y = a; // result + + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm_setzero_ps(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { + // use larger permutation + constexpr EList L = largeblock_perm<4>(indexs); // permutation pattern + y = reinterpret_f(permute2 (Vec2d(reinterpret_d(a)))); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } +#if INSTRSET >= 4 && INSTRSET < 10 // SSSE3, but no compact mask + else if constexpr ((flags & perm_zeroing) != 0) { + // Do both permutation and zeroing with PSHUFB instruction + const EList bm = pshufb_mask(indexs); + return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), Vec4i().load(bm.a))); + } +#endif + else if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm_unpackhi_ps(a, a); + } + else if constexpr ((flags & perm_punpckl) != 0) { // fits punpcklo + y = _mm_unpacklo_ps(a, a); + } + else if constexpr ((flags & perm_shleft) != 0) { // fits pslldq + y = _mm_castsi128_ps(_mm_bslli_si128(_mm_castps_si128(a), (16-(flags >> perm_rot_count)) & 0xF)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_shright) != 0) { // fits psrldq + y = _mm_castsi128_ps(_mm_bsrli_si128(_mm_castps_si128(a), (flags >> perm_rot_count) & 0xF)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } +#if INSTRSET >= 3 // SSE3 + else if constexpr (i0 == 0 && i1 == 0 && i2 == 2 && i3 == 2) { + return _mm_moveldup_ps(a); + } + else if constexpr (i0 == 1 && i1 == 1 && i2 == 3 && i3 == 3) { + return _mm_movehdup_ps(a); + } +#endif + else { // needs general permute + y = _mm_shuffle_ps(a, a, (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6); + } + } + if constexpr ((flags & perm_zeroing) != 0) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + // The mask-zero operation can be merged into the preceding instruction, whatever that is. + // A good optimizing compiler will do this automatically. 
+ // I don't want to clutter all the branches above with this + y = _mm_maskz_mov_ps (zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_ps(_mm_castsi128_ps(Vec4i().load(bm.a)), y); +#endif + } + return y; +} + + +/***************************************************************************** +* +* Vector blend functions +* +*****************************************************************************/ +// permute and blend Vec2d +template +static inline Vec2d blend2(Vec2d const a, Vec2d const b) { + int constexpr indexs[2] = { i0, i1 }; // indexes as array + __m128d y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm_setzero_pd (); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute2 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + return permute2 (b); + } + + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending +#if INSTRSET >= 10 // AVX512VL + y = _mm_mask_mov_pd (a, (uint8_t)make_bit_mask<2, 0x301>(indexs), b); +#elif INSTRSET >= 5 // SSE4.1 + y = _mm_blend_pd (a, b, ((i0 & 2) ? 0x01 : 0) | ((i1 & 2) ? 0x02 : 0)); +#else // SSE2 + const EList bm = make_broad_mask(make_bit_mask<2, 0x301>(indexs)); + y = selectd(_mm_castsi128_pd(Vec2q().load(bm.a)), b, a); +#endif + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm_unpacklo_pd (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm_unpacklo_pd (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm_unpackhi_pd (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm_unpackhi_pd (b, a); + } + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm_shuffle_pd(a, b, (flags >> blend_shufpattern) & 3); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm_shuffle_pd(b, a, (flags >> blend_shufpattern) & 3); + } + else { // No special cases. permute a and b separately, then blend. + // This will not occur if ALLOW_FP_PERMUTE is true +#if INSTRSET >= 5 // SSE4.1 + constexpr bool dozero = false; +#else // SSE2 + constexpr bool dozero = true; +#endif + constexpr EList L = blend_perm_indexes<2, (int)dozero>(indexs); // get permutation indexes + __m128d ya = permute2(a); + __m128d yb = permute2(b); +#if INSTRSET >= 10 // AVX512VL + y = _mm_mask_mov_pd (ya, (uint8_t)make_bit_mask<2, 0x301>(indexs), yb); +#elif INSTRSET >= 5 // SSE4.1 + y = _mm_blend_pd (ya, yb, ((i0 & 2) ? 0x01 : 0) | ((i1 & 2) ? 
0x02 : 0)); +#else // SSE2 + return _mm_or_pd(ya, yb); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_pd(zero_mask<2>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_pd(_mm_castsi128_pd(Vec2q().load(bm.a)), y); +#endif + } + return y; +} + + +// permute and blend Vec4f +template +static inline Vec4f blend4(Vec4f const a, Vec4f const b) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + __m128 y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + constexpr bool blendonly = (flags & (blend_perma | blend_permb)) == 0; // no permutation, only blending + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm_setzero_ps(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute4 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + return permute4 < i0<0?i0:i0&3, i1<0?i1:i1&3, i2<0?i2:i2&3, i3<0?i3:i3&3> (b); + } + if constexpr ((flags & blend_largeblock) != 0) { // fits blending with larger block size + constexpr EList L = largeblock_indexes<4>(indexs); + y = _mm_castpd_ps(blend2 (Vec2d(_mm_castps_pd(a)), Vec2d(_mm_castps_pd(b)))); + if constexpr ((flags & blend_addz) == 0) { + return y; // any zeroing has been done by larger blend + } + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm_unpacklo_ps (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm_unpacklo_ps (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm_unpackhi_ps (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm_unpackhi_ps (b, a); + } + else if constexpr ((flags & blend_shufab) != 0 && !blendonly) { // use floating point instruction shufps + y = _mm_shuffle_ps(a, b, uint8_t(flags >> blend_shufpattern)); + } + else if constexpr ((flags & blend_shufba) != 0 && !blendonly) { // use floating point instruction shufps + y = _mm_shuffle_ps(b, a, uint8_t(flags >> blend_shufpattern)); + } +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(a), _mm_castps_si128(b), flags >> blend_rotpattern)); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(b), _mm_castps_si128(a), flags >> blend_rotpattern)); + } +#endif + else { // No special cases. permute a and b separately, then blend. +#if INSTRSET >= 5 // SSE4.1 + constexpr bool dozero = false; +#else // SSE2 + constexpr bool dozero = true; +#endif + Vec4f ya = a, yb = b; // a and b permuted + constexpr EList L = blend_perm_indexes<4, (int)dozero>(indexs); // get permutation indexes + if constexpr ((flags & blend_perma) != 0 || dozero) { + ya = permute4 (a); + } + if constexpr ((flags & blend_permb) != 0 || dozero) { + yb = permute4 (b); + } +#if INSTRSET >= 10 // AVX512VL + y = _mm_mask_mov_ps (ya, (uint8_t)make_bit_mask<4, 0x302>(indexs), yb); +#elif INSTRSET >= 5 // SSE4.1 + constexpr uint8_t mm = ((i0 & 4) ? 0x01 : 0) | ((i1 & 4) ? 0x02 : 0) | ((i2 & 4) ? 0x04 : 0) | ((i3 & 4) ? 
0x08 : 0); + if constexpr (mm == 0x01) y = _mm_move_ss(ya, yb); + else if constexpr (mm == 0x0E) y = _mm_move_ss(yb, ya); + else { + y = _mm_blend_ps (ya, yb, mm); + } +#else // SSE2. dozero = true + return _mm_or_ps(ya, yb); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_ps(zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_ps(_mm_castsi128_ps(Vec4i().load(bm.a)), y); +#endif + } + return y; +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. +* The table is given as one or more vectors or as an array. +* +*****************************************************************************/ + +static inline Vec4f lookup4(Vec4i const index, Vec4f const table) { +#if INSTRSET >= 7 // AVX + return _mm_permutevar_ps(table, index); +#else + int32_t ii[4]; + float tt[6]; + table.store(tt); (index & 3).store(ii); + __m128 r01 = _mm_loadh_pi(_mm_load_ss(&tt[ii[0]]), (const __m64 *) & tt[ii[1]]); + __m128 r23 = _mm_loadh_pi(_mm_load_ss(&tt[ii[2]]), (const __m64 *) & tt[ii[3]]); + return _mm_shuffle_ps(r01, r23, 0x88); +#endif +} + +static inline Vec4f lookup8(Vec4i const index, Vec4f const table0, Vec4f const table1) { +#if INSTRSET >= 8 // AVX2 + __m256 tt = _mm256_insertf128_ps(_mm256_castps128_ps256(table0), table1, 1); // combine tables + __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index))); + return r; + +#elif INSTRSET >= 7 // AVX + __m128 r0 = _mm_permutevar_ps(table0, index); + __m128 r1 = _mm_permutevar_ps(table1, index); + __m128i i4 = _mm_slli_epi32(index, 29); + return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4)); + +#elif INSTRSET >= 5 // SSE4.1 + Vec4f r0 = lookup4(index, table0); + Vec4f r1 = lookup4(index, table1); + __m128i i4 = _mm_slli_epi32(index, 29); + return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4)); + +#else // SSE2 + Vec4f r0 = lookup4(index, table0); + Vec4f r1 = lookup4(index, table1); + __m128i i4 = _mm_srai_epi32(_mm_slli_epi32(index, 29), 31); + return selectf(_mm_castsi128_ps(i4), r1, r0); +#endif +} + +template +static inline Vec4f lookup(Vec4i const index, float const * table) { + if constexpr (n <= 0) return 0.0f; + if constexpr (n <= 4) return lookup4(index, Vec4f().load(table)); + if constexpr (n <= 8) { +#if INSTRSET >= 8 // AVX2 + __m256 tt = _mm256_loadu_ps(table); + __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index))); + return r; +#else // not AVX2 + return lookup8(index, Vec4f().load(table), Vec4f().load(table + 4)); +#endif + } + // n > 8. 
Limit index + Vec4ui index1; + if constexpr ((n & (n - 1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec4ui(index) & (n - 1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec4ui(index), n - 1); + } +#if INSTRSET >= 8 // AVX2 + return _mm_i32gather_ps(table, index1, 4); +#else + uint32_t ii[4]; index1.store(ii); + return Vec4f(table[ii[0]], table[ii[1]], table[ii[2]], table[ii[3]]); +#endif +} + +static inline Vec2d lookup2(Vec2q const index, Vec2d const table) { +#if INSTRSET >= 7 // AVX + return _mm_permutevar_pd(table, index + index); +#else + int32_t ii[4]; + double tt[2]; + table.store(tt); (index & 1).store(ii); + return Vec2d(tt[ii[0]], tt[ii[2]]); +#endif +} + +static inline Vec2d lookup4(Vec2q const index, Vec2d const table0, Vec2d const table1) { +#if INSTRSET >= 7 // AVX + Vec2q index2 = index + index; // index << 1 + __m128d r0 = _mm_permutevar_pd(table0, index2); + __m128d r1 = _mm_permutevar_pd(table1, index2); + __m128i i4 = _mm_slli_epi64(index, 62); + return _mm_blendv_pd(r0, r1, _mm_castsi128_pd(i4)); +#else + int32_t ii[4]; + double tt[4]; + table0.store(tt); table1.store(tt + 2); + (index & 3).store(ii); + return Vec2d(tt[ii[0]], tt[ii[2]]); +#endif +} + +template +static inline Vec2d lookup(Vec2q const index, double const * table) { + if constexpr (n <= 0) return 0.0; + if constexpr (n <= 2) return lookup2(index, Vec2d().load(table)); +#if INSTRSET < 8 // not AVX2 + if constexpr (n <= 4) return lookup4(index, Vec2d().load(table), Vec2d().load(table + 2)); +#endif + // Limit index + Vec2uq index1; + if constexpr ((n & (n - 1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec2uq(index) & (n - 1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec2uq(index), n - 1); + } +#if INSTRSET >= 8 // AVX2 + return _mm_i64gather_pd(table, index1, 8); +#else + uint32_t ii[4]; index1.store(ii); + return Vec2d(table[ii[0]], table[ii[2]]); +#endif +} + + +/***************************************************************************** +* +* Gather functions with fixed indexes +* +*****************************************************************************/ +// Load elements from array a with indices i0, i1, i2, i3 +template +static inline Vec4f gather4f(void const * a) { + return reinterpret_f(gather4i(a)); +} + +// Load elements from array a with indices i0, i1 +template +static inline Vec2d gather2d(void const * a) { + return reinterpret_d(gather2q(a)); +} + +/***************************************************************************** +* +* Vector scatter functions +* +****************************************************************************** +* +* These functions write the elements of a vector to arbitrary positions in an +* array in memory. Each vector element is written to an array position +* determined by an index. An element is not written if the corresponding +* index is out of range. +* The indexes can be specified as constant template parameters or as an +* integer vector. 
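+*
+* Illustrative sketch (not part of the VCL source; the array names are hypothetical):
+* writing the four elements of a Vec4f to positions 7, 3, 0 and 5 of an array,
+* either with compile-time indexes or with an index vector plus a bounds limit:
+*
+*     float dst[8] = {0};
+*     Vec4f data(1.0f, 2.0f, 3.0f, 4.0f);
+*     scatter<7, 3, 0, 5>(data, dst);            // compile-time indexes
+*     scatter(Vec4i(7, 3, 0, 5), 8, data, dst);  // index vector, limit = 8 (out-of-range lanes are skipped)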
+* +*****************************************************************************/ + +template +static inline void scatter(Vec4f const data, float * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __m128i indx = constant4ui(); + __mmask8 mask = uint8_t((i0 >= 0) | ((i1 >= 0) << 1) | ((i2 >= 0) << 2) | ((i3 >= 0) << 3)); + _mm_mask_i32scatter_ps(destination, mask, indx, data, 4); + +#elif INSTRSET >= 9 // __AVX512F__ + __m512i indx = _mm512_castsi128_si512(constant4ui()); + __mmask16 mask = uint16_t((i0 >= 0) | ((i1 >= 0) << 1) | ((i2 >= 0) << 2) | ((i3 >= 0) << 3)); + _mm512_mask_i32scatter_ps(destination, mask, indx, _mm512_castps128_ps512(data), 4); + +#else + const int index[4] = { i0,i1,i2,i3 }; + for (int i = 0; i < 4; i++) { + if (index[i] >= 0) destination[index[i]] = data[i]; + } +#endif +} + +template +static inline void scatter(Vec2d const data, double * destination) { + if (i0 >= 0) destination[i0] = data[0]; + if (i1 >= 0) destination[i1] = data[1]; +} + + +/***************************************************************************** +* +* Scatter functions with variable indexes +* +*****************************************************************************/ + +static inline void scatter(Vec4i const index, uint32_t limit, Vec4f const data, float * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask8 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit)); + _mm_mask_i32scatter_ps(destination, mask, index, data, 4); +#else + for (int i = 0; i < 4; i++) { + if (uint32_t(index[i]) < limit) destination[index[i]] = data[i]; + } +#endif +} + +static inline void scatter(Vec2q const index, uint32_t limit, Vec2d const data, double * destination) { + if (uint64_t(index[0]) < uint64_t(limit)) destination[index[0]] = data[0]; + if (uint64_t(index[1]) < uint64_t(limit)) destination[index[1]] = data[1]; +} + + +#if INSTRSET < 10 // these are defined in vectori128.h for compact boolean vectors + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec4fb const x) { + return to_bits(Vec4ib(x)); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec2db const x) { + return to_bits(Vec2qb(x)); +} + +#endif // INSTRSET < 10 + + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORF128_H diff --git a/DFTTest/VCL2/vectorf256.h b/DFTTest/VCL2/vectorf256.h new file mode 100644 index 0000000..8db54df --- /dev/null +++ b/DFTTest/VCL2/vectorf256.h @@ -0,0 +1,3023 @@ +/**************************** vectorf256.h ******************************* +* Author: Agner Fog +* Date created: 2012-05-30 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library +* Description: +* Header file defining 256-bit floating point vector classes +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec8f Vector of 8 single precision floating point numbers +* Vec8fb Vector of 8 Booleans for use with Vec8f +* Vec4d Vector of 4 double precision floating point numbers +* Vec4db Vector of 4 Booleans for use with Vec4d +* +* Each vector object is represented internally in the CPU as a 256-bit register. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. 
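+*
+* A minimal usage sketch (illustrative only, not taken from the manual):
+*
+*     Vec8f a(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f);
+*     Vec8f b = 2.0f * a + 1.0f;   // element-wise arithmetic
+*     Vec8fb m = b > 10.0f;        // per-element compare, result is a boolean vector
+*     Vec8f c = select(m, b, a);   // pick b where m is true, otherwise a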
+*****************************************************************************/ + +#ifndef VECTORF256_H +#define VECTORF256_H 1 + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +#ifdef VECTORF256E_H +#error Two different versions of vectorf256.h included +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + + +/***************************************************************************** +* +* Generate compile-time constant vector +* +*****************************************************************************/ + +// Generate a constant vector of 8 integers stored in memory +template +inline __m256 constant8f() { + /* + const union { + uint32_t i[8]; + __m256 ymm; + } u = {{i0,i1,i2,i3,i4,i5,i6,i7}}; + return u.ymm; + */ + return _mm256_castsi256_ps(_mm256_setr_epi32(i0,i1,i2,i3,i4,i5,i6,i7)); +} + + +// Join two 128-bit vectors. Used below +#define set_m128r(lo,hi) _mm256_insertf128_ps(_mm256_castps128_ps256(lo),(hi),1) +// _mm256_set_m128(hi,lo); // not defined in all versions of immintrin.h + + +/***************************************************************************** +* +* Vec8fb: Vector of 8 Booleans for use with Vec8f +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +class Vec8fb { +protected: + __m256 ymm; // Float vector +public: + // Default constructor: + Vec8fb() { + } + // Constructor to build from all elements: + Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) { +#if INSTRSET >= 8 // AVX2 + ymm = _mm256_castsi256_ps(_mm256_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7)); +#else + __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3)); + __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b4, -(int)b5, -(int)b6, -(int)b7)); + ymm = set_m128r(blo,bhi); +#endif + } + // Constructor to build from two Vec4fb: + Vec8fb(Vec4fb const a0, Vec4fb const a1) { + ymm = set_m128r(a0, a1); + } + // Constructor to convert from type __m256 used in intrinsics: + Vec8fb(__m256 const x) { + ymm = x; + } + // Assignment operator to convert from type __m256 used in intrinsics: + Vec8fb & operator = (__m256 const x) { + ymm = x; + return *this; + } + // Constructor to broadcast the same value into all elements: + Vec8fb(bool b) { +#if INSTRSET >= 8 // AVX2 + ymm = _mm256_castsi256_ps(_mm256_set1_epi32(-(int)b)); +#else + __m128 b1 = _mm_castsi128_ps(_mm_set1_epi32(-(int)b)); + //ymm = _mm256_set_m128(b1,b1); + ymm = set_m128r(b1,b1); +#endif + } + // Assignment operator to broadcast scalar value: + Vec8fb & operator = (bool b) { + *this = Vec8fb(b); + return *this; + } + // Type cast operator to convert to __m256 used in intrinsics + operator __m256() const { + return ymm; + } +#if INSTRSET >= 8 // AVX2 + // Constructor to convert from type Vec8ib used as Boolean for integer vectors + Vec8fb(Vec8ib const x) { + ymm = _mm256_castsi256_ps(x); + } + // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors + Vec8fb & operator = (Vec8ib const x) { + ymm = _mm256_castsi256_ps(x); + return *this; + } + // Member function to change a bitfield to a boolean vector + Vec8fb & load_bits(uint8_t a) { + Vec8ib b; b.load_bits(a); + ymm = _mm256_castsi256_ps(b); + return *this; + } +#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY + // Type cast operator to convert to type Vec8ib 
used as Boolean for integer vectors + operator Vec8ib() const { + return _mm256_castps_si256(ymm); + } +#endif +#else // AVX version + // Constructor to convert from type Vec8ib used as Boolean for integer vectors + Vec8fb(Vec8ib const x) { + ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high())); + } + // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors + Vec8fb & operator = (Vec8ib const x) { + ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high())); + return *this; + } + // Member function to change a bitfield to a boolean vector + // AVX version. Use float instructions, treating integers as subnormal values + Vec8fb & load_bits(uint8_t a) { + __m256 b1 = _mm256_castsi256_ps(_mm256_set1_epi32((int32_t)a)); // broadcast a + __m256 m2 = constant8f<1,2,4,8,0x10,0x20,0x40,0x80>(); + __m256 d1 = _mm256_and_ps(b1, m2); // isolate one bit in each dword + ymm = _mm256_cmp_ps(d1, _mm256_setzero_ps(), 4); // compare subnormal values with 0 + return *this; + } + // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors + operator Vec8ib() const { + return Vec8i(_mm_castps_si128(get_low()), _mm_castps_si128(get_high())); + } +#endif // AVX2 + // Member function to change a single element in vector + Vec8fb const insert(int index, bool value) { + const int32_t maskl[16] = {0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0}; + __m256 mask = _mm256_loadu_ps((float const*)(maskl+8-(index & 7))); // mask with FFFFFFFF at index position + if (value) { + ymm = _mm256_or_ps(ymm,mask); + } + else { + ymm = _mm256_andnot_ps(mask,ymm); + } + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + union { + float f[8]; + int32_t i[8]; + } u; + _mm256_storeu_ps(u.f, ymm); + return u.i[index & 7] != 0; + } + // Extract a single element. Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4fb: + Vec4fb get_low() const { + return _mm256_castps256_ps128(ymm); + } + Vec4fb get_high() const { + return _mm256_extractf128_ps(ymm,1); + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
+ Vec8fb(int b) = delete; + Vec8fb & operator = (int x) = delete; + }; + +#else + +typedef Vec8b Vec8fb; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Operators and functions for Vec8fb +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec8fb operator & (Vec8fb const a, Vec8fb const b) { + return _mm256_and_ps(a, b); +} +static inline Vec8fb operator && (Vec8fb const a, Vec8fb const b) { + return a & b; +} + +// vector operator &= : bitwise and +static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec8fb operator | (Vec8fb const a, Vec8fb const b) { + return _mm256_or_ps(a, b); +} +static inline Vec8fb operator || (Vec8fb const a, Vec8fb const b) { + return a | b; +} + +// vector operator |= : bitwise or +static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const b) { + a = a | b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec8fb operator ~ (Vec8fb const a) { + return _mm256_xor_ps(a, constant8f<0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu>()); +} + +// vector operator ^ : bitwise xor +static inline Vec8fb operator ^ (Vec8fb const a, Vec8fb const b) { + return _mm256_xor_ps(a, b); +} + +// vector operator == : xnor +static inline Vec8fb operator == (Vec8fb const a, Vec8fb const b) { + return Vec8fb(a ^ Vec8fb(~b)); +} + +// vector operator != : xor +static inline Vec8fb operator != (Vec8fb const a, Vec8fb const b) { + return _mm256_xor_ps(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not +// (operator ! is less efficient than operator ~. Use only where not all bits in an element are the same) +static inline Vec8fb operator ! (Vec8fb const a) { +return Vec8fb( !Vec8ib(a)); +} + +// Functions for Vec8fb + +// andnot: a & ~ b +static inline Vec8fb andnot(Vec8fb const a, Vec8fb const b) { + return _mm256_andnot_ps(b, a); +} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and (Vec8fb const a) { + return _mm256_testc_ps(a,constant8f<0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu>()) != 0; +} + +// horizontal_or. Returns true if at least one bit is 1 +static inline bool horizontal_or (Vec8fb const a) { + return ! 
_mm256_testz_ps(a,a); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec8fb const x) { + return to_bits(Vec8ib(x)); +} + +#endif + + +/***************************************************************************** +* +* Vec4db: Vector of 4 Booleans for use with Vec4d +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +class Vec4db { +protected: + __m256d ymm; // double vector +public: + // Default constructor: + Vec4db() { + } + // Constructor to build from all elements: + Vec4db(bool b0, bool b1, bool b2, bool b3) { +#if INSTRSET >= 8 // AVX2 + ymm = _mm256_castsi256_pd(_mm256_setr_epi64x(-(int64_t)b0, -(int64_t)b1, -(int64_t)b2, -(int64_t)b3)); +#else + __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); + __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b2, -(int)b2, -(int)b3, -(int)b3)); + ymm = _mm256_castps_pd(set_m128r(blo, bhi)); +#endif + } + // Constructor to build from two Vec2db: + Vec4db(Vec2db const a0, Vec2db const a1) { + ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0),_mm_castpd_ps(a1))); + //ymm = _mm256_set_m128d(a1, a0); + } + // Constructor to convert from type __m256d used in intrinsics: + Vec4db(__m256d const x) { + ymm = x; + } + // Assignment operator to convert from type __m256d used in intrinsics: + Vec4db & operator = (__m256d const x) { + ymm = x; + return *this; + } + // Constructor to broadcast the same value into all elements: + Vec4db(bool b) { +#if INSTRSET >= 8 // AVX2 + ymm = _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64_t)b)); +#else + __m128 b1 = _mm_castsi128_ps(_mm_set1_epi32(-(int)b)); + ymm = _mm256_castps_pd(set_m128r(b1,b1)); +#endif + } + // Assignment operator to broadcast scalar value: + Vec4db & operator = (bool b) { + ymm = _mm256_castsi256_pd(_mm256_set1_epi32(-int32_t(b))); + return *this; + } + // Type cast operator to convert to __m256d used in intrinsics + operator __m256d() const { + return ymm; + } +#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + // Constructor to convert from type Vec4qb used as Boolean for integer vectors + Vec4db(Vec4qb const x) { + ymm = _mm256_castsi256_pd(x); + } + // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors + Vec4db & operator = (Vec4qb const x) { + ymm = _mm256_castsi256_pd(x); + return *this; + } + // Member function to change a bitfield to a boolean vector + Vec4db & load_bits(uint8_t a) { + Vec4qb b; b.load_bits(a); + ymm = _mm256_castsi256_pd(b); + return *this; + } +#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY + // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors + operator Vec4qb() const { + return _mm256_castpd_si256(ymm); + } +#endif +#else // 256 bit integer vectors emulated without AVX2 + // Constructor to convert from type Vec4qb used as Boolean for integer vectors + Vec4db(Vec4qb const x) { + *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high())); + } + // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors + Vec4db & operator = (Vec4qb const x) { + *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high())); + return *this; + } + // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors + operator Vec4qb() const { + return Vec4q(_mm_castpd_si128(get_low()), _mm_castpd_si128(get_high())); + } + // Member function to change a bitfield to a boolean 
vector + // AVX version. Use float instructions, treating integers as subnormal values + Vec4db & load_bits(uint8_t a) { + __m256d b1 = _mm256_castsi256_pd(_mm256_set1_epi32((int32_t)a)); // broadcast a + __m256d m2 = _mm256_castps_pd(constant8f<1,0,2,0,4,0,8,0>()); + __m256d d1 = _mm256_and_pd(b1, m2); // isolate one bit in each dword + ymm = _mm256_cmp_pd(d1, _mm256_setzero_pd(), 4); // compare subnormal values with 0 + return *this; + } +#endif // AVX2 + // Member function to change a single element in vector + Vec4db const insert(int index, bool value) { + const int32_t maskl[16] = {0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0}; + __m256d mask = _mm256_loadu_pd((double const*)(maskl+8-(index&3)*2)); // mask with FFFFFFFFFFFFFFFF at index position + if (value) { + ymm = _mm256_or_pd(ymm,mask); + } + else { + ymm = _mm256_andnot_pd(mask,ymm); + } + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + union { + double f[8]; + int32_t i[16]; + } u; + _mm256_storeu_pd(u.f, ymm); + return u.i[(index & 3) * 2 + 1] != 0; + } + // Extract a single element. Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4fb: + Vec2db get_low() const { + return _mm256_castpd256_pd128(ymm); + } + Vec2db get_high() const { + return _mm256_extractf128_pd(ymm,1); + } + static constexpr int size() { + return 4; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec4db(int b) = delete; + Vec4db & operator = (int x) = delete; +}; + +#else + +typedef Vec4b Vec4db; // compact boolean vector + +#endif + +/***************************************************************************** +* +* Operators and functions for Vec4db +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec4db operator & (Vec4db const a, Vec4db const b) { + return _mm256_and_pd(a, b); +} +static inline Vec4db operator && (Vec4db const a, Vec4db const b) { + return a & b; +} + +// vector operator &= : bitwise and +static inline Vec4db & operator &= (Vec4db & a, Vec4db const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec4db operator | (Vec4db const a, Vec4db const b) { + return _mm256_or_pd(a, b); +} +static inline Vec4db operator || (Vec4db const a, Vec4db const b) { + return a | b; +} + +// vector operator |= : bitwise or +static inline Vec4db & operator |= (Vec4db & a, Vec4db const b) { + a = a | b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec4db operator ~ (Vec4db const a) { + return _mm256_xor_pd(a, _mm256_castps_pd (constant8f<0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu,0xFFFFFFFFu>())); +} + +// vector operator ^ : bitwise xor +static inline Vec4db operator ^ (Vec4db const a, Vec4db const b) { + return _mm256_xor_pd(a, b); +} + +// vector operator == : xnor +static inline Vec4db operator == (Vec4db const a, Vec4db const b) { + return Vec4db(a ^ Vec4db(~b)); +} + +// vector operator != : xor +static inline Vec4db operator != (Vec4db const a, Vec4db const b) { + return _mm256_xor_pd(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec4db & operator ^= (Vec4db & a, Vec4db const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not +// (operator ! 
is less efficient than operator ~. Use only where not all bits in an element are the same) +static inline Vec4db operator ! (Vec4db const a) { +return Vec4db( ! Vec4qb(a)); +} + +// Functions for Vec8fb + +// andnot: a & ~ b +static inline Vec4db andnot(Vec4db const a, Vec4db const b) { + return _mm256_andnot_pd(b, a); +} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and (Vec4db const a) { +#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + return horizontal_and(Vec256b(_mm256_castpd_si256(a))); +#else // split into 128 bit vectors + return horizontal_and(a.get_low() & a.get_high()); +#endif +} + +// horizontal_or. Returns true if at least one bit is 1 +static inline bool horizontal_or (Vec4db const a) { +#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + return horizontal_or(Vec256b(_mm256_castpd_si256(a))); +#else // split into 128 bit vectors + return horizontal_or(a.get_low() | a.get_high()); +#endif +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec4db const x) { + return to_bits(Vec4qb(x)); +} + +#endif + + + /***************************************************************************** +* +* Vec8f: Vector of 8 single precision floating point values +* +*****************************************************************************/ + +class Vec8f { +protected: + __m256 ymm; // Float vector +public: + // Default constructor: + Vec8f() { + } + // Constructor to broadcast the same value into all elements: + Vec8f(float f) { + ymm = _mm256_set1_ps(f); + } + // Constructor to build from all elements: + Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) { + ymm = _mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7); + } + // Constructor to build from two Vec4f: + Vec8f(Vec4f const a0, Vec4f const a1) { + ymm = set_m128r(a0, a1); + //ymm = _mm256_set_m128(a1, a0); + } + // Constructor to convert from type __m256 used in intrinsics: + Vec8f(__m256 const x) { + ymm = x; + } + // Assignment operator to convert from type __m256 used in intrinsics: + Vec8f & operator = (__m256 const x) { + ymm = x; + return *this; + } + // Type cast operator to convert to __m256 used in intrinsics + operator __m256() const { + return ymm; + } + // Member function to load from array (unaligned) + Vec8f & load(float const * p) { + ymm = _mm256_loadu_ps(p); + return *this; + } + // Member function to load from array, aligned by 32 + // You may use load_a instead of load if you are certain that p points to an address divisible by 32 + Vec8f & load_a(float const * p) { + ymm = _mm256_load_ps(p); + return *this; + } + // Member function to store into array (unaligned) + void store(float * p) const { + _mm256_storeu_ps(p, ymm); + } + // Member function storing into array, aligned by 32 + // You may use store_a instead of store if you are certain that p points to an address divisible by 32 + void store_a(float * p) const { + _mm256_store_ps(p, ymm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 32 + void store_nt(float * p) const { + _mm256_stream_ps(p, ymm); + } + // Partial load. 
Load n elements and set the rest to 0 + Vec8f & load_partial(int n, float const * p) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_maskz_loadu_ps(__mmask8((1u << n) - 1), p); +#else + if (n > 0 && n <= 4) { + *this = Vec8f(Vec4f().load_partial(n, p), _mm_setzero_ps()); + } + else if (n > 4 && n <= 8) { + *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4)); + } + else { + ymm = _mm256_setzero_ps(); + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, float * p) const { +#if INSTRSET >= 10 // AVX512VL + _mm256_mask_storeu_ps(p, __mmask8((1u << n) - 1), ymm); +#else + if (n <= 4) { + get_low().store_partial(n, p); + } + else if (n <= 8) { + get_low().store(p); + get_high().store_partial(n - 4, p + 4); + } +#endif + } + // cut off vector to n elements. The last 8-n elements are set to zero + Vec8f & cutoff(int n) { +#if INSTRSET >= 10 + ymm = _mm256_maskz_mov_ps(__mmask8((1u << n) - 1), ymm); +#else + if (uint32_t(n) >= 8) return *this; + const union { + int32_t i[16]; + float f[16]; + } mask = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}}; + *this = Vec8fb(*this) & Vec8fb(Vec8f().load(mask.f + 8 - n)); +#endif + return *this; + } + // Member function to change a single element in vector + Vec8f const insert(int index, float value) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_mask_broadcastss_ps (ymm, __mmask8(1u << index), _mm_set_ss(value)); +#else + __m256 v0 = _mm256_broadcast_ss(&value); + switch (index) { + case 0: + ymm = _mm256_blend_ps (ymm, v0, 1); break; + case 1: + ymm = _mm256_blend_ps (ymm, v0, 2); break; + case 2: + ymm = _mm256_blend_ps (ymm, v0, 4); break; + case 3: + ymm = _mm256_blend_ps (ymm, v0, 8); break; + case 4: + ymm = _mm256_blend_ps (ymm, v0, 0x10); break; + case 5: + ymm = _mm256_blend_ps (ymm, v0, 0x20); break; + case 6: + ymm = _mm256_blend_ps (ymm, v0, 0x40); break; + default: + ymm = _mm256_blend_ps (ymm, v0, 0x80); break; + } +#endif + return *this; + } + // Member function extract a single element from vector + float extract(int index) const { +#if INSTRSET >= 10 + __m256 x = _mm256_maskz_compress_ps(__mmask8(1u << index), ymm); + return _mm256_cvtss_f32(x); +#else + float x[8]; + store(x); + return x[index & 7]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ float operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4f: + Vec4f get_low() const { + return _mm256_castps256_ps128(ymm); + } + Vec4f get_high() const { + return _mm256_extractf128_ps(ymm,1); + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 16; + } + typedef __m256 registertype; +}; + + +/***************************************************************************** +* +* Operators for Vec8f +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec8f operator + (Vec8f const a, Vec8f const b) { + return _mm256_add_ps(a, b); +} + +// vector operator + : add vector and scalar +static inline Vec8f operator + (Vec8f const a, float b) { + return a + Vec8f(b); +} +static inline Vec8f operator + (float a, Vec8f const b) { + return Vec8f(a) + b; +} + +// vector operator += : add +static inline Vec8f & operator += (Vec8f & a, Vec8f const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec8f operator ++ (Vec8f & a, int) { + Vec8f a0 = a; + a = a + 1.0f; + return a0; +} + +// prefix operator ++ +static inline Vec8f & operator ++ (Vec8f & a) { + a = a + 1.0f; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8f operator - (Vec8f const a, Vec8f const b) { + return _mm256_sub_ps(a, b); +} + +// vector operator - : subtract vector and scalar +static inline Vec8f operator - (Vec8f const a, float b) { + return a - Vec8f(b); +} +static inline Vec8f operator - (float a, Vec8f const b) { + return Vec8f(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec8f operator - (Vec8f const a) { + return _mm256_xor_ps(a, Vec8f(-0.0f)); +} + +// vector operator -= : subtract +static inline Vec8f & operator -= (Vec8f & a, Vec8f const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec8f operator -- (Vec8f & a, int) { + Vec8f a0 = a; + a = a - 1.0f; + return a0; +} + +// prefix operator -- +static inline Vec8f & operator -- (Vec8f & a) { + a = a - 1.0f; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8f operator * (Vec8f const a, Vec8f const b) { + return _mm256_mul_ps(a, b); +} + +// vector operator * : multiply vector and scalar +static inline Vec8f operator * (Vec8f const a, float b) { + return a * Vec8f(b); +} +static inline Vec8f operator * (float a, Vec8f const b) { + return Vec8f(a) * b; +} + +// vector operator *= : multiply +static inline Vec8f & operator *= (Vec8f & a, Vec8f const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec8f operator / (Vec8f const a, Vec8f const b) { + return _mm256_div_ps(a, b); +} + +// vector operator / : divide vector and scalar +static inline Vec8f operator / (Vec8f const a, float b) { + return a / Vec8f(b); +} +static inline Vec8f operator / (float a, Vec8f const b) { + return Vec8f(a) / b; +} + +// vector operator /= : divide +static inline Vec8f & operator /= (Vec8f & a, Vec8f const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec8fb operator == (Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_ps_mask(a, b, 0); +#else + return _mm256_cmp_ps(a, b, 0); +#endif +} + +// vector operator != : returns true for elements 
for which a != b +static inline Vec8fb operator != (Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_ps_mask(a, b, 4); +#else + return _mm256_cmp_ps(a, b, 4); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec8fb operator < (Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_ps_mask(a, b, 1); +#else + return _mm256_cmp_ps(a, b, 1); +#endif +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec8fb operator <= (Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_ps_mask(a, b, 2); +#else + return _mm256_cmp_ps(a, b, 2); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec8fb operator > (Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_ps_mask(a, b, 6); +#else + return b < a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec8fb operator >= (Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_ps_mask(a, b, 5); +#else + return b <= a; +#endif +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec8f operator & (Vec8f const a, Vec8f const b) { + return _mm256_and_ps(a, b); +} + +// vector operator &= : bitwise and +static inline Vec8f & operator &= (Vec8f & a, Vec8f const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec8f and Vec8fb +static inline Vec8f operator & (Vec8f const a, Vec8fb const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_maskz_mov_ps(b, a); +#else + return _mm256_and_ps(a, b); +#endif +} +static inline Vec8f operator & (Vec8fb const a, Vec8f const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec8f operator | (Vec8f const a, Vec8f const b) { + return _mm256_or_ps(a, b); +} + +// vector operator |= : bitwise or +static inline Vec8f & operator |= (Vec8f & a, Vec8f const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8f operator ^ (Vec8f const a, Vec8f const b) { + return _mm256_xor_ps(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec8f & operator ^= (Vec8f & a, Vec8f const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec8fb operator ! (Vec8f const a) { + return a == Vec8f(0.0f); +} + + +/***************************************************************************** +* +* Functions for Vec8f +* +*****************************************************************************/ + +static inline Vec8f zero_8f() { + return _mm256_setzero_ps(); +} + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8f select (Vec8fb const s, Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_ps(b, s, a); +#else + return _mm256_blendv_ps (b, a, s); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] +static inline Vec8f if_add (Vec8fb const f, Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 + return _mm256_mask_add_ps (a, f, a, b); +#else + return a + (Vec8f(f) & b); +#endif +} + +// Conditional subtract +static inline Vec8f if_sub (Vec8fb const f, Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 + return _mm256_mask_sub_ps (a, f, a, b); +#else + return a - (Vec8f(f) & b); +#endif +} + +// Conditional multiply +static inline Vec8f if_mul (Vec8fb const f, Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 + return _mm256_mask_mul_ps (a, f, a, b); +#else + return a * select(f, b, 1.f); +#endif +} + +// Conditional divide +static inline Vec8f if_div (Vec8fb const f, Vec8f const a, Vec8f const b) { +#if INSTRSET >= 10 + return _mm256_mask_div_ps (a, f, a, b); +#else + return a / select(f, b, 1.f); +#endif +} + +// Sign functions + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0f, -INF and -NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec8fb sign_bit(Vec8f const a) { +#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer + Vec8i t2 = t1 >> 31; // extend sign bit +#if INSTRSET >= 10 + return t2 != 0; +#else + return _mm256_castsi256_ps(t2); // reinterpret as 32-bit Boolean +#endif +#else + return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high())); +#endif +} + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec8f sign_combine(Vec8f const a, Vec8f const b) { +#if INSTRSET < 10 + return a ^ (b & Vec8f(-0.0f)); +#else + return _mm256_castsi256_ps (_mm256_ternarylogic_epi32( + _mm256_castps_si256(a), _mm256_castps_si256(b), Vec8i(0x80000000), 0x78)); +#endif +} + +// Categorization functions + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec8fb is_finite(Vec8f const a) { +#if INSTRSET >= 10 // compact boolean vectors + return __mmask8(~ _mm256_fpclass_ps_mask (a, 0x99)); +#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer + Vec8i t2 = t1 << 1; // shift out sign bit + Vec8ib t3 = Vec8i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s + return t3; +#else + return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high())); +#endif +} + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec8fb is_inf(Vec8f const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_fpclass_ps_mask (a, 0x18); +#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer + Vec8i t2 = t1 << 1; // shift out sign bit + return t2 == 0xFF000000; // exponent is all 1s, fraction is 0 +#else + return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high())); +#endif +} + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +#if INSTRSET >= 10 +static inline Vec8fb is_nan(Vec8f const a) { + // assume that 
compiler does not optimize this away with -ffinite-math-only:
+    return _mm256_fpclass_ps_mask (a, 0x81);
+}
+//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
+//__attribute__((optimize("-fno-unsafe-math-optimizations")))
+//static inline Vec8fb is_nan(Vec8f const a) {
+//    return a != a; // not safe with -ffinite-math-only compiler option
+//}
+#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER)
+static inline Vec8fb is_nan(Vec8f const a) {
+    __m256 aa = a;
+    __m256 unordered;
+    __asm volatile("vcmpps $3, %1, %1, %0" : "=v" (unordered) : "v" (aa) );
+    return Vec8fb(unordered);
+}
+#else
+static inline Vec8fb is_nan(Vec8f const a) {
+    // assume that compiler does not optimize this away with -ffinite-math-only:
+    return _mm256_cmp_ps(a, a, 3); // compare unordered
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
+}
+#endif
+
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec8fb is_subnormal(Vec8f const a) {
+#if INSTRSET >= 10 // compact boolean vectors
+    return _mm256_fpclass_ps_mask (a, 0x20);
+#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2
+    Vec8i t1 = _mm256_castps_si256(a);     // reinterpret as 32-bit integer
+    Vec8i t2 = t1 << 1;                    // shift out sign bit
+    Vec8i t3 = 0xFF000000;                 // exponent mask
+    Vec8i t4 = t2 & t3;                    // exponent
+    Vec8i t5 = _mm256_andnot_si256(t3,t2); // fraction
+    return Vec8ib(t4 == 0 && t5 != 0);     // exponent = 0 and fraction != 0
+#else
+    return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
+#endif
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec8fb is_zero_or_subnormal(Vec8f const a) {
+#if INSTRSET >= 10 // compact boolean vectors
+    return _mm256_fpclass_ps_mask (a, 0x26);
+#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2
+    Vec8i t = _mm256_castps_si256(a);      // reinterpret as 32-bit integer
+    t &= 0x7F800000;                       // isolate exponent
+    return t == 0;                         // exponent = 0
+#else
+    return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
+#endif
+}
+
+// change signs on vectors Vec8f
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+inline Vec8f change_sign(Vec8f const a) {
+    if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a;
+    __m256 mask = constant8f<
+        (i0 ? 0x80000000u : 0u), (i1 ? 0x80000000u : 0u), (i2 ? 0x80000000u : 0u), (i3 ? 0x80000000u : 0u),
+        (i4 ? 0x80000000u : 0u), (i5 ? 0x80000000u : 0u), (i6 ? 0x80000000u : 0u), (i7 ? 0x80000000u : 0u)> ();
+    return _mm256_xor_ps(a, mask);
+}
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline float horizontal_add (Vec8f const a) {
+    return horizontal_add(a.get_low()+a.get_high());
+}
+
+// function max: a > b ? a : b
+static inline Vec8f max(Vec8f const a, Vec8f const b) {
+    return _mm256_max_ps(a,b);
+}
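+
+// Usage sketch for the helpers above (illustrative only, not part of the VCL source;
+// 'data' is a hypothetical float[8] that may contain INF/NAN values):
+//
+//     Vec8f v;
+//     v.load(data);                                      // load 8 floats
+//     Vec8f ok  = select(is_finite(v), v, Vec8f(0.0f));  // zero out non-finite lanes
+//     float sum = horizontal_add(ok);                    // sum of the remaining lanes
+
+// function min: a < b ?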
a : b
+static inline Vec8f min(Vec8f const a, Vec8f const b) {
+    return _mm256_min_ps(a,b);
+}
+// NAN-safe versions of maximum and minimum are in vector_convert.h
+
+// function abs: absolute value
+static inline Vec8f abs(Vec8f const a) {
+#if INSTRSET >= 10 // AVX512VL
+    return _mm256_range_ps(a, a, 8);
+#else
+    __m256 mask = constant8f<0x7FFFFFFFu,0x7FFFFFFFu,0x7FFFFFFFu,0x7FFFFFFFu,0x7FFFFFFFu,0x7FFFFFFFu,0x7FFFFFFFu,0x7FFFFFFFu> ();
+    return _mm256_and_ps(a,mask);
+#endif
+}
+
+// function sqrt: square root
+static inline Vec8f sqrt(Vec8f const a) {
+    return _mm256_sqrt_ps(a);
+}
+
+// function square: a * a
+static inline Vec8f square(Vec8f const a) {
+    return a * a;
+}
+
+// The purpose of this template is to prevent implicit conversion of a float
+// exponent to int when calling pow(vector, float) and vectormath_exp.h is not included
+template <typename TT> static Vec8f pow(Vec8f const a, TT const n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec8f pow<int>(Vec8f const x0, int const n) {
+    return pow_template_i<Vec8f>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec8f pow<uint32_t>(Vec8f const x0, uint32_t const n) {
+    return pow_template_i<Vec8f>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec8f pow(Vec8f const a, Const_int_t<n>) {
+    return pow_n<Vec8f, n>(a);
+}
+
+// function round: round to nearest integer (even). (result as float vector)
+static inline Vec8f round(Vec8f const a) {
+    return _mm256_round_ps(a, 0+8);
+}
+
+// function truncate: round towards zero. (result as float vector)
+static inline Vec8f truncate(Vec8f const a) {
+    return _mm256_round_ps(a, 3+8);
+}
+
+// function floor: round towards minus infinity. (result as float vector)
+static inline Vec8f floor(Vec8f const a) {
+    return _mm256_round_ps(a, 1+8);
+}
+
+// function ceil: round towards plus infinity. (result as float vector)
+static inline Vec8f ceil(Vec8f const a) {
+    return _mm256_round_ps(a, 2+8);
+}
+
+#if INSTRSET >= 8 // 256 bit integer vectors are available
+
+// function roundi: round to nearest integer (even). (result as integer vector)
+static inline Vec8i roundi(Vec8f const a) {
+    // Note: assume MXCSR control register is set to rounding
+    return _mm256_cvtps_epi32(a);
+}
+
+// function truncatei: round towards zero. (result as integer vector)
+static inline Vec8i truncatei(Vec8f const a) {
+    return _mm256_cvttps_epi32(a);
+}
+
+// function to_float: convert integer vector to float vector
+static inline Vec8f to_float(Vec8i const a) {
+    return _mm256_cvtepi32_ps(a);
+}
+
+// function to_float: convert unsigned integer vector to float vector
+static inline Vec8f to_float(Vec8ui const a) {
+#if INSTRSET >= 10 && !defined (_MSC_VER) // _mm256_cvtepu32_ps missing in VS2019
+    return _mm256_cvtepu32_ps(a);
+#elif INSTRSET >= 9 // __AVX512F__
+    return _mm512_castps512_ps256(_mm512_cvtepu32_ps(_mm512_castsi256_si512(a)));
+#else
+    Vec8f b = to_float(Vec8i(a & 0xFFFFF)); // 20 bits
+    Vec8f c = to_float(Vec8i(a >> 20));     // remaining bits
+    Vec8f d = b + c * 1048576.f;            // 2^20
+    return d;
+#endif
+}
+
+#else // no AVX2
+
+// function roundi: round to nearest integer (even). (result as integer vector)
+static inline Vec8i roundi(Vec8f const a) {
+    // Note: assume MXCSR control register is set to rounding
+    return Vec8i(_mm_cvtps_epi32(a.get_low()), _mm_cvtps_epi32(a.get_high()));
+}
+
+// function truncatei: round towards zero.
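// Example (illustrative sketch, not part of the patched sources; results assume the
// default MXCSR round-to-nearest mode, as noted above):
//     Vec8f x(1.5f, 2.5f, -1.5f, 0.4f, 0.6f, -0.5f, 7.0f, 8.0f);
//     Vec8i r = roundi(x);     // 2, 2, -2, 0, 1, 0, 7, 8   (nearest, ties to even)
//     Vec8i t = truncatei(x);  // 1, 2, -1, 0, 0, 0, 7, 8   (towards zero)
//     Vec8f y = to_float(r);   // back to single precision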
(result as integer vector) +static inline Vec8i truncatei(Vec8f const a) { + return Vec8i(_mm_cvttps_epi32(a.get_low()), _mm_cvttps_epi32(a.get_high())); +} + +// function to_float: convert integer vector to float vector +static inline Vec8f to_float(Vec8i const a) { + return Vec8f(_mm_cvtepi32_ps(a.get_low()), _mm_cvtepi32_ps(a.get_high())); +} + +// function to_float: convert unsigned integer vector to float vector +static inline Vec8f to_float(Vec8ui const a) { + return Vec8f(to_float(a.get_low()), to_float(a.get_high())); +} +#endif // AVX2 + + +// Fused multiply and add functions + +// Multiply and add +static inline Vec8f mul_add(Vec8f const a, Vec8f const b, Vec8f const c) { +#ifdef __FMA__ + return _mm256_fmadd_ps(a, b, c); +#elif defined (__FMA4__) + return _mm256_macc_ps(a, b, c); +#else + return a * b + c; +#endif +} + +// Multiply and subtract +static inline Vec8f mul_sub(Vec8f const a, Vec8f const b, Vec8f const c) { +#ifdef __FMA__ + return _mm256_fmsub_ps(a, b, c); +#elif defined (__FMA4__) + return _mm256_msub_ps(a, b, c); +#else + return a * b - c; +#endif +} + +// Multiply and inverse subtract +static inline Vec8f nmul_add(Vec8f const a, Vec8f const b, Vec8f const c) { +#ifdef __FMA__ + return _mm256_fnmadd_ps(a, b, c); +#elif defined (__FMA4__) + return _mm256_nmacc_ps(a, b, c); +#else + return c - a * b; +#endif +} + + +// Multiply and subtract with extra precision on the intermediate calculations, +// even if FMA instructions not supported, using Veltkamp-Dekker split +// This is used in mathematical functions. Do not use it in general code +// because it is inaccurate in certain cases +static inline Vec8f mul_sub_x(Vec8f const a, Vec8f const b, Vec8f const c) { +#ifdef __FMA__ + return _mm256_fmsub_ps(a, b, c); +#elif defined (__FMA4__) + return _mm256_msub_ps(a, b, c); +#else + // calculate a * b - c with extra precision + const uint32_t b12 = uint32_t(-(1 << 12)); // mask to remove lower 12 bits + Vec8f upper_mask = constant8f(); + Vec8f a_high = a & upper_mask; // split into high and low parts + Vec8f b_high = b & upper_mask; + Vec8f a_low = a - a_high; + Vec8f b_low = b - b_high; + Vec8f r1 = a_high * b_high; // this product is exact + Vec8f r2 = r1 - c; // subtract c from high product + Vec8f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product + return r3; // + ((r2 - r1) + c); +#endif +} + + +// Approximate math functions + +// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11) +static inline Vec8f approx_recipr(Vec8f const a) { +#ifdef __AVX512ER__ // AVX512ER: full precision + // todo: if future processors have both AVX512ER and AVX512VL: _mm256_rcp28_round_ps(a, _MM_FROUND_NO_EXC); + return _mm512_castps512_ps256(_mm512_rcp28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC)); +#elif INSTRSET >= 10 // AVX512VL: 14 bit precision + return _mm256_rcp14_ps(a); +#elif INSTRSET >= 9 // AVX512F: 14 bit precision + return _mm512_castps512_ps256(_mm512_rcp14_ps(_mm512_castps256_ps512(a))); +#else // AVX: 11 bit precision + return _mm256_rcp_ps(a); +#endif +} + +// Newton-Raphson refined approximate reciprocal (23 bit precision) +static inline Vec8f rcp_nr(Vec8f const a) { + Vec8f nr = _mm256_rcp_ps(a); + Vec8f muls = nr * nr * a; + Vec8f dbl = nr + nr; + return dbl - muls; +} + +// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11) +static inline Vec8f approx_rsqrt(Vec8f const a) { +// use more accurate version if available. 
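// Example (illustrative sketch, not part of the patched sources): the approximate
// reciprocal square root above can be refined to near full single precision with one
// Newton-Raphson step, using the fused multiply-add helpers defined earlier:
//     Vec8f r = approx_rsqrt(a);                // initial estimate of 1/sqrt(a)
//     r = r * nmul_add(0.5f * a * r, r, 1.5f);  // r * (1.5 - 0.5*a*r*r)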
(none of these will raise exceptions on zero) +#ifdef __AVX512ER__ // AVX512ER: full precision + // todo: if future processors have both AVX512ER and AVX521VL: _mm256_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC); + return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC)); +#elif INSTRSET >= 10 && defined(_mm256_rsqrt14_ps) // missing in VS2019 + return _mm256_rsqrt14_ps(a); +#elif INSTRSET >= 9 // AVX512F: 14 bit precision + return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a))); +#else // AVX: 11 bit precision + return _mm256_rsqrt_ps(a); +#endif +} + + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 +static inline Vec8i exponent(Vec8f const a) { +#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec8ui t1 = _mm256_castps_si256(a);// reinterpret as 32-bit integer + Vec8ui t2 = t1 << 1; // shift out sign bit + Vec8ui t3 = t2 >> 24; // shift down logical to position 0 + Vec8i t4 = Vec8i(t3) - 0x7F; // subtract bias from exponent + return t4; +#else // no AVX2 + return Vec8i(exponent(a.get_low()), exponent(a.get_high())); +#endif +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f +static inline Vec8f fraction(Vec8f const a) { +#if INSTRSET >= 10 + return _mm256_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); +#elif INSTRSET >= 8 // AVX2. 256 bit integer vectors are available + Vec8ui t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer + Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F800000; // set exponent to 0 + bias + return _mm256_castsi256_ps(t2); +#else + return Vec8f(fraction(a.get_low()), fraction(a.get_high())); +#endif +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0f +// n >= 128 gives +INF +// n <= -127 gives 0.0f +// This function will never produce denormals, and never raise exceptions +static inline Vec8f exp2(Vec8i const n) { +#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec8i t1 = max(n, -0x7F); // limit to allowed range + Vec8i t2 = min(t1, 0x80); + Vec8i t3 = t2 + 0x7F; // add bias + Vec8i t4 = t3 << 23; // put exponent into position 23 + return _mm256_castsi256_ps(t4); // reinterpret as float +#else + return Vec8f(exp2(n.get_low()), exp2(n.get_high())); +#endif // AVX2 +} +//static inline Vec8f exp2(Vec8f const x); // defined in vectormath_exp.h + + + +/***************************************************************************** +* +* Vec4d: Vector of 4 double precision floating point values +* +*****************************************************************************/ + +class Vec4d { +protected: + __m256d ymm; // double vector +public: + // Default constructor: + Vec4d() { + } + // Constructor to broadcast the same value into all elements: + Vec4d(double d) { + ymm = _mm256_set1_pd(d); + } + // Constructor to build from all elements: + Vec4d(double d0, double d1, double d2, double d3) { + ymm = _mm256_setr_pd(d0, d1, d2, d3); + } + // Constructor to build from two Vec2d: + Vec4d(Vec2d const a0, Vec2d const a1) { + ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0), _mm_castpd_ps(a1))); + //ymm = _mm256_set_m128d(a1, a0); + } + // Constructor to convert from type __m256d used in intrinsics: + Vec4d(__m256d const x) { + ymm = x; + } + // Assignment operator to 
convert from type __m256d used in intrinsics: + Vec4d & operator = (__m256d const x) { + ymm = x; + return *this; + } + // Type cast operator to convert to __m256d used in intrinsics + operator __m256d() const { + return ymm; + } + // Member function to load from array (unaligned) + Vec4d & load(double const * p) { + ymm = _mm256_loadu_pd(p); + return *this; + } + // Member function to load from array, aligned by 32 + // You may use load_a instead of load if you are certain that p points to an address + // divisible by 32 + Vec4d & load_a(double const * p) { + ymm = _mm256_load_pd(p); + return *this; + } + // Member function to store into array (unaligned) + void store(double * p) const { + _mm256_storeu_pd(p, ymm); + } + // Member function storing into array, aligned by 32 + // You may use store_a instead of store if you are certain that p points to an address + // divisible by 32 + void store_a(double * p) const { + _mm256_store_pd(p, ymm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 32 + void store_nt(double * p) const { + _mm256_stream_pd(p, ymm); + } + // Partial load. Load n elements and set the rest to 0 + Vec4d & load_partial(int n, double const * p) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_maskz_loadu_pd(__mmask8((1u << n) - 1), p); +#else + if (n > 0 && n <= 2) { + *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd()); + } + else if (n > 2 && n <= 4) { + *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2)); + } + else { + ymm = _mm256_setzero_pd(); + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, double * p) const { +#if INSTRSET >= 10 // AVX512VL + _mm256_mask_storeu_pd(p, __mmask8((1u << n) - 1), ymm); +#else + if (n <= 2) { + get_low().store_partial(n, p); + } + else if (n <= 4) { + get_low().store(p); + get_high().store_partial(n - 2, p + 2); + } +#endif + } + // cut off vector to n elements. The last 4-n elements are set to zero + Vec4d & cutoff(int n) { +#if INSTRSET >= 10 + ymm = _mm256_maskz_mov_pd(__mmask8((1u << n) - 1), ymm); +#else + ymm = _mm256_castps_pd(Vec8f(_mm256_castpd_ps(ymm)).cutoff(n*2)); +#endif + return *this; + } + // Member function to change a single element in vector + // Note: This function is inefficient. Use load function if changing more than one element + Vec4d const insert(int index, double value) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_mask_broadcastsd_pd (ymm, __mmask8(1u << index), _mm_set_sd(value)); +#else + __m256d v0 = _mm256_broadcast_sd(&value); + switch (index) { + case 0: + ymm = _mm256_blend_pd (ymm, v0, 1); break; + case 1: + ymm = _mm256_blend_pd (ymm, v0, 2); break; + case 2: + ymm = _mm256_blend_pd (ymm, v0, 4); break; + default: + ymm = _mm256_blend_pd (ymm, v0, 8); break; + } +#endif + return *this; + } + // Member function extract a single element from vector + double extract(int index) const { +#if INSTRSET >= 10 + __m256d x = _mm256_maskz_compress_pd(__mmask8(1u << index), ymm); + return _mm256_cvtsd_f64(x); +#else + double x[4]; + store(x); + return x[index & 3]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
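// Example (illustrative sketch, not part of the patched sources): load_partial and
// store_partial let a loop handle an array length that is not a multiple of 4.
// The function and variable names below are only for illustration:
//     void add_one(double * p, int n) {
//         int i = 0;
//         for ( ; i + 4 <= n; i += 4) {
//             Vec4d v = Vec4d().load(p + i);
//             (v + 1.0).store(p + i);
//         }
//         Vec4d v = Vec4d().load_partial(n - i, p + i);  // remaining 0-3 elements
//         (v + 1.0).store_partial(n - i, p + i);
//     }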
+ double operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec2d: + Vec2d get_low() const { + return _mm256_castpd256_pd128(ymm); + } + Vec2d get_high() const { + return _mm256_extractf128_pd(ymm,1); + } + static constexpr int size() { + return 4; + } + static constexpr int elementtype() { + return 17; + } + typedef __m256d registertype; +}; + + +/***************************************************************************** +* +* Operators for Vec4d +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec4d operator + (Vec4d const a, Vec4d const b) { + return _mm256_add_pd(a, b); +} + +// vector operator + : add vector and scalar +static inline Vec4d operator + (Vec4d const a, double b) { + return a + Vec4d(b); +} +static inline Vec4d operator + (double a, Vec4d const b) { + return Vec4d(a) + b; +} + +// vector operator += : add +static inline Vec4d & operator += (Vec4d & a, Vec4d const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec4d operator ++ (Vec4d & a, int) { + Vec4d a0 = a; + a = a + 1.0; + return a0; +} + +// prefix operator ++ +static inline Vec4d & operator ++ (Vec4d & a) { + a = a + 1.0; + return a; +} + +// vector operator - : subtract element by element +static inline Vec4d operator - (Vec4d const a, Vec4d const b) { + return _mm256_sub_pd(a, b); +} + +// vector operator - : subtract vector and scalar +static inline Vec4d operator - (Vec4d const a, double b) { + return a - Vec4d(b); +} +static inline Vec4d operator - (double a, Vec4d const b) { + return Vec4d(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec4d operator - (Vec4d const a) { + return _mm256_xor_pd(a, _mm256_castps_pd(constant8f<0u,0x80000000u,0u,0x80000000u,0u,0x80000000u,0u,0x80000000u> ())); +} + +// vector operator -= : subtract +static inline Vec4d & operator -= (Vec4d & a, Vec4d const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec4d operator -- (Vec4d & a, int) { + Vec4d a0 = a; + a = a - 1.0; + return a0; +} + +// prefix operator -- +static inline Vec4d & operator -- (Vec4d & a) { + a = a - 1.0; + return a; +} + +// vector operator * : multiply element by element +static inline Vec4d operator * (Vec4d const a, Vec4d const b) { + return _mm256_mul_pd(a, b); +} + +// vector operator * : multiply vector and scalar +static inline Vec4d operator * (Vec4d const a, double b) { + return a * Vec4d(b); +} +static inline Vec4d operator * (double a, Vec4d const b) { + return Vec4d(a) * b; +} + +// vector operator *= : multiply +static inline Vec4d & operator *= (Vec4d & a, Vec4d const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec4d operator / (Vec4d const a, Vec4d const b) { + return _mm256_div_pd(a, b); +} + +// vector operator / : divide vector and scalar +static inline Vec4d operator / (Vec4d const a, double b) { + return a / Vec4d(b); +} +static inline Vec4d operator / (double a, Vec4d const b) { + return Vec4d(a) / b; +} + +// vector operator /= : divide +static inline Vec4d & operator /= (Vec4d & a, Vec4d const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec4db operator == (Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_pd_mask(a, b, 0); +#else + return 
_mm256_cmp_pd(a, b, 0); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec4db operator != (Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_pd_mask(a, b, 4); +#else + return _mm256_cmp_pd(a, b, 4); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec4db operator < (Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_pd_mask(a, b, 1); +#else + return _mm256_cmp_pd(a, b, 1); +#endif +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec4db operator <= (Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_pd_mask(a, b, 2); +#else + return _mm256_cmp_pd(a, b, 2); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec4db operator > (Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_pd_mask(a, b, 6); +#else + return b < a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec4db operator >= (Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_pd_mask(a, b, 5); +#else + return b <= a; +#endif +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec4d operator & (Vec4d const a, Vec4d const b) { + return _mm256_and_pd(a, b); +} + +// vector operator &= : bitwise and +static inline Vec4d & operator &= (Vec4d & a, Vec4d const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec4d and Vec4db +static inline Vec4d operator & (Vec4d const a, Vec4db const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_maskz_mov_pd(b, a); +#else + return _mm256_and_pd(a, b); +#endif +} +static inline Vec4d operator & (Vec4db const a, Vec4d const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec4d operator | (Vec4d const a, Vec4d const b) { + return _mm256_or_pd(a, b); +} + +// vector operator |= : bitwise or +static inline Vec4d & operator |= (Vec4d & a, Vec4d const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec4d operator ^ (Vec4d const a, Vec4d const b) { + return _mm256_xor_pd(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec4d & operator ^= (Vec4d & a, Vec4d const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec4db operator ! (Vec4d const a) { + return a == Vec4d(0.0); +} + + +/***************************************************************************** +* +* Functions for Vec4d +* +*****************************************************************************/ + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec4d select (Vec4db const s, Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_pd(b, s, a); +#else + return _mm256_blendv_pd(b, a, s); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] +static inline Vec4d if_add (Vec4db const f, Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 + return _mm256_mask_add_pd (a, f, a, b); +#else + return a + (Vec4d(f) & b); +#endif +} + +// Conditional subtract +static inline Vec4d if_sub (Vec4db const f, Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 + return _mm256_mask_sub_pd (a, f, a, b); +#else + return a - (Vec4d(f) & b); +#endif +} + +// Conditional multiply +static inline Vec4d if_mul (Vec4db const f, Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 + return _mm256_mask_mul_pd (a, f, a, b); +#else + return a * select(f, b, 1.); +#endif +} + +// Conditional divide +static inline Vec4d if_div (Vec4db const f, Vec4d const a, Vec4d const b) { +#if INSTRSET >= 10 + return _mm256_mask_div_pd (a, f, a, b); +#else + return a / select(f, b, 1.); +#endif +} + +// sign functions + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec4d sign_combine(Vec4d const a, Vec4d const b) { +#if INSTRSET < 10 + return a ^ (b & Vec4d(-0.0)); +#else + return _mm256_castsi256_pd (_mm256_ternarylogic_epi64( + _mm256_castpd_si256(a), _mm256_castpd_si256(b), Vec4q(0x8000000000000000), 0x78)); +#endif +} + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +static inline Vec4db is_finite(Vec4d const a) { +#if INSTRSET >= 10 // compact boolean vectors + return __mmask8(~ _mm256_fpclass_pd_mask (a, 0x99)); +#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer + Vec4q t2 = t1 << 1; // shift out sign bit + Vec4q t3 = 0xFFE0000000000000; // exponent mask + Vec4qb t4 = Vec4q(t2 & t3) != t3; // exponent field is not all 1s + return t4; +#else + return Vec4db(is_finite(a.get_low()),is_finite(a.get_high())); +#endif +} + +// categorization functions + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +static inline Vec4db is_inf(Vec4d const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_fpclass_pd_mask (a, 0x18); +#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer + Vec4q t2 = t1 << 1; // shift out sign bit + return t2 == 0xFFE0000000000000; // exponent is all 1s, fraction is 0 +#else + return Vec4db(is_inf(a.get_low()),is_inf(a.get_high())); +#endif +} + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +#if INSTRSET >= 10 +static inline Vec4db is_nan(Vec4d const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return _mm256_fpclass_pd_mask (a, 0x81); +} +//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +//__attribute__((optimize("-fno-unsafe-math-optimizations"))) +//static inline Vec4db is_nan(Vec4d const a) { +// return a != a; // not safe with -ffinite-math-only compiler option +//} +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER) +static inline Vec4db is_nan(Vec4d const a) { + __m256d aa = a; + __m256d unordered; + __asm volatile("vcmppd $3, %1, %1, %0" : "=v" (unordered) : "v" (aa) ); + return Vec4db(unordered); +} +#else +static inline Vec4db is_nan(Vec4d const a) { + // assume that compiler does not optimize this away 
with -ffinite-math-only:
+    return _mm256_cmp_pd(a, a, 3); // compare unordered
+    // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option
+}
+#endif
+
+
+// Function is_subnormal: gives true for elements that are denormal (subnormal)
+// false for finite numbers, zero, NAN and INF
+static inline Vec4db is_subnormal(Vec4d const a) {
+#if INSTRSET >= 10 // compact boolean vectors
+    return _mm256_fpclass_pd_mask (a, 0x20);
+#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2
+    Vec4q t1 = _mm256_castpd_si256(a);     // reinterpret as 64-bit integer
+    Vec4q t2 = t1 << 1;                    // shift out sign bit
+    Vec4q t3 = 0xFFE0000000000000;         // exponent mask
+    Vec4q t4 = t2 & t3;                    // exponent
+    Vec4q t5 = _mm256_andnot_si256(t3,t2); // fraction
+    return Vec4qb(t4 == 0 && t5 != 0);     // exponent = 0 and fraction != 0
+#else
+    return Vec4db(is_subnormal(a.get_low()),is_subnormal(a.get_high()));
+#endif
+}
+
+// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
+// false for finite numbers, NAN and INF
+static inline Vec4db is_zero_or_subnormal(Vec4d const a) {
+#if INSTRSET >= 10 // compact boolean vectors
+    return _mm256_fpclass_pd_mask (a, 0x26);
+#elif INSTRSET >= 8 // 256 bit integer vectors are available, AVX2
+    Vec4q t = _mm256_castpd_si256(a);      // reinterpret as 64-bit integer
+    t &= 0x7FF0000000000000ll;             // isolate exponent
+    return t == 0;                         // exponent = 0
+#else
+    return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high()));
+#endif
+}
+
+// General arithmetic functions, etc.
+
+// Horizontal add: Calculates the sum of all vector elements.
+static inline double horizontal_add (Vec4d const a) {
+    return horizontal_add(a.get_low() + a.get_high());
+}
+
+// function max: a > b ? a : b
+static inline Vec4d max(Vec4d const a, Vec4d const b) {
+    return _mm256_max_pd(a,b);
+}
+
+// function min: a < b ? a : b
+static inline Vec4d min(Vec4d const a, Vec4d const b) {
+    return _mm256_min_pd(a,b);
+}
+// NAN-safe versions of maximum and minimum are in vector_convert.h
+
+// function abs: absolute value
+static inline Vec4d abs(Vec4d const a) {
+#if INSTRSET >= 10 // AVX512VL
+    return _mm256_range_pd(a, a, 8);
+#else
+    __m256d mask = _mm256_castps_pd(constant8f<0xFFFFFFFFu,0x7FFFFFFFu,0xFFFFFFFFu,0x7FFFFFFFu,0xFFFFFFFFu,0x7FFFFFFFu,0xFFFFFFFFu,0x7FFFFFFFu> ());
+    return _mm256_and_pd(a,mask);
+#endif
+}
+
+// function sqrt: square root
+static inline Vec4d sqrt(Vec4d const a) {
+    return _mm256_sqrt_pd(a);
+}
+
+// function square: a * a
+static inline Vec4d square(Vec4d const a) {
+    return a * a;
+}
+
+// The purpose of this template is to prevent implicit conversion of a float
+// exponent to int when calling pow(vector, float) and vectormath_exp.h is not included
+template <typename TT> static Vec4d pow(Vec4d const a, TT const n);
+
+// Raise floating point numbers to integer power n
+template <>
+inline Vec4d pow<int>(Vec4d const x0, int const n) {
+    return pow_template_i<Vec4d>(x0, n);
+}
+
+// allow conversion from unsigned int
+template <>
+inline Vec4d pow<uint32_t>(Vec4d const x0, uint32_t const n) {
+    return pow_template_i<Vec4d>(x0, (int)n);
+}
+
+// Raise floating point numbers to integer power n, where n is a compile-time constant
+template <int n>
+static inline Vec4d pow(Vec4d const a, Const_int_t<n>) {
+    return pow_n<Vec4d, n>(a);
+}
+
+
+// function round: round to nearest integer (even).
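// Example (illustrative sketch, not part of the patched sources): the categorization
// functions combine with select to sanitize data, e.g. replacing INF/NAN elements by zero:
//     static inline Vec4d zero_non_finite(Vec4d const x) {
//         return select(is_finite(x), x, Vec4d(0.0));
//     }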
(result as double vector) +static inline Vec4d round(Vec4d const a) { + return _mm256_round_pd(a, 0+8); +} + +// function truncate: round towards zero. (result as double vector) +static inline Vec4d truncate(Vec4d const a) { + return _mm256_round_pd(a, 3+8); +} + +// function floor: round towards minus infinity. (result as double vector) +static inline Vec4d floor(Vec4d const a) { + return _mm256_round_pd(a, 1+8); +} + +// function ceil: round towards plus infinity. (result as double vector) +static inline Vec4d ceil(Vec4d const a) { + return _mm256_round_pd(a, 2+8); +} + +// function round_to_int32: round to nearest integer (even). (result as integer vector) +static inline Vec4i round_to_int32(Vec4d const a) { + // Note: assume MXCSR control register is set to rounding + return _mm256_cvtpd_epi32(a); +} + +// function truncate_to_int32: round towards zero. (result as integer vector) +static inline Vec4i truncate_to_int32(Vec4d const a) { + return _mm256_cvttpd_epi32(a); +} + +#if INSTRSET >= 8 // 256 bit integer vectors are available. AVX2 + +// function truncatei: round towards zero +static inline Vec4q truncatei(Vec4d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm256_cvttpd_epi64(a); +#else + double aa[4]; // inefficient + a.store(aa); + return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3])); +#endif +} + +// function roundi: round to nearest or even +static inline Vec4q roundi(Vec4d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm256_cvtpd_epi64(a); +#else + return truncatei(round(a)); // inefficient +#endif +} + +// function to_double: convert integer vector elements to double vector +static inline Vec4d to_double(Vec4q const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm256_maskz_cvtepi64_pd( __mmask16(0xFF), a); +#else + int64_t aa[4]; // inefficient + a.store(aa); + return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3])); +#endif +} + +static inline Vec4d to_double(Vec4uq const a) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm256_cvtepu64_pd(a); +#else + uint64_t aa[4]; // inefficient + a.store(aa); + return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3])); +#endif +} + +#else // no 256 bit integer vectors + +// function truncatei: round towards zero. (inefficient) +static inline Vec4q truncatei(Vec4d const a) { + return Vec4q(truncatei(a.get_low()), truncatei(a.get_high())); +} + +// function roundi: round to nearest or even. 
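// Example (illustrative sketch, not part of the patched sources; round_to_int32 assumes
// the default round-to-nearest mode):
//     Vec4d d(1.25, -2.75, 2.5, 1.0e6);
//     Vec4i i32 = round_to_int32(d);   // 1, -3, 2, 1000000  (nearest, ties to even)
//     Vec4q i64 = truncatei(d);        // 1, -2, 2, 1000000  (towards zero)
//     Vec4d e   = to_double(i64);      // back to double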
(inefficient) +static inline Vec4q roundi(Vec4d const a) { + return Vec4q(roundi(a.get_low()), roundi(a.get_high())); +} + +// function to_double: convert integer vector elements to double vector +static inline Vec4d to_double(Vec4q const a) { + return Vec4d(to_double(a.get_low()), to_double(a.get_high())); +} + +static inline Vec4d to_double(Vec4uq const a) { + return Vec4d(to_double(a.get_low()), to_double(a.get_high())); +} + +#endif // AVX2 + + +// function to_double: convert integer vector to double vector +static inline Vec4d to_double(Vec4i const a) { + return _mm256_cvtepi32_pd(a); +} + +// function compress: convert two Vec4d to one Vec8f +static inline Vec8f compress (Vec4d const low, Vec4d const high) { + __m128 t1 = _mm256_cvtpd_ps(low); + __m128 t2 = _mm256_cvtpd_ps(high); + return Vec8f(t1, t2); +} + +// Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d +static inline Vec4d extend_low(Vec8f const a) { + return _mm256_cvtps_pd(_mm256_castps256_ps128(a)); +} + +// Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d +static inline Vec4d extend_high (Vec8f const a) { + return _mm256_cvtps_pd(_mm256_extractf128_ps(a,1)); +} + +// Fused multiply and add functions + +// Multiply and add +static inline Vec4d mul_add(Vec4d const a, Vec4d const b, Vec4d const c) { +#ifdef __FMA__ + return _mm256_fmadd_pd(a, b, c); +#elif defined (__FMA4__) + return _mm256_macc_pd(a, b, c); +#else + return a * b + c; +#endif + +} + +// Multiply and subtract +static inline Vec4d mul_sub(Vec4d const a, Vec4d const b, Vec4d const c) { +#ifdef __FMA__ + return _mm256_fmsub_pd(a, b, c); +#elif defined (__FMA4__) + return _mm256_msub_pd(a, b, c); +#else + return a * b - c; +#endif +} + +// Multiply and inverse subtract +static inline Vec4d nmul_add(Vec4d const a, Vec4d const b, Vec4d const c) { +#ifdef __FMA__ + return _mm256_fnmadd_pd(a, b, c); +#elif defined (__FMA4__) + return _mm256_nmacc_pd(a, b, c); +#else + return c - a * b; +#endif +} + +// Multiply and subtract with extra precision on the intermediate calculations, +// even if FMA instructions not supported, using Veltkamp-Dekker split. +// This is used in mathematical functions. 
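// Example (illustrative sketch, not part of the patched sources): extend_low/extend_high
// and compress allow a Vec8f computation to be carried out in double precision:
//     static inline Vec8f add_in_double(Vec8f const a, Vec8f const b) {
//         Vec4d lo = extend_low(a)  + extend_low(b);
//         Vec4d hi = extend_high(a) + extend_high(b);
//         return compress(lo, hi);
//     }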
Do not use it in general code +// because it is inaccurate in certain cases +static inline Vec4d mul_sub_x(Vec4d const a, Vec4d const b, Vec4d const c) { +#ifdef __FMA__ + return _mm256_fmsub_pd(a, b, c); +#elif defined (__FMA4__) + return _mm256_msub_pd(a, b, c); +#else + // calculate a * b - c with extra precision + // mask to remove lower 27 bits + Vec4d upper_mask = _mm256_castps_pd(constant8f<0xF8000000u,0xFFFFFFFFu,0xF8000000u,0xFFFFFFFFu,0xF8000000u,0xFFFFFFFFu,0xF8000000u,0xFFFFFFFFu>()); + Vec4d a_high = a & upper_mask; // split into high and low parts + Vec4d b_high = b & upper_mask; + Vec4d a_low = a - a_high; + Vec4d b_low = b - b_high; + Vec4d r1 = a_high * b_high; // this product is exact + Vec4d r2 = r1 - c; // subtract c from high product + Vec4d r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product + return r3; // + ((r2 - r1) + c); +#endif +} + + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 +static inline Vec4q exponent(Vec4d const a) { +#if INSTRSET >= 8 // 256 bit integer vectors are available + Vec4uq t1 = _mm256_castpd_si256(a);// reinterpret as 64-bit integer + Vec4uq t2 = t1 << 1; // shift out sign bit + Vec4uq t3 = t2 >> 53; // shift down logical to position 0 + Vec4q t4 = Vec4q(t3) - 0x3FF; // subtract bias from exponent + return t4; +#else + return Vec4q(exponent(a.get_low()), exponent(a.get_high())); +#endif +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0) = 1.0, fraction(5.0) = 1.25 +static inline Vec4d fraction(Vec4d const a) { +#if INSTRSET >= 10 + return _mm256_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); +#elif INSTRSET >= 8 // AVX2. 
256 bit integer vectors are available + Vec4uq t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer + Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFF) | 0x3FF0000000000000); // set exponent to 0 + bias + return _mm256_castsi256_pd(t2); +#else + return Vec4d(fraction(a.get_low()), fraction(a.get_high())); +#endif +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0 +// n >= 1024 gives +INF +// n <= -1023 gives 0.0 +// This function will never produce denormals, and never raise exceptions +static inline Vec4d exp2(Vec4q const n) { +#if INSTRSET >= 8 // 256 bit integer vectors are available + Vec4q t1 = max(n, -0x3FF); // limit to allowed range + Vec4q t2 = min(t1, 0x400); + Vec4q t3 = t2 + 0x3FF; // add bias + Vec4q t4 = t3 << 52; // put exponent into position 52 + return _mm256_castsi256_pd(t4); // reinterpret as double +#else + return Vec4d(exp2(n.get_low()), exp2(n.get_high())); +#endif +} +//static inline Vec4d exp2(Vec4d const x); // defined in vectormath_exp.h + + +// Categorization functions + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0, -INF and -NAN +// Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false +static inline Vec4db sign_bit(Vec4d const a) { +#if INSTRSET >= 8 // 256 bit integer vectors are available, AVX2 + Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer + Vec4q t2 = t1 >> 63; // extend sign bit +#if INSTRSET >= 10 + return t2 != 0; +#else + return _mm256_castsi256_pd(t2); // reinterpret as 64-bit Boolean +#endif +#else + return Vec4db(sign_bit(a.get_low()),sign_bit(a.get_high())); +#endif +} + +// change signs on vectors Vec4d +// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change +template +inline Vec4d change_sign(Vec4d const a) { + if ((i0 | i1 | i2 | i3) == 0) return a; + __m256d mask = _mm256_castps_pd(constant8f < + 0u, (i0 ? 0x80000000u : 0u), 0u, (i1 ? 0x80000000u : 0u), 0u, (i2 ? 0x80000000u : 0u), 0u, (i3 ? 0x80000000u : 0u)> ()); + return _mm256_xor_pd(a, mask); +} + + +/***************************************************************************** +* +* Functions for reinterpretation between vector types +* +*****************************************************************************/ + +#if INSTRSET >= 8 // AVX2 + +// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors. +// If necessary, compile with -fabi-version=0 to get the latest abi version +//#if !defined (GCC_VERSION) || (defined (__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004) +static inline __m256i reinterpret_i (__m256i const x) { + return x; +} + +static inline __m256i reinterpret_i (__m256 const x) { + return _mm256_castps_si256(x); +} + +static inline __m256i reinterpret_i (__m256d const x) { + return _mm256_castpd_si256(x); +} + +static inline __m256 reinterpret_f (__m256i const x) { + return _mm256_castsi256_ps(x); +} + +static inline __m256 reinterpret_f (__m256 const x) { + return x; +} + +static inline __m256 reinterpret_f (__m256d const x) { + return _mm256_castpd_ps(x); +} + +static inline __m256d reinterpret_d (__m256i const x) { + return _mm256_castsi256_pd(x); +} + +static inline __m256d reinterpret_d (__m256 const x) { + return _mm256_castps_pd(x); +} + +static inline __m256d reinterpret_d (__m256d const x) { + return x; +} + +#else // AVX2 emulated in vectori256e.h, AVX supported + +// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors. 
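// Example (illustrative sketch, not part of the patched sources): for normal, non-zero
// finite values the bit-manipulation helpers above satisfy
// x == sign_combine(fraction(x) * exp2(exponent(x)), x):
//     Vec4d x(6.0, -0.75, 1.0, 1024.0);
//     Vec4q e = exponent(x);                    // 2, -1, 0, 10
//     Vec4d f = fraction(x);                    // 1.5, 1.5, 1.0, 1.0
//     Vec4d y = sign_combine(f * exp2(e), x);   // equals x again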
+// If necessary, compile with -fabi-version=0 to get the latest abi version + +static inline Vec256b reinterpret_i (__m256 const x) { + Vec8f xx(x); + return Vec256b(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high())); +} + +static inline Vec256b reinterpret_i (__m256d const x) { + Vec4d xx(x); + return Vec256b(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high())); +} + +static inline __m256 reinterpret_f (__m256 const x) { + return x; +} + +static inline __m256 reinterpret_f (__m256d const x) { + return _mm256_castpd_ps(x); +} + +static inline __m256d reinterpret_d (__m256 const x) { + return _mm256_castps_pd(x); +} + +static inline __m256d reinterpret_d (__m256d const x) { + return x; +} + +static inline Vec256b reinterpret_i (Vec256b const x) { + return x; +} + +static inline __m256 reinterpret_f (Vec256b const x) { + return Vec8f(Vec4f(reinterpret_f(x.get_low())), Vec4f(reinterpret_f(x.get_high()))); +} + +static inline __m256d reinterpret_d (Vec256b const x) { + return Vec4d(Vec2d(reinterpret_d(x.get_low())), Vec2d(reinterpret_d(x.get_high()))); +} + +#endif // AVX2 + +// Function infinite4f: returns a vector where all elements are +INF +static inline Vec8f infinite8f() { + return reinterpret_f(Vec8i(0x7F800000)); +} + +// Function nan8f: returns a vector where all elements are +NAN (quiet) +static inline Vec8f nan8f(int n = 0x10) { + return nan_vec(n); +} + +// Function infinite2d: returns a vector where all elements are +INF +static inline Vec4d infinite4d() { + return reinterpret_d(Vec4q(0x7FF0000000000000)); +} + +// Function nan4d: returns a vector where all elements are +NAN (quiet) +static inline Vec4d nan4d(int n = 0x10) { + return nan_vec(n); +} + + +/***************************************************************************** +* +* Vector permute and blend functions +* +****************************************************************************** +* +* These permute functions can reorder the elements of a vector and optionally +* set some elements to zero. 
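*
* Example (illustrative, not from the upstream sources): with Vec4d a(0,1,2,3),
* permute4<3,2,1,0>(a) returns (3,2,1,0) and permute4<0,-1,2,-1>(a) returns (0,0,2,0);
* an index of -1 selects zero.
*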
See Vectori128.h for description +* +*****************************************************************************/ + +// permute vector Vec4d +template +static inline Vec4d permute4(Vec4d const a) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + __m256d y = a; // result + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm256_setzero_pd(); // just return zero + + if constexpr ((flags & perm_largeblock) != 0) { // permute 128-bit blocks + constexpr EList L = largeblock_perm<4>(indexs); // permutation pattern + constexpr int j0 = L.a[0]; + constexpr int j1 = L.a[1]; +#ifndef ZEXT_MISSING + if constexpr (j0 == 0 && j1 == -1 && !(flags & perm_addz)) { // zero extend + return _mm256_zextpd128_pd256(_mm256_castpd256_pd128(y)); + } + if constexpr (j0 == 1 && j1 < 0 && !(flags & perm_addz)) { // extract upper part, zero extend + return _mm256_zextpd128_pd256(_mm256_extractf128_pd(y, 1)); + } +#endif + if constexpr ((flags & perm_perm) != 0 && !(flags & perm_zeroing)) { + return _mm256_permute2f128_pd(y, y, (j0 & 1) | (j1 & 1) << 4); + } + } + if constexpr ((flags & perm_perm) != 0) { // permutation needed + if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in both lanes + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm256_unpackhi_pd(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm256_unpacklo_pd(y, y); + } + else { // general permute + constexpr uint8_t mm0 = (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3; + y = _mm256_permute_pd(a, mm0); // select within same lane + } + } +#if INSTRSET >= 8 // AVX2 + else if constexpr ((flags & perm_broadcast) != 0 && (flags >> perm_rot_count) == 0) { + y = _mm256_broadcastsd_pd(_mm256_castpd256_pd128(y)); // broadcast first element + } +#endif + else { // different patterns in two lanes +#if INSTRSET >= 10 // AVX512VL + if constexpr ((flags & perm_rotate_big) != 0) { // fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + constexpr uint8_t zm = zero_mask<4>(indexs); + return _mm256_castsi256_pd(_mm256_maskz_alignr_epi64 (zm, _mm256_castpd_si256(y), _mm256_castpd_si256(y), rot)); + } +#endif + if constexpr ((flags & perm_cross_lane) == 0){ // no lane crossing + constexpr uint8_t mm0 = (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3; + y = _mm256_permute_pd(a, mm0); // select within same lane + } + else { +#if INSTRSET >= 8 // AVX2 + // full permute + constexpr uint8_t mms = (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6; + y = _mm256_permute4x64_pd(a, mms); +#else + // permute lanes separately + __m256d sw = _mm256_permute2f128_pd(a,a,1);// swap the two 128-bit lanes + constexpr uint8_t mml = (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3; + __m256d y1 = _mm256_permute_pd(a, mml); // select from same lane + __m256d y2 = _mm256_permute_pd(sw, mml); // select from opposite lane + constexpr uint64_t blendm = make_bit_mask<4, 0x101>(indexs); // blend mask + y = _mm256_blend_pd(y1, y2, uint8_t(blendm)); +#endif + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_pd(zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + //y = _mm256_and_pd(_mm256_castsi256_pd( Vec4q().load(bm.a) ), y); // does not work 
with INSTRSET = 7 + __m256i bm1 = _mm256_loadu_si256((const __m256i*)(bm.a)); + y = _mm256_and_pd(_mm256_castsi256_pd(bm1), y); + +#endif + } + return y; +} + + +// permute vector Vec8f +template +static inline Vec8f permute8(Vec8f const a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m256 y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm256_setzero_ps(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<8>(indexs); // permutation pattern + y = _mm256_castpd_ps(permute4 + (Vec4d(_mm256_castps_pd(a)))); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in both lanes + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm256_unpackhi_ps(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm256_unpacklo_ps(y, y); + } + else { // general permute, same pattern in both lanes + y = _mm256_shuffle_ps(a, a, uint8_t(flags >> perm_ipattern)); + } + } +#if INSTRSET >= 10 + else if constexpr ((flags & perm_broadcast) != 0) { + constexpr uint8_t e = flags >> perm_rot_count & 0xF; // broadcast one element + if constexpr (e > 0) { + y = _mm256_castsi256_ps(_mm256_alignr_epi32( _mm256_castps_si256(y), _mm256_castps_si256(y), e)); + } + y = _mm256_broadcastss_ps(_mm256_castps256_ps128(y)); + } +#elif INSTRSET >= 8 // AVX2 + else if constexpr ((flags & perm_broadcast) != 0 && (flags >> perm_rot_count == 0)) { + y = _mm256_broadcastss_ps(_mm256_castps256_ps128(y)); // broadcast first element + } +#endif +#if INSTRSET >= 8 // avx2 + else if constexpr ((flags & perm_zext) != 0) { // zero extension + y = _mm256_castsi256_ps(_mm256_cvtepu32_epi64(_mm256_castsi256_si128(_mm256_castps_si256(y)))); // zero extension + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif +#if INSTRSET >= 10 // AVX512VL + else if constexpr ((flags & perm_compress) != 0) { + y = _mm256_maskz_compress_ps(__mmask8(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm256_maskz_expand_ps(__mmask8(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif + else { // different patterns in two lanes +#if INSTRSET >= 10 // AVX512VL + if constexpr ((flags & perm_rotate_big) != 0) { // fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + y = _mm256_castsi256_ps(_mm256_alignr_epi32(_mm256_castps_si256(y), _mm256_castps_si256(y), rot)); + } + else +#endif + if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. 
Use vpermilps + __m256 m = constant8f(); + y = _mm256_permutevar_ps(a, _mm256_castps_si256(m)); + } + else { + // full permute needed + __m256i permmask = _mm256_castps_si256( + constant8f ()); +#if INSTRSET >= 8 // AVX2 + y = _mm256_permutevar8x32_ps(a, permmask); +#else + // permute lanes separately + __m256 sw = _mm256_permute2f128_ps(a, a, 1); // swap the two 128-bit lanes + __m256 y1 = _mm256_permutevar_ps(a, permmask); // select from same lane + __m256 y2 = _mm256_permutevar_ps(sw, permmask); // select from opposite lane + constexpr uint64_t blendm = make_bit_mask<8, 0x102>(indexs); // blend mask + y = _mm256_blend_ps(y1, y2, uint8_t(blendm)); +#endif + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_ps(zero_mask<8>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + __m256i bm1 = _mm256_loadu_si256((const __m256i*)(bm.a)); + y = _mm256_and_ps(_mm256_castsi256_ps(bm1), y); +#endif + } + return y; +} + + +// blend vectors Vec4d +template +static inline Vec4d blend4(Vec4d const a, Vec4d const b) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + __m256d y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm256_setzero_pd(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute4 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + return permute4 (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint8_t mb = (uint8_t)make_bit_mask<4, 0x302>(indexs); // blend mask +#if INSTRSET >= 10 // AVX512VL + y = _mm256_mask_mov_pd (a, mb, b); +#else // AVX + y = _mm256_blend_pd(a, b, mb); // duplicate each bit +#endif + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 128-bit blocks + constexpr EList L = largeblock_perm<4>(indexs); // get 128-bit blend pattern + constexpr uint8_t pp = (L.a[0] & 0xF) | uint8_t(L.a[1] & 0xF) << 4; + y = _mm256_permute2f128_pd(a, b, pp); + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm256_unpacklo_pd (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm256_unpacklo_pd (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm256_unpackhi_pd(a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm256_unpackhi_pd(b, a); + } + else if constexpr ((flags & blend_shufab) != 0) { + y = _mm256_shuffle_pd(a, b, (flags >> blend_shufpattern) & 0xF); + } + else if constexpr ((flags & blend_shufba) != 0) { + y = _mm256_shuffle_pd(b, a, (flags >> blend_shufpattern) & 0xF); + } + else { // No special cases +#if INSTRSET >= 10 // AVX512VL. use vpermi2pd + __m256i const maskp = constant8ui(); + return _mm256_maskz_permutex2var_pd (zero_mask<4>(indexs), a, maskp, b); +#else // permute a and b separately, then blend. 
+ constexpr EList L = blend_perm_indexes<4, 0>(indexs); // get permutation indexes + __m256d ya = permute4(a); + __m256d yb = permute4(b); + constexpr uint8_t mb = (uint8_t)make_bit_mask<4, 0x302>(indexs); // blend mask + y = _mm256_blend_pd(ya, yb, mb); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_pd(zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + __m256i bm1 = _mm256_loadu_si256((const __m256i*)(bm.a)); + y = _mm256_and_pd(_mm256_castsi256_pd(bm1), y); +#endif + } + return y; +} + + +// blend vectors Vec8f +template +static inline Vec8f blend8(Vec8f const a, Vec8f const b) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m256 y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm256_setzero_ps(); // just return zero + + if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 32-bit blocks + constexpr EList L = largeblock_perm<8>(indexs); // get 32-bit blend pattern + y = _mm256_castpd_ps(blend4 + (Vec4d(_mm256_castps_pd(a)), Vec4d(_mm256_castps_pd(b)))); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute8 (a); + } + else if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<8, 2>(indexs); // get permutation indexes + return permute8 < L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15] > (b); + } + else if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint8_t mb = (uint8_t)make_bit_mask<8, 0x303>(indexs); // blend mask +#if INSTRSET >= 10 // AVX512VL + y = _mm256_mask_mov_ps(a, mb, b); +#else // AVX2 + y = _mm256_blend_ps(a, b, mb); +#endif + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm256_unpacklo_ps(a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm256_unpacklo_ps(b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm256_unpackhi_ps(a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm256_unpackhi_ps(b, a); + } + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm256_shuffle_ps(a, b, uint8_t(flags >> blend_shufpattern)); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm256_shuffle_ps(b, a, uint8_t(flags >> blend_shufpattern)); + } + else { // No special cases +#if INSTRSET >= 10 // AVX512VL. use vpermi2d + __m256i const maskp = constant8ui (); + return _mm256_maskz_permutex2var_ps(zero_mask<8>(indexs), a, maskp, b); +#else // permute a and b separately, then blend. 
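// Example (illustrative sketch, not part of the patched sources): blend indexes 0-7
// select from a, 8-15 select from b, -1 gives zero. E.g. interleaving two Vec8f:
//     Vec8f lo = blend8<0, 8, 1, 9, 2, 10, 3, 11>(a, b);   // a0,b0,a1,b1,a2,b2,a3,b3
//     Vec8f hi = blend8<4, 12, 5, 13, 6, 14, 7, 15>(a, b); // a4,b4,a5,b5,a6,b6,a7,b7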
+ constexpr EList L = blend_perm_indexes<8, 0>(indexs); // get permutation indexes + __m256 ya = permute8(a); + __m256 yb = permute8(b); + constexpr uint8_t mb = (uint8_t)make_bit_mask<8, 0x303>(indexs); // blend mask + y = _mm256_blend_ps(ya, yb, mb); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_ps(zero_mask<8>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + __m256i bm1 = _mm256_loadu_si256((const __m256i*)(bm.a)); + y = _mm256_and_ps(_mm256_castsi256_ps(bm1), y); +#endif + } + return y; +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. +* The table is given as one or more vectors or as an array. +* +*****************************************************************************/ + +static inline Vec8f lookup8(Vec8i const index, Vec8f const table) { +#if INSTRSET >= 8 // AVX2 + return _mm256_permutevar8x32_ps(table, index); + +#else // AVX + // swap low and high part of table + __m256 sw = _mm256_permute2f128_ps(table, table, 1); // swap the two 128-bit lanes + // join index parts + __m256i index2 = _mm256_insertf128_si256(_mm256_castsi128_si256(index.get_low()), index.get_high(), 1); + // permute within each 128-bit part + __m256 r0 = _mm256_permutevar_ps(table, index2); + __m256 r1 = _mm256_permutevar_ps(sw, index2); + // high index bit for blend + __m128i k1 = _mm_slli_epi32(index.get_high() ^ 4, 29); + __m128i k0 = _mm_slli_epi32(index.get_low(), 29); + __m256 kk = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), _mm_castsi128_ps(k1), 1); + // blend the two permutes + return _mm256_blendv_ps(r0, r1, kk); +#endif +} + +template +static inline Vec8f lookup(Vec8i const index, float const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 4) { + Vec4f table1 = Vec4f().load(table); + return Vec8f( + lookup4 (index.get_low(), table1), + lookup4 (index.get_high(), table1)); + } +#if INSTRSET < 8 // not AVX2 + if constexpr (n <= 8) { + return lookup8(index, Vec8f().load(table)); + } +#endif + // Limit index + Vec8ui index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec8ui(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec8ui(index), n-1); + } +#if INSTRSET >= 8 // AVX2 + return _mm256_i32gather_ps(table, index1, 4); +#else // AVX + return Vec8f(table[index1[0]],table[index1[1]],table[index1[2]],table[index1[3]], + table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]); +#endif +} + +static inline Vec4d lookup4(Vec4q const index, Vec4d const table) { +#if INSTRSET >= 10 // AVX512VL + return _mm256_permutexvar_pd(index, table); + +#elif INSTRSET >= 8 // AVX2 + // We can't use VPERMPD because it has constant indexes, vpermilpd can permute only within 128-bit lanes + // Convert the index to fit VPERMPS + Vec8i index1 = permute8<0,0,2,2,4,4,6,6> (Vec8i(index+index)); + Vec8i index2 = index1 + Vec8i(constant8ui<0,1,0,1,0,1,0,1>()); + return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castpd_ps(table), index2)); + +#else // AVX + // swap low and high part of table + __m256d sw = _mm256_permute2f128_pd(table, table, 1);// swap the two 128-bit lanes + // index << 1 + __m128i index2lo = index.get_low() + 
index.get_low(); + __m128i index2hi = index.get_high() + index.get_high(); + // join index parts + __m256i index3 = _mm256_insertf128_si256(_mm256_castsi128_si256(index2lo), index2hi, 1); + // permute within each 128-bit part + __m256d r0 = _mm256_permutevar_pd(table, index3); // permutevar_pd selects by bit 1 ! + __m256d r1 = _mm256_permutevar_pd(sw, index3); + // high index bit for blend + __m128i k1 = _mm_slli_epi64(index.get_high() ^ 2, 62); + __m128i k0 = _mm_slli_epi64(index.get_low(), 62); + __m256d kk = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_castsi128_pd(k0)), _mm_castsi128_pd(k1), 1); + // blend the two permutes + return _mm256_blendv_pd(r0, r1, kk); +#endif +} + + +template +static inline Vec4d lookup(Vec4q const index, double const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 2) { + Vec2d table1 = Vec2d().load(table); + return Vec4d( + lookup2 (index.get_low(), table1), + lookup2 (index.get_high(), table1)); + } +#if INSTRSET < 8 // not AVX2 + if constexpr (n <= 4) { + return lookup4(index, Vec4d().load(table)); + } +#endif + // Limit index + Vec4uq index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec4uq(index) & Vec4uq(n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec4uq(index), n-1); + } +#if INSTRSET >= 8 // AVX2 + return _mm256_i64gather_pd(table, index1, 8); +#else // AVX + Vec4q index2 = Vec4q(index1); + return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]); +#endif +} + + +/***************************************************************************** +* +* Gather functions with fixed indexes +* +*****************************************************************************/ +// Load elements from array a with indices i0, i1, i2, i3, .. +template +static inline Vec8f gather8f(void const * a) { + return reinterpret_f(gather8i(a)); +} + +// Load elements from array a with indices i0, i1, i2, i3 +template +static inline Vec4d gather4d(void const * a) { + return reinterpret_d(gather4q(a)); +} + +/***************************************************************************** +* +* Vector scatter functions +* +****************************************************************************** +* +* These functions write the elements of a vector to arbitrary positions in an +* array in memory. Each vector element is written to an array position +* determined by an index. An element is not written if the corresponding +* index is out of range. +* The indexes can be specified as constant template parameters or as an +* integer vector. 
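*
* Example (illustrative, not from the upstream sources): with Vec4d d(0.5, 1.5, 2.5, 3.5),
* scatter<7, 0, -1, 4>(d, arr) writes arr[7] = 0.5, arr[0] = 1.5 and arr[4] = 3.5,
* while the element whose index is -1 is not stored.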
+* +*****************************************************************************/ + +template +static inline void scatter(Vec8f const data, float * array) { +#if INSTRSET >= 10 // __AVX512VL__ + __m256i indx = constant8ui(); + __mmask8 mask = uint16_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3) | + ((i4>=0)<<4) | ((i5>=0)<<5) | ((i6>=0)<<6) | ((i7>=0)<<7)); + _mm256_mask_i32scatter_ps(array, mask, indx, data, 4); +#elif INSTRSET >= 9 // __AVX512F__ + __m512i indx = _mm512_castsi256_si512(constant8ui()); + __mmask16 mask = uint16_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3) | + ((i4>=0)<<4) | ((i5>=0)<<5) | ((i6>=0)<<6) | ((i7>=0)<<7)); + _mm512_mask_i32scatter_ps(array, mask, indx, _mm512_castps256_ps512(data), 4); +#else + const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7}; + for (int i = 0; i < 8; i++) { + if (index[i] >= 0) array[index[i]] = data[i]; + } +#endif +} + +template +static inline void scatter(Vec4d const data, double * array) { +#if INSTRSET >= 10 // __AVX512VL__ + __m128i indx = constant4ui(); + __mmask8 mask = uint8_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3)); + _mm256_mask_i32scatter_pd(array, mask, indx, data, 8); +#elif INSTRSET >= 9 // __AVX512F__ + __m256i indx = _mm256_castsi128_si256(constant4ui()); + __mmask16 mask = uint16_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3)); + _mm512_mask_i32scatter_pd(array, (__mmask8)mask, indx, _mm512_castpd256_pd512(data), 8); +#else + const int index[4] = {i0,i1,i2,i3}; + for (int i = 0; i < 4; i++) { + if (index[i] >= 0) array[index[i]] = data[i]; + } +#endif +} + + +/***************************************************************************** +* +* Scatter functions with variable indexes +* +*****************************************************************************/ + +static inline void scatter(Vec8i const index, uint32_t limit, Vec8f const data, float * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask8 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit)); + _mm256_mask_i32scatter_ps(destination, mask, index, data, 4); +#elif INSTRSET >= 9 // __AVX512F__ + __mmask16 mask = _mm512_mask_cmplt_epu32_mask(0xFFu, _mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit))); + _mm512_mask_i32scatter_ps(destination, mask, _mm512_castsi256_si512(index), _mm512_castps256_ps512(data), 4); +#else + for (int i = 0; i < 8; i++) { + if (uint32_t(index[i]) < limit) destination[index[i]] = data[i]; + } +#endif +} + +static inline void scatter(Vec4q const index, uint32_t limit, Vec4d const data, double * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask8 mask = _mm256_cmplt_epu64_mask(index, Vec4uq(uint64_t(limit))); + _mm256_mask_i64scatter_pd(destination, mask, index, data, 8); +#elif INSTRSET >= 9 // __AVX512F__ + __mmask16 mask = _mm512_mask_cmplt_epu64_mask(0xF, _mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec4uq(uint64_t(limit)))); + _mm512_mask_i64scatter_pd(destination, (__mmask8)mask, _mm512_castsi256_si512(index), _mm512_castpd256_pd512(data), 8); +#else + for (int i = 0; i < 4; i++) { + if (uint64_t(index[i]) < uint64_t(limit)) destination[index[i]] = data[i]; + } +#endif +} + +static inline void scatter(Vec4i const index, uint32_t limit, Vec4d const data, double * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask8 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit)); + _mm256_mask_i32scatter_pd(destination, mask, index, data, 8); +#elif INSTRSET >= 9 // __AVX512F__ + __mmask16 mask = _mm512_mask_cmplt_epu32_mask(0xF, 
_mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit))); + _mm512_mask_i32scatter_pd(destination, (__mmask8)mask, _mm256_castsi128_si256(index), _mm512_castpd256_pd512(data), 8); +#else + for (int i = 0; i < 4; i++) { + if (uint32_t(index[i]) < limit) destination[index[i]] = data[i]; + } +#endif +} + + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORF256_H diff --git a/DFTTest/vectorclass/vectorf256e.h b/DFTTest/VCL2/vectorf256e.h similarity index 59% rename from DFTTest/vectorclass/vectorf256e.h rename to DFTTest/VCL2/vectorf256e.h index edd3a3f..e8ce334 100644 --- a/DFTTest/vectorclass/vectorf256e.h +++ b/DFTTest/VCL2/vectorf256e.h @@ -1,12 +1,14 @@ /**************************** vectorf256e.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2017-02-19 -* Version: 1.27 -* Project: vector classes +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library * Description: -* Header file defining 256-bit floating point vector classes as interface -* to intrinsic functions. Emulated for processors without AVX instruction set. +* Header file defining 256-bit floating point vector classes +* Emulated for processors without AVX instruction set. +* +* Instructions: see vcl_manual.pdf * * The following vector classes are defined here: * Vec8f Vector of 8 single precision floating point numbers @@ -14,25 +16,28 @@ * Vec4d Vector of 4 double precision floating point numbers * Vec4db Vector of 4 Booleans for use with Vec4d * -* For detailed instructions, see VectorClass.pdf +* Each vector object is represented internally in the CPU as two 128-bit registers. +* This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. *****************************************************************************/ -// check combination of header files -#ifdef VECTORF256_H -#if VECTORF256_H != 1 -#error Two different versions of vectorf256.h included +#ifndef VECTORF256E_H +#define VECTORF256E_H 1 + +#ifndef VECTORCLASS_H +#include "vectorclass.h" #endif -#else -#define VECTORF256_H 1 -#if defined (VECTORI256_H) && VECTORI256_H >= 2 -#error wrong combination of header files. Use vectorf256.h instead of vectorf256e.h if you have AVX2 +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed #endif +#ifdef VECTORF256_H +#error Two different versions of vectorf256.h included +#endif -#include "vectorf128.h" // Define 128-bit vectors #ifdef VCL_NAMESPACE namespace VCL_NAMESPACE { @@ -43,6 +48,7 @@ namespace VCL_NAMESPACE { * base class Vec256fe and Vec256de * *****************************************************************************/ + // base class to replace __m256 when AVX is not supported class Vec256fe { protected: @@ -85,42 +91,24 @@ class Vec256de { * select functions * *****************************************************************************/ -// Select between two Vec256fe sources, element by element. Used in various functions -// and operators. Corresponds to this pseudocode: +// Select between two Vec256fe sources, element by element using broad boolean vector. +// Used in various functions and operators. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each element in s must be either 0 (false) or 0xFFFFFFFF (true). 
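// Editor's note (illustration only, not part of the patch): a broad boolean mask
// built as Vec8fb(true, false, true, false, true, false, true, false) holds the bit
// patterns 0xFFFFFFFF / 0x00000000 per element, so the select below yields
// a[0], b[1], a[2], b[3], a[4], b[5], a[6], b[7].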
-static inline Vec256fe selectf (Vec256fe const & s, Vec256fe const & a, Vec256fe const & b) { +static inline Vec256fe selectf (Vec256fe const s, Vec256fe const a, Vec256fe const b) { return Vec256fe(selectf(b.get_low(), a.get_low(), s.get_low()), selectf(b.get_high(), a.get_high(), s.get_high())); } // Same, with two Vec256de sources. // and operators. Corresponds to this pseudocode: // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; -// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other +// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other // values are allowed. -static inline Vec256de selectd (Vec256de const & s, Vec256de const & a, Vec256de const & b) { +static inline Vec256de selectd (Vec256de const s, Vec256de const a, Vec256de const b) { return Vec256de(selectd(b.get_low(), a.get_low(), s.get_low()), selectd(b.get_high(), a.get_high(), s.get_high())); } - -/***************************************************************************** -* -* Generate compile-time constant vector -* -*****************************************************************************/ -// Generate a constant vector of 8 integers stored in memory, -// load as __m256 -template -static inline Vec256fe constant8f() { - static const union { - int i[8]; - __m128 y[2]; - } u = {{i0,i1,i2,i3,i4,i5,i6,i7}}; - return Vec256fe(u.y[0], u.y[1]); -} - - /***************************************************************************** * * Vec8fb: Vector of 8 Booleans for use with Vec8f @@ -138,52 +126,46 @@ class Vec8fb : public Vec256fe { y1 = Vec4fb(b4, b5, b6, b7); } // Constructor to build from two Vec4fb: - Vec8fb(Vec4fb const & a0, Vec4fb const & a1) { + Vec8fb(Vec4fb const a0, Vec4fb const a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256fe - Vec8fb(Vec256fe const & x) { + Vec8fb(Vec256fe const x) { y0 = x.get_low(); y1 = x.get_high(); } + // Constructor to broadcast scalar value: + Vec8fb(bool b) { + y0 = y1 = Vec4fb(b); + } // Assignment operator to convert from type Vec256fe - Vec8fb & operator = (Vec256fe const & x) { + Vec8fb & operator = (Vec256fe const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } -#ifdef VECTORI256_H // 256 bit integer vectors are available // Constructor to convert from type Vec8ib used as Boolean for integer vectors - Vec8fb(Vec8ib const & x) { + Vec8fb(Vec8ib const x) { y0 = _mm_castsi128_ps(Vec8i(x).get_low()); y1 = _mm_castsi128_ps(Vec8i(x).get_high()); } // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors - Vec8fb & operator = (Vec8ib const & x) { + Vec8fb & operator = (Vec8ib const x) { y0 = _mm_castsi128_ps(Vec8i(x).get_low()); y1 = _mm_castsi128_ps(Vec8i(x).get_high()); return *this; } - // Constructor to broadcast the same value into all elements: - Vec8fb(bool b) { - y1 = y0 = Vec4fb(b); - } // Assignment operator to broadcast scalar value: Vec8fb & operator = (bool b) { y0 = y1 = Vec4fb(b); return *this; } -private: // Prevent constructing from int, etc. - Vec8fb(int b); - Vec8fb & operator = (int x); -public: // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors operator Vec8ib() const { return Vec8i(_mm_castps_si128(y0), _mm_castps_si128(y1)); } -#endif // VECTORI256_H + // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec8fb const & insert(uint32_t index, bool value) { - if (index < 4) { + Vec8fb const insert(int index, bool value) { + if ((uint32_t)index < 4) { y0 = Vec4fb(y0).insert(index, value); } else { @@ -192,9 +174,8 @@ class Vec8fb : public Vec256fe { return *this; } // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - bool extract(uint32_t index) const { - if (index < 4) { + bool extract(int index) const { + if ((uint32_t)index < 4) { return Vec4fb(y0).extract(index); } else { @@ -202,7 +183,7 @@ class Vec8fb : public Vec256fe { } } // Extract a single element. Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { + bool operator [] (int index) const { return extract(index); } // Member functions to split into two Vec4fb: @@ -212,9 +193,21 @@ class Vec8fb : public Vec256fe { Vec4fb get_high() const { return y1; } - static int size () { + // Member function to change a bitfield to a boolean vector + Vec8fb & load_bits(uint8_t a) { + y0 = Vec4fb().load_bits(a); + y1 = Vec4fb().load_bits(uint8_t(a>>4u)); + return *this; + } + static constexpr int size() { return 8; } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec8fb(int b) = delete; + Vec8fb & operator = (int x) = delete; }; @@ -225,84 +218,85 @@ class Vec8fb : public Vec256fe { *****************************************************************************/ // vector operator & : bitwise and -static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) { +static inline Vec8fb operator & (Vec8fb const a, Vec8fb const b) { return Vec8fb(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) { +static inline Vec8fb operator && (Vec8fb const a, Vec8fb const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) { +static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) { +static inline Vec8fb operator | (Vec8fb const a, Vec8fb const b) { return Vec8fb(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) { +static inline Vec8fb operator || (Vec8fb const a, Vec8fb const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) { +static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) { +static inline Vec8fb operator ^ (Vec8fb const a, Vec8fb const b) { return Vec8fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor -static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) { +static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const b) { a = a ^ b; return a; } // vector operator ~ : bitwise not -static inline Vec8fb operator ~ (Vec8fb const & a) { +static inline Vec8fb operator ~ (Vec8fb const a) { return Vec8fb(~a.get_low(), ~a.get_high()); } +// vector operator == : xnor +static inline Vec8fb operator == (Vec8fb const a, Vec8fb const b) { + return Vec8fb(Vec8fb(a) ^ Vec8fb(~b)); +} + +// 
vector operator != : xor +static inline Vec8fb operator != (Vec8fb const a, Vec8fb const b) { + return Vec8fb(a ^ b); +} + // vector operator ! : logical not // (operator ! is less efficient than operator ~. Use only where not // all bits in an element are the same) -static inline Vec8fb operator ! (Vec8fb const & a) { +static inline Vec8fb operator ! (Vec8fb const a) { return Vec8fb(!a.get_low(), !a.get_high()); } // Functions for Vec8fb // andnot: a & ~ b -static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) { +static inline Vec8fb andnot(Vec8fb const a, Vec8fb const b) { return Vec8fb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); } - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - // horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec8fb const & a) { +static inline bool horizontal_and (Vec8fb const a) { return horizontal_and(a.get_low() & a.get_high()); } // horizontal_or. Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec8fb const & a) { +static inline bool horizontal_or (Vec8fb const a) { return horizontal_or(a.get_low() | a.get_high()); } - /***************************************************************************** * * Vec4db: Vector of 4 Booleans for use with Vec4d @@ -320,52 +314,47 @@ class Vec4db : public Vec256de { y1 = Vec2db(b2, b3); } // Constructor to build from two Vec2db: - Vec4db(Vec2db const & a0, Vec2db const & a1) { + Vec4db(Vec2db const a0, Vec2db const a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256de - Vec4db(Vec256de const & x) { + Vec4db(Vec256de const x) { y0 = x.get_low(); y1 = x.get_high(); } + // Constructor to broadcast scalar value: + Vec4db(bool b) { + y0 = y1 = Vec2db(b); + } // Assignment operator to convert from type Vec256de - Vec4db & operator = (Vec256de const & x) { + Vec4db & operator = (Vec256de const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } -#ifdef VECTORI256_H // 256 bit integer vectors are available + // Constructor to convert from type Vec4qb used as Boolean for integer vectors - Vec4db(Vec4qb const & x) { + Vec4db(Vec4qb const x) { y0 = _mm_castsi128_pd(Vec4q(x).get_low()); y1 = _mm_castsi128_pd(Vec4q(x).get_high()); } // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors - Vec4db & operator = (Vec4qb const & x) { + Vec4db & operator = (Vec4qb const x) { y0 = _mm_castsi128_pd(Vec4q(x).get_low()); y1 = _mm_castsi128_pd(Vec4q(x).get_high()); return *this; } - // Constructor to broadcast the same value into all elements: - Vec4db(bool b) { - y1 = y0 = Vec2db(b); - } // Assignment operator to broadcast scalar value: Vec4db & operator = (bool b) { y0 = y1 = Vec2db(b); return *this; } -private: // Prevent constructing from int, etc. - Vec4db(int b); - Vec4db & operator = (int x); -public: // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors operator Vec4qb() const { return Vec4q(_mm_castpd_si128(y0), _mm_castpd_si128(y1)); } -#endif // VECTORI256_H + // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec4db const & insert(uint32_t index, bool value) { - if (index < 2) { + Vec4db const insert(int index, bool value) { + if ((uint32_t)index < 2) { y0 = Vec2db(y0).insert(index, value); } else { @@ -374,9 +363,8 @@ class Vec4db : public Vec256de { return *this; } // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - bool extract(uint32_t index) const { - if (index < 2) { + bool extract(int index) const { + if ((uint32_t)index < 2) { return Vec2db(y0).extract(index); } else { @@ -384,7 +372,7 @@ class Vec4db : public Vec256de { } } // Extract a single element. Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { + bool operator [] (int index) const { return extract(index); } // Member functions to split into two Vec4fb: @@ -394,9 +382,21 @@ class Vec4db : public Vec256de { Vec2db get_high() const { return y1; } - static int size () { + // Member function to change a bitfield to a boolean vector + Vec4db & load_bits(uint8_t a) { + y0 = Vec2db().load_bits(a); + y1 = Vec2db().load_bits(uint8_t(a>>2u)); + return *this; + } + static constexpr int size() { return 4; } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec4db(int b) = delete; + Vec4db & operator = (int x) = delete; }; @@ -407,83 +407,84 @@ class Vec4db : public Vec256de { *****************************************************************************/ // vector operator & : bitwise and -static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) { +static inline Vec4db operator & (Vec4db const a, Vec4db const b) { return Vec4db(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) { +static inline Vec4db operator && (Vec4db const a, Vec4db const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) { +static inline Vec4db & operator &= (Vec4db & a, Vec4db const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) { +static inline Vec4db operator | (Vec4db const a, Vec4db const b) { return Vec4db(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) { +static inline Vec4db operator || (Vec4db const a, Vec4db const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) { +static inline Vec4db & operator |= (Vec4db & a, Vec4db const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) { +static inline Vec4db operator ^ (Vec4db const a, Vec4db const b) { return Vec4db(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); - } // vector operator ^= : bitwise xor -static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) { +static inline Vec4db & operator ^= (Vec4db & a, Vec4db const b) { a = a ^ b; return a; } // vector operator ~ : bitwise not -static inline Vec4db operator ~ (Vec4db const & a) { +static inline Vec4db operator ~ (Vec4db const a) { return Vec4db(~a.get_low(), ~a.get_high()); } +// vector operator == : xnor +static inline Vec4db operator == (Vec4db const a, Vec4db const b) { + return Vec4db(Vec4db(a) ^ Vec4db(~b)); +} + +// 
vector operator != : xor +static inline Vec4db operator != (Vec4db const a, Vec4db const b) { + return Vec4db(a ^ b); +} + // vector operator ! : logical not // (operator ! is less efficient than operator ~. Use only where not // all bits in an element are the same) -static inline Vec4db operator ! (Vec4db const & a) { +static inline Vec4db operator ! (Vec4db const a) { return Vec4db(!a.get_low(), !a.get_high()); } // Functions for Vec4db // andnot: a & ~ b -static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) { +static inline Vec4db andnot(Vec4db const a, Vec4db const b) { return Vec4db(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); } - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - // horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec4db const & a) { +static inline bool horizontal_and (Vec4db const a) { return horizontal_and(a.get_low() & a.get_high()); } // horizontal_or. Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec4db const & a) { +static inline bool horizontal_or (Vec4db const a) { return horizontal_or(a.get_low() | a.get_high()); } - /***************************************************************************** * * Vec8f: Vector of 8 single precision floating point values @@ -502,18 +503,18 @@ class Vec8f : public Vec256fe { // Constructor to build from all elements: Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) { y0 = _mm_setr_ps(f0, f1, f2, f3); - y1 = _mm_setr_ps(f4, f5, f6, f7); + y1 = _mm_setr_ps(f4, f5, f6, f7); } // Constructor to build from two Vec4f: - Vec8f(Vec4f const & a0, Vec4f const & a1) { + Vec8f(Vec4f const a0, Vec4f const a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256fe - Vec8f(Vec256fe const & x) { + Vec8f(Vec256fe const x) { y0 = x.get_low(); y1 = x.get_high(); } // Assignment operator to convert from type Vec256fe - Vec8f & operator = (Vec256fe const & x) { + Vec8f & operator = (Vec256fe const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -524,8 +525,7 @@ class Vec8f : public Vec256fe { return *this; } // Member function to load from array, aligned by 32 - // You may use load_a instead of load if you are certain that p points to an address - // divisible by 32. + // You may use load_a instead of load if you are certain that p points to an address divisible by 32. Vec8f & load_a(float const * p) { y0 = _mm_load_ps(p); y1 = _mm_load_ps(p+4); @@ -536,13 +536,18 @@ class Vec8f : public Vec256fe { _mm_storeu_ps(p, y0); _mm_storeu_ps(p+4, y1); } - // Member function to store into array, aligned by 32 - // You may use store_a instead of store if you are certain that p points to an address - // divisible by 32. + // Member function storing into array, aligned by 32 + // You may use store_a instead of store if you are certain that p points to an address divisible by 32. void store_a(float * p) const { _mm_store_ps(p, y0); _mm_store_ps(p+4, y1); } + // Member function storing to aligned uncached memory (non-temporal store). + // Note: Will generate runtime error if p is not aligned by 32 + void store_nt(float * p) const { + _mm_stream_ps(p, y0); + _mm_stream_ps(p+4, y1); + } // Partial load. 
Load n elements and set the rest to 0 Vec8f & load_partial(int n, float const * p) { if (n > 0 && n <= 4) { @@ -579,9 +584,8 @@ class Vec8f : public Vec256fe { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8f const & insert(uint32_t index, float value) { - if (index < 4) { + Vec8f const insert(int index, float value) { + if ((uint32_t)index < 4) { y0 = Vec4f(y0).insert(index, value); } else { @@ -590,9 +594,8 @@ class Vec8f : public Vec256fe { return *this; } // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - float extract(uint32_t index) const { - if (index < 4) { + float extract(int index) const { + if ((uint32_t)index < 4) { return Vec4f(y0).extract(index); } else { @@ -601,7 +604,7 @@ class Vec8f : public Vec256fe { } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. - float operator [] (uint32_t index) const { + float operator [] (int index) const { return extract(index); } // Member functions to split into two Vec4f: @@ -611,9 +614,12 @@ class Vec8f : public Vec256fe { Vec4f get_high() const { return y1; } - static int size () { + static constexpr int size() { return 8; } + static constexpr int elementtype() { + return 16; + } }; @@ -624,20 +630,20 @@ class Vec8f : public Vec256fe { *****************************************************************************/ // vector operator + : add element by element -static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) { +static inline Vec8f operator + (Vec8f const a, Vec8f const b) { return Vec8f(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator + : add vector and scalar -static inline Vec8f operator + (Vec8f const & a, float b) { +static inline Vec8f operator + (Vec8f const a, float b) { return a + Vec8f(b); } -static inline Vec8f operator + (float a, Vec8f const & b) { +static inline Vec8f operator + (float a, Vec8f const b) { return Vec8f(a) + b; } // vector operator += : add -static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) { +static inline Vec8f & operator += (Vec8f & a, Vec8f const b) { a = a + b; return a; } @@ -656,26 +662,26 @@ static inline Vec8f & operator ++ (Vec8f & a) { } // vector operator - : subtract element by element -static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) { +static inline Vec8f operator - (Vec8f const a, Vec8f const b) { return Vec8f(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : subtract vector and scalar -static inline Vec8f operator - (Vec8f const & a, float b) { +static inline Vec8f operator - (Vec8f const a, float b) { return a - Vec8f(b); } -static inline Vec8f operator - (float a, Vec8f const & b) { +static inline Vec8f operator - (float a, Vec8f const b) { return Vec8f(a) - b; } // vector operator - : unary minus // Change sign bit, even for 0, INF and NAN -static inline Vec8f operator - (Vec8f const & a) { +static inline Vec8f operator - (Vec8f const a) { return Vec8f(-a.get_low(), -a.get_high()); } // vector operator -= : subtract -static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) { +static inline Vec8f & operator -= (Vec8f & a, Vec8f const b) { a = a - b; return a; } @@ -694,118 +700,118 @@ static inline Vec8f & operator -- (Vec8f & a) { } // vector operator * : multiply 
element by element -static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) { +static inline Vec8f operator * (Vec8f const a, Vec8f const b) { return Vec8f(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator * : multiply vector and scalar -static inline Vec8f operator * (Vec8f const & a, float b) { +static inline Vec8f operator * (Vec8f const a, float b) { return a * Vec8f(b); } -static inline Vec8f operator * (float a, Vec8f const & b) { +static inline Vec8f operator * (float a, Vec8f const b) { return Vec8f(a) * b; } // vector operator *= : multiply -static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) { +static inline Vec8f & operator *= (Vec8f & a, Vec8f const b) { a = a * b; return a; } // vector operator / : divide all elements by same integer -static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) { +static inline Vec8f operator / (Vec8f const a, Vec8f const b) { return Vec8f(a.get_low() / b.get_low(), a.get_high() / b.get_high()); } // vector operator / : divide vector and scalar -static inline Vec8f operator / (Vec8f const & a, float b) { +static inline Vec8f operator / (Vec8f const a, float b) { return a / Vec8f(b); } -static inline Vec8f operator / (float a, Vec8f const & b) { +static inline Vec8f operator / (float a, Vec8f const b) { return Vec8f(a) / b; } // vector operator /= : divide -static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) { +static inline Vec8f & operator /= (Vec8f & a, Vec8f const b) { a = a / b; return a; } // vector operator == : returns true for elements for which a == b -static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) { +static inline Vec8fb operator == (Vec8f const a, Vec8f const b) { return Vec8fb(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for which a != b -static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) { +static inline Vec8fb operator != (Vec8f const a, Vec8f const b) { return Vec8fb(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } // vector operator < : returns true for elements for which a < b -static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) { +static inline Vec8fb operator < (Vec8f const a, Vec8f const b) { return Vec8fb(a.get_low() < b.get_low(), a.get_high() < b.get_high()); } // vector operator <= : returns true for elements for which a <= b -static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) { +static inline Vec8fb operator <= (Vec8f const a, Vec8f const b) { return Vec8fb(a.get_low() <= b.get_low(), a.get_high() <= b.get_high()); } // vector operator > : returns true for elements for which a > b -static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) { +static inline Vec8fb operator > (Vec8f const a, Vec8f const b) { return Vec8fb(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator >= : returns true for elements for which a >= b -static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) { +static inline Vec8fb operator >= (Vec8f const a, Vec8f const b) { return Vec8fb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // Bitwise logical operators // vector operator & : bitwise and -static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) { +static inline Vec8f operator & (Vec8f const a, Vec8f const b) { return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator &= : bitwise and -static inline Vec8f & operator &= (Vec8f & a, 
Vec8f const & b) { +static inline Vec8f & operator &= (Vec8f & a, Vec8f const b) { a = a & b; return a; } // vector operator & : bitwise and of Vec8f and Vec8fb -static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) { +static inline Vec8f operator & (Vec8f const a, Vec8fb const b) { return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) { +static inline Vec8f operator & (Vec8fb const a, Vec8f const b) { return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator | : bitwise or -static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) { +static inline Vec8f operator | (Vec8f const a, Vec8f const b) { return Vec8f(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } // vector operator |= : bitwise or -static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) { +static inline Vec8f & operator |= (Vec8f & a, Vec8f const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) { +static inline Vec8f operator ^ (Vec8f const a, Vec8f const b) { return Vec8f(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor -static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) { +static inline Vec8f & operator ^= (Vec8f & a, Vec8f const b) { a = a ^ b; return a; } // vector operator ! : logical not. Returns Boolean vector -static inline Vec8fb operator ! (Vec8f const & a) { +static inline Vec8fb operator ! (Vec8f const a) { return Vec8fb(!a.get_low(), !a.get_high()); } @@ -819,194 +825,188 @@ static inline Vec8fb operator ! (Vec8f const & a) { // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed. -static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) { +static inline Vec8f select (Vec8fb const s, Vec8f const a, Vec8f const b) { return Vec8f(select(s.get_low(),a.get_low(),b.get_low()), select(s.get_high(),a.get_high(),b.get_high())); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) { +static inline Vec8f if_add (Vec8fb const f, Vec8f const a, Vec8f const b) { return a + (Vec8f(f) & b); } -// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] -static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) { +// Conditional subtract +static inline Vec8f if_sub (Vec8fb const f, Vec8f const a, Vec8f const b) { + return a - (Vec8f(f) & b); +} + +// Conditional multiply +static inline Vec8f if_mul (Vec8fb const f, Vec8f const a, Vec8f const b) { return a * select(f, b, 1.f); } +// Conditional divide +static inline Vec8f if_div (Vec8fb const f, Vec8f const a, Vec8f const b) { + return a / select(f, b, 1.f); +} // General arithmetic functions, etc. // Horizontal add: Calculates the sum of all vector elements. -static inline float horizontal_add (Vec8f const & a) { +static inline float horizontal_add (Vec8f const a) { return horizontal_add(a.get_low() + a.get_high()); } // function max: a > b ? 
a : b -static inline Vec8f max(Vec8f const & a, Vec8f const & b) { +static inline Vec8f max(Vec8f const a, Vec8f const b) { return Vec8f(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec8f min(Vec8f const & a, Vec8f const & b) { +static inline Vec8f min(Vec8f const a, Vec8f const b) { return Vec8f(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } +// NAN-safe versions of maximum and minimum are in vector_convert.h // function abs: absolute value // Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec8f abs(Vec8f const & a) { +static inline Vec8f abs(Vec8f const a) { return Vec8f(abs(a.get_low()), abs(a.get_high())); } // function sqrt: square root -static inline Vec8f sqrt(Vec8f const & a) { +static inline Vec8f sqrt(Vec8f const a) { return Vec8f(sqrt(a.get_low()), sqrt(a.get_high())); } // function square: a * a -static inline Vec8f square(Vec8f const & a) { +static inline Vec8f square(Vec8f const a) { return Vec8f(square(a.get_low()), square(a.get_high())); } // pow(Vec8f, int): -template static Vec8f pow(Vec8f const & a, TT const & n); +template static Vec8f pow(Vec8f const a, TT const n); // Raise floating point numbers to integer power n template <> -inline Vec8f pow(Vec8f const & x0, int const & n) { +inline Vec8f pow(Vec8f const x0, int const n) { return pow_template_i(x0, n); } // allow conversion from unsigned int template <> -inline Vec8f pow(Vec8f const & x0, uint32_t const & n) { +inline Vec8f pow(Vec8f const x0, uint32_t const n) { return pow_template_i(x0, (int)n); } - // Raise floating point numbers to integer power n, where n is a compile-time constant +// implement as function pow(vector, const_int) template -static inline Vec8f pow_n(Vec8f const & a) { - return Vec8f(pow_n(a.get_low()), pow_n(a.get_high())); -} - -template -static inline Vec8f pow(Vec8f const & a, Const_int_t) { - return pow_n(a); +static inline Vec8f pow(Vec8f const a, Const_int_t) { + return pow_n(a); } // function round: round to nearest integer (even). (result as float vector) -static inline Vec8f round(Vec8f const & a) { +static inline Vec8f round(Vec8f const a) { return Vec8f(round(a.get_low()), round(a.get_high())); } // function truncate: round towards zero. (result as float vector) -static inline Vec8f truncate(Vec8f const & a) { +static inline Vec8f truncate(Vec8f const a) { return Vec8f(truncate(a.get_low()), truncate(a.get_high())); } // function floor: round towards minus infinity. (result as float vector) -static inline Vec8f floor(Vec8f const & a) { +static inline Vec8f floor(Vec8f const a) { return Vec8f(floor(a.get_low()), floor(a.get_high())); } // function ceil: round towards plus infinity. (result as float vector) -static inline Vec8f ceil(Vec8f const & a) { +static inline Vec8f ceil(Vec8f const a) { return Vec8f(ceil(a.get_low()), ceil(a.get_high())); } -#ifdef VECTORI256_H // 256 bit integer vectors are available -// function round_to_int: round to nearest integer (even). (result as integer vector) -static inline Vec8i round_to_int(Vec8f const & a) { - // Note: assume MXCSR control register is set to rounding - return Vec8i(round_to_int(a.get_low()), round_to_int(a.get_high())); +// function roundi: round to nearest integer (even). (result as integer vector) +static inline Vec8i roundi(Vec8f const a) { + return Vec8i(roundi(a.get_low()), roundi(a.get_high())); } -// function truncate_to_int: round towards zero. 
(result as integer vector) -static inline Vec8i truncate_to_int(Vec8f const & a) { - return Vec8i(truncate_to_int(a.get_low()), truncate_to_int(a.get_high())); +// function truncatei: round towards zero. (result as integer vector) +static inline Vec8i truncatei(Vec8f const a) { + return Vec8i(truncatei(a.get_low()), truncatei(a.get_high())); } // function to_float: convert integer vector to float vector -static inline Vec8f to_float(Vec8i const & a) { +static inline Vec8f to_float(Vec8i const a) { return Vec8f(to_float(a.get_low()), to_float(a.get_high())); } // function to_float: convert unsigned integer vector to float vector -static inline Vec8f to_float(Vec8ui const & a) { +static inline Vec8f to_float(Vec8ui const a) { return Vec8f(to_float(a.get_low()), to_float(a.get_high())); } -#endif // VECTORI256_H - // Approximate math functions // approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11) -static inline Vec8f approx_recipr(Vec8f const & a) { +static inline Vec8f approx_recipr(Vec8f const a) { return Vec8f(approx_recipr(a.get_low()), approx_recipr(a.get_high())); } // approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11) -static inline Vec8f approx_rsqrt(Vec8f const & a) { +static inline Vec8f approx_rsqrt(Vec8f const a) { return Vec8f(approx_rsqrt(a.get_low()), approx_rsqrt(a.get_high())); } // Fused multiply and add functions // Multiply and add -static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { +static inline Vec8f mul_add(Vec8f const a, Vec8f const b, Vec8f const c) { return Vec8f(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high())); } // Multiply and subtract -static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) { +static inline Vec8f mul_sub(Vec8f const a, Vec8f const b, Vec8f const c) { return Vec8f(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high())); } // Multiply and inverse subtract -static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { +static inline Vec8f nmul_add(Vec8f const a, Vec8f const b, Vec8f const c) { return Vec8f(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high())); } -// Multiply and subtract with extra precision on the intermediate calculations, -// even if FMA instructions not supported, using Veltkamp-Dekker split -static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) { +// Multiply and subtract with extra precision on the intermediate calculations, used internally +static inline Vec8f mul_sub_x(Vec8f const a, Vec8f const b, Vec8f const c) { return Vec8f(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high())); } - // Math functions using fast bit manipulation -#ifdef VECTORI256_H // 256 bit integer vectors are available // Extract the exponent as an integer // exponent(a) = floor(log2(abs(a))); // exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 -static inline Vec8i exponent(Vec8f const & a) { +static inline Vec8i exponent(Vec8f const a) { return Vec8i(exponent(a.get_low()), exponent(a.get_high())); } -#endif -// Extract the fraction part of a floating point number -// a = 2^exponent(a) * fraction(a), except for a = 0 -// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f -static inline Vec8f fraction(Vec8f const & a) { - return Vec8f(fraction(a.get_low()), 
fraction(a.get_high())); -} - -#ifdef VECTORI256_H // 256 bit integer vectors are available // Fast calculation of pow(2,n) with n integer // n = 0 gives 1.0f // n >= 128 gives +INF // n <= -127 gives 0.0f // This function will never produce denormals, and never raise exceptions -static inline Vec8f exp2(Vec8i const & a) { +static inline Vec8f exp2(Vec8i const a) { return Vec8f(exp2(a.get_low()), exp2(a.get_high())); } -//static Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h -#endif // VECTORI256_H +//static Vec8f exp2(Vec8f const x); // defined in vectormath_exp.h +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f +static inline Vec8f fraction(Vec8f const a) { + return Vec8f(fraction(a.get_low()), fraction(a.get_high())); +} // Categorization functions @@ -1015,52 +1015,52 @@ static inline Vec8f exp2(Vec8i const & a) { // even for -0.0f, -INF and -NAN // Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb sign_bit(Vec8f const & a) { +static inline Vec8fb sign_bit(Vec8f const a) { return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high())); } // Function sign_combine: changes the sign of a when b has the sign bit set // same as select(sign_bit(b), -a, a) -static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) { +static inline Vec8f sign_combine(Vec8f const a, Vec8f const b) { return Vec8f(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high())); } -// Function is_finite: gives true for elements that are normal, denormal or zero, +// Function is_finite: gives true for elements that are normal, denormal or zero, // false for INF and NAN // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb is_finite(Vec8f const & a) { +static inline Vec8fb is_finite(Vec8f const a) { return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high())); } // Function is_inf: gives true for elements that are +INF or -INF // false for finite numbers and NAN // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb is_inf(Vec8f const & a) { +static inline Vec8fb is_inf(Vec8f const a) { return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high())); } // Function is_nan: gives true for elements that are +NAN or -NAN // false for finite numbers and +/-INF // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb is_nan(Vec8f const & a) { +static inline Vec8fb is_nan(Vec8f const a) { return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high())); } // Function is_subnormal: gives true for elements that are denormal (subnormal) // false for finite numbers, zero, NAN and INF -static inline Vec8fb is_subnormal(Vec8f const & a) { +static inline Vec8fb is_subnormal(Vec8f const a) { return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high())); } // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) // false for finite numbers, NAN and INF -static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) { +static inline Vec8fb is_zero_or_subnormal(Vec8f const a) { return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high())); } // Function infinite4f: returns a vector where all elements are +INF static inline Vec8f infinite8f() { 
- return constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>(); + return Vec8f(infinite4f(),infinite4f()); } // Function nan4f: returns a vector where all elements are +NAN (quiet) @@ -1071,7 +1071,7 @@ static inline Vec8f nan8f(int n = 0x10) { // change signs on vectors Vec8f // Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change template -static inline Vec8f change_sign(Vec8f const & a) { +inline Vec8f change_sign(Vec8f const a) { if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a; Vec4f lo = change_sign(a.get_low()); Vec4f hi = change_sign(a.get_high()); @@ -1096,20 +1096,20 @@ class Vec4d : public Vec256de { } // Constructor to build from all elements: Vec4d(double d0, double d1, double d2, double d3) { - y0 = _mm_setr_pd(d0, d1); - y1 = _mm_setr_pd(d2, d3); + y0 = _mm_setr_pd(d0, d1); + y1 = _mm_setr_pd(d2, d3); } // Constructor to build from two Vec4f: - Vec4d(Vec2d const & a0, Vec2d const & a1) { + Vec4d(Vec2d const a0, Vec2d const a1) { y0 = a0; y1 = a1; } // Constructor to convert from type Vec256de - Vec4d(Vec256de const & x) { + Vec4d(Vec256de const x) { y0 = x.get_low(); y1 = x.get_high(); } // Assignment operator to convert from type Vec256de - Vec4d & operator = (Vec256de const & x) { + Vec4d & operator = (Vec256de const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; @@ -1133,13 +1133,19 @@ class Vec4d : public Vec256de { _mm_storeu_pd(p, y0); _mm_storeu_pd(p+2, y1); } - // Member function to store into array, aligned by 32 + // Member function storing into array, aligned by 32 // You may use store_a instead of store if you are certain that p points to an address // divisible by 32 void store_a(double * p) const { _mm_store_pd(p, y0); _mm_store_pd(p+2, y1); } + // Member function storing to aligned uncached memory (non-temporal store). + // Note: Will generate runtime error if p is not aligned by 32 + void store_nt(double * p) const { + _mm_stream_pd(p, y0); + _mm_stream_pd(p+2, y1); + } // Partial load. Load n elements and set the rest to 0 Vec4d & load_partial(int n, double const * p) { if (n > 0 && n <= 2) { @@ -1173,11 +1179,10 @@ class Vec4d : public Vec256de { y1 = Vec2d(0.0); } return *this; - } + } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4d const & insert(uint32_t index, double value) { - if (index < 2) { + Vec4d const insert(int index, double value) { + if ((uint32_t)index < 2) { y0 = Vec2d(y0).insert(index, value); } else { @@ -1186,9 +1191,8 @@ class Vec4d : public Vec256de { return *this; } // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - double extract(uint32_t index) const { - if (index < 2) { + double extract(int index) const { + if ((uint32_t)index < 2) { return Vec2d(y0).extract(index); } else { @@ -1197,7 +1201,7 @@ class Vec4d : public Vec256de { } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
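// Editor's illustration (not part of the patch):
//   Vec4d v(1.0, 2.0, 3.0, 4.0);
//   double third = v[2];   // reads 3.0; to write a single lane use v.insert(2, newValue) or reload the vector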
- double operator [] (uint32_t index) const { + double operator [] (int index) const { return extract(index); } // Member functions to split into two Vec2d: @@ -1207,13 +1211,15 @@ class Vec4d : public Vec256de { Vec2d get_high() const { return y1; } - static int size () { + static constexpr int size() { return 4; } + static constexpr int elementtype() { + return 17; + } }; - /***************************************************************************** * * Operators for Vec4d @@ -1221,20 +1227,20 @@ class Vec4d : public Vec256de { *****************************************************************************/ // vector operator + : add element by element -static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) { +static inline Vec4d operator + (Vec4d const a, Vec4d const b) { return Vec4d(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator + : add vector and scalar -static inline Vec4d operator + (Vec4d const & a, double b) { +static inline Vec4d operator + (Vec4d const a, double b) { return a + Vec4d(b); } -static inline Vec4d operator + (double a, Vec4d const & b) { +static inline Vec4d operator + (double a, Vec4d const b) { return Vec4d(a) + b; } // vector operator += : add -static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) { +static inline Vec4d & operator += (Vec4d & a, Vec4d const b) { a = a + b; return a; } @@ -1253,26 +1259,26 @@ static inline Vec4d & operator ++ (Vec4d & a) { } // vector operator - : subtract element by element -static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) { +static inline Vec4d operator - (Vec4d const a, Vec4d const b) { return Vec4d(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : subtract vector and scalar -static inline Vec4d operator - (Vec4d const & a, double b) { +static inline Vec4d operator - (Vec4d const a, double b) { return a - Vec4d(b); } -static inline Vec4d operator - (double a, Vec4d const & b) { +static inline Vec4d operator - (double a, Vec4d const b) { return Vec4d(a) - b; } // vector operator - : unary minus // Change sign bit, even for 0, INF and NAN -static inline Vec4d operator - (Vec4d const & a) { +static inline Vec4d operator - (Vec4d const a) { return Vec4d(-a.get_low(), -a.get_high()); } // vector operator -= : subtract -static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) { +static inline Vec4d & operator -= (Vec4d & a, Vec4d const b) { a = a - b; return a; } @@ -1291,118 +1297,118 @@ static inline Vec4d & operator -- (Vec4d & a) { } // vector operator * : multiply element by element -static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) { +static inline Vec4d operator * (Vec4d const a, Vec4d const b) { return Vec4d(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator * : multiply vector and scalar -static inline Vec4d operator * (Vec4d const & a, double b) { +static inline Vec4d operator * (Vec4d const a, double b) { return a * Vec4d(b); } -static inline Vec4d operator * (double a, Vec4d const & b) { +static inline Vec4d operator * (double a, Vec4d const b) { return Vec4d(a) * b; } // vector operator *= : multiply -static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) { +static inline Vec4d & operator *= (Vec4d & a, Vec4d const b) { a = a * b; return a; } // vector operator / : divide all elements by same integer -static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) { +static inline Vec4d operator / (Vec4d const a, Vec4d const b) { return Vec4d(a.get_low() / 
b.get_low(), a.get_high() / b.get_high()); } // vector operator / : divide vector and scalar -static inline Vec4d operator / (Vec4d const & a, double b) { +static inline Vec4d operator / (Vec4d const a, double b) { return a / Vec4d(b); } -static inline Vec4d operator / (double a, Vec4d const & b) { +static inline Vec4d operator / (double a, Vec4d const b) { return Vec4d(a) / b; } // vector operator /= : divide -static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) { +static inline Vec4d & operator /= (Vec4d & a, Vec4d const b) { a = a / b; return a; } // vector operator == : returns true for elements for which a == b -static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) { +static inline Vec4db operator == (Vec4d const a, Vec4d const b) { return Vec4db(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for which a != b -static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) { +static inline Vec4db operator != (Vec4d const a, Vec4d const b) { return Vec4db(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } // vector operator < : returns true for elements for which a < b -static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) { +static inline Vec4db operator < (Vec4d const a, Vec4d const b) { return Vec4db(a.get_low() < b.get_low(), a.get_high() < b.get_high()); } // vector operator <= : returns true for elements for which a <= b -static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) { +static inline Vec4db operator <= (Vec4d const a, Vec4d const b) { return Vec4db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high()); } // vector operator > : returns true for elements for which a > b -static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) { +static inline Vec4db operator > (Vec4d const a, Vec4d const b) { return Vec4db(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator >= : returns true for elements for which a >= b -static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) { +static inline Vec4db operator >= (Vec4d const a, Vec4d const b) { return Vec4db(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // Bitwise logical operators // vector operator & : bitwise and -static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) { +static inline Vec4d operator & (Vec4d const a, Vec4d const b) { return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator &= : bitwise and -static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) { +static inline Vec4d & operator &= (Vec4d & a, Vec4d const b) { a = a & b; return a; } // vector operator & : bitwise and of Vec4d and Vec4db -static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) { +static inline Vec4d operator & (Vec4d const a, Vec4db const b) { return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) { +static inline Vec4d operator & (Vec4db const a, Vec4d const b) { return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } // vector operator | : bitwise or -static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) { +static inline Vec4d operator | (Vec4d const a, Vec4d const b) { return Vec4d(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } // vector operator |= : bitwise or -static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) { +static inline Vec4d & operator |= (Vec4d & a, 
Vec4d const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) { +static inline Vec4d operator ^ (Vec4d const a, Vec4d const b) { return Vec4d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor -static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) { +static inline Vec4d & operator ^= (Vec4d & a, Vec4d const b) { a = a ^ b; return a; } // vector operator ! : logical not. Returns Boolean vector -static inline Vec4db operator ! (Vec4d const & a) { +static inline Vec4db operator ! (Vec4d const a) { return Vec4db(!a.get_low(), !a.get_high()); } @@ -1415,173 +1421,164 @@ static inline Vec4db operator ! (Vec4d const & a) { // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). +// Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). // No other values are allowed. -static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) { +static inline Vec4d select (Vec4db const s, Vec4d const a, Vec4d const b) { return Vec4d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) { +static inline Vec4d if_add (Vec4db const f, Vec4d const a, Vec4d const b) { return a + (Vec4d(f) & b); } -// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] -static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) { +// Conditional subtract +static inline Vec4d if_sub (Vec4db const f, Vec4d const a, Vec4d const b) { + return a - (Vec4d(f) & b); +} + +// Conditional multiply +static inline Vec4d if_mul (Vec4db const f, Vec4d const a, Vec4d const b) { return a * select(f, b, 1.f); } +// Conditional divide +static inline Vec4d if_div (Vec4db const f, Vec4d const a, Vec4d const b) { + return a / select(f, b, 1.); +} + + // General arithmetic functions, etc. // Horizontal add: Calculates the sum of all vector elements. -static inline double horizontal_add (Vec4d const & a) { +static inline double horizontal_add (Vec4d const a) { return horizontal_add(a.get_low() + a.get_high()); } // function max: a > b ? a : b -static inline Vec4d max(Vec4d const & a, Vec4d const & b) { +static inline Vec4d max(Vec4d const a, Vec4d const b) { return Vec4d(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? 
a : b -static inline Vec4d min(Vec4d const & a, Vec4d const & b) { +static inline Vec4d min(Vec4d const a, Vec4d const b) { return Vec4d(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } +// NAN-safe versions of maximum and minimum are in vector_convert.h // function abs: absolute value // Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec4d abs(Vec4d const & a) { +static inline Vec4d abs(Vec4d const a) { return Vec4d(abs(a.get_low()), abs(a.get_high())); } // function sqrt: square root -static inline Vec4d sqrt(Vec4d const & a) { +static inline Vec4d sqrt(Vec4d const a) { return Vec4d(sqrt(a.get_low()), sqrt(a.get_high())); } // function square: a * a -static inline Vec4d square(Vec4d const & a) { +static inline Vec4d square(Vec4d const a) { return Vec4d(square(a.get_low()), square(a.get_high())); } // pow(Vec4d, int): // Raise floating point numbers to integer power n -template static Vec4d pow(Vec4d const & a, TT const & n); +template static Vec4d pow(Vec4d const a, TT const n); // Raise floating point numbers to integer power n template <> -inline Vec4d pow(Vec4d const & x0, int const & n) { +inline Vec4d pow(Vec4d const x0, int const n) { return pow_template_i(x0, n); } // allow conversion from unsigned int template <> -inline Vec4d pow(Vec4d const & x0, uint32_t const & n) { +inline Vec4d pow(Vec4d const x0, uint32_t const n) { return pow_template_i(x0, (int)n); } - // Raise floating point numbers to integer power n, where n is a compile-time constant +// implement as function pow(vector, const_int) template -static inline Vec4d pow_n(Vec4d const & a) { - return Vec4d(pow_n(a.get_low()), pow_n(a.get_high())); -} - -template -static inline Vec4d pow(Vec4d const & a, Const_int_t) { - return pow_n(a); +static inline Vec4d pow(Vec4d const a, Const_int_t) { + return pow_n(a); } // function round: round to nearest integer (even). (result as double vector) -static inline Vec4d round(Vec4d const & a) { +static inline Vec4d round(Vec4d const a) { return Vec4d(round(a.get_low()), round(a.get_high())); } // function truncate: round towards zero. (result as double vector) -static inline Vec4d truncate(Vec4d const & a) { +static inline Vec4d truncate(Vec4d const a) { return Vec4d(truncate(a.get_low()), truncate(a.get_high())); } // function floor: round towards minus infinity. (result as double vector) -static inline Vec4d floor(Vec4d const & a) { +static inline Vec4d floor(Vec4d const a) { return Vec4d(floor(a.get_low()), floor(a.get_high())); } // function ceil: round towards plus infinity. (result as double vector) -static inline Vec4d ceil(Vec4d const & a) { +static inline Vec4d ceil(Vec4d const a) { return Vec4d(ceil(a.get_low()), ceil(a.get_high())); } -// function round_to_int: round to nearest integer (even). (result as integer vector) -static inline Vec4i round_to_int(Vec4d const & a) { - // Note: assume MXCSR control register is set to rounding - return round_to_int(a.get_low(), a.get_high()); +// function round_to_int32: round to nearest integer (even). (result as integer vector) +static inline Vec4i round_to_int32(Vec4d const a) { + return round_to_int32(a.get_low(), a.get_high()); } -// function truncate_to_int: round towards zero. (result as integer vector) -static inline Vec4i truncate_to_int(Vec4d const & a) { - return truncate_to_int(a.get_low(), a.get_high()); +// function truncate_to_int32: round towards zero. 
(result as integer vector) +static inline Vec4i truncate_to_int32(Vec4d const a) { + return truncate_to_int32(a.get_low(), a.get_high()); } -#ifdef VECTORI256_H // 256 bit integer vectors are available - -// function truncate_to_int64: round towards zero. (inefficient) -static inline Vec4q truncate_to_int64(Vec4d const & a) { +// function truncatei: round towards zero. (inefficient) +static inline Vec4q truncatei(Vec4d const a) { double aa[4]; a.store(aa); return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3])); } -// function truncate_to_int64_limited: round towards zero. -// result as 64-bit integer vector, but with limited range -static inline Vec4q truncate_to_int64_limited(Vec4d const & a) { - return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high())); -} - -// function round_to_int64: round to nearest or even. (inefficient) -static inline Vec4q round_to_int64(Vec4d const & a) { - return truncate_to_int64(round(a)); -} - -// function round_to_int64_limited: round to nearest integer -// result as 64-bit integer vector, but with limited range -static inline Vec4q round_to_int64_limited(Vec4d const & a) { - return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high())); +// function roundi: round to nearest or even. (inefficient) +static inline Vec4q roundi(Vec4d const a) { + return truncatei(round(a)); } // function to_double: convert integer vector elements to double vector (inefficient) -static inline Vec4d to_double(Vec4q const & a) { +static inline Vec4d to_double(Vec4q const a) { int64_t aa[4]; a.store(aa); return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3])); } -// function to_double_limited: convert integer vector elements to double vector -// limited to abs(x) < 2^31 -static inline Vec4d to_double_limited(Vec4q const & x) { - return Vec4d(to_double_limited(x.get_low()),to_double_limited(x.get_high())); +// function to_double: convert unsigned integer vector elements to double vector (inefficient) +static inline Vec4d to_double(Vec4uq const a) { + uint64_t aa[4]; + a.store(aa); + return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3])); } -#endif // VECTORI256_H - // function to_double: convert integer vector to double vector -static inline Vec4d to_double(Vec4i const & a) { +static inline Vec4d to_double(Vec4i const a) { return Vec4d(to_double_low(a), to_double_high(a)); } // function compress: convert two Vec4d to one Vec8f -static inline Vec8f compress (Vec4d const & low, Vec4d const & high) { +static inline Vec8f compress (Vec4d const low, Vec4d const high) { return Vec8f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high())); } // Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d -static inline Vec4d extend_low (Vec8f const & a) { +static inline Vec4d extend_low (Vec8f const a) { return Vec4d(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d -static inline Vec4d extend_high (Vec8f const & a) { +static inline Vec4d extend_high (Vec8f const a) { return Vec4d(extend_low(a.get_high()), extend_high(a.get_high())); } @@ -1589,41 +1586,39 @@ static inline Vec4d extend_high (Vec8f const & a) { // Fused multiply and add functions // Multiply and add -static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { +static inline Vec4d mul_add(Vec4d const a, Vec4d const b, Vec4d const c) { return 
Vec4d(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high())); } // Multiply and subtract -static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) { +static inline Vec4d mul_sub(Vec4d const a, Vec4d const b, Vec4d const c) { return Vec4d(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high())); } // Multiply and inverse subtract -static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { +static inline Vec4d nmul_add(Vec4d const a, Vec4d const b, Vec4d const c) { return Vec4d(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high())); } -// Multiply and subtract with extra precision on the intermediate calculations, +// Multiply and subtract with extra precision on the intermediate calculations, // even if FMA instructions not supported, using Veltkamp-Dekker split -static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) { +static inline Vec4d mul_sub_x(Vec4d const a, Vec4d const b, Vec4d const c) { return Vec4d(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high())); } - // Math functions using fast bit manipulation -#ifdef VECTORI256_H // 256 bit integer vectors are available, AVX2 // Extract the exponent as an integer // exponent(a) = floor(log2(abs(a))); // exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 -static inline Vec4q exponent(Vec4d const & a) { +static inline Vec4q exponent(Vec4d const a) { return Vec4q(exponent(a.get_low()), exponent(a.get_high())); } // Extract the fraction part of a floating point number // a = 2^exponent(a) * fraction(a), except for a = 0 -// fraction(1.0) = 1.0, fraction(5.0) = 1.25 -static inline Vec4d fraction(Vec4d const & a) { +// fraction(1.0) = 1.0, fraction(5.0) = 1.25 +static inline Vec4d fraction(Vec4d const a) { return Vec4d(fraction(a.get_low()), fraction(a.get_high())); } @@ -1632,11 +1627,10 @@ static inline Vec4d fraction(Vec4d const & a) { // n >= 1024 gives +INF // n <= -1023 gives 0.0 // This function will never produce denormals, and never raise exceptions -static inline Vec4d exp2(Vec4q const & a) { +static inline Vec4d exp2(Vec4q const a) { return Vec4d(exp2(a.get_low()), exp2(a.get_high())); } -//static Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h -#endif +//static Vec4d exp2(Vec4d const x); // defined in vectormath_exp.h // Categorization functions @@ -1644,43 +1638,43 @@ static inline Vec4d exp2(Vec4q const & a) { // Function sign_bit: gives true for elements that have the sign bit set // even for -0.0, -INF and -NAN // Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false -static inline Vec4db sign_bit(Vec4d const & a) { +static inline Vec4db sign_bit(Vec4d const a) { return Vec4db(sign_bit(a.get_low()), sign_bit(a.get_high())); } // Function sign_combine: changes the sign of a when b has the sign bit set // same as select(sign_bit(b), -a, a) -static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) { +static inline Vec4d sign_combine(Vec4d const a, Vec4d const b) { return Vec4d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high())); } -// Function is_finite: gives true for elements that are normal, denormal or zero, +// Function is_finite: gives true for elements that are normal, denormal or zero, // false for INF and NAN -static inline Vec4db is_finite(Vec4d const & a) { +static 
inline Vec4db is_finite(Vec4d const a) { return Vec4db(is_finite(a.get_low()), is_finite(a.get_high())); } // Function is_inf: gives true for elements that are +INF or -INF // false for finite numbers and NAN -static inline Vec4db is_inf(Vec4d const & a) { +static inline Vec4db is_inf(Vec4d const a) { return Vec4db(is_inf(a.get_low()), is_inf(a.get_high())); } // Function is_nan: gives true for elements that are +NAN or -NAN // false for finite numbers and +/-INF -static inline Vec4db is_nan(Vec4d const & a) { +static inline Vec4db is_nan(Vec4d const a) { return Vec4db(is_nan(a.get_low()), is_nan(a.get_high())); } // Function is_subnormal: gives true for elements that are denormal (subnormal) // false for finite numbers, zero, NAN and INF -static inline Vec4db is_subnormal(Vec4d const & a) { +static inline Vec4db is_subnormal(Vec4d const a) { return Vec4db(is_subnormal(a.get_low()), is_subnormal(a.get_high())); } // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) // false for finite numbers, NAN and INF -static inline Vec4db is_zero_or_subnormal(Vec4d const & a) { +static inline Vec4db is_zero_or_subnormal(Vec4d const a) { return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high())); } @@ -1697,7 +1691,7 @@ static inline Vec4d nan4d(int n = 0x10) { // change signs on vectors Vec4d // Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change template -static inline Vec4d change_sign(Vec4d const & a) { +inline Vec4d change_sign(Vec4d const a) { if ((i0 | i1 | i2 | i3) == 0) return a; Vec2d lo = change_sign(a.get_low()); Vec2d hi = change_sign(a.get_high()); @@ -1711,39 +1705,39 @@ static inline Vec4d change_sign(Vec4d const & a) { * *****************************************************************************/ -static inline Vec256ie reinterpret_i (Vec256ie const & x) { +static inline Vec256b reinterpret_i (Vec256b const x) { return x; } -static inline Vec256ie reinterpret_i (Vec256fe const & x) { - return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); +static inline Vec256b reinterpret_i (Vec256fe const x) { + return Vec256b(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); } -static inline Vec256ie reinterpret_i (Vec256de const & x) { - return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); +static inline Vec256b reinterpret_i (Vec256de const x) { + return Vec256b(reinterpret_i(x.get_low()), reinterpret_i(x.get_high())); } -static inline Vec256fe reinterpret_f (Vec256ie const & x) { +static inline Vec256fe reinterpret_f (Vec256b const x) { return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high())); } -static inline Vec256fe reinterpret_f (Vec256fe const & x) { +static inline Vec256fe reinterpret_f (Vec256fe const x) { return x; } -static inline Vec256fe reinterpret_f (Vec256de const & x) { +static inline Vec256fe reinterpret_f (Vec256de const x) { return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high())); } -static inline Vec256de reinterpret_d (Vec256ie const & x) { +static inline Vec256de reinterpret_d (Vec256b const x) { return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high())); } -static inline Vec256de reinterpret_d (Vec256fe const & x) { +static inline Vec256de reinterpret_d (Vec256fe const x) { return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high())); } -static inline Vec256de reinterpret_d (Vec256de const & x) { +static inline Vec256de reinterpret_d (Vec256de const x) { return x; 
} @@ -1754,185 +1748,43 @@ static inline Vec256de reinterpret_d (Vec256de const & x) { * ****************************************************************************** * -* The permute function can reorder the elements of a vector and optionally -* set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select. An index of -1 will generate zero. An index of -256 means don't care. -* -* Example: -* Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13) -* Vec4d b; -* b = permute4d<1,0,-1,3>(a); // b is (11, 10, 0, 13) -* -* -* The blend function can mix elements from two different vectors and -* optionally set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select, where indexes 0 - 3 indicate an element from the first source -* vector and indexes 4 - 7 indicate an element from the second source vector. -* A negative index will generate zero. -* +* These permute functions can reorder the elements of a vector and optionally +* set some elements to zero. See Vectori128.h for description * -* Example: -* Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13) -* Vec4d b(20., 21., 22., 23.); // a is (20, 21, 22, 23) -* Vec4d c; -* c = blend4d<4,3,7,-1> (a,b); // c is (20, 13, 23, 0) *****************************************************************************/ // permute vector Vec4d template -static inline Vec4d permute4d(Vec4d const & a) { - return Vec4d(blend2d (a.get_low(), a.get_high()), - blend2d (a.get_low(), a.get_high())); +static inline Vec4d permute4(Vec4d const a) { + return Vec4d(blend2 (a.get_low(), a.get_high()), + blend2 (a.get_low(), a.get_high())); } -// helper function used below -template -static inline Vec2d select4(Vec4d const & a, Vec4d const & b) { - switch (n) { - case 0: - return a.get_low(); - case 1: - return a.get_high(); - case 2: - return b.get_low(); - case 3: - return b.get_high(); - } - return _mm_setzero_pd(); -} - -// blend vectors Vec4d -template -static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) { - const int j0 = i0 >= 0 ? i0/2 : i0; - const int j1 = i1 >= 0 ? i1/2 : i1; - const int j2 = i2 >= 0 ? i2/2 : i2; - const int j3 = i3 >= 0 ? i3/2 : i3; - Vec2d x0, x1; - - if (j0 == j1 || i0 < 0 || i1 < 0) { // both from same - const int k0 = j0 >= 0 ? j0 : j1; - x0 = permute2d (select4 (a,b)); - } - else { - x0 = blend2d (select4(a,b), select4(a,b)); - } - if (j2 == j3 || i2 < 0 || i3 < 0) { // both from same - const int k1 = j2 >= 0 ? 
j2 : j3; - x1 = permute2d (select4 (a,b)); - } - else { - x1 = blend2d (select4(a,b), select4(a,b)); - } - return Vec4d(x0,x1); -} - -/***************************************************************************** -* -* Vector Vec8f permute and blend functions -* -*****************************************************************************/ - // permute vector Vec8f template -static inline Vec8f permute8f(Vec8f const & a) { - return Vec8f(blend4f (a.get_low(), a.get_high()), - blend4f (a.get_low(), a.get_high())); +static inline Vec8f permute8(Vec8f const a) { + return Vec8f(blend4 (a.get_low(), a.get_high()), + blend4 (a.get_low(), a.get_high())); } -// helper function used below -template -static inline Vec4f select4(Vec8f const & a, Vec8f const & b) { - switch (n) { - case 0: - return a.get_low(); - case 1: - return a.get_high(); - case 2: - return b.get_low(); - case 3: - return b.get_high(); - } - return _mm_setzero_ps(); + +// blend vectors Vec4d +template +static inline Vec4d blend4(Vec4d const a, Vec4d const b) { + Vec2d x0 = blend_half(a, b); + Vec2d x1 = blend_half(a, b); + return Vec4d(x0, x1); } // blend vectors Vec8f template -static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) { - const int j0 = i0 >= 0 ? i0/4 : i0; - const int j1 = i1 >= 0 ? i1/4 : i1; - const int j2 = i2 >= 0 ? i2/4 : i2; - const int j3 = i3 >= 0 ? i3/4 : i3; - const int j4 = i4 >= 0 ? i4/4 : i4; - const int j5 = i5 >= 0 ? i5/4 : i5; - const int j6 = i6 >= 0 ? i6/4 : i6; - const int j7 = i7 >= 0 ? i7/4 : i7; - Vec4f x0, x1; - - const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3; - const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7; - const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3; - const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7; - - // Combine all the indexes into a single bitfield, with 4 bits for each - const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; - - // Mask to zero out negative indexes - const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - if (r0 < 0) { - x0 = _mm_setzero_ps(); - } - else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) { - // i0 - i3 all from same source - x0 = permute4f (select4 (a,b)); - } - else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) { - // i0 - i3 all from two sources - const int k0 = i0 >= 0 ? i0 & 3 : i0; - const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0); - const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0); - const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0); - x0 = blend4f (select4(a,b), select4(a,b)); - } - else { - // i0 - i3 from three or four different sources - x0 = blend4f<0,1,6,7> ( - blend4f (select4(a,b), select4(a,b)), - blend4f<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4(a,b), select4(a,b))); - } - - if (r1 < 0) { - x1 = _mm_setzero_ps(); - } - else if (((m1 ^ r1*0x44440000u) & 0xCCCC0000 & mz) == 0) { - // i4 - i7 all from same source - x1 = permute4f (select4 (a,b)); - } - else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) { - // i4 - i7 all from two sources - const int k4 = i4 >= 0 ? i4 & 3 : i4; - const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0); - const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0); - const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 
4 : 0); - x1 = blend4f (select4(a,b), select4(a,b)); - } - else { - // i4 - i7 from three or four different sources - x1 = blend4f<0,1,6,7> ( - blend4f (select4(a,b), select4(a,b)), - blend4f<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4(a,b), select4(a,b))); - } - - return Vec8f(x0,x1); +static inline Vec8f blend8(Vec8f const a, Vec8f const b) { + Vec4f x0 = blend_half(a, b); + Vec4f x1 = blend_half(a, b); + return Vec8f(x0, x1); } + /***************************************************************************** * * Vector lookup functions @@ -1942,47 +1794,29 @@ static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) { * These functions use vector elements as indexes into a table. * The table is given as one or more vectors or as an array. * -* This can be used for several purposes: -* - table lookup -* - permute or blend with variable indexes -* - blend from more than two sources -* - gather non-contiguous data -* -* An index out of range may produce any value - the actual value produced is -* implementation dependent and may be different for different instruction -* sets. An index out of range does not produce an error message or exception. -* -* Example: -* Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3) -* Vec4f b(1.0f,1.1f,1.2f,1.3f); // table b is (1.0, 1.1, 1.2, 1.3) -* Vec4f c; -* c = lookup4 (a,b); // result c is (1.2, 1.0, 1.0, 1.3) -* *****************************************************************************/ -#ifdef VECTORI256_H // Vec8i and Vec4q must be defined - -static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) { +static inline Vec8f lookup8(Vec8i const index, Vec8f const table) { Vec4f r0 = lookup8(index.get_low() , table.get_low(), table.get_high()); Vec4f r1 = lookup8(index.get_high(), table.get_low(), table.get_high()); return Vec8f(r0, r1); } template -static inline Vec8f lookup(Vec8i const & index, float const * table) { - if (n <= 0) return 0; - if (n <= 4) { - Vec4f table1 = Vec4f().load(table); - return Vec8f( +static inline Vec8f lookup(Vec8i const index, float const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 4) { + Vec4f table1 = Vec4f().load(table); + return Vec8f( lookup4 (index.get_low(), table1), lookup4 (index.get_high(), table1)); } - if (n <= 8) { + if constexpr (n <= 8) { return lookup8(index, Vec8f().load(table)); } // Limit index Vec8ui index1; - if ((n & (n-1)) == 0) { + if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec8ui(index) & (n-1); } @@ -1994,35 +1828,53 @@ static inline Vec8f lookup(Vec8i const & index, float const * table) { table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]); } -static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) { +static inline Vec4d lookup4(Vec4q const index, Vec4d const table) { Vec2d r0 = lookup4(index.get_low() , table.get_low(), table.get_high()); Vec2d r1 = lookup4(index.get_high(), table.get_low(), table.get_high()); return Vec4d(r0, r1); } template -static inline Vec4d lookup(Vec4q const & index, double const * table) { - if (n <= 0) return 0; - if (n <= 2) { - Vec2d table1 = Vec2d().load(table); - return Vec4d( +static inline Vec4d lookup(Vec4q const index, double const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 2) { + Vec2d table1 = Vec2d().load(table); + return Vec4d( lookup2 (index.get_low(), table1), lookup2 (index.get_high(), table1)); } // Limit index Vec8ui index1; - if ((n & (n-1)) == 0) { + if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index 
modulo n - index1 = Vec8ui(index) & constant8i(); + index1 = Vec8ui(index) & Vec8ui(n-1, 0, n-1, 0, n-1, 0, n-1, 0); } else { // n is not a power of 2, limit to n-1 - index1 = min(Vec8ui(index), constant8i() ); + index1 = min(Vec8ui(index), Vec8ui(n-1, 0, n-1, 0, n-1, 0, n-1, 0)); } Vec4q index2 = Vec4q(index1); return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]); } -#endif // VECTORI256_H + + +/***************************************************************************** +* +* Gather functions with fixed indexes +* +*****************************************************************************/ +// Load elements from array a with indices i0, i1, i2, i3, .. +template +static inline Vec8f gather8f(void const * a) { + return reinterpret_f(gather8i(a)); +} + +// Load elements from array a with indices i0, i1, i2, i3 +template +static inline Vec4d gather4d(void const * a) { + return reinterpret_d(gather4q(a)); +} + /***************************************************************************** * @@ -2031,27 +1883,16 @@ static inline Vec4d lookup(Vec4q const & index, double const * table) { ****************************************************************************** * * These functions write the elements of a vector to arbitrary positions in an -* array in memory. Each vector element is written to an array position +* array in memory. Each vector element is written to an array position * determined by an index. An element is not written if the corresponding * index is out of range. * The indexes can be specified as constant template parameters or as an * integer vector. -* -* The scatter functions are useful if the data are distributed in a sparce -* manner into the array. If the array is dense then it is more efficient -* to permute the data into the right positions and then write the whole -* permuted vector into the array. 
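The lookup and fixed-index gather helpers above boil down to small table lookups. A minimal sketch of how they are typically called (illustrative only, not part of the patch; assumes vectorclass.h is included, and the names lookup_gather_demo, table and idx are mine):

// Illustrative only, not part of the patch. Variable-index table lookup with
// lookup<n>() and a fixed-index gather with gather8f(), both declared above.
static inline void lookup_gather_demo() {
    float table[8] = { 0.f, 10.f, 20.f, 30.f, 40.f, 50.f, 60.f, 70.f };
    Vec8i idx(3, 3, 0, 7, 1, 2, 6, 5);                  // per-element indexes into table
    Vec8f a = lookup<8>(idx, table);                    // a = (30, 30, 0, 70, 10, 20, 60, 50)
    Vec8f b = gather8f<7, 6, 5, 4, 3, 2, 1, 0>(table);  // b = table reversed
    (void)a; (void)b;                                   // silence unused-variable warnings
}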
-* -* Example: -* Vec8d a(10,11,12,13,14,15,16,17); -* double b[16] = {0}; -* scatter<0,2,14,10,1,-1,5,9>(a,b); -* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0} * *****************************************************************************/ template -static inline void scatter(Vec8f const & data, float * array) { +static inline void scatter(Vec8f const data, float * array) { const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7}; for (int i = 0; i < 8; i++) { if (index[i] >= 0) array[index[i]] = data[i]; @@ -2059,55 +1900,33 @@ static inline void scatter(Vec8f const & data, float * array) { } template -static inline void scatter(Vec4d const & data, double * array) { +static inline void scatter(Vec4d const data, double * array) { const int index[4] = {i0,i1,i2,i3}; for (int i = 0; i < 4; i++) { if (index[i] >= 0) array[index[i]] = data[i]; } } -static inline void scatter(Vec8i const & index, uint32_t limit, Vec8f const & data, float * array) { +// scatter functions with variable indexes + +static inline void scatter(Vec8i const index, uint32_t limit, Vec8f const data, float * destination) { for (int i = 0; i < 8; i++) { - if (uint32_t(index[i]) < limit) array[index[i]] = data[i]; + if (uint32_t(index[i]) < limit) destination[index[i]] = data[i]; } } -static inline void scatter(Vec4q const & index, uint32_t limit, Vec4d const & data, double * array) { +static inline void scatter(Vec4q const index, uint32_t limit, Vec4d const data, double * destination) { for (int i = 0; i < 4; i++) { - if (uint64_t(index[i]) < uint64_t(limit)) array[index[i]] = data[i]; + if (uint64_t(index[i]) < uint64_t(limit)) destination[index[i]] = data[i]; } -} +} -static inline void scatter(Vec4i const & index, uint32_t limit, Vec4d const & data, double * array) { +static inline void scatter(Vec4i const index, uint32_t limit, Vec4d const data, double * destination) { for (int i = 0; i < 4; i++) { - if (uint32_t(index[i]) < limit) array[index[i]] = data[i]; + if (uint32_t(index[i]) < limit) destination[index[i]] = data[i]; } -} - -/***************************************************************************** -* -* Horizontal scan functions -* -*****************************************************************************/ - -// Get index to the first element that is true. 
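The variable-index scatter overloads above take an explicit limit so that out-of-range indexes are skipped instead of written. A minimal sketch (illustrative only, not part of the patch; the function and array names are mine):

// Illustrative only, not part of the patch. Bounded scatter with variable
// indexes: an element whose index is >= limit is simply not stored.
static inline void scatter_demo() {
    double dst[6] = { 0 };
    Vec4d data(10., 11., 12., 13.);
    Vec4q idx(5, 2, 9, 0);         // index 9 is out of range for limit = 6
    scatter(idx, 6, data, dst);    // dst becomes {13, 0, 11, 0, 0, 10}
}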
Return -1 if all are false - -static inline int horizontal_find_first(Vec8fb const & x) { - return horizontal_find_first(Vec8ib(x)); } -static inline int horizontal_find_first(Vec4db const & x) { - return horizontal_find_first(Vec4qb(x)); -} - -// Count the number of elements that are true -static inline uint32_t horizontal_count(Vec8fb const & x) { - return horizontal_count(Vec8ib(x)); -} - -static inline uint32_t horizontal_count(Vec4db const & x) { - return horizontal_count(Vec4qb(x)); -} /***************************************************************************** * @@ -2116,27 +1935,17 @@ static inline uint32_t horizontal_count(Vec4db const & x) { *****************************************************************************/ // to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec8fb const & x) { - return to_bits(Vec8ib(x)); -} - -// to_Vec8fb: convert integer bitfield to boolean vector -static inline Vec8fb to_Vec8fb(uint8_t x) { - return Vec8fb(to_Vec8ib(x)); +static inline uint8_t to_bits(Vec8fb const x) { + return to_bits(Vec8ib(reinterpret_i(x))); } // to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4db const & x) { - return to_bits(Vec4qb(x)); -} - -// to_Vec4db: convert integer bitfield to boolean vector -static inline Vec4db to_Vec4db(uint8_t x) { - return Vec4db(to_Vec4qb(x)); +static inline uint8_t to_bits(Vec4db const x) { + return to_bits(Vec4qb(reinterpret_i(x))); } #ifdef VCL_NAMESPACE } #endif -#endif // VECTORF256_H +#endif // VECTORF256E_H diff --git a/DFTTest/VCL2/vectorf512.h b/DFTTest/VCL2/vectorf512.h new file mode 100644 index 0000000..ea6d7b3 --- /dev/null +++ b/DFTTest/VCL2/vectorf512.h @@ -0,0 +1,2048 @@ +/**************************** vectorf512.h ******************************* +* Author: Agner Fog +* Date created: 2014-07-23 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library +* Description: +* Header file defining 512-bit floating point vector classes +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec16f Vector of 16 single precision floating point numbers +* Vec16fb Vector of 16 Booleans for use with Vec16f +* Vec8d Vector of 8 double precision floating point numbers +* Vec8db Vector of 8 Booleans for use with Vec8d +* +* Each vector object is represented internally in the CPU a 512-bit register. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2014-2020 Agner Fog. +* Apache License version 2.0 or later. 
+*****************************************************************************/ + +#ifndef VECTORF512_H +#define VECTORF512_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +#ifdef VECTORF512E_H +#error Two different versions of vectorf512.h included +#endif + +#include "vectori512.h" + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + + +/***************************************************************************** +* +* Vec16fb: Vector of 16 Booleans for use with Vec16f +* Vec8db: Vector of 8 Booleans for use with Vec8d +* +*****************************************************************************/ + +typedef Vec16b Vec16fb; +typedef Vec8b Vec8db; + +#if INSTRSET == 9 // special cases of mixed compact and broad vectors +inline Vec16b::Vec16b(Vec8ib const x0, Vec8ib const x1) { + mm = to_bits(x0) | uint16_t(to_bits(x1) << 8); +} +inline Vec16b::Vec16b(Vec8fb const x0, Vec8fb const x1) { + mm = to_bits(x0) | uint16_t(to_bits(x1) << 8); +} +inline Vec8b::Vec8b(Vec4qb const x0, Vec4qb const x1) { + mm = to_bits(x0) | (to_bits(x1) << 4); +} +inline Vec8b::Vec8b(Vec4db const x0, Vec4db const x1) { + mm = to_bits(x0) | (to_bits(x1) << 4); +} + +inline Vec8ib Vec16b::get_low() const { + return Vec8ib().load_bits(uint8_t(mm)); +} +inline Vec8ib Vec16b::get_high() const { + return Vec8ib().load_bits(uint8_t((uint16_t)mm >> 8u)); +} +inline Vec4qb Vec8b::get_low() const { + return Vec4qb().load_bits(mm & 0xF); +} +inline Vec4qb Vec8b::get_high() const { + return Vec4qb().load_bits(mm >> 4u); +} + +#endif + + +/***************************************************************************** +* +* Vec16f: Vector of 16 single precision floating point values +* +*****************************************************************************/ + +class Vec16f { +protected: + __m512 zmm; // Float vector +public: + // Default constructor: + Vec16f() { + } + // Constructor to broadcast the same value into all elements: + Vec16f(float f) { + zmm = _mm512_set1_ps(f); + } + // Constructor to build from all elements: + Vec16f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7, + float f8, float f9, float f10, float f11, float f12, float f13, float f14, float f15) { + zmm = _mm512_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15); + } + // Constructor to build from two Vec8f: + Vec16f(Vec8f const a0, Vec8f const a1) { + zmm = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(_mm512_castps256_ps512(a0)), _mm256_castps_pd(a1), 1)); + } + // Constructor to convert from type __m512 used in intrinsics: + Vec16f(__m512 const x) { + zmm = x; + } + // Assignment operator to convert from type __m512 used in intrinsics: + Vec16f & operator = (__m512 const x) { + zmm = x; + return *this; + } + // Type cast operator to convert to __m512 used in intrinsics + operator __m512() const { + return zmm; + } + // Member function to load from array (unaligned) + Vec16f & load(float const * p) { + zmm = _mm512_loadu_ps(p); + return *this; + } + // Member function to load from array, aligned by 64 + // You may use load_a instead of load if you are certain that p points to an address divisible by 64 + Vec16f & load_a(float const * p) { + zmm = _mm512_load_ps(p); + return *this; + } + // Member function to store into array (unaligned) + void store(float * p) const { + _mm512_storeu_ps(p, zmm); + } + // Member function storing into array, aligned by 64 + // You 
may use store_a instead of store if you are certain that p points to an address divisible by 64 + void store_a(float * p) const { + _mm512_store_ps(p, zmm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 16 + void store_nt(float * p) const { + _mm512_stream_ps(p, zmm); + } + // Partial load. Load n elements and set the rest to 0 + Vec16f & load_partial(int n, float const * p) { + zmm = _mm512_maskz_loadu_ps(__mmask16((1 << n) - 1), p); + return *this; + } + // Partial store. Store n elements + void store_partial(int n, float * p) const { + _mm512_mask_storeu_ps(p, __mmask16((1 << n) - 1), zmm); + } + // cut off vector to n elements. The last 8-n elements are set to zero + Vec16f & cutoff(int n) { + zmm = _mm512_maskz_mov_ps(__mmask16((1 << n) - 1), zmm); + return *this; + } + // Member function to change a single element in vector + Vec16f const insert(int index, float value) { + zmm = _mm512_mask_broadcastss_ps(zmm, __mmask16(1u << index), _mm_set_ss(value)); + return *this; + } + // Member function extract a single element from vector + float extract(int index) const { + __m512 x = _mm512_maskz_compress_ps(__mmask16(1u << index), zmm); + return _mm512_cvtss_f32(x); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + float operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4f: + Vec8f get_low() const { + return _mm512_castps512_ps256(zmm); + } + Vec8f get_high() const { + return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(zmm),1)); + } + static constexpr int size() { + return 16; + } + static constexpr int elementtype() { + return 16; + } + typedef __m512 registertype; +}; + + +/***************************************************************************** +* +* Operators for Vec16f +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec16f operator + (Vec16f const a, Vec16f const b) { + return _mm512_add_ps(a, b); +} + +// vector operator + : add vector and scalar +static inline Vec16f operator + (Vec16f const a, float b) { + return a + Vec16f(b); +} +static inline Vec16f operator + (float a, Vec16f const b) { + return Vec16f(a) + b; +} + +// vector operator += : add +static inline Vec16f & operator += (Vec16f & a, Vec16f const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec16f operator ++ (Vec16f & a, int) { + Vec16f a0 = a; + a = a + 1.0f; + return a0; +} + +// prefix operator ++ +static inline Vec16f & operator ++ (Vec16f & a) { + a = a + 1.0f; + return a; +} + +// vector operator - : subtract element by element +static inline Vec16f operator - (Vec16f const a, Vec16f const b) { + return _mm512_sub_ps(a, b); +} + +// vector operator - : subtract vector and scalar +static inline Vec16f operator - (Vec16f const a, float b) { + return a - Vec16f(b); +} +static inline Vec16f operator - (float a, Vec16f const b) { + return Vec16f(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec16f operator - (Vec16f const a) { + return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) ^ 0x80000000); +} + 
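As a quick orientation for the new 512-bit class: the load/store members and the operators defined above let 16-wide code read like scalar code. A minimal sketch (illustrative only, not part of the patch; assumes AVX-512F and that vectorclass.h is included, and the function name is mine):

// Illustrative only, not part of the patch. Process 16 floats at a time with
// the Vec16f load/store members and the operators defined above.
static inline void add_offset16(float * p, float offset) {
    Vec16f v;
    v.load(p);      // unaligned load of 16 floats
    v += offset;    // element-wise add via operator += and the float constructor
    v.store(p);     // unaligned store back to the same location
}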
+// vector operator -= : subtract +static inline Vec16f & operator -= (Vec16f & a, Vec16f const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec16f operator -- (Vec16f & a, int) { + Vec16f a0 = a; + a = a - 1.0f; + return a0; +} + +// prefix operator -- +static inline Vec16f & operator -- (Vec16f & a) { + a = a - 1.0f; + return a; +} + +// vector operator * : multiply element by element +static inline Vec16f operator * (Vec16f const a, Vec16f const b) { + return _mm512_mul_ps(a, b); +} + +// vector operator * : multiply vector and scalar +static inline Vec16f operator * (Vec16f const a, float b) { + return a * Vec16f(b); +} +static inline Vec16f operator * (float a, Vec16f const b) { + return Vec16f(a) * b; +} + +// vector operator *= : multiply +static inline Vec16f & operator *= (Vec16f & a, Vec16f const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec16f operator / (Vec16f const a, Vec16f const b) { + return _mm512_div_ps(a, b); +} + +// vector operator / : divide vector and scalar +static inline Vec16f operator / (Vec16f const a, float b) { + return a / Vec16f(b); +} +static inline Vec16f operator / (float a, Vec16f const b) { + return Vec16f(a) / b; +} + +// vector operator /= : divide +static inline Vec16f & operator /= (Vec16f & a, Vec16f const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec16fb operator == (Vec16f const a, Vec16f const b) { +// return _mm512_cmpeq_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b, 0); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec16fb operator != (Vec16f const a, Vec16f const b) { +// return _mm512_cmpneq_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b, 4); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec16fb operator < (Vec16f const a, Vec16f const b) { +// return _mm512_cmplt_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b, 1); +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec16fb operator <= (Vec16f const a, Vec16f const b) { +// return _mm512_cmple_ps_mask(a, b); + return _mm512_cmp_ps_mask(a, b, 2); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec16fb operator > (Vec16f const a, Vec16f const b) { + return _mm512_cmp_ps_mask(a, b, 6); +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec16fb operator >= (Vec16f const a, Vec16f const b) { + return _mm512_cmp_ps_mask(a, b, 5); +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec16f operator & (Vec16f const a, Vec16f const b) { + return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) & Vec16i(_mm512_castps_si512(b))); +} + +// vector operator &= : bitwise and +static inline Vec16f & operator &= (Vec16f & a, Vec16f const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec16f and Vec16fb +static inline Vec16f operator & (Vec16f const a, Vec16fb const b) { + return _mm512_maskz_mov_ps(b, a); +} +static inline Vec16f operator & (Vec16fb const a, Vec16f const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec16f operator | (Vec16f const a, Vec16f const b) { + return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) | Vec16i(_mm512_castps_si512(b))); +} + +// vector operator |= : bitwise or +static inline Vec16f & operator |= (Vec16f & a, 
Vec16f const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16f operator ^ (Vec16f const a, Vec16f const b) { + return _mm512_castsi512_ps(Vec16i(_mm512_castps_si512(a)) ^ Vec16i(_mm512_castps_si512(b))); +} + +// vector operator ^= : bitwise xor +static inline Vec16f & operator ^= (Vec16f & a, Vec16f const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec16fb operator ! (Vec16f const a) { + return a == Vec16f(0.0f); +} + + +/***************************************************************************** +* +* Functions for Vec16f +* +*****************************************************************************/ + +static inline Vec16f zero_16f() { + return _mm512_setzero_ps(); +} + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16f select (Vec16fb const s, Vec16f const a, Vec16f const b) { + return _mm512_mask_mov_ps(b, s, a); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec16f if_add (Vec16fb const f, Vec16f const a, Vec16f const b) { + return _mm512_mask_add_ps(a, f, a, b); +} + +// Conditional subtract +static inline Vec16f if_sub (Vec16fb const f, Vec16f const a, Vec16f const b) { + return _mm512_mask_sub_ps(a, f, a, b); +} + +// Conditional multiply +static inline Vec16f if_mul (Vec16fb const f, Vec16f const a, Vec16f const b) { + return _mm512_mask_mul_ps(a, f, a, b); +} + +// Conditional divide +static inline Vec16f if_div (Vec16fb const f, Vec16f const a, Vec16f const b) { + return _mm512_mask_div_ps(a, f, a, b); +} + + +// sign functions + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0f, -INF and -NAN +// Note that sign_bit(Vec16f(-0.0f)) gives true, while Vec16f(-0.0f) < Vec16f(0.0f) gives false +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec16fb sign_bit(Vec16f const a) { + Vec16i t1 = _mm512_castps_si512(a); // reinterpret as 32-bit integer + return Vec16fb(t1 < 0); +} + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec16f sign_combine(Vec16f const a, Vec16f const b) { + // return a ^ (b & Vec16f(-0.0f)); + return _mm512_castsi512_ps (_mm512_ternarylogic_epi32( + _mm512_castps_si512(a), _mm512_castps_si512(b), Vec16i(0x80000000), 0x78)); +} + +// Categorization functions + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec16fb is_finite(Vec16f const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + __mmask16 f = _mm512_fpclass_ps_mask(a, 0x99); + return _mm512_knot(f); +#else + Vec16i t1 = _mm512_castps_si512(a); // reinterpret as 32-bit integer + Vec16i t2 = t1 << 1; // shift out sign bit + Vec16ib t3 = Vec16i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s + return Vec16fb(t3); +#endif +} + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec16fb is_inf(Vec16f const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_fpclass_ps_mask(a, 0x18); +#else + Vec16i t1 = _mm512_castps_si512(a); // reinterpret as 
32-bit integer + Vec16i t2 = t1 << 1; // shift out sign bit + return Vec16fb(t2 == 0xFF000000); // exponent is all 1s, fraction is 0 +#endif +} + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +#if INSTRSET >= 10 +static inline Vec16fb is_nan(Vec16f const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return _mm512_fpclass_ps_mask(a, 0x81); +} +//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +//__attribute__((optimize("-fno-unsafe-math-optimizations"))) +//static inline Vec16fb is_nan(Vec16f const a) { +// return a != a; // not safe with -ffinite-math-only compiler option +//} +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER) +static inline Vec16fb is_nan(Vec16f const a) { + __m512 aa = a; + __mmask16 unordered; + __asm volatile("vcmpps $3, %1, %1, %0" : "=Yk" (unordered) : "v" (aa) ); + return Vec16fb(unordered); +} +#else +static inline Vec16fb is_nan(Vec16f const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return Vec16fb().load_bits(_mm512_cmp_ps_mask(a, a, 3)); // compare unordered + // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option +} +#endif + + +// Function is_subnormal: gives true for elements that are denormal (subnormal) +// false for finite numbers, zero, NAN and INF +static inline Vec16fb is_subnormal(Vec16f const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_fpclass_ps_mask(a, 0x20); +#else + Vec16i t1 = _mm512_castps_si512(a); // reinterpret as 32-bit integer + Vec16i t2 = t1 << 1; // shift out sign bit + Vec16i t3 = 0xFF000000; // exponent mask + Vec16i t4 = t2 & t3; // exponent + Vec16i t5 = _mm512_andnot_si512(t3,t2);// fraction + return Vec16fb(t4 == 0 && t5 != 0); // exponent = 0 and fraction != 0 +#endif +} + +// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) +// false for finite numbers, NAN and INF +static inline Vec16fb is_zero_or_subnormal(Vec16f const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_fpclass_ps_mask(a, 0x26); +#else + Vec16i t = _mm512_castps_si512(a); // reinterpret as 32-bit integer + t &= 0x7F800000; // isolate exponent + return Vec16fb(t == 0); // exponent = 0 +#endif +} + +// change signs on vectors Vec16f +// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change +template +static inline Vec16f change_sign(Vec16f const a) { + constexpr __mmask16 m = __mmask16((i0&1) | (i1&1)<<1 | (i2&1)<< 2 | (i3&1)<<3 | (i4&1)<<4 | (i5&1)<<5 | (i6&1)<<6 | (i7&1)<<7 + | (i8&1)<<8 | (i9&1)<<9 | (i10&1)<<10 | (i11&1)<<11 | (i12&1)<<12 | (i13&1)<<13 | (i14&1)<<14 | (i15&1)<<15); + if constexpr ((uint16_t)m == 0) return a; + __m512 s = _mm512_castsi512_ps(_mm512_maskz_set1_epi32(m, 0x80000000)); + return a ^ s; +} + +// Horizontal add: Calculates the sum of all vector elements. +static inline float horizontal_add (Vec16f const a) { +#if defined(__INTEL_COMPILER) + return _mm512_reduce_add_ps(a); +#else + return horizontal_add(a.get_low() + a.get_high()); +#endif +} + +// function max: a > b ? a : b +static inline Vec16f max(Vec16f const a, Vec16f const b) { + return _mm512_max_ps(a,b); +} + +// function min: a < b ? 
a : b +static inline Vec16f min(Vec16f const a, Vec16f const b) { + return _mm512_min_ps(a,b); +} +// NAN-safe versions of maximum and minimum are in vector_convert.h + +// function abs: absolute value +static inline Vec16f abs(Vec16f const a) { +#if INSTRSET >= 10 // AVX512DQ + return _mm512_range_ps(a, a, 8); +#else + return a & Vec16f(_mm512_castsi512_ps(Vec16i(0x7FFFFFFF))); +#endif +} + +// function sqrt: square root +static inline Vec16f sqrt(Vec16f const a) { + return _mm512_sqrt_ps(a); +} + +// function square: a * a +static inline Vec16f square(Vec16f const a) { + return a * a; +} + +// pow(Vec16f, int): +template static Vec16f pow(Vec16f const a, TT const n); + +// Raise floating point numbers to integer power n +template <> +inline Vec16f pow(Vec16f const x0, int const n) { + return pow_template_i(x0, n); +} + +// allow conversion from unsigned int +template <> +inline Vec16f pow(Vec16f const x0, uint32_t const n) { + return pow_template_i(x0, (int)n); +} + +// Raise floating point numbers to integer power n, where n is a compile-time constant +template +static inline Vec16f pow(Vec16f const a, Const_int_t) { + return pow_n(a); +} + +// function round: round to nearest integer (even). (result as float vector) +static inline Vec16f round(Vec16f const a) { + return _mm512_roundscale_ps(a, 0+8); +} + +// function truncate: round towards zero. (result as float vector) +static inline Vec16f truncate(Vec16f const a) { + return _mm512_roundscale_ps(a, 3+8); +} + +// function floor: round towards minus infinity. (result as float vector) +static inline Vec16f floor(Vec16f const a) { + return _mm512_roundscale_ps(a, 1+8); +} + +// function ceil: round towards plus infinity. (result as float vector) +static inline Vec16f ceil(Vec16f const a) { + return _mm512_roundscale_ps(a, 2+8); +} + +// function roundi: round to nearest integer (even). (result as integer vector) +static inline Vec16i roundi(Vec16f const a) { + return _mm512_cvt_roundps_epi32(a, 0+8 /*_MM_FROUND_NO_EXC*/); +} +//static inline Vec16i round_to_int(Vec16f const a) {return roundi(a);} // deprecated + +// function truncatei: round towards zero. (result as integer vector) +static inline Vec16i truncatei(Vec16f const a) { + return _mm512_cvtt_roundps_epi32(a, 0+8 /*_MM_FROUND_NO_EXC*/); +} +//static inline Vec16i truncate_to_int(Vec16f const a) {return truncatei(a);} // deprecated + +// function to_float: convert integer vector to float vector +static inline Vec16f to_float(Vec16i const a) { + return _mm512_cvtepi32_ps(a); +} + +// function to_float: convert unsigned integer vector to float vector +static inline Vec16f to_float(Vec16ui const a) { + return _mm512_cvtepu32_ps(a); +} + +// Approximate math functions + +// approximate reciprocal (Faster than 1.f / a. +// relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512F, full precision with AVX512ER) +static inline Vec16f approx_recipr(Vec16f const a) { +#ifdef __AVX512ER__ // AVX512ER instruction set includes fast reciprocal with better precision + return _mm512_rcp28_round_ps(a, _MM_FROUND_NO_EXC); +#else + return _mm512_rcp14_ps(a); +#endif +} + +// Newton-Raphson refined approximate reciprocal (23 bit precision) +static inline Vec16f rcp_nr(Vec16f const a) { + Vec16f nr = _mm512_rcp14_ps(a); + Vec16f muls = nr * nr * a; + Vec16f dbl = nr + nr; + return dbl - muls; +} + +// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). 
+// Relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512F, full precision with AVX512ER) +static inline Vec16f approx_rsqrt(Vec16f const a) { +#ifdef __AVX512ER__ // AVX512ER instruction set includes fast reciprocal squareroot with better precision + return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC); +#else + return _mm512_rsqrt14_ps(a); +#endif +} + + +// Fused multiply and add functions + +// Multiply and add +static inline Vec16f mul_add(Vec16f const a, Vec16f const b, Vec16f const c) { + return _mm512_fmadd_ps(a, b, c); +} + +// Multiply and subtract +static inline Vec16f mul_sub(Vec16f const a, Vec16f const b, Vec16f const c) { + return _mm512_fmsub_ps(a, b, c); +} + +// Multiply and inverse subtract +static inline Vec16f nmul_add(Vec16f const a, Vec16f const b, Vec16f const c) { + return _mm512_fnmadd_ps(a, b, c); +} + +// Multiply and subtract with extra precision on the intermediate calculations, +// Do not use mul_sub_x in general code because it is inaccurate in certain cases when FMA is not supported +static inline Vec16f mul_sub_x(Vec16f const a, Vec16f const b, Vec16f const c) { + return _mm512_fmsub_ps(a, b, c); +} + + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 +static inline Vec16i exponent(Vec16f const a) { + // return roundi(Vec16i(_mm512_getexp_ps(a))); + Vec16ui t1 = _mm512_castps_si512(a);// reinterpret as 32-bit integers + Vec16ui t2 = t1 << 1; // shift out sign bit + Vec16ui t3 = t2 >> 24; // shift down logical to position 0 + Vec16i t4 = Vec16i(t3) - 0x7F; // subtract bias from exponent + return t4; +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f +static inline Vec16f fraction(Vec16f const a) { + return _mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0f +// n >= 128 gives +INF +// n <= -127 gives 0.0f +// This function will never produce denormals, and never raise exceptions +static inline Vec16f exp2(Vec16i const n) { + Vec16i t1 = max(n, -0x7F); // limit to allowed range + Vec16i t2 = min(t1, 0x80); + Vec16i t3 = t2 + 0x7F; // add bias + Vec16i t4 = t3 << 23; // put exponent into position 23 + return _mm512_castsi512_ps(t4); // reinterpret as float +} +//static Vec16f exp2(Vec16f const x); // defined in vectormath_exp.h + + +/***************************************************************************** +* +* Vec8d: Vector of 8 double precision floating point values +* +*****************************************************************************/ + +class Vec8d { +protected: + __m512d zmm; // double vector +public: + // Default constructor: + Vec8d() { + } + // Constructor to broadcast the same value into all elements: + Vec8d(double d) { + zmm = _mm512_set1_pd(d); + } + // Constructor to build from all elements: + Vec8d(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7) { + zmm = _mm512_setr_pd(d0, d1, d2, d3, d4, d5, d6, d7); + } + // Constructor to build from two Vec4d: + Vec8d(Vec4d const a0, Vec4d const a1) { + zmm = _mm512_insertf64x4(_mm512_castpd256_pd512(a0), a1, 1); + } + // Constructor to convert from type __m512d used in intrinsics: + Vec8d(__m512d const x) { + zmm = x; + } + // Assignment operator to convert from type __m512d 
used in intrinsics: + Vec8d & operator = (__m512d const x) { + zmm = x; + return *this; + } + // Type cast operator to convert to __m512d used in intrinsics + operator __m512d() const { + return zmm; + } + // Member function to load from array (unaligned) + Vec8d & load(double const * p) { + zmm = _mm512_loadu_pd(p); + return *this; + } + // Member function to load from array, aligned by 64 + // You may use load_a instead of load if you are certain that p points to an address + // divisible by 64 + Vec8d & load_a(double const * p) { + zmm = _mm512_load_pd(p); + return *this; + } + // Member function to store into array (unaligned) + void store(double * p) const { + _mm512_storeu_pd(p, zmm); + } + // Member function storing into array, aligned by 64 + // You may use store_a instead of store if you are certain that p points to an address + // divisible by 64 + void store_a(double * p) const { + _mm512_store_pd(p, zmm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 16 + void store_nt(double * p) const { + _mm512_stream_pd(p, zmm); + } + // Partial load. Load n elements and set the rest to 0 + Vec8d & load_partial(int n, double const * p) { + zmm = _mm512_maskz_loadu_pd(__mmask16((1<= 10 + __m512d x = _mm512_maskz_compress_pd(__mmask8(1u << index), zmm); + return _mm512_cvtsd_f64(x); +#else + double a[8]; + store(a); + return a[index & 7]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + double operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4d: + Vec4d get_low() const { + return _mm512_castpd512_pd256(zmm); + } + Vec4d get_high() const { + return _mm512_extractf64x4_pd(zmm,1); + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 17; + } + typedef __m512d registertype; +}; + + +/***************************************************************************** +* +* Operators for Vec8d +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec8d operator + (Vec8d const a, Vec8d const b) { + return _mm512_add_pd(a, b); +} + +// vector operator + : add vector and scalar +static inline Vec8d operator + (Vec8d const a, double b) { + return a + Vec8d(b); +} +static inline Vec8d operator + (double a, Vec8d const b) { + return Vec8d(a) + b; +} + +// vector operator += : add +static inline Vec8d & operator += (Vec8d & a, Vec8d const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec8d operator ++ (Vec8d & a, int) { + Vec8d a0 = a; + a = a + 1.0; + return a0; +} + +// prefix operator ++ +static inline Vec8d & operator ++ (Vec8d & a) { + a = a + 1.0; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8d operator - (Vec8d const a, Vec8d const b) { + return _mm512_sub_pd(a, b); +} + +// vector operator - : subtract vector and scalar +static inline Vec8d operator - (Vec8d const a, double b) { + return a - Vec8d(b); +} +static inline Vec8d operator - (double a, Vec8d const b) { + return Vec8d(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static 
inline Vec8d operator - (Vec8d const a) { + return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) ^ Vec8q(0x8000000000000000)); +} + +// vector operator -= : subtract +static inline Vec8d & operator -= (Vec8d & a, Vec8d const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec8d operator -- (Vec8d & a, int) { + Vec8d a0 = a; + a = a - 1.0; + return a0; +} + +// prefix operator -- +static inline Vec8d & operator -- (Vec8d & a) { + a = a - 1.0; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8d operator * (Vec8d const a, Vec8d const b) { + return _mm512_mul_pd(a, b); +} + +// vector operator * : multiply vector and scalar +static inline Vec8d operator * (Vec8d const a, double b) { + return a * Vec8d(b); +} +static inline Vec8d operator * (double a, Vec8d const b) { + return Vec8d(a) * b; +} + +// vector operator *= : multiply +static inline Vec8d & operator *= (Vec8d & a, Vec8d const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec8d operator / (Vec8d const a, Vec8d const b) { + return _mm512_div_pd(a, b); +} + +// vector operator / : divide vector and scalar +static inline Vec8d operator / (Vec8d const a, double b) { + return a / Vec8d(b); +} +static inline Vec8d operator / (double a, Vec8d const b) { + return Vec8d(a) / b; +} + +// vector operator /= : divide +static inline Vec8d & operator /= (Vec8d & a, Vec8d const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec8db operator == (Vec8d const a, Vec8d const b) { + return _mm512_cmp_pd_mask(a, b, 0); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec8db operator != (Vec8d const a, Vec8d const b) { + return _mm512_cmp_pd_mask(a, b, 4); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec8db operator < (Vec8d const a, Vec8d const b) { + return _mm512_cmp_pd_mask(a, b, 1); +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec8db operator <= (Vec8d const a, Vec8d const b) { + return _mm512_cmp_pd_mask(a, b, 2); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec8db operator > (Vec8d const a, Vec8d const b) { + return _mm512_cmp_pd_mask(a, b, 6); +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec8db operator >= (Vec8d const a, Vec8d const b) { + return _mm512_cmp_pd_mask(a, b, 5); +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec8d operator & (Vec8d const a, Vec8d const b) { + return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) & Vec8q(_mm512_castpd_si512(b))); +} + +// vector operator &= : bitwise and +static inline Vec8d & operator &= (Vec8d & a, Vec8d const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec8d and Vec8db +static inline Vec8d operator & (Vec8d const a, Vec8db const b) { + return _mm512_maskz_mov_pd((uint8_t)b, a); +} + +static inline Vec8d operator & (Vec8db const a, Vec8d const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec8d operator | (Vec8d const a, Vec8d const b) { + return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) | Vec8q(_mm512_castpd_si512(b))); +} + +// vector operator |= : bitwise or +static inline Vec8d & operator |= (Vec8d & a, Vec8d const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor 
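// Illustrative usage sketch (editorial addition, not part of this patch).
// The comparison operators above return a Vec8db mask; and-ing a Vec8d with such a
// mask keeps only the selected elements. The helper name is hypothetical.
static inline Vec8d keep_non_negative_sketch(Vec8d const a) {
    return a & (a >= Vec8d(0.0));   // elements with a[i] < 0 become 0.0
}
// vector operator ^ : bitwise xor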
+static inline Vec8d operator ^ (Vec8d const a, Vec8d const b) { + return _mm512_castsi512_pd(Vec8q(_mm512_castpd_si512(a)) ^ Vec8q(_mm512_castpd_si512(b))); +} + +// vector operator ^= : bitwise xor +static inline Vec8d & operator ^= (Vec8d & a, Vec8d const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec8db operator ! (Vec8d const a) { + return a == Vec8d(0.0); +} + + +/***************************************************************************** +* +* Functions for Vec8d +* +*****************************************************************************/ + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8d select (Vec8db const s, Vec8d const a, Vec8d const b) { + return _mm512_mask_mov_pd (b, (uint8_t)s, a); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec8d if_add (Vec8db const f, Vec8d const a, Vec8d const b) { + return _mm512_mask_add_pd(a, (uint8_t)f, a, b); +} + +// Conditional subtract +static inline Vec8d if_sub (Vec8db const f, Vec8d const a, Vec8d const b) { + return _mm512_mask_sub_pd(a, (uint8_t)f, a, b); +} + +// Conditional multiply +static inline Vec8d if_mul (Vec8db const f, Vec8d const a, Vec8d const b) { + return _mm512_mask_mul_pd(a, (uint8_t)f, a, b); +} + +// Conditional divide +static inline Vec8d if_div (Vec8db const f, Vec8d const a, Vec8d const b) { + return _mm512_mask_div_pd(a, (uint8_t)f, a, b); +} + +// Sign functions + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0, -INF and -NAN +static inline Vec8db sign_bit(Vec8d const a) { + Vec8q t1 = _mm512_castpd_si512(a); // reinterpret as 64-bit integer + return Vec8db(t1 < 0); +} + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec8d sign_combine(Vec8d const a, Vec8d const b) { + // return a ^ (b & Vec8d(-0.0)); + return _mm512_castsi512_pd (_mm512_ternarylogic_epi64( + _mm512_castpd_si512(a), _mm512_castpd_si512(b), Vec8q(0x8000000000000000), 0x78)); +} + +// Categorization functions + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +static inline Vec8db is_finite(Vec8d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + __mmask8 f = _mm512_fpclass_pd_mask(a, 0x99); + return __mmask8(_mm512_knot(f)); +#else + Vec8q t1 = _mm512_castpd_si512(a); // reinterpret as 64-bit integer + Vec8q t2 = t1 << 1; // shift out sign bit + Vec8q t3 = 0xFFE0000000000000ll; // exponent mask + Vec8qb t4 = Vec8q(t2 & t3) != t3; // exponent field is not all 1s + return Vec8db(t4); +#endif +} + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +static inline Vec8db is_inf(Vec8d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_fpclass_pd_mask(a, 0x18); +#else + Vec8q t1 = _mm512_castpd_si512(a); // reinterpret as 64-bit integer + Vec8q t2 = t1 << 1; // shift out sign bit + return Vec8db(t2 == 0xFFE0000000000000ll); // exponent is all 1s, fraction is 0 +#endif +} + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +#if INSTRSET >= 10 +static inline Vec8db is_nan(Vec8d const a) { + // assume that compiler does not optimize this 
away with -ffinite-math-only: + return _mm512_fpclass_pd_mask(a, 0x81); +} +//#elif defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) +//__attribute__((optimize("-fno-unsafe-math-optimizations"))) +//static inline Vec8db is_nan(Vec8d const a) { +// return a != a; // not safe with -ffinite-math-only compiler option +//} +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__INTEL_COMPILER) +static inline Vec8db is_nan(Vec8d const a) { + __m512d aa = a; + __mmask16 unordered; + __asm volatile("vcmppd $3, %1, %1, %0" : "=Yk" (unordered) : "v" (aa) ); + return Vec8db(unordered); +} +#else +static inline Vec8db is_nan(Vec8d const a) { + // assume that compiler does not optimize this away with -ffinite-math-only: + return Vec8db().load_bits(_mm512_cmp_pd_mask(a, a, 3)); // compare unordered + // return a != a; // This is not safe with -ffinite-math-only, -ffast-math, or /fp:fast compiler option +} +#endif + + +// Function is_subnormal: gives true for elements that are denormal (subnormal) +// false for finite numbers, zero, NAN and INF +static inline Vec8db is_subnormal(Vec8d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_fpclass_pd_mask(a, 0x20); +#else + Vec8q t1 = _mm512_castpd_si512(a); // reinterpret as 64-bit integer + Vec8q t2 = t1 << 1; // shift out sign bit + Vec8q t3 = 0xFFE0000000000000ll; // exponent mask + Vec8q t4 = t2 & t3; // exponent + Vec8q t5 = _mm512_andnot_si512(t3,t2);// fraction + return Vec8db(t4 == 0 && t5 != 0); // exponent = 0 and fraction != 0 +#endif +} + +// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) +// false for finite numbers, NAN and INF +static inline Vec8db is_zero_or_subnormal(Vec8d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_fpclass_pd_mask(a, 0x26); +#else + Vec8q t = _mm512_castpd_si512(a); // reinterpret as 32-bit integer + t &= 0x7FF0000000000000ll; // isolate exponent + return Vec8db(t == 0); // exponent = 0 +#endif +} + +// change signs on vectors Vec8d +// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change +template +static inline Vec8d change_sign(Vec8d const a) { + const __mmask16 m = __mmask16((i0&1) | (i1&1)<<1 | (i2&1)<< 2 | (i3&1)<<3 | (i4&1)<<4 | (i5&1)<<5 | (i6&1)<<6 | (i7&1)<<7); + if ((uint8_t)m == 0) return a; +#ifdef __x86_64__ + __m512d s = _mm512_castsi512_pd(_mm512_maskz_set1_epi64(m, 0x8000000000000000)); +#else // 32 bit mode + __m512i v = Vec8q(0x8000000000000000); + __m512d s = _mm512_castsi512_pd(_mm512_maskz_mov_epi64(m, v)); +#endif + return a ^ s; +} + +// General arithmetic functions, etc. + +// Horizontal add: Calculates the sum of all vector elements. +static inline double horizontal_add (Vec8d const a) { +#if defined(__INTEL_COMPILER) + return _mm512_reduce_add_pd(a); +#else + return horizontal_add(a.get_low() + a.get_high()); +#endif +} + +// function max: a > b ? a : b +static inline Vec8d max(Vec8d const a, Vec8d const b) { + return _mm512_max_pd(a,b); +} + +// function min: a < b ? 
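// Illustrative usage sketch (editorial addition, not part of this patch).
// Combines is_nan(), select() and horizontal_add() from above; the helper name is hypothetical.
static inline double sum_ignoring_nan_sketch(Vec8d const a) {
    Vec8d cleaned = select(is_nan(a), Vec8d(0.0), a);   // NaN lanes are replaced by 0.0
    return horizontal_add(cleaned);                     // sum of all 8 elements
}
// function min: a < b ?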
a : b +static inline Vec8d min(Vec8d const a, Vec8d const b) { + return _mm512_min_pd(a,b); +} +// NAN-safe versions of maximum and minimum are in vector_convert.h + +// function abs: absolute value +static inline Vec8d abs(Vec8d const a) { +#if INSTRSET >= 10 // AVX512DQ + return _mm512_range_pd(a, a, 8); +#else + return a & Vec8d(_mm512_castsi512_pd(Vec8q(0x7FFFFFFFFFFFFFFF))); +#endif +} + +// function sqrt: square root +static inline Vec8d sqrt(Vec8d const a) { + return _mm512_sqrt_pd(a); +} + +// function square: a * a +static inline Vec8d square(Vec8d const a) { + return a * a; +} + +// The purpose of this template is to prevent implicit conversion of a float +// exponent to int when calling pow(vector, float) and vectormath_exp.h is not included +template static Vec8d pow(Vec8d const a, TT const n); // = delete; + +// pow(Vec8d, int): +// Raise floating point numbers to integer power n +template <> +inline Vec8d pow(Vec8d const x0, int const n) { + return pow_template_i(x0, n); +} + +// allow conversion from unsigned int +template <> +inline Vec8d pow(Vec8d const x0, uint32_t const n) { + return pow_template_i(x0, (int)n); +} + +// Raise floating point numbers to integer power n, where n is a compile-time constant +template +static inline Vec8d pow(Vec8d const a, Const_int_t) { + return pow_n(a); +} + + +// function round: round to nearest integer (even). (result as double vector) +static inline Vec8d round(Vec8d const a) { + return _mm512_roundscale_pd(a, 0); +} + +// function truncate: round towards zero. (result as double vector) +static inline Vec8d truncate(Vec8d const a) { + return _mm512_roundscale_pd(a, 3); +} + +// function floor: round towards minus infinity. (result as double vector) +static inline Vec8d floor(Vec8d const a) { + return _mm512_roundscale_pd(a, 1); +} + +// function ceil: round towards plus infinity. (result as double vector) +static inline Vec8d ceil(Vec8d const a) { + return _mm512_roundscale_pd(a, 2); +} + +// function round_to_int32: round to nearest integer (even). (result as integer vector) +static inline Vec8i round_to_int32(Vec8d const a) { + //return _mm512_cvtpd_epi32(a); + return _mm512_cvt_roundpd_epi32(a, 0+8); +} +//static inline Vec8i round_to_int(Vec8d const a) {return round_to_int32(a);} // deprecated + + +// function truncate_to_int32: round towards zero. 
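// Illustrative usage sketch (editorial addition, not part of this patch).
// truncate() rounds towards zero, so subtracting it leaves the signed fractional part.
// The helper name is hypothetical.
static inline Vec8d fractional_part_sketch(Vec8d const a) {
    return a - truncate(a);   // e.g. 2.75 -> 0.75, -2.75 -> -0.75
}
// function truncate_to_int32: round towards zero.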
(result as integer vector) +static inline Vec8i truncate_to_int32(Vec8d const a) { + return _mm512_cvttpd_epi32(a); +} +//static inline Vec8i truncate_to_int(Vec8d const a) {return truncate_to_int32(a);} // deprecated + + +// function truncatei: round towards zero +static inline Vec8q truncatei(Vec8d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_cvttpd_epi64(a); +#else + double aa[8]; // inefficient + a.store(aa); + return Vec8q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]), int64_t(aa[4]), int64_t(aa[5]), int64_t(aa[6]), int64_t(aa[7])); +#endif +} +//static inline Vec8q truncate_to_int64(Vec8d const a) {return truncatei(a);} // deprecated + +// function roundi: round to nearest or even +static inline Vec8q roundi(Vec8d const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_cvtpd_epi64(a); +#else + return truncatei(round(a)); +#endif +} +//static inline Vec8q round_to_int64(Vec8d const a) {return roundi(a);} // deprecated + +// function to_double: convert integer vector elements to double vector +static inline Vec8d to_double(Vec8q const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_cvtepi64_pd(a); +#else + int64_t aa[8]; // inefficient + a.store(aa); + return Vec8d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]), double(aa[4]), double(aa[5]), double(aa[6]), double(aa[7])); +#endif +} + +static inline Vec8d to_double(Vec8uq const a) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_cvtepu64_pd(a); +#else + uint64_t aa[8]; // inefficient + a.store(aa); + return Vec8d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]), double(aa[4]), double(aa[5]), double(aa[6]), double(aa[7])); +#endif +} + +// function to_double: convert integer vector to double vector +static inline Vec8d to_double(Vec8i const a) { + return _mm512_cvtepi32_pd(a); +} + +// function compress: convert two Vec8d to one Vec16f +static inline Vec16f compress (Vec8d const low, Vec8d const high) { + __m256 t1 = _mm512_cvtpd_ps(low); + __m256 t2 = _mm512_cvtpd_ps(high); + return Vec16f(t1, t2); +} + +// Function extend_low : convert Vec16f vector elements 0 - 3 to Vec8d +static inline Vec8d extend_low(Vec16f const a) { + return _mm512_cvtps_pd(_mm512_castps512_ps256(a)); +} + +// Function extend_high : convert Vec16f vector elements 4 - 7 to Vec8d +static inline Vec8d extend_high (Vec16f const a) { + return _mm512_cvtps_pd(a.get_high()); +} + + +// Fused multiply and add functions + +// Multiply and add +static inline Vec8d mul_add(Vec8d const a, Vec8d const b, Vec8d const c) { + return _mm512_fmadd_pd(a, b, c); +} + +// Multiply and subtract +static inline Vec8d mul_sub(Vec8d const a, Vec8d const b, Vec8d const c) { + return _mm512_fmsub_pd(a, b, c); +} + +// Multiply and inverse subtract +static inline Vec8d nmul_add(Vec8d const a, Vec8d const b, Vec8d const c) { + return _mm512_fnmadd_pd(a, b, c); +} + +// Multiply and subtract with extra precision on the intermediate calculations. 
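// Illustrative usage sketch (editorial addition, not part of this patch).
// mul_add(a, b, c) computes a*b + c in one fused operation; chaining it gives a Horner
// evaluation of a polynomial. The helper name and coefficients are hypothetical.
static inline Vec8d horner2_sketch(Vec8d const x, double c0, double c1, double c2) {
    return mul_add(mul_add(Vec8d(c2), x, Vec8d(c1)), x, Vec8d(c0));   // (c2*x + c1)*x + c0
}
// Multiply and subtract with extra precision on the intermediate calculations.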
used internally in math functions +static inline Vec8d mul_sub_x(Vec8d const a, Vec8d const b, Vec8d const c) { + return _mm512_fmsub_pd(a, b, c); +} + + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 +static inline Vec8q exponent(Vec8d const a) { + Vec8uq t1 = _mm512_castpd_si512(a);// reinterpret as 64-bit integer + Vec8uq t2 = t1 << 1; // shift out sign bit + Vec8uq t3 = t2 >> 53; // shift down logical to position 0 + Vec8q t4 = Vec8q(t3) - 0x3FF; // subtract bias from exponent + return t4; +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0) = 1.0, fraction(5.0) = 1.25 +static inline Vec8d fraction(Vec8d const a) { + return _mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero); +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0 +// n >= 1024 gives +INF +// n <= -1023 gives 0.0 +// This function will never produce denormals, and never raise exceptions +static inline Vec8d exp2(Vec8q const n) { + Vec8q t1 = max(n, -0x3FF); // limit to allowed range + Vec8q t2 = min(t1, 0x400); + Vec8q t3 = t2 + 0x3FF; // add bias + Vec8q t4 = t3 << 52; // put exponent into position 52 + return _mm512_castsi512_pd(t4); // reinterpret as double +} +//static Vec8d exp2(Vec8d const x); // defined in vectormath_exp.h + + + +/***************************************************************************** +* +* Functions for reinterpretation between vector types +* +*****************************************************************************/ + +// AVX512 requires gcc version 4.9 or higher. Apparently the problem with mangling intrinsic vector types no longer exists in gcc 4.x + +static inline __m512i reinterpret_i (__m512i const x) { + return x; +} + +static inline __m512i reinterpret_i (__m512 const x) { + return _mm512_castps_si512(x); +} + +static inline __m512i reinterpret_i (__m512d const x) { + return _mm512_castpd_si512(x); +} + +static inline __m512 reinterpret_f (__m512i const x) { + return _mm512_castsi512_ps(x); +} + +static inline __m512 reinterpret_f (__m512 const x) { + return x; +} + +static inline __m512 reinterpret_f (__m512d const x) { + return _mm512_castpd_ps(x); +} + +static inline __m512d reinterpret_d (__m512i const x) { + return _mm512_castsi512_pd(x); +} + +static inline __m512d reinterpret_d (__m512 const x) { + return _mm512_castps_pd(x); +} + +static inline __m512d reinterpret_d (__m512d const x) { + return x; +} + +// Function infinite4f: returns a vector where all elements are +INF +static inline Vec16f infinite16f() { + return reinterpret_f(Vec16i(0x7F800000)); +} + +// Function nan4f: returns a vector where all elements are +NAN (quiet) +static inline Vec16f nan16f(int n = 0x100) { + return nan_vec(n); +} + +// Function infinite2d: returns a vector where all elements are +INF +static inline Vec8d infinite8d() { + return reinterpret_d(Vec8q(0x7FF0000000000000)); +} + +// Function nan8d: returns a vector where all elements are +NAN (quiet NAN) +static inline Vec8d nan8d(int n = 0x10) { + return nan_vec(n); +} + + +/***************************************************************************** +* +* Vector permute functions +* +****************************************************************************** +* +* These permute functions can reorder the elements of a vector and optionally +* set some elements to 
zero. See Vectori128.h for description +* +*****************************************************************************/ + +// Permute vector of 8 64-bit integers. +template +static inline Vec8d permute8(Vec8d const a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m512d y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm512_setzero_pd(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<8>(indexs); // permutation pattern + constexpr uint8_t ppat = (L.a[0] & 3) | (L.a[1]<<2 & 0xC) | (L.a[2]<<4 & 0x30) | (L.a[3]<<6 & 0xC0); + y = _mm512_shuffle_f64x2(a, a, ppat); + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in all lanes + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm512_unpackhi_pd(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm512_unpacklo_pd(y, y); + } + else { // general permute within lanes + constexpr uint8_t mm0 = (i0&1) | (i1&1)<<1 | (i2&1)<<2 | (i3&1)<<3 | (i4&1)<<4 | (i5&1)<<5 | (i6&1)<<6 | (i7&1)<<7; + y = _mm512_permute_pd(a, mm0); // select within same lane + } + } + else { // different patterns in all lanes + if constexpr ((flags & perm_rotate_big) != 0) { // fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + y = _mm512_castsi512_pd(_mm512_alignr_epi64 (_mm512_castpd_si512(y), _mm512_castpd_si512(y), rot)); + } + else if constexpr ((flags & perm_broadcast) != 0) { // broadcast one element + constexpr int e = flags >> perm_rot_count; + if constexpr(e != 0) { + y = _mm512_castsi512_pd(_mm512_alignr_epi64(_mm512_castpd_si512(y), _mm512_castpd_si512(y), e)); + } + y = _mm512_broadcastsd_pd(_mm512_castpd512_pd128(y)); + } + else if constexpr ((flags & perm_compress) != 0) { + y = _mm512_maskz_compress_pd(__mmask8(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm512_maskz_expand_pd(__mmask8(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing + if constexpr ((flags & perm_zeroing) == 0) { // no zeroing. use vpermilps + const __m512i pmask = constant16ui (); + return _mm512_permutevar_pd(a, pmask); + } + else { // with zeroing. pshufb may be marginally better because it needs no extra zero mask + const EList bm = pshufb_mask(indexs); + return _mm512_castsi512_pd(_mm512_shuffle_epi8(_mm512_castpd_si512(y), Vec8q().load(bm.a))); + } + } + else { + // full permute needed + const __m512i pmask = constant16ui < + i0 & 7, 0, i1 & 7, 0, i2 & 7, 0, i3 & 7, 0, i4 & 7, 0, i5 & 7, 0, i6 & 7, 0, i7 & 7, 0>(); + y = _mm512_permutexvar_pd(pmask, y); + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_pd(zero_mask<8>(indexs), y); + } + return y; +} + + +// Permute vector of 16 32-bit integers. 
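// Illustrative usage sketch (editorial addition, not part of this patch).
// permute8 takes its indexes as template arguments, so the permutation pattern is
// resolved at compile time; an index of -1 zeroes that element. The helper name is hypothetical.
static inline Vec8d reverse_elements_sketch(Vec8d const a) {
    return permute8<7, 6, 5, 4, 3, 2, 1, 0>(a);   // reverse the order of the 8 doubles
}
// Permute vector of 16 32-bit integers.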
+template +static inline Vec16f permute16(Vec16f const a) { + int constexpr indexs[16] = { // indexes as array + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; + __m512 y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm512_setzero_ps(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<16>(indexs); // permutation pattern + y = _mm512_castpd_ps( + permute8 + (Vec8d(_mm512_castps_pd(a)))); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in all lanes + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm512_unpackhi_ps(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm512_unpacklo_ps(y, y); + } + else { // general permute within lanes + y = _mm512_permute_ps(a, uint8_t(flags >> perm_ipattern)); + } + } + else { // different patterns in all lanes + if constexpr ((flags & perm_rotate_big) != 0) { // fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + y = _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(y), _mm512_castps_si512(y), rot)); + } + else if constexpr ((flags & perm_broadcast) != 0) { // broadcast one element + constexpr int e = flags >> perm_rot_count; // element index + if constexpr(e != 0) { + y = _mm512_castsi512_ps(_mm512_alignr_epi32(_mm512_castps_si512(y), _mm512_castps_si512(y), e)); + } + y = _mm512_broadcastss_ps(_mm512_castps512_ps128(y)); + } + else if constexpr ((flags & perm_zext) != 0) { // zero extension + y = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(_mm512_castsi512_si256(_mm512_castps_si512(y)))); + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_compress) != 0) { + y = _mm512_maskz_compress_ps(__mmask16(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm512_maskz_expand_ps(__mmask16(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing + if constexpr ((flags & perm_zeroing) == 0) { // no zeroing. use vpermilps + const __m512i pmask = constant16ui (); + return _mm512_permutevar_ps(a, pmask); + } + else { // with zeroing. 
pshufb may be marginally better because it needs no extra zero mask + const EList bm = pshufb_mask(indexs); + return _mm512_castsi512_ps(_mm512_shuffle_epi8(_mm512_castps_si512(a), Vec16i().load(bm.a))); + } + } + else { + // full permute needed + const __m512i pmaskf = constant16ui < + i0 & 15, i1 & 15, i2 & 15, i3 & 15, i4 & 15, i5 & 15, i6 & 15, i7 & 15, + i8 & 15, i9 & 15, i10 & 15, i11 & 15, i12 & 15, i13 & 15, i14 & 15, i15 & 15>(); + y = _mm512_permutexvar_ps(pmaskf, a); + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_ps(zero_mask<16>(indexs), y); + } + return y; +} + + +/***************************************************************************** +* +* Vector blend functions +* +*****************************************************************************/ + +template +static inline Vec8d blend8(Vec8d const a, Vec8d const b) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m512d y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm512_setzero_pd(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute8 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<8, 2>(indexs); // get permutation indexes + return permute8 < L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15] > (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint8_t mb = (uint8_t)make_bit_mask<8, 0x303>(indexs); // blend mask + y = _mm512_mask_mov_pd (a, mb, b); + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 128-bit blocks + constexpr EList L = largeblock_perm<8>(indexs); // get 128-bit blend pattern + constexpr uint8_t shuf = (L.a[0] & 3) | (L.a[1] & 3) << 2 | (L.a[2] & 3) << 4 | (L.a[3] & 3) << 6; + if constexpr (make_bit_mask<8, 0x103>(indexs) == 0) { // fits vshufi64x2 (a,b) + y = _mm512_shuffle_f64x2(a, b, shuf); + } + else if constexpr (make_bit_mask<8, 0x203>(indexs) == 0) { // fits vshufi64x2 (b,a) + y = _mm512_shuffle_f64x2(b, a, shuf); + } + else { + const EList bm = perm_mask_broad(indexs); + y = _mm512_permutex2var_pd(a, Vec8q().load(bm.a), b); + } + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm512_unpacklo_pd (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm512_unpacklo_pd (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm512_unpackhi_pd (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm512_unpackhi_pd (b, a); + } + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm512_shuffle_pd(a, b, uint8_t(flags >> blend_shufpattern)); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm512_shuffle_pd(b, a, uint8_t(flags >> blend_shufpattern)); + } + else { // No special cases + const EList bm = perm_mask_broad(indexs); + y = _mm512_permutex2var_pd(a, Vec8q().load(bm.a), b); + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_pd(zero_mask<8>(indexs), y); + } + 
return y; +} + + +template +static inline Vec16f blend16(Vec16f const a, Vec16f const b) { + int constexpr indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; // indexes as array + __m512 y = a; // result + constexpr uint64_t flags = blend_flags(indexs);// get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm512_setzero_ps(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute16 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<16, 2>(indexs); // get permutation indexes + return permute16 < + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31] > (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint16_t mb = (uint16_t)make_bit_mask<16, 0x304>(indexs); // blend mask + y = _mm512_mask_mov_ps(a, mb, b); + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 64-bit blocks + constexpr EList L = largeblock_perm<16>(indexs); // get 64-bit blend pattern + y = _mm512_castpd_ps(blend8 < + L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7] > + (Vec8d(_mm512_castps_pd(a)), Vec8d(_mm512_castps_pd(b)))); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & blend_same_pattern) != 0) { + // same pattern in all 128-bit lanes. check if pattern fits special cases + if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm512_unpacklo_ps(a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm512_unpacklo_ps(b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm512_unpackhi_ps(a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm512_unpackhi_ps(b, a); + } + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm512_shuffle_ps(a, b, uint8_t(flags >> blend_shufpattern)); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm512_shuffle_ps(b, a, uint8_t(flags >> blend_shufpattern)); + } + else { + // Use vshufps twice. This generates two instructions in the dependency chain, + // but we are avoiding the slower lane-crossing instruction, and saving 64 + // bytes of data cache. 
+ auto shuf = [](int const (&a)[16]) constexpr { // get pattern for vpshufd + int pat[4] = {-1,-1,-1,-1}; + for (int i = 0; i < 16; i++) { + int ix = a[i]; + if (ix >= 0 && pat[i&3] < 0) { + pat[i&3] = ix; + } + } + return (pat[0] & 3) | (pat[1] & 3) << 2 | (pat[2] & 3) << 4 | (pat[3] & 3) << 6; + }; + constexpr uint8_t pattern = uint8_t(shuf(indexs)); // permute pattern + constexpr uint16_t froma = (uint16_t)make_bit_mask<16, 0x004>(indexs); // elements from a + constexpr uint16_t fromb = (uint16_t)make_bit_mask<16, 0x304>(indexs); // elements from b + y = _mm512_maskz_shuffle_ps( froma, a, a, pattern); + y = _mm512_mask_shuffle_ps (y, fromb, b, b, pattern); + return y; // we have already zeroed any unused elements + } + } + else { // No special cases + const EList bm = perm_mask_broad(indexs); + y = _mm512_permutex2var_ps(a, Vec16i().load(bm.a), b); + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_ps(zero_mask<16>(indexs), y); + } + return y; +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. +* The table is given as one or more vectors or as an array. +* +*****************************************************************************/ + +static inline Vec16f lookup16(Vec16i const index, Vec16f const table) { + return _mm512_permutexvar_ps(index, table); +} + +template +static inline Vec16f lookup(Vec16i const index, float const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 16) { + Vec16f table1 = Vec16f().load((float*)table); + return lookup16(index, table1); + } + if constexpr (n <= 32) { + Vec16f table1 = Vec16f().load((float*)table); + Vec16f table2 = Vec16f().load((float*)table + 16); + return _mm512_permutex2var_ps(table1, index, table2); + } + // n > 32. Limit index + Vec16ui index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec16ui(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec16ui(index), uint32_t(n-1)); + } + return _mm512_i32gather_ps(index1, (const float*)table, 4); +} + + +static inline Vec8d lookup8(Vec8q const index, Vec8d const table) { + return _mm512_permutexvar_pd(index, table); +} + +template +static inline Vec8d lookup(Vec8q const index, double const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) { + Vec8d table1 = Vec8d().load((double*)table); + return lookup8(index, table1); + } + if constexpr (n <= 16) { + Vec8d table1 = Vec8d().load((double*)table); + Vec8d table2 = Vec8d().load((double*)table + 8); + return _mm512_permutex2var_pd(table1, index, table2); + } + // n > 16. 
Limit index + Vec8uq index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec8uq(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec8uq(index), uint32_t(n-1)); + } + return _mm512_i64gather_pd(index1, (const double*)table, 8); +} + + +/***************************************************************************** +* +* Gather functions with fixed indexes +* +*****************************************************************************/ +// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15 +template +static inline Vec16f gather16f(void const * a) { + int constexpr indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 15) { + // load one contiguous block and permute + if constexpr (imax > 15) { + // make sure we don't read past the end of the array + Vec16f b = Vec16f().load((float const *)a + imax-15); + return permute16 (b); + } + else { + Vec16f b = Vec16f().load((float const *)a + imin); + return permute16 (b); + } + } + if constexpr ((i0imax-16) && (i1imax-16) && (i2imax-16) && (i3imax-16) + && (i4imax-16) && (i5imax-16) && (i6imax-16) && (i7imax-16) + && (i8imax-16) && (i9imax-16) && (i10imax-16) && (i11imax-16) + && (i12imax-16) && (i13imax-16) && (i14imax-16) && (i15imax-16) ) { + // load two contiguous blocks and blend + Vec16f b = Vec16f().load((float const *)a + imin); + Vec16f c = Vec16f().load((float const *)a + imax-15); + const int j0 = i0 (b, c); + } + // use gather instruction + return _mm512_i32gather_ps(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const float *)a, 4); +} + + +template +static inline Vec8d gather8d(void const * a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 7) { + // load one contiguous block and permute + if constexpr (imax > 7) { + // make sure we don't read past the end of the array + Vec8d b = Vec8d().load((double const *)a + imax-7); + return permute8 (b); + } + else { + Vec8d b = Vec8d().load((double const *)a + imin); + return permute8 (b); + } + } + if constexpr ((i0imax-8) && (i1imax-8) && (i2imax-8) && (i3imax-8) + && (i4imax-8) && (i5imax-8) && (i6imax-8) && (i7imax-8)) { + // load two contiguous blocks and blend + Vec8d b = Vec8d().load((double const *)a + imin); + Vec8d c = Vec8d().load((double const *)a + imax-7); + const int j0 = i0(b, c); + } + // use gather instruction + return _mm512_i64gather_pd(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const double *)a, 8); +} + +/***************************************************************************** +* +* Vector scatter functions +* +****************************************************************************** +* +* These functions write the elements of a vector to arbitrary positions in an +* array in memory. Each vector element is written to an array position +* determined by an index. An element is not written if the corresponding +* index is out of range. +* The indexes can be specified as constant template parameters or as an +* integer vector. 
+* +*****************************************************************************/ + +template + static inline void scatter(Vec16f const data, float * array) { + __m512i indx = constant16ui(); + Vec16fb mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0, + i8>=0, i9>=0, i10>=0, i11>=0, i12>=0, i13>=0, i14>=0, i15>=0); + _mm512_mask_i32scatter_ps(array, mask, indx, data, 4); +} + +template +static inline void scatter(Vec8d const data, double * array) { + __m256i indx = constant8ui(); + Vec8db mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0); + _mm512_mask_i32scatter_pd(array, mask, indx, data, 8); +} + + +/***************************************************************************** +* +* Scatter functions with variable indexes +* +*****************************************************************************/ + +static inline void scatter(Vec16i const index, uint32_t limit, Vec16f const data, float * destination) { + Vec16fb mask = Vec16ui(index) < limit; + _mm512_mask_i32scatter_ps(destination, mask, index, data, 4); +} + +static inline void scatter(Vec8q const index, uint32_t limit, Vec8d const data, double * destination) { + Vec8db mask = Vec8uq(index) < uint64_t(limit); + _mm512_mask_i64scatter_pd(destination, (uint8_t)mask, index, data, 8); +} + +static inline void scatter(Vec8i const index, uint32_t limit, Vec8d const data, double * destination) { +#if INSTRSET >= 10 // __AVX512VL__, __AVX512DQ__ + __mmask8 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit)); +#else + __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit))); +#endif + _mm512_mask_i32scatter_pd(destination, (__mmask8)mask, index, data, 8); +} + + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORF512_H diff --git a/DFTTest/VCL2/vectorf512e.h b/DFTTest/VCL2/vectorf512e.h new file mode 100644 index 0000000..66c1aaf --- /dev/null +++ b/DFTTest/VCL2/vectorf512e.h @@ -0,0 +1,1935 @@ +/**************************** vectorf512.h ******************************* +* Author: Agner Fog +* Date created: 2014-07-23 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library +* Description: +* Header file defining 512-bit floating point vector classes +* Emulated for processors without AVX512 instruction set +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec16f Vector of 16 single precision floating point numbers +* Vec16fb Vector of 16 Booleans for use with Vec16f +* Vec8d Vector of 8 double precision floating point numbers +* Vec8db Vector of 8 Booleans for use with Vec8d +* +* Each vector object is represented internally in the CPU as two 256-bit registers. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2014-2020 Agner Fog. +* Apache License version 2.0 or later. 
+*****************************************************************************/ + +#ifndef VECTORF512E_H +#define VECTORF512E_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +#if defined (VECTORF512_H) +#error Two different versions of vectorf512.h included +#endif + +#include "vectori512e.h" + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + +/***************************************************************************** +* +* Vec16fb: Vector of 16 broad booleans for use with Vec16f +* +*****************************************************************************/ +class Vec16fb : public Vec16b { +public: + // Default constructor: + Vec16fb () { + } + // Constructor to build from all elements: + Vec16fb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, + bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) : + Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) { + } + // Constructor from Vec16b + Vec16fb (Vec16b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Constructor from two Vec8fb + Vec16fb (Vec8fb const x0, Vec8fb const x1) { +#ifdef VECTORF256E_H + z0 = reinterpret_i(x0); + z1 = reinterpret_i(x1); +#else + z0 = x0; + z1 = x1; +#endif + } + // Constructor to broadcast scalar value: + Vec16fb(bool b) : Vec16b(b) { + } + // Assignment operator to broadcast scalar value: + Vec16fb & operator = (bool b) { + *this = Vec16b(b); + return *this; + } + // Get low and high half + Vec8fb get_low() const { + return reinterpret_f(Vec8i(z0)); + } + Vec8fb get_high() const { + return reinterpret_f(Vec8i(z1)); + } + // Member function to change a bitfield to a boolean vector + Vec16fb & load_bits(uint16_t a) { + z0 = Vec8ib().load_bits(uint8_t(a)); + z1 = Vec8ib().load_bits(uint8_t(a>>8)); + return *this; + } + // Prevent constructing from int, etc. + Vec16fb(int b) = delete; + Vec16fb & operator = (int x) = delete; +}; + +// Define operators for Vec16fb + +// vector operator & : bitwise and +static inline Vec16fb operator & (Vec16fb const a, Vec16fb const b) { + return Vec16fb(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} +static inline Vec16fb operator && (Vec16fb const a, Vec16fb const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec16fb operator | (Vec16fb const a, Vec16fb const b) { + return Vec16fb(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} +static inline Vec16fb operator || (Vec16fb const a, Vec16fb const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec16fb operator ^ (Vec16fb const a, Vec16fb const b) { + return Vec16fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator == : xnor +static inline Vec16fb operator == (Vec16fb const a, Vec16fb const b) { + return Vec16fb(Vec16fb(a) ^ Vec16fb(~b)); +} + +// vector operator != : xor +static inline Vec16fb operator != (Vec16fb const a, Vec16fb const b) { + return Vec16fb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec16fb operator ~ (Vec16fb const a) { + return Vec16fb(~a.get_low(), ~a.get_high()); +} + +// vector operator ! : element not +static inline Vec16fb operator ! 
(Vec16fb const a) { + return ~a; +} + +// vector operator &= : bitwise and +static inline Vec16fb & operator &= (Vec16fb & a, Vec16fb const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec16fb & operator |= (Vec16fb & a, Vec16fb const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec16fb & operator ^= (Vec16fb & a, Vec16fb const b) { + a = a ^ b; + return a; +} + + +/***************************************************************************** +* +* Vec8db: Vector of 8 broad booleans for use with Vec8d +* +*****************************************************************************/ + +class Vec8db : public Vec512b { +public: + // Default constructor: + Vec8db () { + } + // Constructor to build from all elements: + Vec8db(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) { + z0 = Vec4qb(x0, x1, x2, x3); + z1 = Vec4qb(x4, x5, x6, x7); + } + // Construct from Vec512b + Vec8db (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Constructor from two Vec4db + Vec8db (Vec4db const x0, Vec4db const x1) { +#ifdef VECTORF256E_H + z0 = reinterpret_i(x0); + z1 = reinterpret_i(x1); +#else + z0 = x0; + z1 = x1; +#endif + } + // Constructor to broadcast single value: + Vec8db(bool b) { + z0 = z1 = Vec8i(-int32_t(b)); + } + // Assignment operator to broadcast scalar value: + Vec8db & operator = (bool b) { + *this = Vec8db(b); + return *this; + } + Vec8db & insert(int index, bool a) { + if (index < 4) { + z0 = Vec4q(z0).insert(index, -(int64_t)a); + } + else { + z1 = Vec4q(z1).insert(index-4, -(int64_t)a); + } + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + if ((uint32_t)index < 4) { + return Vec4q(z0).extract(index) != 0; + } + else { + return Vec4q(z1).extract(index-4) != 0; + } + } + // Extract a single element. Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Get low and high half + Vec4db get_low() const { + return reinterpret_d(Vec4q(z0)); + } + Vec4db get_high() const { + return reinterpret_d(Vec4q(z1)); + } + // Member function to change a bitfield to a boolean vector + Vec8db & load_bits(uint8_t a) { + z0 = Vec4qb().load_bits(a); + z1 = Vec4qb().load_bits(uint8_t(a>>4u)); + return *this; + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
because of ambiguity + Vec8db(int b) = delete; + // Prevent assigning int because of ambiguity + Vec8db & operator = (int x) = delete; +}; + +// Define operators for Vec8db + +// vector operator & : bitwise and +static inline Vec8db operator & (Vec8db const a, Vec8db const b) { + return Vec8db(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} +static inline Vec8db operator && (Vec8db const a, Vec8db const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec8db operator | (Vec8db const a, Vec8db const b) { + return Vec8db(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} +static inline Vec8db operator || (Vec8db const a, Vec8db const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec8db operator ^ (Vec8db const a, Vec8db const b) { + return Vec8db(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator == : xnor +static inline Vec8db operator == (Vec8db const a, Vec8db const b) { + return Vec8db(Vec8db(a) ^ Vec8db(~b)); +} + +// vector operator != : xor +static inline Vec8db operator != (Vec8db const a, Vec8db const b) { + return Vec8db(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec8db operator ~ (Vec8db const a) { + return Vec8db(~a.get_low(), ~a.get_high()); +} + +// vector operator ! : element not +static inline Vec8db operator ! (Vec8db const a) { + return ~a; +} + +// vector operator &= : bitwise and +static inline Vec8db & operator &= (Vec8db & a, Vec8db const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec8db & operator |= (Vec8db & a, Vec8db const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec8db & operator ^= (Vec8db & a, Vec8db const b) { + a = a ^ b; + return a; +} + + +/***************************************************************************** +* +* Vec16f: Vector of 16 single precision floating point values +* +*****************************************************************************/ + +class Vec16f { +protected: + Vec8f z0; + Vec8f z1; +public: + // Default constructor: + Vec16f() { + } + // Constructor to broadcast the same value into all elements: + Vec16f(float f) { + z0 = z1 = Vec8f(f); + } + // Constructor to build from all elements: + Vec16f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7, + float f8, float f9, float f10, float f11, float f12, float f13, float f14, float f15) { + z0 = Vec8f(f0, f1, f2, f3, f4, f5, f6, f7); + z1 = Vec8f(f8, f9, f10, f11, f12, f13, f14, f15); + } + // Constructor to build from two Vec8f: + Vec16f(Vec8f const a0, Vec8f const a1) { + z0 = a0; + z1 = a1; + } + // split into two halves + Vec8f get_low() const { + return z0; + } + Vec8f get_high() const { + return z1; + } + // Member function to load from array (unaligned) + Vec16f & load(float const * p) { + z0 = Vec8f().load(p); + z1 = Vec8f().load(p+8); + return *this; + } + // Member function to load from array, aligned by 64 + // You may use load_a instead of load if you are certain that p points to an address divisible by 64 + Vec16f & load_a(float const * p) { + z0 = Vec8f().load_a(p); + z1 = Vec8f().load_a(p+8); + return *this; + } + // Member function to store into array (unaligned) + void store(float * p) const { + Vec8f(z0).store(p); + Vec8f(z1).store(p+8); + } + // Member function to store into array, aligned by 64 + // You may use store_a instead of store if you are certain that p points to an address divisible by 64 + void store_a(float 
* p) const { + Vec8f(z0).store_a(p); + Vec8f(z1).store_a(p+8); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 64 + void store_nt(float * p) const { + Vec8f(z0).store_nt(p); + Vec8f(z1).store_nt(p+8); + } + // Partial load. Load n elements and set the rest to 0 + Vec16f & load_partial(int n, float const * p) { + if (n < 8) { + z0 = Vec8f().load_partial(n, p); + z1 = Vec8f(0.f); + } + else { + z0 = Vec8f().load(p); + z1 = Vec8f().load_partial(n-8, p + 8); + } + return *this; + } + // Partial store. Store n elements + void store_partial(int n, float * p) const { + if (n < 8) { + Vec8f(z0).store_partial(n, p); + } + else { + Vec8f(z0).store(p); + Vec8f(z1).store_partial(n-8, p+8); + } + } + // cut off vector to n elements. The last 8-n elements are set to zero + Vec16f & cutoff(int n) { + if (n < 8) { + z0 = Vec8f(z0).cutoff(n); + z1 = Vec8f(0.f); + } + else { + z1 = Vec8f(z1).cutoff(n-8); + } + return *this; + } + // Member function to change a single element in vector + Vec16f const insert(int index, float value) { + if ((uint32_t)index < 8) { + z0 = Vec8f(z0).insert(index, value); + } + else { + z1 = Vec8f(z1).insert(index-8, value); + } + return *this; + } + // Member function extract a single element from vector + float extract(int index) const { + float a[16]; + store(a); + return a[index & 15]; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + float operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 16; + } + static constexpr int elementtype() { + return 16; + } +}; + + +/***************************************************************************** +* +* Operators for Vec16f +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec16f operator + (Vec16f const a, Vec16f const b) { + return Vec16f(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator + : add vector and scalar +static inline Vec16f operator + (Vec16f const a, float b) { + return a + Vec16f(b); +} +static inline Vec16f operator + (float a, Vec16f const b) { + return Vec16f(a) + b; +} + +// vector operator += : add +static inline Vec16f & operator += (Vec16f & a, Vec16f const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec16f operator ++ (Vec16f & a, int) { + Vec16f a0 = a; + a = a + 1.0f; + return a0; +} + +// prefix operator ++ +static inline Vec16f & operator ++ (Vec16f & a) { + a = a + 1.0f; + return a; +} + +// vector operator - : subtract element by element +static inline Vec16f operator - (Vec16f const a, Vec16f const b) { + return Vec16f(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator - : subtract vector and scalar +static inline Vec16f operator - (Vec16f const a, float b) { + return a - Vec16f(b); +} +static inline Vec16f operator - (float a, Vec16f const b) { + return Vec16f(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec16f operator - (Vec16f const a) { + return Vec16f(-a.get_low(), -a.get_high()); +} + +// vector operator -= : subtract +static inline Vec16f & 
operator -= (Vec16f & a, Vec16f const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec16f operator -- (Vec16f & a, int) { + Vec16f a0 = a; + a = a - 1.0f; + return a0; +} + +// prefix operator -- +static inline Vec16f & operator -- (Vec16f & a) { + a = a - 1.0f; + return a; +} + +// vector operator * : multiply element by element +static inline Vec16f operator * (Vec16f const a, Vec16f const b) { + return Vec16f(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator * : multiply vector and scalar +static inline Vec16f operator * (Vec16f const a, float b) { + return a * Vec16f(b); +} +static inline Vec16f operator * (float a, Vec16f const b) { + return Vec16f(a) * b; +} + +// vector operator *= : multiply +static inline Vec16f & operator *= (Vec16f & a, Vec16f const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec16f operator / (Vec16f const a, Vec16f const b) { + return Vec16f(a.get_low() / b.get_low(), a.get_high() / b.get_high()); +} + +// vector operator / : divide vector and scalar +static inline Vec16f operator / (Vec16f const a, float b) { + return a / Vec16f(b); +} +static inline Vec16f operator / (float a, Vec16f const b) { + return Vec16f(a) / b; +} + +// vector operator /= : divide +static inline Vec16f & operator /= (Vec16f & a, Vec16f const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec16fb operator == (Vec16f const a, Vec16f const b) { + return Vec16fb(a.get_low() == b.get_low(), a.get_high() == b.get_high()); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec16fb operator != (Vec16f const a, Vec16f const b) { + return Vec16fb(a.get_low() != b.get_low(), a.get_high() != b.get_high()); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec16fb operator < (Vec16f const a, Vec16f const b) { + return Vec16fb(a.get_low() < b.get_low(), a.get_high() < b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec16fb operator <= (Vec16f const a, Vec16f const b) { + return Vec16fb(a.get_low() <= b.get_low(), a.get_high() <= b.get_high()); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec16fb operator > (Vec16f const a, Vec16f const b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec16fb operator >= (Vec16f const a, Vec16f const b) { + return b <= a; +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec16f operator & (Vec16f const a, Vec16f const b) { + return Vec16f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} + +// vector operator &= : bitwise and +static inline Vec16f & operator &= (Vec16f & a, Vec16f const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec16f and Vec16fb +static inline Vec16f operator & (Vec16f const a, Vec16fb const b) { + return Vec16f(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} +static inline Vec16f operator & (Vec16fb const a, Vec16f const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec16f operator | (Vec16f const a, Vec16f const b) { + return Vec16f(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} + +// vector operator |= : bitwise or +static inline Vec16f & operator |= (Vec16f & a, Vec16f const b) { + a = a | b; 
+ return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16f operator ^ (Vec16f const a, Vec16f const b) { + return Vec16f(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator ^= : bitwise xor +static inline Vec16f & operator ^= (Vec16f & a, Vec16f const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec16fb operator ! (Vec16f const a) { + return Vec16fb(!a.get_low(), !a.get_high()); +} + + +/***************************************************************************** +* +* Functions for Vec16f +* +*****************************************************************************/ + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; +// Each byte in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed. +static inline Vec16f select (Vec16fb const s, Vec16f const a, Vec16f const b) { + return Vec16f(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec16f if_add (Vec16fb const f, Vec16f const a, Vec16f const b) { + return Vec16f(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec16f if_sub (Vec16fb const f, Vec16f const a, Vec16f const b) { + return Vec16f(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec16f if_mul (Vec16fb const f, Vec16f const a, Vec16f const b) { + return Vec16f(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional divide +static inline Vec16f if_div (Vec16fb const f, Vec16f const a, Vec16f const b) { + return Vec16f(if_div(f.get_low(), a.get_low(), b.get_low()), if_div(f.get_high(), a.get_high(), b.get_high())); +} + +// Horizontal add: Calculates the sum of all vector elements. +static inline float horizontal_add (Vec16f const a) { + return horizontal_add(a.get_low() + a.get_high()); +} + +// function max: a > b ? a : b +static inline Vec16f max(Vec16f const a, Vec16f const b) { + return Vec16f(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? 
a : b +static inline Vec16f min(Vec16f const a, Vec16f const b) { + return Vec16f(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} +// NAN-safe versions of maximum and minimum are in vector_convert.h + +// function abs: absolute value +// Removes sign bit, even for -0.0f, -INF and -NAN +static inline Vec16f abs(Vec16f const a) { + return Vec16f(abs(a.get_low()), abs(a.get_high())); +} + +// function sqrt: square root +static inline Vec16f sqrt(Vec16f const a) { + return Vec16f(sqrt(a.get_low()), sqrt(a.get_high())); +} + +// function square: a * a +static inline Vec16f square(Vec16f const a) { + return a * a; +} + +// pow(Vec16f, int): +template static Vec16f pow(Vec16f const a, TT const n); + +// Raise floating point numbers to integer power n +template <> +inline Vec16f pow(Vec16f const x0, int const n) { + return pow_template_i(x0, n); +} + +// allow conversion from unsigned int +template <> +inline Vec16f pow(Vec16f const x0, uint32_t const n) { + return pow_template_i(x0, (int)n); +} + +// Raise floating point numbers to integer power n, where n is a compile-time constant +template +static inline Vec16f pow_n(Vec16f const a) { + if (n < 0) return Vec16f(1.0f) / pow_n<-n>(a); + if (n == 0) return Vec16f(1.0f); + if (n >= 256) return pow(a, n); + Vec16f x = a; // a^(2^i) + Vec16f y; // accumulator + const int lowest = n - (n & (n-1));// lowest set bit in n + if (n & 1) y = x; + if (n < 2) return y; + x = x*x; // x^2 + if (n & 2) { + if (lowest == 2) y = x; else y *= x; + } + if (n < 4) return y; + x = x*x; // x^4 + if (n & 4) { + if (lowest == 4) y = x; else y *= x; + } + if (n < 8) return y; + x = x*x; // x^8 + if (n & 8) { + if (lowest == 8) y = x; else y *= x; + } + if (n < 16) return y; + x = x*x; // x^16 + if (n & 16) { + if (lowest == 16) y = x; else y *= x; + } + if (n < 32) return y; + x = x*x; // x^32 + if (n & 32) { + if (lowest == 32) y = x; else y *= x; + } + if (n < 64) return y; + x = x*x; // x^64 + if (n & 64) { + if (lowest == 64) y = x; else y *= x; + } + if (n < 128) return y; + x = x*x; // x^128 + if (n & 128) { + if (lowest == 128) y = x; else y *= x; + } + return y; +} + +template +static inline Vec16f pow(Vec16f const a, Const_int_t) { + return pow_n(a); +} + + +// function round: round to nearest integer (even). (result as float vector) +static inline Vec16f round(Vec16f const a) { + return Vec16f(round(a.get_low()), round(a.get_high())); +} + +// function truncate: round towards zero. (result as float vector) +static inline Vec16f truncate(Vec16f const a) { + return Vec16f(truncate(a.get_low()), truncate(a.get_high())); +} + +// function floor: round towards minus infinity. (result as float vector) +static inline Vec16f floor(Vec16f const a) { + return Vec16f(floor(a.get_low()), floor(a.get_high())); +} + +// function ceil: round towards plus infinity. (result as float vector) +static inline Vec16f ceil(Vec16f const a) { + return Vec16f(ceil(a.get_low()), ceil(a.get_high())); +} + +// function roundi: round to nearest integer (even). (result as integer vector) +static inline Vec16i roundi(Vec16f const a) { + return Vec16i(roundi(a.get_low()), roundi(a.get_high())); +} +//static inline Vec16i round_to_int(Vec16f const a) {return roundi(a);} // deprecated + +// function truncatei: round towards zero. 
(result as integer vector) +static inline Vec16i truncatei(Vec16f const a) { + return Vec16i(truncatei(a.get_low()), truncatei(a.get_high())); +} +//static inline Vec16i truncate_to_int(Vec16f const a) {return truncatei(a);} // deprecated + +// function to_float: convert integer vector to float vector +static inline Vec16f to_float(Vec16i const a) { + return Vec16f(to_float(a.get_low()), to_float(a.get_high())); +} + +// function to_float: convert unsigned integer vector to float vector +static inline Vec16f to_float(Vec16ui const a) { + return Vec16f(to_float(a.get_low()), to_float(a.get_high())); +} + + +// Approximate math functions + +// approximate reciprocal (Faster than 1.f / a. +// relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512) +static inline Vec16f approx_recipr(Vec16f const a) { + return Vec16f(approx_recipr(a.get_low()), approx_recipr(a.get_high())); +} + +// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). +// Relative accuracy better than 2^-11 without AVX512, 2^-14 with AVX512) +static inline Vec16f approx_rsqrt(Vec16f const a) { + return Vec16f(approx_rsqrt(a.get_low()), approx_rsqrt(a.get_high())); +} + +// Fused multiply and add functions + +// Multiply and add +static inline Vec16f mul_add(Vec16f const a, Vec16f const b, Vec16f const c) { + return Vec16f(mul_add(a.get_low(), b.get_low(), c.get_low()), mul_add(a.get_high(), b.get_high(), c.get_high())); +} + +// Multiply and subtract +static inline Vec16f mul_sub(Vec16f const a, Vec16f const b, Vec16f const c) { + return Vec16f(mul_sub(a.get_low(), b.get_low(), c.get_low()), mul_sub(a.get_high(), b.get_high(), c.get_high())); +} + +// Multiply and inverse subtract +static inline Vec16f nmul_add(Vec16f const a, Vec16f const b, Vec16f const c) { + return Vec16f(nmul_add(a.get_low(), b.get_low(), c.get_low()), nmul_add(a.get_high(), b.get_high(), c.get_high())); +} + +// Multiply and subtract with extra precision on the intermediate calculations, +// even if FMA instructions not supported, using Veltkamp-Dekker split +static inline Vec16f mul_sub_x(Vec16f const a, Vec16f const b, Vec16f const c) { + return Vec16f(mul_sub_x(a.get_low(), b.get_low(), c.get_low()), mul_sub_x(a.get_high(), b.get_high(), c.get_high())); +} + + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 +static inline Vec16i exponent(Vec16f const a) { + return Vec16i(exponent(a.get_low()), exponent(a.get_high())); +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f +static inline Vec16f fraction(Vec16f const a) { + return Vec16f(fraction(a.get_low()), fraction(a.get_high())); +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0f +// n >= 128 gives +INF +// n <= -127 gives 0.0f +// This function will never produce denormals, and never raise exceptions +static inline Vec16f exp2(Vec16i const n) { + return Vec16f(exp2(n.get_low()), exp2(n.get_high())); +} +//static Vec16f exp2(Vec16f const x); // defined in vectormath_exp.h + + +// Categorization functions + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0f, -INF and -NAN +// Note that sign_bit(Vec16f(-0.0f)) gives true, while Vec16f(-0.0f) < Vec16f(0.0f) gives false +// (the underscore in the name avoids a conflict with a macro in 
Intel's mathimf.h) +static inline Vec16fb sign_bit(Vec16f const a) { + return Vec16fb(sign_bit(a.get_low()), sign_bit(a.get_high())); +} + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec16f sign_combine(Vec16f const a, Vec16f const b) { + return Vec16f(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high())); +} + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec16fb is_finite(Vec16f const a) { + return Vec16fb(is_finite(a.get_low()), is_finite(a.get_high())); +} + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec16fb is_inf(Vec16f const a) { + return Vec16fb(is_inf(a.get_low()), is_inf(a.get_high())); +} + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) +static inline Vec16fb is_nan(Vec16f const a) { + return Vec16fb(is_nan(a.get_low()), is_nan(a.get_high())); +} + +// Function is_subnormal: gives true for elements that are denormal (subnormal) +// false for finite numbers, zero, NAN and INF +static inline Vec16fb is_subnormal(Vec16f const a) { + return Vec16fb(is_subnormal(a.get_low()), is_subnormal(a.get_high())); +} + +// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) +// false for finite numbers, NAN and INF +static inline Vec16fb is_zero_or_subnormal(Vec16f const a) { + return Vec16fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high())); +} + +// Function infinite4f: returns a vector where all elements are +INF +static inline Vec16f infinite16f() { + Vec8f inf = infinite8f(); + return Vec16f(inf, inf); +} + +// Function nan4f: returns a vector where all elements are +NAN (quiet) +static inline Vec16f nan16f(int n = 0x10) { + Vec8f nan = nan8f(n); + return Vec16f(nan, nan); +} + +// change signs on vectors Vec16f +// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change +// ("static" is removed from change_sign templates because it seems to generate problems for +// the Clang compiler with nested template calls. "static" is probably superfluous anyway.) 
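/* Editor's note — illustrative sketch, not part of this diff:
   a typical use of the change_sign template declared just below is to negate a fixed
   subset of elements with a compile-time index list, e.g. flipping the sign of every
   odd-numbered element of a Vec16f:

       Vec16f v = ...;
       Vec16f w = change_sign<0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1>(v);
       // w[i] == -v[i] for odd i, w[i] == v[i] for even i

   This is essentially an XOR of the selected sign bits, so it is cheaper than
   multiplying by -1. */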
+template +inline Vec16f change_sign(Vec16f const a) { + return Vec16f(change_sign(a.get_low()), change_sign(a.get_high())); +} + + +/***************************************************************************** +* +* Vec8d: Vector of 8 double precision floating point values +* +*****************************************************************************/ + +class Vec8d { +protected: + Vec4d z0; + Vec4d z1; +public: + // Default constructor: + Vec8d() { + } + // Constructor to broadcast the same value into all elements: + Vec8d(double d) { + z0 = z1 = Vec4d(d); + } + // Constructor to build from all elements: + Vec8d(double d0, double d1, double d2, double d3, double d4, double d5, double d6, double d7) { + z0 = Vec4d(d0, d1, d2, d3); + z1 = Vec4d(d4, d5, d6, d7); + } + // Constructor to build from two Vec4d: + Vec8d(Vec4d const a0, Vec4d const a1) { + z0 = a0; + z1 = a1; + } + // Member function to load from array (unaligned) + Vec8d & load(double const * p) { + z0.load(p); + z1.load(p+4); + return *this; + } + // Member function to load from array, aligned by 64 + // You may use load_a instead of load if you are certain that p points to an address divisible by 64 + Vec8d & load_a(double const * p) { + z0.load_a(p); + z1.load_a(p+4); + return *this; + } + // Member function to store into array (unaligned) + void store(double * p) const { + z0.store(p); + z1.store(p+4); + } + // Member function to store into array, aligned by 64 + // You may use store_a instead of store if you are certain that p points to an address divisible by 64 + void store_a(double * p) const { + z0.store_a(p); + z1.store_a(p+4); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 64 + void store_nt(double * p) const { + z0.store_nt(p); + z1.store_nt(p+4); + } + // Partial load. Load n elements and set the rest to 0 + Vec8d & load_partial(int n, double const * p) { + if (n < 4) { + z0.load_partial(n, p); + z1 = Vec4d(0.); + } + else { + z0.load(p); + z1.load_partial(n-4, p+4); + } + return *this; + } + // Partial store. Store n elements + void store_partial(int n, double * p) const { + if (n < 4) { + z0.store_partial(n, p); + } + else { + z0.store(p); + z1.store_partial(n-4, p+4); + } + } + // cut off vector to n elements. The last 8-n elements are set to zero + Vec8d & cutoff(int n) { + if (n < 4) { + z0.cutoff(n); + z1 = Vec4d(0.); + } + else { + z1.cutoff(n-4); + } + return *this; + } + // Member function to change a single element in vector + Vec8d const insert(int index, double value) { + if ((uint32_t)index < 4) { + z0.insert(index, value); + } + else { + z1.insert(index-4, value); + } + return *this; + } + // Member function extract a single element from vector + double extract(int index) const { + double a[8]; + store(a); + return a[index & 7]; + } + + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
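/* Editor's note — illustrative sketch, not part of this diff:
   the Vec8d memory interface defined above is typically used as a short
   load / modify / store sequence; the buffer name is hypothetical.

       double buf[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
       Vec8d v;
       v.load(buf);              // unaligned load of 8 doubles
       v.cutoff(6);              // keep elements 0..5, set elements 6..7 to zero
       v.store_partial(6, buf);  // write only the first 6 elements back
*/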
+ double operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4d: + Vec4d get_low() const { + return z0; + } + Vec4d get_high() const { + return z1; + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 17; + } +}; + + +/***************************************************************************** +* +* Operators for Vec8d +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec8d operator + (Vec8d const a, Vec8d const b) { + return Vec8d(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator + : add vector and scalar +static inline Vec8d operator + (Vec8d const a, double b) { + return a + Vec8d(b); +} +static inline Vec8d operator + (double a, Vec8d const b) { + return Vec8d(a) + b; +} + +// vector operator += : add +static inline Vec8d & operator += (Vec8d & a, Vec8d const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec8d operator ++ (Vec8d & a, int) { + Vec8d a0 = a; + a = a + 1.0; + return a0; +} + +// prefix operator ++ +static inline Vec8d & operator ++ (Vec8d & a) { + a = a + 1.0; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8d operator - (Vec8d const a, Vec8d const b) { + return Vec8d(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator - : subtract vector and scalar +static inline Vec8d operator - (Vec8d const a, double b) { + return a - Vec8d(b); +} +static inline Vec8d operator - (double a, Vec8d const b) { + return Vec8d(a) - b; +} + +// vector operator - : unary minus +// Change sign bit, even for 0, INF and NAN +static inline Vec8d operator - (Vec8d const a) { + return Vec8d(-a.get_low(), -a.get_high()); +} + +// vector operator -= : subtract +static inline Vec8d & operator -= (Vec8d & a, Vec8d const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec8d operator -- (Vec8d & a, int) { + Vec8d a0 = a; + a = a - 1.0; + return a0; +} + +// prefix operator -- +static inline Vec8d & operator -- (Vec8d & a) { + a = a - 1.0; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8d operator * (Vec8d const a, Vec8d const b) { + return Vec8d(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator * : multiply vector and scalar +static inline Vec8d operator * (Vec8d const a, double b) { + return a * Vec8d(b); +} +static inline Vec8d operator * (double a, Vec8d const b) { + return Vec8d(a) * b; +} + +// vector operator *= : multiply +static inline Vec8d & operator *= (Vec8d & a, Vec8d const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +static inline Vec8d operator / (Vec8d const a, Vec8d const b) { + return Vec8d(a.get_low() / b.get_low(), a.get_high() / b.get_high()); +} + +// vector operator / : divide vector and scalar +static inline Vec8d operator / (Vec8d const a, double b) { + return a / Vec8d(b); +} +static inline Vec8d operator / (double a, Vec8d const b) { + return Vec8d(a) / b; +} + +// vector operator /= : divide +static inline Vec8d & operator /= (Vec8d & a, Vec8d const b) { + a = a / b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec8db operator == (Vec8d const a, Vec8d const b) { + return Vec8db(a.get_low() == b.get_low(), a.get_high() == b.get_high()); +} + +// vector 
operator != : returns true for elements for which a != b +static inline Vec8db operator != (Vec8d const a, Vec8d const b) { + return Vec8db(a.get_low() != b.get_low(), a.get_high() != b.get_high()); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec8db operator < (Vec8d const a, Vec8d const b) { + return Vec8db(a.get_low() < b.get_low(), a.get_high() < b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b +static inline Vec8db operator <= (Vec8d const a, Vec8d const b) { + return Vec8db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high()); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec8db operator > (Vec8d const a, Vec8d const b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b +static inline Vec8db operator >= (Vec8d const a, Vec8d const b) { + return b <= a; +} + +// Bitwise logical operators + +// vector operator & : bitwise and +static inline Vec8d operator & (Vec8d const a, Vec8d const b) { + return Vec8d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} + +// vector operator &= : bitwise and +static inline Vec8d & operator &= (Vec8d & a, Vec8d const b) { + a = a & b; + return a; +} + +// vector operator & : bitwise and of Vec8d and Vec8db +static inline Vec8d operator & (Vec8d const a, Vec8db const b) { + return Vec8d(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} + +static inline Vec8d operator & (Vec8db const a, Vec8d const b) { + return b & a; +} + +// vector operator | : bitwise or +static inline Vec8d operator | (Vec8d const a, Vec8d const b) { + return Vec8d(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} + +// vector operator |= : bitwise or +static inline Vec8d & operator |= (Vec8d & a, Vec8d const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8d operator ^ (Vec8d const a, Vec8d const b) { + return Vec8d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator ^= : bitwise xor +static inline Vec8d & operator ^= (Vec8d & a, Vec8d const b) { + a = a ^ b; + return a; +} + +// vector operator ! : logical not. Returns Boolean vector +static inline Vec8db operator ! (Vec8d const a) { + return Vec8db(!a.get_low(), !a.get_high()); +} + +/***************************************************************************** +* +* Functions for Vec8d +* +*****************************************************************************/ + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8d select (Vec8db const s, Vec8d const a, Vec8d const b) { + return Vec8d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] +static inline Vec8d if_add (Vec8db const f, Vec8d const a, Vec8d const b) { + return Vec8d(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec8d if_sub (Vec8db const f, Vec8d const a, Vec8d const b) { + return Vec8d(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec8d if_mul (Vec8db const f, Vec8d const a, Vec8d const b) { + return Vec8d(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional divide +static inline Vec8d if_div (Vec8db const f, Vec8d const a, Vec8d const b) { + return Vec8d(if_div(f.get_low(), a.get_low(), b.get_low()), if_div(f.get_high(), a.get_high(), b.get_high())); +} + +// General arithmetic functions, etc. + +// Horizontal add: Calculates the sum of all vector elements. +static inline double horizontal_add (Vec8d const a) { + return horizontal_add(a.get_low() + a.get_high()); +} + +// function max: a > b ? a : b +static inline Vec8d max(Vec8d const a, Vec8d const b) { + return Vec8d(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? a : b +static inline Vec8d min(Vec8d const a, Vec8d const b) { + return Vec8d(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} +// NAN-safe versions of maximum and minimum are in vector_convert.h + +// function abs: absolute value +// Removes sign bit, even for -0.0f, -INF and -NAN +static inline Vec8d abs(Vec8d const a) { + return Vec8d(abs(a.get_low()), abs(a.get_high())); +} + +// function sqrt: square root +static inline Vec8d sqrt(Vec8d const a) { + return Vec8d(sqrt(a.get_low()), sqrt(a.get_high())); +} + +// function square: a * a +static inline Vec8d square(Vec8d const a) { + return a * a; +} + +// pow(Vec8d, int): +template static Vec8d pow(Vec8d const a, TT const n); + +// Raise floating point numbers to integer power n +template <> +inline Vec8d pow(Vec8d const x0, int const n) { + return pow_template_i(x0, n); +} + +// allow conversion from unsigned int +template <> +inline Vec8d pow(Vec8d const x0, uint32_t const n) { + return pow_template_i(x0, (int)n); +} + + +// Raise floating point numbers to integer power n, where n is a compile-time constant +template +static inline Vec8d pow_n(Vec8d const a) { + if (n < 0) return Vec8d(1.0) / pow_n<-n>(a); + if (n == 0) return Vec8d(1.0); + if (n >= 256) return pow(a, n); + Vec8d x = a; // a^(2^i) + Vec8d y; // accumulator + const int lowest = n - (n & (n-1));// lowest set bit in n + if (n & 1) y = x; + if (n < 2) return y; + x = x*x; // x^2 + if (n & 2) { + if (lowest == 2) y = x; else y *= x; + } + if (n < 4) return y; + x = x*x; // x^4 + if (n & 4) { + if (lowest == 4) y = x; else y *= x; + } + if (n < 8) return y; + x = x*x; // x^8 + if (n & 8) { + if (lowest == 8) y = x; else y *= x; + } + if (n < 16) return y; + x = x*x; // x^16 + if (n & 16) { + if (lowest == 16) y = x; else y *= x; + } + if (n < 32) return y; + x = x*x; // x^32 + if (n & 32) { + if (lowest == 32) y = x; else y *= x; + } + if (n < 64) return y; + x = x*x; // x^64 + if (n & 64) { + if (lowest == 64) y = x; else y *= x; + } + if (n < 128) return y; + x = x*x; // x^128 + if (n & 128) { + if (lowest == 128) y = x; else y *= x; + } + return y; +} + +template +static inline Vec8d pow(Vec8d const a, Const_int_t) { + return pow_n(a); +} + + +// function round: round to nearest integer 
(even). (result as double vector) +static inline Vec8d round(Vec8d const a) { + return Vec8d(round(a.get_low()), round(a.get_high())); +} + +// function truncate: round towards zero. (result as double vector) +static inline Vec8d truncate(Vec8d const a) { + return Vec8d(truncate(a.get_low()), truncate(a.get_high())); +} + +// function floor: round towards minus infinity. (result as double vector) +static inline Vec8d floor(Vec8d const a) { + return Vec8d(floor(a.get_low()), floor(a.get_high())); +} + +// function ceil: round towards plus infinity. (result as double vector) +static inline Vec8d ceil(Vec8d const a) { + return Vec8d(ceil(a.get_low()), ceil(a.get_high())); +} + +// function round_to_int32: round to nearest integer (even). (result as integer vector) +static inline Vec8i round_to_int32(Vec8d const a) { + // Note: assume MXCSR control register is set to rounding + return Vec8i(round_to_int32(a.get_low()), round_to_int32(a.get_high())); +} +//static inline Vec8i round_to_int(Vec8d const a) {return round_to_int32(a);} // deprecated + +// function truncate_to_int32: round towards zero. (result as integer vector) +static inline Vec8i truncate_to_int32(Vec8d const a) { + return Vec8i(truncate_to_int32(a.get_low()), truncate_to_int32(a.get_high())); +} +//static inline Vec8i truncate_to_int(Vec8d const a) {return truncate_to_int32(a);} // deprecated + +// function truncatei: round towards zero. (inefficient) +static inline Vec8q truncatei(Vec8d const a) { + return Vec8q(truncatei(a.get_low()), truncatei(a.get_high())); +} +//static inline Vec8q truncate_to_int64(Vec8d const a) {return truncatei(a);} // deprecated + +// function roundi: round to nearest or even. (inefficient) +static inline Vec8q roundi(Vec8d const a) { + return Vec8q(roundi(a.get_low()), roundi(a.get_high())); +} +//static inline Vec8q round_to_int64(Vec8d const a) {return roundi(a);} // deprecated + +// function to_double: convert integer vector elements to double vector (inefficient) +static inline Vec8d to_double(Vec8q const a) { + return Vec8d(to_double(a.get_low()), to_double(a.get_high())); +} + +// function to_double: convert unsigned integer vector elements to double vector (inefficient) +static inline Vec8d to_double(Vec8uq const a) { + return Vec8d(to_double(a.get_low()), to_double(a.get_high())); +} + +// function to_double: convert integer vector to double vector +static inline Vec8d to_double(Vec8i const a) { + return Vec8d(to_double(a.get_low()), to_double(a.get_high())); +} + +// function compress: convert two Vec8d to one Vec16f +static inline Vec16f compress (Vec8d const low, Vec8d const high) { + return Vec16f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high())); +} + +// Function extend_low : convert Vec16f vector elements 0 - 3 to Vec8d +static inline Vec8d extend_low(Vec16f const a) { + return Vec8d(extend_low(a.get_low()), extend_high(a.get_low())); +} + +// Function extend_high : convert Vec16f vector elements 4 - 7 to Vec8d +static inline Vec8d extend_high (Vec16f const a) { + return Vec8d(extend_low(a.get_high()), extend_high(a.get_high())); +} + +// Fused multiply and add functions + +// Multiply and add +static inline Vec8d mul_add(Vec8d const a, Vec8d const b, Vec8d const c) { + return Vec8d(mul_add(a.get_low(), b.get_low(), c.get_low()), mul_add(a.get_high(), b.get_high(), c.get_high())); +} + +// Multiply and subtract +static inline Vec8d mul_sub(Vec8d const a, Vec8d const b, Vec8d const c) { + return Vec8d(mul_sub(a.get_low(), b.get_low(), c.get_low()), 
mul_sub(a.get_high(), b.get_high(), c.get_high())); +} + +// Multiply and inverse subtract +static inline Vec8d nmul_add(Vec8d const a, Vec8d const b, Vec8d const c) { + return Vec8d(nmul_add(a.get_low(), b.get_low(), c.get_low()), nmul_add(a.get_high(), b.get_high(), c.get_high())); +} + +// Multiply and subtract with extra precision on the intermediate calculations, +// even if FMA instructions not supported, using Veltkamp-Dekker split +static inline Vec8d mul_sub_x(Vec8d const a, Vec8d const b, Vec8d const c) { + return Vec8d(mul_sub_x(a.get_low(), b.get_low(), c.get_low()), mul_sub_x(a.get_high(), b.get_high(), c.get_high())); +} + +// Math functions using fast bit manipulation + +// Extract the exponent as an integer +// exponent(a) = floor(log2(abs(a))); +// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 +static inline Vec8q exponent(Vec8d const a) { + return Vec8q(exponent(a.get_low()), exponent(a.get_high())); +} + +// Extract the fraction part of a floating point number +// a = 2^exponent(a) * fraction(a), except for a = 0 +// fraction(1.0) = 1.0, fraction(5.0) = 1.25 +static inline Vec8d fraction(Vec8d const a) { + return Vec8d(fraction(a.get_low()), fraction(a.get_high())); +} + +// Fast calculation of pow(2,n) with n integer +// n = 0 gives 1.0 +// n >= 1024 gives +INF +// n <= -1023 gives 0.0 +// This function will never produce denormals, and never raise exceptions +static inline Vec8d exp2(Vec8q const n) { + return Vec8d(exp2(n.get_low()), exp2(n.get_high())); +} +//static Vec8d exp2(Vec8d const x); // defined in vectormath_exp.h + + +// Categorization functions + +// Function sign_bit: gives true for elements that have the sign bit set +// even for -0.0, -INF and -NAN +// Note that sign_bit(Vec8d(-0.0)) gives true, while Vec8d(-0.0) < Vec8d(0.0) gives false +static inline Vec8db sign_bit(Vec8d const a) { + return Vec8db(sign_bit(a.get_low()), sign_bit(a.get_high())); +} + +// Function sign_combine: changes the sign of a when b has the sign bit set +// same as select(sign_bit(b), -a, a) +static inline Vec8d sign_combine(Vec8d const a, Vec8d const b) { + return Vec8d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high())); +} + +// Function is_finite: gives true for elements that are normal, denormal or zero, +// false for INF and NAN +static inline Vec8db is_finite(Vec8d const a) { + return Vec8db(is_finite(a.get_low()), is_finite(a.get_high())); +} + +// Function is_inf: gives true for elements that are +INF or -INF +// false for finite numbers and NAN +static inline Vec8db is_inf(Vec8d const a) { + return Vec8db(is_inf(a.get_low()), is_inf(a.get_high())); +} + +// Function is_nan: gives true for elements that are +NAN or -NAN +// false for finite numbers and +/-INF +static inline Vec8db is_nan(Vec8d const a) { + return Vec8db(is_nan(a.get_low()), is_nan(a.get_high())); +} + +// Function is_subnormal: gives true for elements that are denormal (subnormal) +// false for finite numbers, zero, NAN and INF +static inline Vec8db is_subnormal(Vec8d const a) { + return Vec8db(is_subnormal(a.get_low()), is_subnormal(a.get_high())); +} + +// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) +// false for finite numbers, NAN and INF +static inline Vec8db is_zero_or_subnormal(Vec8d const a) { + return Vec8db(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high())); +} + +// Function infinite2d: returns a vector where all elements are +INF +static inline Vec8d 
infinite8d() {
+    Vec4d inf = infinite4d();
+    return Vec8d(inf, inf);
+}
+
+// Function nan8d: returns a vector where all elements are +NAN (quiet NAN)
+static inline Vec8d nan8d(int n = 0x10) {
+    Vec4d nan = nan4d(n);
+    return Vec8d(nan, nan);
+}
+
+// change signs on vectors Vec8d
+// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+inline Vec8d change_sign(Vec8d const a) {
+    return Vec8d(change_sign<i0, i1, i2, i3>(a.get_low()), change_sign<i4, i5, i6, i7>(a.get_high()));
+}
+
+
+/*****************************************************************************
+*
+*          Functions for reinterpretation between vector types
+*
+*****************************************************************************/
+
+static inline Vec512b reinterpret_i (Vec512b const x) {
+    return x;
+}
+
+static inline Vec512b reinterpret_i (Vec16f const x) {
+    return Vec512b(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
+}
+
+static inline Vec512b reinterpret_i (Vec8d const x) {
+    return Vec512b(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
+}
+
+static inline Vec16f reinterpret_f (Vec512b const x) {
+    return Vec16f(Vec8f(reinterpret_f(x.get_low())), Vec8f(reinterpret_f(x.get_high())));
+}
+
+static inline Vec16f reinterpret_f (Vec16f const x) {
+    return x;
+}
+
+static inline Vec16f reinterpret_f (Vec8d const x) {
+    return Vec16f(Vec8f(reinterpret_f(x.get_low())), Vec8f(reinterpret_f(x.get_high())));
+}
+
+static inline Vec8d reinterpret_d (Vec512b const x) {
+    return Vec8d(Vec4d(reinterpret_d(x.get_low())), Vec4d(reinterpret_d(x.get_high())));
+}
+
+static inline Vec8d reinterpret_d (Vec16f const x) {
+    return Vec8d(Vec4d(reinterpret_d(x.get_low())), Vec4d(reinterpret_d(x.get_high())));
+}
+
+static inline Vec8d reinterpret_d (Vec8d const x) {
+    return x;
+}
+
+
+/*****************************************************************************
+*
+*          Vector permute functions
+*
+******************************************************************************
+*
+*          These permute functions can reorder the elements of a vector and optionally
+*          set some elements to zero. See Vectori128.h for description
+*
+*****************************************************************************/
+
+// Permute vector of 8 double
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d permute8(Vec8d const a) {
+    return Vec8d(blend4<i0, i1, i2, i3> (a.get_low(), a.get_high()),
+                 blend4<i4, i5, i6, i7> (a.get_low(), a.get_high()));
+}
+
+// Permute vector of 16 float
+// Index -1 gives 0, index -256 means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f permute16(Vec16f const a) {
+    return Vec16f(blend8<i0, i1, i2, i3, i4, i5, i6, i7> (a.get_low(), a.get_high()),
+                  blend8<i8, i9, i10, i11, i12, i13, i14, i15> (a.get_low(), a.get_high()));
+}
+
+
+/*****************************************************************************
+*
+*          Vector blend functions
+*
+*****************************************************************************/
+
+// blend vectors Vec8d
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8d blend8(Vec8d const a, Vec8d const b) {
+    Vec4d x0 = blend_half<Vec8d, i0, i1, i2, i3>(a, b);
+    Vec4d x1 = blend_half<Vec8d, i4, i5, i6, i7>(a, b);
+    return Vec8d(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16f blend16(Vec16f const a, Vec16f const b) {
+    Vec8f x0 = blend_half<Vec16f, i0, i1, i2, i3, i4, i5, i6, i7>(a, b);
+    Vec8f x1 = blend_half<Vec16f, i8, i9, i10, i11, i12, i13, i14, i15>(a, b);
+    return Vec16f(x0, x1);
+}
+
+
+/*****************************************************************************
+*
+*          Vector lookup functions
+*
+******************************************************************************
+*
+*          These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array. +* +*****************************************************************************/ + +static inline Vec16f lookup16(Vec16i const index, Vec16f const table) { + float tab[16]; + table.store(tab); + Vec8f t0 = reinterpret_f(lookup<16>(index.get_low(), tab)); + Vec8f t1 = reinterpret_f(lookup<16>(index.get_high(), tab)); + return Vec16f(t0, t1); +} + +template +static inline Vec16f lookup(Vec16i const index, float const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 16) return lookup16(index, Vec16f().load(table)); + // n > 16. Limit index + Vec16ui i1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + i1 = Vec16ui(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + i1 = min(Vec16ui(index), n-1); + } + float const * t = table; + return Vec16f(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]], + t[i1[8]],t[i1[9]],t[i1[10]],t[i1[11]],t[i1[12]],t[i1[13]],t[i1[14]],t[i1[15]]); +} + +static inline Vec8d lookup8(Vec8q const index, Vec8d const table) { + double tab[8]; + table.store(tab); + Vec4d t0 = reinterpret_d(lookup<8>(index.get_low(), tab)); + Vec4d t1 = reinterpret_d(lookup<8>(index.get_high(), tab)); + return Vec8d(t0, t1); +} + +template +static inline Vec8d lookup(Vec8q const index, double const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) { + return lookup8(index, Vec8d().load(table)); + } + // n > 8. Limit index + Vec8uq i1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + i1 = Vec8uq(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + i1 = min(Vec8uq(index), n-1); + } + double const * t = table; + return Vec8d(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]]); +} + +/***************************************************************************** +* +* Gather functions with fixed indexes +* +*****************************************************************************/ + +// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15 +template +static inline Vec16f gather16f(void const * a) { + int constexpr indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 15) { + // load one contiguous block and permute + if constexpr (imax > 15) { + // make sure we don't read past the end of the array + Vec16f b = Vec16f().load((float const *)a + imax-15); + return permute16 (b); + } + else { + Vec16f b = Vec16f().load((float const *)a + imin); + return permute16 (b); + } + } + if constexpr ((i0imax-16) && (i1imax-16) && (i2imax-16) && (i3imax-16) + && (i4imax-16) && (i5imax-16) && (i6imax-16) && (i7imax-16) + && (i8imax-16) && (i9imax-16) && (i10imax-16) && (i11imax-16) + && (i12imax-16) && (i13imax-16) && (i14imax-16) && (i15imax-16) ) { + // load two contiguous blocks and blend + Vec16f b = Vec16f().load((float const *)a + imin); + Vec16f c = Vec16f().load((float const *)a + imax-15); + const int j0 = i0 (b, c); + } + // use lookup function + return lookup(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const float *)a); +} + + +template +static inline Vec8d gather8d(void const * a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + constexpr int 
imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 7) { + // load one contiguous block and permute + if constexpr (imax > 7) { + // make sure we don't read past the end of the array + Vec8d b = Vec8d().load((double const *)a + imax-7); + return permute8 (b); + } + else { + Vec8d b = Vec8d().load((double const *)a + imin); + return permute8 (b); + } + } + if constexpr ((i0imax-8) && (i1imax-8) && (i2imax-8) && (i3imax-8) + && (i4imax-8) && (i5imax-8) && (i6imax-8) && (i7imax-8)) { + // load two contiguous blocks and blend + Vec8d b = Vec8d().load((double const *)a + imin); + Vec8d c = Vec8d().load((double const *)a + imax-7); + const int j0 = i0(b, c); + } + // use lookup function + return lookup(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const double *)a); +} + + +/***************************************************************************** +* +* Vector scatter functions +* +****************************************************************************** +* +* These functions write the elements of a vector to arbitrary positions in an +* array in memory. Each vector element is written to an array position +* determined by an index. An element is not written if the corresponding +* index is out of range. +* The indexes can be specified as constant template parameters or as an +* integer vector. +* +*****************************************************************************/ + +template + static inline void scatter(Vec16f const data, float * array) { + const int index[16] = {i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}; + for (int i = 0; i < 16; i++) { + if (index[i] >= 0) array[index[i]] = data[i]; + } +} + +template +static inline void scatter(Vec8d const data, double * array) { + const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7}; + for (int i = 0; i < 8; i++) { + if (index[i] >= 0) array[index[i]] = data[i]; + } +} + +// Scatter functions with variable indexes: + +static inline void scatter(Vec16i const index, uint32_t limit, Vec16f const data, float * destination) { + uint32_t ix[16]; index.store(ix); + for (int i = 0; i < 16; i++) { + if (ix[i] < limit) destination[ix[i]] = data[i]; + } +} + +static inline void scatter(Vec8q const index, uint32_t limit, Vec8d const data, double * destination) { + uint64_t ix[8]; index.store(ix); + for (int i = 0; i < 8; i++) { + if (ix[i] < limit) destination[ix[i]] = data[i]; + } +} + +static inline void scatter(Vec8i const index, uint32_t limit, Vec8d const data, double * destination) { + uint32_t ix[8]; index.store(ix); + for (int i = 0; i < 8; i++) { + if (ix[i] < limit) destination[ix[i]] = data[i]; + } +} + + +/***************************************************************************** +* +* Boolean <-> bitfield conversion functions +* +*****************************************************************************/ + +// to_bits: convert boolean vector to integer bitfield +static inline uint16_t to_bits(Vec16fb const x) { + return to_bits(Vec16ib(x)); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec8db const x) { + return to_bits(Vec8qb(x)); +} + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORF512E_H diff --git a/DFTTest/VCL2/vectori128.h b/DFTTest/VCL2/vectori128.h new file mode 100644 index 0000000..a3edef1 --- /dev/null +++ b/DFTTest/VCL2/vectori128.h @@ -0,0 +1,7112 @@ +/**************************** vectori128.h ******************************* +* Author: Agner Fog +* Date created: 
2012-05-30 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library +* Description: +* Header file defining 128-bit integer vector classes +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec128b Vector of 128 bits. Used internally as base class +* Vec16c Vector of 16 8-bit signed integers +* Vec16uc Vector of 16 8-bit unsigned integers +* Vec16cb Vector of 16 Booleans for use with Vec16c and Vec16uc +* Vec8s Vector of 8 16-bit signed integers +* Vec8us Vector of 8 16-bit unsigned integers +* Vec8sb Vector of 8 Booleans for use with Vec8s and Vec8us +* Vec4i Vector of 4 32-bit signed integers +* Vec4ui Vector of 4 32-bit unsigned integers +* Vec4ib Vector of 4 Booleans for use with Vec4i and Vec4ui +* Vec2q Vector of 2 64-bit signed integers +* Vec2uq Vector of 2 64-bit unsigned integers +* Vec2qb Vector of 2 Booleans for use with Vec2q and Vec2uq +* +* Each vector object is represented internally in the CPU as a 128-bit register. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +*****************************************************************************/ + +#ifndef VECTORI128_H +#define VECTORI128_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +#ifdef VCL_NAMESPACE // optional namespace +namespace VCL_NAMESPACE { +#endif + + +// Generate a constant vector of 4 integers stored in memory. +template +static inline constexpr __m128i constant4ui() { + /* + const union { + uint32_t i[4]; + __m128i xmm; + } u = { {i0,i1,i2,i3} }; + return u.xmm; + */ + return _mm_setr_epi32(i0, i1, i2, i3); +} + + +/***************************************************************************** +* +* Compact boolean vectors +* +*****************************************************************************/ +#if INSTRSET >= 9 +class Vec8b; // allow forward reference to Vec8b + +#if INSTRSET == 9 && MAX_VECTOR_SIZE >= 512 // special case of mixed compact and broad vectors +class Vec8ib; +class Vec8fb; +class Vec4qb; +class Vec4db; +#endif + +// Compact vector of 16 booleans +class Vec16b { +protected: + __mmask16 mm; // Boolean mask register +public: + // Default constructor: + Vec16b() { + } + // Constructor to convert from type __mmask16 used in intrinsics + Vec16b(__mmask16 x) { + mm = x; + } + // Constructor to build from all elements: + Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, + bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) { + mm = uint16_t( + (uint16_t)b0 | (uint16_t)b1 << 1 | (uint16_t)b2 << 2 | (uint16_t)b3 << 3 | + (uint16_t)b4 << 4 | (uint16_t)b5 << 5 | (uint16_t)b6 << 6 | (uint16_t)b7 << 7 | + (uint16_t)b8 << 8 | (uint16_t)b9 << 9 | (uint16_t)b10 << 10 | (uint16_t)b11 << 11 | + (uint16_t)b12 << 12 | (uint16_t)b13 << 13 | (uint16_t)b14 << 14 | (uint16_t)b15 << 15); + } + // Constructor to broadcast single value: + Vec16b(bool b) { + mm = __mmask16(-int16_t(b)); + } + // Constructor to make from two halves. 
Implemented below after declaration of Vec8b + inline Vec16b(Vec8b const x0, Vec8b const x1); +#if INSTRSET == 9 && MAX_VECTOR_SIZE >= 512 // special case of mixed compact and broad vectors + inline Vec16b(Vec8ib const x0, Vec8ib const x1); // in vectorf512.h + inline Vec16b(Vec8fb const x0, Vec8fb const x1); // in vectorf512.h +#endif + + // Assignment operator to convert from type __mmask16 used in intrinsics: + Vec16b & operator = (__mmask16 x) { + mm = x; + return *this; + } + // Assignment operator to broadcast scalar value: + Vec16b & operator = (bool b) { + mm = Vec16b(b); + return *this; + } + // Type cast operator to convert to __mmask16 used in intrinsics + operator __mmask16() const { + return mm; + } + // split into two halves +#if INSTRSET >= 10 + Vec8b get_low() const; + Vec8b get_high() const; +#elif INSTRSET == 9 && MAX_VECTOR_SIZE >= 512 // special case of mixed compact and broad vectors + Vec8ib get_low() const; // in vectorf512.h + Vec8ib get_high() const; // in vectorf512.h +#endif + // Member function to change a single element in vector + Vec16b const insert(int index, bool value) { + mm = __mmask16(((uint16_t)mm & ~(1 << index)) | (int)value << index); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return ((uint32_t)mm >> index) & 1; + } + // Extract a single element. Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec16b & load_bits(uint16_t a) { + mm = __mmask16(a); + return *this; + } + // Number of elements + static constexpr int size() { + return 16; + } + // Type of elements + static constexpr int elementtype() { + return 2; + } + // I would like to prevent implicit conversion from int, but this is + // not possible because __mmask16 and int16_t are treated as the same type: + // Vec16b(int b) = delete; + // Vec16b & operator = (int x) = delete; +}; + +#if INSTRSET >= 10 +class Vec2b; +class Vec4b; +#endif + +// Compact vector of 8 booleans +class Vec8b { +#if INSTRSET < 10 + // There is a problem in the case where we have AVX512F but not AVX512DQ: + // We have 8-bit masks, but 8-bit mask operations (KMOVB, KANDB, etc.) require AVX512DQ. + // We have to use 16-bit mask operations on 8-bit masks (KMOVW, KANDW, etc.). + // I don't know if this is necessary, but I am using __mmask16 rather than __mmask8 + // in this case to avoid that the compiler generates 8-bit mask instructions. + // We may get warnings in MS compiler when using __mmask16 on intrinsic functions + // that require __mmask8, but I would rather have warnings than code that crashes. 
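/* Editor's note — illustrative sketch, not part of this diff:
   the practical consequence of the workaround described above is that, when only
   AVX512F is available (INSTRSET == 9), the Vec8b operators defined further down
   fall back to 16-bit mask intrinsics, e.g.

       Vec8b a, b;
       Vec8b c = a & b;   // INSTRSET >= 10: plain __mmask8 AND
                          // INSTRSET == 9:  _mm512_kand on __mmask16 (KANDW)

   Only the low 8 bits of the mask are meaningful in either case. */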
+ #define Vec8b_masktype __mmask16 +#else + #define Vec8b_masktype __mmask8 +#endif +protected: + Vec8b_masktype mm; // Boolean mask register +public: + // Default constructor: + Vec8b() { + } + // Constructor to convert from type __mmask8 used in intrinsics + Vec8b(__mmask8 x) { + mm = __mmask8(x); + } + // Constructor to convert from type __mmask16 used in intrinsics + Vec8b(__mmask16 x) { + mm = Vec8b_masktype(x); + } + // Constructor to make from two halves +#if INSTRSET >= 10 + inline Vec8b(Vec4b const x0, Vec4b const x1); // Implemented below after declaration of Vec4b +#elif INSTRSET == 9 && MAX_VECTOR_SIZE >= 512 // special case of mixed compact and broad vectors + inline Vec8b(Vec4qb const x0, Vec4qb const x1); // in vectorf512.h + inline Vec8b(Vec4db const x0, Vec4db const x1); // in vectorf512.h +#endif + + // Assignment operator to convert from type __mmask16 used in intrinsics: + Vec8b & operator = (Vec8b_masktype x) { + mm = Vec8b_masktype(x); + return *this; + } + // Constructor to build from all elements: + Vec8b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) { + mm = uint8_t( + (uint8_t)b0 | (uint8_t)b1 << 1 | (uint8_t)b2 << 2 | (uint8_t)b3 << 3 | + (uint8_t)b4 << 4 | (uint8_t)b5 << 5 | (uint8_t)b6 << 6 | (uint8_t)b7 << 7); + } + // Constructor to broadcast single value: + Vec8b(bool b) { + mm = Vec8b_masktype(-int16_t(b)); + } + // Assignment operator to broadcast scalar value: + Vec8b & operator = (bool b) { + mm = Vec8b_masktype(Vec8b(b)); + return *this; + } + // Type cast operator to convert to __mmask16 used in intrinsics + operator Vec8b_masktype() const { + return mm; + } + // split into two halves +#if INSTRSET >= 10 + Vec4b get_low() const; + Vec4b get_high() const; +#elif INSTRSET == 9 && MAX_VECTOR_SIZE >= 512 // special case of mixed compact and broad vectors + Vec4qb get_low() const; // in vectorf512.h + Vec4qb get_high() const; // in vectorf512.h +#endif + // Member function to change a single element in vector + Vec8b const insert(int index, bool value) { + mm = Vec8b_masktype(((uint8_t)mm & ~(1 << index)) | (int)value << index); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return ((uint32_t)mm >> index) & 1; + } + // Extract a single element. Operator [] can only read an element, not write. 
+ bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec8b & load_bits(uint8_t a) { + mm = Vec8b_masktype(a); + return *this; + } + // Number of elements + static constexpr int size() { + return 8; + } + // Type of elements + static constexpr int elementtype() { + return 2; + } +}; + +// Members of Vec16b that refer to Vec8b: +inline Vec16b::Vec16b(Vec8b const x0, Vec8b const x1) { + mm = uint8_t(x0) | uint16_t(x1) << 8; +} +#if INSTRSET >= 10 +inline Vec8b Vec16b::get_low() const { + return Vec8b().load_bits(uint8_t(mm)); +} +inline Vec8b Vec16b::get_high() const { + return Vec8b().load_bits(uint8_t((uint16_t)mm >> 8u)); +} +#endif + +#endif // INSTRSET >= 9 + +#if INSTRSET >= 10 +class Vec4b : public Vec8b { +public: + // Default constructor: + Vec4b() { + } + // Constructor to make from two halves + inline Vec4b(Vec2b const x0, Vec2b const x1); // Implemented below after declaration of Vec4b + + // Constructor to convert from type __mmask8 used in intrinsics + Vec4b(__mmask8 x) { + mm = x; + } + // Assignment operator to convert from type __mmask16 used in intrinsics: + Vec4b & operator = (__mmask8 x) { + mm = x; + return *this; + } + // Constructor to build from all elements: + Vec4b(bool b0, bool b1, bool b2, bool b3) { + mm = (uint8_t)b0 | (uint8_t)b1 << 1 | (uint8_t)b2 << 2 | (uint8_t)b3 << 3; + } + // Constructor to broadcast single value: + Vec4b(bool b) { + mm = -int8_t(b) & 0x0F; + } + // Assignment operator to broadcast scalar value: + Vec4b & operator = (bool b) { + mm = Vec4b(b); + return *this; + } + // split into two halves + Vec2b get_low() const; // Implemented below after declaration of Vec4b + Vec2b get_high() const; // Implemented below after declaration of Vec4b + + // Member function to change a bitfield to a boolean vector + Vec4b & load_bits(uint8_t a) { + mm = a & 0x0F; + return *this; + } + // Number of elements + static constexpr int size() { + return 4; + } +}; + +class Vec2b : public Vec8b { +public: + // Default constructor: + Vec2b() { + } + // Constructor to convert from type __mmask8 used in intrinsics + Vec2b(__mmask8 x) { + mm = x; + } + // Assignment operator to convert from type __mmask16 used in intrinsics: + Vec2b & operator = (__mmask8 x) { + mm = x; + return *this; + } + // Constructor to build from all elements: + Vec2b(bool b0, bool b1) { + mm = (uint8_t)b0 | (uint8_t)b1 << 1; + } + // Constructor to broadcast single value: + Vec2b(bool b) { + mm = -int8_t(b) & 0x03; + } + // Assignment operator to broadcast scalar value: + Vec2b & operator = (bool b) { + mm = Vec2b(b); + return *this; + } + // Member function to change a bitfield to a boolean vector + Vec2b & load_bits(uint8_t a) { + mm = a & 0x03; + return *this; + } + // Number of elements + static constexpr int size() { + return 2; + } +}; + +// Members of Vec8b that refer to Vec4b: +inline Vec8b::Vec8b(Vec4b const x0, Vec4b const x1) { + mm = (uint8_t(x0) & 0x0F) | (uint8_t(x1) << 4); +} +inline Vec4b Vec8b::get_low() const { + return Vec4b().load_bits(mm & 0xF); +} +inline Vec4b Vec8b::get_high() const { + return Vec4b().load_bits(mm >> 4u); +} +// Members of Vec4b that refer to Vec2b: +inline Vec4b::Vec4b(Vec2b const x0, Vec2b const x1) { + mm = (uint8_t(x0) & 0x03) | (uint8_t(x1) << 2); +} +inline Vec2b Vec4b::get_low() const { + return Vec2b().load_bits(mm & 3); +} +inline Vec2b Vec4b::get_high() const { + return Vec2b().load_bits(mm >> 2u); +} + +#endif + 
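// ---- Editor's illustrative sketch (not part of the original diff) ----
// Minimal round trip through the compact boolean vectors defined above, guarded the
// same way as the classes themselves; the function name is hypothetical.
#if INSTRSET >= 9
static inline bool compact_mask_example() {
    Vec8b lo, hi;
    lo.load_bits(0xF0u);             // bit i of the argument becomes element i
    hi.load_bits(0x0Fu);
    Vec16b m(lo, hi);                // combine two 8-element masks into one 16-element mask
    m.insert(0, true);               // set element 0
    return m[0] && m.extract(10);    // element 10 comes from bit 2 of 'hi'
}
#endif
// ----------------------------------------------------------------------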
+/***************************************************************************** +* +* Define operators and functions for Vec16b +* +*****************************************************************************/ + +#if INSTRSET >= 9 + +// vector operator & : and +static inline Vec16b operator & (Vec16b a, Vec16b b) { + return _mm512_kand(__mmask16(a), __mmask16(b)); +} +static inline Vec16b operator && (Vec16b a, Vec16b b) { + return a & b; +} + +// vector operator | : or +static inline Vec16b operator | (Vec16b a, Vec16b b) { + return _mm512_kor(__mmask16(a), __mmask16(b)); +} +static inline Vec16b operator || (Vec16b a, Vec16b b) { + return a | b; +} + +// vector operator ^ : xor +static inline Vec16b operator ^ (Vec16b a, Vec16b b) { + return _mm512_kxor(__mmask16(a), __mmask16(b)); +} + +// vector operator == : xnor +static inline Vec16b operator == (Vec16b a, Vec16b b) { + return _mm512_kxnor(__mmask16(a), __mmask16(b)); +} + +// vector operator != : xor +static inline Vec16b operator != (Vec16b a, Vec16b b) { + return a ^ b; +} + +// vector operator ~ : not +static inline Vec16b operator ~ (Vec16b a) { + return _mm512_knot(__mmask16(a)); +} + +// vector operator ! : element not +static inline Vec16b operator ! (Vec16b a) { + return ~a; +} + +// vector operator &= : and +static inline Vec16b & operator &= (Vec16b & a, Vec16b b) { + a = a & b; + return a; +} + +// vector operator |= : or +static inline Vec16b & operator |= (Vec16b & a, Vec16b b) { + a = a | b; + return a; +} + +// vector operator ^= : xor +static inline Vec16b & operator ^= (Vec16b & a, Vec16b b) { + a = a ^ b; + return a; +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec16b const a) { + return __mmask16(a) == 0xFFFF; +} + +// horizontal_or. 
Returns true if at least one element is true +static inline bool horizontal_or(Vec16b const a) { + return __mmask16(a) != 0; +} + +// function andnot: a & ~ b +static inline Vec16b andnot(Vec16b const a, Vec16b const b) { + return _mm512_kandn(b, a); +} + +#endif + + +/***************************************************************************** +* +* Define operators and functions for Vec8b +* +*****************************************************************************/ + +#if INSTRSET >= 9 // compact boolean vectors + +// vector operator & : and +static inline Vec8b operator & (Vec8b a, Vec8b b) { +#if INSTRSET >= 10 // 8-bit mask operations require AVX512DQ + // _kand_mask8(__mmask8(a), __mmask8(b)) // not defined + // must convert result to 8 bit, because bitwise operators promote everything to 32 bit results + return __mmask8(__mmask8(a) & __mmask8(b)); +#else + return _mm512_kand(__mmask16(a), __mmask16(b)); +#endif +} +static inline Vec8b operator && (Vec8b a, Vec8b b) { + return a & b; +} + +// vector operator | : or +static inline Vec8b operator | (Vec8b a, Vec8b b) { +#if INSTRSET >= 10 // 8-bit mask operations require AVX512DQ + return __mmask8(__mmask8(a) | __mmask8(b)); // _kor_mask8(__mmask8(a), __mmask8(b)); +#else + return _mm512_kor(__mmask16(a), __mmask16(b)); +#endif +} +static inline Vec8b operator || (Vec8b a, Vec8b b) { + return a | b; +} + +// vector operator ^ : xor +static inline Vec8b operator ^ (Vec8b a, Vec8b b) { +#if INSTRSET >= 10 // 8-bit mask operations require AVX512DQ + return __mmask8(__mmask8(a) ^ __mmask8(b)); // _kxor_mask8(__mmask8(a), __mmask8(b)); +#else + return _mm512_kxor(__mmask16(a), __mmask16(b)); +#endif +} + +// vector operator == : xnor +static inline Vec8b operator == (Vec8b a, Vec8b b) { +#if INSTRSET >= 10 // 8-bit mask operations require AVX512DQ + return __mmask8(~(__mmask8(a) ^ __mmask8(b))); // _kxnor_mask8(__mmask8(a), __mmask8(b)); +#else + return __mmask16(uint8_t(__mmask8(a) ^ __mmask8(b))); +#endif +} + +// vector operator != : xor +static inline Vec8b operator != (Vec8b a, Vec8b b) { + return a ^ b; +} + +// vector operator ~ : not +static inline Vec8b operator ~ (Vec8b a) { +#if INSTRSET >= 10 // 8-bit mask operations require AVX512DQ + return __mmask8(~__mmask8(a)); //_knot_mask8(__mmask8(a)); +#else + return _mm512_knot(__mmask16(a)); +#endif +} + +// vector operator ! : element not +static inline Vec8b operator ! (Vec8b a) { + return ~a; +} + +// vector operator &= : and +static inline Vec8b & operator &= (Vec8b & a, Vec8b b) { + a = a & b; + return a; +} + +// vector operator |= : or +static inline Vec8b & operator |= (Vec8b & a, Vec8b b) { + a = a | b; + return a; +} + +// vector operator ^= : xor +static inline Vec8b & operator ^= (Vec8b & a, Vec8b b) { + a = a ^ b; + return a; +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec8b const a) { + return uint8_t(Vec8b_masktype(a)) == 0xFFu; +} + +// horizontal_or. 
Returns true if at least one element is true +static inline bool horizontal_or(Vec8b const a) { + return uint8_t(Vec8b_masktype(a)) != 0; +} + +// function andnot: a & ~ b +static inline Vec8b andnot(Vec8b const a, Vec8b const b) { + return Vec8b_masktype(_mm512_kandn(b, a)); +} +#endif + + +/***************************************************************************** +* +* Define operators for Vec4b +* +*****************************************************************************/ + +#if INSTRSET >= 10 // compact boolean vectors + +// vector operator & : and +static inline Vec4b operator & (Vec4b a, Vec4b b) { + return __mmask8(__mmask8(a) & __mmask8(b)); // _kand_mask8(__mmask8(a), __mmask8(b)) // not defined +} +static inline Vec4b operator && (Vec4b a, Vec4b b) { + return a & b; +} + +// vector operator | : or +static inline Vec4b operator | (Vec4b a, Vec4b b) { + return __mmask8(__mmask8(a) | __mmask8(b)); // _kor_mask8(__mmask8(a), __mmask8(b)); +} +static inline Vec4b operator || (Vec4b a, Vec4b b) { + return a | b; +} + +// vector operator ^ : xor +static inline Vec4b operator ^ (Vec4b a, Vec4b b) { + return __mmask8(__mmask8(a) ^ __mmask8(b)); // _kxor_mask8(__mmask8(a), __mmask8(b)); +} + +// vector operator ~ : not +static inline Vec4b operator ~ (Vec4b a) { + return __mmask8(__mmask8(a) ^ 0x0F); +} + +// vector operator == : xnor +static inline Vec4b operator == (Vec4b a, Vec4b b) { + return ~(a ^ b); +} + +// vector operator != : xor +static inline Vec4b operator != (Vec4b a, Vec4b b) { + return a ^ b; +} + +// vector operator ! : element not +static inline Vec4b operator ! (Vec4b a) { + return ~a; +} + +// vector operator &= : and +static inline Vec4b & operator &= (Vec4b & a, Vec4b b) { + a = a & b; + return a; +} + +// vector operator |= : or +static inline Vec4b & operator |= (Vec4b & a, Vec4b b) { + a = a | b; + return a; +} + +// vector operator ^= : xor +static inline Vec4b & operator ^= (Vec4b & a, Vec4b b) { + a = a ^ b; + return a; +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec4b const a) { + return (__mmask8(a) & 0x0F) == 0x0F; +} + +// horizontal_or. 
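
// Illustrative sketch (example helper, not a library function): only the low
// four mask bits of a Vec4b are significant, and operator~ above flips exactly
// those, so "no lane is true" needs nothing beyond horizontal_and.
static inline bool none_set(Vec4b m) {
    return horizontal_and(~m);              // true when all four lanes of m are false
}
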
Returns true if at least one element is true +static inline bool horizontal_or(Vec4b const a) { + return (__mmask8(a) & 0x0F) != 0; +} + +// function andnot: a & ~ b +static inline Vec4b andnot(Vec4b const a, Vec4b const b) { + return __mmask8(andnot(Vec8b(a), Vec8b(b))); +} + + +/***************************************************************************** +* +* Define operators for Vec2b +* +*****************************************************************************/ + +// vector operator & : and +static inline Vec2b operator & (Vec2b a, Vec2b b) { + return __mmask8(__mmask8(a) & __mmask8(b)); // _kand_mask8(__mmask8(a), __mmask8(b)) // not defined +} +static inline Vec2b operator && (Vec2b a, Vec2b b) { + return a & b; +} + +// vector operator | : or +static inline Vec2b operator | (Vec2b a, Vec2b b) { + return __mmask8(__mmask8(a) | __mmask8(b)); // _kor_mask8(__mmask8(a), __mmask8(b)); +} +static inline Vec2b operator || (Vec2b a, Vec2b b) { + return a | b; +} + +// vector operator ^ : xor +static inline Vec2b operator ^ (Vec2b a, Vec2b b) { + return __mmask8(__mmask8(a) ^ __mmask8(b)); // _kxor_mask8(__mmask8(a), __mmask8(b)); +} + +// vector operator ~ : not +static inline Vec2b operator ~ (Vec2b a) { + return __mmask8(__mmask8(a) ^ 0x03); +} + +// vector operator == : xnor +static inline Vec2b operator == (Vec2b a, Vec2b b) { + return ~(a ^ b); +} + +// vector operator != : xor +static inline Vec2b operator != (Vec2b a, Vec2b b) { + return a ^ b; +} + +// vector operator ! : element not +static inline Vec2b operator ! (Vec2b a) { + return ~a; +} + +// vector operator &= : and +static inline Vec2b & operator &= (Vec2b & a, Vec2b b) { + a = a & b; + return a; +} + +// vector operator |= : or +static inline Vec2b & operator |= (Vec2b & a, Vec2b b) { + a = a | b; + return a; +} + +// vector operator ^= : xor +static inline Vec2b & operator ^= (Vec2b & a, Vec2b b) { + a = a ^ b; + return a; +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec2b const a) { + return (__mmask8(a) & 0x03) == 0x03; +} + +// horizontal_or. Returns true if at least one element is true +static inline bool horizontal_or(Vec2b const a) { + return (__mmask8(a) & 0x03) != 0; +} + +// function andnot: a & ~ b +static inline Vec2b andnot(Vec2b const a, Vec2b const b) { + return __mmask8(andnot(Vec8b(a), Vec8b(b))); +} +#endif + +/***************************************************************************** +* +* Vector of 128 bits. Used internally as base class +* +*****************************************************************************/ +class Vec128b { +protected: + __m128i xmm; // Integer vector +public: + // Default constructor: + Vec128b() { + } + // Constructor to convert from type __m128i used in intrinsics: + Vec128b(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec128b & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Type cast operator to convert to __m128i used in intrinsics + operator __m128i() const { + return xmm; + } + // Member function to load from array (unaligned) + Vec128b & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array, aligned by 16 + // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1, + // Merom, Wolfdale, and Atom), but not on other processors from Intel, AMD or VIA. 
+ // You may use load_a instead of load if you are certain that p points to an address + // divisible by 16. + void load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + } + // Member function to store 32-bit integer into array + void store_si32(void * p) const { + *(int32_t*)p = _mm_cvtsi128_si32(xmm); + } + // Member function to store 64-bit integer into array + void storel(void * p) const { + _mm_storel_epi64((__m128i*)p, xmm); + } + // Member function to store into array (unaligned) + void store(void * p) const { + _mm_storeu_si128((__m128i*)p, xmm); + } + // Member function storing into array, aligned by 16 + // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1, + // Merom, Wolfdale, and Atom), but not on other processors from Intel, AMD or VIA. + // You may use store_a instead of store if you are certain that p points to an address + // divisible by 16. + void store_a(void * p) const { + _mm_store_si128((__m128i*)p, xmm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 16 + void store_nt(void * p) const { + _mm_stream_si128((__m128i*)p, xmm); + } + static constexpr int size() { + return 128; + } + static constexpr int elementtype() { + return 1; + } + typedef __m128i registertype; +}; + +// Define operators for this class + +// vector operator & : bitwise and +static inline Vec128b operator & (Vec128b const a, Vec128b const b) { + return _mm_and_si128(a, b); +} +static inline Vec128b operator && (Vec128b const a, Vec128b const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec128b operator | (Vec128b const a, Vec128b const b) { + return _mm_or_si128(a, b); +} +static inline Vec128b operator || (Vec128b const a, Vec128b const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec128b operator ^ (Vec128b const a, Vec128b const b) { + return _mm_xor_si128(a, b); +} + +// vector operator ~ : bitwise not +static inline Vec128b operator ~ (Vec128b const a) { + return _mm_xor_si128(a, _mm_set1_epi32(-1)); +} + +// vector operator &= : bitwise and +static inline Vec128b & operator &= (Vec128b & a, Vec128b const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec128b & operator |= (Vec128b & a, Vec128b const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec128b & operator ^= (Vec128b & a, Vec128b const b) { + a = a ^ b; + return a; +} + +// Define functions for this class + +// function andnot: a & ~ b +static inline Vec128b andnot(Vec128b const a, Vec128b const b) { + return _mm_andnot_si128(b, a); +} + +static inline __m128i zero_si128() { + return _mm_setzero_si128(); +} + + +/***************************************************************************** +* +* selectb function +* +*****************************************************************************/ +// Select between two sources, byte by byte, using broad boolean vector s. +// Used in various functions and operators +// Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed. 
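
// Illustrative sketch (example helper, not a library function): Vec128b as a
// plain 128-bit container. Both source pointers are assumed to address at least
// 16 readable bytes and dst at least 16 writable bytes; load/store are the
// unaligned variants defined above.
static inline void xor_block(void * dst, void const * src1, void const * src2) {
    Vec128b a, b;
    a.load(src1);
    b.load(src2);
    (a ^ b).store(dst);                     // bitwise xor of two 128-bit blocks
}
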
+// The implementation depends on the instruction set: +// If SSE4.1 is supported then only bit 7 in each byte of s is checked, +// otherwise all bits in s are used. +static inline __m128i selectb(__m128i const s, __m128i const a, __m128i const b) { +#if INSTRSET >= 5 // SSE4.1 + return _mm_blendv_epi8(b, a, s); +#else + return _mm_or_si128(_mm_and_si128(s, a), _mm_andnot_si128(s, b)); +#endif +} + + +/***************************************************************************** +* +* Horizontal Boolean functions +* +*****************************************************************************/ + +static inline bool horizontal_and(Vec128b const a) { +#if INSTRSET >= 5 // SSE4.1. Use PTEST + return _mm_testc_si128(a, _mm_set1_epi32(-1)) != 0; +#else + __m128i t1 = _mm_unpackhi_epi64(a, a); // get 64 bits down + __m128i t2 = _mm_and_si128(a, t1); // and 64 bits +#ifdef __x86_64__ + int64_t t5 = _mm_cvtsi128_si64(t2); // transfer 64 bits to integer + return t5 == int64_t(-1); +#else + __m128i t3 = _mm_srli_epi64(t2, 32); // get 32 bits down + __m128i t4 = _mm_and_si128(t2, t3); // and 32 bits + int t5 = _mm_cvtsi128_si32(t4); // transfer 32 bits to integer + return t5 == -1; +#endif // __x86_64__ +#endif // INSTRSET +} + +// horizontal_or. Returns true if at least one bit is 1 +static inline bool horizontal_or(Vec128b const a) { +#if INSTRSET >= 5 // SSE4.1. Use PTEST + return !_mm_testz_si128(a, a); +#else + __m128i t1 = _mm_unpackhi_epi64(a, a); // get 64 bits down + __m128i t2 = _mm_or_si128(a, t1); // and 64 bits +#ifdef __x86_64__ + int64_t t5 = _mm_cvtsi128_si64(t2); // transfer 64 bits to integer + return t5 != int64_t(0); +#else + __m128i t3 = _mm_srli_epi64(t2, 32); // get 32 bits down + __m128i t4 = _mm_or_si128(t2, t3); // and 32 bits + int t5 = _mm_cvtsi128_si32(t4); // transfer to integer + return t5 != 0; +#endif // __x86_64__ +#endif // INSTRSET +} + + +/***************************************************************************** +* +* Vector of 16 8-bit signed integers +* +*****************************************************************************/ + +class Vec16c : public Vec128b { +public: + // Default constructor: + Vec16c() { + } + // Constructor to broadcast the same value into all elements: + Vec16c(int i) { + xmm = _mm_set1_epi8((char)i); + } + // Constructor to build from all elements: + Vec16c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, + int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15) { + xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec16c(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec16c & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Type cast operator to convert to __m128i used in intrinsics + operator __m128i() const { + return xmm; + } + // Member function to load 64-bit integer from array + Vec16c & loadl(void const * p) { + xmm = _mm_loadl_epi64((__m128i const*)p); + return *this; + } + // Member function to load from array (unaligned) + Vec16c & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec16c & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Partial load. 
Load n elements and set the rest to 0 + Vec16c & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + xmm = _mm_maskz_loadu_epi8(__mmask16((1u << n) - 1), p); +#else + if (n >= 16) load(p); + else if (n <= 0) * this = 0; + else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) { + // p is at least 16 bytes from a page boundary. OK to read 16 bytes + load(p); + } + else { + // worst case. read 1 byte at a time and suffer store forwarding penalty + char x[16]; + for (int i = 0; i < n; i++) x[i] = ((char const *)p)[i]; + load(x); + } + cutoff(n); +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + _mm_mask_storeu_epi8(p, __mmask16((1u << n) - 1), xmm); +#else + if (n >= 16) { + store(p); + return; + } + if (n <= 0) return; + // we are not using _mm_maskmoveu_si128 because it is too slow on many processors + union { + int8_t c[16]; + int16_t s[8]; + int32_t i[4]; + int64_t q[2]; + } u; + store(u.c); + int j = 0; + if (n & 8) { + *(int64_t*)p = u.q[0]; + j += 8; + } + if (n & 4) { + ((int32_t*)p)[j / 4] = u.i[j / 4]; + j += 4; + } + if (n & 2) { + ((int16_t*)p)[j / 2] = u.s[j / 2]; + j += 2; + } + if (n & 1) { + ((int8_t*)p)[j] = u.c[j]; + } +#endif + } + + // cut off vector to n elements. The last 16-n elements are set to zero + Vec16c & cutoff(int n) { +#if INSTRSET >= 10 + xmm = _mm_maskz_mov_epi8(__mmask16((1u << n) - 1), xmm); +#else + if (uint32_t(n) >= 16) return *this; + const char mask[32] = { -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + *this &= Vec16c().load(mask + 16 - n); +#endif + return *this; + } + // Member function to change a single element in vector + Vec16c const insert(int index, int8_t value) { +#if INSTRSET >= 10 + xmm = _mm_mask_set1_epi8(xmm, __mmask16(1u << index), value); +#else + const int8_t maskl[32] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + -1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + __m128i broad = _mm_set1_epi8(value); // broadcast value into all elements + __m128i mask = _mm_loadu_si128((__m128i const*)(maskl + 16 - (index & 0x0F))); // mask with FF at index position + xmm = selectb(mask, broad, xmm); +#endif + return *this; + } + /* Note: The extract(), insert(), size(), [], etc. all use int index for consistency. + An unsigned type for index might cause problems in case of underflow, for example: + for (i = 0; i < a.size() - 4; i++) a[i] = ... + This would go nuts if a.size() is 2. + */ + + // Member function extract a single element from vector + int8_t extract(int index) const { +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + __m128i x = _mm_maskz_compress_epi8(__mmask16(1u << index), xmm); + return (int8_t)_mm_cvtsi128_si32(x); +#else + int8_t x[16]; + store(x); + return x[index & 0x0F]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ int8_t operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 16; + } + static constexpr int elementtype() { + return 4; + } +}; + + +/***************************************************************************** +* +* Vec16cb: Vector of 16 Booleans for use with Vec16c and Vec16uc +* +*****************************************************************************/ +#if INSTRSET < 10 // broad boolean vectors +class Vec16cb : public Vec16c { +public: + // Default constructor + Vec16cb() {} + // Constructor to build from all elements: + Vec16cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, + bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) { + xmm = Vec16c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), + -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15)); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec16cb(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec16cb & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec16cb(bool b) : Vec16c(-int8_t(b)) { + } + // Assignment operator to broadcast scalar value: + Vec16cb & operator = (bool b) { + *this = Vec16cb(b); + return *this; + } + // Member function to change a single element in vector + Vec16cb & insert(int index, bool a) { + Vec16c::insert(index, -(int)a); + return *this; + } + // Member function to change a bitfield to a boolean vector + Vec16cb & load_bits(uint16_t a) { + uint16_t an = uint16_t(~a); // invert because we have no compare-not-equal +#if INSTRSET >= 4 // SSSE3 (PSHUFB available under SSSE3) + __m128i a1 = _mm_cvtsi32_si128(an); // load into xmm register + __m128i dist = constant4ui<0, 0, 0x01010101, 0x01010101>(); + __m128i a2 = _mm_shuffle_epi8(a1, dist); // one byte of a in each element + __m128i mask = constant4ui<0x08040201, 0x80402010, 0x08040201, 0x80402010>(); + __m128i a3 = _mm_and_si128(a2, mask); // isolate one bit in each byte +#else + __m128i b1 = _mm_set1_epi8((int8_t)an); // broadcast low byte + __m128i b2 = _mm_set1_epi8((int8_t)(an >> 8)); // broadcast high byte + __m128i m1 = constant4ui<0x08040201, 0x80402010, 0, 0>(); + __m128i m2 = constant4ui<0, 0, 0x08040201, 0x80402010>(); + __m128i c1 = _mm_and_si128(b1, m1); // isolate one bit in each byte of lower half + __m128i c2 = _mm_and_si128(b2, m2); // isolate one bit in each byte of upper half + __m128i a3 = _mm_or_si128(c1, c2); +#endif + xmm = _mm_cmpeq_epi8(a3, _mm_setzero_si128()); // compare with 0 + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec16c::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
+ Vec16cb(int b) = delete; + Vec16cb & operator = (int x) = delete; +}; + +#else +typedef Vec16b Vec16cb; // compact boolean vector +#endif // broad boolean vectors + + +/***************************************************************************** +* +* Define operators for Vec16cb +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec16cb operator & (Vec16cb const a, Vec16cb const b) { + return Vec16cb(Vec128b(a) & Vec128b(b)); +} +static inline Vec16cb operator && (Vec16cb const a, Vec16cb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec16cb & operator &= (Vec16cb & a, Vec16cb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec16cb operator | (Vec16cb const a, Vec16cb const b) { + return Vec16cb(Vec128b(a) | Vec128b(b)); +} +static inline Vec16cb operator || (Vec16cb const a, Vec16cb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec16cb & operator |= (Vec16cb & a, Vec16cb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16cb operator ^ (Vec16cb const a, Vec16cb const b) { + return Vec16cb(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec16cb & operator ^= (Vec16cb & a, Vec16cb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec16cb operator == (Vec16cb const a, Vec16cb const b) { + return Vec16cb(a ^ (~b)); +} + +// vector operator != : xor +static inline Vec16cb operator != (Vec16cb const a, Vec16cb const b) { + return Vec16cb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec16cb operator ~ (Vec16cb const a) { + return Vec16cb(~Vec128b(a)); +} + +// vector operator ! : element not +static inline Vec16cb operator ! (Vec16cb const a) { + return ~a; +} + +// vector function andnot +static inline Vec16cb andnot(Vec16cb const a, Vec16cb const b) { + return Vec16cb(andnot(Vec128b(a), Vec128b(b))); +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec16cb const a) { + return _mm_movemask_epi8(a) == 0xFFFF; +} + +// horizontal_or. Returns true if at least one element is true +static inline bool horizontal_or(Vec16cb const a) { +#if INSTRSET >= 5 // SSE4.1. 
Use PTEST + return !_mm_testz_si128(a, a); +#else + return _mm_movemask_epi8(a) != 0; +#endif +} +#endif // broad boolean vectors + + +/***************************************************************************** +* +* Define operators for Vec16c +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec16c operator + (Vec16c const a, Vec16c const b) { + return _mm_add_epi8(a, b); +} +// vector operator += : add +static inline Vec16c & operator += (Vec16c & a, Vec16c const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec16c operator ++ (Vec16c & a, int) { + Vec16c a0 = a; + a = a + 1; + return a0; +} + +// prefix operator ++ +static inline Vec16c & operator ++ (Vec16c & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec16c operator - (Vec16c const a, Vec16c const b) { + return _mm_sub_epi8(a, b); +} +// vector operator - : unary minus +static inline Vec16c operator - (Vec16c const a) { + return _mm_sub_epi8(_mm_setzero_si128(), a); +} +// vector operator -= : add +static inline Vec16c & operator -= (Vec16c & a, Vec16c const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec16c operator -- (Vec16c & a, int) { + Vec16c a0 = a; + a = a - 1; + return a0; +} + +// prefix operator -- +static inline Vec16c & operator -- (Vec16c & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec16c operator * (Vec16c const a, Vec16c const b) { + // There is no 8-bit multiply in SSE2. Split into two 16-bit multiplies + __m128i aodd = _mm_srli_epi16(a, 8); // odd numbered elements of a + __m128i bodd = _mm_srli_epi16(b, 8); // odd numbered elements of b + __m128i muleven = _mm_mullo_epi16(a, b); // product of even numbered elements + __m128i mulodd = _mm_mullo_epi16(aodd, bodd);// product of odd numbered elements + mulodd = _mm_slli_epi16(mulodd, 8); // put odd numbered elements back in place +#if INSTRSET >= 10 // AVX512VL + AVX512BW + return _mm_mask_mov_epi8(mulodd, 0x5555, muleven); +#else + __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for even positions + return selectb(mask, muleven, mulodd); // interleave even and odd +#endif +} + +// vector operator *= : multiply +static inline Vec16c & operator *= (Vec16c & a, Vec16c const b) { + a = a * b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec16c operator << (Vec16c const a, int b) { + uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out + __m128i am = _mm_and_si128(a, _mm_set1_epi8((char)mask));// remove bits that will overflow + __m128i res = _mm_sll_epi16(am, _mm_cvtsi32_si128(b));// 16-bit shifts + return res; +} +// vector operator <<= : shift left +static inline Vec16c & operator <<= (Vec16c & a, int b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic all elements +static inline Vec16c operator >> (Vec16c const a, int b) { + __m128i aeven = _mm_slli_epi16(a, 8); // even numbered elements of a. 
get sign bit in position + aeven = _mm_sra_epi16(aeven, _mm_cvtsi32_si128(b + 8));// shift arithmetic, back to position + __m128i aodd = _mm_sra_epi16(a, _mm_cvtsi32_si128(b)); // shift odd numbered elements arithmetic +#if INSTRSET >= 10 // AVX512VL + AVX512BW + return _mm_mask_mov_epi8(aodd, 0x5555, aeven); +#else + __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for even positions + __m128i res = selectb(mask, aeven, aodd); // interleave even and odd + return res; +#endif +} +// vector operator >>= : shift right arithmetic +static inline Vec16c & operator >>= (Vec16c & a, int b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec16cb operator == (Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi8_mask(a, b, 0); +#else + return _mm_cmpeq_epi8(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec16cb operator != (Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi8_mask(a, b, 4); +#else + return Vec16cb(Vec16c(~(a == b))); +#endif +} + +// vector operator > : returns true for elements for which a > b (signed) +static inline Vec16cb operator > (Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi8_mask(a, b, 6); +#else + return _mm_cmpgt_epi8(a, b); +#endif +} + +// vector operator < : returns true for elements for which a < b (signed) +static inline Vec16cb operator < (Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi8_mask(a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec16cb operator >= (Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi8_mask(a, b, 5); +#else + return Vec16cb(Vec16c(~(b > a))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec16cb operator <= (Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi8_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec16c operator & (Vec16c const a, Vec16c const b) { + return Vec16c(Vec128b(a) & Vec128b(b)); +} +static inline Vec16c operator && (Vec16c const a, Vec16c const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec16c & operator &= (Vec16c & a, Vec16c const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec16c operator | (Vec16c const a, Vec16c const b) { + return Vec16c(Vec128b(a) | Vec128b(b)); +} +static inline Vec16c operator || (Vec16c const a, Vec16c const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec16c & operator |= (Vec16c & a, Vec16c const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16c operator ^ (Vec16c const a, Vec16c const b) { + return Vec16c(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec16c & operator ^= (Vec16c & a, Vec16c const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec16c operator ~ (Vec16c const a) { + return Vec16c(~Vec128b(a)); +} + +// vector operator ! : logical not, returns true for elements == 0 +static inline Vec16cb operator ! 
(Vec16c const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi8_mask(a, _mm_setzero_si128(), 0); +#else + return _mm_cmpeq_epi8(a, _mm_setzero_si128()); +#endif +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. +static inline Vec16c select(Vec16cb const s, Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi8(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec16c if_add(Vec16cb const f, Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi8(a, f, a, b); +#else + return a + (Vec16c(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? (a[i] - b[i]) : a[i] +static inline Vec16c if_sub(Vec16cb const f, Vec16c const a, Vec16c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi8(a, f, a, b); +#else + return a - (Vec16c(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec16c if_mul(Vec16cb const f, Vec16c const a, Vec16c const b) { + return select(f, a * b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int32_t horizontal_add(Vec16c const a) { + __m128i sum1 = _mm_sad_epu8(a, _mm_setzero_si128()); + __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1); + __m128i sum3 = _mm_add_epi16(sum1, sum2); + int8_t sum4 = (int8_t)_mm_cvtsi128_si32(sum3); // truncate to 8 bits + return sum4; // sign extend to 32 bits +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Each element is sign-extended before addition to avoid overflow +static inline int32_t horizontal_add_x(Vec16c const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epi8(a); + __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E); // high element + __m128i sum3 = _mm_add_epi32(sum1, sum2); // sum + return _mm_cvtsi128_si32(sum3); +#else + __m128i aeven = _mm_slli_epi16(a, 8); // even numbered elements of a. 
get sign bit in position + aeven = _mm_srai_epi16(aeven, 8); // sign extend even numbered elements + __m128i aodd = _mm_srai_epi16(a, 8); // sign extend odd numbered elements + __m128i sum1 = _mm_add_epi16(aeven, aodd); // add even and odd elements + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding +#if INSTRSET >= 4 && false // SSSE3 + __m128i sum2 = _mm_hadd_epi16(sum1, sum1); + __m128i sum3 = _mm_hadd_epi16(sum2, sum2); + __m128i sum4 = _mm_hadd_epi16(sum3, sum3); +#else + __m128i sum2 = _mm_add_epi16(sum1, _mm_unpackhi_epi64(sum1, sum1)); + __m128i sum3 = _mm_add_epi16(sum2, _mm_shuffle_epi32(sum2, 1)); + __m128i sum4 = _mm_add_epi16(sum3, _mm_shufflelo_epi16(sum3, 1)); +#endif + int16_t sum5 = (int16_t)_mm_cvtsi128_si32(sum4); // 16 bit sum + return sum5; // sign extend to 32 bits +#endif +} + + +// function add_saturated: add element by element, signed with saturation +static inline Vec16c add_saturated(Vec16c const a, Vec16c const b) { + return _mm_adds_epi8(a, b); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec16c sub_saturated(Vec16c const a, Vec16c const b) { + return _mm_subs_epi8(a, b); +} + +// function max: a > b ? a : b +static inline Vec16c max(Vec16c const a, Vec16c const b) { +#if INSTRSET >= 5 // SSE4.1 + return _mm_max_epi8(a, b); +#else // SSE2 + __m128i signbit = _mm_set1_epi32(0x80808080); + __m128i a1 = _mm_xor_si128(a, signbit); // add 0x80 + __m128i b1 = _mm_xor_si128(b, signbit); // add 0x80 + __m128i m1 = _mm_max_epu8(a1, b1); // unsigned max + return _mm_xor_si128(m1, signbit); // sub 0x80 +#endif +} + +// function min: a < b ? a : b +static inline Vec16c min(Vec16c const a, Vec16c const b) { +#if INSTRSET >= 5 // SSE4.1 + return _mm_min_epi8(a, b); +#else // SSE2 + __m128i signbit = _mm_set1_epi32(0x80808080); + __m128i a1 = _mm_xor_si128(a, signbit); // add 0x80 + __m128i b1 = _mm_xor_si128(b, signbit); // add 0x80 + __m128i m1 = _mm_min_epu8(a1, b1); // unsigned min + return _mm_xor_si128(m1, signbit); // sub 0x80 +#endif +} + +// function abs: a >= 0 ? 
a : -a +static inline Vec16c abs(Vec16c const a) { +#if INSTRSET >= 4 // SSSE3 supported + return _mm_abs_epi8(a); +#else // SSE2 + __m128i nega = _mm_sub_epi8(_mm_setzero_si128(), a); + return _mm_min_epu8(a, nega); // unsigned min (the negative value is bigger when compared as unsigned) +#endif +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec16c abs_saturated(Vec16c const a) { + __m128i absa = abs(a); // abs(a) +#if INSTRSET >= 10 + return _mm_min_epu8(absa, Vec16c(0x7F)); +#else + __m128i overfl = _mm_cmpgt_epi8(_mm_setzero_si128(), absa);// 0 > a + return _mm_add_epi8(absa, overfl); // subtract 1 if 0x80 +#endif +} + +// function rotate_left: rotate each element left by b bits +// Use negative count to rotate right +static inline Vec16c rotate_left(Vec16c const a, int b) { +#ifdef __XOP__ // AMD XOP instruction set + return (Vec16c)_mm_rot_epi8(a, _mm_set1_epi8(b)); +#else // SSE2 instruction set + uint8_t mask = 0xFFu << b; // mask off overflow bits + __m128i m = _mm_set1_epi8(mask); + __m128i bb = _mm_cvtsi32_si128(b & 7); // b modulo 8 + __m128i mbb = _mm_cvtsi32_si128((-b) & 7); // 8-b modulo 8 + __m128i left = _mm_sll_epi16(a, bb); // a << b + __m128i right = _mm_srl_epi16(a, mbb); // a >> 8-b + left = _mm_and_si128(m, left); // mask off overflow bits + right = _mm_andnot_si128(m, right); + return _mm_or_si128(left, right); // combine left and right shifted bits +#endif +} + + +/***************************************************************************** +* +* Vector of 16 8-bit unsigned integers +* +*****************************************************************************/ + +class Vec16uc : public Vec16c { +public: + // Default constructor: + Vec16uc() { + } + // Constructor to broadcast the same value into all elements: + Vec16uc(uint32_t i) { + xmm = _mm_set1_epi8((char)i); + } + // Constructor to build from all elements: + Vec16uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, + uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15) { + xmm = _mm_setr_epi8((int8_t)i0, (int8_t)i1, (int8_t)i2, (int8_t)i3, (int8_t)i4, (int8_t)i5, (int8_t)i6, + (int8_t)i7, (int8_t)i8, (int8_t)i9, (int8_t)i10, (int8_t)i11, (int8_t)i12, (int8_t)i13, (int8_t)i14, (int8_t)i15); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec16uc(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec16uc & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Member function to load 64-bit integer from array + Vec16uc & loadl(void const * p) { + xmm = _mm_loadl_epi64((__m128i const*)p); + return *this; + } + // Member function to load from array (unaligned) + Vec16uc & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec16uc & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Member function to change a single element in vector + Vec16uc const insert(int index, uint8_t value) { + Vec16c::insert(index, (int8_t)value); + return *this; + } + // Member function extract a single element from vector + uint8_t extract(int index) const { + return uint8_t(Vec16c::extract(index)); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ uint8_t operator [] (int index) const { + return extract(index); + } + static constexpr int elementtype() { + return 5; + } +}; + +// Define operators for this class + +// vector operator << : shift left all elements +static inline Vec16uc operator << (Vec16uc const a, uint32_t b) { + uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out + __m128i am = _mm_and_si128(a, _mm_set1_epi8((char)mask)); // remove bits that will overflow + __m128i res = _mm_sll_epi16(am, _mm_cvtsi32_si128((int)b)); // 16-bit shifts + return res; +} + +// vector operator << : shift left all elements +static inline Vec16uc operator << (Vec16uc const a, int32_t b) { + return a << (uint32_t)b; +} + +// vector operator >> : shift right logical all elements +static inline Vec16uc operator >> (Vec16uc const a, uint32_t b) { + uint32_t mask = (uint32_t)0xFF << (uint32_t)b; // mask to remove bits that are shifted out + __m128i am = _mm_and_si128(a, _mm_set1_epi8((char)mask)); // remove bits that will overflow + __m128i res = _mm_srl_epi16(am, _mm_cvtsi32_si128((int)b)); // 16-bit shifts + return res; +} + +// vector operator >> : shift right logical all elements +static inline Vec16uc operator >> (Vec16uc const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right logical +static inline Vec16uc & operator >>= (Vec16uc & a, int b) { + a = a >> b; + return a; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec16cb operator >= (Vec16uc const a, Vec16uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu8_mask(a, b, 5); +#else + return (Vec16cb)_mm_cmpeq_epi8(_mm_max_epu8(a, b), a); // a == max(a,b) +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec16cb operator <= (Vec16uc const a, Vec16uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu8_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec16cb operator > (Vec16uc const a, Vec16uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu8_mask(a, b, 6); +#else + return Vec16cb(Vec16c(~(b >= a))); +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec16cb operator < (Vec16uc const a, Vec16uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu8_mask(a, b, 1); +#else + return b > a; +#endif +} + +// vector operator + : add +static inline Vec16uc operator + (Vec16uc const a, Vec16uc const b) { + return Vec16uc(Vec16c(a) + Vec16c(b)); +} + +// vector operator - : subtract +static inline Vec16uc operator - (Vec16uc const a, Vec16uc const b) { + return Vec16uc(Vec16c(a) - Vec16c(b)); +} + +// vector operator * : multiply +static inline Vec16uc operator * (Vec16uc const a, Vec16uc const b) { + return Vec16uc(Vec16c(a) * Vec16c(b)); +} + +// vector operator & : bitwise and +static inline Vec16uc operator & (Vec16uc const a, Vec16uc const b) { + return Vec16uc(Vec128b(a) & Vec128b(b)); +} +static inline Vec16uc operator && (Vec16uc const a, Vec16uc const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec16uc operator | (Vec16uc const a, Vec16uc const b) { + return Vec16uc(Vec128b(a) | Vec128b(b)); +} +static inline Vec16uc operator || (Vec16uc const a, Vec16uc const b) { + return a | b; +} + +// vector operator ^ : bitwise xor 
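
// Illustrative sketch (example helper, not a library function): unsigned-byte
// thresholding with the Vec16uc comparisons above. thr is broadcast to all 16
// lanes; p is assumed to address at least 16 readable bytes.
static inline Vec16cb above_threshold(uint8_t const * p, uint8_t thr) {
    Vec16uc v;
    v.load(p);                              // unaligned load of 16 unsigned bytes
    return v > Vec16uc(thr);                // per-element unsigned compare
}
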
+static inline Vec16uc operator ^ (Vec16uc const a, Vec16uc const b) { + return Vec16uc(Vec128b(a) ^ Vec128b(b)); +} + +// vector operator ~ : bitwise not +static inline Vec16uc operator ~ (Vec16uc const a) { + return Vec16uc(~Vec128b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16uc select(Vec16cb const s, Vec16uc const a, Vec16uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi8(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec16uc if_add(Vec16cb const f, Vec16uc const a, Vec16uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi8(a, f, a, b); +#else + return a + (Vec16uc(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? (a[i] - b[i]) : a[i] +static inline Vec16uc if_sub(Vec16cb const f, Vec16uc const a, Vec16uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi8(a, f, a, b); +#else + return a - (Vec16uc(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec16uc if_mul(Vec16cb const f, Vec16uc const a, Vec16uc const b) { + return select(f, a * b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +// (Note: horizontal_add_x(Vec16uc) is slightly faster) +static inline uint32_t horizontal_add(Vec16uc const a) { + __m128i sum1 = _mm_sad_epu8(a, _mm_setzero_si128()); + __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1); + __m128i sum3 = _mm_add_epi16(sum1, sum2); + uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3); // truncate to 16 bits + return sum4; +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Each element is zero-extended before addition to avoid overflow +static inline uint32_t horizontal_add_x(Vec16uc const a) { + __m128i sum1 = _mm_sad_epu8(a, _mm_setzero_si128()); + __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1); + __m128i sum3 = _mm_add_epi16(sum1, sum2); + return (uint32_t)_mm_cvtsi128_si32(sum3); +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec16uc add_saturated(Vec16uc const a, Vec16uc const b) { + return _mm_adds_epu8(a, b); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec16uc sub_saturated(Vec16uc const a, Vec16uc const b) { + return _mm_subs_epu8(a, b); +} + +// function max: a > b ? a : b +static inline Vec16uc max(Vec16uc const a, Vec16uc const b) { + return _mm_max_epu8(a, b); +} + +// function min: a < b ? 
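
// Illustrative sketch (example helper, not a library function): a 16-byte sum
// of absolute differences built from the saturating subtraction and the
// zero-extending horizontal add above. Both pointers are assumed to address at
// least 16 readable bytes.
static inline uint32_t sad16(uint8_t const * p, uint8_t const * q) {
    Vec16uc a, b;
    a.load(p);
    b.load(q);
    Vec16uc d = sub_saturated(a, b) | sub_saturated(b, a);  // |a - b| per element
    return horizontal_add_x(d);                             // sum of all 16 lanes
}
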
a : b +static inline Vec16uc min(Vec16uc const a, Vec16uc const b) { + return _mm_min_epu8(a, b); +} + + + +/***************************************************************************** +* +* Vector of 8 16-bit signed integers +* +*****************************************************************************/ + +class Vec8s : public Vec128b { +public: + // Default constructor: + Vec8s() { + } + // Constructor to broadcast the same value into all elements: + Vec8s(int i) { + xmm = _mm_set1_epi16((int16_t)i); + } + // Constructor to build from all elements: + Vec8s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7) { + xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec8s(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec8s & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Type cast operator to convert to __m128i used in intrinsics + operator __m128i() const { + return xmm; + } + // Member function to load 64-bit integer from array + Vec8s & loadl(void const * p) { + xmm = _mm_loadl_epi64((__m128i const*)p); + return *this; + } + // Member function to load from array (unaligned) + Vec8s & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec8s & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Member function to load 8 unsigned 8-bit integers from array + Vec8s & load_8uc(void const * p) { +#if INSTRSET >= 5 // SSE4.1 + xmm = _mm_cvtepu8_epi16(Vec16uc().loadl(p)); +#else + xmm = _mm_unpacklo_epi8(Vec16uc().loadl(p), _mm_setzero_si128()); +#endif + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec8s & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + xmm = _mm_maskz_loadu_epi16(__mmask8((1u << n) - 1), p); +#else + if (n >= 8) load(p); + else if (n <= 0) * this = 0; + else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) { + // p is at least 16 bytes from a page boundary. OK to read 16 bytes + load(p); + } + else { + // worst case. read 1 byte at a time and suffer store forwarding penalty + int16_t x[8]; + for (int i = 0; i < n; i++) x[i] = ((int16_t const *)p)[i]; + load(x); + } + cutoff(n); +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + _mm_mask_storeu_epi16(p, __mmask8((1u << n) - 1), xmm); +#else + if (n >= 8) { + store(p); + return; + } + if (n <= 0) return; + // we are not using _mm_maskmoveu_si128 because it is too slow on many processors + union { + int8_t c[16]; + int16_t s[8]; + int32_t i[4]; + int64_t q[2]; + } u; + store(u.c); + int j = 0; + if (n & 4) { + *(int64_t*)p = u.q[0]; + j += 8; + } + if (n & 2) { + ((int32_t*)p)[j / 4] = u.i[j / 4]; + j += 4; + } + if (n & 1) { + ((int16_t*)p)[j / 2] = u.s[j / 2]; + } +#endif + } + + // cut off vector to n elements. 
The last 8-n elements are set to zero + Vec8s & cutoff(int n) { +#if INSTRSET >= 10 + xmm = _mm_maskz_mov_epi16(__mmask8((1u << n) - 1), xmm); +#else + *this = Vec16c(xmm).cutoff(n * 2); +#endif + return *this; + } + // Member function to change a single element in vector + Vec8s const insert(int index, int16_t value) { +#if INSTRSET >= 10 + xmm = _mm_mask_set1_epi16(xmm, __mmask8(1u << index), value); +#else + switch (index) { + case 0: + xmm = _mm_insert_epi16(xmm, value, 0); break; + case 1: + xmm = _mm_insert_epi16(xmm, value, 1); break; + case 2: + xmm = _mm_insert_epi16(xmm, value, 2); break; + case 3: + xmm = _mm_insert_epi16(xmm, value, 3); break; + case 4: + xmm = _mm_insert_epi16(xmm, value, 4); break; + case 5: + xmm = _mm_insert_epi16(xmm, value, 5); break; + case 6: + xmm = _mm_insert_epi16(xmm, value, 6); break; + case 7: + xmm = _mm_insert_epi16(xmm, value, 7); break; + } +#endif + return *this; + } + // Member function extract a single element from vector + int16_t extract(int index) const { +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + __m128i x = _mm_maskz_compress_epi16(__mmask8(1u << index), xmm); + return (int16_t)_mm_cvtsi128_si32(x); +#else + switch (index) { + case 0: + return (int16_t)_mm_extract_epi16(xmm, 0); + case 1: + return (int16_t)_mm_extract_epi16(xmm, 1); + case 2: + return (int16_t)_mm_extract_epi16(xmm, 2); + case 3: + return (int16_t)_mm_extract_epi16(xmm, 3); + case 4: + return (int16_t)_mm_extract_epi16(xmm, 4); + case 5: + return (int16_t)_mm_extract_epi16(xmm, 5); + case 6: + return (int16_t)_mm_extract_epi16(xmm, 6); + case 7: + return (int16_t)_mm_extract_epi16(xmm, 7); + } + return 0; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int16_t operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 6; + } +}; + +/***************************************************************************** +* +* Vec8sb: Vector of 8 Booleans for use with Vec8s and Vec8us +* +*****************************************************************************/ +#if INSTRSET < 10 // broad boolean vectors + +class Vec8sb : public Vec8s { +public: + // Constructor to build from all elements: + Vec8sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) { + xmm = Vec8s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7)); + } + // Default constructor: + Vec8sb() { + } + // Constructor to convert from type __m128i used in intrinsics: + Vec8sb(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec8sb & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec8sb(bool b) : Vec8s(-int16_t(b)) { + } + // Assignment operator to broadcast scalar value: + Vec8sb & operator = (bool b) { + *this = Vec8sb(b); + return *this; + } + Vec8sb & insert(int index, bool a) { + Vec8s::insert(index, -(int16_t)a); + return *this; + } + // Member function extract a single element from vector + // Note: This function is inefficient. Use store function if extracting more than one element + bool extract(int index) const { + return Vec8s::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec8sb & load_bits(uint8_t a) { + __m128i b1 = _mm_set1_epi8((int8_t)a); // broadcast byte. Invert because we have no compare-not-equal + __m128i m1 = constant4ui<0x00020001, 0x00080004, 0x00200010, 0x00800040>(); + __m128i c1 = _mm_and_si128(b1, m1); // isolate one bit in each byte + xmm = _mm_cmpgt_epi16(c1, _mm_setzero_si128()); // compare with 0 + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec8sb(int b) = delete; + Vec8sb & operator = (int x) = delete; +}; +#else +typedef Vec8b Vec8sb; +#endif + + +/***************************************************************************** +* +* Define operators for Vec8sb +* +*****************************************************************************/ +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec8sb operator & (Vec8sb const a, Vec8sb const b) { + return Vec8sb(Vec128b(a) & Vec128b(b)); +} +static inline Vec8sb operator && (Vec8sb const a, Vec8sb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec8sb & operator &= (Vec8sb & a, Vec8sb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec8sb operator | (Vec8sb const a, Vec8sb const b) { + return Vec8sb(Vec128b(a) | Vec128b(b)); +} +static inline Vec8sb operator || (Vec8sb const a, Vec8sb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec8sb & operator |= (Vec8sb & a, Vec8sb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8sb operator ^ (Vec8sb const a, Vec8sb const b) { + return Vec8sb(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec8sb & operator ^= (Vec8sb & a, Vec8sb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec8sb operator == (Vec8sb const a, Vec8sb const b) { + return Vec8sb(a ^ (~b)); +} + +// vector operator != : xor +static inline Vec8sb operator != (Vec8sb const a, Vec8sb const b) { + return Vec8sb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec8sb operator ~ (Vec8sb const a) { + return Vec8sb(~Vec128b(a)); +} + +// vector operator ! : element not +static inline Vec8sb operator ! (Vec8sb const a) { + return ~a; +} + +// vector function andnot +static inline Vec8sb andnot(Vec8sb const a, Vec8sb const b) { + return Vec8sb(andnot(Vec128b(a), Vec128b(b))); +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec8sb const a) { + return _mm_movemask_epi8(a) == 0xFFFF; +} + +// horizontal_or. Returns true if at least one element is true +static inline bool horizontal_or(Vec8sb const a) { +#if INSTRSET >= 5 // SSE4.1. 
Use PTEST + return !_mm_testz_si128(a, a); +#else + return _mm_movemask_epi8(a) != 0; +#endif +} +#endif // broad boolean vectors + + +/***************************************************************************** +* +* operators for Vec8s +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec8s operator + (Vec8s const a, Vec8s const b) { + return _mm_add_epi16(a, b); +} +// vector operator += : add +static inline Vec8s & operator += (Vec8s & a, Vec8s const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec8s operator ++ (Vec8s & a, int) { + Vec8s a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec8s & operator ++ (Vec8s & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8s operator - (Vec8s const a, Vec8s const b) { + return _mm_sub_epi16(a, b); +} +// vector operator - : unary minus +static inline Vec8s operator - (Vec8s const a) { + return _mm_sub_epi16(_mm_setzero_si128(), a); +} +// vector operator -= : subtract +static inline Vec8s & operator -= (Vec8s & a, Vec8s const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec8s operator -- (Vec8s & a, int) { + Vec8s a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec8s & operator -- (Vec8s & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8s operator * (Vec8s const a, Vec8s const b) { + return _mm_mullo_epi16(a, b); +} + +// vector operator *= : multiply +static inline Vec8s & operator *= (Vec8s & a, Vec8s const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer. 
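
// Illustrative sketch (example helper, not a library function): load_8uc widens
// 8 unsigned bytes to 16-bit lanes so that byte data can be processed with the
// Vec8s arithmetic above. dst is assumed to hold at least 8 int16_t values and
// src at least 8 readable bytes.
static inline void scale_bytes(int16_t * dst, uint8_t const * src, int16_t gain) {
    Vec8s v;
    v.load_8uc(src);                        // zero-extend 8 uint8_t values to int16_t
    (v * Vec8s(gain)).store(dst);           // 16-bit product per lane, unaligned store
}
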
See bottom of file + +// vector operator << : shift left +static inline Vec8s operator << (Vec8s const a, int b) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128(b)); +} + +// vector operator <<= : shift left +static inline Vec8s & operator <<= (Vec8s & a, int b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec8s operator >> (Vec8s const a, int b) { + return _mm_sra_epi16(a, _mm_cvtsi32_si128(b)); +} + +// vector operator >>= : shift right arithmetic +static inline Vec8s & operator >>= (Vec8s & a, int b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec8sb operator == (Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmpeq_epi16_mask(a, b); +#else + return _mm_cmpeq_epi16(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec8sb operator != (Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmpneq_epi16_mask(a, b); +#else + return Vec8sb(~(a == b)); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec8sb operator > (Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi16_mask(a, b, 6); +#else + return _mm_cmpgt_epi16(a, b); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec8sb operator < (Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi16_mask(a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec8sb operator >= (Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi16_mask(a, b, 5); +#else + return Vec8sb(~(b > a)); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec8sb operator <= (Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi16_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec8s operator & (Vec8s const a, Vec8s const b) { + return Vec8s(Vec128b(a) & Vec128b(b)); +} +static inline Vec8s operator && (Vec8s const a, Vec8s const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec8s & operator &= (Vec8s & a, Vec8s const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec8s operator | (Vec8s const a, Vec8s const b) { + return Vec8s(Vec128b(a) | Vec128b(b)); +} +static inline Vec8s operator || (Vec8s const a, Vec8s const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec8s & operator |= (Vec8s & a, Vec8s const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8s operator ^ (Vec8s const a, Vec8s const b) { + return Vec8s(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec8s & operator ^= (Vec8s & a, Vec8s const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec8s operator ~ (Vec8s const a) { + return Vec8s(~Vec128b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? 
a[i] : b[i]; +static inline Vec8s select(Vec8sb const s, Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi16(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec8s if_add(Vec8sb const f, Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi16(a, f, a, b); +#else + return a + (Vec8s(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? (a[i] - b[i]) : a[i] +static inline Vec8s if_sub(Vec8sb const f, Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi16(a, f, a, b); +#else + return a - (Vec8s(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec8s if_mul(Vec8sb const f, Vec8s const a, Vec8s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mullo_epi16(a, f, a, b); +#else + return select(f, a * b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int16_t horizontal_add(Vec8s const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epi16(a); + __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E); // high element + __m128i sum3 = _mm_add_epi32(sum1, sum2); // sum + int16_t sum4 = _mm_cvtsi128_si32(sum3); // truncate to 16 bits + return sum4; // sign extend to 32 bits + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding +#elif INSTRSET >= 4 && false // SSSE3 + __m128i sum1 = _mm_hadd_epi16(a, a); // horizontally add 8 elements in 3 steps + __m128i sum2 = _mm_hadd_epi16(sum1, sum1); + __m128i sum3 = _mm_hadd_epi16(sum2, sum2); + int16_t sum4 = (int16_t)_mm_cvtsi128_si32(sum3); // 16 bit sum + return sum4; // sign extend to 32 bits +#else // SSE2 + __m128i sum1 = _mm_unpackhi_epi64(a, a); // 4 high elements + __m128i sum2 = _mm_add_epi16(a, sum1); // 4 sums + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); // 2 high elements + __m128i sum4 = _mm_add_epi16(sum2, sum3); // 2 sums + __m128i sum5 = _mm_shufflelo_epi16(sum4, 0x01); // 1 high element + __m128i sum6 = _mm_add_epi16(sum4, sum5); // 1 sum + int16_t sum7 = (int16_t)_mm_cvtsi128_si32(sum6); // 16 bit sum + return sum7; // sign extend to 32 bits +#endif +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Elements are sign extended before adding to avoid overflow +static inline int32_t horizontal_add_x(Vec8s const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epi16(a); + __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E); // high element + __m128i sum3 = _mm_add_epi32(sum1, sum2); // sum + return _mm_cvtsi128_si32(sum3); +#else + __m128i aeven = _mm_slli_epi32(a, 16); // even numbered elements of a. 
get sign bit in position + aeven = _mm_srai_epi32(aeven, 16); // sign extend even numbered elements + __m128i aodd = _mm_srai_epi32(a, 16); // sign extend odd numbered elements + __m128i sum1 = _mm_add_epi32(aeven, aodd); // add even and odd elements +#if INSTRSET >= 4 && false // SSSE3 + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding + __m128i sum2 = _mm_hadd_epi32(sum1, sum1); // horizontally add 4 elements in 2 steps + __m128i sum3 = _mm_hadd_epi32(sum2, sum2); + return _mm_cvtsi128_si32(sum3); +#else // SSE2 + __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1); // 2 high elements + __m128i sum3 = _mm_add_epi32(sum1, sum2); + __m128i sum4 = _mm_shuffle_epi32(sum3, 1); // 1 high elements + __m128i sum5 = _mm_add_epi32(sum3, sum4); + return _mm_cvtsi128_si32(sum5); // 32 bit sum +#endif +#endif +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec8s add_saturated(Vec8s const a, Vec8s const b) { + return _mm_adds_epi16(a, b); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec8s sub_saturated(Vec8s const a, Vec8s const b) { + return _mm_subs_epi16(a, b); +} + +// function max: a > b ? a : b +static inline Vec8s max(Vec8s const a, Vec8s const b) { + return _mm_max_epi16(a, b); +} + +// function min: a < b ? a : b +static inline Vec8s min(Vec8s const a, Vec8s const b) { + return _mm_min_epi16(a, b); +} + +// function abs: a >= 0 ? a : -a +static inline Vec8s abs(Vec8s const a) { +#if INSTRSET >= 4 // SSSE3 supported + return _mm_abs_epi16(a); +#else // SSE2 + __m128i nega = _mm_sub_epi16(_mm_setzero_si128(), a); + return _mm_max_epi16(a, nega); +#endif +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec8s abs_saturated(Vec8s const a) { +#if INSTRSET >= 10 + return _mm_min_epu16(abs(a), Vec8s(0x7FFF)); +#else + __m128i absa = abs(a); // abs(a) + __m128i overfl = _mm_srai_epi16(absa, 15); // sign + return _mm_add_epi16(absa, overfl); // subtract 1 if 0x8000 +#endif +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec8s rotate_left(Vec8s const a, int b) { +#ifdef __XOP__ // AMD XOP instruction set + return (Vec8s)_mm_rot_epi16(a, _mm_set1_epi16(b)); +#else // SSE2 instruction set + __m128i left = _mm_sll_epi16(a, _mm_cvtsi32_si128(b & 0x0F)); // a << b + __m128i right = _mm_srl_epi16(a, _mm_cvtsi32_si128((-b) & 0x0F)); // a >> (16 - b) + __m128i rot = _mm_or_si128(left, right); // or + return rot; +#endif +} + + +/***************************************************************************** +* +* Vector of 8 16-bit unsigned integers +* +*****************************************************************************/ + +class Vec8us : public Vec8s { +public: + // Default constructor: + Vec8us() { + } + // Constructor to broadcast the same value into all elements: + Vec8us(uint32_t i) { + xmm = _mm_set1_epi16((int16_t)i); + } + // Constructor to build from all elements: + Vec8us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7) { + xmm = _mm_setr_epi16((int16_t)i0, (int16_t)i1, (int16_t)i2, (int16_t)i3, (int16_t)i4, (int16_t)i5, (int16_t)i6, (int16_t)i7); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec8us(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec8us & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Member 
function to load 64-bit integer from array + Vec8us & loadl(void const * p) { + xmm = _mm_loadl_epi64((__m128i const*)p); + return *this; + } + // Member function to load from array (unaligned) + Vec8us & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec8us & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Member function to change a single element in vector + // Note: This function is inefficient. Use load function if changing more than one element + Vec8us const insert(int index, uint16_t value) { + Vec8s::insert(index, (int16_t)value); + return *this; + } + // Member function extract a single element from vector + uint16_t extract(int index) const { + return (uint16_t)Vec8s::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint16_t operator [] (int index) const { + return extract(index); + } + static constexpr int elementtype() { + return 7; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec8us operator + (Vec8us const a, Vec8us const b) { + return Vec8us(Vec8s(a) + Vec8s(b)); +} + +// vector operator - : subtract +static inline Vec8us operator - (Vec8us const a, Vec8us const b) { + return Vec8us(Vec8s(a) - Vec8s(b)); +} + +// vector operator * : multiply +static inline Vec8us operator * (Vec8us const a, Vec8us const b) { + return Vec8us(Vec8s(a) * Vec8s(b)); +} + +// vector operator / : divide +// See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec8us operator >> (Vec8us const a, uint32_t b) { + return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec8us operator >> (Vec8us const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right logical +static inline Vec8us & operator >>= (Vec8us & a, int b) { + a = a >> b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec8us operator << (Vec8us const a, uint32_t b) { + return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)b)); +} + +// vector operator << : shift left all elements +static inline Vec8us operator << (Vec8us const a, int32_t b) { + return a << (uint32_t)b; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec8sb operator >= (Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epu16_mask(a, b, 5); +#elif defined (__XOP__) // AMD XOP instruction set + return (Vec8sb)_mm_comge_epu16(a, b); +#elif INSTRSET >= 5 // SSE4.1 + __m128i max_ab = _mm_max_epu16(a, b); // max(a,b), unsigned + return _mm_cmpeq_epi16(a, max_ab); // a == max(a,b) +#else // SSE2 instruction set + __m128i s = _mm_subs_epu16(b, a); // b-a, saturated + return _mm_cmpeq_epi16(s, _mm_setzero_si128()); // s == 0 +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec8sb operator <= (Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epu16_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec8sb operator > (Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epu16_mask(a, b, 6); 
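+    // The predicate constant 6 is _MM_CMPINT_NLE ("not less-or-equal"),
+    // which yields the unsigned a > b test for the mask compare above.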
+#elif defined (__XOP__) // AMD XOP instruction set + return (Vec8sb)_mm_comgt_epu16(a, b); +#else // SSE2 instruction set + return Vec8sb(~(b >= a)); +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec8sb operator < (Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epu16_mask(a, b, 1); +#else + return b > a; +#endif +} + +// vector operator & : bitwise and +static inline Vec8us operator & (Vec8us const a, Vec8us const b) { + return Vec8us(Vec128b(a) & Vec128b(b)); +} +static inline Vec8us operator && (Vec8us const a, Vec8us const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec8us operator | (Vec8us const a, Vec8us const b) { + return Vec8us(Vec128b(a) | Vec128b(b)); +} +static inline Vec8us operator || (Vec8us const a, Vec8us const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec8us operator ^ (Vec8us const a, Vec8us const b) { + return Vec8us(Vec128b(a) ^ Vec128b(b)); +} + +// vector operator ~ : bitwise not +static inline Vec8us operator ~ (Vec8us const a) { + return Vec8us(~Vec128b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8us select(Vec8sb const s, Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi16(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec8us if_add(Vec8sb const f, Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi16(a, f, a, b); +#else + return a + (Vec8us(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? (a[i] - b[i]) : a[i] +static inline Vec8us if_sub(Vec8sb const f, Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi16(a, f, a, b); +#else + return a - (Vec8us(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec8us if_mul(Vec8sb const f, Vec8us const a, Vec8us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mullo_epi16(a, f, a, b); +#else + return select(f, a * b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. 
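+// (the 16-bit lane sums wrap modulo 2^16; the low 16 bits are zero-extended
+// into the uint32_t return value)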
+// Overflow will wrap around +static inline uint32_t horizontal_add(Vec8us const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epu16(a); + __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E); // high element + __m128i sum3 = _mm_add_epi32(sum1, sum2); // sum + uint16_t sum4 = _mm_cvtsi128_si32(sum3); // truncate to 16 bits + return sum4; // zero extend to 32 bits +#elif INSTRSET >= 4 && false // SSSE3 + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding + __m128i sum1 = _mm_hadd_epi16(a, a); // horizontally add 8 elements in 3 steps + __m128i sum2 = _mm_hadd_epi16(sum1, sum1); + __m128i sum3 = _mm_hadd_epi16(sum2, sum2); + uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3); // 16 bit sum + return sum4; // zero extend to 32 bits +#else // SSE2 + __m128i sum1 = _mm_unpackhi_epi64(a, a); // 4 high elements + __m128i sum2 = _mm_add_epi16(a, sum1); // 4 sums + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); // 2 high elements + __m128i sum4 = _mm_add_epi16(sum2, sum3); // 2 sums + __m128i sum5 = _mm_shufflelo_epi16(sum4, 0x01); // 1 high element + __m128i sum6 = _mm_add_epi16(sum4, sum5); // 1 sum + uint16_t sum7 = (uint16_t)_mm_cvtsi128_si32(sum6); // 16 bit sum + return sum7; // zero extend to 32 bits +#endif +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Each element is zero-extended before addition to avoid overflow +static inline uint32_t horizontal_add_x(Vec8us const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epu16(a); + __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E); // high element + __m128i sum3 = _mm_add_epi32(sum1, sum2); // sum + return (uint32_t)_mm_cvtsi128_si32(sum3); + /* +#elif INSTRSET >= 4 // SSSE3 + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding + __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for even positions + __m128i aeven = _mm_and_si128(a,mask); // even numbered elements of a + __m128i aodd = _mm_srli_epi32(a,16); // zero extend odd numbered elements + __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements + __m128i sum2 = _mm_hadd_epi32(sum1,sum1); // horizontally add 4 elements in 2 steps + __m128i sum3 = _mm_hadd_epi32(sum2,sum2); + return (uint32_t)_mm_cvtsi128_si32(sum3); + */ +#else // SSE2 +#if INSTRSET >= 10 // AVX512VL + AVX512BW + __m128i aeven = _mm_maskz_mov_epi16(0x55, a); +#else + __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for even positions + __m128i aeven = _mm_and_si128(a, mask); // even numbered elements of a +#endif + __m128i aodd = _mm_srli_epi32(a, 16); // zero extend odd numbered elements + __m128i sum1 = _mm_add_epi32(aeven, aodd); // add even and odd elements + __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1); // 2 high elements + __m128i sum3 = _mm_add_epi32(sum1, sum2); + __m128i sum4 = _mm_shuffle_epi32(sum3, 0x01); // 1 high elements + __m128i sum5 = _mm_add_epi32(sum3, sum4); + return (uint32_t)_mm_cvtsi128_si32(sum5); // 16 bit sum +#endif +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec8us add_saturated(Vec8us const a, Vec8us const b) { + return _mm_adds_epu16(a, b); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec8us sub_saturated(Vec8us const a, Vec8us const b) { + return _mm_subs_epu16(a, b); +} + +// function max: a > b ? 
a : b +static inline Vec8us max(Vec8us const a, Vec8us const b) { +#if INSTRSET >= 5 // SSE4.1 + return _mm_max_epu16(a, b); +#else // SSE2 + __m128i signbit = _mm_set1_epi32(0x80008000); + __m128i a1 = _mm_xor_si128(a, signbit); // add 0x8000 + __m128i b1 = _mm_xor_si128(b, signbit); // add 0x8000 + __m128i m1 = _mm_max_epi16(a1, b1); // signed max + return _mm_xor_si128(m1, signbit); // sub 0x8000 +#endif +} + +// function min: a < b ? a : b +static inline Vec8us min(Vec8us const a, Vec8us const b) { +#if INSTRSET >= 5 // SSE4.1 + return _mm_min_epu16(a, b); +#else // SSE2 + __m128i signbit = _mm_set1_epi32(0x80008000); + __m128i a1 = _mm_xor_si128(a, signbit); // add 0x8000 + __m128i b1 = _mm_xor_si128(b, signbit); // add 0x8000 + __m128i m1 = _mm_min_epi16(a1, b1); // signed min + return _mm_xor_si128(m1, signbit); // sub 0x8000 +#endif +} + + +/***************************************************************************** +* +* Vector of 4 32-bit signed integers +* +*****************************************************************************/ + +class Vec4i : public Vec128b { +public: + // Default constructor: + Vec4i() { + } + // Constructor to broadcast the same value into all elements: + Vec4i(int i) { + xmm = _mm_set1_epi32(i); + } + // Constructor to build from all elements: + Vec4i(int32_t i0, int32_t i1, int32_t i2, int32_t i3) { + xmm = _mm_setr_epi32(i0, i1, i2, i3); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec4i(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec4i & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Type cast operator to convert to __m128i used in intrinsics + operator __m128i() const { + return xmm; + } + // Member function to load from array (unaligned) + Vec4i & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec4i & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Member function to load 4 unsigned 8-bit integers from array + Vec4i & load_4uc(void const * p) { +#if INSTRSET >= 5 // SSE4.1 + xmm = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t const*)p)); +#else + __m128i zero = _mm_setzero_si128(); + xmm = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t const*)p), zero), zero); +#endif + return *this; + } + // Member function to load 4 unsigned 16-bit integers from array + Vec4i & load_4us(void const * p) { +#if INSTRSET >= 5 // SSE4.1 + xmm = _mm_cvtepu16_epi32(Vec8us().loadl(p)); +#else + xmm = _mm_unpacklo_epi16(Vec8us().loadl(p), _mm_setzero_si128()); +#endif + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec4i & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + xmm = _mm_maskz_loadu_epi32(__mmask8((1u << n) - 1), p); +#else + switch (n) { + case 0: + *this = 0; break; + case 1: + xmm = _mm_cvtsi32_si128(*(int32_t const*)p); break; + case 2: + // intrinsic for movq is missing! + xmm = _mm_setr_epi32(((int32_t const*)p)[0], ((int32_t const*)p)[1], 0, 0); break; + case 3: + xmm = _mm_setr_epi32(((int32_t const*)p)[0], ((int32_t const*)p)[1], ((int32_t const*)p)[2], 0); break; + case 4: + load(p); break; + default: + break; + } +#endif + return *this; + } + // Partial store. 
Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + _mm_mask_storeu_epi32(p, __mmask8((1u << n) - 1), xmm); +#else + union { + int32_t i[4]; + int64_t q[2]; + } u; + switch (n) { + case 1: + *(int32_t*)p = _mm_cvtsi128_si32(xmm); break; + case 2: + // intrinsic for movq is missing! + store(u.i); + *(int64_t*)p = u.q[0]; break; + case 3: + store(u.i); + *(int64_t*)p = u.q[0]; + ((int32_t*)p)[2] = u.i[2]; break; + case 4: + store(p); break; + default: + break; + } +#endif + } + + // cut off vector to n elements. The last 4-n elements are set to zero + Vec4i & cutoff(int n) { +#if INSTRSET >= 10 + xmm = _mm_maskz_mov_epi32(__mmask8((1u << n) - 1), xmm); +#else + * this = Vec16c(xmm).cutoff(n * 4); +#endif + return *this; + } + // Member function to change a single element in vector + Vec4i const insert(int index, int32_t value) { +#if INSTRSET >= 10 + xmm = _mm_mask_set1_epi32(xmm, __mmask8(1u << index), value); +#else + __m128i broad = _mm_set1_epi32(value); // broadcast value into all elements + const int32_t maskl[8] = { 0,0,0,0,-1,0,0,0 }; + __m128i mask = _mm_loadu_si128((__m128i const*)(maskl + 4 - (index & 3))); // mask with FFFFFFFF at index position + xmm = selectb(mask, broad, xmm); +#endif + return *this; + } + // Member function extract a single element from vector + int32_t extract(int index) const { +#if INSTRSET >= 10 + __m128i x = _mm_maskz_compress_epi32(__mmask8(1u << index), xmm); + return _mm_cvtsi128_si32(x); +#else + int32_t x[4]; + store(x); + return x[index & 3]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int32_t operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 4; + } + static constexpr int elementtype() { + return 8; + } +}; + + +/***************************************************************************** +* +* Vec4ib: Vector of 4 Booleans for use with Vec4i and Vec4ui +* +*****************************************************************************/ +#if INSTRSET < 10 // broad boolean vectors + +class Vec4ib : public Vec4i { +public: + // Default constructor: + Vec4ib() { + } + // Constructor to build from all elements: + Vec4ib(bool x0, bool x1, bool x2, bool x3) { + xmm = Vec4i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3)); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec4ib(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec4ib & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec4ib(bool b) : Vec4i(-int32_t(b)) { + } + // Assignment operator to broadcast scalar value: + Vec4ib & operator = (bool b) { + *this = Vec4ib(b); + return *this; + } + Vec4ib & insert(int index, bool a) { + Vec4i::insert(index, -(int)a); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec4i::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
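+    // (illustrative sketch: to modify a single lane instead, go through insert(),
+    //  e.g.  Vec4ib m(false); m.insert(2, true); bool b2 = m[2]; )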
+ bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec4ib & load_bits(uint8_t a) { + __m128i b1 = _mm_set1_epi8((int8_t)a); // broadcast byte + __m128i m1 = constant4ui<1, 2, 4, 8>(); + __m128i c1 = _mm_and_si128(b1, m1); // isolate one bit in each byte + xmm = _mm_cmpgt_epi32(c1, _mm_setzero_si128()); // compare signed because no numbers are negative + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec4ib(int b) = delete; + Vec4ib & operator = (int x) = delete; +}; + +#else + +typedef Vec4b Vec4ib; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Define operators for Vec4ib +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec4ib operator & (Vec4ib const a, Vec4ib const b) { + return Vec4ib(Vec128b(a) & Vec128b(b)); +} +static inline Vec4ib operator && (Vec4ib const a, Vec4ib const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec4ib & operator &= (Vec4ib & a, Vec4ib const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec4ib operator | (Vec4ib const a, Vec4ib const b) { + return Vec4ib(Vec128b(a) | Vec128b(b)); +} +static inline Vec4ib operator || (Vec4ib const a, Vec4ib const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec4ib & operator |= (Vec4ib & a, Vec4ib const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec4ib operator ^ (Vec4ib const a, Vec4ib const b) { + return Vec4ib(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec4ib & operator ^= (Vec4ib & a, Vec4ib const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec4ib operator == (Vec4ib const a, Vec4ib const b) { + return Vec4ib(a ^ (~b)); +} + +// vector operator != : xor +static inline Vec4ib operator != (Vec4ib const a, Vec4ib const b) { + return Vec4ib(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec4ib operator ~ (Vec4ib const a) { + return Vec4ib(~Vec128b(a)); +} + +// vector operator ! : element not +static inline Vec4ib operator ! (Vec4ib const a) { + return ~a; +} + +// vector function andnot +static inline Vec4ib andnot(Vec4ib const a, Vec4ib const b) { + return Vec4ib(andnot(Vec128b(a), Vec128b(b))); +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec4ib const a) { + return _mm_movemask_epi8(a) == 0xFFFF; +} + +// horizontal_or. Returns true if at least one element is true +static inline bool horizontal_or(Vec4ib const a) { +#if INSTRSET >= 5 // SSE4.1 supported. 
Use PTEST + return !_mm_testz_si128(a, a); +#else + return _mm_movemask_epi8(a) != 0; +#endif +} +#endif // broad boolean vectors + + +/***************************************************************************** +* +* Operators for Vec4i +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec4i operator + (Vec4i const a, Vec4i const b) { + return _mm_add_epi32(a, b); +} +// vector operator += : add +static inline Vec4i & operator += (Vec4i & a, Vec4i const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec4i operator ++ (Vec4i & a, int) { + Vec4i a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec4i & operator ++ (Vec4i & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec4i operator - (Vec4i const a, Vec4i const b) { + return _mm_sub_epi32(a, b); +} +// vector operator - : unary minus +static inline Vec4i operator - (Vec4i const a) { + return _mm_sub_epi32(_mm_setzero_si128(), a); +} +// vector operator -= : subtract +static inline Vec4i & operator -= (Vec4i & a, Vec4i const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec4i operator -- (Vec4i & a, int) { + Vec4i a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec4i & operator -- (Vec4i & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec4i operator * (Vec4i const a, Vec4i const b) { +#if INSTRSET >= 5 // SSE4.1 instruction set + return _mm_mullo_epi32(a, b); +#else + __m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1) + __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1) + __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0) + __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) + __m128i prod01 = _mm_unpacklo_epi32(prod02, prod13); // (-,-,a1*b1,a0*b0) + __m128i prod23 = _mm_unpackhi_epi32(prod02, prod13); // (-,-,a3*b3,a2*b2) + return _mm_unpacklo_epi64(prod01, prod23); // (ab3,ab2,ab1,ab0) +#endif +} + +// vector operator *= : multiply +static inline Vec4i & operator *= (Vec4i & a, Vec4i const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer. 
See bottom of file + +// vector operator << : shift left +static inline Vec4i operator << (Vec4i const a, int32_t b) { + return _mm_sll_epi32(a, _mm_cvtsi32_si128(b)); +} +// vector operator <<= : shift left +static inline Vec4i & operator <<= (Vec4i & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec4i operator >> (Vec4i const a, int32_t b) { + return _mm_sra_epi32(a, _mm_cvtsi32_si128(b)); +} +// vector operator >>= : shift right arithmetic +static inline Vec4i & operator >>= (Vec4i & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec4ib operator == (Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi32_mask(a, b, 0); +#else + return _mm_cmpeq_epi32(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec4ib operator != (Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi32_mask(a, b, 4); +#else + return Vec4ib(Vec4i(~(a == b))); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec4ib operator > (Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi32_mask(a, b, 6); +#else + return _mm_cmpgt_epi32(a, b); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec4ib operator < (Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi32_mask(a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec4ib operator >= (Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi32_mask(a, b, 5); +#else + return Vec4ib(Vec4i(~(b > a))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec4ib operator <= (Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi32_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec4i operator & (Vec4i const a, Vec4i const b) { + return Vec4i(Vec128b(a) & Vec128b(b)); +} +static inline Vec4i operator && (Vec4i const a, Vec4i const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec4i & operator &= (Vec4i & a, Vec4i const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec4i operator | (Vec4i const a, Vec4i const b) { + return Vec4i(Vec128b(a) | Vec128b(b)); +} +static inline Vec4i operator || (Vec4i const a, Vec4i const b) { + return a | b; +} +// vector operator |= : bitwise and +static inline Vec4i & operator |= (Vec4i & a, Vec4i const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec4i operator ^ (Vec4i const a, Vec4i const b) { + return Vec4i(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise and +static inline Vec4i & operator ^= (Vec4i & a, Vec4i const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec4i operator ~ (Vec4i const a) { + return Vec4i(~Vec128b(a)); +} + +// vector operator ! : returns true for elements == 0 +static inline Vec4ib operator ! 
(Vec4i const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi32_mask(a, _mm_setzero_si128(), 0); +#else + return _mm_cmpeq_epi32(a, _mm_setzero_si128()); +#endif +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; +// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. +// (s is signed) +static inline Vec4i select(Vec4ib const s, Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi32(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec4i if_add(Vec4ib const f, Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi32(a, f, a, b); +#else + return a + (Vec4i(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? (a[i] - b[i]) : a[i] +static inline Vec4i if_sub(Vec4ib const f, Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi32(a, f, a, b); +#else + return a - (Vec4i(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec4i if_mul(Vec4ib const f, Vec4i const a, Vec4i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mullo_epi32(a, f, a, b); +#else + return select(f, a * b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int32_t horizontal_add(Vec4i const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epi32(a); + __m128i sum2 = _mm_shuffle_epi32(sum1, 0x0E); // high element + __m128i sum3 = _mm_add_epi32(sum1, sum2); // sum + return _mm_cvtsi128_si32(sum3); // truncate to 32 bits +#elif INSTRSET >= 4 & false // SSSE3 + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding + __m128i sum1 = _mm_hadd_epi32(a, a); // horizontally add 4 elements in 2 steps + __m128i sum2 = _mm_hadd_epi32(sum1, sum1); + return _mm_cvtsi128_si32(sum2); // 32 bit sum +#else // SSE2 + __m128i sum1 = _mm_unpackhi_epi64(a, a); // 2 high elements + __m128i sum2 = _mm_add_epi32(a, sum1); // 2 sums + __m128i sum3 = _mm_shuffle_epi32(sum2, 0x01); // 1 high element + __m128i sum4 = _mm_add_epi32(sum2, sum3); // 2 sums + return _mm_cvtsi128_si32(sum4); // 32 bit sum +#endif +} + +// function used below +static inline int64_t _emulate_movq(__m128i const x) { +#ifdef __x86_64__ + return _mm_cvtsi128_si64(x); +#else + // 64 bit registers not available + union { + __m128i m; + int64_t y; + } u; + _mm_storel_epi64(&u.m, x); + return u.y; +#endif +} + +// Horizontal add extended: Calculates the sum of all vector elements. 
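+// (the int64_t return value cannot overflow for four 32-bit inputs)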
+// Elements are sign extended before adding to avoid overflow +static inline int64_t horizontal_add_x(Vec4i const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epi32(a); +#else // SSE2 + __m128i signs = _mm_srai_epi32(a, 31); // sign of all elements + __m128i a01 = _mm_unpacklo_epi32(a, signs); // sign-extended a0, a1 + __m128i a23 = _mm_unpackhi_epi32(a, signs); // sign-extended a2, a3 + __m128i sum1 = _mm_add_epi64(a01, a23); // add +#endif + __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1); // high qword + __m128i sum3 = _mm_add_epi64(sum1, sum2); // add + return _emulate_movq(sum3); +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec4i add_saturated(Vec4i const a, Vec4i const b) { + // is there a faster method? + __m128i sum = _mm_add_epi32(a, b); // a + b + __m128i axb = _mm_xor_si128(a, b); // check if a and b have different sign + __m128i axs = _mm_xor_si128(a, sum); // check if a and sum have different sign + __m128i overf1 = _mm_andnot_si128(axb, axs); // check if sum has wrong sign + __m128i overf2 = _mm_srai_epi32(overf1, 31); // -1 if overflow + __m128i asign = _mm_srli_epi32(a, 31); // 1 if a < 0 + __m128i sat1 = _mm_srli_epi32(overf2, 1); // 7FFFFFFF if overflow + __m128i sat2 = _mm_add_epi32(sat1, asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow + return selectb(overf2, sat2, sum); // sum if not overflow, else sat2 +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec4i sub_saturated(Vec4i const a, Vec4i const b) { + __m128i diff = _mm_sub_epi32(a, b); // a + b + __m128i axb = _mm_xor_si128(a, b); // check if a and b have different sign + __m128i axs = _mm_xor_si128(a, diff); // check if a and sum have different sign + __m128i overf1 = _mm_and_si128(axb, axs); // check if sum has wrong sign + __m128i overf2 = _mm_srai_epi32(overf1, 31); // -1 if overflow + __m128i asign = _mm_srli_epi32(a, 31); // 1 if a < 0 + __m128i sat1 = _mm_srli_epi32(overf2, 1); // 7FFFFFFF if overflow + __m128i sat2 = _mm_add_epi32(sat1, asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow + return selectb(overf2, sat2, diff); // diff if not overflow, else sat2 +} + +// function max: a > b ? a : b +static inline Vec4i max(Vec4i const a, Vec4i const b) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_max_epi32(a, b); +#else + __m128i greater = _mm_cmpgt_epi32(a, b); + return selectb(greater, a, b); +#endif +} + +// function min: a < b ? a : b +static inline Vec4i min(Vec4i const a, Vec4i const b) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_min_epi32(a, b); +#else + __m128i greater = _mm_cmpgt_epi32(a, b); + return selectb(greater, b, a); +#endif +} + +// function abs: a >= 0 ? 
a : -a +static inline Vec4i abs(Vec4i const a) { +#if INSTRSET >= 4 // SSSE3 supported + return _mm_abs_epi32(a); +#else // SSE2 + __m128i sign = _mm_srai_epi32(a, 31); // sign of a + __m128i inv = _mm_xor_si128(a, sign); // invert bits if negative + return _mm_sub_epi32(inv, sign); // add 1 +#endif +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec4i abs_saturated(Vec4i const a) { +#if INSTRSET >= 10 + return _mm_min_epu32(abs(a), Vec4i(0x7FFFFFFF)); +#else + __m128i absa = abs(a); // abs(a) + __m128i overfl = _mm_srai_epi32(absa, 31); // sign + return _mm_add_epi32(absa, overfl); // subtract 1 if 0x80000000 +#endif +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec4i rotate_left(Vec4i const a, int b) { +#if INSTRSET >= 10 // __AVX512VL__ + return _mm_rolv_epi32(a, _mm_set1_epi32(b)); +#elif defined __XOP__ // AMD XOP instruction set + return _mm_rot_epi32(a, _mm_set1_epi32(b)); +#else // SSE2 instruction set + __m128i left = _mm_sll_epi32(a, _mm_cvtsi32_si128(b & 0x1F)); // a << b + __m128i right = _mm_srl_epi32(a, _mm_cvtsi32_si128((-b) & 0x1F));// a >> (32 - b) + __m128i rot = _mm_or_si128(left, right); // or + return rot; +#endif +} + + +/***************************************************************************** +* +* Vector of 4 32-bit unsigned integers +* +*****************************************************************************/ + +class Vec4ui : public Vec4i { +public: + // Default constructor: + Vec4ui() { + } + // Constructor to broadcast the same value into all elements: + Vec4ui(uint32_t i) { + xmm = _mm_set1_epi32((int32_t)i); + } + // Constructor to build from all elements: + Vec4ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { + xmm = _mm_setr_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec4ui(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec4ui & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec4ui & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec4ui & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Member function to change a single element in vector + Vec4ui const insert(int index, uint32_t value) { + Vec4i::insert(index, (int32_t)value); + return *this; + } + // Member function extract a single element from vector + uint32_t extract(int index) const { + return (uint32_t)Vec4i::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint32_t operator [] (int index) const { + return extract(index); + } + static constexpr int elementtype() { + return 9; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec4ui operator + (Vec4ui const a, Vec4ui const b) { + return Vec4ui(Vec4i(a) + Vec4i(b)); +} + +// vector operator - : subtract +static inline Vec4ui operator - (Vec4ui const a, Vec4ui const b) { + return Vec4ui(Vec4i(a) - Vec4i(b)); +} + +// vector operator * : multiply +static inline Vec4ui operator * (Vec4ui const a, Vec4ui const b) { + return Vec4ui(Vec4i(a) * Vec4i(b)); +} + +// vector operator / : divide. 
See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec4ui operator >> (Vec4ui const a, uint32_t b) { + return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)b)); +} +// vector operator >> : shift right logical all elements +static inline Vec4ui operator >> (Vec4ui const a, int32_t b) { + return a >> (uint32_t)b; +} +// vector operator >>= : shift right logical +static inline Vec4ui & operator >>= (Vec4ui & a, int b) { + a = a >> b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec4ui operator << (Vec4ui const a, uint32_t b) { + return Vec4ui((Vec4i)a << (int32_t)b); +} +// vector operator << : shift left all elements +static inline Vec4ui operator << (Vec4ui const a, int32_t b) { + return Vec4ui((Vec4i)a << (int32_t)b); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec4ib operator > (Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu32_mask(a, b, 6); +#elif defined (__XOP__) // AMD XOP instruction set + return (Vec4ib)_mm_comgt_epu32(a, b); +#else // SSE2 instruction set + __m128i signbit = _mm_set1_epi32(0x80000000); + __m128i a1 = _mm_xor_si128(a, signbit); + __m128i b1 = _mm_xor_si128(b, signbit); + return (Vec4ib)_mm_cmpgt_epi32(a1, b1); // signed compare +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec4ib operator < (Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu32_mask(a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec4ib operator >= (Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu32_mask(a, b, 5); +#else +#ifdef __XOP__ // AMD XOP instruction set + return (Vec4ib)_mm_comge_epu32(a, b); +#elif INSTRSET >= 5 // SSE4.1 + __m128i max_ab = _mm_max_epu32(a, b); // max(a,b), unsigned + return (Vec4ib)_mm_cmpeq_epi32(a, max_ab); // a == max(a,b) +#else // SSE2 instruction set + return Vec4ib(Vec4i(~(b > a))); +#endif +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec4ib operator <= (Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu32_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec4ui operator & (Vec4ui const a, Vec4ui const b) { + return Vec4ui(Vec128b(a) & Vec128b(b)); +} +static inline Vec4ui operator && (Vec4ui const a, Vec4ui const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec4ui operator | (Vec4ui const a, Vec4ui const b) { + return Vec4ui(Vec128b(a) | Vec128b(b)); +} +static inline Vec4ui operator || (Vec4ui const a, Vec4ui const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec4ui operator ^ (Vec4ui const a, Vec4ui const b) { + return Vec4ui(Vec128b(a) ^ Vec128b(b)); +} + +// vector operator ~ : bitwise not +static inline Vec4ui operator ~ (Vec4ui const a) { + return Vec4ui(~Vec128b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? 
a[i] : b[i]; +static inline Vec4ui select(Vec4ib const s, Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi32(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec4ui if_add(Vec4ib const f, Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi32(a, f, a, b); +#else + return a + (Vec4ui(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? (a[i] - b[i]) : a[i] +static inline Vec4ui if_sub(Vec4ib const f, Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi32(a, f, a, b); +#else + return a - (Vec4ui(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec4ui if_mul(Vec4ib const f, Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mullo_epi32(a, f, a, b); +#else + return select(f, a * b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint32_t horizontal_add(Vec4ui const a) { + return (uint32_t)horizontal_add((Vec4i)a); +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Elements are zero extended before adding to avoid overflow +static inline uint64_t horizontal_add_x(Vec4ui const a) { +#ifdef __XOP__ // AMD XOP instruction set + __m128i sum1 = _mm_haddq_epu32(a); +#else // SSE2 + __m128i zero = _mm_setzero_si128(); // 0 + __m128i a01 = _mm_unpacklo_epi32(a, zero); // zero-extended a0, a1 + __m128i a23 = _mm_unpackhi_epi32(a, zero); // zero-extended a2, a3 + __m128i sum1 = _mm_add_epi64(a01, a23); // add +#endif + __m128i sum2 = _mm_unpackhi_epi64(sum1, sum1); // high qword + __m128i sum3 = _mm_add_epi64(sum1, sum2); // add + return (uint64_t)_emulate_movq(sum3); +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec4ui add_saturated(Vec4ui const a, Vec4ui const b) { + Vec4ui sum = a + b; + Vec4ui aorb = Vec4ui(a | b); +#if INSTRSET >= 10 + Vec4b overflow = _mm_cmp_epu32_mask(sum, aorb, 1); + return _mm_mask_set1_epi32(sum, overflow, -1); +#else + Vec4ui overflow = Vec4ui(sum < aorb); // overflow if a + b < (a | b) + return Vec4ui(sum | overflow); // return 0xFFFFFFFF if overflow +#endif +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec4ui sub_saturated(Vec4ui const a, Vec4ui const b) { + Vec4ui diff = a - b; +#if INSTRSET >= 10 + Vec4b nunderflow = _mm_cmp_epu32_mask(diff, a, 2); // not underflow if a - b <= a + return _mm_maskz_mov_epi32(nunderflow, diff); // zero if underflow +#else + Vec4ui underflow = Vec4ui(diff > a); // underflow if a - b > a + return _mm_andnot_si128(underflow, diff); // return 0 if underflow +#endif +} + +// function max: a > b ? a : b +static inline Vec4ui max(Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 5 // SSE4.1 + return _mm_max_epu32(a, b); +#else // SSE2 + return select(a > b, a, b); +#endif +} + +// function min: a < b ? 
a : b +static inline Vec4ui min(Vec4ui const a, Vec4ui const b) { +#if INSTRSET >= 5 // SSE4.1 + return _mm_min_epu32(a, b); +#else // SSE2 + return select(a > b, b, a); +#endif +} + + +/***************************************************************************** +* +* Vector of 2 64-bit signed integers +* +*****************************************************************************/ + +class Vec2q : public Vec128b { +public: + // Default constructor: + Vec2q() { + } + // Constructor to broadcast the same value into all elements: + Vec2q(int64_t i) { + xmm = _mm_set1_epi64x(i); + } + // Constructor to build from all elements: + Vec2q(int64_t i0, int64_t i1) { + xmm = _mm_set_epi64x(i1, i0); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec2q(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec2q & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Type cast operator to convert to __m128i used in intrinsics + operator __m128i() const { + return xmm; + } + // Member function to load from array (unaligned) + Vec2q & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec2q & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec2q & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + xmm = _mm_maskz_loadu_epi64(__mmask8((1u << n) - 1), p); +#else + switch (n) { + case 0: + *this = 0; break; + case 1: + // intrinsic for movq is missing! + *this = Vec2q(*(int64_t const*)p, 0); break; + case 2: + load(p); break; + default: + break; + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + _mm_mask_storeu_epi64(p, __mmask8((1u << n) - 1), xmm); +#else + switch (n) { + case 1: + int64_t q[2]; + store(q); + *(int64_t*)p = q[0]; break; + case 2: + store(p); break; + default: + break; + } +#endif + } + // cut off vector to n elements. The last 2-n elements are set to zero + Vec2q & cutoff(int n) { +#if INSTRSET >= 10 + xmm = _mm_maskz_mov_epi64(__mmask8((1u << n) - 1), xmm); +#else + *this = Vec16c(xmm).cutoff(n * 8); +#endif + return *this; + } + // Member function to change a single element in vector + // Note: This function is inefficient. 
Use load function if changing more than one element + Vec2q const insert(int index, int64_t value) { +#if INSTRSET >= 10 + xmm = _mm_mask_set1_epi64(xmm, __mmask8(1u << index), value); +#elif INSTRSET >= 5 && defined(__x86_64__) // SSE4.1 supported, 64 bit mode + if (index == 0) { + xmm = _mm_insert_epi64(xmm, value, 0); + } + else { + xmm = _mm_insert_epi64(xmm, value, 1); + } +#else // SSE2 +#if defined(__x86_64__) // 64 bit mode + __m128i v = _mm_cvtsi64_si128(value); // 64 bit load +#else + union { + __m128i m; + int64_t ii; + } u; + u.ii = value; + __m128i v = _mm_loadl_epi64(&u.m); +#endif + if (index == 0) { + v = _mm_unpacklo_epi64(v, v); + xmm = _mm_unpackhi_epi64(v, xmm); + } + else { // index = 1 + xmm = _mm_unpacklo_epi64(xmm, v); + } +#endif + return *this; + } + // Member function extract a single element from vector + int64_t extract(int index) const { +#if INSTRSET >= 10 + __m128i x = _mm_mask_unpackhi_epi64(xmm, __mmask8(index), xmm, xmm); + return _emulate_movq(x); +#else + int64_t x[2]; + store(x); + return x[index & 1]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int64_t operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 2; + } + static constexpr int elementtype() { + return 10; + } +}; + + +/***************************************************************************** +* +* Vec2qb: Vector of 2 Booleans for use with Vec2q and Vec2uq +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// Definition will be different for the AVX512 instruction set +class Vec2qb : public Vec2q { +public: + // Default constructor: + Vec2qb() { + } + // Constructor to build from all elements: + Vec2qb(bool x0, bool x1) { + xmm = Vec2q(-int64_t(x0), -int64_t(x1)); + } + // Constructor to convert from type __m128i used in intrinsics: + Vec2qb(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec2qb & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec2qb(bool b) : Vec2q(-int64_t(b)) { + } + // Assignment operator to broadcast scalar value: + Vec2qb & operator = (bool b) { + *this = Vec2qb(b); + return *this; + } + Vec2qb & insert(int index, bool a) { + Vec2q::insert(index, -(int64_t)a); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec2q::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec2qb & load_bits(uint8_t a) { + __m128i b1 = _mm_set1_epi8((int8_t)a); // broadcast byte + __m128i m1 = constant4ui<1, 1, 2, 2>(); + __m128i c1 = _mm_and_si128(b1, m1); // isolate one bit in each byte + xmm = _mm_cmpgt_epi32(c1, _mm_setzero_si128()); // compare with 0 (64 bit compare requires SSE4.1) + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
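+    // (deleting these overloads turns an accidental integer argument into a
+    //  compile-time error instead of a silently broadcast boolean mask)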
+ Vec2qb(int b) = delete; + Vec2qb & operator = (int x) = delete; +}; + +#else + +typedef Vec2b Vec2qb; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Define operators for Vec2qb +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec2qb operator & (Vec2qb const a, Vec2qb const b) { + return Vec2qb(Vec128b(a) & Vec128b(b)); +} +static inline Vec2qb operator && (Vec2qb const a, Vec2qb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec2qb & operator &= (Vec2qb & a, Vec2qb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec2qb operator | (Vec2qb const a, Vec2qb const b) { + return Vec2qb(Vec128b(a) | Vec128b(b)); +} +static inline Vec2qb operator || (Vec2qb const a, Vec2qb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec2qb & operator |= (Vec2qb & a, Vec2qb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec2qb operator ^ (Vec2qb const a, Vec2qb const b) { + return Vec2qb(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec2qb & operator ^= (Vec2qb & a, Vec2qb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec2qb operator == (Vec2qb const a, Vec2qb const b) { + return Vec2qb(a ^ (~b)); +} + +// vector operator != : xor +static inline Vec2qb operator != (Vec2qb const a, Vec2qb const b) { + return Vec2qb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec2qb operator ~ (Vec2qb const a) { + return Vec2qb(~Vec128b(a)); +} + +// vector operator ! : element not +static inline Vec2qb operator ! (Vec2qb const a) { + return ~a; +} + +// vector function andnot +static inline Vec2qb andnot(Vec2qb const a, Vec2qb const b) { + return Vec2qb(andnot(Vec128b(a), Vec128b(b))); +} + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec2qb const a) { + return _mm_movemask_epi8(a) == 0xFFFF; +} + +// horizontal_or. Returns true if at least one element is true +static inline bool horizontal_or(Vec2qb const a) { +#if INSTRSET >= 5 // SSE4.1 supported. 
Use PTEST + return !_mm_testz_si128(a, a); +#else + return _mm_movemask_epi8(a) != 0; +#endif +} + +#endif // broad boolean vectors + + + +/***************************************************************************** +* +* Operators for Vec2q +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec2q operator + (Vec2q const a, Vec2q const b) { + return _mm_add_epi64(a, b); +} +// vector operator += : add +static inline Vec2q & operator += (Vec2q & a, Vec2q const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec2q operator ++ (Vec2q & a, int) { + Vec2q a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec2q & operator ++ (Vec2q & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec2q operator - (Vec2q const a, Vec2q const b) { + return _mm_sub_epi64(a, b); +} +// vector operator - : unary minus +static inline Vec2q operator - (Vec2q const a) { + return _mm_sub_epi64(_mm_setzero_si128(), a); +} +// vector operator -= : subtract +static inline Vec2q & operator -= (Vec2q & a, Vec2q const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec2q operator -- (Vec2q & a, int) { + Vec2q a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec2q & operator -- (Vec2q & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec2q operator * (Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm_mullo_epi64(a, b); +#elif INSTRSET >= 5 // SSE4.1 supported + // Split into 32-bit multiplies + __m128i bswap = _mm_shuffle_epi32(b, 0xB1); // b0H,b0L,b1H,b1L (swap H<->L) + __m128i prodlh = _mm_mullo_epi32(a, bswap); // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products + __m128i zero = _mm_setzero_si128(); // 0 + __m128i prodlh2 = _mm_hadd_epi32(prodlh, zero); // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0 + __m128i prodlh3 = _mm_shuffle_epi32(prodlh2, 0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L + __m128i prodll = _mm_mul_epu32(a, b); // a0Lb0L,a1Lb1L, 64 bit unsigned products + __m128i prod = _mm_add_epi64(prodll, prodlh3); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32 + return prod; +#else // SSE2 + int64_t aa[2], bb[2]; + a.store(aa); // split into elements + b.store(bb); + return Vec2q(aa[0] * bb[0], aa[1] * bb[1]); // multiply elements separetely +#endif +} + +// vector operator *= : multiply +static inline Vec2q & operator *= (Vec2q & a, Vec2q const b) { + a = a * b; + return a; +} + +// vector operator << : shift left +static inline Vec2q operator << (Vec2q const a, int32_t b) { + return _mm_sll_epi64(a, _mm_cvtsi32_si128(b)); +} + +// vector operator <<= : shift left +static inline Vec2q & operator <<= (Vec2q & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec2q operator >> (Vec2q const a, int32_t b) { +#if INSTRSET >= 10 // AVX512VL + return _mm_sra_epi64(a, _mm_cvtsi32_si128(b)); +#else + __m128i bb, shi, slo, sra2; + if (b <= 32) { + bb = _mm_cvtsi32_si128(b); // b + shi = _mm_sra_epi32(a, bb); // a >> b signed dwords + slo = _mm_srl_epi64(a, bb); // a >> b unsigned qwords + } + else { // b > 32 + bb = _mm_cvtsi32_si128(b - 32); // b - 32 + shi = _mm_srai_epi32(a, 31); // sign of a + sra2 = _mm_sra_epi32(a, bb); // a >> (b-32) signed dwords + slo = _mm_srli_epi64(sra2, 32); // a >> (b-32) >> 32 (second shift 
unsigned qword) + } +#if INSTRSET >= 5 // SSE4.1 + return _mm_blend_epi16(slo, shi, 0xCC); +#else + __m128i mask = _mm_setr_epi32(0, -1, 0, -1); // mask for high part containing only sign + return selectb(mask, shi, slo); +#endif +#endif +} + +// vector operator >>= : shift right arithmetic +static inline Vec2q & operator >>= (Vec2q & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec2qb operator == (Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epi64_mask(a, b, 0); +#elif INSTRSET >= 5 // SSE4.1 supported + return _mm_cmpeq_epi64(a, b); +#else // SSE2 + // no 64 compare instruction. Do two 32 bit compares + __m128i com32 = _mm_cmpeq_epi32(a, b); // 32 bit compares + __m128i com32s = _mm_shuffle_epi32(com32, 0xB1); // swap low and high dwords + __m128i test = _mm_and_si128(com32, com32s); // low & high + __m128i teste = _mm_srai_epi32(test, 31); // extend sign bit to 32 bits + __m128i testee = _mm_shuffle_epi32(teste, 0xF5); // extend sign bit to 64 bits + return Vec2qb(Vec2q(testee)); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec2qb operator != (Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epi64_mask(a, b, 4); +#elif defined (__XOP__) // AMD XOP instruction set + return Vec2qb(_mm_comneq_epi64(a, b)); +#else // SSE2 instruction set + return Vec2qb(Vec2q(~(a == b))); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec2qb operator < (Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epi64_mask(a, b, 1); +#elif INSTRSET >= 6 // SSE4.2 supported + return Vec2qb(Vec2q(_mm_cmpgt_epi64(b, a))); +#else // SSE2 + // no 64 compare instruction. 
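// The SSE2 branch below derives a < b from the sign bit of (a & ~b) | ((a - b) & ~(a ^ b)):
// either a is negative while b is not, or the operands share a sign (so the subtraction
// cannot overflow) and a - b is negative. Scalar sketch of the same identity
// (hypothetical helper, unsigned arithmetic so the subtraction wraps like _mm_sub_epi64):
//   bool lt64(int64_t a, int64_t b) {
//       uint64_t ua = (uint64_t)a, ub = (uint64_t)b;
//       uint64_t s  = ua - ub;                               // wrapping subtraction
//       return (((ua & ~ub) | (s & ~(ua ^ ub))) >> 63) != 0; // sign bit of the combination
//   }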
Subtract + __m128i s = _mm_sub_epi64(a, b); // a-b + // a < b if a and b have same sign and s < 0 or (a < 0 and b >= 0) + // The latter () corrects for overflow + __m128i axb = _mm_xor_si128(a, b); // a ^ b + __m128i anb = _mm_andnot_si128(b, a); // a & ~b + __m128i snaxb = _mm_andnot_si128(axb, s); // s & ~(a ^ b) + __m128i or1 = _mm_or_si128(anb, snaxb); // (a & ~b) | (s & ~(a ^ b)) + __m128i teste = _mm_srai_epi32(or1, 31); // extend sign bit to 32 bits + __m128i testee = _mm_shuffle_epi32(teste, 0xF5); // extend sign bit to 64 bits + return testee; +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec2qb operator > (Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi64_mask(a, b, 6); +#else + return b < a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec2qb operator >= (Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epi64_mask(a, b, 5); +#elif defined (__XOP__) // AMD XOP instruction set + return Vec2qb(_mm_comge_epi64(a, b)); +#else // SSE2 instruction set + return Vec2qb(Vec2q(~(a < b))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec2qb operator <= (Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epi64_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec2q operator & (Vec2q const a, Vec2q const b) { + return Vec2q(Vec128b(a) & Vec128b(b)); +} +static inline Vec2q operator && (Vec2q const a, Vec2q const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec2q & operator &= (Vec2q & a, Vec2q const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec2q operator | (Vec2q const a, Vec2q const b) { + return Vec2q(Vec128b(a) | Vec128b(b)); +} +static inline Vec2q operator || (Vec2q const a, Vec2q const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec2q & operator |= (Vec2q & a, Vec2q const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec2q operator ^ (Vec2q const a, Vec2q const b) { + return Vec2q(Vec128b(a) ^ Vec128b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec2q & operator ^= (Vec2q & a, Vec2q const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec2q operator ~ (Vec2q const a) { + return Vec2q(~Vec128b(a)); +} + +// vector operator ! : logical not, returns true for elements == 0 +static inline Vec2qb operator ! (Vec2q const a) { + return a == Vec2q(_mm_setzero_si128()); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec2q select(Vec2qb const s, Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi64(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec2q if_add(Vec2qb const f, Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi64(a, f, a, b); +#else + return a + (Vec2q(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? 
(a[i] - b[i]) : a[i] +static inline Vec2q if_sub(Vec2qb const f, Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi64(a, f, a, b); +#else + return a - (Vec2q(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec2q if_mul(Vec2qb const f, Vec2q const a, Vec2q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mullo_epi64(a, f, a, b); +#else + return select(f, a * b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int64_t horizontal_add(Vec2q const a) { + __m128i sum1 = _mm_unpackhi_epi64(a, a); // high element + __m128i sum2 = _mm_add_epi64(a, sum1); // sum + return _emulate_movq(sum2); +} + +// function max: a > b ? a : b +static inline Vec2q max(Vec2q const a, Vec2q const b) { + return select(a > b, a, b); +} + +// function min: a < b ? a : b +static inline Vec2q min(Vec2q const a, Vec2q const b) { + return select(a < b, a, b); +} + +// function abs: a >= 0 ? a : -a +static inline Vec2q abs(Vec2q const a) { +#if INSTRSET >= 10 // AVX512VL + return _mm_abs_epi64(a); +#elif INSTRSET >= 6 // SSE4.2 supported + __m128i sign = _mm_cmpgt_epi64(_mm_setzero_si128(), a);// 0 > a + __m128i inv = _mm_xor_si128(a, sign); // invert bits if negative + return _mm_sub_epi64(inv, sign); // add 1 +#else // SSE2 + __m128i signh = _mm_srai_epi32(a, 31); // sign in high dword + __m128i sign = _mm_shuffle_epi32(signh, 0xF5); // copy sign to low dword + __m128i inv = _mm_xor_si128(a, sign); // invert bits if negative + return _mm_sub_epi64(inv, sign); // add 1 +#endif +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec2q abs_saturated(Vec2q const a) { +#if INSTRSET >= 10 + return _mm_min_epu64(abs(a), Vec2q(0x7FFFFFFFFFFFFFFF)); +#elif INSTRSET >= 6 // SSE4.2 supported + __m128i absa = abs(a); // abs(a) + __m128i overfl = _mm_cmpgt_epi64(_mm_setzero_si128(), absa);// 0 > a + return _mm_add_epi64(absa, overfl); // subtract 1 if 0x8000000000000000 +#else // SSE2 + __m128i absa = abs(a); // abs(a) + __m128i signh = _mm_srai_epi32(absa, 31); // sign in high dword + __m128i overfl = _mm_shuffle_epi32(signh, 0xF5); // copy sign to low dword + return _mm_add_epi64(absa, overfl); // subtract 1 if 0x8000000000000000 +#endif +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec2q rotate_left(Vec2q const a, int b) { +#if INSTRSET >= 10 // __AVX512VL__ + return _mm_rolv_epi64(a, _mm_set1_epi64x(int64_t(b))); +#elif defined __XOP__ // AMD XOP instruction set + return (Vec2q)_mm_rot_epi64(a, Vec2q(b)); +#else // SSE2 instruction set + __m128i left = _mm_sll_epi64(a, _mm_cvtsi32_si128(b & 0x3F)); // a << b + __m128i right = _mm_srl_epi64(a, _mm_cvtsi32_si128((-b) & 0x3F));// a >> (64 - b) + __m128i rot = _mm_or_si128(left, right); // or + return (Vec2q)rot; +#endif +} + + +/***************************************************************************** +* +* Vector of 2 64-bit unsigned integers +* +*****************************************************************************/ + +class Vec2uq : public Vec2q { +public: + // Default constructor: + Vec2uq() { + } + // Constructor to broadcast the same value into all elements: + Vec2uq(uint64_t i) { + xmm = Vec2q((int64_t)i); + } + // Constructor to build from all elements: + Vec2uq(uint64_t i0, uint64_t i1) { + xmm = Vec2q((int64_t)i0, (int64_t)i1); + } + // Constructor to convert 
from type __m128i used in intrinsics: + Vec2uq(__m128i const x) { + xmm = x; + } + // Assignment operator to convert from type __m128i used in intrinsics: + Vec2uq & operator = (__m128i const x) { + xmm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec2uq & load(void const * p) { + xmm = _mm_loadu_si128((__m128i const*)p); + return *this; + } + // Member function to load from array (aligned) + Vec2uq & load_a(void const * p) { + xmm = _mm_load_si128((__m128i const*)p); + return *this; + } + // Member function to change a single element in vector + Vec2uq const insert(int index, uint64_t value) { + Vec2q::insert(index, (int64_t)value); + return *this; + } + // Member function extract a single element from vector + uint64_t extract(int index) const { + return (uint64_t)Vec2q::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint64_t operator [] (int index) const { + return extract(index); + } + static constexpr int elementtype() { + return 11; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec2uq operator + (Vec2uq const a, Vec2uq const b) { + return Vec2uq(Vec2q(a) + Vec2q(b)); +} + +// vector operator - : subtract +static inline Vec2uq operator - (Vec2uq const a, Vec2uq const b) { + return Vec2uq(Vec2q(a) - Vec2q(b)); +} + +// vector operator * : multiply element by element +static inline Vec2uq operator * (Vec2uq const a, Vec2uq const b) { + return Vec2uq(Vec2q(a) * Vec2q(b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec2uq operator >> (Vec2uq const a, uint32_t b) { + return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec2uq operator >> (Vec2uq const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right logical +static inline Vec2uq & operator >>= (Vec2uq & a, int b) { + a = a >> b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec2uq operator << (Vec2uq const a, uint32_t b) { + return Vec2uq((Vec2q)a << (int32_t)b); +} + +// vector operator << : shift left all elements +static inline Vec2uq operator << (Vec2uq const a, int32_t b) { + return Vec2uq((Vec2q)a << b); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec2qb operator > (Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epu64_mask(a, b, 6); +#elif defined ( __XOP__ ) // AMD XOP instruction set + return Vec2qb(_mm_comgt_epu64(a, b)); +#elif INSTRSET >= 6 // SSE4.2 + __m128i sign64 = constant4ui<0, 0x80000000, 0, 0x80000000>(); + __m128i aflip = _mm_xor_si128(a, sign64); // flip sign bits to use signed compare + __m128i bflip = _mm_xor_si128(b, sign64); + Vec2q cmp = _mm_cmpgt_epi64(aflip, bflip); + return Vec2qb(cmp); +#else // SSE2 instruction set + __m128i sign32 = _mm_set1_epi32(0x80000000); // sign bit of each dword + __m128i aflip = _mm_xor_si128(a, sign32); // a with sign bits flipped to use signed compare + __m128i bflip = _mm_xor_si128(b, sign32); // b with sign bits flipped to use signed compare + __m128i equal = _mm_cmpeq_epi32(a, b); // a == b, dwords + __m128i bigger = _mm_cmpgt_epi32(aflip, bflip); // a > b, dwords + __m128i biggerl = _mm_shuffle_epi32(bigger, 0xA0); // a > b, low dwords copied to high dwords + __m128i eqbig = _mm_and_si128(equal, biggerl); 
// high part equal and low part bigger + __m128i hibig = _mm_or_si128(bigger, eqbig); // high part bigger or high part equal and low part bigger + __m128i big = _mm_shuffle_epi32(hibig, 0xF5); // result copied to low part + return Vec2qb(Vec2q(big)); +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec2qb operator < (Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu64_mask(a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec2qb operator >= (Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // broad boolean vectors + return _mm_cmp_epu64_mask(a, b, 5); +#elif defined (__XOP__) // AMD XOP instruction set + return Vec2qb(_mm_comge_epu64(a, b)); +#else // SSE2 instruction set + return Vec2qb(Vec2q(~(b > a))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec2qb operator <= (Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_cmp_epu64_mask(a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec2uq operator & (Vec2uq const a, Vec2uq const b) { + return Vec2uq(Vec128b(a) & Vec128b(b)); +} +static inline Vec2uq operator && (Vec2uq const a, Vec2uq const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec2uq operator | (Vec2uq const a, Vec2uq const b) { + return Vec2uq(Vec128b(a) | Vec128b(b)); +} +static inline Vec2uq operator || (Vec2uq const a, Vec2uq const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec2uq operator ^ (Vec2uq const a, Vec2uq const b) { + return Vec2uq(Vec128b(a) ^ Vec128b(b)); +} + +// vector operator ~ : bitwise not +static inline Vec2uq operator ~ (Vec2uq const a) { + return Vec2uq(~Vec128b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec2uq select(Vec2qb const s, Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mov_epi64(b, s, a); +#else + return selectb(s, a, b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec2uq if_add(Vec2qb const f, Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_add_epi64(a, f, a, b); +#else + return a + (Vec2uq(f) & b); +#endif +} + +// Conditional sub: For all vector elements i: result[i] = f[i] ? (a[i] - b[i]) : a[i] +static inline Vec2uq if_sub(Vec2qb const f, Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_sub_epi64(a, f, a, b); +#else + return a - (Vec2uq(f) & b); +#endif +} + +// Conditional mul: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] +static inline Vec2uq if_mul(Vec2qb const f, Vec2uq const a, Vec2uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm_mask_mullo_epi64(a, f, a, b); +#else + return select(f, a * b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint64_t horizontal_add(Vec2uq const a) { + return (uint64_t)horizontal_add((Vec2q)a); +} + +// function max: a > b ? 
a : b +static inline Vec2uq max(Vec2uq const a, Vec2uq const b) { + return select(a > b, a, b); +} + +// function min: a < b ? a : b +static inline Vec2uq min(Vec2uq const a, Vec2uq const b) { + return select(a > b, b, a); +} + + +/***************************************************************************** +* +* Vector permute functions +* +****************************************************************************** +* +* These permute functions can reorder the elements of a vector and optionally +* set some elements to zero. +* +* The indexes are inserted as template parameters in <>. +* These indexes must be constants. +* Each template parameter is an index to the element you want to select. +* An index of -1 will generate zero. +* An index of V_DC means don't care. This gives the best instruction that +* fits the remaining indexes +* +* Example: +* Vec4i a(10,11,12,13); // a is (10,11,12,13) +* Vec4i b, c; +* b = permute4<0,0,2,2>(a); // b is (10,10,12,12) +* c = permute4<3,2,-1,-1>(a); // c is (13,12, 0, 0) +* +* A lot of the code here is metaprogramming aiming to find the instructions +* that best fits the template parameters and instruction set. +* The final optimized code will contain only one or a few instructions. +* Higher instruction sets may give you more efficient code. +* +*****************************************************************************/ + +// permute Vec2q +template +static inline Vec2q permute2(Vec2q const a) { + constexpr int indexs[2] = { i0, i1 }; // indexes as array + __m128i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm_setzero_si128(); // just return zero + + constexpr bool fit_shleft = (flags & perm_shleft) != 0; + constexpr bool fit_shright = (flags & perm_shright) != 0; + constexpr bool fit_punpckh = (flags & perm_punpckh) != 0; + constexpr bool fit_punpckl = (flags & perm_punpckl) != 0; + constexpr bool fit_zeroing = (flags & perm_zeroing) != 0; + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + // try to fit various instructions + + if constexpr (fit_shleft && fit_zeroing) { + // pslldq does both permutation and zeroing. if zeroing not needed use punpckl instead + return _mm_bslli_si128(a, 8); + } + if constexpr (fit_shright && fit_zeroing) { + // psrldq does both permutation and zeroing. 
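// For two 64-bit lanes only two such move-and-zero patterns exist: _mm_bslli_si128(a, 8)
// yields permute2<-1, 0>(a) and _mm_bsrli_si128(a, 8) yields permute2<1, -1>(a); the
// whole-register byte shift moves one element and zeroes the vacated position in one instruction.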
if zeroing not needed use punpckh instead + return _mm_bsrli_si128(a, 8); + } + if constexpr (fit_punpckh) { // fits punpckhi + y = _mm_unpackhi_epi64(a, a); + } + else if constexpr (fit_punpckl) { // fits punpcklo + y = _mm_unpacklo_epi64(a, a); + } + else { // needs general permute + y = _mm_shuffle_epi32(a, i0 * 0x0A + i1 * 0xA0 + 0x44); + } + } + if constexpr (fit_zeroing) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_epi64(zero_mask<2>(indexs), y); +#else // use unpack to avoid using data cache + if constexpr (i0 == -1) { + y = _mm_unpackhi_epi64(_mm_setzero_si128(), y); + } + else if constexpr (i1 == -1) { + y = _mm_unpacklo_epi64(y, _mm_setzero_si128()); + } +#endif + } + return y; +} + +template +static inline Vec2uq permute2(Vec2uq const a) { + return Vec2uq(permute2 ((Vec2q)a)); +} + +// permute Vec4i +template +static inline Vec4i permute4(Vec4i const a) { + constexpr int indexs[4] = {i0, i1, i2, i3}; // indexes as array + __m128i y = a; // result + + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm_setzero_si128(); + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { + // use larger permutation + constexpr EList L = largeblock_perm<4>(indexs); // permutation pattern + y = permute2 (Vec2q(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_shleft) != 0) { // fits pslldq + y = _mm_bslli_si128(a, (16-(flags >> perm_rot_count)) & 0xF); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_shright) != 0) { // fits psrldq + y = _mm_bsrli_si128(a, (flags >> perm_rot_count) & 0xF); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } +#if INSTRSET >= 4 && INSTRSET < 10 // SSSE3, but no compact mask + else if constexpr ((flags & perm_zeroing) != 0) { + // Do both permutation and zeroing with PSHUFB instruction + const EList bm = pshufb_mask(indexs); + return _mm_shuffle_epi8(a, Vec4i().load(bm.a)); + } +#endif + else if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm_unpackhi_epi32(a, a); + } + else if constexpr ((flags & perm_punpckl) != 0) { // fits punpcklo + y = _mm_unpacklo_epi32(a, a); + } +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & perm_rotate) != 0) { // fits palignr + y = _mm_alignr_epi8(a, a, (flags >> perm_rot_count) & 0xF); + } +#endif + else { // needs general permute + y = _mm_shuffle_epi32(a, (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6); + } + } + if constexpr ((flags & perm_zeroing) != 0) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + // The mask-zero operation can be merged into the preceding instruction, whatever that is. + // A good optimizing compiler will do this automatically. 
+ // I don't want to clutter all the branches above with this + y = _mm_maskz_mov_epi32 (zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec4i().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec4ui permute4(Vec4ui const a) { + return Vec4ui(permute4 (Vec4i(a))); +} + + +// permute Vec8s +template +static inline Vec8s permute8(Vec8s const a) { + // indexes as array + constexpr int indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + constexpr uint64_t flags16 = perm16_flags(indexs); + + constexpr bool fit_zeroing = (flags & perm_zeroing) != 0;// needs additional zeroing + constexpr bool L2L = (flags16 & 1) != 0; // from low to low 64-bit part + constexpr bool H2H = (flags16 & 2) != 0; // from high to high 64-bit part + constexpr bool H2L = (flags16 & 4) != 0; // from high to low 64-bit part + constexpr bool L2H = (flags16 & 8) != 0; // from low to high 64-bit part + constexpr uint8_t pL2L = uint8_t(flags16 >> 32); // low to low permute pattern + constexpr uint8_t pH2H = uint8_t(flags16 >> 40); // high to high permute pattern + constexpr uint8_t noperm = 0xE4; // pattern for no permute + + __m128i y = a; // result + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm_setzero_si128(); + + if constexpr ((flags & perm_perm) != 0) { + // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { + // use larger permutation + constexpr EList L = largeblock_perm<8>(indexs); // permutation pattern + y = permute4 (Vec4i(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_shleft) != 0 && (flags & perm_addz) == 0) {// fits pslldq + return _mm_bslli_si128(a, (16-(flags >> perm_rot_count)) & 0xF); + } + else if constexpr ((flags & perm_shright) != 0 && (flags & perm_addz) == 0) {// fits psrldq + return _mm_bsrli_si128(a, (flags >> perm_rot_count) & 0xF); + } + else if constexpr ((flags & perm_broadcast) != 0 && (flags & perm_zeroing) == 0 && (flags >> perm_rot_count & 0xF) == 0) { +#if INSTRSET >= 8 // AVX2 + return _mm_broadcastw_epi16(y); +#else + y = _mm_shufflelo_epi16(a, 0); // broadcast of first element + return _mm_unpacklo_epi64(y, y); +#endif + } +#if INSTRSET >= 4 && INSTRSET < 10 // SSSE3, but no compact mask + else if constexpr (fit_zeroing) { + // Do both permutation and zeroing with PSHUFB instruction + const EList bm = pshufb_mask(indexs); + return _mm_shuffle_epi8(a, Vec8s().load(bm.a)); + } +#endif + else if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm_unpackhi_epi16(a, a); + } + else if constexpr ((flags & perm_punpckl) != 0) { // fits punpcklo + y = _mm_unpacklo_epi16(a, a); + } +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & perm_rotate) != 0) { // fits palignr + y = _mm_alignr_epi8(a, a, (flags >> perm_rot_count) & 0xF); + } +#endif + else if constexpr (!H2L && !L2H) { // no crossing of 64-bit boundary + if constexpr (L2L && pL2L != noperm) { + y = _mm_shufflelo_epi16(y, pL2L); // permute low 64-bits + } + if constexpr (H2H && pH2H != noperm) { + y = _mm_shufflehi_epi16(y, pH2H); // permute high 64-bits + } + } +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + else if constexpr ((flags & perm_compress) != 0) { + y = _mm_maskz_compress_epi16(__mmask8(compress_mask(indexs)), y); // compress 
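// VPCOMPRESSW packs the word elements selected by the mask contiguously toward element 0 and
// zeroes the rest, so patterns whose kept elements stay in source order need no shuffle mask.
// Sketch of the intended effect (indexes chosen only as an example):
//   permute8<2, 5, 7, -1, -1, -1, -1, -1>(a) == Vec8s(a[2], a[5], a[7], 0, 0, 0, 0, 0)
//   // e.g. a compress with element mask 0b10100100 produces exactly this result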
+ if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm_maskz_expand_epi16(__mmask8(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif // AVX512VBMI2 +#if INSTRSET >= 4 // SSSE3 + else { // needs general permute + const EList bm = pshufb_mask(indexs); + y = _mm_shuffle_epi8(a, Vec8s().load(bm.a)); + return y; // _mm_shuffle_epi8 also does zeroing + } + } + if constexpr (fit_zeroing) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_epi16(zero_mask<8>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec8s().load(bm.a), y); +#endif + } + return y; +#else // INSTRSET < 4 + else { + // Difficult case. Use permutations of low and high half separately + constexpr uint8_t pH2L = uint8_t(flags16 >> 48); // high to low permute pattern + constexpr uint8_t pL2H = uint8_t(flags16 >> 56); // low to high permute pattern + __m128i yswap = _mm_shuffle_epi32(y, 0x4E); // swap low and high 64-bits + if constexpr (H2L && pH2L != noperm) { + yswap = _mm_shufflelo_epi16(yswap, pH2L); // permute low 64-bits + } + if constexpr (L2H && pL2H != noperm) { + yswap = _mm_shufflehi_epi16(yswap, pL2H); // permute high 64-bits + } + if constexpr (L2L && pL2L != noperm) { + y = _mm_shufflelo_epi16(y, pL2L); // permute low 64-bits + } + if constexpr (H2H && pH2H != noperm) { + y = _mm_shufflehi_epi16(y, pH2H); // permute high 64-bits + } + if constexpr (H2H || L2L) { // merge data from y and yswap + auto selb = make_bit_mask<8,0x102>(indexs);// blend by bit 2. invert upper half + const EList bm = make_broad_mask(selb);// convert to broad mask + y = selectb(Vec8s().load(bm.a), yswap, y); + } + else { + y = yswap; + } + } + } + if constexpr (fit_zeroing) { + // additional zeroing needed + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec8s().load(bm.a), y); + } + return y; +#endif +} + +template +static inline Vec8us permute8(Vec8us const a) { + return Vec8us(permute8 (Vec8s(a))); +} + +// permute Vec16c +template + static inline Vec16c permute16(Vec16c const a) { + + // indexes as array + constexpr int indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + constexpr bool fit_zeroing = (flags & perm_zeroing) != 0;// needs additional zeroing + + __m128i y = a; // result + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm_setzero_si128(); + + if constexpr ((flags & perm_perm) != 0) { + // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { + // use larger permutation + constexpr EList L = largeblock_perm<16>(indexs); // permutation pattern + y = permute8 (Vec8s(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_shleft) != 0) { // fits pslldq + y = _mm_bslli_si128(a, (16-(flags >> perm_rot_count)) & 0xF); + if ((flags & perm_addz) == 0) return y; + } + else if constexpr ((flags & perm_shright) != 0) { // fits psrldq + y = _mm_bsrli_si128(a, (flags >> perm_rot_count) & 0xF); + if ((flags & perm_addz) == 0) return y; + } +#if INSTRSET >= 4 && INSTRSET < 10 // SSSE3, but no compact mask + else if constexpr (fit_zeroing) { + // Do both permutation and zeroing with PSHUFB instruction + const 
EList bm = pshufb_mask(indexs); + return _mm_shuffle_epi8(a, Vec16c().load(bm.a)); + } +#endif + else if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm_unpackhi_epi8(a, a); + } + else if constexpr ((flags & perm_punpckl) != 0) { // fits punpcklo + y = _mm_unpacklo_epi8(a, a); + } +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + else if constexpr ((flags & perm_compress) != 0) { + y = _mm_maskz_compress_epi8(__mmask16(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm_maskz_expand_epi8(__mmask16(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif // AVX512VBMI2 +#if INSTRSET >= 8 // AVX2 + else if constexpr ((flags & perm_broadcast) != 0 && (flags & fit_zeroing) == 0 && (flags >> perm_rot_count & 0xF) == 0) { + return _mm_broadcastb_epi8(y); + } +#endif +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & perm_rotate) != 0) { // fits palignr + y = _mm_alignr_epi8(a, a, (flags >> perm_rot_count) & 0xF); + } + else { // needs general permute + const EList bm = pshufb_mask(indexs); + y = _mm_shuffle_epi8(a, Vec16c().load(bm.a)); + return y; // _mm_shuffle_epi8 also does zeroing + } + } +#else + else { + // Difficult case. Use permutations of low and high half separately + Vec16c swapped, te2e, te2o, to2e, to2o, combeven, combodd; + + // get permutation indexes for four 16-bit permutes: + // k = 0: e2e: even bytes of source to even bytes of destination + // k = 1: o2e: odd bytes of source to even bytes of destination + // k = 2: e2o: even bytes of source to odd bytes of destination + // k = 3: o2o: odd bytes of source to odd bytes of destination + auto eoperm = [](uint8_t const k, int const (&indexs)[16]) constexpr { + uint8_t ix = 0; // index element + uint64_t r = 0; // return value + uint8_t i = (k >> 1) & 1; // look at odd indexes if destination is odd + for (; i < 16; i += 2) { + ix = (indexs[i] >= 0 && ((indexs[i] ^ k) & 1) == 0) ? 
(uint8_t)indexs[i]/2u : 0xFFu; + r |= uint64_t(ix) << (i / 2u * 8u); + } + return r; + }; + constexpr uint64_t ixe2e = eoperm(0, indexs); + constexpr uint64_t ixo2e = eoperm(1, indexs); + constexpr uint64_t ixe2o = eoperm(2, indexs); + constexpr uint64_t ixo2o = eoperm(3, indexs); + + constexpr bool e2e = ixe2e != -1ll; // even bytes of source to odd bytes of destination + constexpr bool e2o = ixe2o != -1ll; // even bytes of source to odd bytes of destination + constexpr bool o2e = ixo2e != -1ll; // odd bytes of source to even bytes of destination + constexpr bool o2o = ixo2o != -1ll; // odd bytes of source to odd bytes of destination + + if constexpr (e2o || o2e) swapped = rotate_left(Vec8s(a), 8); // swap odd and even bytes + + if constexpr (e2e) te2e = permute8 < int8_t(ixe2e), int8_t(ixe2e>>8), int8_t(ixe2e>>16), int8_t(ixe2e>>24), + int8_t(ixe2e>>32), int8_t(ixe2e>>40), int8_t(ixe2e>>48), int8_t(ixe2e>>56)> (Vec8s(a)); + + if constexpr (e2o) te2o = permute8 < int8_t(ixe2o), int8_t(ixe2o>>8), int8_t(ixe2o>>16), int8_t(ixe2o>>24), + int8_t(ixe2o>>32), int8_t(ixe2o>>40), int8_t(ixe2o>>48), int8_t(ixe2o>>56)> (Vec8s(swapped)); + + if constexpr (o2e) to2e = permute8 < int8_t(ixo2e), int8_t(ixo2e>>8), int8_t(ixo2e>>16), int8_t(ixo2e>>24), + int8_t(ixo2e>>32), int8_t(ixo2e>>40), int8_t(ixo2e>>48), int8_t(ixo2e>>56)> (Vec8s(swapped)); + + if constexpr (o2o) to2o = permute8 < int8_t(ixo2o), int8_t(ixo2o>>8), int8_t(ixo2o>>16), int8_t(ixo2o>>24), + int8_t(ixo2o>>32), int8_t(ixo2o>>40), int8_t(ixo2o>>48), int8_t(ixo2o>>56)> (Vec8s(a)); + + if constexpr (e2e && o2e) combeven = te2e | to2e; + else if constexpr (e2e) combeven = te2e; + else if constexpr (o2e) combeven = to2e; + else combeven = _mm_setzero_si128(); + + if constexpr (e2o && o2o) combodd = te2o | to2o; + else if constexpr (e2o) combodd = te2o; + else if constexpr (o2o) combodd = to2o; + else combodd = _mm_setzero_si128(); + + __m128i maske = constant4ui < // mask used even bytes + (i0 < 0 ? 0 : 0xFF) | (i2 < 0 ? 0 : 0xFF0000u), + (i4 < 0 ? 0 : 0xFF) | (i6 < 0 ? 0 : 0xFF0000u), + (i8 < 0 ? 0 : 0xFF) | (i10 < 0 ? 0 : 0xFF0000u), + (i12 < 0 ? 0 : 0xFF) | (i14 < 0 ? 0 : 0xFF0000u) >(); + __m128i masko = constant4ui < // mask used odd bytes + (i1 < 0 ? 0 : 0xFF00) | (i3 < 0 ? 0 : 0xFF000000u), + (i5 < 0 ? 0 : 0xFF00) | (i7 < 0 ? 0 : 0xFF000000u), + (i9 < 0 ? 0 : 0xFF00) | (i11 < 0 ? 0 : 0xFF000000u), + (i13 < 0 ? 0 : 0xFF00) | (i15 < 0 ? 0 : 0xFF000000u) >(); + + return _mm_or_si128( // combine even and odd bytes + _mm_and_si128(combeven, maske), + _mm_and_si128(combodd, masko)); + } + } +#endif + if constexpr (fit_zeroing) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_epi8(zero_mask<16>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec16c().load(bm.a), y); +#endif + } + return y; +} + +template + static inline Vec16uc permute16(Vec16uc const a) { + return Vec16uc(permute16 (Vec16c(a))); +} + + +/***************************************************************************** +* +* Vector blend functions +* +****************************************************************************** +* +* These blend functions can mix elements from two different vectors of N +* elements eadh and optionally set some elements to zero. +* +* The N indexes are inserted as template parameters in <>. +* These indexes must be compile-time constants. Each template parameter +* selects an element from one of the input vectors a and b. +* An index in the range 0 .. 
N-1 selects the corresponding element from a. +* An index in the range N .. 2*N-1 selects an element from b. +* An index with the value -1 gives zero in the corresponding element of +* the result. +* An index with the value V_DC means don't care. The code will select the +* optimal sequence of instructions that fits the remaining indexes. +* +* Example: +* Vec4i a(100,101,102,103); // a is (100, 101, 102, 103) +* Vec4i b(200,201,202,203); // b is (200, 201, 202, 203) +* Vec4i c; +* c = blend4<1,4,-1,7> (a,b); // c is (101, 200, 0, 203) +* +* A lot of the code here is metaprogramming aiming to find the instructions +* that best fit the template parameters and instruction set. The metacode +* will be reduced out to leave only a few vector instructions in release +* mode with optimization on. +*****************************************************************************/ + +// permute and blend Vec2q +template +static inline Vec2q blend2(Vec2q const a, Vec2q const b) { + int constexpr indexs[2] = { i0, i1 }; // indexes as array + __m128i y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm_setzero_si128(); + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute2 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + return permute2 (b); + } + + if constexpr ((flags & (blend_perma | blend_permb)) == 0) {// no permutation, only blending +#if INSTRSET >= 10 // AVX512VL + y = _mm_mask_mov_epi64 (a, (uint8_t)make_bit_mask<2, 0x301>(indexs), b); +#elif INSTRSET >= 5 // SSE4.1 + y = _mm_blend_epi16 (a, b, ((i0 & 2) ? 0x0F : 0) | ((i1 & 2) ? 0xF0 : 0)); +#else // SSE2 + const EList bm = make_broad_mask(make_bit_mask<2, 0x301>(indexs)); + y = selectb(Vec2q().load(bm.a), b, a); +#endif + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm_unpacklo_epi64 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm_unpacklo_epi64 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm_unpackhi_epi64 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm_unpackhi_epi64 (b, a); + } +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm_alignr_epi8(a, b, flags >> blend_rotpattern); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm_alignr_epi8(b, a, flags >> blend_rotpattern); + } +#endif +#if ALLOW_FP_PERMUTE // allow floating point permute instructions on integer vectors + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), (flags >> blend_shufpattern) & 3)); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), (flags >> blend_shufpattern) & 3)); + } +#endif + else { // No special cases. permute a and b separately, then blend. 
+ // This will not occur if ALLOW_FP_PERMUTE is true +#if INSTRSET >= 5 // SSE4.1 + constexpr bool dozero = false; +#else // SSE2 + constexpr bool dozero = true; +#endif + constexpr EList L = blend_perm_indexes<2, (int)dozero>(indexs); // get permutation indexes + __m128i ya = permute2(a); + __m128i yb = permute2(b); +#if INSTRSET >= 10 // AVX512VL + y = _mm_mask_mov_epi64 (ya, (uint8_t)make_bit_mask<2, 0x301>(indexs), yb); +#elif INSTRSET >= 5 // SSE4.1 + y = _mm_blend_epi16 (ya, yb, ((i0 & 2) ? 0x0F : 0) | ((i1 & 2) ? 0xF0 : 0)); +#else // SSE2 + return _mm_or_si128(ya, yb); +#endif + } + + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_epi64(zero_mask<2>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec2q().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec2uq blend2(Vec2uq const a, Vec2uq const b) { + return Vec2uq(blend2 (Vec2q(a), Vec2q(b))); +} + + +// permute and blend Vec4i +template +static inline Vec4i blend4(Vec4i const a, Vec4i const b) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + __m128i y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + constexpr bool blendonly = (flags & (blend_perma | blend_permb)) == 0; // no permutation, only blending + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm_setzero_si128(); + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute4 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + return permute4 < i0<0?i0:i0&3, i1<0?i1:i1&3, i2<0?i2:i2&3, i3<0?i3:i3&3> (b); + } + if constexpr ((flags & blend_largeblock) != 0) { // fits blending with larger block size + constexpr EList L = largeblock_indexes<4>(indexs); + y = blend2 (Vec2q(a), Vec2q(b)); + if constexpr ((flags & blend_addz) == 0) { + return y; // any zeroing has been done by larger blend + } + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm_unpacklo_epi32 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm_unpacklo_epi32 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm_unpackhi_epi32 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm_unpackhi_epi32 (b, a); + } +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm_alignr_epi8(a, b, flags >> blend_rotpattern); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm_alignr_epi8(b, a, flags >> blend_rotpattern); + } +#endif +#if ALLOW_FP_PERMUTE // allow floating point permute instructions on integer vectors + else if constexpr ((flags & blend_shufab) != 0 && !blendonly) { // use floating point instruction shufps + y = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), uint8_t(flags >> blend_shufpattern))); + } + else if constexpr ((flags & blend_shufba) != 0 && !blendonly) { // use floating point instruction shufps + y = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), uint8_t(flags >> blend_shufpattern))); + } +#endif + else { // No special cases. permute a and b separately, then blend. 
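// Fallback strategy: without a single two-source blend instruction, each input is first permuted
// into its target positions. On SSE2 (dozero == true) the permutes also zero the unused lanes,
// so the two halves combine with a plain OR; on SSE4.1 and later the lanes are merged with a
// blend instead, and any remaining zeroing is applied afterwards.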
+#if INSTRSET >= 5 // SSE4.1 + constexpr bool dozero = false; +#else // SSE2 + constexpr bool dozero = true; +#endif + Vec4i ya = a, yb = b; // a and b permuted + constexpr EList L = blend_perm_indexes<4, (int)dozero>(indexs); // get permutation indexes + if constexpr ((flags & blend_perma) != 0 || dozero) { + ya = permute4 (a); + } + if constexpr ((flags & blend_permb) != 0 || dozero) { + yb = permute4 (b); + } +#if INSTRSET >= 10 // AVX512VL + y = _mm_mask_mov_epi32 (ya, (uint8_t)make_bit_mask<4, 0x302>(indexs), yb); +#elif INSTRSET >= 5 // SSE4.1 + constexpr uint8_t mm = ((i0 & 4) ? 0x03 : 0) | ((i1 & 4) ? 0x0C : 0) | ((i2 & 4) ? 0x30 : 0) | ((i3 & 4) ? 0xC0 : 0); + y = _mm_blend_epi16 (ya, yb, mm); +#else // SSE2. dozero = true + return _mm_or_si128(ya, yb); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_epi32(zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec4i().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec4ui blend4(Vec4ui const a, Vec4ui const b) { + return Vec4ui(blend4(Vec4i(a), Vec4i(b))); +} + + +// permute and blend Vec8s +template +static inline Vec8s blend8(Vec8s const a, Vec8s const b) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m128i y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm_setzero_si128(); + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute8 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + return permute8 < i0<0?i0:i0&7, i1<0?i1:i1&7, i2<0?i2:i2&7, i3<0?i3:i3&7, + i4<0?i4:i4&7, i5<0?i5:i5&7, i6<0?i6:i6&7, i7<0?i7:i7&7 > (b); + } + if constexpr ((flags & blend_largeblock) != 0) { // fits blending with larger block size + constexpr EList L = largeblock_indexes<8>(indexs); + y = blend4 (Vec4i(a), Vec4i(b)); + if constexpr ((flags & blend_addz) == 0) { + return y; // any zeroing has been done by larger blend + } + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm_unpacklo_epi16 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm_unpacklo_epi16 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm_unpackhi_epi16 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm_unpackhi_epi16 (b, a); + } +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm_alignr_epi8(a, b, flags >> blend_rotpattern); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm_alignr_epi8(b, a, flags >> blend_rotpattern); + } +#endif + else { // No special cases. +#if INSTRSET >= 10 // AVX512BW + const EList bm = perm_mask_broad(indexs); + return _mm_maskz_permutex2var_epi16(zero_mask<8>(indexs), a, Vec8s().load(bm.a), b); +#endif + // full blend instruction not available, + // permute a and b separately, then blend. 
+#if INSTRSET >= 5 // SSE4.1 + constexpr bool dozero = (flags & blend_zeroing) != 0; +#else // SSE2 + constexpr bool dozero = true; +#endif + Vec8s ya = a, yb = b; // a and b permuted + constexpr EList L = blend_perm_indexes<8, (int)dozero>(indexs); // get permutation indexes + if constexpr ((flags & blend_perma) != 0 || dozero) { + ya = permute8 (a); + } + if constexpr ((flags & blend_permb) != 0 || dozero) { + yb = permute8 (b); + } + if constexpr (dozero) { // unused elements are zero + return _mm_or_si128(ya, yb); + } + else { // blend ya and yb + +#if INSTRSET >= 5 // SSE4.1 + constexpr uint8_t mm = ((i0 & 8) ? 0x01 : 0) | ((i1 & 8) ? 0x02 : 0) | ((i2 & 8) ? 0x04 : 0) | ((i3 & 8) ? 0x08 : 0) | + ((i4 & 8) ? 0x10 : 0) | ((i5 & 8) ? 0x20 : 0) | ((i6 & 8) ? 0x40 : 0) | ((i7 & 8) ? 0x80 : 0); + y = _mm_blend_epi16 (ya, yb, mm); +#endif + } + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed after special cases +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_epi16(zero_mask<8>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec8s().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec8us blend8(Vec8us const a, Vec8us const b) { + return Vec8us(blend8(Vec8s(a), Vec8s(b))); +} + + +// permute and blend Vec16c +template + static inline Vec16c blend16(Vec16c const a, Vec16c const b) { + int constexpr indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; // indexes as array + __m128i y = a; // result + constexpr uint64_t flags = blend_flags(indexs);// get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm_setzero_si128(); + + else if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute16 (a); + } + else if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<16, 2>(indexs); // get permutation indexes + return permute16 < L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31] > (b); + } +#if INSTRSET >= 4 // SSSE3 + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm_alignr_epi8(a, b, flags >> blend_rotpattern); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm_alignr_epi8(b, a, flags >> blend_rotpattern); + } +#endif + else if constexpr ((flags & blend_largeblock) != 0) { // fits blending with larger block size + constexpr EList L = largeblock_indexes<16>(indexs); + y = blend8 (Vec8s(a), Vec8s(b)); + if constexpr ((flags & blend_addz) == 0) { + return y; // any zeroing has been done by larger blend + } + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm_unpacklo_epi8 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm_unpacklo_epi8 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm_unpackhi_epi8 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm_unpackhi_epi8 (b, a); + } + else { // No special cases. 
Full permute needed +#if INSTRSET >= 10 && defined ( __AVX512VBMI__ ) // AVX512VBMI + const EList bm = perm_mask_broad(indexs); + return _mm_maskz_permutex2var_epi8(zero_mask<16>(indexs), a, Vec16c().load(bm.a), b); +#endif // __AVX512VBMI__ + + // full blend instruction not available, + // permute a and b separately, then blend. +#if INSTRSET >= 10 // AVX512VL +//#elif INSTRSET >= 5 // SSE4.1 // This is optimal only if both permute16<> calls below have simple special cases + constexpr bool dozero = (flags & blend_zeroing) != 0; +#else // SSE2 + constexpr bool dozero = true; +#endif + Vec16c ya = a, yb = b; // a and b permuted + constexpr EList L = blend_perm_indexes<16, (int)dozero>(indexs); // get permutation indexes + if constexpr ((flags & blend_perma) != 0 || dozero) { + ya = permute16 (a); + } + if constexpr ((flags & blend_permb) != 0 || dozero) { + yb = permute16 (b); + } + if constexpr (dozero) { // unused fields in ya and yb are zero + return _mm_or_si128(ya, yb); + } + else { +#if INSTRSET >= 10 // AVX512VL + y = _mm_mask_mov_epi8 (ya, (__mmask16)make_bit_mask<16, 0x304>(indexs), yb); +#endif + } + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm_maskz_mov_epi8(zero_mask<16>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm_and_si128(Vec16c().load(bm.a), y); +#endif + } + return y; +} + +template + static inline Vec16uc blend16(Vec16uc const a, Vec16uc const b) { + return Vec16uc(blend16(Vec16c(a), Vec16c(b))); +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. +* The table is given as one or more vectors or as an array. +* +* This can be used for several purposes: +* - table lookup +* - permute or blend with variable indexes +* - blend from more than two sources +* - gather non-contiguous data +* +* An index out of range may produce any value - the actual value produced is +* implementation dependent and may be different for different instruction +* sets. An index out of range does not produce an error message or exception. +* +* Example: +* Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3) +* Vec4i b(100,101,102,103); // table b is (100, 101, 102, 103) +* Vec4i c; +* c = lookup4 (a,b); // c is (102, 100, 100, 103) +* +*****************************************************************************/ + +static inline Vec16c lookup16(Vec16c const index, Vec16c const table) { +#if INSTRSET >= 5 // SSSE3 + return _mm_shuffle_epi8(table, index); +#else + uint8_t ii[16]; + int8_t tt[16], rr[16]; + table.store(tt); index.store(ii); + for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x0F]; + return Vec16c().load(rr); +#endif +} + +static inline Vec16c lookup32(Vec16c const index, Vec16c const table0, Vec16c const table1) { +#ifdef __XOP__ // AMD XOP instruction set. 
Use VPPERM + return (Vec16c)_mm_perm_epi8(table0, table1, index); +#elif INSTRSET >= 5 // SSSE3 + Vec16c r0 = _mm_shuffle_epi8(table0, index + 0x70); // make negative index for values >= 16 + Vec16c r1 = _mm_shuffle_epi8(table1, (index ^ 0x10) + 0x70); // make negative index for values < 16 + return r0 | r1; +#else + uint8_t ii[16]; + int8_t tt[32], rr[16]; + table0.store(tt); table1.store(tt + 16); index.store(ii); + for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x1F]; + return Vec16c().load(rr); +#endif +} + +template +static inline Vec16c lookup(Vec16c const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 16) return lookup16(index, Vec16c().load(table)); + if constexpr (n <= 32) return lookup32(index, Vec16c().load(table), Vec16c().load((int8_t*)table + 16)); + // n > 32. Limit index + Vec16uc index1; + if constexpr ((n & (n - 1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec16uc(index) & uint8_t(n - 1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec16uc(index), uint8_t(n - 1)); + } + uint8_t ii[16]; index1.store(ii); + int8_t rr[16]; + for (int j = 0; j < 16; j++) { + rr[j] = ((int8_t*)table)[ii[j]]; + } + return Vec16c().load(rr); +} + +static inline Vec8s lookup8(Vec8s const index, Vec8s const table) { +#if INSTRSET >= 5 // SSSE3 + return _mm_shuffle_epi8(table, index * 0x202 + 0x100); +#else + int16_t ii[8], tt[8], rr[8]; + table.store(tt); index.store(ii); + for (int j = 0; j < 8; j++) rr[j] = tt[ii[j] & 0x07]; + return Vec8s().load(rr); +#endif +} + +static inline Vec8s lookup16(Vec8s const index, Vec8s const table0, Vec8s const table1) { +#ifdef __XOP__ // AMD XOP instruction set. Use VPPERM + return (Vec8s)_mm_perm_epi8(table0, table1, index * 0x202 + 0x100); +#elif INSTRSET >= 5 // SSSE3 + Vec8s r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x202) + Vec16c(Vec8s(0x7170))); + Vec8s r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x202 ^ 0x1010) + Vec16c(Vec8s(0x7170))); + return r0 | r1; +#else + int16_t ii[16], tt[32], rr[16]; + table0.store(tt); table1.store(tt + 8); index.store(ii); + for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x1F]; + return Vec8s().load(rr); +#endif +} + +template +static inline Vec8s lookup(Vec8s const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) return lookup8(index, Vec8s().load(table)); + if constexpr (n <= 16) return lookup16(index, Vec8s().load(table), Vec8s().load((int16_t*)table + 8)); + // n > 16. Limit index + Vec8us index1; + if constexpr ((n & (n - 1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec8us(index) & (n - 1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec8us(index), n - 1); + } +#if INSTRSET >= 8 // AVX2. 
Use VPERMD + Vec8s t1 = _mm_i32gather_epi32((const int *)table, __m128i((Vec4i(index1)) & (Vec4i(0x0000FFFF))), 2); // even positions + Vec8s t2 = _mm_i32gather_epi32((const int *)table, _mm_srli_epi32(index1, 16), 2); // odd positions + return blend8<0, 8, 2, 10, 4, 12, 6, 14>(t1, t2); +#else + uint16_t ii[8]; index1.store(ii); + return Vec8s(((int16_t*)table)[ii[0]], ((int16_t*)table)[ii[1]], ((int16_t*)table)[ii[2]], ((int16_t*)table)[ii[3]], + ((int16_t*)table)[ii[4]], ((int16_t*)table)[ii[5]], ((int16_t*)table)[ii[6]], ((int16_t*)table)[ii[7]]); +#endif +} + + +static inline Vec4i lookup4(Vec4i const index, Vec4i const table) { +#if INSTRSET >= 5 // SSSE3 + return _mm_shuffle_epi8(table, index * 0x04040404 + 0x03020100); +#else + return Vec4i(table[index[0]], table[index[1]], table[index[2]], table[index[3]]); +#endif +} + +static inline Vec4i lookup8(Vec4i const index, Vec4i const table0, Vec4i const table1) { + // return Vec4i(lookup16(Vec8s(index * 0x20002 + 0x10000), Vec8s(table0), Vec8s(table1))); +#ifdef __XOP__ // AMD XOP instruction set. Use VPPERM + return (Vec4i)_mm_perm_epi8(table0, table1, index * 0x04040404 + 0x03020100); +#elif INSTRSET >= 8 // AVX2. Use VPERMD + __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector + return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index))); + +#elif INSTRSET >= 4 // SSSE3 + Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x04040404) + Vec16c(Vec4i(0x73727170))); + Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x04040404 ^ 0x10101010) + Vec16c(Vec4i(0x73727170))); + return r0 | r1; +#else // SSE2 + int32_t ii[4], tt[8], rr[4]; + table0.store(tt); table1.store(tt + 4); index.store(ii); + for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x07]; + return Vec4i().load(rr); +#endif +} + +static inline Vec4i lookup16(Vec4i const index, Vec4i const table0, Vec4i const table1, Vec4i const table2, Vec4i const table3) { +#if INSTRSET >= 8 // AVX2. Use VPERMD + __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector + __m256i table23 = _mm256_inserti128_si256(_mm256_castsi128_si256(table2), table3, 1); // join tables into 256 bit vector + __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index))); + __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table23, _mm256_castsi128_si256(index ^ 8))); + return select(index >= 8, Vec4i(r1), Vec4i(r0)); + //return _mm_blendv_epi8(r0, r1, index >= 8); + +#elif defined (__XOP__) // AMD XOP instruction set. 
Use VPPERM + Vec4i r0 = _mm_perm_epi8(table0, table1, ((index) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu); + Vec4i r1 = _mm_perm_epi8(table2, table3, ((index ^ 8) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu); + return r0 | r1; + +#elif INSTRSET >= 5 // SSSE3 + Vec16c aa = Vec16c(Vec4i(0x73727170)); + Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c((index) * 0x04040404) + aa); + Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c((index ^ 4) * 0x04040404) + aa); + Vec4i r2 = _mm_shuffle_epi8(table2, Vec16c((index ^ 8) * 0x04040404) + aa); + Vec4i r3 = _mm_shuffle_epi8(table3, Vec16c((index ^ 12) * 0x04040404) + aa); + return (r0 | r1) | (r2 | r3); + +#else // SSE2 + int32_t ii[4], tt[16], rr[4]; + table0.store(tt); table1.store(tt + 4); table2.store(tt + 8); table3.store(tt + 12); + index.store(ii); + for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x0F]; + return Vec4i().load(rr); +#endif +} + +template +static inline Vec4i lookup(Vec4i const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 4) return lookup4(index, Vec4i().load(table)); + if constexpr (n <= 8) return lookup8(index, Vec4i().load(table), Vec4i().load((int32_t*)table + 4)); + // n > 8. Limit index + Vec4ui index1; + if constexpr ((n & (n - 1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec4ui(index) & (n - 1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec4ui(index), n - 1); + } +#if INSTRSET >= 8 // AVX2. Use VPERMD + return _mm_i32gather_epi32((const int *)table, index1, 4); +#else + uint32_t ii[4]; index1.store(ii); + return Vec4i(((int32_t*)table)[ii[0]], ((int32_t*)table)[ii[1]], ((int32_t*)table)[ii[2]], ((int32_t*)table)[ii[3]]); +#endif +} + + +static inline Vec2q lookup2(Vec2q const index, Vec2q const table) { +#if INSTRSET >= 5 // SSSE3 + return _mm_shuffle_epi8(table, index * 0x0808080808080808ll + 0x0706050403020100ll); +#else + int64_t ii[2], tt[2]; + table.store(tt); index.store(ii); + return Vec2q(tt[int(ii[0])], tt[int(ii[1])]); +#endif +} + +template +static inline Vec2q lookup(Vec2q const index, void const * table) { + if constexpr (n <= 0) return 0; + // n > 0. Limit index + Vec2uq index1; + if constexpr ((n & (n - 1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec2uq(index) & (n - 1); + } + else { + // n is not a power of 2, limit to n-1. + // There is no 64-bit min instruction, but we can use the 32-bit unsigned min, + // since n is a 32-bit integer + index1 = Vec2uq(min(Vec2uq(index), constant4ui())); + } + uint32_t ii[4]; index1.store(ii); // use only lower 32 bits of each index + int64_t const * tt = (int64_t const *)table; + return Vec2q(tt[ii[0]], tt[ii[2]]); +} + + +/***************************************************************************** +* +* Byte shifts +* +*****************************************************************************/ + +// Function shift_bytes_up: shift whole vector left by b bytes. 
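// "Up" means toward higher element numbers: element 0 moves to position b and the first b
// positions become zero. Illustrative use (values chosen only as an example):
//   Vec16c v(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
//   Vec16c w = shift_bytes_up<3>(v);   // w = (0, 0, 0, 1, 2, ..., 13)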
+template <unsigned int b>
+static inline Vec16c shift_bytes_up(Vec16c const a) {
+#if INSTRSET >= 4 // SSSE3
+    if (b < 16) {
+        return _mm_alignr_epi8(a, _mm_setzero_si128(), 16 - b);
+    }
+    else {
+        return _mm_setzero_si128(); // zero
+    }
+#else
+    int8_t dat[32];
+    if (b < 16) {
+        Vec16c(0).store(dat);
+        a.store(dat + b);
+        return Vec16c().load(dat);
+    }
+    else return 0;
+#endif
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes
+template <unsigned int b>
+static inline Vec16c shift_bytes_down(Vec16c const a) {
+#if INSTRSET >= 4 // SSSE3
+    if (b < 16) {
+        return _mm_alignr_epi8(_mm_setzero_si128(), a, b);
+    }
+    else {
+        return _mm_setzero_si128();
+    }
+#else
+    int8_t dat[32];
+    if (b < 16) {
+        a.store(dat);
+        Vec16c(0).store(dat + 16);
+        return Vec16c().load(dat + b);
+    }
+    else return 0;
+#endif
+}
+
+
+/*****************************************************************************
+*
+* Gather functions with fixed indexes
+*
+*****************************************************************************/
+// find lowest and highest index
+template <int N>
+constexpr int min_index(const int (&a)[N]) {
+    int ix = a[0];
+    for (int i = 1; i < N; i++) {
+        if (a[i] < ix) ix = a[i];
+    }
+    return ix;
+}
+
+template <int N>
+constexpr int max_index(const int (&a)[N]) {
+    int ix = a[0];
+    for (int i = 1; i < N; i++) {
+        if (a[i] > ix) ix = a[i];
+    }
+    return ix;
+}
+
+// Load elements from array a with indices i0, i1, i2, i3
+template <int i0, int i1, int i2, int i3>
+static inline Vec4i gather4i(void const * a) {
+    int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array
+    constexpr int imin = min_index(indexs);
+    constexpr int imax = max_index(indexs);
+    static_assert(imin >= 0, "Negative index in gather function");
+
+    if constexpr (imax - imin <= 3) {
+        // load one contiguous block and permute
+        if constexpr (imax > 3) {
+            // make sure we don't read past the end of the array
+            Vec4i b = Vec4i().load((int32_t const *)a + imax - 3);
+            return permute4<i0 - imax + 3, i1 - imax + 3, i2 - imax + 3, i3 - imax + 3>(b);
+        }
+        else {
+            Vec4i b = Vec4i().load((int32_t const *)a + imin);
+            return permute4<i0 - imin, i1 - imin, i2 - imin, i3 - imin>(b);
+        }
+    }
+    if constexpr ((i0 < imin + 4 || i0 > imax - 4) && (i1 < imin + 4 || i1 > imax - 4) && (i2 < imin + 4 || i2 > imax - 4) && (i3 < imin + 4 || i3 > imax - 4)) {
+        // load two contiguous blocks and blend
+        Vec4i b = Vec4i().load((int32_t const *)a + imin);
+        Vec4i c = Vec4i().load((int32_t const *)a + imax - 3);
+        constexpr int j0 = i0 < imin + 4 ? i0 - imin : 7 - imax + i0;
+        constexpr int j1 = i1 < imin + 4 ? i1 - imin : 7 - imax + i1;
+        constexpr int j2 = i2 < imin + 4 ? i2 - imin : 7 - imax + i2;
+        constexpr int j3 = i3 < imin + 4 ? i3 - imin : 7 - imax + i3;
+        return blend4<j0, j1, j2, j3>(b, c);
+    }
+    // use AVX2 gather if available
+#if INSTRSET >= 8
+    return _mm_i32gather_epi32((const int *)a, Vec4i(i0, i1, i2, i3), 4);
+#else
+    return lookup<imax + 1>(Vec4i(i0, i1, i2, i3), a);
+#endif
+}
+
+// Load elements from array a with indices i0, i1
+template <int i0, int i1>
+static inline Vec2q gather2q(void const * a) {
+    constexpr int imin = i0 < i1 ? i0 : i1;
+    constexpr int imax = i0 > i1 ? i0 : i1;
+    static_assert(imin >= 0, "Negative index in gather function");
+
+    if constexpr (imax - imin <= 1) {
+        // load one contiguous block and permute
+        if constexpr (imax > 1) {
+            // make sure we don't read past the end of the array
+            Vec2q b = Vec2q().load((int64_t const *)a + imax - 1);
+            return permute2<i0 - imax + 1, i1 - imax + 1>(b);
+        }
+        else {
+            Vec2q b = Vec2q().load((int64_t const *)a + imin);
+            return permute2<i0 - imin, i1 - imin>(b);
+        }
+    }
+    return Vec2q(((int64_t*)a)[i0], ((int64_t*)a)[i1]);
+}
+
+
+/*****************************************************************************
+*
+* Vector scatter functions with fixed indexes
+*
+******************************************************************************
+*
+* These functions write the elements of a vector to arbitrary positions in an
+* array in memory. Each vector element is written to an array position
+* determined by an index. An element is not written if the corresponding
+* index is out of range.
+* The indexes can be specified as constant template parameters or as an
+* integer vector.
+*
+* The scatter functions are useful if the data are distributed in a sparse
+* manner into the array. If the array is dense then it is more efficient
+* to permute the data into the right positions and then write the whole
+* permuted vector into the array.
+*
+* Example:
+* Vec8q a(10,11,12,13,14,15,16,17);
+* int64_t b[16] = {0};
+* scatter<0,2,14,10,1,-1,5,9>(a,b); // b = (10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0)
+*
+*****************************************************************************/
+
+template <int i0, int i1, int i2, int i3>
+static inline void scatter(Vec4i const data, void * destination) {
+#if INSTRSET >= 10 // AVX512VL
+    __m128i indx = constant4ui<i0, i1, i2, i3>();
+    __mmask8 mask = uint8_t((i0 >= 0) | ((i1 >= 0) << 1) | ((i2 >= 0) << 2) | ((i3 >= 0) << 3));
+    _mm_mask_i32scatter_epi32((int*)destination, mask, indx, data, 4);
+
+#elif INSTRSET >= 9 // AVX512F
+    __m512i indx = _mm512_castsi128_si512(constant4ui<i0, i1, i2, i3>());
+    __mmask16 mask = uint16_t((i0 >= 0) | ((i1 >= 0) << 1) | ((i2 >= 0) << 2) | ((i3 >= 0) << 3));
+    _mm512_mask_i32scatter_epi32(destination, mask, indx, _mm512_castsi128_si512(data), 4);
+
+#else
+    int32_t * arr = (int32_t*)destination;
+    const int index[4] = { i0,i1,i2,i3 };
+    for (int i = 0; i < 4; i++) {
+        if (index[i] >= 0) arr[index[i]] = data[i];
+    }
+#endif
+}
+
+template <int i0, int i1>
+static inline void scatter(Vec2q const data, void * destination) {
+    int64_t* arr = (int64_t*)destination;
+    if (i0 >= 0) arr[i0] = data[0];
+    if (i1 >= 0) arr[i1] = data[1];
+}
+
+
+/*****************************************************************************
+*
+* Scatter functions with variable indexes
+*
+*****************************************************************************/
+
+static inline void scatter(Vec4i const index, uint32_t limit, Vec4i const data, void * destination) {
+#if INSTRSET >= 10 // AVX512VL
+    __mmask8 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit));
+    _mm_mask_i32scatter_epi32((int*)destination, mask, index, data, 4);
+#elif INSTRSET >= 9 // AVX512F
+    __mmask16 mask = _mm512_mask_cmplt_epu32_mask(0xF, _mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit)));
+    _mm512_mask_i32scatter_epi32((int*)destination, mask, _mm512_castsi128_si512(index), _mm512_castsi128_si512(data), 4);
+#else
+    int32_t* arr = (int32_t*)destination;
+    for (int i = 0; i < 4; i++) {
+        if (uint32_t(index[i]) < limit) arr[index[i]] = data[i];
+    }
+#endif
+}
+
+static inline void scatter(Vec2q const index, uint32_t limit, Vec2q const data, void * destination) {
+    int64_t* arr = (int64_t*)destination;
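+    // write data[i] to arr[index[i]] only when the index is below limit; out-of-range elements are skipped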
+ if (uint64_t(index[0]) < uint64_t(limit)) arr[index[0]] = data[0]; + if (uint64_t(index[1]) < uint64_t(limit)) arr[index[1]] = data[1]; +} + + +/***************************************************************************** +* +* Functions for conversion between integer sizes +* +*****************************************************************************/ + +// Extend 8-bit integers to 16-bit integers, signed and unsigned + +// Function extend_low : extends the low 8 elements to 16 bits with sign extension +static inline Vec8s extend_low(Vec16c const a) { + __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), a); // 0 > a + return _mm_unpacklo_epi8(a, sign); // interleave with sign extensions +} + +// Function extend_high : extends the high 8 elements to 16 bits with sign extension +static inline Vec8s extend_high(Vec16c const a) { + __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(), a); // 0 > a + return _mm_unpackhi_epi8(a, sign); // interleave with sign extensions +} + +// Function extend_low : extends the low 8 elements to 16 bits with zero extension +static inline Vec8us extend_low(Vec16uc const a) { + return _mm_unpacklo_epi8(a, _mm_setzero_si128()); // interleave with zero extensions +} + +// Function extend_high : extends the high 8 elements to 16 bits with zero extension +static inline Vec8us extend_high(Vec16uc const a) { + return _mm_unpackhi_epi8(a, _mm_setzero_si128()); // interleave with zero extensions +} + +// Extend 16-bit integers to 32-bit integers, signed and unsigned + +// Function extend_low : extends the low 4 elements to 32 bits with sign extension +static inline Vec4i extend_low(Vec8s const a) { + __m128i sign = _mm_srai_epi16(a, 15); // sign bit + return _mm_unpacklo_epi16(a, sign); // interleave with sign extensions +} + +// Function extend_high : extends the high 4 elements to 32 bits with sign extension +static inline Vec4i extend_high(Vec8s const a) { + __m128i sign = _mm_srai_epi16(a, 15); // sign bit + return _mm_unpackhi_epi16(a, sign); // interleave with sign extensions +} + +// Function extend_low : extends the low 4 elements to 32 bits with zero extension +static inline Vec4ui extend_low(Vec8us const a) { + return _mm_unpacklo_epi16(a, _mm_setzero_si128()); // interleave with zero extensions +} + +// Function extend_high : extends the high 4 elements to 32 bits with zero extension +static inline Vec4ui extend_high(Vec8us const a) { + return _mm_unpackhi_epi16(a, _mm_setzero_si128()); // interleave with zero extensions +} + +// Extend 32-bit integers to 64-bit integers, signed and unsigned + +// Function extend_low : extends the low 2 elements to 64 bits with sign extension +static inline Vec2q extend_low(Vec4i const a) { + __m128i sign = _mm_srai_epi32(a, 31); // sign bit + return _mm_unpacklo_epi32(a, sign); // interleave with sign extensions +} + +// Function extend_high : extends the high 2 elements to 64 bits with sign extension +static inline Vec2q extend_high(Vec4i const a) { + __m128i sign = _mm_srai_epi32(a, 31); // sign bit + return _mm_unpackhi_epi32(a, sign); // interleave with sign extensions +} + +// Function extend_low : extends the low 2 elements to 64 bits with zero extension +static inline Vec2uq extend_low(Vec4ui const a) { + return _mm_unpacklo_epi32(a, _mm_setzero_si128()); // interleave with zero extensions +} + +// Function extend_high : extends the high 2 elements to 64 bits with zero extension +static inline Vec2uq extend_high(Vec4ui const a) { + return _mm_unpackhi_epi32(a, _mm_setzero_si128()); // interleave with zero 
extensions +} + +// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Overflow wraps around +static inline Vec16c compress(Vec8s const low, Vec8s const high) { + __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for low bytes + __m128i lowm = _mm_and_si128(low, mask); // bytes of low + __m128i highm = _mm_and_si128(high, mask); // bytes of high + return _mm_packus_epi16(lowm, highm); // unsigned pack +} + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Signed, with saturation +static inline Vec16c compress_saturated(Vec8s const low, Vec8s const high) { + return _mm_packs_epi16(low, high); +} + +// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers +// Unsigned, overflow wraps around +static inline Vec16uc compress(Vec8us const low, Vec8us const high) { + return Vec16uc(compress((Vec8s)low, (Vec8s)high)); +} + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Unsigned, with saturation +static inline Vec16uc compress_saturated(Vec8us const low, Vec8us const high) { +#if INSTRSET >= 5 // SSE4.1 supported + __m128i maxval = _mm_set1_epi32(0x00FF00FF); // maximum value + __m128i low1 = _mm_min_epu16(low, maxval); // upper limit + __m128i high1 = _mm_min_epu16(high, maxval); // upper limit + return _mm_packus_epi16(low1, high1); // this instruction saturates from signed 32 bit to unsigned 16 bit +#else + __m128i zero = _mm_setzero_si128(); + __m128i signlow = _mm_cmpgt_epi16(zero, low); // sign bit of low + __m128i signhi = _mm_cmpgt_epi16(zero, high); // sign bit of high + __m128i slow2 = _mm_srli_epi16(signlow, 8); // FF if low negative + __m128i shigh2 = _mm_srli_epi16(signhi, 8); // FF if high negative + __m128i maskns = _mm_set1_epi32(0x7FFF7FFF); // mask for removing sign bit + __m128i lowns = _mm_and_si128(low, maskns); // low, with sign bit removed + __m128i highns = _mm_and_si128(high, maskns); // high, with sign bit removed + __m128i lowo = _mm_or_si128(lowns, slow2); // low, sign bit replaced by 00FF + __m128i higho = _mm_or_si128(highns, shigh2); // high, sign bit replaced by 00FF + return _mm_packus_epi16(lowo, higho); // this instruction saturates from signed 16 bit to unsigned 8 bit +#endif +} + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Signed to unsigned, with saturation +static inline Vec16uc compress_saturated_s2u(Vec8s const low, Vec8s const high) { + return _mm_packus_epi16(low, high); // this instruction saturates from signed 16 bit to unsigned 8 bit +} + +// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec8s compress(Vec4i const low, Vec4i const high) { +#if INSTRSET >= 5 // SSE4.1 supported + __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for low words + __m128i lowm = _mm_and_si128(low, mask); // bytes of low + __m128i highm = _mm_and_si128(high, mask); // bytes of high + return _mm_packus_epi32(lowm, highm); // unsigned pack +#else + __m128i low1 = _mm_shufflelo_epi16(low, 0xD8); // low words in place + __m128i high1 = _mm_shufflelo_epi16(high, 0xD8); // low words in place + __m128i low2 = _mm_shufflehi_epi16(low1, 0xD8); // low words in place + 
__m128i high2 = _mm_shufflehi_epi16(high1, 0xD8); // low words in place + __m128i low3 = _mm_shuffle_epi32(low2, 0xD8); // low dwords of low to pos. 0 and 32 + __m128i high3 = _mm_shuffle_epi32(high2, 0xD8); // low dwords of high to pos. 0 and 32 + return _mm_unpacklo_epi64(low3, high3); // interleave +#endif +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Signed with saturation +static inline Vec8s compress_saturated(Vec4i const low, Vec4i const high) { + return _mm_packs_epi32(low, high); // pack with signed saturation +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec8us compress(Vec4ui const low, Vec4ui const high) { + return Vec8us(compress((Vec4i)low, (Vec4i)high)); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Unsigned, with saturation +static inline Vec8us compress_saturated(Vec4ui const low, Vec4ui const high) { +#if INSTRSET >= 5 // SSE4.1 supported + __m128i maxval = _mm_set1_epi32(0x0000FFFF); // maximum value + __m128i low1 = _mm_min_epu32(low, maxval); // upper limit + __m128i high1 = _mm_min_epu32(high, maxval); // upper limit + return _mm_packus_epi32(low1, high1); // this instruction saturates from signed 32 bit to unsigned 16 bit +#else + __m128i zero = _mm_setzero_si128(); + __m128i lowzero = _mm_cmpeq_epi16(low, zero); // for each word is zero + __m128i highzero = _mm_cmpeq_epi16(high, zero); // for each word is zero + __m128i mone = _mm_set1_epi32(-1); // FFFFFFFF + __m128i lownz = _mm_xor_si128(lowzero, mone); // for each word is nonzero + __m128i highnz = _mm_xor_si128(highzero, mone); // for each word is nonzero + __m128i lownz2 = _mm_srli_epi32(lownz, 16); // shift down to low dword + __m128i highnz2 = _mm_srli_epi32(highnz, 16); // shift down to low dword + __m128i lowsatur = _mm_or_si128(low, lownz2); // low, saturated + __m128i hisatur = _mm_or_si128(high, highnz2); // high, saturated + return Vec8us(compress(Vec4i(lowsatur), Vec4i(hisatur))); +#endif +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Signed to unsigned, with saturation +static inline Vec8us compress_saturated_s2u(Vec4i const low, Vec4i const high) { +#if INSTRSET >= 5 // SSE4.1 supported + return _mm_packus_epi32(low, high); // this instruction saturates from signed 32 bit to unsigned 16 bit +#else + __m128i val_32 = _mm_set1_epi32(0x8000); + __m128i val_16 = _mm_set1_epi16(-0x8000); + __m128i low1 = _mm_sub_epi32(low, val_32); + __m128i high1 = _mm_sub_epi32(high, val_32); + return _mm_add_epi16(_mm_packs_epi32(low1, high1), val_16); +#endif +} + +// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Overflow wraps around +static inline Vec4i compress(Vec2q const low, Vec2q const high) { + __m128i low2 = _mm_shuffle_epi32(low, 0xD8); // low dwords of low to pos. 0 and 32 + __m128i high2 = _mm_shuffle_epi32(high, 0xD8); // low dwords of high to pos. 
0 and 32 + return _mm_unpacklo_epi64(low2, high2); // interleave +} + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Signed, with saturation +// This function is very inefficient unless the SSE4.2 instruction set is supported +static inline Vec4i compress_saturated(Vec2q const low, Vec2q const high) { + Vec2q maxval = _mm_set_epi32(0, 0x7FFFFFFF, 0, 0x7FFFFFFF); + Vec2q minval = _mm_set_epi32(-1, 0x80000000, -1, 0x80000000); + Vec2q low1 = min(low, maxval); + Vec2q high1 = min(high, maxval); + Vec2q low2 = max(low1, minval); + Vec2q high2 = max(high1, minval); + return compress(low2, high2); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec4ui compress(Vec2uq const low, Vec2uq const high) { + return Vec4ui(compress((Vec2q)low, (Vec2q)high)); +} + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Unsigned, with saturation +static inline Vec4ui compress_saturated(Vec2uq const low, Vec2uq const high) { + __m128i zero = _mm_setzero_si128(); + __m128i lowzero = _mm_cmpeq_epi32(low, zero); // for each dword is zero + __m128i highzero = _mm_cmpeq_epi32(high, zero); // for each dword is zero + __m128i mone = _mm_set1_epi32(-1); // FFFFFFFF + __m128i lownz = _mm_xor_si128(lowzero, mone); // for each dword is nonzero + __m128i highnz = _mm_xor_si128(highzero, mone); // for each dword is nonzero + __m128i lownz2 = _mm_srli_epi64(lownz, 32); // shift down to low dword + __m128i highnz2 = _mm_srli_epi64(highnz, 32); // shift down to low dword + __m128i lowsatur = _mm_or_si128(low, lownz2); // low, saturated + __m128i hisatur = _mm_or_si128(high, highnz2); // high, saturated + return Vec4ui(compress(Vec2q(lowsatur), Vec2q(hisatur))); +} + + + +/***************************************************************************** +* +* Integer division operators +* +****************************************************************************** +* +* The instruction set does not support integer vector division. Instead, we +* are using a method for fast integer division based on multiplication and +* shift operations. This method is faster than simple integer division if the +* same divisor is used multiple times. +* +* All elements in a vector are divided by the same divisor. It is not possible +* to divide different elements of the same vector by different divisors. +* +* The parameters used for fast division are stored in an object of a +* Divisor class. This object can be created implicitly, for example in: +* Vec4i a, b; int c; +* a = b / c; +* or explicitly as: +* a = b / Divisor_i(c); +* +* It takes more time to compute the parameters used for fast division than to +* do the division. Therefore, it is advantageous to use the same divisor object +* multiple times. For example, to divide 80 unsigned short integers by 10: +* +* uint16_t dividends[80], quotients[80]; // numbers to work with +* Divisor_us div10(10); // make divisor object for dividing by 10 +* Vec8us temp; // temporary vector +* for (int i = 0; i < 80; i += 8) { // loop for 4 elements per iteration +* temp.load(dividends+i); // load 4 elements +* temp /= div10; // divide each element by 10 +* temp.store(quotients+i); // store 4 elements +* } +* +* The parameters for fast division can also be computed at compile time. This is +* an advantage if the divisor is known at compile time. Use the const_int or const_uint +* macro to do this. 
For example, for signed integers: +* Vec8s a, b; +* a = b / const_int(10); +* Or, for unsigned integers: +* Vec8us a, b; +* a = b / const_uint(10); +* +* The division of a vector of 16-bit integers is faster than division of a vector +* of other integer sizes. +* +* +* Mathematical formula, used for signed division with fixed or variable divisor: +* (From T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication, +* Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation. +* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 ) +* x = dividend +* d = abs(divisor) +* w = integer word size, bits +* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1 +* L = max(L,1) +* m = 1 + 2^(w+L-1)/d - 2^w [division should overflow to 0 if d = 1] +* sh1 = L-1 +* q = x + (m*x >> w) [high part of signed multiplication with 2w bits] +* q = (q >> sh1) - (x<0 ? -1 : 0) +* if (divisor < 0) q = -q +* result = trunc(x/d) = q +* +* Mathematical formula, used for unsigned division with variable divisor: +* (Also from T. Granlund and P. L. Montgomery) +* x = dividend +* d = divisor +* w = integer word size, bits +* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1 +* m = 1 + 2^w * (2^L-d) / d [2^L should overflow to 0 if L = w] +* sh1 = min(L,1) +* sh2 = max(L-1,0) +* t = m*x >> w [high part of unsigned multiplication with 2w bits] +* result = floor(x/d) = (((x-t) >> sh1) + t) >> sh2 +* +* Mathematical formula, used for unsigned division with fixed divisor: +* (From Terje Mathisen, unpublished) +* x = dividend +* d = divisor +* w = integer word size, bits +* b = floor(log2(d)) = bit_scan_reverse(d) +* f = 2^(w+b) / d [exact division] +* If f is an integer then d is a power of 2 then go to case A +* If the fractional part of f is < 0.5 then go to case B +* If the fractional part of f is > 0.5 then go to case C +* Case A: [shift only] +* result = x >> b +* Case B: [round down f and compensate by adding one to x] +* result = ((x+1)*floor(f)) >> (w+b) [high part of unsigned multiplication with 2w bits] +* Case C: [round up f, no compensation for rounding error] +* result = (x*ceil(f)) >> (w+b) [high part of unsigned multiplication with 2w bits] +* +* +*****************************************************************************/ + +// encapsulate parameters for fast division on vector of 4 32-bit signed integers +class Divisor_i { +protected: + __m128i multiplier; // multiplier used in fast division + __m128i shift1; // shift count used in fast division + __m128i sign; // sign of divisor +public: + Divisor_i() {}; // Default constructor + Divisor_i(int32_t d) { // Constructor with divisor + set(d); + } + Divisor_i(int m, int s1, int sgn) { // Constructor with precalculated multiplier, shift and sign + multiplier = _mm_set1_epi32(m); + shift1 = _mm_cvtsi32_si128(s1); + sign = _mm_set1_epi32(sgn); + } + void set(int32_t d) { // Set or change divisor, calculate parameters + const int32_t d1 = ::abs(d); + int32_t sh, m; + if (d1 > 1) { + sh = (int)bit_scan_reverse(uint32_t(d1 - 1)); // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1 + m = int32_t((int64_t(1) << (32 + sh)) / d1 - ((int64_t(1) << 32) - 1)); // calculate multiplier + } + else { + m = 1; // for d1 = 1 + sh = 0; + if (d == 0) m /= d; // provoke error here if d = 0 + if (uint32_t(d) == 0x80000000u) { // fix overflow for this special case + m = 0x80000001; + sh = 30; + } + } + multiplier = _mm_set1_epi32(m); // broadcast multiplier + shift1 = _mm_cvtsi32_si128(sh); // shift count + //sign 
= _mm_set1_epi32(d < 0 ? -1 : 0); // bug in VS2019, 32 bit release. Replace by this: + if (d < 0) sign = _mm_set1_epi32(-1); else sign = _mm_set1_epi32(0); // sign of divisor + } + __m128i getm() const { // get multiplier + return multiplier; + } + __m128i gets1() const { // get shift count + return shift1; + } + __m128i getsign() const { // get sign of divisor + return sign; + } +}; + +// encapsulate parameters for fast division on vector of 4 32-bit unsigned integers +class Divisor_ui { +protected: + __m128i multiplier; // multiplier used in fast division + __m128i shift1; // shift count 1 used in fast division + __m128i shift2; // shift count 2 used in fast division +public: + Divisor_ui() {}; // Default constructor + Divisor_ui(uint32_t d) { // Constructor with divisor + set(d); + } + Divisor_ui(uint32_t m, int s1, int s2) { // Constructor with precalculated multiplier and shifts + multiplier = _mm_set1_epi32((int32_t)m); + shift1 = _mm_setr_epi32(s1, 0, 0, 0); + shift2 = _mm_setr_epi32(s2, 0, 0, 0); + } + void set(uint32_t d) { // Set or change divisor, calculate parameters + uint32_t L, L2, sh1, sh2, m; + switch (d) { + case 0: + m = sh1 = sh2 = 1 / d; // provoke error for d = 0 + break; + case 1: + m = 1; sh1 = sh2 = 0; // parameters for d = 1 + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; // parameters for d = 2 + break; + default: // general case for d > 2 + L = bit_scan_reverse(d - 1) + 1; // ceil(log2(d)) + L2 = uint32_t(L < 32 ? 1 << L : 0); // 2^L, overflow to 0 if L = 32 + m = 1 + uint32_t((uint64_t(L2 - d) << 32) / d);// multiplier + sh1 = 1; sh2 = L - 1; // shift counts + } + multiplier = _mm_set1_epi32((int32_t)m); + shift1 = _mm_setr_epi32((int32_t)sh1, 0, 0, 0); + shift2 = _mm_setr_epi32((int32_t)sh2, 0, 0, 0); + } + __m128i getm() const { // get multiplier + return multiplier; + } + __m128i gets1() const { // get shift count 1 + return shift1; + } + __m128i gets2() const { // get shift count 2 + return shift2; + } +}; + + +// encapsulate parameters for fast division on vector of 8 16-bit signed integers +class Divisor_s { +protected: + __m128i multiplier; // multiplier used in fast division + __m128i shift1; // shift count used in fast division + __m128i sign; // sign of divisor +public: + Divisor_s() {}; // Default constructor + Divisor_s(int16_t d) { // Constructor with divisor + set(d); + } + Divisor_s(int16_t m, int s1, int sgn) { // Constructor with precalculated multiplier, shift and sign + multiplier = _mm_set1_epi16(m); + shift1 = _mm_setr_epi32(s1, 0, 0, 0); + sign = _mm_set1_epi32(sgn); + } + void set(int16_t d) { // Set or change divisor, calculate parameters + const int32_t d1 = ::abs(d); + int32_t sh, m; + if (d1 > 1) { + sh = (int32_t)bit_scan_reverse(uint32_t(d1-1));// shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1 + m = ((int32_t(1) << (16 + sh)) / d1 - ((int32_t(1) << 16) - 1)); // calculate multiplier + } + else { + m = 1; // for d1 = 1 + sh = 0; + if (d == 0) m /= d; // provoke error here if d = 0 + if (uint16_t(d) == 0x8000u) { // fix overflow for this special case + m = 0x8001; + sh = 14; + } + } + multiplier = _mm_set1_epi16(int16_t(m)); // broadcast multiplier + shift1 = _mm_setr_epi32(sh, 0, 0, 0); // shift count + sign = _mm_set1_epi32(d < 0 ? 
-1 : 0); // sign of divisor + } + __m128i getm() const { // get multiplier + return multiplier; + } + __m128i gets1() const { // get shift count + return shift1; + } + __m128i getsign() const { // get sign of divisor + return sign; + } +}; + + +// encapsulate parameters for fast division on vector of 8 16-bit unsigned integers +class Divisor_us { +protected: + __m128i multiplier; // multiplier used in fast division + __m128i shift1; // shift count 1 used in fast division + __m128i shift2; // shift count 2 used in fast division +public: + Divisor_us() {}; // Default constructor + Divisor_us(uint16_t d) { // Constructor with divisor + set(d); + } + Divisor_us(uint16_t m, int s1, int s2) { // Constructor with precalculated multiplier and shifts + multiplier = _mm_set1_epi16((int16_t)m); + shift1 = _mm_setr_epi32(s1, 0, 0, 0); + shift2 = _mm_setr_epi32(s2, 0, 0, 0); + } + void set(uint16_t d) { // Set or change divisor, calculate parameters + uint16_t L, L2, sh1, sh2, m; + switch (d) { + case 0: + m = sh1 = sh2 = 1u / d; // provoke error for d = 0 + break; + case 1: + m = 1; sh1 = sh2 = 0; // parameters for d = 1 + break; + case 2: + m = 1; sh1 = 1; sh2 = 0; // parameters for d = 2 + break; + default: // general case for d > 2 + L = (uint16_t)bit_scan_reverse(d - 1u) + 1u; // ceil(log2(d)) + L2 = uint16_t(1 << L); // 2^L, overflow to 0 if L = 16 + m = 1u + uint16_t((uint32_t(L2 - d) << 16) / d); // multiplier + sh1 = 1; sh2 = L - 1u; // shift counts + } + multiplier = _mm_set1_epi16((int16_t)m); + shift1 = _mm_setr_epi32((int32_t)sh1, 0, 0, 0); + shift2 = _mm_setr_epi32((int32_t)sh2, 0, 0, 0); + } + __m128i getm() const { // get multiplier + return multiplier; + } + __m128i gets1() const { // get shift count 1 + return shift1; + } + __m128i gets2() const { // get shift count 2 + return shift2; + } +}; + + +// vector operator / : divide each element by divisor + +// vector of 4 32-bit signed integers +static inline Vec4i operator / (Vec4i const a, Divisor_i const d) { +#if INSTRSET >= 5 + __m128i t1 = _mm_mul_epi32(a, d.getm()); // 32x32->64 bit signed multiplication of a[0] and a[2] + __m128i t2 = _mm_srli_epi64(t1, 32); // high dword of result 0 and 2 + __m128i t3 = _mm_srli_epi64(a, 32); // get a[1] and a[3] into position for multiplication + __m128i t4 = _mm_mul_epi32(t3, d.getm()); // 32x32->64 bit signed multiplication of a[1] and a[3] + __m128i t7 = _mm_blend_epi16(t2, t4, 0xCC); + __m128i t8 = _mm_add_epi32(t7, a); // add + __m128i t9 = _mm_sra_epi32(t8, d.gets1()); // shift right arithmetic + __m128i t10 = _mm_srai_epi32(a, 31); // sign of a + __m128i t11 = _mm_sub_epi32(t10, d.getsign()); // sign of a - sign of d + __m128i t12 = _mm_sub_epi32(t9, t11); // + 1 if a < 0, -1 if d < 0 + return _mm_xor_si128(t12, d.getsign()); // change sign if divisor negative +#else // not SSE4.1 + __m128i t1 = _mm_mul_epu32(a, d.getm()); // 32x32->64 bit unsigned multiplication of a[0] and a[2] + __m128i t2 = _mm_srli_epi64(t1, 32); // high dword of result 0 and 2 + __m128i t3 = _mm_srli_epi64(a, 32); // get a[1] and a[3] into position for multiplication + __m128i t4 = _mm_mul_epu32(t3, d.getm()); // 32x32->64 bit unsigned multiplication of a[1] and a[3] + __m128i t5 = _mm_set_epi32(-1, 0, -1, 0); // mask of dword 1 and 3 + __m128i t6 = _mm_and_si128(t4, t5); // high dword of result 1 and 3 + __m128i t7 = _mm_or_si128(t2, t6); // combine all four results of unsigned high mul into one vector + // convert unsigned to signed high multiplication (from: H S Warren: Hacker's delight, 2003, p. 
132) + __m128i u1 = _mm_srai_epi32(a, 31); // sign of a + __m128i u2 = _mm_srai_epi32(d.getm(), 31); // sign of m [ m is always negative, except for abs(d) = 1 ] + __m128i u3 = _mm_and_si128(d.getm(), u1); // m * sign of a + __m128i u4 = _mm_and_si128(a, u2); // a * sign of m + __m128i u5 = _mm_add_epi32(u3, u4); // sum of sign corrections + __m128i u6 = _mm_sub_epi32(t7, u5); // high multiplication result converted to signed + __m128i t8 = _mm_add_epi32(u6, a); // add a + __m128i t9 = _mm_sra_epi32(t8, d.gets1()); // shift right arithmetic + __m128i t10 = _mm_sub_epi32(u1, d.getsign()); // sign of a - sign of d + __m128i t11 = _mm_sub_epi32(t9, t10); // + 1 if a < 0, -1 if d < 0 + return _mm_xor_si128(t11, d.getsign()); // change sign if divisor negative +#endif +} + +// vector of 4 32-bit unsigned integers +static inline Vec4ui operator / (Vec4ui const a, Divisor_ui const d) { + __m128i t1 = _mm_mul_epu32(a, d.getm()); // 32x32->64 bit unsigned multiplication of a[0] and a[2] + __m128i t2 = _mm_srli_epi64(t1, 32); // high dword of result 0 and 2 + __m128i t3 = _mm_srli_epi64(a, 32); // get a[1] and a[3] into position for multiplication + __m128i t4 = _mm_mul_epu32(t3, d.getm()); // 32x32->64 bit unsigned multiplication of a[1] and a[3] +#if INSTRSET >= 5 // SSE4.1 supported + __m128i t7 = _mm_blend_epi16(t2, t4, 0xCC); // blend two results +#else + __m128i t5 = _mm_set_epi32(-1, 0, -1, 0); // mask of dword 1 and 3 + __m128i t6 = _mm_and_si128(t4, t5); // high dword of result 1 and 3 + __m128i t7 = _mm_or_si128(t2, t6); // combine all four results into one vector +#endif + __m128i t8 = _mm_sub_epi32(a, t7); // subtract + __m128i t9 = _mm_srl_epi32(t8, d.gets1()); // shift right logical + __m128i t10 = _mm_add_epi32(t7, t9); // add + return _mm_srl_epi32(t10, d.gets2()); // shift right logical +} + +// vector of 8 16-bit signed integers +static inline Vec8s operator / (Vec8s const a, Divisor_s const d) { + __m128i t1 = _mm_mulhi_epi16(a, d.getm()); // multiply high signed words + __m128i t2 = _mm_add_epi16(t1, a); // + a + __m128i t3 = _mm_sra_epi16(t2, d.gets1()); // shift right arithmetic + __m128i t4 = _mm_srai_epi16(a, 15); // sign of a + __m128i t5 = _mm_sub_epi16(t4, d.getsign()); // sign of a - sign of d + __m128i t6 = _mm_sub_epi16(t3, t5); // + 1 if a < 0, -1 if d < 0 + return _mm_xor_si128(t6, d.getsign()); // change sign if divisor negative +} + +// vector of 8 16-bit unsigned integers +static inline Vec8us operator / (Vec8us const a, Divisor_us const d) { + __m128i t1 = _mm_mulhi_epu16(a, d.getm()); // multiply high unsigned words + __m128i t2 = _mm_sub_epi16(a, t1); // subtract + __m128i t3 = _mm_srl_epi16(t2, d.gets1()); // shift right logical + __m128i t4 = _mm_add_epi16(t1, t3); // add + return _mm_srl_epi16(t4, d.gets2()); // shift right logical +} + + +// vector of 16 8-bit signed integers +static inline Vec16c operator / (Vec16c const a, Divisor_s const d) { + // expand into two Vec8s + Vec8s low = extend_low(a) / d; + Vec8s high = extend_high(a) / d; + return compress(low, high); +} + +// vector of 16 8-bit unsigned integers +static inline Vec16uc operator / (Vec16uc const a, Divisor_us const d) { + // expand into two Vec8s + Vec8us low = extend_low(a) / d; + Vec8us high = extend_high(a) / d; + return compress(low, high); +} + +// vector operator /= : divide +static inline Vec8s & operator /= (Vec8s & a, Divisor_s const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec8us & operator /= (Vec8us & a, Divisor_us const d) { + a = a / d; + 
return a;
+}
+
+// vector operator /= : divide
+static inline Vec4i & operator /= (Vec4i & a, Divisor_i const d) {
+    a = a / d;
+    return a;
+}
+
+// vector operator /= : divide
+static inline Vec4ui & operator /= (Vec4ui & a, Divisor_ui const d) {
+    a = a / d;
+    return a;
+}
+
+// vector operator /= : divide
+static inline Vec16c & operator /= (Vec16c & a, Divisor_s const d) {
+    a = a / d;
+    return a;
+}
+
+// vector operator /= : divide
+static inline Vec16uc & operator /= (Vec16uc & a, Divisor_us const d) {
+    a = a / d;
+    return a;
+}
+
+/*****************************************************************************
+*
+* Integer division 2: divisor is a compile-time constant
+*
+*****************************************************************************/
+
+// Divide Vec4i by compile-time constant
+template <int32_t d>
+static inline Vec4i divide_by_i(Vec4i const x) {
+    static_assert(d != 0, "Integer division by zero"); // Error message if dividing by zero
+    if constexpr (d == 1) return x;
+    if constexpr (d == -1) return -x;
+    if constexpr (uint32_t(d) == 0x80000000u) return Vec4i(x == Vec4i(0x80000000)) & 1; // prevent overflow when changing sign
+    constexpr uint32_t d1 = d > 0 ? uint32_t(d) : uint32_t(-d); // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+    if constexpr ((d1 & (d1 - 1)) == 0) {
+        // d1 is a power of 2. use shift
+        constexpr int k = bit_scan_reverse_const(d1);
+        __m128i sign;
+        if constexpr (k > 1) sign = _mm_srai_epi32(x, k-1); else sign = x; // k copies of sign bit
+        __m128i bias = _mm_srli_epi32(sign, 32 - k); // bias = x >= 0 ? 0 : k-1
+        __m128i xpbias = _mm_add_epi32(x, bias); // x + bias
+        __m128i q = _mm_srai_epi32(xpbias, k); // (x + bias) >> k
+        if (d > 0) return q; // d > 0: return q
+        return _mm_sub_epi32(_mm_setzero_si128(), q); // d < 0: return -q
+    }
+    // general case
+    constexpr int32_t sh = bit_scan_reverse_const(uint32_t(d1) - 1); // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
+    constexpr int32_t mult = int(1 + (uint64_t(1) << (32 + sh)) / uint32_t(d1) - (int64_t(1) << 32)); // multiplier
+    const Divisor_i div(mult, sh, d < 0 ? -1 : 0);
+    return x / div;
+}
+
+// define Vec4i a / const_int(d)
+template <int32_t d>
+static inline Vec4i operator / (Vec4i const a, Const_int_t<d>) {
+    return divide_by_i<d>(a);
+}
+
+// define Vec4i a / const_uint(d)
+template <uint32_t d>
+static inline Vec4i operator / (Vec4i const a, Const_uint_t<d>) {
+    static_assert(d < 0x80000000u, "Dividing signed by overflowing unsigned");
+    return divide_by_i<int32_t(d)>(a); // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec4i & operator /= (Vec4i & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec4i & operator /= (Vec4i & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+
+// Divide Vec4ui by compile-time constant
+template <uint32_t d>
+static inline Vec4ui divide_by_ui(Vec4ui const x) {
+    static_assert(d != 0, "Integer division by zero"); // Error message if dividing by zero
+    if constexpr (d == 1) return x; // divide by 1
+    constexpr int b = bit_scan_reverse_const(d); // floor(log2(d))
+    if constexpr ((uint32_t(d) & (uint32_t(d) - 1)) == 0) {
+        // d is a power of 2. use shift
+        return _mm_srli_epi32(x, b); // x >> b
+    }
+    // general case (d > 2)
+    constexpr uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d); // multiplier = 2^(32+b) / d
+    constexpr uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d
+    constexpr bool round_down = (2 * rem < d); // check if fraction is less than 0.5
+    constexpr uint32_t mult1 = round_down ? mult : mult + 1;
+    // do 32*32->64 bit unsigned multiplication and get high part of result
+#if INSTRSET >= 10
+    const __m128i multv = _mm_maskz_set1_epi32(0x05, mult1); // zero-extend mult and broadcast
+#else
+    const __m128i multv = Vec2uq(uint64_t(mult1)); // zero-extend mult and broadcast
+#endif
+    __m128i t1 = _mm_mul_epu32(x, multv); // 32x32->64 bit unsigned multiplication of x[0] and x[2]
+    if constexpr (round_down) {
+        t1 = _mm_add_epi64(t1, multv); // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+    __m128i t2 = _mm_srli_epi64(t1, 32); // high dword of result 0 and 2
+    __m128i t3 = _mm_srli_epi64(x, 32); // get x[1] and x[3] into position for multiplication
+    __m128i t4 = _mm_mul_epu32(t3, multv); // 32x32->64 bit unsigned multiplication of x[1] and x[3]
+    if constexpr (round_down) {
+        t4 = _mm_add_epi64(t4, multv); // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
+    }
+#if INSTRSET >= 5 // SSE4.1 supported
+    __m128i t7 = _mm_blend_epi16(t2, t4, 0xCC); // blend two results
+#else
+    __m128i t5 = _mm_set_epi32(-1, 0, -1, 0); // mask of dword 1 and 3
+    __m128i t6 = _mm_and_si128(t4, t5); // high dword of result 1 and 3
+    __m128i t7 = _mm_or_si128(t2, t6); // combine all four results into one vector
+#endif
+    Vec4ui q = _mm_srli_epi32(t7, b); // shift right by b
+    return q; // no overflow possible
+}
+
+// define Vec4ui a / const_uint(d)
+template <uint32_t d>
+static inline Vec4ui operator / (Vec4ui const a, Const_uint_t<d>) {
+    return divide_by_ui<d>(a);
+}
+
+// define Vec4ui a / const_int(d)
+template <int32_t d>
+static inline Vec4ui operator / (Vec4ui const a, Const_int_t<d>) {
+    static_assert(d < 0x80000000u, "Dividing unsigned integer by negative value is ambiguous");
+    return divide_by_ui<uint32_t(d)>(a); // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec4ui & operator /= (Vec4ui & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec4ui & operator /= (Vec4ui & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+
+// Divide Vec8s by compile-time constant
+template <int d>
+static inline Vec8s divide_by_i(Vec8s const x) {
+    constexpr int16_t d0 = int16_t(d); // truncate d to 16 bits
+    static_assert(d0 != 0, "Integer division by zero"); // Error message if dividing by zero
+    if constexpr (d0 == 1) return x; // divide by 1
+    if constexpr (d0 == -1) return -x; // divide by -1
+    if constexpr (uint16_t(d0) == 0x8000u) return Vec8s(x == Vec8s(0x8000)) & 1; // prevent overflow when changing sign
+    // if (d > 0x7FFF || d < -0x8000) return 0; // not relevant when d truncated to 16 bits
+    const uint32_t d1 = d > 0 ? uint32_t(d) : uint32_t(-d); // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
+    if constexpr ((d1 & (d1 - 1)) == 0) {
+        // d is a power of 2. use shift
+        constexpr int k = bit_scan_reverse_const(uint32_t(d1));
+        __m128i sign;
+        if constexpr (k > 1) sign = _mm_srai_epi16(x, k-1); else sign = x; // k copies of sign bit
+        __m128i bias = _mm_srli_epi16(sign, 16 - k); // bias = x >= 0 ? 0 : k-1
+        __m128i xpbias = _mm_add_epi16(x, bias); // x + bias
+        __m128i q = _mm_srai_epi16(xpbias, k); // (x + bias) >> k
+        if (d0 > 0) return q; // d0 > 0: return q
+        return _mm_sub_epi16(_mm_setzero_si128(), q); // d0 < 0: return -q
+    }
+    // general case
+    const int L = bit_scan_reverse_const(uint16_t(d1 - 1)) + 1; // ceil(log2(d)). (d < 2 handled above)
+    const int16_t mult = int16_t(1 + (1u << (15 + L)) / uint32_t(d1) - 0x10000); // multiplier
+    const int shift1 = L - 1;
+    const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1);
+    return x / div;
+}
+
+// define Vec8s a / const_int(d)
+template <int d>
+static inline Vec8s operator / (Vec8s const a, Const_int_t<d>) {
+    return divide_by_i<d>(a);
+}
+
+// define Vec8s a / const_uint(d)
+template <uint32_t d>
+static inline Vec8s operator / (Vec8s const a, Const_uint_t<d>) {
+    static_assert(d < 0x8000u, "Dividing signed by overflowing unsigned");
+    return divide_by_i<int(d)>(a); // signed divide
+}
+
+// vector operator /= : divide
+template <int d>
+static inline Vec8s & operator /= (Vec8s & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8s & operator /= (Vec8s & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+
+// Divide Vec8us by compile-time constant
+template <uint32_t d>
+static inline Vec8us divide_by_ui(Vec8us const x) {
+    constexpr uint16_t d0 = uint16_t(d); // truncate d to 16 bits
+    static_assert(d0 != 0, "Integer division by zero"); // Error message if dividing by zero
+    if constexpr (d0 == 1) return x; // divide by 1
+    constexpr int b = bit_scan_reverse_const(d0); // floor(log2(d))
+    if constexpr ((d0 & (d0 - 1u)) == 0) {
+        // d is a power of 2. use shift
+        return _mm_srli_epi16(x, b); // x >> b
+    }
+    // general case (d > 2)
+    constexpr uint16_t mult = uint16_t((1u << uint32_t(b+16)) / d0); // multiplier = 2^(16+b) / d
+    constexpr uint32_t rem = (uint32_t(1) << uint32_t(b + 16)) - uint32_t(d0) * mult; // remainder 2^(16+b) % d
+    constexpr bool round_down = (2u * rem < d0); // check if fraction is less than 0.5
+    Vec8us x1 = x;
+    if (round_down) {
+        x1 = x1 + 1u; // round down mult and compensate by adding 1 to x
+    }
+    constexpr uint16_t mult1 = round_down ? mult : mult + 1;
+    const __m128i multv = _mm_set1_epi16((int16_t)mult1); // broadcast mult
+    __m128i xm = _mm_mulhi_epu16(x1, multv); // high part of 16x16->32 bit unsigned multiplication
+    Vec8us q = _mm_srli_epi16(xm, (int)b); // shift right by b
+    if constexpr (round_down) {
+        Vec8sb overfl = (x1 == (Vec8us)_mm_setzero_si128()); // check for overflow of x+1
+        return select(overfl, Vec8us(uint16_t(mult1 >> (uint16_t)b)), q); // deal with overflow (rarely needed)
+    }
+    else {
+        return q; // no overflow possible
+    }
+}
+
+// define Vec8us a / const_uint(d)
+template <uint32_t d>
+static inline Vec8us operator / (Vec8us const a, Const_uint_t<d>) {
+    return divide_by_ui<d>(a);
+}
+
+// define Vec8us a / const_int(d)
+template <int d>
+static inline Vec8us operator / (Vec8us const a, Const_int_t<d>) {
+    static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous");
+    return divide_by_ui<uint32_t(d)>(a); // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec8us & operator /= (Vec8us & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator /= : divide
+template <int d>
+static inline Vec8us & operator /= (Vec8us & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+
+// define Vec16c a / const_int(d)
+template <int d>
+static inline Vec16c operator / (Vec16c const a, Const_int_t<d>) {
+    // expand into two Vec8s
+    Vec8s low = extend_low(a) / Const_int_t<d>();
+    Vec8s high = extend_high(a) / Const_int_t<d>();
+    return compress(low, high);
+}
+
+// define Vec16c a / const_uint(d)
+template <uint32_t d>
+static inline Vec16c operator / (Vec16c const a, Const_uint_t<d>) {
+    static_assert (uint8_t(d) < 0x80u, "Dividing signed integer by overflowing unsigned");
+    return a / Const_int_t<d>(); // signed divide
+}
+
+// vector operator /= : divide
+template <int d>
+static inline Vec16c & operator /= (Vec16c & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16c & operator /= (Vec16c & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// define Vec16uc a / const_uint(d)
+template <uint32_t d>
+static inline Vec16uc operator / (Vec16uc const a, Const_uint_t<d>) {
+    // expand into two Vec8us
+    Vec8us low = extend_low(a) / Const_uint_t<d>();
+    Vec8us high = extend_high(a) / Const_uint_t<d>();
+    return compress(low, high);
+}
+
+// define Vec16uc a / const_int(d)
+template <int d>
+static inline Vec16uc operator / (Vec16uc const a, Const_int_t<d>) {
+    static_assert (int8_t(d) >= 0, "Dividing unsigned integer by negative is ambiguous");
+    return a / Const_uint_t<d>(); // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16uc & operator /= (Vec16uc & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator /= : divide
+template <int d>
+static inline Vec16uc & operator /= (Vec16uc & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+
+/*****************************************************************************
+*
+* Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+#if INSTRSET >= 9 // compact boolean vector Vec16b
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint16_t to_bits(Vec16b const x) {
+    return __mmask16(x);
+}
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec8b const x) {
+    return uint8_t(Vec8b_masktype(x));
+}
+
+#endif
+
+#if INSTRSET >= 10 // compact boolean vectors, other sizes
+
+// to_bits: convert boolean vector to integer bitfield
+static inline uint8_t to_bits(Vec4b const x) {
+    return __mmask8(x) & 0x0F;
+}
+
+//
to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec2b const x) { + return __mmask8(x) & 0x03; +} + +#else // broad boolean vectors + +// to_bits: convert boolean vector to integer bitfield +static inline uint16_t to_bits(Vec16cb const x) { + return (uint16_t)_mm_movemask_epi8(x); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec8sb const x) { + __m128i a = _mm_packs_epi16(x, x); // 16-bit words to bytes + return (uint8_t)_mm_movemask_epi8(a); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec4ib const x) { + __m128i a = _mm_packs_epi32(x, x); // 32-bit dwords to 16-bit words + __m128i b = _mm_packs_epi16(a, a); // 16-bit words to bytes + return uint8_t(_mm_movemask_epi8(b) & 0xF); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint8_t to_bits(Vec2qb const x) { + uint32_t a = (uint32_t)_mm_movemask_epi8(x); + return (a & 1) | ((a >> 7) & 2); +} + +#endif + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORI128_H diff --git a/DFTTest/VCL2/vectori256.h b/DFTTest/VCL2/vectori256.h new file mode 100644 index 0000000..035693f --- /dev/null +++ b/DFTTest/VCL2/vectori256.h @@ -0,0 +1,5778 @@ +/**************************** vectori256.h ******************************* +* Author: Agner Fog +* Date created: 2012-05-30 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library +* Description: +* Header file defining integer vector classes as interface to intrinsic +* functions in x86 microprocessors with AVX2 and later instruction sets. +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec256b Vector of 256 bits. Used internally as base class +* Vec32c Vector of 32 8-bit signed integers +* Vec32uc Vector of 32 8-bit unsigned integers +* Vec32cb Vector of 32 Booleans for use with Vec32c and Vec32uc +* Vec16s Vector of 16 16-bit signed integers +* Vec16us Vector of 16 16-bit unsigned integers +* Vec16sb Vector of 16 Booleans for use with Vec16s and Vec16us +* Vec8i Vector of 8 32-bit signed integers +* Vec8ui Vector of 8 32-bit unsigned integers +* Vec8ib Vector of 8 Booleans for use with Vec8i and Vec8ui +* Vec4q Vector of 4 64-bit signed integers +* Vec4uq Vector of 4 64-bit unsigned integers +* Vec4qb Vector of 4 Booleans for use with Vec4q and Vec4uq +* +* Each vector object is represented internally in the CPU as a 256-bit register. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +*****************************************************************************/ + +#ifndef VECTORI256_H +#define VECTORI256_H 1 + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +// check combination of header files +#if defined (VECTORI256E_H) +#error Two different versions of vectori256.h included +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + +// Generate a constant vector of 8 integers stored in memory. 
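+// Usage sketch (hypothetical values): constant8ui<7,6,5,4,3,2,1,0>() returns a __m256i
+// holding those eight 32-bit constants, e.g. for use as a permutation index vector.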
+template + static inline constexpr __m256i constant8ui() { + /* + const union { + uint32_t i[8]; + __m256i ymm; + } u = { {i0,i1,i2,i3,i4,i5,i6,i7} }; + return u.ymm; + */ + return _mm256_setr_epi32(i0,i1,i2,i3,i4,i5,i6,i7); +} + + +// Join two 128-bit vectors +#define set_m128ir(lo,hi) _mm256_inserti128_si256(_mm256_castsi128_si256(lo),(hi),1) + +/***************************************************************************** +* +* Compact boolean vectors +* +*****************************************************************************/ + +#if INSTRSET >= 10 // 32-bit and 64-bit masks require AVX512BW + +// Compact vector of 32 booleans +class Vec32b { +protected: + __mmask32 mm; // Boolean mask register +public: + // Default constructor: + Vec32b() { + } + // Constructor to convert from type __mmask32 used in intrinsics + // Made explicit to prevent implicit conversion from int + Vec32b(__mmask32 x) { + mm = x; + } + /* + // Constructor to build from all elements: + Vec32b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, + bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15, + bool b16, bool b17, bool b18, bool b19, bool b20, bool b21, bool b22, bool b23, + bool b24, bool b25, bool b26, bool b27, bool b28, bool b29, bool b30, bool b31) { + mm = uint32_t( + (uint32_t)b0 | (uint32_t)b1 << 1 | (uint32_t)b2 << 2 | (uint32_t)b3 << 3 | + (uint32_t)b4 << 4 | (uint32_t)b5 << 5 | (uint32_t)b6 << 6 | (uint32_t)b7 << 7 | + (uint32_t)b8 << 8 | (uint32_t)b9 << 9 | (uint32_t)b10 << 10 | (uint32_t)b11 << 11 | + (uint32_t)b12 << 12 | (uint32_t)b13 << 13 | (uint32_t)b14 << 14 | (uint32_t)b15 << 15 | + (uint32_t)b16 << 16 | (uint32_t)b17 << 17 | (uint32_t)b18 << 18 | (uint32_t)b19 << 19 | + (uint32_t)b20 << 20 | (uint32_t)b21 << 21 | (uint32_t)b22 << 22 | (uint32_t)b23 << 23 | + (uint32_t)b24 << 24 | (uint32_t)b25 << 25 | (uint32_t)b26 << 26 | (uint32_t)b27 << 27 | + (uint32_t)b28 << 28 | (uint32_t)b29 << 29 | (uint32_t)b30 << 30 | (uint32_t)b31 << 31); + } */ + // Constructor to broadcast single value: + Vec32b(bool b) { + mm = __mmask32(-int32_t(b)); + } + // Constructor to make from two halves + Vec32b(Vec16b const x0, Vec16b const x1) { + mm = uint16_t(__mmask16(x0)) | uint32_t(__mmask16(x1)) << 16; + } + // Assignment operator to convert from type __mmask32 used in intrinsics: + Vec32b & operator = (__mmask32 x) { + mm = x; + return *this; + } + // Assignment operator to broadcast scalar value: + Vec32b & operator = (bool b) { + mm = Vec32b(b); + return *this; + } + // Type cast operator to convert to __mmask32 used in intrinsics + operator __mmask32() const { + return mm; + } + // split into two halves + Vec16b get_low() const { + return Vec16b(__mmask16(mm)); + } + Vec16b get_high() const { + return Vec16b(__mmask16(mm >> 16)); + } + // Member function to change a single element in vector + Vec32b const insert(int index, bool value) { + mm = __mmask32(((uint32_t)mm & ~(1 << index)) | (uint32_t)value << index); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return ((uint32_t)mm >> index) & 1; + } + // Extract a single element. Operator [] can only read an element, not write. 
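+    // (usage sketch with a hypothetical Vec32b m: bool b3 = m[3]; use m.insert(3, true) to write an element)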
+ bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec32b & load_bits(uint32_t a) { + mm = __mmask32(a); + return *this; + } + // Number of elements + static constexpr int size() { + return 32; + } + // Type of elements + static constexpr int elementtype() { + return 2; + } +}; + +#endif + + +/***************************************************************************** +* +* Vector of 256 bits. Used as base class +* +*****************************************************************************/ + +class Vec256b { +protected: + __m256i ymm; // Integer vector +public: + // Default constructor: + Vec256b() {} + + // Constructor to broadcast the same value into all elements + // Removed because of undesired implicit conversions: + //Vec256b(int i) {ymm = _mm256_set1_epi32(-(i & 1));} + + // Constructor to build from two Vec128b: + Vec256b(Vec128b const a0, Vec128b const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec256b(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec256b & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Type cast operator to convert to __m256i used in intrinsics + operator __m256i() const { + return ymm; + } + // Member function to load from array (unaligned) + Vec256b & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + // You may use load_a instead of load if you are certain that p points to an address + // divisible by 32, but there is hardly any speed advantage of load_a on modern processors + Vec256b & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Member function to store into array (unaligned) + void store(void * p) const { + _mm256_storeu_si256((__m256i*)p, ymm); + } + // Member function storing into array, aligned by 32 + // You may use store_a instead of store if you are certain that p points to an address + // divisible by 32, but there is hardly any speed advantage of load_a on modern processors + void store_a(void * p) const { + _mm256_store_si256((__m256i*)p, ymm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. 
+ // Note: Will generate runtime error if p is not aligned by 32 + void store_nt(void * p) const { + _mm256_stream_si256((__m256i*)p, ymm); + } + // Member functions to split into two Vec128b: + Vec128b get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec128b get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int size() { + return 256; + } + static constexpr int elementtype() { + return 1; + } + typedef __m256i registertype; +}; + + +// Define operators and functions for this class + +// vector operator & : bitwise and +static inline Vec256b operator & (Vec256b const a, Vec256b const b) { + return _mm256_and_si256(a, b); +} +static inline Vec256b operator && (Vec256b const a, Vec256b const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec256b operator | (Vec256b const a, Vec256b const b) { + return _mm256_or_si256(a, b); +} +static inline Vec256b operator || (Vec256b const a, Vec256b const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec256b operator ^ (Vec256b const a, Vec256b const b) { + return _mm256_xor_si256(a, b); +} + +// vector operator ~ : bitwise not +static inline Vec256b operator ~ (Vec256b const a) { + return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); +} + +// vector operator &= : bitwise and +static inline Vec256b & operator &= (Vec256b & a, Vec256b const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec256b & operator |= (Vec256b & a, Vec256b const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec256b & operator ^= (Vec256b & a, Vec256b const b) { + a = a ^ b; + return a; +} + +// function andnot: a & ~ b +static inline Vec256b andnot (Vec256b const a, Vec256b const b) { + return _mm256_andnot_si256(b, a); +} + +static inline __m256i zero_si256() { + return _mm256_setzero_si256(); +} + + +/***************************************************************************** +* +* selectb function +* +*****************************************************************************/ +// Select between two sources, byte by byte. Used in various functions and operators +// Corresponds to this pseudocode: +// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i]; +// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed. +// Only bit 7 in each byte of s is checked, +static inline __m256i selectb (__m256i const s, __m256i const a, __m256i const b) { + return _mm256_blendv_epi8 (b, a, s); +} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and (Vec256b const a) { + return _mm256_testc_si256(a,_mm256_set1_epi32(-1)) != 0; +} + +// horizontal_or. Returns true if at least one bit is 1 +static inline bool horizontal_or (Vec256b const a) { + return ! 
_mm256_testz_si256(a,a); +} + + +/***************************************************************************** +* +* Vector of 32 8-bit signed integers +* +*****************************************************************************/ + +class Vec32c : public Vec256b { +public: + // Default constructor: + Vec32c(){ + } + // Constructor to broadcast the same value into all elements: + Vec32c(int i) { + ymm = _mm256_set1_epi8((char)i); + } + // Constructor to build from all elements: + Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, + int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, + int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23, + int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31) { + ymm = _mm256_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31); + } + // Constructor to build from two Vec16c: + Vec32c(Vec16c const a0, Vec16c const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec32c(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec32c & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Constructor to convert from type Vec256b used in emulation + Vec32c(Vec256b const & x) { // gcc requires const & + ymm = x; + } + // Type cast operator to convert to __m256i used in intrinsics + operator __m256i() const { + return ymm; + } + // Member function to load from array (unaligned) + Vec32c & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec32c & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec32c & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_maskz_loadu_epi8(__mmask32(((uint64_t)1 << n) - 1), p); +#else + if (n <= 0) { + *this = 0; + } + else if (n <= 16) { + *this = Vec32c(Vec16c().load_partial(n, p), 0); + } + else if (n < 32) { + *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char const*)p+16)); + } + else { + load(p); + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + _mm256_mask_storeu_epi8(p, __mmask32(((uint64_t)1 << n) - 1), ymm); +#else + if (n <= 0) { + return; + } + else if (n <= 16) { + get_low().store_partial(n, p); + } + else if (n < 32) { + get_low().store(p); + get_high().store_partial(n-16, (char*)p+16); + } + else { + store(p); + } +#endif + } + // cut off vector to n elements. 
The last 32-n elements are set to zero + Vec32c & cutoff(int n) { +#if INSTRSET >= 10 + ymm = _mm256_maskz_mov_epi8(__mmask32(((uint64_t)1 << n) - 1), ymm); +#else + if (uint32_t(n) >= 32) return *this; + const union { + int32_t i[16]; + char c[64]; + } mask = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}}; + *this &= Vec32c().load(mask.c+32-n); +#endif + return *this; + } + // Member function to change a single element in vector + Vec32c const insert(int index, int8_t value) { +#if INSTRSET >= 10 + ymm = _mm256_mask_set1_epi8(ymm, __mmask32(1u << index), value); +#else + const int8_t maskl[64] = {0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, + -1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, 0,0,0,0, 0,0,0,0}; + __m256i broad = _mm256_set1_epi8(value); // broadcast value into all elements + __m256i mask = _mm256_loadu_si256((__m256i const*)(maskl+32-(index & 0x1F))); // mask with FF at index position + ymm = selectb(mask,broad,ymm); +#endif + return *this; + } + // Member function extract a single element from vector + int8_t extract(int index) const { +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + __m256i x = _mm256_maskz_compress_epi8(__mmask32(1u << index), ymm); + return (int8_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(x)); +#else + int8_t x[32]; + store(x); + return x[index & 0x1F]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int8_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec16c: + Vec16c get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec16c get_high() const { + return _mm256_extracti128_si256(ymm,1); + } + static constexpr int size() { + return 32; + } + static constexpr int elementtype() { + return 4; + } +}; + + +/***************************************************************************** +* +* Vec32cb: Vector of 32 Booleans for use with Vec32c and Vec32uc +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors +class Vec32cb : public Vec32c { +public: + // Default constructor: + Vec32cb(){ + } + // Constructor to build from all elements: + /* + Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, + bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15, + bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23, + bool x24, bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31) : + Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), + -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15), + -int8_t(x16), -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23), + -int8_t(x24), -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31)) + {} */ + // Constructor to convert from type __m256i used in intrinsics: + Vec32cb(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec32cb & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec32cb(bool b) : Vec32c(-int8_t(b)) { + } + // Constructor to convert from Vec32c + Vec32cb(Vec32c const a) { + ymm = a; + } + // Assignment operator to broadcast scalar value: + 
Vec32cb & operator = (bool b) { + *this = Vec32cb(b); + return *this; + } + // Constructor to build from two Vec16cb: + Vec32cb(Vec16cb const a0, Vec16cb const a1) : Vec32c(Vec16c(a0), Vec16c(a1)) { + } + // Member functions to split into two Vec16c: + Vec16cb get_low() const { + return Vec16cb(Vec32c::get_low()); + } + Vec16cb get_high() const { + return Vec16cb(Vec32c::get_high()); + } + Vec32cb & insert (int index, bool a) { + Vec32c::insert(index, -(int8_t)a); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec32c::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec32cb & load_bits(uint32_t a) { + __m256i b1 = _mm256_set1_epi32((int32_t)~a); // broadcast a. Invert because we have no compare-not-equal + __m256i m1 = constant8ui<0,0,0x01010101,0x01010101,0x02020202,0x02020202,0x03030303,0x03030303>(); + __m256i c1 = _mm256_shuffle_epi8(b1, m1); // get right byte in each position + __m256i m2 = constant8ui<0x08040201,0x80402010,0x08040201,0x80402010,0x08040201,0x80402010,0x08040201,0x80402010>(); + __m256i d1 = _mm256_and_si256(c1, m2); // isolate one bit in each byte + ymm = _mm256_cmpeq_epi8(d1,_mm256_setzero_si256());// compare with 0 + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec32cb(int b) = delete; + Vec32cb & operator = (int x) = delete; +}; +#else + +typedef Vec32b Vec32cb; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Define operators and functions for Vec32b or Vec32cb +* +*****************************************************************************/ + +// vector operator & : bitwise and +static inline Vec32cb operator & (Vec32cb const a, Vec32cb const b) { +#if INSTRSET >= 10 // compact boolean vectors + return __mmask32(__mmask32(a) & __mmask32(b)); // _kand_mask32 not defined in all compilers +#else + return Vec32c(Vec256b(a) & Vec256b(b)); +#endif +} +static inline Vec32cb operator && (Vec32cb const a, Vec32cb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec32cb & operator &= (Vec32cb & a, Vec32cb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec32cb operator | (Vec32cb const a, Vec32cb const b) { +#if INSTRSET >= 10 // compact boolean vectors + return __mmask32(__mmask32(a) | __mmask32(b)); // _kor_mask32 +#else + return Vec32c(Vec256b(a) | Vec256b(b)); +#endif +} +static inline Vec32cb operator || (Vec32cb const a, Vec32cb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec32cb & operator |= (Vec32cb & a, Vec32cb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec32cb operator ^ (Vec32cb const a, Vec32cb const b) { +#if INSTRSET >= 10 // compact boolean vectors + return __mmask32(__mmask32(a) ^ __mmask32(b)); // _kxor_mask32 +#else + return Vec32c(Vec256b(a) ^ Vec256b(b)); +#endif +} +// vector operator ^= : bitwise xor +static inline Vec32cb & operator ^= (Vec32cb & a, Vec32cb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec32cb operator == (Vec32cb const a, Vec32cb const b) { +#if INSTRSET >= 10 // compact 
boolean vectors + return __mmask32(__mmask32(a) ^ ~__mmask32(b)); // _kxnor_mask32 +#else + return Vec32c(a ^ (~b)); +#endif +} + +// vector operator != : xor +static inline Vec32cb operator != (Vec32cb const a, Vec32cb const b) { + return Vec32cb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec32cb operator ~ (Vec32cb const a) { +#if INSTRSET >= 10 // compact boolean vectors + return __mmask32(~ __mmask32(a)); // _knot_mask32 +#else + return Vec32c( ~ Vec256b(a)); +#endif +} + +// vector operator ! : element not +static inline Vec32cb operator ! (Vec32cb const a) { + return ~ a; +} + +// vector function andnot +static inline Vec32cb andnot (Vec32cb const a, Vec32cb const b) { +#if INSTRSET >= 10 // compact boolean vectors + return __mmask32(~__mmask32(b) & __mmask32(a)); // _kandn_mask32 +#else + return Vec32c(andnot(Vec256b(a), Vec256b(b))); +#endif +} + +#if INSTRSET >= 10 // compact boolean vectors + +// horizontal_and. Returns true if all elements are true +static inline bool horizontal_and(Vec32b const a) { + return __mmask32(a) == 0xFFFFFFFF; +} + +// horizontal_or. Returns true if at least one element is true +static inline bool horizontal_or(Vec32b const a) { + return __mmask32(a) != 0; +} + +// fix bug in gcc version 70400 header file: _mm256_cmp_epi8_mask returns 16 bit mask, should be 32 bit +template +static inline __mmask32 _mm256_cmp_epi8_mask_fix(__m256i a, __m256i b) { +#if defined (GCC_VERSION) && GCC_VERSION < 70900 && ! defined (__INTEL_COMPILER) + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)a, (__v32qi)b, i, (__mmask32)(-1)); +#else + return _mm256_cmp_epi8_mask(a, b, i); +#endif +} + +template +static inline __mmask32 _mm256_cmp_epu8_mask_fix(__m256i a, __m256i b) { +#if defined (GCC_VERSION) && GCC_VERSION < 70900 && ! defined (__INTEL_COMPILER) + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)a, (__v32qi)b, i, (__mmask32)(-1)); +#else + return _mm256_cmp_epu8_mask(a, b, i); +#endif +} + +#endif + + +/***************************************************************************** +* +* Operators for Vec32c +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec32c operator + (Vec32c const a, Vec32c const b) { + return _mm256_add_epi8(a, b); +} +// vector operator += : add +static inline Vec32c & operator += (Vec32c & a, Vec32c const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec32c operator ++ (Vec32c & a, int) { + Vec32c a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec32c & operator ++ (Vec32c & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec32c operator - (Vec32c const a, Vec32c const b) { + return _mm256_sub_epi8(a, b); +} +// vector operator - : unary minus +static inline Vec32c operator - (Vec32c const a) { + return _mm256_sub_epi8(_mm256_setzero_si256(), a); +} +// vector operator -= : add +static inline Vec32c & operator -= (Vec32c & a, Vec32c const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec32c operator -- (Vec32c & a, int) { + Vec32c a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec32c & operator -- (Vec32c & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec32c operator * (Vec32c const a, Vec32c const b) { + // There is no 8-bit multiply in AVX2. 
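+ // (Illustrative note: the intended scalar semantics are r[i] = int8_t(a[i] * b[i]) for each
+ // of the 32 lanes, i.e. only the low 8 bits of each product are kept.)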
Split into two 16-bit multiplications + __m256i aodd = _mm256_srli_epi16(a,8); // odd numbered elements of a + __m256i bodd = _mm256_srli_epi16(b,8); // odd numbered elements of b + __m256i muleven = _mm256_mullo_epi16(a,b); // product of even numbered elements + __m256i mulodd = _mm256_mullo_epi16(aodd,bodd); // product of odd numbered elements + mulodd = _mm256_slli_epi16(mulodd,8); // put odd numbered elements back in place +#if INSTRSET >= 10 // AVX512VL + AVX512BW + return _mm256_mask_mov_epi8(mulodd, 0x55555555, muleven); +#else + __m256i mask = _mm256_set1_epi32(0x00FF00FF); // mask for even positions + __m256i product = selectb(mask,muleven,mulodd); // interleave even and odd + return product; +#endif +} + +// vector operator *= : multiply +static inline Vec32c & operator *= (Vec32c & a, Vec32c const b) { + a = a * b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec32c operator << (Vec32c const a, int b) { + uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out + __m256i am = _mm256_and_si256(a,_mm256_set1_epi8((char)mask));// remove bits that will overflow + __m256i res = _mm256_sll_epi16(am,_mm_cvtsi32_si128(b)); // 16-bit shifts + return res; +} + +// vector operator <<= : shift left +static inline Vec32c & operator <<= (Vec32c & a, int b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic all elements +static inline Vec32c operator >> (Vec32c const a, int b) { + __m256i aeven = _mm256_slli_epi16(a,8); // even numbered elements of a. get sign bit in position + aeven = _mm256_sra_epi16(aeven,_mm_cvtsi32_si128(b+8)); // shift arithmetic, back to position + __m256i aodd = _mm256_sra_epi16(a,_mm_cvtsi32_si128(b)); // shift odd numbered elements arithmetic +#if INSTRSET >= 10 // AVX512VL + AVX512BW + return _mm256_mask_mov_epi8(aodd, 0x55555555, aeven); +#else + __m256i mask = _mm256_set1_epi32(0x00FF00FF); // mask for even positions + __m256i res = selectb(mask,aeven,aodd); // interleave even and odd + return res; +#endif +} + +// vector operator >>= : shift right artihmetic +static inline Vec32c & operator >>= (Vec32c & a, int b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec32cb operator == (Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + //return _mm256_cmp_epi8_mask (a, b, 0); + return _mm256_cmp_epi8_mask_fix<0> (a, b); +#else + return _mm256_cmpeq_epi8(a,b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec32cb operator != (Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi8_mask_fix<4> (a, b); +#else + return Vec32cb(Vec32c(~(a == b))); +#endif +} + +// vector operator > : returns true for elements for which a > b (signed) +static inline Vec32cb operator > (Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi8_mask_fix<6> (a, b); +#else + return _mm256_cmpgt_epi8(a,b); +#endif +} + +// vector operator < : returns true for elements for which a < b (signed) +static inline Vec32cb operator < (Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi8_mask_fix<1> (a, b); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec32cb operator >= (Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // 
compact boolean vectors + return _mm256_cmp_epi8_mask_fix<5> (a, b); +#else + return Vec32cb(Vec32c(~(b > a))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec32cb operator <= (Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi8_mask_fix<2> (a, b); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec32c operator & (Vec32c const a, Vec32c const b) { + return Vec32c(Vec256b(a) & Vec256b(b)); +} +static inline Vec32c operator && (Vec32c const a, Vec32c const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec32c & operator &= (Vec32c & a, Vec32c const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec32c operator | (Vec32c const a, Vec32c const b) { + return Vec32c(Vec256b(a) | Vec256b(b)); +} +static inline Vec32c operator || (Vec32c const a, Vec32c const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec32c & operator |= (Vec32c & a, Vec32c const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec32c operator ^ (Vec32c const a, Vec32c const b) { + return Vec32c(Vec256b(a) ^ Vec256b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec32c & operator ^= (Vec32c & a, Vec32c const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec32c operator ~ (Vec32c const a) { + return Vec32c( ~ Vec256b(a)); +} + +// vector operator ! : logical not, returns true for elements == 0 +static inline Vec32cb operator ! (Vec32c const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi8_mask_fix<0> (a, _mm256_setzero_si256()); +#else + return _mm256_cmpeq_epi8(a,_mm256_setzero_si256()); +#endif +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec32c select (Vec32cb const s, Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi8(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec32c if_add (Vec32cb const f, Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi8 (a, f, a, b); +#else + return a + (Vec32c(f) & b); +#endif +} + +// Conditional subtract +static inline Vec32c if_sub (Vec32cb const f, Vec32c const a, Vec32c const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi8 (a, f, a, b); +#else + return a - (Vec32c(f) & b); +#endif +} + +// Conditional multiply +static inline Vec32c if_mul (Vec32cb const f, Vec32c const a, Vec32c const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int8_t horizontal_add (Vec32c const a) { + __m256i sum1 = _mm256_sad_epu8(a,_mm256_setzero_si256()); + __m256i sum2 = _mm256_shuffle_epi32(sum1,2); + __m256i sum3 = _mm256_add_epi16(sum1,sum2); + __m128i sum4 = _mm256_extracti128_si256(sum3,1); + __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); + int8_t sum6 = (int8_t)_mm_cvtsi128_si32(sum5); // truncate to 8 bits + return sum6; // sign extend to 32 bits +} + +// Horizontal add extended: Calculates the sum of all vector elements. 
+// Each element is sign-extended before addition to avoid overflow +static inline int32_t horizontal_add_x (Vec32c const a) { + __m256i aeven = _mm256_slli_epi16(a,8); // even numbered elements of a. get sign bit in position + aeven = _mm256_srai_epi16(aeven,8); // sign extend even numbered elements + __m256i aodd = _mm256_srai_epi16(a,8); // sign extend odd numbered elements + __m256i sum1 = _mm256_add_epi16(aeven,aodd); // add even and odd elements + __m128i sum2 = _mm_add_epi16(_mm256_extracti128_si256(sum1,1),_mm256_castsi256_si128(sum1)); + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding +#if false + __m128i sum3 = _mm_hadd_epi16(sum2,sum2); + __m128i sum4 = _mm_hadd_epi16(sum3,sum3); + __m128i sum5 = _mm_hadd_epi16(sum4,sum4); +#else + __m128i sum3 = _mm_add_epi16(sum2,_mm_unpackhi_epi64(sum2,sum2)); + __m128i sum4 = _mm_add_epi16(sum3,_mm_shuffle_epi32(sum3,1)); + __m128i sum5 = _mm_add_epi16(sum4,_mm_shufflelo_epi16(sum4,1)); +#endif + int16_t sum6 = (int16_t)_mm_cvtsi128_si32(sum5); // 16 bit sum + return sum6; // sign extend to 32 bits +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec32c add_saturated(Vec32c const a, Vec32c const b) { + return _mm256_adds_epi8(a, b); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec32c sub_saturated(Vec32c const a, Vec32c const b) { + return _mm256_subs_epi8(a, b); +} + +// function max: a > b ? a : b +static inline Vec32c max(Vec32c const a, Vec32c const b) { + return _mm256_max_epi8(a,b); +} + +// function min: a < b ? a : b +static inline Vec32c min(Vec32c const a, Vec32c const b) { + return _mm256_min_epi8(a,b); +} + +// function abs: a >= 0 ? a : -a +static inline Vec32c abs(Vec32c const a) { + return _mm256_abs_epi8(a); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec32c abs_saturated(Vec32c const a) { + __m256i absa = abs(a); // abs(a) +#if INSTRSET >= 10 + return _mm256_min_epu8(absa, Vec32c(0x7F)); +#else + __m256i overfl = _mm256_cmpgt_epi8(_mm256_setzero_si256(), absa);// 0 > a + return _mm256_add_epi8(absa, overfl); // subtract 1 if 0x80 +#endif +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec32c rotate_left(Vec32c const a, int b) { + uint8_t mask = 0xFFu << b; // mask off overflow bits + __m256i m = _mm256_set1_epi8(mask); + __m128i bb = _mm_cvtsi32_si128(b & 7); // b modulo 8 + __m128i mbb = _mm_cvtsi32_si128((- b) & 7); // 8-b modulo 8 + __m256i left = _mm256_sll_epi16(a, bb); // a << b + __m256i right = _mm256_srl_epi16(a, mbb); // a >> 8-b + left = _mm256_and_si256(m, left); // mask off overflow bits + right = _mm256_andnot_si256(m, right); + return _mm256_or_si256(left, right); // combine left and right shifted bits +} + + +/***************************************************************************** +* +* Vector of 16 8-bit unsigned integers +* +*****************************************************************************/ + +class Vec32uc : public Vec32c { +public: + // Default constructor: + Vec32uc(){ + } + // Constructor to broadcast the same value into all elements: + Vec32uc(uint32_t i) { + ymm = _mm256_set1_epi8((char)i); + } + // Constructor to build from all elements: + Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, + uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, + 
uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23, + uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31) { + ymm = _mm256_setr_epi8((int8_t)i0, (int8_t)i1, (int8_t)i2, (int8_t)i3, (int8_t)i4, (int8_t)i5, (int8_t)i6, (int8_t)i7, (int8_t)i8, (int8_t)i9, (int8_t)i10, (int8_t)i11, (int8_t)i12, (int8_t)i13, (int8_t)i14, (int8_t)i15, + (int8_t)i16, (int8_t)i17, (int8_t)i18, (int8_t)i19, (int8_t)i20, (int8_t)i21, (int8_t)i22, (int8_t)i23, (int8_t)i24, (int8_t)i25, (int8_t)i26, (int8_t)i27, (int8_t)i28, (int8_t)i29, (int8_t)i30, (int8_t)i31); + } + // Constructor to build from two Vec16uc: + Vec32uc(Vec16uc const a0, Vec16uc const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec32uc(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec32uc & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec32uc & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec32uc & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Member function to change a single element in vector + Vec32uc const insert(int index, uint8_t value) { + Vec32c::insert(index, (int8_t)value); + return *this; + } + // Member function extract a single element from vector + uint8_t extract(int index) const { + return (uint8_t)Vec32c::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
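+ // For example (illustrative only): if Vec32uc v holds pixel bytes, uint8_t x = v[3] reads
+ // lane 3; writing a lane goes through v.insert(3, newValue).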
+ uint8_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec16uc: + Vec16uc get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec16uc get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int elementtype() { + return 5; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec32uc operator + (Vec32uc const a, Vec32uc const b) { + return Vec32uc (Vec32c(a) + Vec32c(b)); +} + +// vector operator - : subtract +static inline Vec32uc operator - (Vec32uc const a, Vec32uc const b) { + return Vec32uc (Vec32c(a) - Vec32c(b)); +} + +// vector operator * : multiply +static inline Vec32uc operator * (Vec32uc const a, Vec32uc const b) { + return Vec32uc (Vec32c(a) * Vec32c(b)); +} + +// vector operator << : shift left all elements +static inline Vec32uc operator << (Vec32uc const a, uint32_t b) { + uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out + __m256i am = _mm256_and_si256(a,_mm256_set1_epi8((char)mask));// remove bits that will overflow + __m256i res = _mm256_sll_epi16(am,_mm_cvtsi32_si128((int)b)); // 16-bit shifts + return res; +} + +// vector operator << : shift left all elements +static inline Vec32uc operator << (Vec32uc const a, int32_t b) { + return a << (uint32_t)b; +} + +// vector operator >> : shift right logical all elements +static inline Vec32uc operator >> (Vec32uc const a, uint32_t b) { + uint32_t mask = (uint32_t)0xFF << (uint32_t)b; // mask to remove bits that are shifted out + __m256i am = _mm256_and_si256(a,_mm256_set1_epi8((char)mask));// remove bits that will overflow + __m256i res = _mm256_srl_epi16(am,_mm_cvtsi32_si128((int)b)); // 16-bit shifts + return res; +} + +// vector operator >> : shift right logical all elements +static inline Vec32uc operator >> (Vec32uc const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right artihmetic +static inline Vec32uc & operator >>= (Vec32uc & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec32cb operator >= (Vec32uc const a, Vec32uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + //return _mm256_cmp_epu8_mask (a, b, 5); + return _mm256_cmp_epu8_mask_fix<5> (a, b); +#else + return _mm256_cmpeq_epi8(_mm256_max_epu8(a,b), a); // a == max(a,b) +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec32cb operator <= (Vec32uc const a, Vec32uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu8_mask_fix<2> (a, b); +#else + return b >= a; +#endif +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec32cb operator > (Vec32uc const a, Vec32uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu8_mask_fix<6> (a, b); +#else + return Vec32cb(Vec32c(~(b >= a))); +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec32cb operator < (Vec32uc const a, Vec32uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu8_mask_fix<1> (a, b); +#else + return b > a; +#endif +} + +// vector operator & : bitwise and +static inline Vec32uc operator & (Vec32uc const a, Vec32uc const b) { + return Vec32uc(Vec256b(a) & Vec256b(b)); +} +static inline Vec32uc operator && (Vec32uc const a, Vec32uc const b) { + return a & b; 
+} + +// vector operator | : bitwise or +static inline Vec32uc operator | (Vec32uc const a, Vec32uc const b) { + return Vec32uc(Vec256b(a) | Vec256b(b)); +} +static inline Vec32uc operator || (Vec32uc const a, Vec32uc const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec32uc operator ^ (Vec32uc const a, Vec32uc const b) { + return Vec32uc(Vec256b(a) ^ Vec256b(b)); +} + +// vector operator ~ : bitwise not +static inline Vec32uc operator ~ (Vec32uc const a) { + return Vec32uc( ~ Vec256b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec32uc select (Vec32cb const s, Vec32uc const a, Vec32uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi8(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec32uc if_add (Vec32cb const f, Vec32uc const a, Vec32uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi8 (a, f, a, b); +#else + return a + (Vec32uc(f) & b); +#endif +} + +// Conditional subtract +static inline Vec32uc if_sub (Vec32cb const f, Vec32uc const a, Vec32uc const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi8 (a, f, a, b); +#else + return a - (Vec32uc(f) & b); +#endif +} + +// Conditional multiply +static inline Vec32uc if_mul (Vec32cb const f, Vec32uc const a, Vec32uc const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +// (Note: horizontal_add_x(Vec32uc) is slightly faster) +static inline uint8_t horizontal_add (Vec32uc const a) { + __m256i sum1 = _mm256_sad_epu8(a,_mm256_setzero_si256()); + __m256i sum2 = _mm256_shuffle_epi32(sum1,2); + __m256i sum3 = _mm256_add_epi16(sum1,sum2); + __m128i sum4 = _mm256_extracti128_si256(sum3,1); + __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); + uint8_t sum6 = (uint8_t)_mm_cvtsi128_si32(sum5); // truncate to 8 bits + return sum6; // zero extend to 32 bits +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Each element is zero-extended before addition to avoid overflow +static inline uint32_t horizontal_add_x (Vec32uc const a) { + __m256i sum1 = _mm256_sad_epu8(a,_mm256_setzero_si256()); + __m256i sum2 = _mm256_shuffle_epi32(sum1,2); + __m256i sum3 = _mm256_add_epi16(sum1,sum2); + __m128i sum4 = _mm256_extracti128_si256(sum3,1); + __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); + return (uint32_t)_mm_cvtsi128_si32(sum5); +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec32uc add_saturated(Vec32uc const a, Vec32uc const b) { + return _mm256_adds_epu8(a, b); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec32uc sub_saturated(Vec32uc const a, Vec32uc const b) { + return _mm256_subs_epu8(a, b); +} + +// function max: a > b ? a : b +static inline Vec32uc max(Vec32uc const a, Vec32uc const b) { + return _mm256_max_epu8(a,b); +} + +// function min: a < b ? 
a : b +static inline Vec32uc min(Vec32uc const a, Vec32uc const b) { + return _mm256_min_epu8(a,b); +} + + +/***************************************************************************** +* +* Vector of 16 16-bit signed integers +* +*****************************************************************************/ + +class Vec16s : public Vec256b { +public: + // Default constructor: + Vec16s() { + } + // Constructor to broadcast the same value into all elements: + Vec16s(int i) { + ymm = _mm256_set1_epi16((int16_t)i); + } + // Constructor to build from all elements: + Vec16s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7, + int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15) { + ymm = _mm256_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 ); + } + // Constructor to build from two Vec8s: + Vec16s(Vec8s const a0, Vec8s const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec16s(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec16s & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Constructor to convert from type Vec256b used in emulation: + Vec16s(Vec256b const & x) { + ymm = x; + } + // Type cast operator to convert to __m256i used in intrinsics + operator __m256i() const { + return ymm; + } + // Member function to load from array (unaligned) + Vec16s & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec16s & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Member function to load 16 unsigned 8-bit integers from array + Vec16s & load_16uc(void const * p) { + ymm = _mm256_cvtepu8_epi16(Vec16uc().load(p)); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec16s & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_maskz_loadu_epi16(__mmask16((1u << n) - 1), p); +#else + if (n <= 0) { + *this = 0; + } + else if (n <= 8) { + *this = Vec16s(Vec8s().load_partial(n, p), 0); + } + else if (n < 16) { + *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t const*)p+8)); + } + else { + load(p); + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + AVX512BW + _mm256_mask_storeu_epi16(p, __mmask16((1u << n) - 1), ymm); +#else + if (n <= 0) { + return; + } + else if (n <= 8) { + get_low().store_partial(n, p); + } + else if (n < 16) { + get_low().store(p); + get_high().store_partial(n-8, (int16_t*)p+8); + } + else { + store(p); + } +#endif + } + // cut off vector to n elements. 
The last 16-n elements are set to zero + Vec16s & cutoff(int n) { +#if INSTRSET >= 10 + ymm = _mm256_maskz_mov_epi16(__mmask16((1u << n) - 1), ymm); +#else + *this = Vec16s(Vec32c(*this).cutoff(n * 2)); +#endif + return *this; + } + // Member function to change a single element in vector + Vec16s const insert(int index, int16_t value) { +#if INSTRSET >= 10 + ymm = _mm256_mask_set1_epi16(ymm, __mmask16(1u << index), value); +#else + const int16_t m[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, -1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + __m256i mask = Vec256b().load(m + 16 - (index & 0x0F)); + __m256i broad = _mm256_set1_epi16(value); + ymm = selectb(mask, broad, ymm); +#endif + return *this; + } + // Member function extract a single element from vector + int16_t extract(int index) const { +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + __m256i x = _mm256_maskz_compress_epi16(__mmask16(1u << index), ymm); + return (int16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(x)); +#else + int16_t x[16]; // find faster version + store(x); + return x[index & 0x0F]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int16_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec8s: + Vec8s get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec8s get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int size() { + return 16; + } + static constexpr int elementtype() { + return 6; + } +}; + + +/***************************************************************************** +* +* Vec16sb: Vector of 16 Booleans for use with Vec16s and Vec16us +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +class Vec16sb : public Vec16s { +public: + // Default constructor: + Vec16sb() { + } + // Constructor to build from all elements: + /* + Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, + bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) : + Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7), + -int16_t(x8), -int16_t(x9), -int16_t(x10), -int16_t(x11), -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15)) + {} */ + // Constructor to convert from type __m256i used in intrinsics: + Vec16sb(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec16sb & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec16sb(bool b) : Vec16s(-int16_t(b)) { + } + // Constructor to convert from type Vec256b used in emulation: + Vec16sb(Vec256b const & x) : Vec16s(x) { + } + // Assignment operator to broadcast scalar value: + Vec16sb & operator = (bool b) { + *this = Vec16sb(b); + return *this; + } + // Constructor to build from two Vec8sb: + Vec16sb(Vec8sb const a0, Vec8sb const a1) : Vec16s(Vec8s(a0), Vec8s(a1)) { + } + Vec8sb get_low() const { + return Vec8sb(Vec16s::get_low()); + } + Vec8sb get_high() const { + return Vec8sb(Vec16s::get_high()); + } + Vec16sb & insert(int index, bool a) { + Vec16s::insert(index, -(int)a); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec16s::extract(index) != 0; + } + // Extract a single element. 
Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec16sb & load_bits(uint16_t a) { + __m256i b1 = _mm256_set1_epi16((int16_t)a); // broadcast a + __m256i m1 = constant8ui<0,0,0,0,0x00010001,0x00010001,0x00010001,0x00010001>(); + __m256i c1 = _mm256_shuffle_epi8(b1, m1); // get right byte in each position + __m256i m2 = constant8ui<0x00020001,0x00080004,0x00200010,0x00800040,0x00020001,0x00080004,0x00200010,0x00800040>(); + __m256i d1 = _mm256_and_si256(c1, m2); // isolate one bit in each byte + ymm = _mm256_cmpgt_epi16(d1, _mm256_setzero_si256()); // compare with 0 + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec16sb(int b) = delete; + Vec16sb & operator = (int x) = delete; +}; + +#else + +typedef Vec16b Vec16sb; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Define operators for Vec16sb +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec16sb operator & (Vec16sb const a, Vec16sb const b) { + return Vec16sb(Vec256b(a) & Vec256b(b)); +} +static inline Vec16sb operator && (Vec16sb const a, Vec16sb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec16sb & operator &= (Vec16sb & a, Vec16sb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec16sb operator | (Vec16sb const a, Vec16sb const b) { + return Vec16sb(Vec256b(a) | Vec256b(b)); +} +static inline Vec16sb operator || (Vec16sb const a, Vec16sb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec16sb & operator |= (Vec16sb & a, Vec16sb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16sb operator ^ (Vec16sb const a, Vec16sb const b) { + return Vec16sb(Vec256b(a) ^ Vec256b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec16sb & operator ^= (Vec16sb & a, Vec16sb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec16sb operator == (Vec16sb const a, Vec16sb const b) { + return Vec16sb(a ^ Vec16sb(~b)); +} + +// vector operator != : xor +static inline Vec16sb operator != (Vec16sb const a, Vec16sb const b) { + return Vec16sb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec16sb operator ~ (Vec16sb const a) { + return Vec16sb( ~ Vec256b(a)); +} + +// vector operator ! : element not +static inline Vec16sb operator ! 
(Vec16sb const a) { + return ~ a; +} + +// vector function andnot +static inline Vec16sb andnot (Vec16sb const a, Vec16sb const b) { + return Vec16sb(andnot(Vec256b(a), Vec256b(b))); +} + +#endif + +/***************************************************************************** +* +* Operators for Vec16s +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec16s operator + (Vec16s const a, Vec16s const b) { + return _mm256_add_epi16(a, b); +} +// vector operator += : add +static inline Vec16s & operator += (Vec16s & a, Vec16s const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec16s operator ++ (Vec16s & a, int) { + Vec16s a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec16s & operator ++ (Vec16s & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec16s operator - (Vec16s const a, Vec16s const b) { + return _mm256_sub_epi16(a, b); +} +// vector operator - : unary minus +static inline Vec16s operator - (Vec16s const a) { + return _mm256_sub_epi16(_mm256_setzero_si256(), a); +} +// vector operator -= : subtract +static inline Vec16s & operator -= (Vec16s & a, Vec16s const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec16s operator -- (Vec16s & a, int) { + Vec16s a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec16s & operator -- (Vec16s & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec16s operator * (Vec16s const a, Vec16s const b) { + return _mm256_mullo_epi16(a, b); +} +// vector operator *= : multiply +static inline Vec16s & operator *= (Vec16s & a, Vec16s const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer. 
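+ // (No AVX2 instruction performs element-wise integer division; it is typically emulated by
+ // multiplying with a precomputed reciprocal and shifting.)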
See bottom of file + + +// vector operator << : shift left +static inline Vec16s operator << (Vec16s const a, int b) { + return _mm256_sll_epi16(a,_mm_cvtsi32_si128(b)); +} +// vector operator <<= : shift left +static inline Vec16s & operator <<= (Vec16s & a, int b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec16s operator >> (Vec16s const a, int b) { + return _mm256_sra_epi16(a,_mm_cvtsi32_si128(b)); +} +// vector operator >>= : shift right arithmetic +static inline Vec16s & operator >>= (Vec16s & a, int b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec16sb operator == (Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi16_mask (a, b, 0); +#else + return _mm256_cmpeq_epi16(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec16sb operator != (Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi16_mask (a, b, 4); +#else + return Vec16sb(Vec16s(~(a == b))); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec16sb operator > (Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi16_mask (a, b, 6); +#else + return _mm256_cmpgt_epi16(a, b); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec16sb operator < (Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi16_mask (a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec16sb operator >= (Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi16_mask (a, b, 5); +#else + return Vec16sb(Vec16s(~(b > a))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec16sb operator <= (Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi16_mask (a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec16s operator & (Vec16s const a, Vec16s const b) { + return Vec16s(Vec256b(a) & Vec256b(b)); +} +static inline Vec16s operator && (Vec16s const a, Vec16s const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec16s & operator &= (Vec16s & a, Vec16s const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec16s operator | (Vec16s const a, Vec16s const b) { + return Vec16s(Vec256b(a) | Vec256b(b)); +} +static inline Vec16s operator || (Vec16s const a, Vec16s const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec16s & operator |= (Vec16s & a, Vec16s const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16s operator ^ (Vec16s const a, Vec16s const b) { + return Vec16s(Vec256b(a) ^ Vec256b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec16s & operator ^= (Vec16s & a, Vec16s const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec16s operator ~ (Vec16s const a) { + return Vec16s( ~ Vec256b(a)); +} + +// vector operator ! : logical not, returns true for elements == 0 +static inline Vec16sb operator ! 
(Vec16s const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi16_mask (a, _mm256_setzero_si256(), 0); +#else + return _mm256_cmpeq_epi16(a,_mm256_setzero_si256()); +#endif +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16s select (Vec16sb const s, Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi16(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec16s if_add (Vec16sb const f, Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi16 (a, f, a, b); +#else + return a + (Vec16s(f) & b); +#endif +} + +// Conditional subtract +static inline Vec16s if_sub (Vec16sb const f, Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi16 (a, f, a, b); +#else + return a - (Vec16s(f) & b); +#endif +} + +// Conditional multiply +static inline Vec16s if_mul (Vec16sb const f, Vec16s const a, Vec16s const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mullo_epi16 (a, f, a, b); +#else + return select(f, a*b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int16_t horizontal_add (Vec16s const a) { + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding + __m128i sum1 = _mm_add_epi16(_mm256_extracti128_si256(a,1),_mm256_castsi256_si128(a)); + __m128i sum2 = _mm_add_epi16(sum1,_mm_unpackhi_epi64(sum1,sum1)); + __m128i sum3 = _mm_add_epi16(sum2,_mm_shuffle_epi32(sum2,1)); + __m128i sum4 = _mm_add_epi16(sum3,_mm_shufflelo_epi16(sum3,1)); + return (int16_t)_mm_cvtsi128_si32(sum4); // truncate to 16 bits +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Elements are sign extended before adding to avoid overflow +static inline int32_t horizontal_add_x (Vec16s const a) { + __m256i aeven = _mm256_slli_epi32(a,16); // even numbered elements of a. get sign bit in position + aeven = _mm256_srai_epi32(aeven,16); // sign extend even numbered elements + __m256i aodd = _mm256_srai_epi32(a,16); // sign extend odd numbered elements + __m256i sum1 = _mm256_add_epi32(aeven,aodd); // add even and odd elements + __m128i sum2 = _mm_add_epi32(_mm256_extracti128_si256(sum1,1),_mm256_castsi256_si128(sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_unpackhi_epi64(sum2,sum2)); + __m128i sum4 = _mm_add_epi32(sum3,_mm_shuffle_epi32(sum3,1)); + return (int16_t)_mm_cvtsi128_si32(sum4); // truncate to 16 bits +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec16s add_saturated(Vec16s const a, Vec16s const b) { + return _mm256_adds_epi16(a, b); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec16s sub_saturated(Vec16s const a, Vec16s const b) { + return _mm256_subs_epi16(a, b); +} + +// function max: a > b ? a : b +static inline Vec16s max(Vec16s const a, Vec16s const b) { + return _mm256_max_epi16(a,b); +} + +// function min: a < b ? a : b +static inline Vec16s min(Vec16s const a, Vec16s const b) { + return _mm256_min_epi16(a,b); +} + +// function abs: a >= 0 ? 
a : -a +static inline Vec16s abs(Vec16s const a) { + return _mm256_abs_epi16(a); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec16s abs_saturated(Vec16s const a) { +#if INSTRSET >= 10 + return _mm256_min_epu16(abs(a), Vec16s(0x7FFF)); +#else + __m256i absa = abs(a); // abs(a) + __m256i overfl = _mm256_srai_epi16(absa,15); // sign + return _mm256_add_epi16(absa,overfl); // subtract 1 if 0x8000 +#endif +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec16s rotate_left(Vec16s const a, int b) { + __m256i left = _mm256_sll_epi16(a,_mm_cvtsi32_si128(b & 0x0F)); // a << b + __m256i right = _mm256_srl_epi16(a,_mm_cvtsi32_si128((-b) & 0x0F)); // a >> (16 - b) + return _mm256_or_si256(left,right); // or +} + + +/***************************************************************************** +* +* Vector of 16 16-bit unsigned integers +* +*****************************************************************************/ + +class Vec16us : public Vec16s { +public: + // Default constructor: + Vec16us(){ + } + // Constructor to broadcast the same value into all elements: + Vec16us(uint32_t i) { + ymm = _mm256_set1_epi16((int16_t)i); + } + // Constructor to build from all elements: + Vec16us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7, + uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15) { + ymm = _mm256_setr_epi16((int16_t)i0, (int16_t)i1, (int16_t)i2, (int16_t)i3, (int16_t)i4, (int16_t)i5, (int16_t)i6, (int16_t)i7, + (int16_t)i8, (int16_t)i9, (int16_t)i10, (int16_t)i11, (int16_t)i12, (int16_t)i13, (int16_t)i14, (int16_t)i15); + } + // Constructor to build from two Vec8us: + Vec16us(Vec8us const a0, Vec8us const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec16us(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec16us & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec16us & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec16us & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Member function to change a single element in vector + Vec16us const insert(int index, uint16_t value) { + Vec16s::insert(index, (int16_t)value); + return *this; + } + // Member function extract a single element from vector + uint16_t extract(int index) const { + return (uint16_t)Vec16s::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ uint16_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec8us: + Vec8us get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec8us get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int elementtype() { + return 7; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec16us operator + (Vec16us const a, Vec16us const b) { + return Vec16us (Vec16s(a) + Vec16s(b)); +} + +// vector operator - : subtract +static inline Vec16us operator - (Vec16us const a, Vec16us const b) { + return Vec16us (Vec16s(a) - Vec16s(b)); +} + +// vector operator * : multiply +static inline Vec16us operator * (Vec16us const a, Vec16us const b) { + return Vec16us (Vec16s(a) * Vec16s(b)); +} + +// vector operator / : divide +// See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec16us operator >> (Vec16us const a, uint32_t b) { + return _mm256_srl_epi16(a,_mm_cvtsi32_si128((int)b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec16us operator >> (Vec16us const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right artihmetic +static inline Vec16us & operator >>= (Vec16us & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec16us operator << (Vec16us const a, uint32_t b) { + return _mm256_sll_epi16(a,_mm_cvtsi32_si128((int)b)); +} + +// vector operator << : shift left all elements +static inline Vec16us operator << (Vec16us const a, int32_t b) { + return a << (uint32_t)b; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec16sb operator >= (Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu16_mask (a, b, 5); +#else + __m256i max_ab = _mm256_max_epu16(a,b); // max(a,b), unsigned + return _mm256_cmpeq_epi16(a,max_ab); // a == max(a,b) +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec16sb operator <= (Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu16_mask (a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec16sb operator > (Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu16_mask (a, b, 6); +#else + return Vec16sb(Vec16s(~(b >= a))); +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec16sb operator < (Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu16_mask (a, b, 1); +#else + return b > a; +#endif +} + +// vector operator & : bitwise and +static inline Vec16us operator & (Vec16us const a, Vec16us const b) { + return Vec16us(Vec256b(a) & Vec256b(b)); +} +static inline Vec16us operator && (Vec16us const a, Vec16us const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec16us operator | (Vec16us const a, Vec16us const b) { + return Vec16us(Vec256b(a) | Vec256b(b)); +} +static inline Vec16us operator || (Vec16us const a, Vec16us const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec16us operator ^ (Vec16us const a, Vec16us const b) { + return Vec16us(Vec256b(a) ^ Vec256b(b)); +} + +// 
vector operator ~ : bitwise not +static inline Vec16us operator ~ (Vec16us const a) { + return Vec16us( ~ Vec256b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16us select (Vec16sb const s, Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi16(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec16us if_add (Vec16sb const f, Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi16 (a, f, a, b); +#else + return a + (Vec16us(f) & b); +#endif +} + +// Conditional subtract +static inline Vec16us if_sub (Vec16sb const f, Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi16 (a, f, a, b); +#else + return a - (Vec16us(f) & b); +#endif +} + +// Conditional multiply +static inline Vec16us if_mul (Vec16sb const f, Vec16us const a, Vec16us const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mullo_epi16 (a, f, a, b); +#else + return select(f, a*b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint16_t horizontal_add (Vec16us const a) { + return (uint16_t)horizontal_add(Vec16s(a)); +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Each element is zero-extended before addition to avoid overflow +static inline uint32_t horizontal_add_x (Vec16us const a) { +#if INSTRSET >= 10 + __m256i aeven = _mm256_maskz_mov_epi16 (__mmask16(0x5555), a); +#else + __m256i mask = _mm256_set1_epi32(0x0000FFFF); // mask for even positions + __m256i aeven = _mm256_and_si256(a,mask); // even numbered elements of a +#endif + __m256i aodd = _mm256_srli_epi32(a,16); // zero extend odd numbered elements + __m256i sum1 = _mm256_add_epi32(aeven,aodd); // add even and odd elements + __m128i sum2 = _mm_add_epi32(_mm256_extracti128_si256(sum1,1),_mm256_castsi256_si128(sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_unpackhi_epi64(sum2,sum2)); + __m128i sum4 = _mm_add_epi32(sum3,_mm_shuffle_epi32(sum3,1)); + return (int16_t)_mm_cvtsi128_si32(sum4); // truncate to 16 bits +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec16us add_saturated(Vec16us const a, Vec16us const b) { + return _mm256_adds_epu16(a, b); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec16us sub_saturated(Vec16us const a, Vec16us const b) { + return _mm256_subs_epu16(a, b); +} + +// function max: a > b ? a : b +static inline Vec16us max(Vec16us const a, Vec16us const b) { + return _mm256_max_epu16(a,b); +} + +// function min: a < b ? 
a : b +static inline Vec16us min(Vec16us const a, Vec16us const b) { + return _mm256_min_epu16(a,b); +} + + +/***************************************************************************** +* +* Vector of 8 32-bit signed integers +* +*****************************************************************************/ + +class Vec8i : public Vec256b { +public: + // Default constructor: + Vec8i() { + } + // Constructor to broadcast the same value into all elements: + Vec8i(int i) { + ymm = _mm256_set1_epi32(i); + } + // Constructor to build from all elements: + Vec8i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7) { + ymm = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7); + } + // Constructor to build from two Vec4i: + Vec8i(Vec4i const a0, Vec4i const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec8i(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec8i & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Type cast operator to convert to __m256i used in intrinsics + operator __m256i() const { + return ymm; + } + // Member function to load from array (unaligned) + Vec8i & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec8i & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Member function to load 8 unsigned 8-bit integers from array + Vec8i & load_8uc(void const * p) { + ymm = _mm256_cvtepu8_epi32(Vec16uc().loadl(p)); + return *this; + } + // Member function to load 8 unsigned 16-bit integers from array + Vec8i & load_8us(void const * p) { + ymm = _mm256_cvtepu16_epi32(Vec8us().load(p)); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec8i & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_maskz_loadu_epi32(__mmask8((1u << n) - 1), p); +#else + if (n <= 0) { + *this = 0; + } + else if (n <= 4) { + *this = Vec8i(Vec4i().load_partial(n, p), 0); + } + else if (n < 8) { + *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t const*)p+4)); + } + else { + load(p); + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + _mm256_mask_storeu_epi32(p, __mmask8((1u << n) - 1), ymm); +#else + if (n <= 0) { + return; + } + else if (n <= 4) { + get_low().store_partial(n, p); + } + else if (n < 8) { + get_low().store(p); + get_high().store_partial(n-4, (int32_t*)p+4); + } + else { + store(p); + } +#endif + } + // cut off vector to n elements. 
The last 8-n elements are set to zero + Vec8i & cutoff(int n) { +#if INSTRSET >= 10 + ymm = _mm256_maskz_mov_epi32(__mmask8((1u << n) - 1), ymm); +#else + *this = Vec32c(*this).cutoff(n * 4); +#endif + return *this; + } + // Member function to change a single element in vector + Vec8i const insert(int index, int32_t value) { +#if INSTRSET >= 10 + ymm = _mm256_mask_set1_epi32(ymm, __mmask8(1u << index), value); +#else + __m256i broad = _mm256_set1_epi32(value); // broadcast value into all elements + const int32_t maskl[16] = {0,0,0,0,0,0,0,0, -1,0,0,0,0,0,0,0}; + __m256i mask = Vec256b().load(maskl + 8 - (index & 7)); // mask with FFFFFFFF at index position + ymm = selectb (mask, broad, ymm); +#endif + return *this; + } + // Member function extract a single element from vector + int32_t extract(int index) const { +#if INSTRSET >= 10 + __m256i x = _mm256_maskz_compress_epi32(__mmask8(1u << index), ymm); + return _mm_cvtsi128_si32(_mm256_castsi256_si128(x)); +#else + int32_t x[8]; + store(x); + return x[index & 7]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int32_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4i: + Vec4i get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec4i get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 8; + } +}; + + +/***************************************************************************** +* +* Vec8ib: Vector of 8 Booleans for use with Vec8i and Vec8ui +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +class Vec8ib : public Vec8i { +public: + // Default constructor: + Vec8ib() { + } + // Constructor to build from all elements: + Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) : + Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7)) + {} + // Constructor to convert from type __m256i used in intrinsics: + Vec8ib(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec8ib & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec8ib(bool b) : Vec8i(-int32_t(b)) { + } + // Assignment operator to broadcast scalar value: + Vec8ib & operator = (bool b) { + *this = Vec8ib(b); + return *this; + } + // Constructor to build from two Vec4ib: + Vec8ib(Vec4ib const a0, Vec4ib const a1) : Vec8i(Vec4i(a0), Vec4i(a1)) { + } + Vec4ib get_low() const { + return Vec4ib(Vec8i::get_low()); + } + Vec4ib get_high() const { + return Vec4ib(Vec8i::get_high()); + } + Vec8ib & insert (int index, bool a) { + Vec8i::insert(index, -(int)a); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec8i::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
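// ---------------------------------------------------------------------------
// Illustrative usage sketch for the Vec8i partial load/store above (pointer
// names and the count n are examples only). A tail of fewer than 8 ints can be
// handled without scalar code; on AVX512VL this becomes a masked move,
// otherwise the Vec4i fallback paths are used:
//     Vec8i v;
//     v.load_partial(n, src);    // n in [0,8): load n ints, zero the rest
//     v.store_partial(n, dst);   // write back only the n valid elements
// ---------------------------------------------------------------------------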
+ bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec8ib & load_bits(uint8_t a) { + __m256i b1 = _mm256_set1_epi32((int32_t)a); // broadcast a + __m256i m2 = constant8ui<1,2,4,8,0x10,0x20,0x40,0x80>(); + __m256i d1 = _mm256_and_si256(b1, m2); // isolate one bit in each dword + ymm = _mm256_cmpgt_epi32(d1, _mm256_setzero_si256()); // compare with 0 + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. + Vec8ib(int b) = delete; + Vec8ib & operator = (int x) = delete; +}; + +#else + +typedef Vec8b Vec8ib; // compact boolean vector + +#endif + + +/***************************************************************************** +* +* Define operators for Vec8ib +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec8ib operator & (Vec8ib const a, Vec8ib const b) { + return Vec8ib(Vec256b(a) & Vec256b(b)); +} +static inline Vec8ib operator && (Vec8ib const a, Vec8ib const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec8ib & operator &= (Vec8ib & a, Vec8ib const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec8ib operator | (Vec8ib const a, Vec8ib const b) { + return Vec8ib(Vec256b(a) | Vec256b(b)); +} +static inline Vec8ib operator || (Vec8ib const a, Vec8ib const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec8ib & operator |= (Vec8ib & a, Vec8ib const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8ib operator ^ (Vec8ib const a, Vec8ib const b) { + return Vec8ib(Vec256b(a) ^ Vec256b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec8ib & operator ^= (Vec8ib & a, Vec8ib const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec8ib operator == (Vec8ib const a, Vec8ib const b) { + return Vec8ib(a ^ (~b)); +} + +// vector operator != : xor +static inline Vec8ib operator != (Vec8ib const a, Vec8ib const b) { + return Vec8ib(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec8ib operator ~ (Vec8ib const a) { + return Vec8ib( ~ Vec256b(a)); +} + +// vector operator ! : element not +static inline Vec8ib operator ! 
(Vec8ib const a) { + return ~ a; +} + +// vector function andnot +static inline Vec8ib andnot (Vec8ib const a, Vec8ib const b) { + return Vec8ib(andnot(Vec256b(a), Vec256b(b))); +} + +#endif + +/***************************************************************************** +* +* Operators for Vec8i +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec8i operator + (Vec8i const a, Vec8i const b) { + return _mm256_add_epi32(a, b); +} +// vector operator += : add +static inline Vec8i & operator += (Vec8i & a, Vec8i const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec8i operator ++ (Vec8i & a, int) { + Vec8i a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec8i & operator ++ (Vec8i & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8i operator - (Vec8i const a, Vec8i const b) { + return _mm256_sub_epi32(a, b); +} +// vector operator - : unary minus +static inline Vec8i operator - (Vec8i const a) { + return _mm256_sub_epi32(_mm256_setzero_si256(), a); +} +// vector operator -= : subtract +static inline Vec8i & operator -= (Vec8i & a, Vec8i const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec8i operator -- (Vec8i & a, int) { + Vec8i a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec8i & operator -- (Vec8i & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8i operator * (Vec8i const a, Vec8i const b) { + return _mm256_mullo_epi32(a, b); +} +// vector operator *= : multiply +static inline Vec8i & operator *= (Vec8i & a, Vec8i const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer. 
See bottom of file + +// vector operator << : shift left +static inline Vec8i operator << (Vec8i const a, int32_t b) { + return _mm256_sll_epi32(a, _mm_cvtsi32_si128(b)); +} +// vector operator <<= : shift left +static inline Vec8i & operator <<= (Vec8i & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec8i operator >> (Vec8i const a, int32_t b) { + return _mm256_sra_epi32(a, _mm_cvtsi32_si128(b)); +} +// vector operator >>= : shift right arithmetic +static inline Vec8i & operator >>= (Vec8i & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec8ib operator == (Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi32_mask (a, b, 0); +#else + return _mm256_cmpeq_epi32(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec8ib operator != (Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi32_mask (a, b, 4); +#else + return Vec8ib(Vec8i(~(a == b))); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec8ib operator > (Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi32_mask (a, b, 6); +#else + return _mm256_cmpgt_epi32(a, b); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec8ib operator < (Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi32_mask (a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec8ib operator >= (Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi32_mask (a, b, 5); +#else + return Vec8ib(Vec8i(~(b > a))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec8ib operator <= (Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi32_mask (a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec8i operator & (Vec8i const a, Vec8i const b) { + return Vec8i(Vec256b(a) & Vec256b(b)); +} +static inline Vec8i operator && (Vec8i const a, Vec8i const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec8i & operator &= (Vec8i & a, Vec8i const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec8i operator | (Vec8i const a, Vec8i const b) { + return Vec8i(Vec256b(a) | Vec256b(b)); +} +static inline Vec8i operator || (Vec8i const a, Vec8i const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec8i & operator |= (Vec8i & a, Vec8i const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8i operator ^ (Vec8i const a, Vec8i const b) { + return Vec8i(Vec256b(a) ^ Vec256b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec8i & operator ^= (Vec8i & a, Vec8i const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec8i operator ~ (Vec8i const a) { + return Vec8i( ~ Vec256b(a)); +} + +// vector operator ! : returns true for elements == 0 +static inline Vec8ib operator ! 
(Vec8i const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi32_mask (a, _mm256_setzero_si256(), 0); +#else + return _mm256_cmpeq_epi32(a, _mm256_setzero_si256()); +#endif +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8i select (Vec8ib const s, Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi32(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec8i if_add (Vec8ib const f, Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi32 (a, f, a, b); +#else + return a + (Vec8i(f) & b); +#endif +} + +// Conditional subtract +static inline Vec8i if_sub (Vec8ib const f, Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi32 (a, f, a, b); +#else + return a - (Vec8i(f) & b); +#endif +} + +// Conditional multiply +static inline Vec8i if_mul (Vec8ib const f, Vec8i const a, Vec8i const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mullo_epi32 (a, f, a, b); +#else + return select(f, a*b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int32_t horizontal_add (Vec8i const a) { + // The hadd instruction is inefficient, and may be split into two instructions for faster decoding + __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(a,1),_mm256_castsi256_si128(a)); + __m128i sum2 = _mm_add_epi32(sum1,_mm_unpackhi_epi64(sum1,sum1)); + __m128i sum3 = _mm_add_epi32(sum2,_mm_shuffle_epi32(sum2,1)); + return (int32_t)_mm_cvtsi128_si32(sum3); +} + +// Horizontal add extended: Calculates the sum of all vector elements. 
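// ---------------------------------------------------------------------------
// Illustrative sketch of the branch-free compare/select/reduce pattern using
// the Vec8i operations defined above. The helper name and the threshold are
// arbitrary examples; vectorclass.h is assumed to be included.
static inline int32_t sum_above_threshold_sketch(const int32_t * p) {
    Vec8i a;
    a.load(p);                                // 8 signed 32-bit values
    Vec8ib keep = a > Vec8i(100);             // per-element comparison mask
    Vec8i  sel  = select(keep, a, Vec8i(0));  // zero the non-qualifying lanes
    return horizontal_add(sel);               // reduce to a single sum
}
// ---------------------------------------------------------------------------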
+// Elements are sign extended before adding to avoid overflow +// static inline int64_t horizontal_add_x (Vec8i const a); // defined below + +// function add_saturated: add element by element, signed with saturation +static inline Vec8i add_saturated(Vec8i const a, Vec8i const b) { + __m256i sum = _mm256_add_epi32(a, b); // a + b + __m256i axb = _mm256_xor_si256(a, b); // check if a and b have different sign + __m256i axs = _mm256_xor_si256(a, sum); // check if a and sum have different sign + __m256i overf1 = _mm256_andnot_si256(axb,axs); // check if sum has wrong sign + __m256i overf2 = _mm256_srai_epi32(overf1,31); // -1 if overflow + __m256i asign = _mm256_srli_epi32(a,31); // 1 if a < 0 + __m256i sat1 = _mm256_srli_epi32(overf2,1); // 7FFFFFFF if overflow + __m256i sat2 = _mm256_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow + return selectb(overf2,sat2,sum); // sum if not overflow, else sat2 +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec8i sub_saturated(Vec8i const a, Vec8i const b) { + __m256i diff = _mm256_sub_epi32(a, b); // a + b + __m256i axb = _mm256_xor_si256(a, b); // check if a and b have different sign + __m256i axs = _mm256_xor_si256(a, diff); // check if a and sum have different sign + __m256i overf1 = _mm256_and_si256(axb,axs); // check if sum has wrong sign + __m256i overf2 = _mm256_srai_epi32(overf1,31); // -1 if overflow + __m256i asign = _mm256_srli_epi32(a,31); // 1 if a < 0 + __m256i sat1 = _mm256_srli_epi32(overf2,1); // 7FFFFFFF if overflow + __m256i sat2 = _mm256_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow + return selectb(overf2,sat2,diff); // diff if not overflow, else sat2 +} + +// function max: a > b ? a : b +static inline Vec8i max(Vec8i const a, Vec8i const b) { + return _mm256_max_epi32(a,b); +} + +// function min: a < b ? a : b +static inline Vec8i min(Vec8i const a, Vec8i const b) { + return _mm256_min_epi32(a,b); +} + +// function abs: a >= 0 ? 
a : -a +static inline Vec8i abs(Vec8i const a) { + return _mm256_abs_epi32(a); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec8i abs_saturated(Vec8i const a) { +#if INSTRSET >= 10 + return _mm256_min_epu32(abs(a), Vec8i(0x7FFFFFFF)); +#else + __m256i absa = abs(a); // abs(a) + __m256i overfl = _mm256_srai_epi32(absa,31); // sign + return _mm256_add_epi32(absa,overfl); // subtract 1 if 0x80000000 +#endif +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec8i rotate_left(Vec8i const a, int b) { +#if INSTRSET >= 10 // __AVX512VL__ + return _mm256_rolv_epi32(a, _mm256_set1_epi32(b)); +#else + __m256i left = _mm256_sll_epi32(a,_mm_cvtsi32_si128(b & 0x1F)); // a << b + __m256i right = _mm256_srl_epi32(a,_mm_cvtsi32_si128((-b) & 0x1F));// a >> (32 - b) + __m256i rot = _mm256_or_si256(left,right); // or + return rot; +#endif +} + + +/***************************************************************************** +* +* Vector of 8 32-bit unsigned integers +* +*****************************************************************************/ + +class Vec8ui : public Vec8i { +public: + // Default constructor: + Vec8ui() { + } + // Constructor to broadcast the same value into all elements: + Vec8ui(uint32_t i) { + ymm = _mm256_set1_epi32((int32_t)i); + } + // Constructor to build from all elements: + Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7) { + ymm = _mm256_setr_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3, (int32_t)i4, (int32_t)i5, (int32_t)i6, (int32_t)i7); + } + // Constructor to build from two Vec4ui: + Vec8ui(Vec4ui const a0, Vec4ui const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec8ui(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec8ui & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec8ui & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec8ui & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Member function to change a single element in vector + Vec8ui const insert(int index, uint32_t value) { + Vec8i::insert(index, (int32_t)value); + return *this; + } + // Member function extract a single element from vector + uint32_t extract(int index) const { + return (uint32_t)Vec8i::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
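// ---------------------------------------------------------------------------
// Illustrative usage sketch for rotate_left on Vec8i (values are examples
// only). Each 32-bit element is rotated; a negative count rotates right. With
// AVX512VL this is a single _mm256_rolv_epi32, otherwise the shift/or sequence
// above:
//     Vec8i x(0x40000001);
//     Vec8i l = rotate_left(x,  1);   // each element becomes 0x80000002
//     Vec8i r = rotate_left(x, -1);   // each element becomes 0xA0000000
// ---------------------------------------------------------------------------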
+ uint32_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4ui: + Vec4ui get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec4ui get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int elementtype() { + return 9; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec8ui operator + (Vec8ui const a, Vec8ui const b) { + return Vec8ui (Vec8i(a) + Vec8i(b)); +} + +// vector operator - : subtract +static inline Vec8ui operator - (Vec8ui const a, Vec8ui const b) { + return Vec8ui (Vec8i(a) - Vec8i(b)); +} + +// vector operator * : multiply +static inline Vec8ui operator * (Vec8ui const a, Vec8ui const b) { + return Vec8ui (Vec8i(a) * Vec8i(b)); +} + +// vector operator / : divide +// See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec8ui operator >> (Vec8ui const a, uint32_t b) { + return _mm256_srl_epi32(a,_mm_cvtsi32_si128((int)b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec8ui operator >> (Vec8ui const a, int32_t b) { + return a >> (uint32_t)b; +} +// vector operator >>= : shift right logical +static inline Vec8ui & operator >>= (Vec8ui & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec8ui operator << (Vec8ui const a, uint32_t b) { + return Vec8ui ((Vec8i)a << (int32_t)b); +} +// vector operator << : shift left all elements +static inline Vec8ui operator << (Vec8ui const a, int32_t b) { + return Vec8ui ((Vec8i)a << (int32_t)b); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec8ib operator > (Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu32_mask (a, b, 6); +#else + __m256i signbit = _mm256_set1_epi32(0x80000000); + __m256i a1 = _mm256_xor_si256(a,signbit); + __m256i b1 = _mm256_xor_si256(b,signbit); + return _mm256_cmpgt_epi32(a1,b1); // signed compare +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec8ib operator < (Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu32_mask (a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec8ib operator >= (Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu32_mask (a, b, 5); +#else + __m256i max_ab = _mm256_max_epu32(a,b); // max(a,b), unsigned + return _mm256_cmpeq_epi32(a,max_ab); // a == max(a,b) +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec8ib operator <= (Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu32_mask (a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec8ui operator & (Vec8ui const a, Vec8ui const b) { + return Vec8ui(Vec256b(a) & Vec256b(b)); +} +static inline Vec8ui operator && (Vec8ui const a, Vec8ui const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec8ui operator | (Vec8ui const a, Vec8ui const b) { + return Vec8ui(Vec256b(a) | Vec256b(b)); +} +static inline Vec8ui operator || (Vec8ui const a, Vec8ui const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec8ui 
operator ^ (Vec8ui const a, Vec8ui const b) { + return Vec8ui(Vec256b(a) ^ Vec256b(b)); +} + +// vector operator ~ : bitwise not +static inline Vec8ui operator ~ (Vec8ui const a) { + return Vec8ui( ~ Vec256b(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8ui select (Vec8ib const s, Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi32(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec8ui if_add (Vec8ib const f, Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi32 (a, f, a, b); +#else + return a + (Vec8ui(f) & b); +#endif +} + +// Conditional subtract +static inline Vec8ui if_sub (Vec8ib const f, Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi32 (a, f, a, b); +#else + return a - (Vec8ui(f) & b); +#endif +} + +// Conditional multiply +static inline Vec8ui if_mul (Vec8ib const f, Vec8ui const a, Vec8ui const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mullo_epi32 (a, f, a, b); +#else + return select(f, a*b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint32_t horizontal_add (Vec8ui const a) { + return (uint32_t)horizontal_add((Vec8i)a); +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Elements are zero extended before adding to avoid overflow +// static inline uint64_t horizontal_add_x (Vec8ui const a); // defined later + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec8ui add_saturated(Vec8ui const a, Vec8ui const b) { + Vec8ui sum = a + b; + Vec8ui aorb = Vec8ui(a | b); +#if INSTRSET >= 10 + Vec8b overflow = _mm256_cmp_epu32_mask(sum, aorb, 1); + return _mm256_mask_set1_epi32(sum, overflow, -1); +#else + Vec8ui overflow = Vec8ui(sum < aorb); // overflow if a + b < (a | b) + return Vec8ui(sum | overflow); // return 0xFFFFFFFF if overflow +#endif +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec8ui sub_saturated(Vec8ui const a, Vec8ui const b) { + Vec8ui diff = a - b; +#if INSTRSET >= 10 + Vec8b nunderflow = _mm256_cmp_epu32_mask(diff, a, 2); // not underflow if a - b <= a + return _mm256_maskz_mov_epi32(nunderflow, diff); // zero if underflow +#else + Vec8ui underflow = Vec8ui(diff > a); // underflow if a - b > a + return _mm256_andnot_si256(underflow, diff); // return 0 if underflow +#endif +} + +// function max: a > b ? a : b +static inline Vec8ui max(Vec8ui const a, Vec8ui const b) { + return _mm256_max_epu32(a,b); +} + +// function min: a < b ? 
a : b +static inline Vec8ui min(Vec8ui const a, Vec8ui const b) { + return _mm256_min_epu32(a,b); +} + + +/***************************************************************************** +* +* Vector of 4 64-bit signed integers +* +*****************************************************************************/ + +class Vec4q : public Vec256b { +public: + // Default constructor: + Vec4q() { + } + // Constructor to broadcast the same value into all elements: + Vec4q(int64_t i) { + ymm = _mm256_set1_epi64x(i); + } + // Constructor to build from all elements: + Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + ymm = _mm256_setr_epi64x(i0, i1, i2, i3); + } + // Constructor to build from two Vec2q: + Vec4q(Vec2q const a0, Vec2q const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec4q(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec4q & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Type cast operator to convert to __m256i used in intrinsics + operator __m256i() const { + return ymm; + } + // Member function to load from array (unaligned) + Vec4q & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec4q & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec4q & load_partial(int n, void const * p) { +#if INSTRSET >= 10 // AVX512VL + ymm = _mm256_maskz_loadu_epi64(__mmask8((1u << n) - 1), p); +#else + if (n <= 0) { + *this = 0; + } + else if (n <= 2) { + *this = Vec4q(Vec2q().load_partial(n, p), 0); + } + else if (n < 4) { + *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t const*)p+2)); + } + else { + load(p); + } +#endif + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { +#if INSTRSET >= 10 // AVX512VL + _mm256_mask_storeu_epi64(p, __mmask8((1u << n) - 1), ymm); +#else + if (n <= 0) { + return; + } + else if (n <= 2) { + get_low().store_partial(n, p); + } + else if (n < 4) { + get_low().store(p); + get_high().store_partial(n-2, (int64_t*)p+2); + } + else { + store(p); + } +#endif + } + // cut off vector to n elements. The last 8-n elements are set to zero + Vec4q & cutoff(int n) { +#if INSTRSET >= 10 + ymm = _mm256_maskz_mov_epi64(__mmask8((1u << n) - 1), ymm); +#else + *this = Vec32c(*this).cutoff(n * 8); +#endif + return *this; + } + // Member function to change a single element in vector + Vec4q const insert(int index, int64_t value) { +#if INSTRSET >= 10 + ymm = _mm256_mask_set1_epi64(ymm, __mmask8(1u << index), value); +#else + Vec4q x(value); + switch (index) { + case 0: + ymm = _mm256_blend_epi32(ymm,x,0x03); break; + case 1: + ymm = _mm256_blend_epi32(ymm,x,0x0C); break; + case 2: + ymm = _mm256_blend_epi32(ymm,x,0x30); break; + case 3: + ymm = _mm256_blend_epi32(ymm,x,0xC0); break; + } +#endif + return *this; + } + // Member function extract a single element from vector + int64_t extract(int index) const { +#if INSTRSET >= 10 + __m256i x = _mm256_maskz_compress_epi64(__mmask8(1u << index), ymm); + return _emulate_movq(_mm256_castsi256_si128(x)); +#else + int64_t x[4]; + store(x); + return x[index & 3]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
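// ---------------------------------------------------------------------------
// Illustrative usage sketch for the unsigned saturating arithmetic on Vec8ui
// defined above (values are examples only). add_saturated clamps at 0xFFFFFFFF
// instead of wrapping around, and sub_saturated clamps at 0:
//     Vec8ui s = add_saturated(Vec8ui(0xFFFFFFF0u), Vec8ui(0x100u)); // all 0xFFFFFFFF
//     Vec8ui d = sub_saturated(Vec8ui(5u), Vec8ui(9u));              // all 0
// ---------------------------------------------------------------------------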
+ int64_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec2q: + Vec2q get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec2q get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int size() { + return 4; + } + static constexpr int elementtype() { + return 10; + } +}; + +/***************************************************************************** +* +* Vec4qb: Vector of 4 Booleans for use with Vec4q and Vec4uq +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +class Vec4qb : public Vec4q { +public: + // Default constructor: + Vec4qb() { + } + // Constructor to build from all elements: + Vec4qb(bool x0, bool x1, bool x2, bool x3) : + Vec4q(-int64_t(x0), -int64_t(x1), -int64_t(x2), -int64_t(x3)) { + } + // Constructor to convert from type __m256i used in intrinsics: + Vec4qb(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec4qb & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Constructor to broadcast scalar value: + Vec4qb(bool b) : Vec4q(-int64_t(b)) { + } + // Assignment operator to broadcast scalar value: + Vec4qb & operator = (bool b) { + *this = Vec4qb(b); + return *this; + } + // Constructor to build from two Vec2qb: + Vec4qb(Vec2qb const a0, Vec2qb const a1) : Vec4q(Vec2q(a0), Vec2q(a1)) { + } + // Member functions to split into two Vec2qb: + Vec2qb get_low() const { + return Vec2qb(Vec4q::get_low()); + } + Vec2qb get_high() const { + return Vec2qb(Vec4q::get_high()); + } + Vec4qb & insert (int index, bool a) { + Vec4q::insert(index, -(int64_t)a); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec4q::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Member function to change a bitfield to a boolean vector + Vec4qb & load_bits(uint8_t a) { + __m256i b1 = _mm256_set1_epi32((int32_t)a); // broadcast a + __m256i m2 = constant8ui<1,0,2,0,4,0,8,0>(); + __m256i d1 = _mm256_and_si256(b1, m2); // isolate one bit in each dword + ymm = _mm256_cmpgt_epi64(d1, _mm256_setzero_si256()); // we can use signed compare here because no value is negative + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
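// ---------------------------------------------------------------------------
// Illustrative usage sketch for Vec4qb::load_bits in the broad-boolean case
// shown here (with AVX512VL the compact Vec4b type is used instead; values are
// examples only). Bit i of the argument becomes element i:
//     Vec4qb m;
//     m.load_bits(0b0101u);      // elements 0 and 2 become true
//     bool b2 = m.extract(2);    // true
// ---------------------------------------------------------------------------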
+ Vec4qb(int b) = delete; + Vec4qb & operator = (int x) = delete; +}; + +#else + +typedef Vec4b Vec4qb; // compact boolean vector + +#endif + +/***************************************************************************** +* +* Define operators for Vec4qb +* +*****************************************************************************/ + +#if INSTRSET < 10 // broad boolean vectors + +// vector operator & : bitwise and +static inline Vec4qb operator & (Vec4qb const a, Vec4qb const b) { + return Vec4qb(Vec256b(a) & Vec256b(b)); +} +static inline Vec4qb operator && (Vec4qb const a, Vec4qb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec4qb & operator &= (Vec4qb & a, Vec4qb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec4qb operator | (Vec4qb const a, Vec4qb const b) { + return Vec4qb(Vec256b(a) | Vec256b(b)); +} +static inline Vec4qb operator || (Vec4qb const a, Vec4qb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec4qb & operator |= (Vec4qb & a, Vec4qb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec4qb operator ^ (Vec4qb const a, Vec4qb const b) { + return Vec4qb(Vec256b(a) ^ Vec256b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec4qb & operator ^= (Vec4qb & a, Vec4qb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec4qb operator == (Vec4qb const a, Vec4qb const b) { + return Vec4qb(a ^ (~b)); +} + +// vector operator != : xor +static inline Vec4qb operator != (Vec4qb const a, Vec4qb const b) { + return Vec4qb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec4qb operator ~ (Vec4qb const a) { + return Vec4qb( ~ Vec256b(a)); +} + +// vector operator ! : element not +static inline Vec4qb operator ! 
(Vec4qb const a) { + return ~ a; +} + +// vector function andnot +static inline Vec4qb andnot (Vec4qb const a, Vec4qb const b) { + return Vec4qb(andnot(Vec256b(a), Vec256b(b))); +} + +#endif + + +/***************************************************************************** +* +* Operators for Vec4q +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec4q operator + (Vec4q const a, Vec4q const b) { + return _mm256_add_epi64(a, b); +} +// vector operator += : add +static inline Vec4q & operator += (Vec4q & a, Vec4q const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec4q operator ++ (Vec4q & a, int) { + Vec4q a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec4q & operator ++ (Vec4q & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec4q operator - (Vec4q const a, Vec4q const b) { + return _mm256_sub_epi64(a, b); +} +// vector operator - : unary minus +static inline Vec4q operator - (Vec4q const a) { + return _mm256_sub_epi64(_mm256_setzero_si256(), a); +} +// vector operator -= : subtract +static inline Vec4q & operator -= (Vec4q & a, Vec4q const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec4q operator -- (Vec4q & a, int) { + Vec4q a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec4q & operator -- (Vec4q & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec4q operator * (Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // __AVX512DQ__ __AVX512VL__ + return _mm256_mullo_epi64(a, b); +#else + // Split into 32-bit multiplies + __m256i bswap = _mm256_shuffle_epi32(b,0xB1); // swap H<->L + __m256i prodlh = _mm256_mullo_epi32(a,bswap); // 32 bit L*H products + __m256i zero = _mm256_setzero_si256(); // 0 + __m256i prodlh2 = _mm256_hadd_epi32(prodlh,zero); // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0 + __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2,0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L + __m256i prodll = _mm256_mul_epu32(a,b); // a0Lb0L,a1Lb1L, 64 bit unsigned products + __m256i prod = _mm256_add_epi64(prodll,prodlh3); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32 + return prod; +#endif +} + +// vector operator *= : multiply +static inline Vec4q & operator *= (Vec4q & a, Vec4q const b) { + a = a * b; + return a; +} + +// vector operator << : shift left +static inline Vec4q operator << (Vec4q const a, int32_t b) { + return _mm256_sll_epi64(a, _mm_cvtsi32_si128(b)); +} +// vector operator <<= : shift left +static inline Vec4q & operator <<= (Vec4q & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec4q operator >> (Vec4q const a, int32_t b) { +#if INSTRSET >= 10 // AVX512VL + return _mm256_sra_epi64(a, _mm_cvtsi32_si128(b)); +#else + __m128i bb; + __m256i shi, slo, sra2; + if (b <= 32) { + bb = _mm_cvtsi32_si128(b); // b + shi = _mm256_sra_epi32(a,bb); // a >> b signed dwords + slo = _mm256_srl_epi64(a,bb); // a >> b unsigned qwords + } + else { // b > 32 + bb = _mm_cvtsi32_si128(b-32); // b - 32 + shi = _mm256_srai_epi32(a,31); // sign of a + sra2 = _mm256_sra_epi32(a,bb); // a >> (b-32) signed dwords + slo = _mm256_srli_epi64(sra2,32); // a >> (b-32) >> 32 (second shift unsigned qword) + } + return _mm256_blend_epi32(slo,shi,0xAA); +#endif +} +// vector operator >>= : shift right arithmetic +static inline Vec4q 
& operator >>= (Vec4q & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec4qb operator == (Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi64_mask (a, b, 0); +#else + return _mm256_cmpeq_epi64(a, b); +#endif +} + +// vector operator != : returns true for elements for which a != b +static inline Vec4qb operator != (Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi64_mask (a, b, 4); +#else + return Vec4qb(Vec4q(~(a == b))); +#endif +} + +// vector operator < : returns true for elements for which a < b +static inline Vec4qb operator < (Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi64_mask (a, b, 1); +#else + return _mm256_cmpgt_epi64(b, a); +#endif +} + +// vector operator > : returns true for elements for which a > b +static inline Vec4qb operator > (Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi64_mask (a, b, 6); +#else + return b < a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec4qb operator >= (Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi64_mask (a, b, 5); +#else + return Vec4qb(Vec4q(~(a < b))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec4qb operator <= (Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi64_mask (a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec4q operator & (Vec4q const a, Vec4q const b) { + return Vec4q(Vec256b(a) & Vec256b(b)); +} +static inline Vec4q operator && (Vec4q const a, Vec4q const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec4q & operator &= (Vec4q & a, Vec4q const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec4q operator | (Vec4q const a, Vec4q const b) { + return Vec4q(Vec256b(a) | Vec256b(b)); +} +static inline Vec4q operator || (Vec4q const a, Vec4q const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec4q & operator |= (Vec4q & a, Vec4q const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec4q operator ^ (Vec4q const a, Vec4q const b) { + return Vec4q(Vec256b(a) ^ Vec256b(b)); +} +// vector operator ^= : bitwise xor +static inline Vec4q & operator ^= (Vec4q & a, Vec4q const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec4q operator ~ (Vec4q const a) { + return Vec4q( ~ Vec256b(a)); +} + +// vector operator ! : logical not, returns true for elements == 0 +static inline Vec4qb operator ! (Vec4q const a) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epi64_mask (a, _mm256_setzero_si256(), 0); +#else + return a == Vec4q(_mm256_setzero_si256()); +#endif +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? 
a[i] : b[i]; +static inline Vec4q select (Vec4qb const s, Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi64(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec4q if_add (Vec4qb const f, Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi64 (a, f, a, b); +#else + return a + (Vec4q(f) & b); +#endif +} + +// Conditional subtract +static inline Vec4q if_sub (Vec4qb const f, Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi64 (a, f, a, b); +#else + return a - (Vec4q(f) & b); +#endif +} + +// Conditional multiply +static inline Vec4q if_mul (Vec4qb const f, Vec4q const a, Vec4q const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mullo_epi64 (a, f, a, b); +#else + return select(f, a*b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int64_t horizontal_add (Vec4q const a) { + __m256i sum1 = _mm256_shuffle_epi32(a,0x0E); // high element + __m256i sum2 = _mm256_add_epi64(a,sum1); // sum + __m128i sum3 = _mm256_extracti128_si256(sum2, 1); // get high part + __m128i sum4 = _mm_add_epi64(_mm256_castsi256_si128(sum2),sum3); // add low and high parts + return _emulate_movq(sum4); +} + +// function max: a > b ? a : b +static inline Vec4q max(Vec4q const a, Vec4q const b) { + return select(a > b, a, b); +} + +// function min: a < b ? a : b +static inline Vec4q min(Vec4q const a, Vec4q const b) { + return select(a < b, a, b); +} + +// function abs: a >= 0 ? a : -a +static inline Vec4q abs(Vec4q const a) { +#if INSTRSET >= 10 // AVX512VL + return _mm256_abs_epi64(a); +#else + __m256i sign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);// 0 > a + __m256i inv = _mm256_xor_si256(a, sign); // invert bits if negative + return _mm256_sub_epi64(inv, sign); // add 1 +#endif +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec4q abs_saturated(Vec4q const a) { +#if INSTRSET >= 10 + return _mm256_min_epu64(abs(a), Vec4q(0x7FFFFFFFFFFFFFFF)); +#else + __m256i absa = abs(a); // abs(a) + __m256i overfl = _mm256_cmpgt_epi64(_mm256_setzero_si256(), absa); // 0 > a + return _mm256_add_epi64(absa, overfl); // subtract 1 if 0x8000000000000000 +#endif +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec4q rotate_left(Vec4q const a, int b) { +#if INSTRSET >= 10 // __AVX512VL__ + return _mm256_rolv_epi64(a, _mm256_set1_epi64x(int64_t(b))); +#else + __m256i left = _mm256_sll_epi64(a,_mm_cvtsi32_si128(b & 0x3F)); // a << b + __m256i right = _mm256_srl_epi64(a,_mm_cvtsi32_si128((-b) & 0x3F)); // a >> (64 - b) + __m256i rot = _mm256_or_si256(left, right); // or + return rot; +#endif +} + + +/***************************************************************************** +* +* Vector of 4 64-bit unsigned integers +* +*****************************************************************************/ + +class Vec4uq : public Vec4q { +public: + // Default constructor: + Vec4uq() { + } + // Constructor to broadcast the same value into all elements: + Vec4uq(uint64_t i) { + ymm = Vec4q((int64_t)i); + } + // Constructor to build from all elements: + Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3) { + ymm = Vec4q((int64_t)i0, (int64_t)i1, (int64_t)i2, (int64_t)i3); + } + // 
Constructor to build from two Vec2uq: + Vec4uq(Vec2uq const a0, Vec2uq const a1) { + ymm = set_m128ir(a0, a1); + } + // Constructor to convert from type __m256i used in intrinsics: + Vec4uq(__m256i const x) { + ymm = x; + } + // Assignment operator to convert from type __m256i used in intrinsics: + Vec4uq & operator = (__m256i const x) { + ymm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec4uq & load(void const * p) { + ymm = _mm256_loadu_si256((__m256i const*)p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec4uq & load_a(void const * p) { + ymm = _mm256_load_si256((__m256i const*)p); + return *this; + } + // Member function to change a single element in vector + Vec4uq const insert(int index, uint64_t value) { + Vec4q::insert(index, (int64_t)value); + return *this; + } + // Member function extract a single element from vector + uint64_t extract(int index) const { + return (uint64_t)Vec4q::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint64_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec2uq: + Vec2uq get_low() const { + return _mm256_castsi256_si128(ymm); + } + Vec2uq get_high() const { + return _mm256_extractf128_si256(ymm,1); + } + static constexpr int elementtype() { + return 11; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec4uq operator + (Vec4uq const a, Vec4uq const b) { + return Vec4uq (Vec4q(a) + Vec4q(b)); +} + +// vector operator - : subtract +static inline Vec4uq operator - (Vec4uq const a, Vec4uq const b) { + return Vec4uq (Vec4q(a) - Vec4q(b)); +} + +// vector operator * : multiply element by element +static inline Vec4uq operator * (Vec4uq const a, Vec4uq const b) { + return Vec4uq (Vec4q(a) * Vec4q(b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec4uq operator >> (Vec4uq const a, uint32_t b) { + return _mm256_srl_epi64(a,_mm_cvtsi32_si128((int)b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec4uq operator >> (Vec4uq const a, int32_t b) { + return a >> (uint32_t)b; +} +// vector operator >>= : shift right artihmetic +static inline Vec4uq & operator >>= (Vec4uq & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator << : shift left all elements +static inline Vec4uq operator << (Vec4uq const a, uint32_t b) { + return Vec4uq ((Vec4q)a << (int32_t)b); +} +// vector operator << : shift left all elements +static inline Vec4uq operator << (Vec4uq const a, int32_t b) { + return Vec4uq ((Vec4q)a << b); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec4qb operator > (Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu64_mask (a, b, 6); +#else + __m256i sign64 = Vec4uq(0x8000000000000000); + __m256i aflip = _mm256_xor_si256(a, sign64); + __m256i bflip = _mm256_xor_si256(b, sign64); + Vec4q cmp = _mm256_cmpgt_epi64(aflip,bflip); + return Vec4qb(cmp); +#endif +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec4qb operator < (Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu64_mask (a, b, 1); +#else + return b > a; +#endif +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) 
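// ---------------------------------------------------------------------------
// Illustrative sketch of unsigned 64-bit comparison with Vec4uq. Without
// AVX512VL there is no unsigned 64-bit compare instruction, so the operators
// above flip the sign bit of both operands and reuse the signed compare, which
// yields the correct unsigned ordering. Helper name and values are examples.
static inline bool vec4uq_compare_sketch() {
    Vec4uq a(0xFFFFFFFFFFFFFFFFull);          // largest unsigned 64-bit value
    Vec4uq b(1ull);
    Vec4qb m = a > b;                         // true in every element (unsigned)
    return m.extract(0);                      // a signed compare would give false
}
// ---------------------------------------------------------------------------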
+static inline Vec4qb operator >= (Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu64_mask (a, b, 5); +#else + return Vec4qb(Vec4q(~(b > a))); +#endif +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec4qb operator <= (Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_cmp_epu64_mask (a, b, 2); +#else + return b >= a; +#endif +} + +// vector operator & : bitwise and +static inline Vec4uq operator & (Vec4uq const a, Vec4uq const b) { + return Vec4uq(Vec256b(a) & Vec256b(b)); +} +static inline Vec4uq operator && (Vec4uq const a, Vec4uq const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec4uq operator | (Vec4uq const a, Vec4uq const b) { + return Vec4uq(Vec256b(a) | Vec256b(b)); +} +static inline Vec4uq operator || (Vec4uq const a, Vec4uq const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec4uq operator ^ (Vec4uq const a, Vec4uq const b) { + return Vec4uq(Vec256b(a) ^ Vec256b(b)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec4uq select (Vec4qb const s, Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mov_epi64(b, s, a); +#else + return selectb(s,a,b); +#endif +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec4uq if_add (Vec4qb const f, Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_add_epi64 (a, f, a, b); +#else + return a + (Vec4uq(f) & b); +#endif +} + +// Conditional subtract +static inline Vec4uq if_sub (Vec4qb const f, Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_sub_epi64 (a, f, a, b); +#else + return a - (Vec4uq(f) & b); +#endif +} + +// Conditional multiply +static inline Vec4uq if_mul (Vec4qb const f, Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // compact boolean vectors + return _mm256_mask_mullo_epi64 (a, f, a, b); +#else + return select(f, a*b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint64_t horizontal_add (Vec4uq const a) { + return (uint64_t)horizontal_add((Vec4q)a); +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Elements are sign/zero extended before adding to avoid overflow +static inline int64_t horizontal_add_x (Vec8i const a) { + __m256i signs = _mm256_srai_epi32(a,31); // sign of all elements + Vec4q a01 = _mm256_unpacklo_epi32(a,signs); // sign-extended a0, a1, a4, a5 + Vec4q a23 = _mm256_unpackhi_epi32(a,signs); // sign-extended a2, a3, a6, a7 + return horizontal_add(a01 + a23); +} + +static inline uint64_t horizontal_add_x (Vec8ui const a) { + __m256i zero = _mm256_setzero_si256(); // 0 + __m256i a01 = _mm256_unpacklo_epi32(a,zero); // zero-extended a0, a1 + __m256i a23 = _mm256_unpackhi_epi32(a,zero); // zero-extended a2, a3 + return (uint64_t)horizontal_add(Vec4q(a01) + Vec4q(a23)); +} + +// function max: a > b ? a : b +static inline Vec4uq max(Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // AVX512VL + return _mm256_max_epu64 (a, b); +#else + return Vec4uq(select(a > b, a, b)); +#endif +} + +// function min: a < b ? 
a : b +static inline Vec4uq min(Vec4uq const a, Vec4uq const b) { +#if INSTRSET >= 10 // AVX512VL + return _mm256_min_epu64 (a, b); +#else + return Vec4uq(select(a > b, b, a)); +#endif +} + + +/***************************************************************************** +* +* Vector permute functions +* +****************************************************************************** +* +* These permute functions can reorder the elements of a vector and optionally +* set some elements to zero. See Vectori128.h for description +* +*****************************************************************************/ + +// Permute vector of 4 64-bit integers. +template +static inline Vec4q permute4(Vec4q const a) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + __m256i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & perm_largeblock) != 0) { // permute 128-bit blocks + constexpr EList L = largeblock_perm<4>(indexs); // get 128-bit permute pattern + constexpr int j0 = L.a[0]; + constexpr int j1 = L.a[1]; +#ifndef ZEXT_MISSING + if constexpr (j0 == 0 && j1 == -1 && !(flags & perm_addz)) { // zero extend + return _mm256_zextsi128_si256(_mm256_castsi256_si128(y)); + } + if constexpr (j0 == 1 && j1 < 0 && !(flags & perm_addz)) { // extract upper part, zero extend + return _mm256_zextsi128_si256(_mm256_extracti128_si256(y, 1)); + } +#endif + if constexpr ((flags & perm_perm) != 0 && !(flags & perm_zeroing)) { + return _mm256_permute2x128_si256(y, y, (j0 & 1) | (j1 & 1) << 4); + } + } + if constexpr ((flags & perm_perm) != 0) { // permutation needed + if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in both lanes + // try to fit various instructions + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm256_unpackhi_epi64(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm256_unpacklo_epi64(y, y); + } + else { // general permute + y = _mm256_shuffle_epi32(a, uint8_t(flags >> perm_ipattern)); + } + } + else if constexpr ((flags & perm_broadcast) != 0 && (flags >> perm_rot_count) == 0) { + y = _mm256_broadcastq_epi64(_mm256_castsi256_si128(y)); // broadcast first element + } + else { // different patterns in two lanes +#if INSTRSET >= 10 // AVX512VL + if constexpr ((flags & perm_rotate_big) != 0) { // fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + return _mm256_maskz_alignr_epi64 (zero_mask<4>(indexs), y, y, rot); + } + else { // full permute + constexpr uint8_t mms = (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6; + constexpr __mmask8 mmz = zero_mask<4>(indexs);//(i0 >= 0) | (i1 >= 0) << 1 | (i2 >= 0) << 2 | (i3 >= 0) << 3; + return _mm256_maskz_permutex_epi64(mmz, a, mms); + } +#else + // full permute + constexpr int ms = (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6; + y = _mm256_permute4x64_epi64(a, ms); +#endif + } + } + if constexpr ((flags & perm_zeroing) != 0) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi64(zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec4q().load(bm.a), y); +#endif + } + return y; +} + +template +static 
inline Vec4uq permute4(Vec4uq const a) { + return Vec4uq (permute4 (Vec4q(a))); +} + + +// Permute vector of 8 32-bit integers. +template +static inline Vec8i permute8(Vec8i const a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m256i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<8>(indexs); // permutation pattern + y = permute4 (Vec4q(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in both lanes + // try to fit various instructions + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm256_unpackhi_epi32(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm256_unpacklo_epi32(y, y); + } + else { // general permute + y = _mm256_shuffle_epi32(a, uint8_t(flags >> perm_ipattern)); + } + } +#if INSTRSET >= 10 + else if constexpr ((flags & perm_broadcast) != 0 && (flags & perm_zeroing) == 0) { + constexpr uint8_t e = flags >> perm_rot_count & 0xF; // broadcast one element + if constexpr (e > 0) { + y = _mm256_alignr_epi32(y, y, e); + } + return _mm256_broadcastd_epi32(_mm256_castsi256_si128(y)); +#else + else if constexpr ((flags & perm_broadcast) != 0 && (flags & perm_zeroing) == 0 && (flags >> perm_rot_count == 0)) { + return _mm256_broadcastd_epi32(_mm256_castsi256_si128(y)); // broadcast first element +#endif + } + else if constexpr ((flags & perm_zext) != 0) { + y = _mm256_cvtepu32_epi64(_mm256_castsi256_si128(y)); // zero extension + if constexpr ((flags & perm_addz2) == 0) return y; + } +#if INSTRSET >= 10 // AVX512VL + else if constexpr ((flags & perm_compress) != 0) { + y = _mm256_maskz_compress_epi32(__mmask8(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm256_maskz_expand_epi32(__mmask8(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif + else { // different patterns in two lanes +#if INSTRSET >= 10 // AVX512VL + if constexpr ((flags & perm_rotate_big) != 0) { // fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + return _mm256_maskz_alignr_epi32(zero_mask<8>(indexs), y, y, rot); + } + else +#endif + if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. 
Use pshufb + const EList bm = pshufb_mask(indexs); + return _mm256_shuffle_epi8(a, Vec8i().load(bm.a)); + } + // full permute needed + __m256i permmask = constant8ui < + i0 & 7, i1 & 7, i2 & 7, i3 & 7, i4 & 7, i5 & 7, i6 & 7, i7 & 7 > (); +#if INSTRSET >= 10 // AVX512VL + return _mm256_maskz_permutexvar_epi32 (zero_mask<8>(indexs), permmask, y); +#else + y =_mm256_permutevar8x32_epi32(y, permmask); +#endif + } + } + if constexpr ((flags & perm_zeroing) != 0) { + // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi32(zero_mask<8>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec8i().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec8ui permute8(Vec8ui const a) { + return Vec8ui (permute8 (Vec8i(a))); +} + + +// Permute vector of 16 16-bit integers. +// Index -1 gives 0, index V_DC means don't care. +template +static inline Vec16s permute16(Vec16s const a) { + int constexpr indexs[16] = { // indexes as array + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; + __m256i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<16>(indexs); // permutation pattern + y = permute8 (Vec8i(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in both lanes + // try to fit various instructions + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm256_unpackhi_epi16(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm256_unpacklo_epi16(y, y); + } + else if constexpr ((flags & perm_rotate) != 0) { // fits palignr. rotate within lanes + y = _mm256_alignr_epi8(a, a, (flags >> perm_rot_count) & 0xF); + } + else { + // flags for 16 bit permute instructions + constexpr uint64_t flags16 = perm16_flags(indexs); + constexpr bool L2L = (flags16 & 1) != 0; // from low to low 64-bit part + constexpr bool H2H = (flags16 & 2) != 0; // from high to high 64-bit part + constexpr bool H2L = (flags16 & 4) != 0; // from high to low 64-bit part + constexpr bool L2H = (flags16 & 8) != 0; // from low to high 64-bit part + constexpr uint8_t pL2L = uint8_t(flags16 >> 32);// low to low permute pattern + constexpr uint8_t pH2H = uint8_t(flags16 >> 40);// high to high permute pattern + constexpr uint8_t noperm = 0xE4; // pattern for no permute + if constexpr (!H2L && !L2H) { // simple case. 
no crossing of 64-bit boundary + if constexpr (L2L && pL2L != noperm) { + y = _mm256_shufflelo_epi16(y, pL2L); // permute low 64-bits + } + if constexpr (H2H && pH2H != noperm) { + y = _mm256_shufflehi_epi16(y, pH2H); // permute high 64-bits + } + } + else { // use pshufb + const EList bm = pshufb_mask(indexs); + return _mm256_shuffle_epi8(a, Vec16s().load(bm.a)); + } + } + } + else { // different patterns in two lanes + if constexpr ((flags & perm_zext) != 0) { // fits zero extension + y = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(y)); // zero extension + if constexpr ((flags & perm_addz2) == 0) return y; + } +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + else if constexpr ((flags & perm_compress) != 0) { + y = _mm256_maskz_compress_epi16(__mmask16(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm256_maskz_expand_epi16(__mmask16(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif // AVX512VBMI2 + else if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. Use pshufb + const EList bm = pshufb_mask(indexs); + return _mm256_shuffle_epi8(a, Vec16s().load(bm.a)); + } + else if constexpr ((flags & perm_rotate_big) != 0) {// fits full rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count) * 2; // rotate count + __m256i swap = _mm256_permute4x64_epi64(a,0x4E);// swap 128-bit halves + if (rot <= 16) { + y = _mm256_alignr_epi8(swap, y, rot); + } + else { + y = _mm256_alignr_epi8(y, swap, rot & 15); + } + } + else if constexpr ((flags & perm_broadcast) != 0 && (flags >> perm_rot_count) == 0) { + y = _mm256_broadcastw_epi16(_mm256_castsi256_si128(y)); // broadcast first element + } + else { // full permute needed +#if INSTRSET >= 10 // AVX512VL + const EList bm = perm_mask_broad(indexs); + y = _mm256_permutexvar_epi16(Vec16s().load(bm.a), y); +#else // no full permute instruction available + __m256i swap = _mm256_permute4x64_epi64(y,0x4E);// swap high and low 128-bit lane + const EList bm1 = pshufb_mask(indexs); + const EList bm2 = pshufb_mask(indexs); + __m256i r1 = _mm256_shuffle_epi8(swap, Vec16s().load(bm1.a)); + __m256i r2 = _mm256_shuffle_epi8(y, Vec16s().load(bm2.a)); + return _mm256_or_si256(r1, r2); +#endif + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi16(zero_mask<16>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec16s().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec16us permute16(Vec16us const a) { + return Vec16us (permute16 (Vec16s(a))); +} + + +template +static inline Vec32c permute32(Vec32c const a) { + int constexpr indexs[32] = { // indexes as array + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 }; + + __m256i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = 
largeblock_perm<32>(indexs); // permutation pattern + y = permute16 (Vec16s(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in both lanes + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm256_unpackhi_epi8(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm256_unpacklo_epi8(y, y); + } + else if constexpr ((flags & perm_rotate) != 0) { // fits palignr. rotate within lanes + y = _mm256_alignr_epi8(a, a, (flags >> perm_rot_count) & 0xF); + } + else { // use pshufb + const EList bm = pshufb_mask(indexs); + return _mm256_shuffle_epi8(a, Vec32c().load(bm.a)); + } + } + else { // different patterns in two lanes + if constexpr ((flags & perm_zext) != 0) { // fits zero extension + y = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(y)); // zero extension + if constexpr ((flags & perm_addz2) == 0) return y; + } +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + else if constexpr ((flags & perm_compress) != 0) { + y = _mm256_maskz_compress_epi8(__mmask32(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm256_maskz_expand_epi8(__mmask32(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif // AVX512VBMI2 + else if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. Use pshufb + const EList bm = pshufb_mask(indexs); + return _mm256_shuffle_epi8(a, Vec32c().load(bm.a)); + } + else if constexpr ((flags & perm_rotate_big) != 0) {// fits full rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotate count + __m256i swap = _mm256_permute4x64_epi64(a,0x4E);// swap 128-bit halves + if (rot <= 16) { + y = _mm256_alignr_epi8(swap, y, rot); + } + else { + y = _mm256_alignr_epi8(y, swap, rot & 15); + } + } + else if constexpr ((flags & perm_broadcast) != 0 && (flags >> perm_rot_count) == 0) { + y = _mm256_broadcastb_epi8(_mm256_castsi256_si128(y)); // broadcast first element + } + else { // full permute needed +#if INSTRSET >= 10 && defined ( __AVX512VBMI__ ) // AVX512VBMI + const EList bm = perm_mask_broad(indexs); + y = _mm256_permutexvar_epi8(Vec32c().load(bm.a), y); +#else + // no full permute instruction available + __m256i swap = _mm256_permute4x64_epi64(y, 0x4E); // swap high and low 128-bit lane + const EList bm1 = pshufb_mask(indexs); + const EList bm2 = pshufb_mask(indexs); + __m256i r1 = _mm256_shuffle_epi8(swap, Vec32c().load(bm1.a)); + __m256i r2 = _mm256_shuffle_epi8(y, Vec32c().load(bm2.a)); + return _mm256_or_si256(r1, r2); +#endif + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi8(zero_mask<32>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec32c().load(bm.a), y); +#endif + } + return y; +} + +template < + int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, + int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15, + int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, + int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 > + static inline Vec32uc permute32(Vec32uc const a) { + return Vec32uc (permute32 (Vec32c(a))); +} + + +/***************************************************************************** +* +* Vector blend functions +* 
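+* These functions pick each result element from one of two source vectors,
+* selected by a compile-time index: 0..n-1 takes from the first operand,
+* n..2n-1 from the second, -1 gives zero and V_DC means don't care.
+* Usage sketch (hypothetical values):
+*     Vec4q a(10,11,12,13), b(20,21,22,23);
+*     Vec4q c = blend4<0,4,1,5>(a, b);   // c = (10, 20, 11, 21)
+*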
+*****************************************************************************/ + +// permute and blend Vec4q +template +static inline Vec4q blend4(Vec4q const a, Vec4q const b) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + __m256i y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute4 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + return permute4 (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint8_t mb = (uint8_t)make_bit_mask<4, 0x302>(indexs); // blend mask +#if INSTRSET >= 10 // AVX512VL + y = _mm256_mask_mov_epi64 (a, mb, b); +#else // AVX2 + y = _mm256_blend_epi32(a, b, ((mb & 1) | (mb & 2) << 1 | (mb & 4) << 2 | (mb & 8) << 3) * 3); // duplicate each bit +#endif + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 128-bit blocks + constexpr EList L = largeblock_perm<4>(indexs); // get 128-bit blend pattern + constexpr uint8_t pp = (L.a[0] & 0xF) | uint8_t(L.a[1] & 0xF) << 4; + y = _mm256_permute2x128_si256(a, b, pp); + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm256_unpacklo_epi64 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm256_unpacklo_epi64 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm256_unpackhi_epi64 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm256_unpackhi_epi64 (b, a); + } + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm256_alignr_epi8(a, b, flags >> blend_rotpattern); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm256_alignr_epi8(b, a, flags >> blend_rotpattern); + } +#if ALLOW_FP_PERMUTE // allow floating point permute instructions on integer vectors + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), (flags >> blend_shufpattern) & 0xF)); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), (flags >> blend_shufpattern) & 0xF)); + } +#endif + else { // No special cases +#if INSTRSET >= 10 // AVX512VL. use vpermi2q + __m256i const maskp = constant8ui(); + return _mm256_maskz_permutex2var_epi64 (zero_mask<4>(indexs), a, maskp, b); +#else // permute a and b separately, then blend. 
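+        // Illustrative note (not part of the upstream source): on plain AVX2,
+        // e.g. blend4<4,1,6,3>(a,b) is decomposed roughly into a permute of a
+        // keeping positions 1 and 3, a permute of b keeping positions 0 and 2,
+        // joined by the constant 32-bit blend mask built below.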
+ constexpr EList L = blend_perm_indexes<4, 0>(indexs); // get permutation indexes + __m256i ya = permute4(a); + __m256i yb = permute4(b); + constexpr uint8_t mb = (uint8_t)make_bit_mask<4, 0x302>(indexs); // blend mask + y = _mm256_blend_epi32(ya, yb, ((mb & 1) | (mb & 2) << 1 | (mb & 4) << 2 | (mb & 8) << 3) * 3); // duplicate each bit +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi64(zero_mask<4>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec4q().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec4uq blend4(Vec4uq const a, Vec4uq const b) { + return Vec4uq(blend4 (Vec4q(a),Vec4q(b))); +} + + +// permute and blend Vec8i +template +static inline Vec8i blend8(Vec8i const a, Vec8i const b) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m256i y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 32-bit blocks + constexpr EList L = largeblock_perm<8>(indexs); // get 32-bit blend pattern + y = blend4 (Vec4q(a), Vec4q(b)); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute8 (a); + } + else if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<8, 2>(indexs); // get permutation indexes + return permute8 < L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15] > (b); + } + else if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint8_t mb = (uint8_t)make_bit_mask<8, 0x303>(indexs); // blend mask +#if INSTRSET >= 10 // AVX512VL + y = _mm256_mask_mov_epi32 (a, mb, b); +#else // AVX2 + y = _mm256_blend_epi32(a, b, mb); +#endif + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm256_unpacklo_epi32 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm256_unpacklo_epi32 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm256_unpackhi_epi32 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm256_unpackhi_epi32 (b, a); + } + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm256_alignr_epi8(a, b, flags >> blend_rotpattern); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm256_alignr_epi8(b, a, flags >> blend_rotpattern); + } +#if ALLOW_FP_PERMUTE // allow floating point permute instructions on integer vectors + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), uint8_t(flags >> blend_shufpattern))); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), uint8_t(flags >> blend_shufpattern))); + } +#endif + else { // No special cases +#if INSTRSET >= 10 // AVX512VL. 
use vpermi2d + __m256i const maskp = constant8ui (); + return _mm256_maskz_permutex2var_epi32 (zero_mask<8>(indexs), a, maskp, b); +#else // permute a and b separately, then blend. + constexpr EList L = blend_perm_indexes<8, 0>(indexs); // get permutation indexes + __m256i ya = permute8(a); + __m256i yb = permute8(b); + constexpr uint8_t mb = (uint8_t)make_bit_mask<8, 0x303>(indexs); // blend mask + y = _mm256_blend_epi32(ya, yb, mb); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi32(zero_mask<8>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec8i().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec8ui blend8(Vec8ui const a, Vec8ui const b) { + return Vec8ui( blend8 (Vec8i(a),Vec8i(b))); +} + + +// permute and blend Vec16s +template + static inline Vec16s blend16(Vec16s const a, Vec16s const b) { + int constexpr indexs[16] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 };// indexes as array + __m256i y = a; // result + constexpr uint64_t flags = blend_flags(indexs);// get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 32-bit blocks + constexpr EList L = largeblock_perm<16>(indexs); // get 32-bit blend pattern + y = blend8 (Vec8i(a), Vec8i(b)); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute16 (a); + } + else if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<16, 2>(indexs); // get permutation indexes + return permute16 < + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]> (b); + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm256_unpacklo_epi16 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm256_unpacklo_epi16 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm256_unpackhi_epi16 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm256_unpackhi_epi16 (b, a); + } + else if constexpr ((flags & blend_rotateab) != 0) { + y = _mm256_alignr_epi8(a, b, flags >> blend_rotpattern); + } + else if constexpr ((flags & blend_rotateba) != 0) { + y = _mm256_alignr_epi8(b, a, flags >> blend_rotpattern); + } + else { // No special cases +#if INSTRSET >= 10 // AVX512VL. use vpermi2w + if constexpr ((flags & (blend_perma | blend_permb)) != 0) { + const EList bm = perm_mask_broad(indexs); + return _mm256_maskz_permutex2var_epi16(zero_mask<16>(indexs), a, Vec16s().load(bm.a), b); + } +#endif + // permute a and b separately, then blend. 
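+        // Descriptive note: each operand is permuted with permute16 using its
+        // half of the split index list, then the two results are merged; on
+        // plain AVX2 a constant _mm256_blend_epi16 is used when both 128-bit
+        // lanes share the same pattern, otherwise _mm256_blendv_epi8 with a
+        // broad byte mask; with AVX512VL a masked move does the merge.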
+ Vec16s ya = a, yb = b; // a and b permuted + constexpr EList L = blend_perm_indexes<16, 0>(indexs); // get permutation indexes + if constexpr ((flags & blend_perma) != 0) { + ya = permute16< + L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], + L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15] >(ya); + } + if constexpr ((flags & blend_permb) != 0) { + yb = permute16< + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31] >(yb); + } + constexpr uint16_t mb = (uint16_t)make_bit_mask<16, 0x304>(indexs); // blend mask +#if INSTRSET >= 10 // AVX512VL + y = _mm256_mask_mov_epi16 (ya, mb, yb); +#else // AVX2 + if ((flags & blend_same_pattern) != 0) { // same blend pattern in both 128-bit lanes + y = _mm256_blend_epi16(ya, yb, (uint8_t)mb); + } + else { + const EList bm = make_broad_mask(mb); + y = _mm256_blendv_epi8 (ya, yb, Vec16s().load(bm.a)); + } +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi16(zero_mask<16>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec16s().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec16us blend16(Vec16us const a, Vec16us const b) { + return Vec16us( blend16 (Vec16s(a),Vec16s(b))); +} + + +// permute and blend Vec32c +template +static inline Vec32c blend32(Vec32c const a, Vec32c const b) { + int constexpr indexs[32] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23,i24, i25, i26, i27, i28, i29, i30, i31 }; // indexes as array + __m256i y = a; // result + constexpr uint64_t flags = blend_flags(indexs);// get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm256_setzero_si256(); // just return zero + + if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 16-bit blocks + constexpr EList L = largeblock_perm<32>(indexs); // get 16-bit blend pattern + y = blend16 < L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], + L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15] > + (Vec16s(a), Vec16s(b)); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute32 (a); + } + else if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<32, 2>(indexs); // get permutation indexes + return permute32 < + L.a[32], L.a[33], L.a[34], L.a[35], L.a[36], L.a[37], L.a[38], L.a[39], + L.a[40], L.a[41], L.a[42], L.a[43], L.a[44], L.a[45], L.a[46], L.a[47], + L.a[48], L.a[49], L.a[50], L.a[51], L.a[52], L.a[53], L.a[54], L.a[55], + L.a[56], L.a[57], L.a[58], L.a[59], L.a[60], L.a[61], L.a[62], L.a[63] > (b); + } + else { // No special cases +#if INSTRSET >= 10 && defined (__AVX512VBMI__) // AVX512VL + AVX512VBMI. use vpermi2b + if constexpr ((flags & (blend_perma | blend_permb)) != 0) { + const EList bm = perm_mask_broad(indexs); + return _mm256_maskz_permutex2var_epi8(zero_mask<32>(indexs), a, Vec32c().load(bm.a), b); + } +#endif + // permute a and b separately, then blend. 
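+        // Descriptive note: same fallback strategy as blend16 - permute each
+        // operand with permute32, then merge with a 32-element mask
+        // (_mm256_mask_mov_epi8 under AVX512VL, _mm256_blendv_epi8 on AVX2).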
+ Vec32c ya = a, yb = b; // a and b permuted + constexpr EList L = blend_perm_indexes<32, 0>(indexs); // get permutation indexes + if constexpr ((flags & blend_perma) != 0) { + ya = permute32 < + L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], + L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15], + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31] > (ya); + } + if constexpr ((flags & blend_permb) != 0) { + yb = permute32 < + L.a[32], L.a[33], L.a[34], L.a[35], L.a[36], L.a[37], L.a[38], L.a[39], + L.a[40], L.a[41], L.a[42], L.a[43], L.a[44], L.a[45], L.a[46], L.a[47], + L.a[48], L.a[49], L.a[50], L.a[51], L.a[52], L.a[53], L.a[54], L.a[55], + L.a[56], L.a[57], L.a[58], L.a[59], L.a[60], L.a[61], L.a[62], L.a[63] > (yb); + } + constexpr uint32_t mb = (uint32_t)make_bit_mask<32, 0x305>(indexs);// blend mask +#if INSTRSET >= 10 // AVX512VL + y = _mm256_mask_mov_epi8 (ya, mb, yb); +#else // AVX2 + const EList bm = make_broad_mask(mb); + y = _mm256_blendv_epi8 (ya, yb, Vec32c().load(bm.a)); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed +#if INSTRSET >= 10 // use compact mask + y = _mm256_maskz_mov_epi8(zero_mask<32>(indexs), y); +#else // use broad mask + const EList bm = zero_mask_broad(indexs); + y = _mm256_and_si256(Vec32c().load(bm.a), y); +#endif + } + return y; +} + +template +static inline Vec32uc blend32(Vec32uc const a, Vec32uc const b) { + return Vec32uc (blend32 (Vec32c(a), Vec32c(b))); +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. +* The table is given as one or more vectors or as an array. +* +*****************************************************************************/ + +static inline Vec32c lookup32(Vec32c const index, Vec32c const table) { +#ifdef __XOP__ // AMD XOP instruction set. Use VPPERM + Vec16c t0 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low()); + Vec16c t1 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high()); + return Vec32c(t0, t1); +#else + Vec32c f0 = constant8ui<0,0,0,0,0x10101010,0x10101010,0x10101010,0x10101010>(); + Vec32c f1 = constant8ui<0x10101010,0x10101010,0x10101010,0x10101010,0,0,0,0>(); + Vec32c tablef = _mm256_permute4x64_epi64(table, 0x4E); // low and high parts swapped + Vec32c r0 = _mm256_shuffle_epi8(table, (index ^ f0) + 0x70); + Vec32c r1 = _mm256_shuffle_epi8(tablef, (index ^ f1) + 0x70); + return r0 | r1; +#endif +} + +template +static inline Vec32c lookup(Vec32uc const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 16) { + Vec16c tt = Vec16c().load(table); + Vec16c r0 = lookup16(index.get_low(), tt); + Vec16c r1 = lookup16(index.get_high(), tt); + return Vec32c(r0, r1); + } + if constexpr (n <= 32) return lookup32(index, Vec32c().load(table)); + // n > 32. Limit index + Vec32uc index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec32uc(index) & uint8_t(n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec32uc(index), uint8_t(n-1)); + } + Vec8ui mask0 = Vec8ui(0x000000FF); // mask 8 bits + Vec32c t0 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & Vec8ui(index1)), 1); // positions 0, 4, 8, ... 
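+    // Descriptive note: there is no byte gather instruction, so four 32-bit
+    // gathers fetch the table entries addressed by byte 0, 1, 2 and 3 of each
+    // index dword; the wanted byte of each partial result is masked out and
+    // the four parts are OR'ed together below.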
+ Vec32c t1 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1, 8)), 1); // positions 1, 5, 9, ... + Vec32c t2 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1,16)), 1); // positions 2, 6, 10, ... + Vec32c t3 = _mm256_i32gather_epi32((const int *)table, _mm256_srli_epi32(index1,24), 1); // positions 3, 7, 11, ... + t0 = t0 & Vec32c(mask0); + t1 = _mm256_slli_epi32(t1 & Vec32c(mask0), 8); + t2 = _mm256_slli_epi32(t2 & Vec32c(mask0), 16); + t3 = _mm256_slli_epi32(t3, 24); + return (t0 | t3) | (t1 | t2); +} + +template +static inline Vec32c lookup(Vec32c const index, void const * table) { + return lookup(Vec32uc(index), table); +} + + +static inline Vec16s lookup16(Vec16s const index, Vec16s const table) { + return Vec16s(lookup32(Vec32c(index * 0x202 + 0x100), Vec32c(table))); +} + +template +static inline Vec16s lookup(Vec16s const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) { + Vec8s table1 = Vec8s().load(table); + return Vec16s( + lookup8 (index.get_low(), table1), + lookup8 (index.get_high(), table1)); + } + if constexpr (n <= 16) return lookup16(index, Vec16s().load(table)); + // n > 16. Limit index + Vec16us index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec16us(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec16us(index), n-1); + } + Vec16s t1 = _mm256_i32gather_epi32((const int *)table, __m256i(Vec8ui(index1) & 0x0000FFFF), 2); // even positions + Vec16s t2 = _mm256_i32gather_epi32((const int *)table, _mm256_srli_epi32(index1, 16) , 2); // odd positions + return blend16<0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30>(t1, t2); +} + +static inline Vec8i lookup8(Vec8i const index, Vec8i const table) { + return _mm256_permutevar8x32_epi32(table, index); +} + +template +static inline Vec8i lookup(Vec8i const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) { + Vec8i table1 = Vec8i().load(table); + return lookup8(index, table1); + } + if constexpr (n <= 16) { + Vec8i table1 = Vec8i().load(table); + Vec8i table2 = Vec8i().load((int32_t const*)table + 8); + Vec8i y1 = lookup8(index, table1); + Vec8i y2 = lookup8(index, table2); + Vec8ib s = index > 7; + return select(s, y2, y1); + } + // n > 16. Limit index + Vec8ui index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec8ui(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec8ui(index), n-1); + } + return _mm256_i32gather_epi32((const int *)table, index1, 4); +} + +static inline Vec4q lookup4(Vec4q const index, Vec4q const table) { + return Vec4q(lookup8(Vec8i(index * 0x200000002ll + 0x100000000ll), Vec8i(table))); +} + +template +static inline Vec4q lookup(Vec4q const index, int64_t const * table) { + if constexpr (n <= 0) return 0; + // n > 0. Limit index + Vec4uq index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec4uq(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1. + // There is no 64-bit min instruction, but we can use the 32-bit unsigned min, + // since n is a 32-bit integer + index1 = Vec4uq(min(Vec8ui(index), constant8ui())); + } +/* old compilers can't agree how to define a 64 bit integer. 
Intel and MS use __int64, gcc use long long +#if defined (__clang__) && CLANG_VERSION < 30400 +// clang 3.3 uses const int * in accordance with official Intel doc., which is wrong. will be fixed + return _mm256_i64gather_epi64((const int *)table, index1, 8); +#elif defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) +// Old MS and Intel use non-standard type __int64 + return _mm256_i64gather_epi64((const int64_t *)table, index1, 8); +#else +// Gnu, Clang 3.4, MS 11.0 +*/ + return _mm256_i64gather_epi64((const long long *)table, index1, 8); +//#endif +} + + +/***************************************************************************** +* +* Byte shifts +* +*****************************************************************************/ + +// Function shift_bytes_up: shift whole vector left by b bytes. +template +static inline Vec32c shift_bytes_up(Vec32c const a) { + __m256i ahi, alo; + if constexpr (b == 0) return a; +#if INSTRSET >= 10 // AVX512VL + else if constexpr ((b & 3) == 0) { // b is divisible by 4 + return _mm256_alignr_epi32(a, _mm256_setzero_si256(), (8 - (b >> 2)) & 7); + } +#endif + else if constexpr (b < 16) { + alo = a; + ahi = _mm256_inserti128_si256 (_mm256_setzero_si256(), _mm256_castsi256_si128(a), 1);// shift a 16 bytes up, zero lower part + } + else if constexpr (b < 32) { + alo = _mm256_inserti128_si256 (_mm256_setzero_si256(), _mm256_castsi256_si128(a), 1);// shift a 16 bytes up, zero lower part + ahi = _mm256_setzero_si256(); + } + else { + return _mm256_setzero_si256(); // zero + } + if constexpr ((b & 0xF) == 0) return alo; // modulo 16. no more shift needeed + return _mm256_alignr_epi8(alo, ahi, 16-(b & 0xF)); // shift within 16-bytes lane +} + +// Function shift_bytes_down: shift whole vector right by b bytes +template +static inline Vec32c shift_bytes_down(Vec32c const a) { +#if INSTRSET >= 10 // AVX512VL + if constexpr ((b & 3) == 0) { // b is divisible by 4 + return _mm256_alignr_epi32(_mm256_setzero_si256(), a, (b >> 2) & 7); + } +#endif + __m256i ahi, alo; + if constexpr (b < 16) { + // shift a 16 bytes down, zero upper part + alo = _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_extracti128_si256(a, 1), 0);// make sure the upper part is zero (otherwise, an optimizing compiler can mess it up) + ahi = a; + } + else if constexpr (b < 32) { + alo = _mm256_setzero_si256(); // zero + ahi = _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_extracti128_si256(a, 1), 0);// shift a 16 bytes down, zero upper part + } + else { + return _mm256_setzero_si256(); // zero + } + if constexpr ((b & 0xF) == 0) return ahi; // modulo 16. 
no more shift needeed + return _mm256_alignr_epi8(alo, ahi, b & 0xF); // shift within 16-bytes lane +} + + +/***************************************************************************** +* +* Gather functions with fixed indexes +* +*****************************************************************************/ +// Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7 +template +static inline Vec8i gather8i(void const * a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 7) { + // load one contiguous block and permute + if constexpr (imax > 7) { + // make sure we don't read past the end of the array + Vec8i b = Vec8i().load((int32_t const *)a + imax-7); + return permute8(b); + } + else { + Vec8i b = Vec8i().load((int32_t const *)a + imin); + return permute8(b); + } + } + if constexpr ((i0imax-8) && (i1imax-8) && (i2imax-8) && (i3imax-8) + && (i4imax-8) && (i5imax-8) && (i6imax-8) && (i7imax-8)) { + // load two contiguous blocks and blend + Vec8i b = Vec8i().load((int32_t const *)a + imin); + Vec8i c = Vec8i().load((int32_t const *)a + imax-7); + constexpr int j0 = i0(b, c); + } + // use AVX2 gather + return _mm256_i32gather_epi32((const int *)a, Vec8i(i0,i1,i2,i3,i4,i5,i6,i7), 4); +} + +template +static inline Vec4q gather4q(void const * a) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 3) { + // load one contiguous block and permute + if constexpr (imax > 3) { + // make sure we don't read past the end of the array + Vec4q b = Vec4q().load((int64_t const *)a + imax-3); + return permute4(b); + } + else { + Vec4q b = Vec4q().load((int64_t const *)a + imin); + return permute4(b); + } + } + if constexpr ((i0imax-4) && (i1imax-4) && (i2imax-4) && (i3imax-4)) { + // load two contiguous blocks and blend + Vec4q b = Vec4q().load((int64_t const *)a + imin); + Vec4q c = Vec4q().load((int64_t const *)a + imax-3); + const int j0 = i0(b, c); + } + // use AVX2 gather + return _mm256_i32gather_epi64((const long long *)a, Vec4i(i0,i1,i2,i3), 8); +} + + +/***************************************************************************** +* +* Vector scatter functions +* +****************************************************************************** +* +* These functions write the elements of a vector to arbitrary positions in an +* array in memory. Each vector element is written to an array position +* determined by an index. An element is not written if the corresponding +* index is out of range. +* The indexes can be specified as constant template parameters or as an +* integer vector. 
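+*
+* Usage sketch (hypothetical data): scatter<0,2,4,6,-1,-1,-1,-1>(data, arr)
+* writes data[0..3] to arr[0], arr[2], arr[4] and arr[6], and skips the last
+* four elements because their indexes are negative.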
+* +*****************************************************************************/ + +template +static inline void scatter(Vec8i const data, void * array) { +#if INSTRSET >= 10 // __AVX512VL__ + __m256i indx = constant8ui(); + __mmask8 mask = uint8_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3) | + ((i4>=0)<<4) | ((i5>=0)<<5) | ((i6>=0)<<6) | ((i7>=0)<<7)); + _mm256_mask_i32scatter_epi32((int*)array, mask, indx, data, 4); +#elif INSTRSET >= 9 // __AVX512F__ + __m512i indx = _mm512_castsi256_si512(constant8ui()); + __mmask16 mask = uint16_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3) | + ((i4>=0)<<4) | ((i5>=0)<<5) | ((i6>=0)<<6) | ((i7>=0)<<7)); + _mm512_mask_i32scatter_epi32((int*)array, mask, indx, _mm512_castsi256_si512(data), 4); +#else + int32_t* arr = (int32_t*)array; + const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7}; + for (int i = 0; i < 8; i++) { + if (index[i] >= 0) arr[index[i]] = data[i]; + } +#endif +} + +template +static inline void scatter(Vec4q const data, void * array) { +#if INSTRSET >= 10 // __AVX512VL__ + __m128i indx = constant4ui(); + __mmask8 mask = uint8_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3)); + _mm256_mask_i32scatter_epi64((long long *)array, mask, indx, data, 8); +#elif INSTRSET >= 9 // __AVX512F__ + __m256i indx = _mm256_castsi128_si256(constant4ui()); + __mmask16 mask = uint16_t((i0>=0) | ((i1>=0)<<1) | ((i2>=0)<<2) | ((i3>=0)<<3)); + _mm512_mask_i32scatter_epi64((long long*)array, (__mmask8)mask, indx, _mm512_castsi256_si512(data), 8); +#else + int64_t* arr = (int64_t*)array; + const int index[4] = {i0,i1,i2,i3}; + for (int i = 0; i < 4; i++) { + if (index[i] >= 0) arr[index[i]] = data[i]; + } +#endif +} + + +/***************************************************************************** +* +* Scatter functions with variable indexes +* +*****************************************************************************/ + +static inline void scatter(Vec8i const index, uint32_t limit, Vec8i const data, void * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask8 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit)); + _mm256_mask_i32scatter_epi32((int*)destination, mask, index, data, 4); +#elif INSTRSET >= 9 // __AVX512F__ + // 16 bit mask, upper 8 bits are 0. Usually, we can rely on the upper bit of an extended vector to be zero, but we will mask then off the be sure + //__mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit))); + __mmask16 mask = _mm512_mask_cmplt_epu32_mask(0xFF, _mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit))); + _mm512_mask_i32scatter_epi32((int*)destination, mask, _mm512_castsi256_si512(index), _mm512_castsi256_si512(data), 4); +#else + int32_t* arr = (int32_t*)destination; + for (int i = 0; i < 8; i++) { + if (uint32_t(index[i]) < limit) arr[index[i]] = data[i]; + } +#endif +} + +static inline void scatter(Vec4q const index, uint32_t limit, Vec4q const data, void * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask8 mask = _mm256_cmplt_epu64_mask(index, Vec4uq(uint64_t(limit))); + _mm256_mask_i64scatter_epi64((long long*)destination, mask, index, data, 8); +#elif INSTRSET >= 9 // __AVX512F__ + // 16 bit mask. 
upper 12 bits are 0 + __mmask16 mask = _mm512_mask_cmplt_epu64_mask(0xF, _mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec4uq(uint64_t(limit)))); + _mm512_mask_i64scatter_epi64((long long*)destination, (__mmask8)mask, _mm512_castsi256_si512(index), _mm512_castsi256_si512(data), 8); +#else + int64_t* arr = (int64_t*)destination; + for (int i = 0; i < 4; i++) { + if (uint64_t(index[i]) < uint64_t(limit)) arr[index[i]] = data[i]; + } +#endif +} + +static inline void scatter(Vec4i const index, uint32_t limit, Vec4q const data, void * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask8 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit)); + _mm256_mask_i32scatter_epi64((long long*)destination, mask, index, data, 8); +#elif INSTRSET >= 9 // __AVX512F__ + // 16 bit mask. upper 12 bits are 0 + __mmask16 mask = _mm512_mask_cmplt_epu32_mask(0xF, _mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit))); + _mm512_mask_i32scatter_epi64((long long*)destination, (__mmask8)mask, _mm256_castsi128_si256(index), _mm512_castsi256_si512(data), 8); +#else + int64_t* arr = (int64_t*)destination; + for (int i = 0; i < 4; i++) { + if (uint32_t(index[i]) < limit) arr[index[i]] = data[i]; + } +#endif +} + +/***************************************************************************** +* +* Functions for conversion between integer sizes +* +*****************************************************************************/ + +// Extend 8-bit integers to 16-bit integers, signed and unsigned + +// Function extend_low : extends the low 16 elements to 16 bits with sign extension +static inline Vec16s extend_low (Vec32c const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0x10); // get bits 64-127 to position 128-191 + __m256i sign = _mm256_cmpgt_epi8(_mm256_setzero_si256(),a2); // 0 > a2 + return _mm256_unpacklo_epi8(a2, sign); // interleave with sign extensions +} + +// Function extend_high : extends the high 16 elements to 16 bits with sign extension +static inline Vec16s extend_high (Vec32c const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0xC8); // get bits 128-191 to position 64-127 + __m256i sign = _mm256_cmpgt_epi8(_mm256_setzero_si256(),a2); // 0 > a2 + return _mm256_unpackhi_epi8(a2, sign); // interleave with sign extensions +} + +// Function extend_low : extends the low 16 elements to 16 bits with zero extension +static inline Vec16us extend_low (Vec32uc const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0x10); // get bits 64-127 to position 128-191 + return _mm256_unpacklo_epi8(a2, _mm256_setzero_si256()); // interleave with zero extensions +} + +// Function extend_high : extends the high 19 elements to 16 bits with zero extension +static inline Vec16us extend_high (Vec32uc const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0xC8); // get bits 128-191 to position 64-127 + return _mm256_unpackhi_epi8(a2, _mm256_setzero_si256()); // interleave with zero extensions +} + +// Extend 16-bit integers to 32-bit integers, signed and unsigned + +// Function extend_low : extends the low 8 elements to 32 bits with sign extension +static inline Vec8i extend_low (Vec16s const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0x10); // get bits 64-127 to position 128-191 + __m256i sign = _mm256_srai_epi16(a2, 15); // sign bit + return _mm256_unpacklo_epi16(a2 ,sign); // interleave with sign extensions +} + +// Function extend_high : extends the high 8 elements to 32 bits with sign extension +static inline Vec8i extend_high (Vec16s const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0xC8); // 
get bits 128-191 to position 64-127 + __m256i sign = _mm256_srai_epi16(a2, 15); // sign bit + return _mm256_unpackhi_epi16(a2, sign); // interleave with sign extensions +} + +// Function extend_low : extends the low 8 elements to 32 bits with zero extension +static inline Vec8ui extend_low (Vec16us const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0x10); // get bits 64-127 to position 128-191 + return _mm256_unpacklo_epi16(a2, _mm256_setzero_si256()); // interleave with zero extensions +} + +// Function extend_high : extends the high 8 elements to 32 bits with zero extension +static inline Vec8ui extend_high (Vec16us const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0xC8); // get bits 128-191 to position 64-127 + return _mm256_unpackhi_epi16(a2, _mm256_setzero_si256()); // interleave with zero extensions +} + +// Extend 32-bit integers to 64-bit integers, signed and unsigned + +// Function extend_low : extends the low 4 elements to 64 bits with sign extension +static inline Vec4q extend_low (Vec8i const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0x10); // get bits 64-127 to position 128-191 + __m256i sign = _mm256_srai_epi32(a2, 31); // sign bit + return _mm256_unpacklo_epi32(a2, sign); // interleave with sign extensions +} + +// Function extend_high : extends the high 4 elements to 64 bits with sign extension +static inline Vec4q extend_high (Vec8i const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0xC8); // get bits 128-191 to position 64-127 + __m256i sign = _mm256_srai_epi32(a2, 31); // sign bit + return _mm256_unpackhi_epi32(a2, sign); // interleave with sign extensions +} + +// Function extend_low : extends the low 4 elements to 64 bits with zero extension +static inline Vec4uq extend_low (Vec8ui const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0x10); // get bits 64-127 to position 128-191 + return _mm256_unpacklo_epi32(a2, _mm256_setzero_si256()); // interleave with zero extensions +} + +// Function extend_high : extends the high 4 elements to 64 bits with zero extension +static inline Vec4uq extend_high (Vec8ui const a) { + __m256i a2 = _mm256_permute4x64_epi64(a, 0xC8); // get bits 128-191 to position 64-127 + return _mm256_unpackhi_epi32(a2, _mm256_setzero_si256()); // interleave with zero extensions +} + +// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Overflow wraps around +static inline Vec32c compress (Vec16s const low, Vec16s const high) { + __m256i mask = _mm256_set1_epi32(0x00FF00FF); // mask for low bytes + __m256i lowm = _mm256_and_si256(low, mask); // bytes of low + __m256i highm = _mm256_and_si256(high, mask); // bytes of high + __m256i pk = _mm256_packus_epi16(lowm, highm); // unsigned pack + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Signed, with saturation +static inline Vec32c compress_saturated (Vec16s const low, Vec16s const high) { + __m256i pk = _mm256_packs_epi16(low,high); // packed with signed saturation + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers +// Unsigned, overflow wraps around +static inline Vec32uc compress (Vec16us const low, Vec16us const high) { + return Vec32uc (compress((Vec16s)low, (Vec16s)high)); +} + +// Function compress : packs two 
vectors of 16-bit integers into one vector of 8-bit integers +// Unsigned, with saturation +static inline Vec32uc compress_saturated (Vec16us const low, Vec16us const high) { + __m256i maxval = _mm256_set1_epi32(0x00FF00FF); // maximum value + __m256i low1 = _mm256_min_epu16(low,maxval); // upper limit + __m256i high1 = _mm256_min_epu16(high,maxval); // upper limit + __m256i pk = _mm256_packus_epi16(low1,high1); // this instruction saturates from signed 32 bit to unsigned 16 bit + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Signed to unsigned, with saturation +static inline Vec32uc compress_saturated_s2u (Vec16s const low, Vec16s const high) { + __m256i pk = _mm256_packus_epi16(low,high); // this instruction saturates from signed 16 bit to unsigned 8 bit + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec16s compress (Vec8i const low, Vec8i const high) { + __m256i mask = _mm256_set1_epi32(0x0000FFFF); // mask for low words + __m256i lowm = _mm256_and_si256(low,mask); // words of low + __m256i highm = _mm256_and_si256(high,mask); // words of high + __m256i pk = _mm256_packus_epi32(lowm,highm); // unsigned pack + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Signed with saturation +static inline Vec16s compress_saturated (Vec8i const low, Vec8i const high) { + __m256i pk = _mm256_packs_epi32(low,high); // pack with signed saturation + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec16us compress (Vec8ui const low, Vec8ui const high) { + return Vec16us (compress((Vec8i)low, (Vec8i)high)); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Unsigned, with saturation +static inline Vec16us compress_saturated (Vec8ui const low, Vec8ui const high) { + __m256i maxval = _mm256_set1_epi32(0x0000FFFF); // maximum value + __m256i low1 = _mm256_min_epu32(low,maxval); // upper limit + __m256i high1 = _mm256_min_epu32(high,maxval); // upper limit + __m256i pk = _mm256_packus_epi32(low1,high1); // this instruction saturates from signed 32 bit to unsigned 16 bit + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Signed to unsigned, with saturation +static inline Vec16us compress_saturated_s2u (Vec8i const low, Vec8i const high) { + __m256i pk = _mm256_packus_epi32(low,high); // this instruction saturates from signed 32 bit to unsigned 16 bit + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Overflow wraps around +static inline Vec8i compress (Vec4q const low, Vec4q const high) { + __m256i low2 = _mm256_shuffle_epi32(low,0xD8); // low dwords of low to pos. 
0 and 32 + __m256i high2 = _mm256_shuffle_epi32(high,0xD8); // low dwords of high to pos. 0 and 32 + __m256i pk = _mm256_unpacklo_epi64(low2,high2); // interleave + return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place +} + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Signed, with saturation +static inline Vec8i compress_saturated (Vec4q const a, Vec4q const b) { + Vec4q maxval = constant8ui<0x7FFFFFFF,0,0x7FFFFFFF,0,0x7FFFFFFF,0,0x7FFFFFFF,0>(); + Vec4q minval = constant8ui<0x80000000,0xFFFFFFFF,0x80000000,0xFFFFFFFF,0x80000000,0xFFFFFFFF,0x80000000,0xFFFFFFFF>(); + Vec4q a1 = min(a,maxval); + Vec4q b1 = min(b,maxval); + Vec4q a2 = max(a1,minval); + Vec4q b2 = max(b1,minval); + return compress(a2,b2); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec8ui compress (Vec4uq const low, Vec4uq const high) { + return Vec8ui (compress((Vec4q)low, (Vec4q)high)); +} + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Unsigned, with saturation +static inline Vec8ui compress_saturated (Vec4uq const low, Vec4uq const high) { + __m256i zero = _mm256_setzero_si256(); // 0 + __m256i lowzero = _mm256_cmpeq_epi32(low,zero); // for each dword is zero + __m256i highzero = _mm256_cmpeq_epi32(high,zero); // for each dword is zero + __m256i mone = _mm256_set1_epi32(-1); // FFFFFFFF + __m256i lownz = _mm256_xor_si256(lowzero,mone); // for each dword is nonzero + __m256i highnz = _mm256_xor_si256(highzero,mone); // for each dword is nonzero + __m256i lownz2 = _mm256_srli_epi64(lownz,32); // shift down to low dword + __m256i highnz2 = _mm256_srli_epi64(highnz,32); // shift down to low dword + __m256i lowsatur = _mm256_or_si256(low,lownz2); // low, saturated + __m256i hisatur = _mm256_or_si256(high,highnz2); // high, saturated + return Vec8ui (compress(Vec4q(lowsatur), Vec4q(hisatur))); +} + + +/***************************************************************************** +* +* Integer division operators +* +* Please see the file vectori128.h for explanation. 
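+*
+* Usage sketch: a runtime divisor is converted once into a multiplier/shift
+* pair and then reused for the whole vector, e.g.
+*     Divisor_i d(7);       // precompute reciprocal data for divisor 7
+*     Vec8i     q = a / d;  // divide all eight elements of a by 7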
+* +*****************************************************************************/ + +// vector operator / : divide each element by divisor + +// vector of 8 32-bit signed integers +static inline Vec8i operator / (Vec8i const a, Divisor_i const d) { + __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier + __m256i sgn = _mm256_broadcastq_epi64(d.getsign()); // broadcast sign of d + __m256i t1 = _mm256_mul_epi32(a,m); // 32x32->64 bit signed multiplication of even elements of a + __m256i t2 = _mm256_srli_epi64(t1,32); // high dword of even numbered results + __m256i t3 = _mm256_srli_epi64(a,32); // get odd elements of a into position for multiplication + __m256i t4 = _mm256_mul_epi32(t3,m); // 32x32->64 bit signed multiplication of odd elements + __m256i t7 = _mm256_blend_epi32(t2,t4,0xAA); + __m256i t8 = _mm256_add_epi32(t7,a); // add + __m256i t9 = _mm256_sra_epi32(t8,d.gets1()); // shift right artihmetic + __m256i t10 = _mm256_srai_epi32(a,31); // sign of a + __m256i t11 = _mm256_sub_epi32(t10,sgn); // sign of a - sign of d + __m256i t12 = _mm256_sub_epi32(t9,t11); // + 1 if a < 0, -1 if d < 0 + return _mm256_xor_si256(t12,sgn); // change sign if divisor negative +} + +// vector of 8 32-bit unsigned integers +static inline Vec8ui operator / (Vec8ui const a, Divisor_ui const d) { + __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier + __m256i t1 = _mm256_mul_epu32(a,m); // 32x32->64 bit unsigned multiplication of even elements of a + __m256i t2 = _mm256_srli_epi64(t1,32); // high dword of even numbered results + __m256i t3 = _mm256_srli_epi64(a,32); // get odd elements of a into position for multiplication + __m256i t4 = _mm256_mul_epu32(t3,m); // 32x32->64 bit unsigned multiplication of odd elements + __m256i t7 = _mm256_blend_epi32(t2,t4,0xAA); + __m256i t8 = _mm256_sub_epi32(a,t7); // subtract + __m256i t9 = _mm256_srl_epi32(t8,d.gets1()); // shift right logical + __m256i t10 = _mm256_add_epi32(t7,t9); // add + return _mm256_srl_epi32(t10,d.gets2()); // shift right logical +} + +// vector of 16 16-bit signed integers +static inline Vec16s operator / (Vec16s const a, Divisor_s const d) { + __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier + __m256i sgn = _mm256_broadcastq_epi64(d.getsign()); // broadcast sign of d + __m256i t1 = _mm256_mulhi_epi16(a, m); // multiply high signed words + __m256i t2 = _mm256_add_epi16(t1,a); // + a + __m256i t3 = _mm256_sra_epi16(t2,d.gets1()); // shift right artihmetic + __m256i t4 = _mm256_srai_epi16(a,15); // sign of a + __m256i t5 = _mm256_sub_epi16(t4,sgn); // sign of a - sign of d + __m256i t6 = _mm256_sub_epi16(t3,t5); // + 1 if a < 0, -1 if d < 0 + return _mm256_xor_si256(t6,sgn); // change sign if divisor negative +} + +// vector of 16 16-bit unsigned integers +static inline Vec16us operator / (Vec16us const a, Divisor_us const d) { + __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier + __m256i t1 = _mm256_mulhi_epu16(a, m); // multiply high signed words + __m256i t2 = _mm256_sub_epi16(a,t1); // subtract + __m256i t3 = _mm256_srl_epi16(t2,d.gets1()); // shift right logical + __m256i t4 = _mm256_add_epi16(t1,t3); // add + return _mm256_srl_epi16(t4,d.gets2()); // shift right logical +} + +// vector of 32 8-bit signed integers +static inline Vec32c operator / (Vec32c const a, Divisor_s const d) { +#if INSTRSET >= 10 + // sign-extend even-numbered and odd-numbered elements to 16 bits + Vec16s even = _mm256_srai_epi16(_mm256_slli_epi16(a, 8),8); + Vec16s odd = 
_mm256_srai_epi16(a, 8); + Vec16s evend = even / d; // divide even-numbered elements + Vec16s oddd = odd / d; // divide odd-numbered elements + oddd = _mm256_slli_epi16(oddd, 8); // shift left to put back in place + __m256i res = _mm256_mask_mov_epi8(evend, 0xAAAAAAAA, oddd); // interleave even and odd + return res; +#else + // expand into two Vec16s + Vec16s low = extend_low(a) / d; + Vec16s high = extend_high(a) / d; + return compress(low,high); +#endif +} + + +// vector of 32 8-bit unsigned integers +static inline Vec32uc operator / (Vec32uc const a, Divisor_us const d) { + // zero-extend even-numbered and odd-numbered elements to 16 bits +#if INSTRSET >= 10 + Vec16us even = _mm256_maskz_mov_epi8(__mmask32(0x55555555), a); + Vec16us odd = _mm256_srli_epi16(a, 8); + Vec16us evend = even / d; // divide even-numbered elements + Vec16us oddd = odd / d; // divide odd-numbered elements + oddd = _mm256_slli_epi16(oddd, 8); // shift left to put back in place + __m256i res = _mm256_mask_mov_epi8(evend, 0xAAAAAAAA, oddd); // interleave even and odd + return res; +#else + // expand into two Vec16s + Vec16us low = extend_low(a) / d; + Vec16us high = extend_high(a) / d; + return compress(low,high); +#endif +} + +// vector operator /= : divide +static inline Vec8i & operator /= (Vec8i & a, Divisor_i const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec8ui & operator /= (Vec8ui & a, Divisor_ui const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec16s & operator /= (Vec16s & a, Divisor_s const d) { + a = a / d; + return a; +} + + +// vector operator /= : divide +static inline Vec16us & operator /= (Vec16us & a, Divisor_us const d) { + a = a / d; + return a; + +} + +// vector operator /= : divide +static inline Vec32c & operator /= (Vec32c & a, Divisor_s const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec32uc & operator /= (Vec32uc & a, Divisor_us const d) { + a = a / d; + return a; +} + + +/***************************************************************************** +* +* Integer division 2: divisor is a compile-time constant +* +*****************************************************************************/ + +// Divide Vec8i by compile-time constant +template +static inline Vec8i divide_by_i(Vec8i const x) { + static_assert(d != 0, "Integer division by zero"); + if constexpr (d == 1) return x; + if constexpr (d == -1) return -x; + if constexpr (uint32_t(d) == 0x80000000u) return Vec8i(x == Vec8i(0x80000000)) & 1; // prevent overflow when changing sign + constexpr uint32_t d1 = d > 0 ? uint32_t(d) : uint32_t(-d);// compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits) + if constexpr ((d1 & (d1-1)) == 0) { + // d1 is a power of 2. use shift + constexpr int k = bit_scan_reverse_const(d1); + __m256i sign; + if constexpr (k > 1) sign = _mm256_srai_epi32(x, k-1); else sign = x; // k copies of sign bit + __m256i bias = _mm256_srli_epi32(sign, 32-k); // bias = x >= 0 ? 0 : k-1 + __m256i xpbias = _mm256_add_epi32 (x, bias); // x + bias + __m256i q = _mm256_srai_epi32(xpbias, k); // (x + bias) >> k + if (d > 0) return q; // d > 0: return q + return _mm256_sub_epi32(_mm256_setzero_si256(), q);// d < 0: return -q + } + // general case + constexpr int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1);// ceil(log2(d1)) - 1. 
(d1 < 2 handled by power of 2 case) + constexpr int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32));// multiplier + const Divisor_i div(mult, sh, d < 0 ? -1 : 0); + return x / div; +} + +// define Vec8i a / const_int(d) +template +static inline Vec8i operator / (Vec8i const a, Const_int_t) { + return divide_by_i(a); +} + +// define Vec8i a / const_uint(d) +template +static inline Vec8i operator / (Vec8i const a, Const_uint_t) { + static_assert(d < 0x80000000u, "Dividing signed integer by overflowing unsigned"); + return divide_by_i(a); // signed divide +} + +// vector operator /= : divide +template +static inline Vec8i & operator /= (Vec8i & a, Const_int_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec8i & operator /= (Vec8i & a, Const_uint_t b) { + a = a / b; + return a; +} + + +// Divide Vec8ui by compile-time constant +template +static inline Vec8ui divide_by_ui(Vec8ui const x) { + static_assert(d != 0, "Integer division by zero"); + if constexpr (d == 1) return x; // divide by 1 + constexpr int b = bit_scan_reverse_const(d); // floor(log2(d)) + if constexpr ((uint32_t(d) & (uint32_t(d)-1)) == 0) { + // d is a power of 2. use shift + return _mm256_srli_epi32(x, b); // x >> b + } + // general case (d > 2) + constexpr uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d); // multiplier = 2^(32+b) / d + constexpr uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d + constexpr bool round_down = (2*rem < d); // check if fraction is less than 0.5 + constexpr uint32_t mult1 = round_down ? mult : mult + 1; + // do 32*32->64 bit unsigned multiplication and get high part of result +#if INSTRSET >= 10 + const __m256i multv = _mm256_maskz_set1_epi32(0x55, mult1);// zero-extend mult and broadcast +#else + const __m256i multv = Vec4uq(uint64_t(mult1)); // zero-extend mult and broadcast +#endif + __m256i t1 = _mm256_mul_epu32(x,multv); // 32x32->64 bit unsigned multiplication of x[0] and x[2] + if constexpr (round_down) { + t1 = _mm256_add_epi64(t1,multv); // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow + } + __m256i t2 = _mm256_srli_epi64(t1,32); // high dword of result 0 and 2 + __m256i t3 = _mm256_srli_epi64(x,32); // get x[1] and x[3] into position for multiplication + __m256i t4 = _mm256_mul_epu32(t3,multv); // 32x32->64 bit unsigned multiplication of x[1] and x[3] + if constexpr (round_down) { + t4 = _mm256_add_epi64(t4,multv); // compensate for rounding error. 
(x+1)*m replaced by x*m+m to avoid overflow + } + __m256i t7 = _mm256_blend_epi32(t2,t4,0xAA); + Vec8ui q = _mm256_srli_epi32(t7, b); // shift right by b + return q; // no overflow possible +} + +// define Vec8ui a / const_uint(d) +template +static inline Vec8ui operator / (Vec8ui const a, Const_uint_t) { + return divide_by_ui(a); +} + +// define Vec8ui a / const_int(d) +template +static inline Vec8ui operator / (Vec8ui const a, Const_int_t) { + static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous"); + return divide_by_ui(a); // unsigned divide +} + +// vector operator /= : divide +template +static inline Vec8ui & operator /= (Vec8ui & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec8ui & operator /= (Vec8ui & a, Const_int_t b) { + a = a / b; + return a; +} + + +// Divide Vec16s by compile-time constant +template +static inline Vec16s divide_by_i(Vec16s const x) { + constexpr int16_t d0 = int16_t(d); // truncate d to 16 bits + static_assert(d0 != 0, "Integer division by zero"); + if constexpr (d0 == 1) return x; // divide by 1 + if constexpr (d0 == -1) return -x; // divide by -1 + if constexpr (uint16_t(d0) == 0x8000u) return Vec16s(x == Vec16s(0x8000)) & 1;// prevent overflow when changing sign + constexpr uint16_t d1 = d0 > 0 ? d0 : -d0; // compile-time abs(d0) + if constexpr ((d1 & (d1-1)) == 0) { + // d is a power of 2. use shift + constexpr int k = bit_scan_reverse_const(uint32_t(d1)); + __m256i sign; + if constexpr (k > 1) sign = _mm256_srai_epi16(x, k-1); else sign = x;// k copies of sign bit + __m256i bias = _mm256_srli_epi16(sign, 16-k); // bias = x >= 0 ? 0 : k-1 + __m256i xpbias = _mm256_add_epi16 (x, bias); // x + bias + __m256i q = _mm256_srai_epi16(xpbias, k); // (x + bias) >> k + if constexpr (d0 > 0) return q; // d0 > 0: return q + return _mm256_sub_epi16(_mm256_setzero_si256(), q);// d0 < 0: return -q + } + // general case + constexpr int L = bit_scan_reverse_const(uint16_t(d1-1)) + 1;// ceil(log2(d)). (d < 2 handled above) + constexpr int16_t mult = int16_t(1 + (1u << (15+L)) / uint32_t(d1) - 0x10000);// multiplier + constexpr int shift1 = L - 1; + const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1); + return x / div; +} + +// define Vec16s a / const_int(d) +template +static inline Vec16s operator / (Vec16s const a, Const_int_t) { + return divide_by_i(a); +} + +// define Vec16s a / const_uint(d) +template +static inline Vec16s operator / (Vec16s const a, Const_uint_t) { + static_assert(d < 0x8000u, "Dividing signed integer by overflowing unsigned"); + return divide_by_i(a); // signed divide +} + +// vector operator /= : divide +template +static inline Vec16s & operator /= (Vec16s & a, Const_int_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec16s & operator /= (Vec16s & a, Const_uint_t b) { + a = a / b; + return a; +} + + +// Divide Vec16us by compile-time constant +template +static inline Vec16us divide_by_ui(Vec16us const x) { + constexpr uint16_t d0 = uint16_t(d); // truncate d to 16 bits + static_assert(d0 != 0, "Integer division by zero"); + if constexpr (d0 == 1) return x; // divide by 1 + constexpr int b = bit_scan_reverse_const((uint32_t)d0);// floor(log2(d)) + if constexpr ((d0 & (d0-1)) == 0) { + // d is a power of 2. 
use shift + return _mm256_srli_epi16(x, b); // x >> b + } + // general case (d > 2) + constexpr uint16_t mult = uint16_t((uint32_t(1) << (b+16)) / d0);// multiplier = 2^(32+b) / d + constexpr uint32_t rem = (uint32_t(1) << (b+16)) - uint32_t(d0)*mult;// remainder 2^(32+b) % d + constexpr bool round_down = (2*rem < d0); // check if fraction is less than 0.5 + Vec16us x1 = x; + if constexpr (round_down) { + x1 = x1 + 1; // round down mult and compensate by adding 1 to x + } + constexpr uint16_t mult1 = round_down ? mult : mult + 1; + const __m256i multv = _mm256_set1_epi16((int16_t)mult1);// broadcast mult + __m256i xm = _mm256_mulhi_epu16(x1, multv); // high part of 16x16->32 bit unsigned multiplication + Vec16us q = _mm256_srli_epi16(xm, b); // shift right by b + if constexpr (round_down) { + Vec16sb overfl = (x1 == Vec16us(_mm256_setzero_si256())); // check for overflow of x+1 + return select(overfl, Vec16us(uint16_t(mult1 >> (uint16_t)b)), q); // deal with overflow (rarely needed) + } + else { + return q; // no overflow possible + } +} + +// define Vec16us a / const_uint(d) +template +static inline Vec16us operator / (Vec16us const a, Const_uint_t) { + return divide_by_ui(a); +} + +// define Vec16us a / const_int(d) +template +static inline Vec16us operator / (Vec16us const a, Const_int_t) { + static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous"); + return divide_by_ui(a); // unsigned divide +} + +// vector operator /= : divide +template +static inline Vec16us & operator /= (Vec16us & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec16us & operator /= (Vec16us & a, Const_int_t b) { + a = a / b; + return a; +} + +// define Vec32c a / const_int(d) +template +static inline Vec32c operator / (Vec32c const a, Const_int_t) { + // expand into two Vec16s + Vec16s low = extend_low(a) / Const_int_t(); + Vec16s high = extend_high(a) / Const_int_t(); + return compress(low,high); +} + +// define Vec32c a / const_uint(d) +template +static inline Vec32c operator / (Vec32c const a, Const_uint_t) { + static_assert(uint8_t(d) < 0x80u, "Dividing signed integer by overflowing unsigned"); + return a / Const_int_t(); // signed divide +} + +// vector operator /= : divide +template +static inline Vec32c & operator /= (Vec32c & a, Const_int_t b) { + a = a / b; + return a; +} +// vector operator /= : divide +template +static inline Vec32c & operator /= (Vec32c & a, Const_uint_t b) { + a = a / b; + return a; +} + +// define Vec32uc a / const_uint(d) +template +static inline Vec32uc operator / (Vec32uc const a, Const_uint_t) { + // expand into two Vec16us + Vec16us low = extend_low(a) / Const_uint_t(); + Vec16us high = extend_high(a) / Const_uint_t(); + return compress(low,high); +} + +// define Vec32uc a / const_int(d) +template +static inline Vec32uc operator / (Vec32uc const a, Const_int_t) { + static_assert(int8_t(d) >= 0, "Dividing unsigned integer by negative is ambiguous"); + return a / Const_uint_t(); // unsigned divide +} + +// vector operator /= : divide +template +static inline Vec32uc & operator /= (Vec32uc & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec32uc & operator /= (Vec32uc & a, Const_int_t b) { + a = a / b; + return a; +} + + +/***************************************************************************** +* +* Boolean <-> bitfield conversion functions +* +*****************************************************************************/ + +#if INSTRSET 
>= 10 // compact boolean vectors, other sizes + +// to_bits: convert boolean vector to integer bitfield +static inline uint32_t to_bits(Vec32b const x) { + return __mmask32(x); +} + +#else + +// to_bits: convert boolean vector to integer bitfield +static inline uint32_t to_bits(Vec32cb const x) { + return (uint32_t)_mm256_movemask_epi8(x); +} + +static inline uint16_t to_bits(Vec16sb const x) { + __m128i a = _mm_packs_epi16(x.get_low(), x.get_high()); // 16-bit words to bytes + return (uint16_t)_mm_movemask_epi8(a); +} + +static inline uint8_t to_bits(Vec8ib const x) { + __m128i a = _mm_packs_epi32(x.get_low(), x.get_high()); // 32-bit dwords to 16-bit words + __m128i b = _mm_packs_epi16(a, a); // 16-bit words to bytes + return (uint8_t)_mm_movemask_epi8(b); +} + +static inline uint8_t to_bits(Vec4qb const x) { + uint32_t a = (uint32_t)_mm256_movemask_epi8(x); + return ((a & 1) | ((a >> 7) & 2)) | (((a >> 14) & 4) | ((a >> 21) & 8)); +} + +#endif + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORI256_H diff --git a/DFTTest/vectorclass/vectori256e.h b/DFTTest/VCL2/vectori256e.h similarity index 55% rename from DFTTest/vectorclass/vectori256e.h rename to DFTTest/VCL2/vectori256e.h index 4bdfb91..df47594 100644 --- a/DFTTest/vectorclass/vectori256e.h +++ b/DFTTest/VCL2/vectori256e.h @@ -1,15 +1,17 @@ /**************************** vectori256e.h ******************************* * Author: Agner Fog * Date created: 2012-05-30 -* Last modified: 2017-02-19 -* Version: 1.27 -* Project: vector classes +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library * Description: * Header file defining 256-bit integer point vector classes as interface * to intrinsic functions. Emulated for processors without AVX2 instruction set. * +* Instructions: see vcl_manual.pdf +* * The following vector classes are defined here: -* Vec256b Vector of 256 1-bit unsigned integers or Booleans +* Vec256b Vector of 256 bits. Used internally as base class * Vec32c Vector of 32 8-bit signed integers * Vec32uc Vector of 32 8-bit unsigned integers * Vec32cb Vector of 32 Booleans for use with Vec32c and Vec32uc @@ -23,83 +25,56 @@ * Vec4uq Vector of 4 64-bit unsigned integers * Vec4qb Vector of 4 Booleans for use with Vec4q and Vec4uq * -* For detailed instructions, see VectorClass.pdf +* Each vector object is represented internally in the CPU as two 128-bit registers. +* This header file defines operators and functions for these vectors. * -* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. 
*****************************************************************************/ -// check combination of header files -#if defined (VECTORI256_H) -#if VECTORI256_H != 1 -#error Two different versions of vectori256.h included +#ifndef VECTORI256E_H +#define VECTORI256E_H 1 + +#ifndef VECTORCLASS_H +#include "vectorclass.h" #endif -#else -#define VECTORI256_H 1 -#ifdef VECTORF256_H -#error Please put header file vectori256.h or vectori256e.h before vectorf256e.h +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed #endif +// check combination of header files +#if defined (VECTORI256_H) +#error Two different versions of vectori256.h included +#endif -#include "vectori128.h" #ifdef VCL_NAMESPACE namespace VCL_NAMESPACE { #endif -/***************************************************************************** -* -* base class Vec256ie -* -*****************************************************************************/ -// base class to replace Vec256ie when AVX2 is not supported -class Vec256ie { -protected: - __m128i y0; // low half - __m128i y1; // high half -public: - Vec256ie(void) {}; // default constructor - Vec256ie(__m128i x0, __m128i x1) { // constructor to build from two __m128i - y0 = x0; y1 = x1; - } - __m128i get_low() const { // get low half - return y0; - } - __m128i get_high() const { // get high half - return y1; - } -}; - /***************************************************************************** * -* Vector of 256 1-bit unsigned integers or Booleans +* Vector of 256 bits. used as base class * *****************************************************************************/ -class Vec256b : public Vec256ie { +class Vec256b { +protected: + __m128i y0; // low half + __m128i y1; // high half public: // Default constructor: Vec256b() { } - // Constructor to broadcast the same value into all elements - // Removed because of undesired implicit conversions - //Vec256b(int i) { - // y1 = y0 = _mm_set1_epi32(-(i & 1));} - + Vec256b(__m128i x0, __m128i x1) { // constructor to build from two __m128i + y0 = x0; y1 = x1; + } // Constructor to build from two Vec128b: - Vec256b(Vec128b const & a0, Vec128b const & a1) { + Vec256b(Vec128b const a0, Vec128b const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec256b(Vec256ie const & x) { - y0 = x.get_low(); y1 = x.get_high(); - } - // Assignment operator to convert from type Vec256ie - Vec256b & operator = (Vec256ie const & x) { - y0 = x.get_low(); y1 = x.get_high(); - return *this; - } // Member function to load from array (unaligned) Vec256b & load(void const * p) { y0 = _mm_loadu_si128((__m128i const*)p); @@ -126,37 +101,14 @@ class Vec256b : public Vec256ie { _mm_store_si128((__m128i*)p, y0); _mm_store_si128((__m128i*)p + 1, y1); } - // Member function to store into array using a non-temporal memory hint, aligned by 32 - void stream(void * p) const { + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. + // Note: Will generate runtime error if p is not aligned by 32 + void store_nt(void * p) const { _mm_stream_si128((__m128i*)p, y0); _mm_stream_si128((__m128i*)p + 1, y1); } - // Member function to change a single bit - // Note: This function is inefficient. 
Use load function if changing more than one bit - Vec256b const & set_bit(uint32_t index, int value) { - if (index < 128) { - y0 = Vec128b(y0).set_bit(index, value); - } - else { - y1 = Vec128b(y1).set_bit(index-128, value); - } - return *this; - } - // Member function to get a single bit - // Note: This function is inefficient. Use store function if reading more than one bit - int get_bit(uint32_t index) const { - if (index < 128) { - return Vec128b(y0).get_bit(index); - } - else { - return Vec128b(y1).get_bit(index-128); - } - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { - return get_bit(index) != 0; - } // Member functions to split into two Vec128b: Vec128b get_low() const { return y0; @@ -164,122 +116,88 @@ class Vec256b : public Vec256ie { Vec128b get_high() const { return y1; } - static int size () { + static constexpr int size() { return 256; } + static constexpr int elementtype() { + return 1; + } }; // Define operators for this class // vector operator & : bitwise and -static inline Vec256b operator & (Vec256b const & a, Vec256b const & b) { +static inline Vec256b operator & (Vec256b const a, Vec256b const b) { return Vec256b(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec256b operator && (Vec256b const & a, Vec256b const & b) { +static inline Vec256b operator && (Vec256b const a, Vec256b const b) { return a & b; } // vector operator | : bitwise or -static inline Vec256b operator | (Vec256b const & a, Vec256b const & b) { +static inline Vec256b operator | (Vec256b const a, Vec256b const b) { return Vec256b(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec256b operator || (Vec256b const & a, Vec256b const & b) { +static inline Vec256b operator || (Vec256b const a, Vec256b const b) { return a | b; } // vector operator ^ : bitwise xor -static inline Vec256b operator ^ (Vec256b const & a, Vec256b const & b) { +static inline Vec256b operator ^ (Vec256b const a, Vec256b const b) { return Vec256b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ~ : bitwise not -static inline Vec256b operator ~ (Vec256b const & a) { +static inline Vec256b operator ~ (Vec256b const a) { return Vec256b(~a.get_low(), ~a.get_high()); } // vector operator &= : bitwise and -static inline Vec256b & operator &= (Vec256b & a, Vec256b const & b) { +static inline Vec256b & operator &= (Vec256b & a, Vec256b const b) { a = a & b; return a; } // vector operator |= : bitwise or -static inline Vec256b & operator |= (Vec256b & a, Vec256b const & b) { +static inline Vec256b & operator |= (Vec256b & a, Vec256b const b) { a = a | b; return a; } // vector operator ^= : bitwise xor -static inline Vec256b & operator ^= (Vec256b & a, Vec256b const & b) { +static inline Vec256b & operator ^= (Vec256b & a, Vec256b const b) { a = a ^ b; return a; } -// Define functions for this class - -static inline Vec256b zero_256b() { - Vec128b zero = _mm_setzero_si128(); - return Vec256b(zero, zero); -} - -// function andnot: a & ~ b -static inline Vec256b andnot (Vec256b const & a, Vec256b const & b) { - return Vec256b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); -} - - /***************************************************************************** * -* Generate compile-time constant vector +* Functions for this class * 
*****************************************************************************/ -// Generate a constant vector of 8 integers stored in memory. -// Can be converted to any integer vector type -template -static inline Vec256ie constant8i() { - static const union { - int32_t i[8]; - __m128i y[2]; - } u = {{i0,i1,i2,i3,i4,i5,i6,i7}}; - return Vec256ie(u.y[0], u.y[1]); -} -template -static inline Vec256ie constant8ui() { - return constant8i(); +// function andnot: a & ~ b +static inline Vec256b andnot (Vec256b const a, Vec256b const b) { + return Vec256b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); } - -/***************************************************************************** -* -* selectb function -* -*****************************************************************************/ // Select between two sources, byte by byte. Used in various functions and operators // Corresponds to this pseudocode: // for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed. -// Only bit 7 in each byte of s is checked, -static inline Vec256ie selectb (Vec256ie const & s, Vec256ie const & a, Vec256ie const & b) { - return Vec256ie(selectb(s.get_low(), a.get_low(), b.get_low()), - selectb(s.get_high(), a.get_high(), b.get_high())); +// Only bit 7 in each byte of s is checked, +static inline Vec256b selectb (Vec256b const s, Vec256b const a, Vec256b const b) { + return Vec256b(selectb(s.get_low(), a.get_low(), b.get_low()), + selectb(s.get_high(), a.get_high(), b.get_high())); } - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - // horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec256b const & a) { +static inline bool horizontal_and (Vec256b const a) { return horizontal_and(a.get_low() & a.get_high()); } // horizontal_or. 
Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec256b const & a) { +static inline bool horizontal_or (Vec256b const a) { return horizontal_or(a.get_low() | a.get_high()); } @@ -301,23 +219,23 @@ class Vec32c : public Vec256b { } // Constructor to build from all elements: Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, - int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, + int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23, int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31) { y0 = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31); } // Constructor to build from two Vec16c: - Vec32c(Vec16c const & a0, Vec16c const & a1) { + Vec32c(Vec16c const a0, Vec16c const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec32c(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec32c(Vec256b const & x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec32c & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec32c & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; @@ -377,9 +295,8 @@ class Vec32c : public Vec256b { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec32c const & insert(uint32_t index, int8_t value) { - if (index < 16) { + Vec32c const insert(int index, int8_t value) { + if ((uint32_t)index < 16) { y0 = Vec16c(y0).insert(index, value); } else { @@ -388,8 +305,8 @@ class Vec32c : public Vec256b { return *this; } // Member function extract a single element from vector - int8_t extract(uint32_t index) const { - if (index < 16) { + int8_t extract(int index) const { + if ((uint32_t)index < 16) { return Vec16c(y0).extract(index); } else { @@ -398,7 +315,7 @@ class Vec32c : public Vec256b { } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
- int8_t operator [] (uint32_t index) const { + int8_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec16c: @@ -408,9 +325,12 @@ class Vec32c : public Vec256b { Vec16c get_high() const { return y1; } - static int size () { + static constexpr int size() { return 32; } + static constexpr int elementtype() { + return 4; + } }; @@ -423,24 +343,15 @@ class Vec32c : public Vec256b { class Vec32cb : public Vec32c { public: // Default constructor: - Vec32cb(){} - // Constructor to build from all elements: - Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, - bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15, - bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23, - bool x24, bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31) : - Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), - -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15), - -int8_t(x16), -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23), - -int8_t(x24), -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31)) - {} - // Constructor to convert from type Vec256ie - Vec32cb(Vec256ie const & x) { + Vec32cb() {} + + // Constructor to convert from type Vec256b + Vec32cb(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec32cb & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec32cb & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; @@ -453,10 +364,9 @@ class Vec32cb : public Vec32c { *this = Vec32cb(b); return *this; } -private: // Prevent constructing from int, etc. - Vec32cb(int b); - Vec32cb & operator = (int x); -public: + // Constructor to build from two Vec16cb: + Vec32cb(Vec16cb const a0, Vec16cb const a1) : Vec32c(Vec16c(a0), Vec16c(a1)) { + } // Member functions to split into two Vec16c: Vec16cb get_low() const { return y0; @@ -467,16 +377,28 @@ class Vec32cb : public Vec32c { Vec32cb & insert (int index, bool a) { Vec32c::insert(index, -(int)a); return *this; - } + } // Member function extract a single element from vector - bool extract(uint32_t index) const { + bool extract(int index) const { return Vec32c::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { + bool operator [] (int index) const { return extract(index); } + // Member function to change a bitfield to a boolean vector + Vec32cb & load_bits(uint32_t a) { + y0 = Vec16cb().load_bits(uint16_t(a)); + y1 = Vec16cb().load_bits(uint16_t(a>>16)); + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
+ Vec32cb(int b) = delete; + Vec32cb & operator = (int x) = delete; }; @@ -487,53 +409,63 @@ class Vec32cb : public Vec32c { *****************************************************************************/ // vector operator & : bitwise and -static inline Vec32cb operator & (Vec32cb const & a, Vec32cb const & b) { +static inline Vec32cb operator & (Vec32cb const a, Vec32cb const b) { return Vec32cb(Vec256b(a) & Vec256b(b)); } -static inline Vec32cb operator && (Vec32cb const & a, Vec32cb const & b) { +static inline Vec32cb operator && (Vec32cb const a, Vec32cb const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec32cb & operator &= (Vec32cb & a, Vec32cb const & b) { +static inline Vec32cb & operator &= (Vec32cb & a, Vec32cb const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec32cb operator | (Vec32cb const & a, Vec32cb const & b) { +static inline Vec32cb operator | (Vec32cb const a, Vec32cb const b) { return Vec32cb(Vec256b(a) | Vec256b(b)); } -static inline Vec32cb operator || (Vec32cb const & a, Vec32cb const & b) { +static inline Vec32cb operator || (Vec32cb const a, Vec32cb const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec32cb & operator |= (Vec32cb & a, Vec32cb const & b) { +static inline Vec32cb & operator |= (Vec32cb & a, Vec32cb const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec32cb operator ^ (Vec32cb const & a, Vec32cb const & b) { +static inline Vec32cb operator ^ (Vec32cb const a, Vec32cb const b) { return Vec32cb(Vec256b(a) ^ Vec256b(b)); } // vector operator ^= : bitwise xor -static inline Vec32cb & operator ^= (Vec32cb & a, Vec32cb const & b) { +static inline Vec32cb & operator ^= (Vec32cb & a, Vec32cb const b) { a = a ^ b; return a; } +// vector operator == : xnor +static inline Vec32cb operator == (Vec32cb const a, Vec32cb const b) { + return Vec32cb(Vec256b(a) ^ Vec256b(~b)); +} + +// vector operator != : xor +static inline Vec32cb operator != (Vec32cb const a, Vec32cb const b) { + return Vec32cb(a ^ b); +} + // vector operator ~ : bitwise not -static inline Vec32cb operator ~ (Vec32cb const & a) { +static inline Vec32cb operator ~ (Vec32cb const a) { return Vec32cb( ~ Vec256b(a)); } // vector operator ! : element not -static inline Vec32cb operator ! (Vec32cb const & a) { +static inline Vec32cb operator ! 
(Vec32cb const a) { return ~ a; } // vector function andnot -static inline Vec32cb andnot (Vec32cb const & a, Vec32cb const & b) { +static inline Vec32cb andnot (Vec32cb const a, Vec32cb const b) { return Vec32cb(andnot(Vec256b(a), Vec256b(b))); } @@ -545,12 +477,12 @@ static inline Vec32cb andnot (Vec32cb const & a, Vec32cb const & b) { *****************************************************************************/ // vector operator + : add element by element -static inline Vec32c operator + (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator + (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator += : add -static inline Vec32c & operator += (Vec32c & a, Vec32c const & b) { +static inline Vec32c & operator += (Vec32c & a, Vec32c const b) { a = a + b; return a; } @@ -569,17 +501,17 @@ static inline Vec32c & operator ++ (Vec32c & a) { } // vector operator - : subtract element by element -static inline Vec32c operator - (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator - (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : unary minus -static inline Vec32c operator - (Vec32c const & a) { +static inline Vec32c operator - (Vec32c const a) { return Vec32c(-a.get_low(), -a.get_high()); } // vector operator -= : add -static inline Vec32c & operator -= (Vec32c & a, Vec32c const & b) { +static inline Vec32c & operator -= (Vec32c & a, Vec32c const b) { a = a - b; return a; } @@ -598,29 +530,29 @@ static inline Vec32c & operator -- (Vec32c & a) { } // vector operator * : multiply element by element -static inline Vec32c operator * (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator * (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator *= : multiply -static inline Vec32c & operator *= (Vec32c & a, Vec32c const & b) { +static inline Vec32c & operator *= (Vec32c & a, Vec32c const b) { a = a * b; return a; } // vector of 32 8-bit signed integers -static inline Vec32c operator / (Vec32c const & a, Divisor_s const & d) { +static inline Vec32c operator / (Vec32c const a, Divisor_s const d) { return Vec32c(a.get_low() / d, a.get_high() / d); } // vector operator /= : divide -static inline Vec32c & operator /= (Vec32c & a, Divisor_s const & d) { +static inline Vec32c & operator /= (Vec32c & a, Divisor_s const d) { a = a / d; return a; } // vector operator << : shift left all elements -static inline Vec32c operator << (Vec32c const & a, int b) { +static inline Vec32c operator << (Vec32c const a, int b) { return Vec32c(a.get_low() << b, a.get_high() << b); } @@ -631,7 +563,7 @@ static inline Vec32c & operator <<= (Vec32c & a, int b) { } // vector operator >> : shift right arithmetic all elements -static inline Vec32c operator >> (Vec32c const & a, int b) { +static inline Vec32c operator >> (Vec32c const a, int b) { return Vec32c(a.get_low() >> b, a.get_high() >> b); } @@ -642,141 +574,149 @@ static inline Vec32c & operator >>= (Vec32c & a, int b) { } // vector operator == : returns true for elements for which a == b -static inline Vec32cb operator == (Vec32c const & a, Vec32c const & b) { +static inline Vec32cb operator == (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for which a != b -static inline Vec32cb operator != 
(Vec32c const & a, Vec32c const & b) { +static inline Vec32cb operator != (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } // vector operator > : returns true for elements for which a > b (signed) -static inline Vec32cb operator > (Vec32c const & a, Vec32c const & b) { +static inline Vec32cb operator > (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator < : returns true for elements for which a < b (signed) -static inline Vec32cb operator < (Vec32c const & a, Vec32c const & b) { +static inline Vec32cb operator < (Vec32c const a, Vec32c const b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec32cb operator >= (Vec32c const & a, Vec32c const & b) { +static inline Vec32cb operator >= (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec32cb operator <= (Vec32c const & a, Vec32c const & b) { +static inline Vec32cb operator <= (Vec32c const a, Vec32c const b) { return b >= a; } // vector operator & : bitwise and -static inline Vec32c operator & (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator & (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec32c operator && (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator && (Vec32c const a, Vec32c const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec32c & operator &= (Vec32c & a, Vec32c const & b) { +static inline Vec32c & operator &= (Vec32c & a, Vec32c const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec32c operator | (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator | (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec32c operator || (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator || (Vec32c const a, Vec32c const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec32c & operator |= (Vec32c & a, Vec32c const & b) { +static inline Vec32c & operator |= (Vec32c & a, Vec32c const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec32c operator ^ (Vec32c const & a, Vec32c const & b) { +static inline Vec32c operator ^ (Vec32c const a, Vec32c const b) { return Vec32c(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor -static inline Vec32c & operator ^= (Vec32c & a, Vec32c const & b) { +static inline Vec32c & operator ^= (Vec32c & a, Vec32c const b) { a = a ^ b; return a; } // vector operator ~ : bitwise not -static inline Vec32c operator ~ (Vec32c const & a) { +static inline Vec32c operator ~ (Vec32c const a) { return Vec32c(~a.get_low(), ~a.get_high()); } // vector operator ! : logical not, returns true for elements == 0 -static inline Vec32cb operator ! (Vec32c const & a) { +static inline Vec32cb operator ! (Vec32c const a) { return Vec32c(!a.get_low(), !a.get_high()); } // Functions for this class -// Select between two operands. Corresponds to this pseudocode: +// Select between two operands using broad boolean vectors. Corresponds to this pseudocode: // for (int i = 0; i < 16; i++) result[i] = s[i] ? 
a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -static inline Vec32c select (Vec32cb const & s, Vec32c const & a, Vec32c const & b) { +static inline Vec32c select (Vec32cb const s, Vec32c const a, Vec32c const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec32c if_add (Vec32cb const & f, Vec32c const & a, Vec32c const & b) { +static inline Vec32c if_add (Vec32cb const f, Vec32c const a, Vec32c const b) { return a + (Vec32c(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint32_t horizontal_add (Vec32c const & a) { - return horizontal_add(a.get_low() + a.get_high()); +// Conditional subtract +static inline Vec32c if_sub (Vec32cb const f, Vec32c const a, Vec32c const b) { + return a - (Vec32c(f) & b); +} + +// Conditional multiply +static inline Vec32c if_mul (Vec32cb const f, Vec32c const a, Vec32c const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint8_t horizontal_add (Vec32c const a) { + return (uint8_t)horizontal_add(a.get_low() + a.get_high()); } // Horizontal add extended: Calculates the sum of all vector elements. // Each element is sign-extended before addition to avoid overflow -static inline int32_t horizontal_add_x (Vec32c const & a) { +static inline int32_t horizontal_add_x (Vec32c const a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); } - // function add_saturated: add element by element, signed with saturation -static inline Vec32c add_saturated(Vec32c const & a, Vec32c const & b) { +static inline Vec32c add_saturated(Vec32c const a, Vec32c const b) { return Vec32c(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); } // function sub_saturated: subtract element by element, signed with saturation -static inline Vec32c sub_saturated(Vec32c const & a, Vec32c const & b) { +static inline Vec32c sub_saturated(Vec32c const a, Vec32c const b) { return Vec32c(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); } // function max: a > b ? a : b -static inline Vec32c max(Vec32c const & a, Vec32c const & b) { +static inline Vec32c max(Vec32c const a, Vec32c const b) { return Vec32c(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec32c min(Vec32c const & a, Vec32c const & b) { +static inline Vec32c min(Vec32c const a, Vec32c const b) { return Vec32c(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } // function abs: a >= 0 ? 
a : -a -static inline Vec32c abs(Vec32c const & a) { +static inline Vec32c abs(Vec32c const a) { return Vec32c(abs(a.get_low()), abs(a.get_high())); } // function abs_saturated: same as abs, saturate if overflow -static inline Vec32c abs_saturated(Vec32c const & a) { +static inline Vec32c abs_saturated(Vec32c const a) { return Vec32c(abs_saturated(a.get_low()), abs_saturated(a.get_high())); } // function rotate_left all elements // Use negative count to rotate right -static inline Vec32c rotate_left(Vec32c const & a, int b) { +static inline Vec32c rotate_left(Vec32c const a, int b) { return Vec32c(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b)); } @@ -798,22 +738,22 @@ class Vec32uc : public Vec32c { } // Constructor to build from all elements: Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, - uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, + uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23, uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31) { - y0 = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); - y1 = _mm_setr_epi8(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31); + y0 = _mm_setr_epi8((int8_t)i0, (int8_t)i1, (int8_t)i2, (int8_t)i3, (int8_t)i4, (int8_t)i5, (int8_t)i6, (int8_t)i7, (int8_t)i8, (int8_t)i9, (int8_t)i10, (int8_t)i11, (int8_t)i12, (int8_t)i13, (int8_t)i14, (int8_t)i15); + y1 = _mm_setr_epi8((int8_t)i16, (int8_t)i17, (int8_t)i18, (int8_t)i19, (int8_t)i20, (int8_t)i21, (int8_t)i22, (int8_t)i23, (int8_t)i24, (int8_t)i25, (int8_t)i26, (int8_t)i27, (int8_t)i28, (int8_t)i29, (int8_t)i30, (int8_t)i31); } // Constructor to build from two Vec16uc: - Vec32uc(Vec16uc const & a0, Vec16uc const & a1) { + Vec32uc(Vec16uc const a0, Vec16uc const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec32uc(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec32uc(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec32uc & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec32uc & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -830,18 +770,17 @@ class Vec32uc : public Vec32c { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec32uc const & insert(uint32_t index, uint8_t value) { - Vec32c::insert(index, value); + Vec32uc const insert(int index, uint8_t value) { + Vec32c::insert(index, (int8_t)value); return *this; } // Member function extract a single element from vector - uint8_t extract(uint32_t index) const { - return Vec32c::extract(index); + uint8_t extract(int index) const { + return (uint8_t)Vec32c::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
- uint8_t operator [] (uint32_t index) const { + uint8_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec16uc: @@ -851,53 +790,56 @@ class Vec32uc : public Vec32c { Vec16uc get_high() const { return y1; } + static constexpr int elementtype() { + return 5; + } }; // Define operators for this class // vector operator + : add -static inline Vec32uc operator + (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +static inline Vec32uc operator + (Vec32uc const a, Vec32uc const b) { + return Vec32uc(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator - : subtract -static inline Vec32uc operator - (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +static inline Vec32uc operator - (Vec32uc const a, Vec32uc const b) { + return Vec32uc(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator * : multiply -static inline Vec32uc operator * (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +static inline Vec32uc operator * (Vec32uc const a, Vec32uc const b) { + return Vec32uc(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator / : divide -static inline Vec32uc operator / (Vec32uc const & a, Divisor_us const & d) { +static inline Vec32uc operator / (Vec32uc const a, Divisor_us const d) { return Vec32uc(a.get_low() / d, a.get_high() / d); } // vector operator /= : divide -static inline Vec32uc & operator /= (Vec32uc & a, Divisor_us const & d) { +static inline Vec32uc & operator /= (Vec32uc & a, Divisor_us const d) { a = a / d; return a; } // vector operator << : shift left all elements -static inline Vec32uc operator << (Vec32uc const & a, uint32_t b) { - return Vec32uc(a.get_low() << b, a.get_high() << b); +static inline Vec32uc operator << (Vec32uc const a, uint32_t b) { + return Vec32uc(a.get_low() << b, a.get_high() << b); } // vector operator << : shift left all elements -static inline Vec32uc operator << (Vec32uc const & a, int32_t b) { +static inline Vec32uc operator << (Vec32uc const a, int32_t b) { return a << (uint32_t)b; } // vector operator >> : shift right logical all elements -static inline Vec32uc operator >> (Vec32uc const & a, uint32_t b) { - return Vec32uc(a.get_low() >> b, a.get_high() >> b); +static inline Vec32uc operator >> (Vec32uc const a, uint32_t b) { + return Vec32uc(a.get_low() >> b, a.get_high() >> b); } // vector operator >> : shift right logical all elements -static inline Vec32uc operator >> (Vec32uc const & a, int32_t b) { +static inline Vec32uc operator >> (Vec32uc const a, int32_t b) { return a >> (uint32_t)b; } @@ -908,48 +850,48 @@ static inline Vec32uc & operator >>= (Vec32uc & a, uint32_t b) { } // vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec32cb operator >= (Vec32uc const & a, Vec32uc const & b) { - return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +static inline Vec32cb operator >= (Vec32uc const a, Vec32uc const b) { + return Vec32c(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec32cb operator <= (Vec32uc const & a, Vec32uc const & b) { +static inline Vec32cb operator <= (Vec32uc const a, Vec32uc const b) { return b >= a; } // vector operator > : returns true for elements for 
which a > b (unsigned) -static inline Vec32cb operator > (Vec32uc const & a, Vec32uc const & b) { - return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high()); +static inline Vec32cb operator > (Vec32uc const a, Vec32uc const b) { + return Vec32c(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec32cb operator < (Vec32uc const & a, Vec32uc const & b) { +static inline Vec32cb operator < (Vec32uc const a, Vec32uc const b) { return b > a; } // vector operator & : bitwise and -static inline Vec32uc operator & (Vec32uc const & a, Vec32uc const & b) { +static inline Vec32uc operator & (Vec32uc const a, Vec32uc const b) { return Vec32uc(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec32uc operator && (Vec32uc const & a, Vec32uc const & b) { +static inline Vec32uc operator && (Vec32uc const a, Vec32uc const b) { return a & b; } // vector operator | : bitwise or -static inline Vec32uc operator | (Vec32uc const & a, Vec32uc const & b) { +static inline Vec32uc operator | (Vec32uc const a, Vec32uc const b) { return Vec32uc(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec32uc operator || (Vec32uc const & a, Vec32uc const & b) { +static inline Vec32uc operator || (Vec32uc const a, Vec32uc const b) { return a | b; } // vector operator ^ : bitwise xor -static inline Vec32uc operator ^ (Vec32uc const & a, Vec32uc const & b) { +static inline Vec32uc operator ^ (Vec32uc const a, Vec32uc const b) { return Vec32uc(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ~ : bitwise not -static inline Vec32uc operator ~ (Vec32uc const & a) { +static inline Vec32uc operator ~ (Vec32uc const a) { return Vec32uc(~a.get_low(), ~a.get_high()); } @@ -959,50 +901,58 @@ static inline Vec32uc operator ~ (Vec32uc const & a) { // for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) -static inline Vec32uc select (Vec32cb const & s, Vec32uc const & a, Vec32uc const & b) { +static inline Vec32uc select (Vec32cb const s, Vec32uc const a, Vec32uc const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec32uc if_add (Vec32cb const & f, Vec32uc const & a, Vec32uc const & b) { +static inline Vec32uc if_add (Vec32cb const f, Vec32uc const a, Vec32uc const b) { return a + (Vec32uc(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around +// Conditional subtract +static inline Vec32uc if_sub (Vec32cb const f, Vec32uc const a, Vec32uc const b) { + return a - (Vec32uc(f) & b); +} + +// Conditional multiply +static inline Vec32uc if_mul (Vec32cb const f, Vec32uc const a, Vec32uc const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around // (Note: horizontal_add_x(Vec32uc) is slightly faster) -static inline uint32_t horizontal_add (Vec32uc const & a) { +static inline uint32_t horizontal_add (Vec32uc const a) { return horizontal_add(a.get_low() + a.get_high()); } // Horizontal add extended: Calculates the sum of all vector elements. 
// Each element is zero-extended before addition to avoid overflow -static inline uint32_t horizontal_add_x (Vec32uc const & a) { +static inline uint32_t horizontal_add_x (Vec32uc const a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); } // function add_saturated: add element by element, unsigned with saturation -static inline Vec32uc add_saturated(Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); +static inline Vec32uc add_saturated(Vec32uc const a, Vec32uc const b) { + return Vec32uc(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); } // function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec32uc sub_saturated(Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); +static inline Vec32uc sub_saturated(Vec32uc const a, Vec32uc const b) { + return Vec32uc(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); } // function max: a > b ? a : b -static inline Vec32uc max(Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); +static inline Vec32uc max(Vec32uc const a, Vec32uc const b) { + return Vec32uc(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec32uc min(Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); +static inline Vec32uc min(Vec32uc const a, Vec32uc const b) { + return Vec32uc(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } - /***************************************************************************** * * Vector of 16 16-bit signed integers @@ -1025,15 +975,15 @@ class Vec16s : public Vec256b { y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15); } // Constructor to build from two Vec8s: - Vec16s(Vec8s const & a0, Vec8s const & a1) { + Vec16s(Vec8s const a0, Vec8s const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec16s(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec16s(Vec256b const & x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec16s & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec16s & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -1049,12 +999,6 @@ class Vec16s : public Vec256b { y1 = _mm_load_si128((__m128i const*)p + 1); return *this; } - // Member function to load 16 8-bit unsigned integers from array - Vec16s & load_16uc(void const * p) { - y0 = Vec8s().load_8uc(p); - y1 = Vec8s().load_8uc((uint8_t const*)p + 8); - return *this; - } // Partial load. Load n elements and set the rest to 0 Vec16s & load_partial(int n, void const * p) { if (n <= 0) { @@ -1089,13 +1033,12 @@ class Vec16s : public Vec256b { } // cut off vector to n elements. The last 16-n elements are set to zero Vec16s & cutoff(int n) { - *this = Vec32c(*this).cutoff(n * 2); + *this = Vec16s(Vec32c(*this).cutoff(n * 2)); return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec16s const & insert(uint32_t index, int16_t value) { - if (index < 8) { + Vec16s const insert(int index, int16_t value) { + if ((uint32_t)index < 8) { y0 = Vec8s(y0).insert(index, value); } else { @@ -1104,8 +1047,8 @@ class Vec16s : public Vec256b { return *this; } // Member function extract a single element from vector - int16_t extract(uint32_t index) const { - if (index < 8) { + int16_t extract(int index) const { + if ((uint32_t)index < 8) { return Vec8s(y0).extract(index); } else { @@ -1114,7 +1057,7 @@ class Vec16s : public Vec256b { } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. - int16_t operator [] (uint32_t index) const { + int16_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec8s: @@ -1124,9 +1067,12 @@ class Vec16s : public Vec256b { Vec8s get_high() const { return y1; } - static int size () { + static constexpr int size() { return 16; } + static constexpr int elementtype() { + return 6; + } }; @@ -1144,15 +1090,15 @@ class Vec16sb : public Vec16s { // Constructor to build from all elements: Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) : - Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7), + Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7), -int16_t(x8), -int16_t(x9), -int16_t(x10), -int16_t(x11), -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15)) {} - // Constructor to convert from type Vec256ie - Vec16sb(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec16sb(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec16sb & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec16sb & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -1164,10 +1110,9 @@ class Vec16sb : public Vec16s { *this = Vec16sb(b); return *this; } -private: // Prevent constructing from int, etc. - Vec16sb(int b); - Vec16sb & operator = (int x); -public: + // Constructor to build from two Vec8sb: + Vec16sb(Vec8sb const a0, Vec8sb const a1) : Vec16s(Vec8s(a0), Vec8s(a1)) { + } // Member functions to split into two Vec8s: Vec8sb get_low() const { return y0; @@ -1178,16 +1123,28 @@ class Vec16sb : public Vec16s { Vec16sb & insert (int index, bool a) { Vec16s::insert(index, -(int)a); return *this; - } + } // Member function extract a single element from vector - bool extract(uint32_t index) const { + bool extract(int index) const { return Vec16s::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { + bool operator [] (int index) const { return extract(index); } + // Member function to change a bitfield to a boolean vector + Vec16sb & load_bits(uint16_t a) { + y0 = Vec8sb().load_bits(uint8_t(a)); + y1 = Vec8sb().load_bits(uint8_t(a>>8)); + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
+ Vec16sb(int b) = delete; + Vec16sb & operator = (int x) = delete; }; @@ -1198,53 +1155,63 @@ class Vec16sb : public Vec16s { *****************************************************************************/ // vector operator & : bitwise and -static inline Vec16sb operator & (Vec16sb const & a, Vec16sb const & b) { +static inline Vec16sb operator & (Vec16sb const a, Vec16sb const b) { return Vec16sb(Vec256b(a) & Vec256b(b)); } -static inline Vec16sb operator && (Vec16sb const & a, Vec16sb const & b) { +static inline Vec16sb operator && (Vec16sb const a, Vec16sb const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec16sb & operator &= (Vec16sb & a, Vec16sb const & b) { +static inline Vec16sb & operator &= (Vec16sb & a, Vec16sb const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec16sb operator | (Vec16sb const & a, Vec16sb const & b) { +static inline Vec16sb operator | (Vec16sb const a, Vec16sb const b) { return Vec16sb(Vec256b(a) | Vec256b(b)); } -static inline Vec16sb operator || (Vec16sb const & a, Vec16sb const & b) { +static inline Vec16sb operator || (Vec16sb const a, Vec16sb const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec16sb & operator |= (Vec16sb & a, Vec16sb const & b) { +static inline Vec16sb & operator |= (Vec16sb & a, Vec16sb const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec16sb operator ^ (Vec16sb const & a, Vec16sb const & b) { +static inline Vec16sb operator ^ (Vec16sb const a, Vec16sb const b) { return Vec16sb(Vec256b(a) ^ Vec256b(b)); } // vector operator ^= : bitwise xor -static inline Vec16sb & operator ^= (Vec16sb & a, Vec16sb const & b) { +static inline Vec16sb & operator ^= (Vec16sb & a, Vec16sb const b) { a = a ^ b; return a; } +// vector operator == : xnor +static inline Vec16sb operator == (Vec16sb const a, Vec16sb const b) { + return Vec16sb(Vec256b(a) ^ Vec256b(~b)); +} + +// vector operator != : xor +static inline Vec16sb operator != (Vec16sb const a, Vec16sb const b) { + return Vec16sb(a ^ b); +} + // vector operator ~ : bitwise not -static inline Vec16sb operator ~ (Vec16sb const & a) { +static inline Vec16sb operator ~ (Vec16sb const a) { return Vec16sb( ~ Vec256b(a)); } // vector operator ! : element not -static inline Vec16sb operator ! (Vec16sb const & a) { +static inline Vec16sb operator ! 
(Vec16sb const a) { return ~ a; } // vector function andnot -static inline Vec16sb andnot (Vec16sb const & a, Vec16sb const & b) { +static inline Vec16sb andnot (Vec16sb const a, Vec16sb const b) { return Vec16sb(andnot(Vec256b(a), Vec256b(b))); } @@ -1256,12 +1223,12 @@ static inline Vec16sb andnot (Vec16sb const & a, Vec16sb const & b) { *****************************************************************************/ // vector operator + : add element by element -static inline Vec16s operator + (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator + (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator += : add -static inline Vec16s & operator += (Vec16s & a, Vec16s const & b) { +static inline Vec16s & operator += (Vec16s & a, Vec16s const b) { a = a + b; return a; } @@ -1280,17 +1247,17 @@ static inline Vec16s & operator ++ (Vec16s & a) { } // vector operator - : subtract element by element -static inline Vec16s operator - (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator - (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : unary minus -static inline Vec16s operator - (Vec16s const & a) { +static inline Vec16s operator - (Vec16s const a) { return Vec16s(-a.get_low(), -a.get_high()); } // vector operator -= : subtract -static inline Vec16s & operator -= (Vec16s & a, Vec16s const & b) { +static inline Vec16s & operator -= (Vec16s & a, Vec16s const b) { a = a - b; return a; } @@ -1309,29 +1276,29 @@ static inline Vec16s & operator -- (Vec16s & a) { } // vector operator * : multiply element by element -static inline Vec16s operator * (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator * (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator *= : multiply -static inline Vec16s & operator *= (Vec16s & a, Vec16s const & b) { +static inline Vec16s & operator *= (Vec16s & a, Vec16s const b) { a = a * b; return a; } // vector operator / : divide all elements by same integer -static inline Vec16s operator / (Vec16s const & a, Divisor_s const & d) { +static inline Vec16s operator / (Vec16s const a, Divisor_s const d) { return Vec16s(a.get_low() / d, a.get_high() / d); } // vector operator /= : divide -static inline Vec16s & operator /= (Vec16s & a, Divisor_s const & d) { +static inline Vec16s & operator /= (Vec16s & a, Divisor_s const d) { a = a / d; return a; } // vector operator << : shift left -static inline Vec16s operator << (Vec16s const & a, int b) { +static inline Vec16s operator << (Vec16s const a, int b) { return Vec16s(a.get_low() << b, a.get_high() << b); } @@ -1342,7 +1309,7 @@ static inline Vec16s & operator <<= (Vec16s & a, int b) { } // vector operator >> : shift right arithmetic -static inline Vec16s operator >> (Vec16s const & a, int b) { +static inline Vec16s operator >> (Vec16s const a, int b) { return Vec16s(a.get_low() >> b, a.get_high() >> b); } @@ -1353,141 +1320,145 @@ static inline Vec16s & operator >>= (Vec16s & a, int b) { } // vector operator == : returns true for elements for which a == b -static inline Vec16sb operator == (Vec16s const & a, Vec16s const & b) { +static inline Vec16sb operator == (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for which a != b -static inline Vec16sb 
operator != (Vec16s const & a, Vec16s const & b) { +static inline Vec16sb operator != (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } // vector operator > : returns true for elements for which a > b -static inline Vec16sb operator > (Vec16s const & a, Vec16s const & b) { +static inline Vec16sb operator > (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator < : returns true for elements for which a < b -static inline Vec16sb operator < (Vec16s const & a, Vec16s const & b) { +static inline Vec16sb operator < (Vec16s const a, Vec16s const b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec16sb operator >= (Vec16s const & a, Vec16s const & b) { +static inline Vec16sb operator >= (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec16sb operator <= (Vec16s const & a, Vec16s const & b) { +static inline Vec16sb operator <= (Vec16s const a, Vec16s const b) { return b >= a; } // vector operator & : bitwise and -static inline Vec16s operator & (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator & (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec16s operator && (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator && (Vec16s const a, Vec16s const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec16s & operator &= (Vec16s & a, Vec16s const & b) { +static inline Vec16s & operator &= (Vec16s & a, Vec16s const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec16s operator | (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator | (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec16s operator || (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator || (Vec16s const a, Vec16s const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec16s & operator |= (Vec16s & a, Vec16s const & b) { +static inline Vec16s & operator |= (Vec16s & a, Vec16s const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec16s operator ^ (Vec16s const & a, Vec16s const & b) { +static inline Vec16s operator ^ (Vec16s const a, Vec16s const b) { return Vec16s(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor -static inline Vec16s & operator ^= (Vec16s & a, Vec16s const & b) { +static inline Vec16s & operator ^= (Vec16s & a, Vec16s const b) { a = a ^ b; return a; } // vector operator ~ : bitwise not -static inline Vec16s operator ~ (Vec16s const & a) { +static inline Vec16s operator ~ (Vec16s const a) { return Vec16s(~Vec256b(a)); } -// vector operator ! : logical not, returns true for elements == 0 -static inline Vec16sb operator ! (Vec16s const & a) { - return Vec16s(!a.get_low(), !a.get_high()); -} - // Functions for this class // Select between two operands. Corresponds to this pseudocode: // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. 
// (s is signed) -static inline Vec16s select (Vec16sb const & s, Vec16s const & a, Vec16s const & b) { +static inline Vec16s select (Vec16sb const s, Vec16s const a, Vec16s const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec16s if_add (Vec16sb const & f, Vec16s const & a, Vec16s const & b) { +static inline Vec16s if_add (Vec16sb const f, Vec16s const a, Vec16s const b) { return a + (Vec16s(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec16s const & a) { +// Conditional subtract +static inline Vec16s if_sub (Vec16sb const f, Vec16s const a, Vec16s const b) { + return a - (Vec16s(f) & b); +} + +// Conditional multiply +static inline Vec16s if_mul (Vec16sb const f, Vec16s const a, Vec16s const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int16_t horizontal_add (Vec16s const a) { return horizontal_add(a.get_low() + a.get_high()); } // Horizontal add extended: Calculates the sum of all vector elements. // Elements are sign extended before adding to avoid overflow -static inline int32_t horizontal_add_x (Vec16s const & a) { +static inline int32_t horizontal_add_x (Vec16s const a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); } // function add_saturated: add element by element, signed with saturation -static inline Vec16s add_saturated(Vec16s const & a, Vec16s const & b) { +static inline Vec16s add_saturated(Vec16s const a, Vec16s const b) { return Vec16s(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); } // function sub_saturated: subtract element by element, signed with saturation -static inline Vec16s sub_saturated(Vec16s const & a, Vec16s const & b) { +static inline Vec16s sub_saturated(Vec16s const a, Vec16s const b) { return Vec16s(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); } // function max: a > b ? a : b -static inline Vec16s max(Vec16s const & a, Vec16s const & b) { +static inline Vec16s max(Vec16s const a, Vec16s const b) { return Vec16s(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec16s min(Vec16s const & a, Vec16s const & b) { +static inline Vec16s min(Vec16s const a, Vec16s const b) { return Vec16s(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } // function abs: a >= 0 ? 
a : -a -static inline Vec16s abs(Vec16s const & a) { +static inline Vec16s abs(Vec16s const a) { return Vec16s(abs(a.get_low()), abs(a.get_high())); } // function abs_saturated: same as abs, saturate if overflow -static inline Vec16s abs_saturated(Vec16s const & a) { +static inline Vec16s abs_saturated(Vec16s const a) { return Vec16s(abs_saturated(a.get_low()), abs_saturated(a.get_high())); } // function rotate_left all elements // Use negative count to rotate right -static inline Vec16s rotate_left(Vec16s const & a, int b) { +static inline Vec16s rotate_left(Vec16s const a, int b) { return Vec16s(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b)); } @@ -1510,19 +1481,19 @@ class Vec16us : public Vec16s { // Constructor to build from all elements: Vec16us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7, uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15) { - y0 = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7); - y1 = _mm_setr_epi16(i8, i9, i10, i11, i12, i13, i14, i15 ); + y0 = _mm_setr_epi16((int16_t)i0, (int16_t)i1, (int16_t)i2, (int16_t)i3, (int16_t)i4, (int16_t)i5, (int16_t)i6, (int16_t)i7); + y1 = _mm_setr_epi16((int16_t)i8, (int16_t)i9, (int16_t)i10, (int16_t)i11, (int16_t)i12, (int16_t)i13, (int16_t)i14, (int16_t)i15); } // Constructor to build from two Vec8us: - Vec16us(Vec8us const & a0, Vec8us const & a1) { + Vec16us(Vec8us const a0, Vec8us const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec16us(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec16us(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec16us & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec16us & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -1539,18 +1510,17 @@ class Vec16us : public Vec16s { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec16us const & insert(uint32_t index, uint16_t value) { - Vec16s::insert(index, value); + Vec16us const insert(int index, uint16_t value) { + Vec16s::insert(index, (int16_t)value); return *this; } // Member function extract a single element from vector - uint16_t extract(uint32_t index) const { - return Vec16s::extract(index); + uint16_t extract(int index) const { + return (uint16_t)Vec16s::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
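// [Illustrative sketch, not part of the patch] Element access on the unsigned
// wrapper, assuming the vectorclass headers are included; the helper name is
// hypothetical. operator [] only reads; writes go through insert(), and the
// value round-trips through the signed base class:
static inline uint16_t demo_vec16us_access() {
    Vec16us v(0);          // broadcast 0 into all 16 elements
    v.insert(3, 60000);    // stored internally as int16_t via Vec16s::insert
    return v[3];           // extract() casts back to uint16_t, so this is 60000
}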
- uint16_t operator [] (uint32_t index) const { + uint16_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec8us: @@ -1560,43 +1530,46 @@ class Vec16us : public Vec16s { Vec8us get_high() const { return y1; } + static constexpr int elementtype() { + return 7; + } }; // Define operators for this class // vector operator + : add -static inline Vec16us operator + (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator + (Vec16us const a, Vec16us const b) { return Vec16us(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator - : subtract -static inline Vec16us operator - (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator - (Vec16us const a, Vec16us const b) { return Vec16us(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator * : multiply -static inline Vec16us operator * (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator * (Vec16us const a, Vec16us const b) { return Vec16us(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator / : divide -static inline Vec16us operator / (Vec16us const & a, Divisor_us const & d) { +static inline Vec16us operator / (Vec16us const a, Divisor_us const d) { return Vec16us(a.get_low() / d, a.get_high() / d); } // vector operator /= : divide -static inline Vec16us & operator /= (Vec16us & a, Divisor_us const & d) { +static inline Vec16us & operator /= (Vec16us & a, Divisor_us const d) { a = a / d; return a; } // vector operator >> : shift right logical all elements -static inline Vec16us operator >> (Vec16us const & a, uint32_t b) { +static inline Vec16us operator >> (Vec16us const a, uint32_t b) { return Vec16us(a.get_low() >> b, a.get_high() >> b); } // vector operator >> : shift right logical all elements -static inline Vec16us operator >> (Vec16us const & a, int b) { +static inline Vec16us operator >> (Vec16us const a, int b) { return a >> (uint32_t)b; } @@ -1607,58 +1580,58 @@ static inline Vec16us & operator >>= (Vec16us & a, uint32_t b) { } // vector operator << : shift left all elements -static inline Vec16us operator << (Vec16us const & a, uint32_t b) { +static inline Vec16us operator << (Vec16us const a, uint32_t b) { return Vec16us(a.get_low() << b, a.get_high() << b); } // vector operator << : shift left all elements -static inline Vec16us operator << (Vec16us const & a, int32_t b) { +static inline Vec16us operator << (Vec16us const a, int32_t b) { return a << (uint32_t)b; } // vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec16sb operator >= (Vec16us const & a, Vec16us const & b) { +static inline Vec16sb operator >= (Vec16us const a, Vec16us const b) { return Vec16s(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec16sb operator <= (Vec16us const & a, Vec16us const & b) { +static inline Vec16sb operator <= (Vec16us const a, Vec16us const b) { return b >= a; } // vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec16sb operator > (Vec16us const & a, Vec16us const & b) { +static inline Vec16sb operator > (Vec16us const a, Vec16us const b) { return Vec16s(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec16sb operator < (Vec16us const & a, Vec16us const & b) { +static inline Vec16sb 
operator < (Vec16us const a, Vec16us const b) { return b > a; } // vector operator & : bitwise and -static inline Vec16us operator & (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator & (Vec16us const a, Vec16us const b) { return Vec16us(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec16us operator && (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator && (Vec16us const a, Vec16us const b) { return a & b; } // vector operator | : bitwise or -static inline Vec16us operator | (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator | (Vec16us const a, Vec16us const b) { return Vec16us(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec16us operator || (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator || (Vec16us const a, Vec16us const b) { return a | b; } // vector operator ^ : bitwise xor -static inline Vec16us operator ^ (Vec16us const & a, Vec16us const & b) { +static inline Vec16us operator ^ (Vec16us const a, Vec16us const b) { return Vec16us(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ~ : bitwise not -static inline Vec16us operator ~ (Vec16us const & a) { +static inline Vec16us operator ~ (Vec16us const a) { return Vec16us(~ Vec256b(a)); } @@ -1669,49 +1642,57 @@ static inline Vec16us operator ~ (Vec16us const & a) { // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each word in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) -static inline Vec16us select (Vec16sb const & s, Vec16us const & a, Vec16us const & b) { +static inline Vec16us select (Vec16sb const s, Vec16us const a, Vec16us const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec16us if_add (Vec16sb const & f, Vec16us const & a, Vec16us const & b) { +static inline Vec16us if_add (Vec16sb const f, Vec16us const a, Vec16us const b) { return a + (Vec16us(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint32_t horizontal_add (Vec16us const & a) { +// Conditional subtract +static inline Vec16us if_sub (Vec16sb const f, Vec16us const a, Vec16us const b) { + return a - (Vec16us(f) & b); +} + +// Conditional multiply +static inline Vec16us if_mul (Vec16sb const f, Vec16us const a, Vec16us const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint32_t horizontal_add (Vec16us const a) { return horizontal_add(a.get_low() + a.get_high()); } // Horizontal add extended: Calculates the sum of all vector elements. 
// Each element is zero-extended before addition to avoid overflow -static inline uint32_t horizontal_add_x (Vec16us const & a) { +static inline uint32_t horizontal_add_x (Vec16us const a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); } // function add_saturated: add element by element, unsigned with saturation -static inline Vec16us add_saturated(Vec16us const & a, Vec16us const & b) { +static inline Vec16us add_saturated(Vec16us const a, Vec16us const b) { return Vec16us(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); } // function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec16us sub_saturated(Vec16us const & a, Vec16us const & b) { +static inline Vec16us sub_saturated(Vec16us const a, Vec16us const b) { return Vec16us(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); } // function max: a > b ? a : b -static inline Vec16us max(Vec16us const & a, Vec16us const & b) { +static inline Vec16us max(Vec16us const a, Vec16us const b) { return Vec16us(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec16us min(Vec16us const & a, Vec16us const & b) { +static inline Vec16us min(Vec16us const a, Vec16us const b) { return Vec16us(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } - /***************************************************************************** * * Vector of 8 32-bit signed integers @@ -1733,15 +1714,15 @@ class Vec8i : public Vec256b { y1 = _mm_setr_epi32(i4, i5, i6, i7); } // Constructor to build from two Vec4i: - Vec8i(Vec4i const & a0, Vec4i const & a1) { + Vec8i(Vec4i const a0, Vec4i const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec8i(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec8i(Vec256b const & x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec8i & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec8i & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -1757,18 +1738,6 @@ class Vec8i : public Vec256b { y1 = _mm_load_si128((__m128i const*)p + 1); return *this; } - // Member function to load 8 8-bit unsigned integers from array - Vec8i & load_8uc(void const * p) { - y0 = Vec4i().load_4uc(p); - y1 = Vec4i().load_4uc((uint8_t const*)p + 4); - return *this; - } - // Member function to load 8 16-bit unsigned integers from array - Vec8i & load_8us(void const * p) { - y0 = Vec4i().load_4us(p); - y1 = Vec4i().load_4us((uint16_t const*)p + 4); - return *this; - } // Partial load. Load n elements and set the rest to 0 Vec8i & load_partial(int n, void const * p) { if (n <= 0) { @@ -1807,9 +1776,8 @@ class Vec8i : public Vec256b { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8i const & insert(uint32_t index, int32_t value) { - if (index < 4) { + Vec8i const insert(int index, int32_t value) { + if ((uint32_t)index < 4) { y0 = Vec4i(y0).insert(index, value); } else { @@ -1818,9 +1786,8 @@ class Vec8i : public Vec256b { return *this; } // Member function extract a single element from vector - // Note: This function is inefficient. 
Use store function if extracting more than one element - int32_t extract(uint32_t index) const { - if (index < 4) { + int32_t extract(int index) const { + if ((uint32_t)index < 4) { return Vec4i(y0).extract(index); } else { @@ -1829,7 +1796,7 @@ class Vec8i : public Vec256b { } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. - int32_t operator [] (uint32_t index) const { + int32_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec4i: @@ -1839,7 +1806,10 @@ class Vec8i : public Vec256b { Vec4i get_high() const { return y1; } - static int size () { + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { return 8; } }; @@ -1860,12 +1830,12 @@ class Vec8ib : public Vec8i { Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) : Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7)) {} - // Constructor to convert from type Vec256ie - Vec8ib(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec8ib(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec8ib & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec8ib & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -1877,10 +1847,9 @@ class Vec8ib : public Vec8i { *this = Vec8ib(b); return *this; } -private: // Prevent constructing from int, etc. - Vec8ib(int b); - Vec8ib & operator = (int x); -public: + // Constructor to build from two Vec4ib: + Vec8ib(Vec4ib const a0, Vec4ib const a1) : Vec8i(Vec4i(a0), Vec4i(a1)) { + } // Member functions to split into two Vec4i: Vec4ib get_low() const { return y0; @@ -1893,15 +1862,26 @@ class Vec8ib : public Vec8i { return *this; } // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - bool extract(uint32_t index) const { + bool extract(int index) const { return Vec8i::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { + bool operator [] (int index) const { return extract(index); } + // Member function to change a bitfield to a boolean vector + Vec8ib & load_bits(uint8_t a) { + y0 = Vec4ib().load_bits(uint16_t(a)); + y1 = Vec4ib().load_bits(uint16_t(a>>4)); + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
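// [Illustrative sketch, not part of the patch] The new load_bits() member maps
// bit i of a bitfield to boolean element i; hypothetical helper, assuming the
// vectorclass headers are included and using the select() defined further down:
static inline Vec8i demo_mask_from_bits(Vec8i const a, Vec8i const b) {
    Vec8ib m = Vec8ib().load_bits(0xB1);   // elements 0, 4, 5 and 7 become true
    return select(m, a, b);                // a[i] where the bit is set, else b[i]
}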
+ Vec8ib(int b) = delete; + Vec8ib & operator = (int x) = delete; }; /***************************************************************************** @@ -1911,57 +1891,66 @@ class Vec8ib : public Vec8i { *****************************************************************************/ // vector operator & : bitwise and -static inline Vec8ib operator & (Vec8ib const & a, Vec8ib const & b) { +static inline Vec8ib operator & (Vec8ib const a, Vec8ib const b) { return Vec8ib(Vec256b(a) & Vec256b(b)); } -static inline Vec8ib operator && (Vec8ib const & a, Vec8ib const & b) { +static inline Vec8ib operator && (Vec8ib const a, Vec8ib const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec8ib & operator &= (Vec8ib & a, Vec8ib const & b) { +static inline Vec8ib & operator &= (Vec8ib & a, Vec8ib const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec8ib operator | (Vec8ib const & a, Vec8ib const & b) { +static inline Vec8ib operator | (Vec8ib const a, Vec8ib const b) { return Vec8ib(Vec256b(a) | Vec256b(b)); } -static inline Vec8ib operator || (Vec8ib const & a, Vec8ib const & b) { +static inline Vec8ib operator || (Vec8ib const a, Vec8ib const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec8ib & operator |= (Vec8ib & a, Vec8ib const & b) { +static inline Vec8ib & operator |= (Vec8ib & a, Vec8ib const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec8ib operator ^ (Vec8ib const & a, Vec8ib const & b) { +static inline Vec8ib operator ^ (Vec8ib const a, Vec8ib const b) { return Vec8ib(Vec256b(a) ^ Vec256b(b)); } // vector operator ^= : bitwise xor -static inline Vec8ib & operator ^= (Vec8ib & a, Vec8ib const & b) { +static inline Vec8ib & operator ^= (Vec8ib & a, Vec8ib const b) { a = a ^ b; return a; } +// vector operator == : xnor +static inline Vec8ib operator == (Vec8ib const a, Vec8ib const b) { + return Vec8ib(Vec256b(a) ^ Vec256b(~b)); +} + +// vector operator != : xor +static inline Vec8ib operator != (Vec8ib const a, Vec8ib const b) { + return Vec8ib(a ^ b); +} + // vector operator ~ : bitwise not -static inline Vec8ib operator ~ (Vec8ib const & a) { +static inline Vec8ib operator ~ (Vec8ib const a) { return Vec8ib( ~ Vec256b(a)); } // vector operator ! : element not -static inline Vec8ib operator ! (Vec8ib const & a) { +static inline Vec8ib operator ! 
(Vec8ib const a) { return ~ a; } // vector function andnot -static inline Vec8ib andnot (Vec8ib const & a, Vec8ib const & b) { +static inline Vec8ib andnot (Vec8ib const a, Vec8ib const b) { return Vec8ib(andnot(Vec256b(a), Vec256b(b))); } - /***************************************************************************** * * Operators for Vec8i @@ -1969,12 +1958,12 @@ static inline Vec8ib andnot (Vec8ib const & a, Vec8ib const & b) { *****************************************************************************/ // vector operator + : add element by element -static inline Vec8i operator + (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator + (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator += : add -static inline Vec8i & operator += (Vec8i & a, Vec8i const & b) { +static inline Vec8i & operator += (Vec8i & a, Vec8i const b) { a = a + b; return a; } @@ -1993,17 +1982,17 @@ static inline Vec8i & operator ++ (Vec8i & a) { } // vector operator - : subtract element by element -static inline Vec8i operator - (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator - (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : unary minus -static inline Vec8i operator - (Vec8i const & a) { +static inline Vec8i operator - (Vec8i const a) { return Vec8i(-a.get_low(), -a.get_high()); } // vector operator -= : subtract -static inline Vec8i & operator -= (Vec8i & a, Vec8i const & b) { +static inline Vec8i & operator -= (Vec8i & a, Vec8i const b) { a = a - b; return a; } @@ -2022,29 +2011,29 @@ static inline Vec8i & operator -- (Vec8i & a) { } // vector operator * : multiply element by element -static inline Vec8i operator * (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator * (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator *= : multiply -static inline Vec8i & operator *= (Vec8i & a, Vec8i const & b) { +static inline Vec8i & operator *= (Vec8i & a, Vec8i const b) { a = a * b; return a; } // vector operator / : divide all elements by same integer -static inline Vec8i operator / (Vec8i const & a, Divisor_i const & d) { +static inline Vec8i operator / (Vec8i const a, Divisor_i const d) { return Vec8i(a.get_low() / d, a.get_high() / d); } // vector operator /= : divide -static inline Vec8i & operator /= (Vec8i & a, Divisor_i const & d) { +static inline Vec8i & operator /= (Vec8i & a, Divisor_i const d) { a = a / d; return a; } // vector operator << : shift left -static inline Vec8i operator << (Vec8i const & a, int32_t b) { +static inline Vec8i operator << (Vec8i const a, int32_t b) { return Vec8i(a.get_low() << b, a.get_high() << b); } @@ -2055,7 +2044,7 @@ static inline Vec8i & operator <<= (Vec8i & a, int32_t b) { } // vector operator >> : shift right arithmetic -static inline Vec8i operator >> (Vec8i const & a, int32_t b) { +static inline Vec8i operator >> (Vec8i const a, int32_t b) { return Vec8i(a.get_low() >> b, a.get_high() >> b); } @@ -2066,78 +2055,78 @@ static inline Vec8i & operator >>= (Vec8i & a, int32_t b) { } // vector operator == : returns true for elements for which a == b -static inline Vec8ib operator == (Vec8i const & a, Vec8i const & b) { +static inline Vec8ib operator == (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for 
which a != b -static inline Vec8ib operator != (Vec8i const & a, Vec8i const & b) { +static inline Vec8ib operator != (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } - + // vector operator > : returns true for elements for which a > b -static inline Vec8ib operator > (Vec8i const & a, Vec8i const & b) { +static inline Vec8ib operator > (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator < : returns true for elements for which a < b -static inline Vec8ib operator < (Vec8i const & a, Vec8i const & b) { +static inline Vec8ib operator < (Vec8i const a, Vec8i const b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec8ib operator >= (Vec8i const & a, Vec8i const & b) { +static inline Vec8ib operator >= (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec8ib operator <= (Vec8i const & a, Vec8i const & b) { +static inline Vec8ib operator <= (Vec8i const a, Vec8i const b) { return b >= a; } // vector operator & : bitwise and -static inline Vec8i operator & (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator & (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec8i operator && (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator && (Vec8i const a, Vec8i const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec8i & operator &= (Vec8i & a, Vec8i const & b) { +static inline Vec8i & operator &= (Vec8i & a, Vec8i const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec8i operator | (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator | (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec8i operator || (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator || (Vec8i const a, Vec8i const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec8i & operator |= (Vec8i & a, Vec8i const & b) { +static inline Vec8i & operator |= (Vec8i & a, Vec8i const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec8i operator ^ (Vec8i const & a, Vec8i const & b) { +static inline Vec8i operator ^ (Vec8i const a, Vec8i const b) { return Vec8i(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor -static inline Vec8i & operator ^= (Vec8i & a, Vec8i const & b) { +static inline Vec8i & operator ^= (Vec8i & a, Vec8i const b) { a = a ^ b; return a; } // vector operator ~ : bitwise not -static inline Vec8i operator ~ (Vec8i const & a) { +static inline Vec8i operator ~ (Vec8i const a) { return Vec8i(~a.get_low(), ~a.get_high()); } // vector operator ! : returns true for elements == 0 -static inline Vec8ib operator ! (Vec8i const & a) { +static inline Vec8ib operator ! (Vec8i const a) { return Vec8i(!a.get_low(), !a.get_high()); } @@ -2147,60 +2136,69 @@ static inline Vec8ib operator ! (Vec8i const & a) { // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. 
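// [Illustrative sketch, not part of the patch] Comparison operators already
// yield the required 0 / -1 element masks, so a typical pattern with the
// select() defined just below is (hypothetical helper, headers assumed):
static inline Vec8i demo_clamp_to_zero(Vec8i const a) {
    Vec8ib negative = a < Vec8i(0);        // -1 where a[i] < 0, otherwise 0
    return select(negative, Vec8i(0), a);  // equivalent to max(a, Vec8i(0))
}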
// (s is signed) -static inline Vec8i select (Vec8ib const & s, Vec8i const & a, Vec8i const & b) { +static inline Vec8i select (Vec8ib const s, Vec8i const a, Vec8i const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8i if_add (Vec8ib const & f, Vec8i const & a, Vec8i const & b) { +static inline Vec8i if_add (Vec8ib const f, Vec8i const a, Vec8i const b) { return a + (Vec8i(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec8i const & a) { +// Conditional subtract +static inline Vec8i if_sub (Vec8ib const f, Vec8i const a, Vec8i const b) { + return a - (Vec8i(f) & b); +} + +// Conditional multiply +static inline Vec8i if_mul (Vec8ib const f, Vec8i const a, Vec8i const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int32_t horizontal_add (Vec8i const a) { return horizontal_add(a.get_low() + a.get_high()); } // Horizontal add extended: Calculates the sum of all vector elements. // Elements are sign extended before adding to avoid overflow -static inline int64_t horizontal_add_x (Vec8i const & a) { +static inline int64_t horizontal_add_x (Vec8i const a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); } // function add_saturated: add element by element, signed with saturation -static inline Vec8i add_saturated(Vec8i const & a, Vec8i const & b) { +static inline Vec8i add_saturated(Vec8i const a, Vec8i const b) { return Vec8i(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); } // function sub_saturated: subtract element by element, signed with saturation -static inline Vec8i sub_saturated(Vec8i const & a, Vec8i const & b) { +static inline Vec8i sub_saturated(Vec8i const a, Vec8i const b) { return Vec8i(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); } // function max: a > b ? a : b -static inline Vec8i max(Vec8i const & a, Vec8i const & b) { +static inline Vec8i max(Vec8i const a, Vec8i const b) { return Vec8i(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec8i min(Vec8i const & a, Vec8i const & b) { +static inline Vec8i min(Vec8i const a, Vec8i const b) { return Vec8i(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } // function abs: a >= 0 ? 
a : -a -static inline Vec8i abs(Vec8i const & a) { +static inline Vec8i abs(Vec8i const a) { return Vec8i(abs(a.get_low()), abs(a.get_high())); } // function abs_saturated: same as abs, saturate if overflow -static inline Vec8i abs_saturated(Vec8i const & a) { +static inline Vec8i abs_saturated(Vec8i const a) { return Vec8i(abs_saturated(a.get_low()), abs_saturated(a.get_high())); } // function rotate_left all elements // Use negative count to rotate right -static inline Vec8i rotate_left(Vec8i const & a, int b) { +static inline Vec8i rotate_left(Vec8i const a, int b) { return Vec8i(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b)); } @@ -2218,23 +2216,23 @@ class Vec8ui : public Vec8i { } // Constructor to broadcast the same value into all elements: Vec8ui(uint32_t i) { - y1 = y0 = _mm_set1_epi32(i); + y1 = y0 = _mm_set1_epi32(int32_t(i)); } // Constructor to build from all elements: Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7) { - y0 = _mm_setr_epi32(i0, i1, i2, i3); - y1 = _mm_setr_epi32(i4, i5, i6, i7); + y0 = _mm_setr_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3); + y1 = _mm_setr_epi32((int32_t)i4, (int32_t)i5, (int32_t)i6, (int32_t)i7); } // Constructor to build from two Vec4ui: - Vec8ui(Vec4ui const & a0, Vec4ui const & a1) { + Vec8ui(Vec4ui const a0, Vec4ui const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec8ui(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec8ui(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec8ui & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec8ui & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -2251,18 +2249,17 @@ class Vec8ui : public Vec8i { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8ui const & insert(uint32_t index, uint32_t value) { - Vec8i::insert(index, value); + Vec8ui const insert(int index, uint32_t value) { + Vec8i::insert(index, (int32_t)value); return *this; } // Member function extract a single element from vector - uint32_t extract(uint32_t index) const { - return Vec8i::extract(index); + uint32_t extract(int index) const { + return (uint32_t)Vec8i::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
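// [Illustrative sketch, not part of the patch] The explicit casts added to
// insert()/extract() above keep the unsigned round-trip through the signed
// base class explicit, even for values above 2^31 - 1 (hypothetical helper,
// assuming the vectorclass headers are included):
static inline uint32_t demo_vec8ui_roundtrip() {
    Vec8ui v(0u);
    v.insert(5, 0xFFFFFFFFu);   // held as int32_t(-1) inside Vec8i
    return v[5];                // extract() casts back, giving 0xFFFFFFFFu
}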
- uint32_t operator [] (uint32_t index) const { + uint32_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec4ui: @@ -2272,43 +2269,46 @@ class Vec8ui : public Vec8i { Vec4ui get_high() const { return y1; } + static constexpr int elementtype() { + return 9; + } }; // Define operators for this class // vector operator + : add -static inline Vec8ui operator + (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator + (Vec8ui const a, Vec8ui const b) { return Vec8ui (Vec8i(a) + Vec8i(b)); } // vector operator - : subtract -static inline Vec8ui operator - (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator - (Vec8ui const a, Vec8ui const b) { return Vec8ui (Vec8i(a) - Vec8i(b)); } // vector operator * : multiply -static inline Vec8ui operator * (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator * (Vec8ui const a, Vec8ui const b) { return Vec8ui (Vec8i(a) * Vec8i(b)); } // vector operator / : divide all elements by same integer -static inline Vec8ui operator / (Vec8ui const & a, Divisor_ui const & d) { +static inline Vec8ui operator / (Vec8ui const a, Divisor_ui const d) { return Vec8ui(a.get_low() / d, a.get_high() / d); } // vector operator /= : divide -static inline Vec8ui & operator /= (Vec8ui & a, Divisor_ui const & d) { +static inline Vec8ui & operator /= (Vec8ui & a, Divisor_ui const d) { a = a / d; return a; } // vector operator >> : shift right logical all elements -static inline Vec8ui operator >> (Vec8ui const & a, uint32_t b) { +static inline Vec8ui operator >> (Vec8ui const a, uint32_t b) { return Vec8ui(a.get_low() >> b, a.get_high() >> b); } // vector operator >> : shift right logical all elements -static inline Vec8ui operator >> (Vec8ui const & a, int32_t b) { +static inline Vec8ui operator >> (Vec8ui const a, int32_t b) { return a >> (uint32_t)b; } @@ -2316,67 +2316,67 @@ static inline Vec8ui operator >> (Vec8ui const & a, int32_t b) { static inline Vec8ui & operator >>= (Vec8ui & a, uint32_t b) { a = a >> b; return a; -} +} // vector operator >>= : shift right logical static inline Vec8ui & operator >>= (Vec8ui & a, int32_t b) { a = a >> b; return a; -} +} // vector operator << : shift left all elements -static inline Vec8ui operator << (Vec8ui const & a, uint32_t b) { +static inline Vec8ui operator << (Vec8ui const a, uint32_t b) { return Vec8ui ((Vec8i)a << (int32_t)b); } // vector operator << : shift left all elements -static inline Vec8ui operator << (Vec8ui const & a, int32_t b) { +static inline Vec8ui operator << (Vec8ui const a, int32_t b) { return Vec8ui ((Vec8i)a << (int32_t)b); } // vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec8ib operator > (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ib operator > (Vec8ui const a, Vec8ui const b) { return Vec8i(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec8ib operator < (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ib operator < (Vec8ui const a, Vec8ui const b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec8ib operator >= (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ib operator >= (Vec8ui const a, Vec8ui const b) { return Vec8i(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (unsigned) -static inline 
Vec8ib operator <= (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ib operator <= (Vec8ui const a, Vec8ui const b) { return b >= a; } // vector operator & : bitwise and -static inline Vec8ui operator & (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator & (Vec8ui const a, Vec8ui const b) { return Vec8ui(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec8ui operator && (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator && (Vec8ui const a, Vec8ui const b) { return a & b; } // vector operator | : bitwise or -static inline Vec8ui operator | (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator | (Vec8ui const a, Vec8ui const b) { return Vec8ui(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec8ui operator || (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator || (Vec8ui const a, Vec8ui const b) { return a | b; } // vector operator ^ : bitwise xor -static inline Vec8ui operator ^ (Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui operator ^ (Vec8ui const a, Vec8ui const b) { return Vec8ui(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ~ : bitwise not -static inline Vec8ui operator ~ (Vec8ui const & a) { +static inline Vec8ui operator ~ (Vec8ui const a) { return Vec8ui(~a.get_low(), ~a.get_high()); } @@ -2386,49 +2386,57 @@ static inline Vec8ui operator ~ (Vec8ui const & a) { // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; // Each word in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) -static inline Vec8ui select (Vec8ib const & s, Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui select (Vec8ib const s, Vec8ui const a, Vec8ui const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8ui if_add (Vec8ib const & f, Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui if_add (Vec8ib const f, Vec8ui const a, Vec8ui const b) { return a + (Vec8ui(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint32_t horizontal_add (Vec8ui const & a) { - return horizontal_add((Vec8i)a); +// Conditional subtract +static inline Vec8ui if_sub (Vec8ib const f, Vec8ui const a, Vec8ui const b) { + return a - (Vec8ui(f) & b); +} + +// Conditional multiply +static inline Vec8ui if_mul (Vec8ib const f, Vec8ui const a, Vec8ui const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint32_t horizontal_add (Vec8ui const a) { + return (uint32_t)horizontal_add((Vec8i)a); } // Horizontal add extended: Calculates the sum of all vector elements. 
// Elements are zero extended before adding to avoid overflow -static inline uint64_t horizontal_add_x (Vec8ui const & a) { +static inline uint64_t horizontal_add_x (Vec8ui const a) { return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); } // function add_saturated: add element by element, unsigned with saturation -static inline Vec8ui add_saturated(Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui add_saturated(Vec8ui const a, Vec8ui const b) { return Vec8ui(add_saturated(a.get_low(),b.get_low()), add_saturated(a.get_high(),b.get_high())); } // function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec8ui sub_saturated(Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui sub_saturated(Vec8ui const a, Vec8ui const b) { return Vec8ui(sub_saturated(a.get_low(),b.get_low()), sub_saturated(a.get_high(),b.get_high())); } // function max: a > b ? a : b -static inline Vec8ui max(Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui max(Vec8ui const a, Vec8ui const b) { return Vec8ui(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec8ui min(Vec8ui const & a, Vec8ui const & b) { +static inline Vec8ui min(Vec8ui const a, Vec8ui const b) { return Vec8ui(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } - /***************************************************************************** * * Vector of 4 64-bit signed integers @@ -2450,15 +2458,15 @@ class Vec4q : public Vec256b { y1 = Vec2q(i2,i3); } // Constructor to build from two Vec2q: - Vec4q(Vec2q const & a0, Vec2q const & a1) { + Vec4q(Vec2q const a0, Vec2q const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec4q(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec4q(Vec256b const & x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec4q & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec4q & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -2512,9 +2520,8 @@ class Vec4q : public Vec256b { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4q const & insert(uint32_t index, int64_t value) { - if (index < 2) { + Vec4q const insert(int index, int64_t value) { + if ((uint32_t)index < 2) { y0 = Vec2q(y0).insert(index, value); } else { @@ -2523,9 +2530,8 @@ class Vec4q : public Vec256b { return *this; } // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - int64_t extract(uint32_t index) const { - if (index < 2) { + int64_t extract(int index) const { + if ((uint32_t)index < 2) { return Vec2q(y0).extract(index); } else { @@ -2534,7 +2540,7 @@ class Vec4q : public Vec256b { } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
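// [Illustrative sketch, not part of the patch] As with the other types in this
// emulation header, Vec4q is built from two 128-bit halves, and insert()/
// extract() dispatch on index < 2 (hypothetical helper, headers assumed):
static inline int64_t demo_vec4q_halves() {
    Vec2q lo(1, 2), hi(3, 4);
    Vec4q v(lo, hi);        // low half holds elements 0-1, high half 2-3
    return v.extract(3);    // index 3 is routed to the high half; returns 4
}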
- int64_t operator [] (uint32_t index) const { + int64_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec2q: @@ -2544,9 +2550,12 @@ class Vec4q : public Vec256b { Vec2q get_high() const { return y1; } - static int size () { + static constexpr int size() { return 4; } + static constexpr int elementtype() { + return 10; + } }; @@ -2565,12 +2574,12 @@ class Vec4qb : public Vec4q { Vec4qb(bool x0, bool x1, bool x2, bool x3) : Vec4q(-int64_t(x0), -int64_t(x1), -int64_t(x2), -int64_t(x3)) { } - // Constructor to convert from type Vec256ie - Vec4qb(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec4qb(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec4qb & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec4qb & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -2582,10 +2591,9 @@ class Vec4qb : public Vec4q { *this = Vec4qb(b); return *this; } -private: // Prevent constructing from int, etc. - Vec4qb(int b); - Vec4qb & operator = (int x); -public: + // Constructor to build from two Vec2qb: + Vec4qb(Vec2qb const a0, Vec2qb const a1) : Vec4q(Vec2q(a0), Vec2q(a1)) { + } // Member functions to split into two Vec2qb: Vec2qb get_low() const { return y0; @@ -2596,17 +2604,28 @@ class Vec4qb : public Vec4q { Vec4qb & insert (int index, bool a) { Vec4q::insert(index, -(int64_t)a); return *this; - } + } // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - bool extract(uint32_t index) const { + bool extract(int index) const { return Vec4q::extract(index) != 0; } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { + bool operator [] (int index) const { return extract(index); } + // Member function to change a bitfield to a boolean vector + Vec4qb & load_bits(uint8_t a) { + y0 = Vec2qb().load_bits(a); + y1 = Vec2qb().load_bits(uint8_t(a>>2u)); + return *this; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. 
+    Vec4qb(int b) = delete;
+    Vec4qb & operator = (int x) = delete;
 };
@@ -2617,53 +2636,63 @@ class Vec4qb : public Vec4q {
 *****************************************************************************
 // vector operator & : bitwise and
-static inline Vec4qb operator & (Vec4qb const & a, Vec4qb const & b) {
+static inline Vec4qb operator & (Vec4qb const a, Vec4qb const b) {
     return Vec4qb(Vec256b(a) & Vec256b(b));
 }
-static inline Vec4qb operator && (Vec4qb const & a, Vec4qb const & b) {
+static inline Vec4qb operator && (Vec4qb const a, Vec4qb const b) {
     return a & b;
 }
 // vector operator &= : bitwise and
-static inline Vec4qb & operator &= (Vec4qb & a, Vec4qb const & b) {
+static inline Vec4qb & operator &= (Vec4qb & a, Vec4qb const b) {
     a = a & b;
     return a;
 }
 // vector operator | : bitwise or
-static inline Vec4qb operator | (Vec4qb const & a, Vec4qb const & b) {
+static inline Vec4qb operator | (Vec4qb const a, Vec4qb const b) {
     return Vec4qb(Vec256b(a) | Vec256b(b));
 }
-static inline Vec4qb operator || (Vec4qb const & a, Vec4qb const & b) {
+static inline Vec4qb operator || (Vec4qb const a, Vec4qb const b) {
     return a | b;
 }
 // vector operator |= : bitwise or
-static inline Vec4qb & operator |= (Vec4qb & a, Vec4qb const & b) {
+static inline Vec4qb & operator |= (Vec4qb & a, Vec4qb const b) {
     a = a | b;
     return a;
 }
 // vector operator ^ : bitwise xor
-static inline Vec4qb operator ^ (Vec4qb const & a, Vec4qb const & b) {
+static inline Vec4qb operator ^ (Vec4qb const a, Vec4qb const b) {
     return Vec4qb(Vec256b(a) ^ Vec256b(b));
 }
 // vector operator ^= : bitwise xor
-static inline Vec4qb & operator ^= (Vec4qb & a, Vec4qb const & b) {
+static inline Vec4qb & operator ^= (Vec4qb & a, Vec4qb const b) {
     a = a ^ b;
     return a;
 }
+// vector operator == : xnor
+static inline Vec4qb operator == (Vec4qb const a, Vec4qb const b) {
+    return Vec4qb(Vec256b(a) ^ Vec256b(~b));
+}
+
+// vector operator != : xor
+static inline Vec4qb operator != (Vec4qb const a, Vec4qb const b) {
+    return Vec4qb(a ^ b);
+}
+
 // vector operator ~ : bitwise not
-static inline Vec4qb operator ~ (Vec4qb const & a) {
+static inline Vec4qb operator ~ (Vec4qb const a) {
     return Vec4qb( ~ Vec256b(a));
 }
 // vector operator ! : element not
-static inline Vec4qb operator ! (Vec4qb const & a) {
+static inline Vec4qb operator !
(Vec4qb const a) { return ~ a; } // vector function andnot -static inline Vec4qb andnot (Vec4qb const & a, Vec4qb const & b) { +static inline Vec4qb andnot (Vec4qb const a, Vec4qb const b) { return Vec4qb(andnot(Vec256b(a), Vec256b(b))); } @@ -2675,12 +2704,12 @@ static inline Vec4qb andnot (Vec4qb const & a, Vec4qb const & b) { *****************************************************************************/ // vector operator + : add element by element -static inline Vec4q operator + (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator + (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() + b.get_low(), a.get_high() + b.get_high()); } // vector operator += : add -static inline Vec4q & operator += (Vec4q & a, Vec4q const & b) { +static inline Vec4q & operator += (Vec4q & a, Vec4q const b) { a = a + b; return a; } @@ -2699,17 +2728,17 @@ static inline Vec4q & operator ++ (Vec4q & a) { } // vector operator - : subtract element by element -static inline Vec4q operator - (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator - (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() - b.get_low(), a.get_high() - b.get_high()); } // vector operator - : unary minus -static inline Vec4q operator - (Vec4q const & a) { +static inline Vec4q operator - (Vec4q const a) { return Vec4q(-a.get_low(), -a.get_high()); } // vector operator -= : subtract -static inline Vec4q & operator -= (Vec4q & a, Vec4q const & b) { +static inline Vec4q & operator -= (Vec4q & a, Vec4q const b) { a = a - b; return a; } @@ -2728,18 +2757,18 @@ static inline Vec4q & operator -- (Vec4q & a) { } // vector operator * : multiply element by element -static inline Vec4q operator * (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator * (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() * b.get_low(), a.get_high() * b.get_high()); } // vector operator *= : multiply -static inline Vec4q & operator *= (Vec4q & a, Vec4q const & b) { +static inline Vec4q & operator *= (Vec4q & a, Vec4q const b) { a = a * b; return a; } // vector operator << : shift left -static inline Vec4q operator << (Vec4q const & a, int32_t b) { +static inline Vec4q operator << (Vec4q const a, int32_t b) { return Vec4q(a.get_low() << b, a.get_high() << b); } @@ -2750,7 +2779,7 @@ static inline Vec4q & operator <<= (Vec4q & a, int32_t b) { } // vector operator >> : shift right arithmetic -static inline Vec4q operator >> (Vec4q const & a, int32_t b) { +static inline Vec4q operator >> (Vec4q const a, int32_t b) { return Vec4q(a.get_low() >> b, a.get_high() >> b); } @@ -2761,79 +2790,78 @@ static inline Vec4q & operator >>= (Vec4q & a, int32_t b) { } // vector operator == : returns true for elements for which a == b -static inline Vec4qb operator == (Vec4q const & a, Vec4q const & b) { +static inline Vec4qb operator == (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() == b.get_low(), a.get_high() == b.get_high()); } // vector operator != : returns true for elements for which a != b -static inline Vec4qb operator != (Vec4q const & a, Vec4q const & b) { +static inline Vec4qb operator != (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() != b.get_low(), a.get_high() != b.get_high()); } - + // vector operator < : returns true for elements for which a < b -static inline Vec4qb operator < (Vec4q const & a, Vec4q const & b) { +static inline Vec4qb operator < (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() < b.get_low(), a.get_high() < b.get_high()); } // vector operator > : returns true for elements 
for which a > b -static inline Vec4qb operator > (Vec4q const & a, Vec4q const & b) { +static inline Vec4qb operator > (Vec4q const a, Vec4q const b) { return b < a; } // vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec4qb operator >= (Vec4q const & a, Vec4q const & b) { +static inline Vec4qb operator >= (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec4qb operator <= (Vec4q const & a, Vec4q const & b) { +static inline Vec4qb operator <= (Vec4q const a, Vec4q const b) { return b >= a; } // vector operator & : bitwise and -static inline Vec4q operator & (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator & (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec4q operator && (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator && (Vec4q const a, Vec4q const b) { return a & b; } // vector operator &= : bitwise and -static inline Vec4q & operator &= (Vec4q & a, Vec4q const & b) { +static inline Vec4q & operator &= (Vec4q & a, Vec4q const b) { a = a & b; return a; } // vector operator | : bitwise or -static inline Vec4q operator | (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator | (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec4q operator || (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator || (Vec4q const a, Vec4q const b) { return a | b; } // vector operator |= : bitwise or -static inline Vec4q & operator |= (Vec4q & a, Vec4q const & b) { +static inline Vec4q & operator |= (Vec4q & a, Vec4q const b) { a = a | b; return a; } // vector operator ^ : bitwise xor -static inline Vec4q operator ^ (Vec4q const & a, Vec4q const & b) { +static inline Vec4q operator ^ (Vec4q const a, Vec4q const b) { return Vec4q(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ^= : bitwise xor -static inline Vec4q & operator ^= (Vec4q & a, Vec4q const & b) { +static inline Vec4q & operator ^= (Vec4q & a, Vec4q const b) { a = a ^ b; return a; } - // vector operator ~ : bitwise not -static inline Vec4q operator ~ (Vec4q const & a) { +static inline Vec4q operator ~ (Vec4q const a) { return Vec4q(~a.get_low(), ~a.get_high()); } // vector operator ! : logical not, returns true for elements == 0 -static inline Vec4qb operator ! (Vec4q const & a) { +static inline Vec4qb operator ! (Vec4q const a) { return Vec4q(!a.get_low(), !a.get_high()); } @@ -2843,44 +2871,53 @@ static inline Vec4qb operator ! (Vec4q const & a) { // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; // Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) -static inline Vec4q select (Vec4qb const & s, Vec4q const & a, Vec4q const & b) { +static inline Vec4q select (Vec4qb const s, Vec4q const a, Vec4q const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4q if_add (Vec4qb const & f, Vec4q const & a, Vec4q const & b) { +static inline Vec4q if_add (Vec4qb const f, Vec4q const a, Vec4q const b) { return a + (Vec4q(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. 
-// Overflow will wrap around -static inline int64_t horizontal_add (Vec4q const & a) { +// Conditional subtract +static inline Vec4q if_sub (Vec4qb const f, Vec4q const a, Vec4q const b) { + return a - (Vec4q(f) & b); +} + +// Conditional multiply +static inline Vec4q if_mul (Vec4qb const f, Vec4q const a, Vec4q const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int64_t horizontal_add (Vec4q const a) { return horizontal_add(a.get_low() + a.get_high()); } // function max: a > b ? a : b -static inline Vec4q max(Vec4q const & a, Vec4q const & b) { +static inline Vec4q max(Vec4q const a, Vec4q const b) { return Vec4q(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec4q min(Vec4q const & a, Vec4q const & b) { +static inline Vec4q min(Vec4q const a, Vec4q const b) { return Vec4q(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } // function abs: a >= 0 ? a : -a -static inline Vec4q abs(Vec4q const & a) { +static inline Vec4q abs(Vec4q const a) { return Vec4q(abs(a.get_low()), abs(a.get_high())); } // function abs_saturated: same as abs, saturate if overflow -static inline Vec4q abs_saturated(Vec4q const & a) { +static inline Vec4q abs_saturated(Vec4q const a) { return Vec4q(abs_saturated(a.get_low()), abs_saturated(a.get_high())); } // function rotate_left all elements // Use negative count to rotate right -static inline Vec4q rotate_left(Vec4q const & a, int b) { +static inline Vec4q rotate_left(Vec4q const a, int b) { return Vec4q(rotate_left(a.get_low(),b), rotate_left(a.get_high(),b)); } @@ -2898,23 +2935,23 @@ class Vec4uq : public Vec4q { } // Constructor to broadcast the same value into all elements: Vec4uq(uint64_t i) { - y1 = y0 = Vec2q(i); + y1 = y0 = Vec2q((int64_t)i); } // Constructor to build from all elements: Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3) { - y0 = Vec2q(i0, i1); - y1 = Vec2q(i2, i3); + y0 = Vec2q((int64_t)i0, (int64_t)i1); + y1 = Vec2q((int64_t)i2, (int64_t)i3); } // Constructor to build from two Vec2uq: - Vec4uq(Vec2uq const & a0, Vec2uq const & a1) { + Vec4uq(Vec2uq const a0, Vec2uq const a1) { y0 = a0; y1 = a1; } - // Constructor to convert from type Vec256ie - Vec4uq(Vec256ie const & x) { + // Constructor to convert from type Vec256b + Vec4uq(Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); } - // Assignment operator to convert from type Vec256ie - Vec4uq & operator = (Vec256ie const & x) { + // Assignment operator to convert from type Vec256b + Vec4uq & operator = (Vec256b const x) { y0 = x.get_low(); y1 = x.get_high(); return *this; } @@ -2931,18 +2968,17 @@ class Vec4uq : public Vec4q { return *this; } // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4uq const & insert(uint32_t index, uint64_t value) { - Vec4q::insert(index, value); + Vec4uq const insert(int index, uint64_t value) { + Vec4q::insert(index, (int64_t)value); return *this; } // Member function extract a single element from vector - uint64_t extract(uint32_t index) const { - return Vec4q::extract(index); + uint64_t extract(int index) const { + return (uint64_t)Vec4q::extract(index); } // Extract a single element. Use store function if extracting more than one element. // Operator [] can only read an element, not write. 
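// ---------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch]
// The newly added if_sub() and if_mul() follow the same pattern as the
// existing select() and if_add(): the boolean vector decides, per element,
// whether the operation is applied. A minimal usage sketch, assuming the
// VCL2 "vectorclass.h" header is on the include path; the function name and
// values below are hypothetical and for illustration only.
// ---------------------------------------------------------------------------
static inline Vec4q conditional_ops_demo() {
    Vec4q  a(10, 20, 30, 40);
    Vec4q  b( 1,  2,  3,  4);
    Vec4qb m  = a > Vec4q(15);      // per-element mask: (false, true, true, true)
    Vec4q  r0 = select(m, a, b);    // (1, 20, 30, 40): a where m is true, else b
    Vec4q  r1 = if_add(m, a, b);    // (10, 22, 33, 44): b added only where m is true
    Vec4q  r2 = if_sub(m, a, b);    // (10, 18, 27, 36): b subtracted only where m is true
    Vec4q  r3 = if_mul(m, a, b);    // (10, 40, 90, 160): multiplied by b only where m is true
    return r0 + r1 + r2 + r3;
}
// ---------------------------------------------------------------------------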
- uint64_t operator [] (uint32_t index) const { + uint64_t operator [] (int index) const { return extract(index); } // Member functions to split into two Vec2uq: @@ -2952,32 +2988,35 @@ class Vec4uq : public Vec4q { Vec2uq get_high() const { return y1; } + static constexpr int elementtype() { + return 11; + } }; // Define operators for this class // vector operator + : add -static inline Vec4uq operator + (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator + (Vec4uq const a, Vec4uq const b) { return Vec4uq (Vec4q(a) + Vec4q(b)); } // vector operator - : subtract -static inline Vec4uq operator - (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator - (Vec4uq const a, Vec4uq const b) { return Vec4uq (Vec4q(a) - Vec4q(b)); } // vector operator * : multiply element by element -static inline Vec4uq operator * (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator * (Vec4uq const a, Vec4uq const b) { return Vec4uq (Vec4q(a) * Vec4q(b)); } // vector operator >> : shift right logical all elements -static inline Vec4uq operator >> (Vec4uq const & a, uint32_t b) { +static inline Vec4uq operator >> (Vec4uq const a, uint32_t b) { return Vec4uq(a.get_low() >> b, a.get_high() >> b); } // vector operator >> : shift right logical all elements -static inline Vec4uq operator >> (Vec4uq const & a, int32_t b) { +static inline Vec4uq operator >> (Vec4uq const a, int32_t b) { return a >> (uint32_t)b; } @@ -2985,61 +3024,61 @@ static inline Vec4uq operator >> (Vec4uq const & a, int32_t b) { static inline Vec4uq & operator >>= (Vec4uq & a, uint32_t b) { a = a >> b; return a; -} +} // vector operator << : shift left all elements -static inline Vec4uq operator << (Vec4uq const & a, uint32_t b) { +static inline Vec4uq operator << (Vec4uq const a, uint32_t b) { return Vec4uq ((Vec4q)a << (int32_t)b); } // vector operator << : shift left all elements -static inline Vec4uq operator << (Vec4uq const & a, int32_t b) { +static inline Vec4uq operator << (Vec4uq const a, int32_t b) { return Vec4uq ((Vec4q)a << b); } // vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec4qb operator > (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4qb operator > (Vec4uq const a, Vec4uq const b) { return Vec4q(a.get_low() > b.get_low(), a.get_high() > b.get_high()); } // vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec4qb operator < (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4qb operator < (Vec4uq const a, Vec4uq const b) { return b > a; } // vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec4qb operator >= (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4qb operator >= (Vec4uq const a, Vec4uq const b) { return Vec4q(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); } // vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec4qb operator <= (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4qb operator <= (Vec4uq const a, Vec4uq const b) { return b >= a; } // vector operator & : bitwise and -static inline Vec4uq operator & (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator & (Vec4uq const a, Vec4uq const b) { return Vec4uq(a.get_low() & b.get_low(), a.get_high() & b.get_high()); } -static inline Vec4uq operator && (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator && (Vec4uq const a, Vec4uq const b) { return a & b; } // vector operator | : bitwise 
or -static inline Vec4uq operator | (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator | (Vec4uq const a, Vec4uq const b) { return Vec4q(a.get_low() | b.get_low(), a.get_high() | b.get_high()); } -static inline Vec4uq operator || (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator || (Vec4uq const a, Vec4uq const b) { return a | b; } // vector operator ^ : bitwise xor -static inline Vec4uq operator ^ (Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq operator ^ (Vec4uq const a, Vec4uq const b) { return Vec4uq(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); } // vector operator ~ : bitwise not -static inline Vec4uq operator ~ (Vec4uq const & a) { +static inline Vec4uq operator ~ (Vec4uq const a) { return Vec4uq(~a.get_low(), ~a.get_high()); } @@ -3049,28 +3088,37 @@ static inline Vec4uq operator ~ (Vec4uq const & a) { // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; // Each word in s must be either 0 (false) or -1 (true). No other values are allowed. // (s is signed) -static inline Vec4uq select (Vec4qb const & s, Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq select (Vec4qb const s, Vec4uq const a, Vec4uq const b) { return selectb(s,a,b); } // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4uq if_add (Vec4qb const & f, Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq if_add (Vec4qb const f, Vec4uq const a, Vec4uq const b) { return a + (Vec4uq(f) & b); } -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint64_t horizontal_add (Vec4uq const & a) { - return horizontal_add((Vec4q)a); +// Conditional subtract +static inline Vec4uq if_sub (Vec4qb const f, Vec4uq const a, Vec4uq const b) { + return a - (Vec4uq(f) & b); +} + +// Conditional multiply +static inline Vec4uq if_mul (Vec4qb const f, Vec4uq const a, Vec4uq const b) { + return select(f, a*b, a); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint64_t horizontal_add (Vec4uq const a) { + return (uint64_t)horizontal_add((Vec4q)a); } // function max: a > b ? a : b -static inline Vec4uq max(Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq max(Vec4uq const a, Vec4uq const b) { return Vec4uq(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high())); } // function min: a < b ? a : b -static inline Vec4uq min(Vec4uq const & a, Vec4uq const & b) { +static inline Vec4uq min(Vec4uq const a, Vec4uq const b) { return Vec4uq(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high())); } @@ -3082,82 +3130,67 @@ static inline Vec4uq min(Vec4uq const & a, Vec4uq const & b) { ****************************************************************************** * * These permute functions can reorder the elements of a vector and optionally -* set some elements to zero. +* set some elements to zero. See Vectori128.h for description * -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to select. -* An index of -1 will generate zero. An index of -256 means don't care. -* -* Example: - -* Vec8i a(10,11,12,13,14,15,16,17); // a is (10,11,12,13,14,15,16,17) -* Vec8i b; -* b = permute8i<0,2,7,7,-1,-1,1,1>(a); // b is (10,12,17,17, 0, 0,11,11) -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. 
The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. *****************************************************************************/ - -// Shuffle vector of 4 64-bit integers. -// Index -1 gives 0, index -256 means don't care. +// permute vector of 4 64-bit integers. +// Index -1 gives 0, index V_DC means don't care. template -static inline Vec4q permute4q(Vec4q const & a) { - return Vec4q(blend2q (a.get_low(), a.get_high()), - blend2q (a.get_low(), a.get_high())); +static inline Vec4q permute4(Vec4q const a) { + return Vec4q(blend2 (a.get_low(), a.get_high()), + blend2 (a.get_low(), a.get_high())); } template -static inline Vec4uq permute4uq(Vec4uq const & a) { - return Vec4uq (permute4q (a)); +static inline Vec4uq permute4(Vec4uq const a) { + return Vec4uq (permute4 (Vec4q(a))); } -// Shuffle vector of 8 32-bit integers. -// Index -1 gives 0, index -256 means don't care. +// permute vector of 8 32-bit integers. +// Index -1 gives 0, index V_DC means don't care. template -static inline Vec8i permute8i(Vec8i const & a) { - return Vec8i(blend4i (a.get_low(), a.get_high()), - blend4i (a.get_low(), a.get_high())); +static inline Vec8i permute8(Vec8i const a) { + return Vec8i(blend4 (a.get_low(), a.get_high()), + blend4 (a.get_low(), a.get_high())); } template -static inline Vec8ui permute8ui(Vec8ui const & a) { - return Vec8ui (permute8i (a)); +static inline Vec8ui permute4(Vec8ui const a) { + return Vec8ui (permute8 (Vec8i(a))); } -// Shuffle vector of 16 16-bit integers. -// Index -1 gives 0, index -256 means don't care. +// permute vector of 16 16-bit integers. +// Index -1 gives 0, index V_DC means don't care. template -static inline Vec16s permute16s(Vec16s const & a) { - return Vec16s(blend8s (a.get_low(), a.get_high()), - blend8s (a.get_low(), a.get_high())); +static inline Vec16s permute16(Vec16s const a) { + return Vec16s(blend8 (a.get_low(), a.get_high()), + blend8 (a.get_low(), a.get_high())); } template -static inline Vec16us permute16us(Vec16us const & a) { - return Vec16us (permute16s (a)); +static inline Vec16us permute16(Vec16us const a) { + return Vec16us (permute16 (Vec16s(a))); } -template -static inline Vec32c permute32c(Vec32c const & a) { - return Vec32c(blend16c (a.get_low(), a.get_high()), - blend16c (a.get_low(), a.get_high())); +static inline Vec32c permute32(Vec32c const a) { + return Vec32c(blend16 (a.get_low(), a.get_high()), + blend16 (a.get_low(), a.get_high())); } -template - static inline Vec32uc permute32uc(Vec32uc const & a) { - return Vec32uc (permute32c (a)); + static inline Vec32uc permute32(Vec32uc const a) { + return Vec32uc (permute32 (Vec32c(a))); } @@ -3165,479 +3198,69 @@ template . These indexes must be -* constants. Each template parameter is an index to the element you want to -* select, where higher indexes indicate an element from the second source -* vector. For example, if each vector has 8 elements, then indexes 0 - 7 -* will select an element from the first vector and indexes 8 - 15 will select -* an element from the second vector. A negative index will generate zero. 
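// ---------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch]
// With the VCL2 renaming the element count is no longer encoded in the
// function name (permute8i -> permute8), an index of -1 still yields zero,
// and the old "don't care" value -256 is replaced by the V_DC constant.
// A minimal sketch of the new spelling, assuming "vectorclass.h" is on the
// include path; it restates the example from the comment block removed above.
// ---------------------------------------------------------------------------
static inline Vec8i permute_demo() {
    Vec8i a(10, 11, 12, 13, 14, 15, 16, 17);
    // pick elements 0,2,7,7, zero two lanes, then elements 1,1:
    return permute8<0, 2, 7, 7, -1, -1, 1, 1>(a);   // (10, 12, 17, 17, 0, 0, 11, 11)
}
// ---------------------------------------------------------------------------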
-* -* Example: -* Vec8i a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107) -* Vec8i b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207) -* Vec8i c; -* c = blend8i<1,0,9,8,7,-1,15,15> (a,b); // c is (101, 100, 201, 200, 107, 0, 207, 207) -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. *****************************************************************************/ -// helper function used below -template -static inline Vec2q select4(Vec4q const & a, Vec4q const & b) { - switch (n) { - case 0: - return a.get_low(); - case 1: - return a.get_high(); - case 2: - return b.get_low(); - case 3: - return b.get_high(); - } - return _mm_setzero_si128(); -} - // blend vectors Vec4q template -static inline Vec4q blend4q(Vec4q const & a, Vec4q const & b) { - const int j0 = i0 >= 0 ? i0/2 : i0; - const int j1 = i1 >= 0 ? i1/2 : i1; - const int j2 = i2 >= 0 ? i2/2 : i2; - const int j3 = i3 >= 0 ? i3/2 : i3; - Vec2q x0, x1; - - if (j0 == j1 || i0 < 0 || i1 < 0) { // both from same - const int k0 = j0 >= 0 ? j0 : j1; - x0 = permute2q (select4 (a,b)); - } - else { - x0 = blend2q (select4(a,b), select4(a,b)); - } - if (j2 == j3 || i2 < 0 || i3 < 0) { // both from same - const int k1 = j2 >= 0 ? j2 : j3; - x1 = permute2q (select4 (a,b)); - } - else { - x1 = blend2q (select4(a,b), select4(a,b)); - } - return Vec4q(x0,x1); -} - -template -static inline Vec4uq blend4uq(Vec4uq const & a, Vec4uq const & b) { - return Vec4uq( blend4q (a,b)); -} - -// helper function used below -template -static inline Vec4i select4(Vec8i const & a, Vec8i const & b) { - switch (n) { - case 0: - return a.get_low(); - case 1: - return a.get_high(); - case 2: - return b.get_low(); - case 3: - return b.get_high(); - } - return _mm_setzero_si128(); +static inline Vec4q blend4(Vec4q const& a, Vec4q const& b) { + Vec2q x0 = blend_half(a, b); + Vec2q x1 = blend_half(a, b); + return Vec4q(x0, x1); } // blend vectors Vec8i template -static inline Vec8i blend8i(Vec8i const & a, Vec8i const & b) { - const int j0 = i0 >= 0 ? i0/4 : i0; - const int j1 = i1 >= 0 ? i1/4 : i1; - const int j2 = i2 >= 0 ? i2/4 : i2; - const int j3 = i3 >= 0 ? i3/4 : i3; - const int j4 = i4 >= 0 ? i4/4 : i4; - const int j5 = i5 >= 0 ? i5/4 : i5; - const int j6 = i6 >= 0 ? i6/4 : i6; - const int j7 = i7 >= 0 ? i7/4 : i7; - Vec4i x0, x1; - - const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3; - const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7; - const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3; - const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? 
j6 : j7; - - // Combine all the indexes into a single bitfield, with 4 bits for each - const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; - - // Mask to zero out negative indexes - const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - if (r0 < 0) { - x0 = _mm_setzero_si128(); - } - else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) { - // i0 - i3 all from same source - x0 = permute4i (select4 (a,b)); - } - else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) { - // i0 - i3 all from two sources - const int k0 = i0 >= 0 ? i0 & 3 : i0; - const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0); - const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0); - const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0); - x0 = blend4i (select4(a,b), select4(a,b)); - } - else { - // i0 - i3 from three or four different sources - x0 = blend4i<0,1,6,7> ( - blend4i (select4(a,b), select4(a,b)), - blend4i<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4(a,b), select4(a,b))); - } - - if (r1 < 0) { - x1 = _mm_setzero_si128(); - } - else if (((m1 ^ uint32_t(r1)*0x44440000u) & 0xCCCC0000 & mz) == 0) { - // i4 - i7 all from same source - x1 = permute4i (select4 (a,b)); - } - else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) { - // i4 - i7 all from two sources - const int k4 = i4 >= 0 ? i4 & 3 : i4; - const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0); - const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0); - const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0); - x1 = blend4i (select4(a,b), select4(a,b)); - } - else { - // i4 - i7 from three or four different sources - x1 = blend4i<0,1,6,7> ( - blend4i (select4(a,b), select4(a,b)), - blend4i<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4(a,b), select4(a,b))); - } - - return Vec8i(x0,x1); +static inline Vec8i blend8(Vec8i const& a, Vec8i const& b) { + Vec4i x0 = blend_half(a, b); + Vec4i x1 = blend_half(a, b); + return Vec8i(x0, x1); } -template -static inline Vec8ui blend8ui(Vec8ui const & a, Vec8ui const & b) { - return Vec8ui( blend8i (a,b)); +// blend vectors Vec16s +template +static inline Vec16s blend16(Vec16s const& a, Vec16s const& b) { + Vec8s x0 = blend_half(a, b); + Vec8s x1 = blend_half(a, b); + return Vec16s(x0, x1); } -// helper function used below -template -static inline Vec8s select4(Vec16s const & a, Vec16s const & b) { - switch (n) { - case 0: - return a.get_low(); - case 1: - return a.get_high(); - case 2: - return b.get_low(); - case 3: - return b.get_high(); - } - return _mm_setzero_si128(); -} - -template -static inline Vec16s blend16s(Vec16s const & a, Vec16s const & b) { - - const int j0 = i0 >= 0 ? i0 /8 : i0; - const int j1 = i1 >= 0 ? i1 /8 : i1; - const int j2 = i2 >= 0 ? i2 /8 : i2; - const int j3 = i3 >= 0 ? i3 /8 : i3; - const int j4 = i4 >= 0 ? i4 /8 : i4; - const int j5 = i5 >= 0 ? i5 /8 : i5; - const int j6 = i6 >= 0 ? i6 /8 : i6; - const int j7 = i7 >= 0 ? i7 /8 : i7; - const int j8 = i8 >= 0 ? i8 /8 : i8; - const int j9 = i9 >= 0 ? i9 /8 : i9; - const int j10 = i10 >= 0 ? i10/8 : i10; - const int j11 = i11 >= 0 ? i11/8 : i11; - const int j12 = i12 >= 0 ? i12/8 : i12; - const int j13 = i13 >= 0 ? i13/8 : i13; - const int j14 = i14 >= 0 ? i14/8 : i14; - const int j15 = i15 >= 0 ? i15/8 : i15; - - Vec8s x0, x1; - - const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? 
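// ---------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch]
// The rewritten blend functions delegate each half to blend_half(), which
// replaces the long index-juggling metacode being deleted here. Usage is
// unchanged apart from the name (blend8i -> blend8): indexes 0-7 select from
// the first operand, 8-15 from the second, and -1 gives zero. A minimal
// sketch, assuming "vectorclass.h" is on the include path; it restates the
// example from the comment block removed above.
// ---------------------------------------------------------------------------
static inline Vec8i blend_demo() {
    Vec8i a(100, 101, 102, 103, 104, 105, 106, 107);
    Vec8i b(200, 201, 202, 203, 204, 205, 206, 207);
    return blend8<1, 0, 9, 8, 7, -1, 15, 15>(a, b); // (101, 100, 201, 200, 107, 0, 207, 207)
}
// ---------------------------------------------------------------------------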
j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7; - const int r1 = j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15; - const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : (j3 >= 0 && j3 != r0) ? j3 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7; - const int s1 = (j9 >= 0 && j9 != r1) ? j9 : (j10>= 0 && j10!= r1) ? j10 : (j11>= 0 && j11!= r1) ? j11: (j12>= 0 && j12!= r1) ? j12: (j13>= 0 && j13!= r1) ? j13: (j14>= 0 && j14!= r1) ? j14: j15; - - if (r0 < 0) { - x0 = _mm_setzero_si128(); - } - else if (r0 == s0) { - // i0 - i7 all from same source - x0 = permute8s (select4 (a,b)); - } - else if ((j2<0||j2==r0||j2==s0) && (j3<0||j3==r0||j3 == s0) && (j4<0||j4==r0||j4 == s0) && (j5<0||j5==r0||j5 == s0) && (j6<0||j6==r0||j6 == s0) && (j7<0||j7==r0||j7 == s0)) { - // i0 - i7 all from two sources - const int k0 = i0 >= 0 ? i0 & 7 : i0; - const int k1 = (i1 >= 0 ? i1 & 7 : i1) | (j1 == s0 ? 8 : 0); - const int k2 = (i2 >= 0 ? i2 & 7 : i2) | (j2 == s0 ? 8 : 0); - const int k3 = (i3 >= 0 ? i3 & 7 : i3) | (j3 == s0 ? 8 : 0); - const int k4 = (i4 >= 0 ? i4 & 7 : i4) | (j4 == s0 ? 8 : 0); - const int k5 = (i5 >= 0 ? i5 & 7 : i5) | (j5 == s0 ? 8 : 0); - const int k6 = (i6 >= 0 ? i6 & 7 : i6) | (j6 == s0 ? 8 : 0); - const int k7 = (i7 >= 0 ? i7 & 7 : i7) | (j7 == s0 ? 8 : 0); - x0 = blend8s (select4(a,b), select4(a,b)); - } - else { - // i0 - i7 from three or four different sources - const int n0 = j0 >= 0 ? j0 /2*8 + 0 : j0; - const int n1 = j1 >= 0 ? j1 /2*8 + 1 : j1; - const int n2 = j2 >= 0 ? j2 /2*8 + 2 : j2; - const int n3 = j3 >= 0 ? j3 /2*8 + 3 : j3; - const int n4 = j4 >= 0 ? j4 /2*8 + 4 : j4; - const int n5 = j5 >= 0 ? j5 /2*8 + 5 : j5; - const int n6 = j6 >= 0 ? j6 /2*8 + 6 : j6; - const int n7 = j7 >= 0 ? j7 /2*8 + 7 : j7; - x0 = blend8s ( - blend8s< j0 & 2 ? -256 : i0 &15, j1 & 2 ? -256 : i1 &15, j2 & 2 ? -256 : i2 &15, j3 & 2 ? -256 : i3 &15, j4 & 2 ? -256 : i4 &15, j5 & 2 ? -256 : i5 &15, j6 & 2 ? -256 : i6 &15, j7 & 2 ? -256 : i7 &15> (a.get_low(),a.get_high()), - blend8s<(j0^2)& 6 ? -256 : i0 &15, (j1^2)& 6 ? -256 : i1 &15, (j2^2)& 6 ? -256 : i2 &15, (j3^2)& 6 ? -256 : i3 &15, (j4^2)& 6 ? -256 : i4 &15, (j5^2)& 6 ? -256 : i5 &15, (j6^2)& 6 ? -256 : i6 &15, (j7^2)& 6 ? -256 : i7 &15> (b.get_low(),b.get_high())); - } - - if (r1 < 0) { - x1 = _mm_setzero_si128(); - } - else if (r1 == s1) { - // i8 - i15 all from same source - x1 = permute8s (select4 (a,b)); - } - else if ((j10<0||j10==r1||j10==s1) && (j11<0||j11==r1||j11==s1) && (j12<0||j12==r1||j12==s1) && (j13<0||j13==r1||j13==s1) && (j14<0||j14==r1||j14==s1) && (j15<0||j15==r1||j15==s1)) { - // i8 - i15 all from two sources - const int k8 = i8 >= 0 ? i8 & 7 : i8; - const int k9 = (i9 >= 0 ? i9 & 7 : i9 ) | (j9 == s1 ? 8 : 0); - const int k10= (i10>= 0 ? i10& 7 : i10) | (j10== s1 ? 8 : 0); - const int k11= (i11>= 0 ? i11& 7 : i11) | (j11== s1 ? 8 : 0); - const int k12= (i12>= 0 ? i12& 7 : i12) | (j12== s1 ? 8 : 0); - const int k13= (i13>= 0 ? i13& 7 : i13) | (j13== s1 ? 8 : 0); - const int k14= (i14>= 0 ? i14& 7 : i14) | (j14== s1 ? 8 : 0); - const int k15= (i15>= 0 ? i15& 7 : i15) | (j15== s1 ? 8 : 0); - x1 = blend8s (select4(a,b), select4(a,b)); - } - else { - // i8 - i15 from three or four different sources - const int n8 = j8 >= 0 ? j8 /2*8 + 0 : j8 ; - const int n9 = j9 >= 0 ? j9 /2*8 + 1 : j9 ; - const int n10= j10>= 0 ? j10/2*8 + 2 : j10; - const int n11= j11>= 0 ? 
j11/2*8 + 3 : j11; - const int n12= j12>= 0 ? j12/2*8 + 4 : j12; - const int n13= j13>= 0 ? j13/2*8 + 5 : j13; - const int n14= j14>= 0 ? j14/2*8 + 6 : j14; - const int n15= j15>= 0 ? j15/2*8 + 7 : j15; - x1 = blend8s ( - blend8s< j8 & 2 ? -256 : i8 &15, j9 & 2 ? -256 : i9 &15, j10 & 2 ? -256 : i10 &15, j11 & 2 ? -256 : i11 &15, j12 & 2 ? -256 : i12 &15, j13 & 2 ? -256 : i13 &15, j14 & 2 ? -256 : i14 &15, j15 & 2 ? -256 : i15 &15> (a.get_low(),a.get_high()), - blend8s<(j8^2)& 6 ? -256 : i8 &15, (j9^2)& 6 ? -256 : i9 &15, (j10^2)& 6 ? -256 : i10 &15, (j11^2)& 6 ? -256 : i11 &15, (j12^2)& 6 ? -256 : i12 &15, (j13^2)& 6 ? -256 : i13 &15, (j14^2)& 6 ? -256 : i14 &15, (j15^2)& 6 ? -256 : i15 &15> (b.get_low(),b.get_high())); - } - return Vec16s(x0,x1); +template + static inline Vec32c blend32(Vec32c const& a, Vec32c const& b) { + Vec16c x0 = blend_half(a, b); + Vec16c x1 = blend_half(a, b); + return Vec32c(x0, x1); } -template -static inline Vec16us blend16us(Vec16us const & a, Vec16us const & b) { - return Vec16us( blend16s (a,b)); +// unsigned types: + +template +static inline Vec4uq blend4(Vec4uq const a, Vec4uq const b) { + return Vec4uq( blend4 (Vec4q(a),Vec4q(b))); } -// helper function used below -template -static inline Vec16c select4(Vec32c const & a, Vec32c const & b) { - switch (n) { - case 0: - return a.get_low(); - case 1: - return a.get_high(); - case 2: - return b.get_low(); - case 3: - return b.get_high(); - } - return _mm_setzero_si128(); +template +static inline Vec8ui blend8(Vec8ui const a, Vec8ui const b) { + return Vec8ui( blend8 (Vec8i(a),Vec8i(b))); } -template -static inline Vec32c blend32c(Vec32c const & a, Vec32c const & b) { - - // j0 - j31 indicate one of four 16-byte sources - const int j0 = i0 >= 0 ? i0 /16 : i0; - const int j1 = i1 >= 0 ? i1 /16 : i1; - const int j2 = i2 >= 0 ? i2 /16 : i2; - const int j3 = i3 >= 0 ? i3 /16 : i3; - const int j4 = i4 >= 0 ? i4 /16 : i4; - const int j5 = i5 >= 0 ? i5 /16 : i5; - const int j6 = i6 >= 0 ? i6 /16 : i6; - const int j7 = i7 >= 0 ? i7 /16 : i7; - const int j8 = i8 >= 0 ? i8 /16 : i8; - const int j9 = i9 >= 0 ? i9 /16 : i9; - const int j10 = i10 >= 0 ? i10/16 : i10; - const int j11 = i11 >= 0 ? i11/16 : i11; - const int j12 = i12 >= 0 ? i12/16 : i12; - const int j13 = i13 >= 0 ? i13/16 : i13; - const int j14 = i14 >= 0 ? i14/16 : i14; - const int j15 = i15 >= 0 ? i15/16 : i15; - const int j16 = i16 >= 0 ? i16/16 : i16; - const int j17 = i17 >= 0 ? i17/16 : i17; - const int j18 = i18 >= 0 ? i18/16 : i18; - const int j19 = i19 >= 0 ? i19/16 : i19; - const int j20 = i20 >= 0 ? i20/16 : i20; - const int j21 = i21 >= 0 ? i21/16 : i21; - const int j22 = i22 >= 0 ? i22/16 : i22; - const int j23 = i23 >= 0 ? i23/16 : i23; - const int j24 = i24 >= 0 ? i24/16 : i24; - const int j25 = i25 >= 0 ? i25/16 : i25; - const int j26 = i26 >= 0 ? i26/16 : i26; - const int j27 = i27 >= 0 ? i27/16 : i27; - const int j28 = i28 >= 0 ? i28/16 : i28; - const int j29 = i29 >= 0 ? i29/16 : i29; - const int j30 = i30 >= 0 ? i30/16 : i30; - const int j31 = i31 >= 0 ? i31/16 : i31; - - Vec16c x0, x1; - - // r0, s0 = first two sources of low destination (i0 - i15) - // r1, s1 = first two sources of high destination (i16 - i31) - const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7 >= 0 ? j7 : - j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15; - const int r1 = j16>= 0 ? j16: j17>= 0 ? j17: j18 >= 0 ? j18 : j19 >= 0 ? 
j19 : j20 >= 0 ? j20 : j21 >= 0 ? j21 : j22 >= 0 ? j22 : j23>= 0 ? j23: - j24>= 0 ? j24: j25>= 0 ? j25: j26 >= 0 ? j26 : j27 >= 0 ? j27 : j28 >= 0 ? j28 : j29 >= 0 ? j29 : j30 >= 0 ? j30 : j31; - const int s0 = (j1 >=0&&j1 !=r0)?j1 : (j2 >=0&&j2 !=r0)?j2 : (j3 >=0&&j3 !=r0)?j3 : (j4 >=0&&j4 !=r0)?j4 : (j5 >=0&&j5 !=r0)?j5 : (j6 >=0&&j6 !=r0)?j6 : (j7 >=0&&j7 !=r0)?j7 : - (j8 >=0&&j8 !=r0)?j8 : (j9 >=0&&j9 !=r0)?j9 : (j10>=0&&j10!=r0)?j10 : (j11>=0&&j11!=r0)?j11 : (j12>=0&&j12!=r0)?j12 : (j13>=0&&j13!=r0)?j13: (j14>=0&&j14!=r0)?j14: j15; - const int s1 = (j17>=0&&j17!=r1)?j17 : (j18>=0&&j18!=r1)?j18 : (j19>=0&&j19!=r1)?j19 : (j20>=0&&j20!=r1)?j20 : (j21>=0&&j21!=r1)?j21 : (j22>=0&&j22!=r1)?j22: (j23>=0&&j23!=r1)?j23: - (j24>=0&&j24!=r1)?j24 : (j25>=0&&j25!=r1)?j25 : (j26>=0&&j26!=r1)?j26 : (j27>=0&&j27!=r1)?j27 : (j28>=0&&j28!=r1)?j28 : (j29>=0&&j29!=r1)?j29: (j30>=0&&j30!=r1)?j30: j31; - - if (r0 < 0) { - x0 = _mm_setzero_si128(); - } - else if (r0 == s0) { - // i0 - i15 all from same source - x0 = permute16c< i0&-49, i1&-49, i2 &-49, i3 &-49, i4 &-49, i5 &-49, i6 &-49, i7 &-49, - i8&-49, i9&-49, i10&-49, i11&-49, i12&-49, i13&-49, i14&-49, i15&-49 > - (select4 (a,b)); - } - else if ((j2 <0||j2 ==r0||j2 ==s0) && (j3 <0||j3 ==r0||j3 ==s0) && (j4 <0||j4 ==r0||j4 ==s0) && (j5 <0||j5 ==r0||j5 ==s0) && (j6 <0||j6 ==r0||j6 ==s0) && (j7 <0||j7 ==r0||j7 ==s0) && (j8 <0||j8 ==r0||j8 ==s0) && - (j9 <0||j9 ==r0||j9 ==s0) && (j10<0||j10==r0||j10==s0) && (j11<0||j11==r0||j11==s0) && (j12<0||j12==r0||j12==s0) && (j13<0||j13==r0||j13==s0) && (j14<0||j14==r0||j14==s0) && (j15<0||j15==r0||j15==s0)) { - // i0 - i15 all from two sources - const int k0 = i0 >= 0 ? i0 & 15 : i0; - const int k1 = (i1 >= 0 ? i1 & 15 : i1 ) | (j1 == s0 ? 16 : 0); - const int k2 = (i2 >= 0 ? i2 & 15 : i2 ) | (j2 == s0 ? 16 : 0); - const int k3 = (i3 >= 0 ? i3 & 15 : i3 ) | (j3 == s0 ? 16 : 0); - const int k4 = (i4 >= 0 ? i4 & 15 : i4 ) | (j4 == s0 ? 16 : 0); - const int k5 = (i5 >= 0 ? i5 & 15 : i5 ) | (j5 == s0 ? 16 : 0); - const int k6 = (i6 >= 0 ? i6 & 15 : i6 ) | (j6 == s0 ? 16 : 0); - const int k7 = (i7 >= 0 ? i7 & 15 : i7 ) | (j7 == s0 ? 16 : 0); - const int k8 = (i8 >= 0 ? i8 & 15 : i8 ) | (j8 == s0 ? 16 : 0); - const int k9 = (i9 >= 0 ? i9 & 15 : i9 ) | (j9 == s0 ? 16 : 0); - const int k10= (i10>= 0 ? i10& 15 : i10) | (j10== s0 ? 16 : 0); - const int k11= (i11>= 0 ? i11& 15 : i11) | (j11== s0 ? 16 : 0); - const int k12= (i12>= 0 ? i12& 15 : i12) | (j12== s0 ? 16 : 0); - const int k13= (i13>= 0 ? i13& 15 : i13) | (j13== s0 ? 16 : 0); - const int k14= (i14>= 0 ? i14& 15 : i14) | (j14== s0 ? 16 : 0); - const int k15= (i15>= 0 ? i15& 15 : i15) | (j15== s0 ? 16 : 0); - x0 = blend16c (select4(a,b), select4(a,b)); - } - else { - // i0 - i15 from three or four different sources - const int n0 = j0 >= 0 ? j0 /2*16 + 0 : j0; - const int n1 = j1 >= 0 ? j1 /2*16 + 1 : j1; - const int n2 = j2 >= 0 ? j2 /2*16 + 2 : j2; - const int n3 = j3 >= 0 ? j3 /2*16 + 3 : j3; - const int n4 = j4 >= 0 ? j4 /2*16 + 4 : j4; - const int n5 = j5 >= 0 ? j5 /2*16 + 5 : j5; - const int n6 = j6 >= 0 ? j6 /2*16 + 6 : j6; - const int n7 = j7 >= 0 ? j7 /2*16 + 7 : j7; - const int n8 = j8 >= 0 ? j8 /2*16 + 8 : j8; - const int n9 = j9 >= 0 ? j9 /2*16 + 9 : j9; - const int n10= j10>= 0 ? j10/2*16 +10 : j10; - const int n11= j11>= 0 ? j11/2*16 +11 : j11; - const int n12= j12>= 0 ? j12/2*16 +12 : j12; - const int n13= j13>= 0 ? j13/2*16 +13 : j13; - const int n14= j14>= 0 ? j14/2*16 +14 : j14; - const int n15= j15>= 0 ? 
j15/2*16 +15 : j15; - - Vec16c x0a = blend16c< j0 & 2 ? -256 : i0 & 31, j1 & 2 ? -256 : i1 & 31, j2 & 2 ? -256 : i2 & 31, j3 & 2 ? -256 : i3 & 31, j4 & 2 ? -256 : i4 & 31, j5 & 2 ? -256 : i5 & 31, j6 & 2 ? -256 : i6 & 31, j7 & 2 ? -256 : i7 & 31, - j8 & 2 ? -256 : i8 & 31, j9 & 2 ? -256 : i9 & 31, j10 & 2 ? -256 : i10& 31, j11 & 2 ? -256 : i11& 31, j12 & 2 ? -256 : i12& 31, j13 & 2 ? -256 : i13& 31, j14 & 2 ? -256 : i14& 31, j15 & 2 ? -256 : i15& 31 > (a.get_low(),a.get_high()); - Vec16c x0b = blend16c<(j0^2)& 6 ? -256 : i0 & 31, (j1^2)& 6 ? -256 : i1 & 31, (j2 ^2)& 6 ? -256 : i2 & 31, (j3 ^2)& 6 ? -256 : i3 & 31, (j4 ^2)& 6 ? -256 : i4 & 31, (j5 ^2)& 6 ? -256 : i5 & 31, (j6 ^2)& 6 ? -256 : i6 & 31, (j7 ^2)& 6 ? -256 : i7 & 31, - (j8^2)& 6 ? -256 : i8 & 31, (j9^2)& 6 ? -256 : i9 & 31, (j10^2)& 6 ? -256 : i10& 31, (j11^2)& 6 ? -256 : i11& 31, (j12^2)& 6 ? -256 : i12& 31, (j13^2)& 6 ? -256 : i13& 31, (j14^2)& 6 ? -256 : i14& 31, (j15^2)& 6 ? -256 : i15& 31 > (b.get_low(),b.get_high()); - x0 = blend16c (x0a, x0b); - } - - if (r1 < 0) { - x1 = _mm_setzero_si128(); - } - else if (r1 == s1) { - // i16 - i31 all from same source - x1 = permute16c< i16&-49, i17&-49, i18&-49, i19&-49, i20&-49, i21&-49, i22&-49, i23&-49, - i24&-49, i25&-49, i26&-49, i27&-49, i28&-49, i29&-49, i30&-49, i31&-49 > - (select4 (a,b)); - } - else if ((j18<0||j18==r1||j18==s1) && (j19<0||j19==r1||j19==s1) && (j20<0||j20==r1||j20==s1) && (j21<0||j21==r1||j21==s1) && (j22<0||j22==r1||j22==s1) && (j23<0||j23==r1||j23==s1) && (j24<0||j24==r1||j24==s1) && - (j25<0||j25==r1||j25==s1) && (j26<0||j26==r1||j26==s1) && (j27<0||j27==r1||j27==s1) && (j28<0||j28==r1||j28==s1) && (j29<0||j29==r1||j29==s1) && (j30<0||j30==r1||j30==s1) && (j31<0||j31==r1||j31==s1)) { - // i16 - i31 all from two sources - const int k16= i16>= 0 ? i16& 15 : i16; - const int k17= (i17>= 0 ? i17& 15 : i17) | (j17== s1 ? 16 : 0); - const int k18= (i18>= 0 ? i18& 15 : i18) | (j18== s1 ? 16 : 0); - const int k19= (i19>= 0 ? i19& 15 : i19) | (j19== s1 ? 16 : 0); - const int k20= (i20>= 0 ? i20& 15 : i20) | (j20== s1 ? 16 : 0); - const int k21= (i21>= 0 ? i21& 15 : i21) | (j21== s1 ? 16 : 0); - const int k22= (i22>= 0 ? i22& 15 : i22) | (j22== s1 ? 16 : 0); - const int k23= (i23>= 0 ? i23& 15 : i23) | (j23== s1 ? 16 : 0); - const int k24= (i24>= 0 ? i24& 15 : i24) | (j24== s1 ? 16 : 0); - const int k25= (i25>= 0 ? i25& 15 : i25) | (j25== s1 ? 16 : 0); - const int k26= (i26>= 0 ? i26& 15 : i26) | (j26== s1 ? 16 : 0); - const int k27= (i27>= 0 ? i27& 15 : i27) | (j27== s1 ? 16 : 0); - const int k28= (i28>= 0 ? i28& 15 : i28) | (j28== s1 ? 16 : 0); - const int k29= (i29>= 0 ? i29& 15 : i29) | (j29== s1 ? 16 : 0); - const int k30= (i30>= 0 ? i30& 15 : i30) | (j30== s1 ? 16 : 0); - const int k31= (i31>= 0 ? i31& 15 : i31) | (j31== s1 ? 16 : 0); - x1 = blend16c (select4(a,b), select4(a,b)); - } - else { - // i16 - i31 from three or four different sources - const int n16= j16>= 0 ? j16/2*16 + 0 : j16; - const int n17= j17>= 0 ? j17/2*16 + 1 : j17; - const int n18= j18>= 0 ? j18/2*16 + 2 : j18; - const int n19= j19>= 0 ? j19/2*16 + 3 : j19; - const int n20= j20>= 0 ? j20/2*16 + 4 : j20; - const int n21= j21>= 0 ? j21/2*16 + 5 : j21; - const int n22= j22>= 0 ? j22/2*16 + 6 : j22; - const int n23= j23>= 0 ? j23/2*16 + 7 : j23; - const int n24= j24>= 0 ? j24/2*16 + 8 : j24; - const int n25= j25>= 0 ? j25/2*16 + 9 : j25; - const int n26= j26>= 0 ? j26/2*16 +10 : j26; - const int n27= j27>= 0 ? j27/2*16 +11 : j27; - const int n28= j28>= 0 ? 
j28/2*16 +12 : j28; - const int n29= j29>= 0 ? j29/2*16 +13 : j29; - const int n30= j30>= 0 ? j30/2*16 +14 : j30; - const int n31= j31>= 0 ? j31/2*16 +15 : j31; - x1 = blend16c ( - blend16c< j16 & 2 ? -256 : i16& 31, j17 & 2 ? -256 : i17& 31, j18 & 2 ? -256 : i18& 31, j19 & 2 ? -256 : i19& 31, j20 & 2 ? -256 : i20& 31, j21 & 2 ? -256 : i21& 31, j22 & 2 ? -256 : i22& 31, j23 & 2 ? -256 : i23& 31, - j24 & 2 ? -256 : i24& 31, j25 & 2 ? -256 : i25& 31, j26 & 2 ? -256 : i26& 31, j27 & 2 ? -256 : i27& 31, j28 & 2 ? -256 : i28& 31, j29 & 2 ? -256 : i29& 31, j30 & 2 ? -256 : i30& 31, j31 & 2 ? -256 : i31& 31 > (a.get_low(),a.get_high()), - blend16c<(j16^2)& 6 ? -256 : i16& 31, (j17^2)& 6 ? -256 : i17& 31, (j18^2)& 6 ? -256 : i18& 31, (j19^2)& 6 ? -256 : i19& 31, (j20^2)& 6 ? -256 : i20& 31, (j21^2)& 6 ? -256 : i21& 31, (j22^2)& 6 ? -256 : i22& 31, (j23^2)& 6 ? -256 : i23& 31, - (j24^2)& 6 ? -256 : i24& 31, (j25^2)& 6 ? -256 : i25& 31, (j26^2)& 6 ? -256 : i26& 31, (j27^2)& 6 ? -256 : i27& 31, (j28^2)& 6 ? -256 : i28& 31, (j29^2)& 6 ? -256 : i29& 31, (j30^2)& 6 ? -256 : i30& 31, (j31^2)& 6 ? -256 : i31& 31 > (b.get_low(),b.get_high())); - } - return Vec32c(x0,x1); +template +static inline Vec16us blend16(Vec16us const a, Vec16us const b) { + return Vec16us( blend16 (Vec16s(a),Vec16s(b))); } template < - int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, + int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15, int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 > - static inline Vec32uc blend32uc(Vec32uc const & a, Vec32uc const & b) { - return Vec32uc (blend32c (a, b)); + static inline Vec32uc blend32(Vec32uc const a, Vec32uc const b) { + return Vec32uc (blend32 (Vec32c(a), Vec32c(b))); } @@ -3650,25 +3273,9 @@ template < * These functions use vector elements as indexes into a table. * The table is given as one or more vectors or as an array. * -* This can be used for several purposes: -* - table lookup -* - permute or blend with variable indexes -* - blend from more than two sources -* - gather non-contiguous data -* -* An index out of range may produce any value - the actual value produced is -* implementation dependent and may be different for different instruction -* sets. An index out of range does not produce an error message or exception. -* -* Example: -* Vec8i a(2,0,0,6,4,3,5,0); // index a is ( 2, 0, 0, 6, 4, 3, 5, 0) -* Vec8i b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107) -* Vec8i c; -* c = lookup8 (a,b); // c is (102, 100, 100, 106, 104, 103, 105, 100) -* *****************************************************************************/ -static inline Vec32c lookup32(Vec32c const & index, Vec32c const & table) { +static inline Vec32c lookup32(Vec32c const index, Vec32c const table) { #if defined (__XOP__) // AMD XOP instruction set. 
Use VPPERM Vec16c t0 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low()); Vec16c t1 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high()); @@ -3681,18 +3288,18 @@ static inline Vec32c lookup32(Vec32c const & index, Vec32c const & table) { } template -static inline Vec32c lookup(Vec32uc const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 16) { +static inline Vec32c lookup(Vec32uc const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 16) { Vec16c tt = Vec16c().load(table); Vec16c r0 = lookup16(index.get_low(), tt); Vec16c r1 = lookup16(index.get_high(), tt); return Vec32c(r0, r1); } - if (n <= 32) return lookup32(index, Vec32c().load(table)); + if constexpr (n <= 32) return lookup32(index, Vec32c().load(table)); // n > 32. Limit index Vec32uc index1; - if ((n & (n-1)) == 0) { + if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec32uc(index) & uint8_t(n-1); } @@ -3709,30 +3316,29 @@ static inline Vec32c lookup(Vec32uc const & index, void const * table) { } template -static inline Vec32c lookup(Vec32c const & index, void const * table) { +static inline Vec32c lookup(Vec32c const index, void const * table) { return lookup(Vec32uc(index), table); } - -static inline Vec16s lookup16(Vec16s const & index, Vec16s const & table) { +static inline Vec16s lookup16(Vec16s const index, Vec16s const table) { Vec8s t0 = lookup16(index.get_low() , table.get_low(), table.get_high()); Vec8s t1 = lookup16(index.get_high(), table.get_low(), table.get_high()); return Vec16s(t0, t1); } template -static inline Vec16s lookup(Vec16s const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 8) { - Vec8s table1 = Vec8s().load(table); - return Vec16s( +static inline Vec16s lookup(Vec16s const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) { + Vec8s table1 = Vec8s().load(table); + return Vec16s( lookup8 (index.get_low(), table1), lookup8 (index.get_high(), table1)); } - if (n <= 16) return lookup16(index, Vec16s().load(table)); + if constexpr (n <= 16) return lookup16(index, Vec16s().load(table)); // n > 16. Limit index Vec16us i1; - if ((n & (n-1)) == 0) { + if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n i1 = Vec16us(index) & (n-1); } @@ -3745,27 +3351,27 @@ static inline Vec16s lookup(Vec16s const & index, void const * table) { t[i1[8]],t[i1[9]],t[i1[10]],t[i1[11]],t[i1[12]],t[i1[13]],t[i1[14]],t[i1[15]]); } -static inline Vec8i lookup8(Vec8i const & index, Vec8i const & table) { +static inline Vec8i lookup8(Vec8i const index, Vec8i const table) { Vec4i t0 = lookup8(index.get_low() , table.get_low(), table.get_high()); Vec4i t1 = lookup8(index.get_high(), table.get_low(), table.get_high()); return Vec8i(t0, t1); } template -static inline Vec8i lookup(Vec8i const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 4) { - Vec4i table1 = Vec4i().load(table); - return Vec8i( +static inline Vec8i lookup(Vec8i const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 4) { + Vec4i table1 = Vec4i().load(table); + return Vec8i( lookup4 (index.get_low(), table1), lookup4 (index.get_high(), table1)); } - if (n <= 8) { + if constexpr (n <= 8) { return lookup8(index, Vec8i().load(table)); } // n > 8. 
Limit index Vec8ui i1; - if ((n & (n-1)) == 0) { + if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n i1 = Vec8ui(index) & (n-1); } @@ -3777,16 +3383,16 @@ static inline Vec8i lookup(Vec8i const & index, void const * table) { return Vec8i(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]]); } -static inline Vec4q lookup4(Vec4q const & index, Vec4q const & table) { +static inline Vec4q lookup4(Vec4q const index, Vec4q const table) { return lookup8(Vec8i(index * 0x200000002ll + 0x100000000ll), Vec8i(table)); } template -static inline Vec4q lookup(Vec4q const & index, void const * table) { - if (n <= 0) return 0; +static inline Vec4q lookup(Vec4q const index, void const * table) { + if constexpr (n <= 0) return 0; // n > 0. Limit index Vec4uq index1; - if ((n & (n-1)) == 0) { + if constexpr ((n & (n-1)) == 0) { // n is a power of 2, make index modulo n index1 = Vec4uq(index) & (n-1); } @@ -3794,42 +3400,45 @@ static inline Vec4q lookup(Vec4q const & index, void const * table) { // n is not a power of 2, limit to n-1. // There is no 64-bit min instruction, but we can use the 32-bit unsigned min, // since n is a 32-bit integer - index1 = Vec4uq(min(Vec8ui(index), constant8i())); + index1 = Vec4uq(min(Vec8ui(index), Vec8ui(n-1, 0, n-1, 0, n-1, 0, n-1, 0))); } uint32_t ii[8]; index1.store(ii); // use only lower 32 bits of each index int64_t const * tt = (int64_t const *)table; - return Vec4q(tt[ii[0]], tt[ii[2]], tt[ii[4]], tt[ii[6]]); + return Vec4q(tt[ii[0]], tt[ii[2]], tt[ii[4]], tt[ii[6]]); } /***************************************************************************** * -* Other permutations with variable indexes +* Byte shifts * *****************************************************************************/ // Function shift_bytes_up: shift whole vector left by b bytes. -// You may use a permute function instead if b is a compile-time constant -static inline Vec32c shift_bytes_up(Vec32c const & a, int b) { - if (b < 16) { - return Vec32c(shift_bytes_up(a.get_low(),b), shift_bytes_up(a.get_high(),b) | shift_bytes_down(a.get_low(),16-b)); - } - else { - return Vec32c(Vec16c(0), shift_bytes_up(a.get_high(),b-16)); +template +static inline Vec32c shift_bytes_up(Vec32c const a) { + int8_t dat[64]; + if (b < 32) { + Vec32c(0).store(dat); + a.store(dat+b); + return Vec32c().load(dat); } + else return 0; } // Function shift_bytes_down: shift whole vector right by b bytes -// You may use a permute function instead if b is a compile-time constant -static inline Vec32c shift_bytes_down(Vec32c const & a, int b) { - if (b < 16) { - return Vec32c(shift_bytes_down(a.get_low(),b) | shift_bytes_up(a.get_high(),16-b), shift_bytes_down(a.get_high(),b)); - } - else { - return Vec32c(shift_bytes_down(a.get_high(),b-16), Vec16c(0)); +template +static inline Vec32c shift_bytes_down(Vec32c const a) { + int8_t dat[64]; + if (b < 32) { + a.store(dat); + Vec32c(0).store(dat+32); + return Vec32c().load(dat+b); } + else return 0; } + /***************************************************************************** * * Gather functions with fixed indexes @@ -3838,35 +3447,24 @@ static inline Vec32c shift_bytes_down(Vec32c const & a, int b) { // Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7 template static inline Vec8i gather8i(void const * a) { - Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index; // Error message if index is negative - const int i01min = i0 < i1 ? i0 : i1; - const int i23min = i2 < i3 ? 
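// ---------------------------------------------------------------------------
// [Editor's note - illustrative sketch, not part of the patch]
// The lookup functions use one vector as per-element indexes into another
// vector (or an array in memory) - the behaviour the deleted header comment
// described. A minimal sketch, assuming "vectorclass.h" is on the include
// path; it restates the example that was removed from that comment block.
// ---------------------------------------------------------------------------
static inline Vec8i lookup_demo() {
    Vec8i idx(2, 0, 0, 6, 4, 3, 5, 0);                   // per-element indexes into the table
    Vec8i tbl(100, 101, 102, 103, 104, 105, 106, 107);   // table of values
    return lookup8(idx, tbl);   // (102, 100, 100, 106, 104, 103, 105, 100)
}
// ---------------------------------------------------------------------------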
i2 : i3; - const int i45min = i4 < i5 ? i4 : i5; - const int i67min = i6 < i7 ? i6 : i7; - const int i0123min = i01min < i23min ? i01min : i23min; - const int i4567min = i45min < i67min ? i45min : i67min; - const int imin = i0123min < i4567min ? i0123min : i4567min; - const int i01max = i0 > i1 ? i0 : i1; - const int i23max = i2 > i3 ? i2 : i3; - const int i45max = i4 > i5 ? i4 : i5; - const int i67max = i6 > i7 ? i6 : i7; - const int i0123max = i01max > i23max ? i01max : i23max; - const int i4567max = i45max > i67max ? i45max : i67max; - const int imax = i0123max > i4567max ? i0123max : i4567max; - - if (imax - imin <= 7) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 7) { // load one contiguous block and permute - if (imax > 7) { + if constexpr (imax > 7) { // make sure we don't read past the end of the array Vec8i b = Vec8i().load((int32_t const *)a + imax-7); - return permute8i(b); + return permute8(b); } else { Vec8i b = Vec8i().load((int32_t const *)a + imin); - return permute8i(b); + return permute8(b); } } - if ((i0imax-8) && (i1imax-8) && (i2imax-8) && (i3imax-8) + if constexpr ((i0imax-8) && (i1imax-8) && (i2imax-8) && (i3imax-8) && (i4imax-8) && (i5imax-8) && (i6imax-8) && (i7imax-8)) { // load two contiguous blocks and blend Vec8i b = Vec8i().load((int32_t const *)a + imin); @@ -3879,7 +3477,7 @@ static inline Vec8i gather8i(void const * a) { const int j5 = i5(b, c); + return blend8(b, c); } // use lookup function return lookup(Vec8i(i0,i1,i2,i3,i4,i5,i6,i7), a); @@ -3887,26 +3485,24 @@ static inline Vec8i gather8i(void const * a) { template static inline Vec4q gather4q(void const * a) { - Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index; // Error message if index is negative - const int i01min = i0 < i1 ? i0 : i1; - const int i23min = i2 < i3 ? i2 : i3; - const int imin = i01min < i23min ? i01min : i23min; - const int i01max = i0 > i1 ? i0 : i1; - const int i23max = i2 > i3 ? i2 : i3; - const int imax = i01max > i23max ? i01max : i23max; - if (imax - imin <= 3) { + int constexpr indexs[4] = { i0, i1, i2, i3 }; // indexes as array + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 3) { // load one contiguous block and permute - if (imax > 3) { + if constexpr (imax > 3) { // make sure we don't read past the end of the array Vec4q b = Vec4q().load((int64_t const *)a + imax-3); - return permute4q(b); + return permute4(b); } else { Vec4q b = Vec4q().load((int64_t const *)a + imin); - return permute4q(b); + return permute4(b); } } - if ((i0imax-4) && (i1imax-4) && (i2imax-4) && (i3imax-4)) { + if constexpr ((i0imax-4) && (i1imax-4) && (i2imax-4) && (i3imax-4)) { // load two contiguous blocks and blend Vec4q b = Vec4q().load((int64_t const *)a + imin); Vec4q c = Vec4q().load((int64_t const *)a + imax-3); @@ -3914,7 +3510,7 @@ static inline Vec4q gather4q(void const * a) { const int j1 = i1(b, c); + return blend4(b, c); } // use lookup function return lookup(Vec4q(i0,i1,i2,i3), a); @@ -3927,27 +3523,16 @@ static inline Vec4q gather4q(void const * a) { ****************************************************************************** * * These functions write the elements of a vector to arbitrary positions in an -* array in memory. 
Each vector element is written to an array position +* array in memory. Each vector element is written to an array position * determined by an index. An element is not written if the corresponding * index is out of range. * The indexes can be specified as constant template parameters or as an * integer vector. -* -* The scatter functions are useful if the data are distributed in a sparce -* manner into the array. If the array is dense then it is more efficient -* to permute the data into the right positions and then write the whole -* permuted vector into the array. -* -* Example: -* Vec8q a(10,11,12,13,14,15,16,17); -* int64_t b[16] = {0}; -* scatter<0,2,14,10,1,-1,5,9>(a,b); -* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0} * *****************************************************************************/ template -static inline void scatter(Vec8i const & data, void * array) { +static inline void scatter(Vec8i const data, void * array) { int32_t* arr = (int32_t*)array; const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7}; for (int i = 0; i < 8; i++) { @@ -3956,7 +3541,7 @@ static inline void scatter(Vec8i const & data, void * array) { } template -static inline void scatter(Vec4q const & data, void * array) { +static inline void scatter(Vec4q const data, void * array) { int64_t* arr = (int64_t*)array; const int index[4] = {i0,i1,i2,i3}; for (int i = 0; i < 4; i++) { @@ -3964,26 +3549,28 @@ static inline void scatter(Vec4q const & data, void * array) { } } -static inline void scatter(Vec8i const & index, uint32_t limit, Vec8i const & data, void * array) { - int32_t* arr = (int32_t*)array; +// scatter functions with variable indexes + +static inline void scatter(Vec8i const index, uint32_t limit, Vec8i const data, void * destination) { + int32_t* arr = (int32_t*)destination; for (int i = 0; i < 8; i++) { if (uint32_t(index[i]) < limit) arr[index[i]] = data[i]; } } -static inline void scatter(Vec4q const & index, uint32_t limit, Vec4q const & data, void * array) { - int64_t* arr = (int64_t*)array; +static inline void scatter(Vec4q const index, uint32_t limit, Vec4q const data, void * destination) { + int64_t* arr = (int64_t*)destination; for (int i = 0; i < 4; i++) { if (uint64_t(index[i]) < uint64_t(limit)) arr[index[i]] = data[i]; } -} +} -static inline void scatter(Vec4i const & index, uint32_t limit, Vec4q const & data, void * array) { - int64_t* arr = (int64_t*)array; +static inline void scatter(Vec4i const index, uint32_t limit, Vec4q const data, void * destination) { + int64_t* arr = (int64_t*)destination; for (int i = 0; i < 4; i++) { if (uint32_t(index[i]) < limit) arr[index[i]] = data[i]; } -} +} /***************************************************************************** * @@ -3994,66 +3581,66 @@ static inline void scatter(Vec4i const & index, uint32_t limit, Vec4q const & da // Extend 8-bit integers to 16-bit integers, signed and unsigned // Function extend_low : extends the low 16 elements to 16 bits with sign extension -static inline Vec16s extend_low (Vec32c const & a) { +static inline Vec16s extend_low (Vec32c const a) { return Vec16s(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : extends the high 16 elements to 16 bits with sign extension -static inline Vec16s extend_high (Vec32c const & a) { +static inline Vec16s extend_high (Vec32c const a) { return Vec16s(extend_low(a.get_high()), extend_high(a.get_high())); } // Function extend_low : extends the low 16 elements to 16 bits with zero extension -static inline Vec16us extend_low (Vec32uc 
const & a) { +static inline Vec16us extend_low (Vec32uc const a) { return Vec16us(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : extends the high 19 elements to 16 bits with zero extension -static inline Vec16us extend_high (Vec32uc const & a) { +static inline Vec16us extend_high (Vec32uc const a) { return Vec16us(extend_low(a.get_high()), extend_high(a.get_high())); } // Extend 16-bit integers to 32-bit integers, signed and unsigned // Function extend_low : extends the low 8 elements to 32 bits with sign extension -static inline Vec8i extend_low (Vec16s const & a) { +static inline Vec8i extend_low (Vec16s const a) { return Vec8i(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : extends the high 8 elements to 32 bits with sign extension -static inline Vec8i extend_high (Vec16s const & a) { +static inline Vec8i extend_high (Vec16s const a) { return Vec8i(extend_low(a.get_high()), extend_high(a.get_high())); } // Function extend_low : extends the low 8 elements to 32 bits with zero extension -static inline Vec8ui extend_low (Vec16us const & a) { +static inline Vec8ui extend_low (Vec16us const a) { return Vec8ui(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : extends the high 8 elements to 32 bits with zero extension -static inline Vec8ui extend_high (Vec16us const & a) { +static inline Vec8ui extend_high (Vec16us const a) { return Vec8ui(extend_low(a.get_high()), extend_high(a.get_high())); } // Extend 32-bit integers to 64-bit integers, signed and unsigned // Function extend_low : extends the low 4 elements to 64 bits with sign extension -static inline Vec4q extend_low (Vec8i const & a) { +static inline Vec4q extend_low (Vec8i const a) { return Vec4q(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : extends the high 4 elements to 64 bits with sign extension -static inline Vec4q extend_high (Vec8i const & a) { +static inline Vec4q extend_high (Vec8i const a) { return Vec4q(extend_low(a.get_high()), extend_high(a.get_high())); } // Function extend_low : extends the low 4 elements to 64 bits with zero extension -static inline Vec4uq extend_low (Vec8ui const & a) { +static inline Vec4uq extend_low (Vec8ui const a) { return Vec4uq(extend_low(a.get_low()), extend_high(a.get_low())); } // Function extend_high : extends the high 4 elements to 64 bits with zero extension -static inline Vec4uq extend_high (Vec8ui const & a) { +static inline Vec4uq extend_high (Vec8ui const a) { return Vec4uq(extend_low(a.get_high()), extend_high(a.get_high())); } @@ -4061,25 +3648,25 @@ static inline Vec4uq extend_high (Vec8ui const & a) { // Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers // Overflow wraps around -static inline Vec32c compress (Vec16s const & low, Vec16s const & high) { +static inline Vec32c compress (Vec16s const low, Vec16s const high) { return Vec32c(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers // Signed, with saturation -static inline Vec32c compress_saturated (Vec16s const & low, Vec16s const & high) { +static inline Vec32c compress_saturated (Vec16s const low, Vec16s const high) { return Vec32c(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers // 
Unsigned, overflow wraps around -static inline Vec32uc compress (Vec16us const & low, Vec16us const & high) { +static inline Vec32uc compress (Vec16us const low, Vec16us const high) { return Vec32uc(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers // Unsigned, with saturation -static inline Vec32uc compress_saturated (Vec16us const & low, Vec16us const & high) { +static inline Vec32uc compress_saturated (Vec16us const low, Vec16us const high) { return Vec32uc(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); } @@ -4087,25 +3674,25 @@ static inline Vec32uc compress_saturated (Vec16us const & low, Vec16us const & h // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Overflow wraps around -static inline Vec16s compress (Vec8i const & low, Vec8i const & high) { +static inline Vec16s compress (Vec8i const low, Vec8i const high) { return Vec16s(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Signed with saturation -static inline Vec16s compress_saturated (Vec8i const & low, Vec8i const & high) { +static inline Vec16s compress_saturated (Vec8i const low, Vec8i const high) { return Vec16s(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Overflow wraps around -static inline Vec16us compress (Vec8ui const & low, Vec8ui const & high) { +static inline Vec16us compress (Vec8ui const low, Vec8ui const high) { return Vec16us(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Unsigned, with saturation -static inline Vec16us compress_saturated (Vec8ui const & low, Vec8ui const & high) { +static inline Vec16us compress_saturated (Vec8ui const low, Vec8ui const high) { return Vec16us(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); } @@ -4113,25 +3700,25 @@ static inline Vec16us compress_saturated (Vec8ui const & low, Vec8ui const & hig // Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers // Overflow wraps around -static inline Vec8i compress (Vec4q const & low, Vec4q const & high) { +static inline Vec8i compress (Vec4q const low, Vec4q const high) { return Vec8i(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers // Signed, with saturation -static inline Vec8i compress_saturated (Vec4q const & low, Vec4q const & high) { +static inline Vec8i compress_saturated (Vec4q const low, Vec4q const high) { return Vec8i(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); } // Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers // Overflow wraps around -static inline Vec8ui compress (Vec4uq const & low, Vec4uq const & high) { +static inline Vec8ui compress (Vec4uq const low, Vec4uq const high) { return Vec8ui (compress((Vec4q)low, (Vec4q)high)); } // Function compress : 
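// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: narrowing two
// 32-bit vectors back to one 16-bit vector with the compress functions above.
// compress wraps on overflow, compress_saturated clamps to the 16-bit range.
// Assumes vectorclass.h is included; narrow_demo is a hypothetical name.
static inline Vec16s narrow_demo(Vec8i const low, Vec8i const high) {
    Vec16s wrapped   = compress(low, high);            // overflow wraps around
    Vec16s saturated = compress_saturated(low, high);  // overflow clamps to 32767 / -32768
    return saturated - wrapped;                        // zero wherever no overflow occurred
}
// ---------------------------------------------------------------------------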
packs two vectors of 64-bit integers into one vector of 32-bit integers // Unsigned, with saturation -static inline Vec8ui compress_saturated (Vec4uq const & low, Vec4uq const & high) { +static inline Vec8ui compress_saturated (Vec4uq const low, Vec4uq const high) { return Vec8ui(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); } @@ -4144,20 +3731,20 @@ static inline Vec8ui compress_saturated (Vec4uq const & low, Vec4uq const & high // Divide Vec8i by compile-time constant template -static inline Vec8i divide_by_i(Vec8i const & a) { - return Vec8i( divide_by_i(a.get_low()), divide_by_i(a.get_high())); +static inline Vec8i divide_by_i(Vec8i const a) { + return Vec8i(divide_by_i(a.get_low()), divide_by_i(a.get_high())); } // define Vec8i a / const_int(d) template -static inline Vec8i operator / (Vec8i const & a, Const_int_t) { +static inline Vec8i operator / (Vec8i const a, Const_int_t) { return divide_by_i(a); } // define Vec8i a / const_uint(d) template -static inline Vec8i operator / (Vec8i const & a, Const_uint_t) { - Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned +static inline Vec8i operator / (Vec8i const a, Const_uint_t) { + static_assert(d < 0x80000000u, "Dividing signed integer by overflowing unsigned"); return divide_by_i(a); // signed divide } @@ -4178,20 +3765,20 @@ static inline Vec8i & operator /= (Vec8i & a, Const_uint_t b) { // Divide Vec8ui by compile-time constant template -static inline Vec8ui divide_by_ui(Vec8ui const & a) { +static inline Vec8ui divide_by_ui(Vec8ui const a) { return Vec8ui( divide_by_ui(a.get_low()), divide_by_ui(a.get_high())); } // define Vec8ui a / const_uint(d) template -static inline Vec8ui operator / (Vec8ui const & a, Const_uint_t) { +static inline Vec8ui operator / (Vec8ui const a, Const_uint_t) { return divide_by_ui(a); } // define Vec8ui a / const_int(d) template -static inline Vec8ui operator / (Vec8ui const & a, Const_int_t) { - Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous +static inline Vec8ui operator / (Vec8ui const a, Const_int_t) { + static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous"); return divide_by_ui(a); // unsigned divide } @@ -4209,23 +3796,22 @@ static inline Vec8ui & operator /= (Vec8ui & a, Const_int_t b) { return a; } - -// Divide Vec16s by compile-time constant +// Divide Vec16s by compile-time constant template -static inline Vec16s divide_by_i(Vec16s const & a) { +static inline Vec16s divide_by_i(Vec16s const a) { return Vec16s( divide_by_i(a.get_low()), divide_by_i(a.get_high())); } // define Vec16s a / const_int(d) template -static inline Vec16s operator / (Vec16s const & a, Const_int_t) { +static inline Vec16s operator / (Vec16s const a, Const_int_t) { return divide_by_i(a); } // define Vec16s a / const_uint(d) template -static inline Vec16s operator / (Vec16s const & a, Const_uint_t) { - Static_error_check< (d<0x8000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned +static inline Vec16s operator / (Vec16s const a, Const_uint_t) { + static_assert(d < 0x8000u, "Dividing signed integer by overflowing unsigned"); return divide_by_i(a); // signed divide } @@ -4243,23 +3829,22 @@ static inline Vec16s & operator /= (Vec16s & a, Const_uint_t b) { return a; } - // Divide Vec16us by compile-time constant template -static inline Vec16us 
divide_by_ui(Vec16us const & a) { +static inline Vec16us divide_by_ui(Vec16us const a) { return Vec16us( divide_by_ui(a.get_low()), divide_by_ui(a.get_high())); } // define Vec16us a / const_uint(d) template -static inline Vec16us operator / (Vec16us const & a, Const_uint_t) { +static inline Vec16us operator / (Vec16us const a, Const_uint_t) { return divide_by_ui(a); } // define Vec16us a / const_int(d) template -static inline Vec16us operator / (Vec16us const & a, Const_int_t) { - Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous +static inline Vec16us operator / (Vec16us const a, Const_int_t) { + static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous"); return divide_by_ui(a); // unsigned divide } @@ -4277,10 +3862,9 @@ static inline Vec16us & operator /= (Vec16us & a, Const_int_t b) { return a; } - // define Vec32c a / const_int(d) template -static inline Vec32c operator / (Vec32c const & a, Const_int_t) { +static inline Vec32c operator / (Vec32c const a, Const_int_t) { // expand into two Vec16s Vec16s low = extend_low(a) / Const_int_t(); Vec16s high = extend_high(a) / Const_int_t(); @@ -4289,8 +3873,8 @@ static inline Vec32c operator / (Vec32c const & a, Const_int_t) { // define Vec32c a / const_uint(d) template -static inline Vec32c operator / (Vec32c const & a, Const_uint_t) { - Static_error_check< (uint8_t(d)<0x80u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned +static inline Vec32c operator / (Vec32c const a, Const_uint_t) { + static_assert(uint8_t(d) < 0x80u, "Dividing signed integer by overflowing unsigned"); return a / Const_int_t(); // signed divide } @@ -4309,7 +3893,7 @@ static inline Vec32c & operator /= (Vec32c & a, Const_uint_t b) { // define Vec32uc a / const_uint(d) template -static inline Vec32uc operator / (Vec32uc const & a, Const_uint_t) { +static inline Vec32uc operator / (Vec32uc const a, Const_uint_t) { // expand into two Vec16us Vec16us low = extend_low(a) / Const_uint_t(); Vec16us high = extend_high(a) / Const_uint_t(); @@ -4318,8 +3902,8 @@ static inline Vec32uc operator / (Vec32uc const & a, Const_uint_t) { // define Vec32uc a / const_int(d) template -static inline Vec32uc operator / (Vec32uc const & a, Const_int_t) { - Static_error_check< (int8_t(d)>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous +static inline Vec32uc operator / (Vec32uc const a, Const_int_t) { + static_assert(int8_t(d) >= 0, "Dividing unsigned integer by negative is ambiguous"); return a / Const_uint_t(); // unsigned divide } @@ -4337,50 +3921,6 @@ static inline Vec32uc & operator /= (Vec32uc & a, Const_int_t b) { return a; } -/***************************************************************************** -* -* Horizontal scan functions -* -*****************************************************************************/ - -// Get index to the first element that is true. 
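// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: dividing by a
// compile-time constant with the operators defined above. const_int/const_uint
// are the library's wrapper macros for Const_int_t/Const_uint_t, as used in
// the comments above; divide_demo is a hypothetical name.
static inline Vec8i divide_demo(Vec8i const a) {
    Vec8i q = a / const_int(7);   // signed division by the constant 7,
                                  // implemented with multiply/shift, no div instruction
    // Vec8i bad = a / const_uint(0x90000000u);  // would not compile:
    //     static_assert "Dividing signed integer by overflowing unsigned"
    return q;
}
// ---------------------------------------------------------------------------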
Return -1 if all are false -static inline int horizontal_find_first(Vec32cb const & x) { - int a1 = horizontal_find_first(x.get_low()); - if (a1 >= 0) return a1; - int a2 = horizontal_find_first(x.get_high()); - if (a2 < 0) return a2; - return a2 + 16;; -} - -static inline int horizontal_find_first(Vec16sb const & x) { - return horizontal_find_first(Vec32cb(x)) >> 1; -} - -static inline int horizontal_find_first(Vec8ib const & x) { - return horizontal_find_first(Vec32cb(x)) >> 2; -} - -static inline int horizontal_find_first(Vec4qb const & x) { - return horizontal_find_first(Vec32cb(x)) >> 3; -} - -// Count the number of elements that are true -static inline uint32_t horizontal_count(Vec32cb const & x) { - return horizontal_count(x.get_low()) + horizontal_count(x.get_high()); -} - -static inline uint32_t horizontal_count(Vec16sb const & x) { - return horizontal_count(Vec32cb(x)) >> 1; -} - -static inline uint32_t horizontal_count(Vec8ib const & x) { - return horizontal_count(Vec32cb(x)) >> 2; -} - -static inline uint32_t horizontal_count(Vec4qb const & x) { - return horizontal_count(Vec32cb(x)) >> 3; -} - /***************************************************************************** * * Boolean <-> bitfield conversion functions @@ -4388,47 +3928,27 @@ static inline uint32_t horizontal_count(Vec4qb const & x) { *****************************************************************************/ // to_bits: convert boolean vector to integer bitfield -static inline uint32_t to_bits(Vec32cb const & x) { +static inline uint32_t to_bits(Vec32cb const x) { return to_bits(x.get_low()) | (uint32_t)to_bits(x.get_high()) << 16; } -// to_Vec16c: convert integer bitfield to boolean vector -static inline Vec32cb to_Vec32cb(uint32_t x) { - return Vec32c(to_Vec16cb(uint16_t(x)), to_Vec16cb(uint16_t(x>>16))); -} - // to_bits: convert boolean vector to integer bitfield -static inline uint16_t to_bits(Vec16sb const & x) { - return to_bits(x.get_low()) | (uint16_t)to_bits(x.get_high()) << 8; -} - -// to_Vec16sb: convert integer bitfield to boolean vector -static inline Vec16sb to_Vec16sb(uint16_t x) { - return Vec16s(to_Vec8sb(uint8_t(x)), to_Vec8sb(uint8_t(x>>8))); +static inline uint16_t to_bits(Vec16sb const x) { + return uint16_t(to_bits(x.get_low()) | (uint16_t)to_bits(x.get_high()) << 8); } // to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec8ib const & x) { - return to_bits(x.get_low()) | (uint8_t)to_bits(x.get_high()) << 4; -} - -// to_Vec8ib: convert integer bitfield to boolean vector -static inline Vec8ib to_Vec8ib(uint8_t x) { - return Vec8i(to_Vec4ib(x), to_Vec4ib(x>>4)); +static inline uint8_t to_bits(Vec8ib const x) { + return uint8_t(to_bits(x.get_low()) | (uint8_t)to_bits(x.get_high()) << 4); } // to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4qb const & x) { - return to_bits(x.get_low()) | to_bits(x.get_high()) << 2; -} - -// to_Vec16c: convert integer bitfield to boolean vector -static inline Vec4qb to_Vec4qb(uint8_t x) { - return Vec4q(to_Vec2qb(x), to_Vec2qb(x>>2)); +static inline uint8_t to_bits(Vec4qb const x) { + return uint8_t(to_bits(x.get_low()) | to_bits(x.get_high()) << 2); } #ifdef VCL_NAMESPACE } #endif -#endif // VECTORI256_H +#endif // VECTORI256E_H diff --git a/DFTTest/VCL2/vectori512.h b/DFTTest/VCL2/vectori512.h new file mode 100644 index 0000000..eca5d0b --- /dev/null +++ b/DFTTest/VCL2/vectori512.h @@ -0,0 +1,2133 @@ +/**************************** vectori512.h ******************************* +* Author: 
Agner Fog +* Date created: 2014-07-23 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector class library +* Description: +* Header file defining 512-bit integer vector classes for 32 and 64 bit integers. +* For x86 microprocessors with AVX512F and later instruction sets. +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec16i Vector of 16 32-bit signed integers +* Vec16ui Vector of 16 32-bit unsigned integers +* Vec16ib Vector of 16 Booleans for use with Vec16i and Vec16ui +* Vec8q Vector of 8 64-bit signed integers +* Vec8uq Vector of 8 64-bit unsigned integers +* Vec8qb Vector of 8 Booleans for use with Vec8q and Vec8uq +* Other 512-bit integer vectors are defined in Vectori512s.h +* +* Each vector object is represented internally in the CPU as a 512-bit register. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +*****************************************************************************/ + +#ifndef VECTORI512_H +#define VECTORI512_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +// check combination of header files +#ifdef VECTORI512E_H +#error Two different versions of vectori512.h included +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + +// Generate a constant vector of 16 integers stored in memory. +// Can be converted to any integer vector type +template +static inline __m512i constant16ui() { + /* + const union { + uint32_t i[16]; + __m512i zmm; + } u = {{i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}}; + return u.zmm; + */ + return _mm512_setr_epi32(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15); +} + + +/***************************************************************************** +* +* Boolean vector classes for AVX512 +* +*****************************************************************************/ + +typedef Vec16b Vec16ib; +typedef Vec16b Vec16uib; +typedef Vec8b Vec8qb; +typedef Vec8b Vec8uqb; + + +/***************************************************************************** +* +* Vector of 512 bits. 
Used as base class for Vec16i and Vec8q +* +*****************************************************************************/ +class Vec512b { +protected: + __m512i zmm; // Integer vector +public: + // Default constructor: + Vec512b() { + } + // Constructor to build from two Vec256b: + Vec512b(Vec256b const a0, Vec256b const a1) { + zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); + } + // Constructor to convert from type __m512i used in intrinsics: + Vec512b(__m512i const x) { + zmm = x; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec512b & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Type cast operator to convert to __m512i used in intrinsics + operator __m512i() const { + return zmm; + } + // Member function to load from array (unaligned) + Vec512b & load(void const * p) { + zmm = _mm512_loadu_si512(p); + return *this; + } + // Member function to load from array, aligned by 64 + // You may use load_a instead of load if you are certain that p points to an address + // divisible by 64, but there is hardly any speed advantage of load_a on modern processors + Vec512b & load_a(void const * p) { + zmm = _mm512_load_si512(p); + return *this; + } + // Member function to store into array (unaligned) + void store(void * p) const { + _mm512_storeu_si512(p, zmm); + } + // Member function to store into array, aligned by 64 + // You may use store_a instead of store if you are certain that p points to an address + // divisible by 64, but there is hardly any speed advantage of store_a on modern processors + void store_a(void * p) const { + _mm512_store_si512(p, zmm); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. 
+ // Note: Will generate runtime error if p is not aligned by 64 + void store_nt(void * p) const { + _mm512_stream_si512((__m512i*)p, zmm); + } + // Member functions to split into two Vec256b: + Vec256b get_low() const { + return _mm512_castsi512_si256(zmm); + } + Vec256b get_high() const { + return _mm512_extracti64x4_epi64(zmm,1); + } + static constexpr int size() { + return 512; + } + static constexpr int elementtype() { + return 1; + } + typedef __m512i registertype; +}; + +// Define operators and functions for this class + +// vector operator & : bitwise and +static inline Vec512b operator & (Vec512b const a, Vec512b const b) { + return _mm512_and_epi32(a, b); +} +static inline Vec512b operator && (Vec512b const a, Vec512b const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec512b operator | (Vec512b const a, Vec512b const b) { + return _mm512_or_epi32(a, b); +} +static inline Vec512b operator || (Vec512b const a, Vec512b const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec512b operator ^ (Vec512b const a, Vec512b const b) { + return _mm512_xor_epi32(a, b); +} + +// vector operator ~ : bitwise not +static inline Vec512b operator ~ (Vec512b const a) { + return _mm512_xor_epi32(a, _mm512_set1_epi32(-1)); +} + +// vector operator &= : bitwise and +static inline Vec512b & operator &= (Vec512b & a, Vec512b const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec512b & operator |= (Vec512b & a, Vec512b const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec512b & operator ^= (Vec512b & a, Vec512b const b) { + a = a ^ b; + return a; +} + +// function andnot: a & ~ b +static inline Vec512b andnot (Vec512b const a, Vec512b const b) { + return _mm512_andnot_epi32(b, a); +} + +static inline __m512i zero_si512() { + return _mm512_setzero_si512(); +} + + +/***************************************************************************** +* +* Vector of 16 32-bit signed integers +* +*****************************************************************************/ + +class Vec16i: public Vec512b { +public: + // Default constructor: + Vec16i() { + } + // Constructor to broadcast the same value into all elements: + Vec16i(int i) { + zmm = _mm512_set1_epi32(i); + } + // Constructor to build from all elements: + Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7, + int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) { + zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); + } + // Constructor to build from two Vec8i: + Vec16i(Vec8i const a0, Vec8i const a1) { + zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); + } + // Constructor to convert from type __m512i used in intrinsics: + Vec16i(__m512i const x) { + zmm = x; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec16i & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Type cast operator to convert to __m512i used in intrinsics + operator __m512i() const { + return zmm; + } + // Member function to load from array (unaligned) + Vec16i & load(void const * p) { + zmm = _mm512_loadu_si512(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec16i & load_a(void const * p) { + zmm = _mm512_load_si512(p); + return *this; + } + // Member function to load 16 unsigned 8-bit integers from array + 
Vec16i & load_16uc(void const * p) { + zmm = _mm512_cvtepu8_epi32(Vec16uc().load(p)); + return *this; + } + // Member function to load 16 unsigned 16-bit integers from array + Vec16i & load_16us(void const * p) { + zmm = _mm512_cvtepu16_epi32(Vec16us().load(p)); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec16i & load_partial(int n, void const * p) { + zmm = _mm512_maskz_loadu_epi32(__mmask16((1u << n) - 1), p); + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { + _mm512_mask_storeu_epi32(p, __mmask16((1u << n) - 1), zmm); + } + // cut off vector to n elements. The last 16-n elements are set to zero + Vec16i & cutoff(int n) { + zmm = _mm512_maskz_mov_epi32(__mmask16((1u << n) - 1), zmm); + return *this; + } + // Member function to change a single element in vector + Vec16i const insert(int index, int32_t value) { + zmm = _mm512_mask_set1_epi32(zmm, __mmask16(1u << index), value); + return *this; + } + // Member function extract a single element from vector + int32_t extract(int index) const { + __m512i x = _mm512_maskz_compress_epi32(__mmask16(1u << index), zmm); + return _mm_cvtsi128_si32(_mm512_castsi512_si128(x)); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int32_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec8i: + Vec8i get_low() const { + return _mm512_castsi512_si256(zmm); + } + Vec8i get_high() const { + return _mm512_extracti64x4_epi64(zmm,1); + } + static constexpr int size() { + return 16; + } + static constexpr int elementtype() { + return 8; + } +}; + + +// Define operators for Vec16i + +// vector operator + : add element by element +static inline Vec16i operator + (Vec16i const a, Vec16i const b) { + return _mm512_add_epi32(a, b); +} +// vector operator += : add +static inline Vec16i & operator += (Vec16i & a, Vec16i const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec16i operator ++ (Vec16i & a, int) { + Vec16i a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec16i & operator ++ (Vec16i & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec16i operator - (Vec16i const a, Vec16i const b) { + return _mm512_sub_epi32(a, b); +} +// vector operator - : unary minus +static inline Vec16i operator - (Vec16i const a) { + return _mm512_sub_epi32(_mm512_setzero_epi32(), a); +} +// vector operator -= : subtract +static inline Vec16i & operator -= (Vec16i & a, Vec16i const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec16i operator -- (Vec16i & a, int) { + Vec16i a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec16i & operator -- (Vec16i & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec16i operator * (Vec16i const a, Vec16i const b) { + return _mm512_mullo_epi32(a, b); +} +// vector operator *= : multiply +static inline Vec16i & operator *= (Vec16i & a, Vec16i const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer. 
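// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: basic use of the
// Vec16i class defined above (broadcast construction, partial load, element
// access, arithmetic). Assumes an AVX512F target with vectorclass.h included;
// the identifiers in this snippet are hypothetical.
static inline int32_t vec16i_demo(const int32_t * p, int n) {
    Vec16i a;
    a.load_partial(n, p);          // first n elements from p, remaining elements zeroed
    Vec16i b(3);                   // broadcast 3 into all 16 elements
    Vec16i c = a * b + 1;          // element-wise multiply, then add 1 to every element
    return c.extract(0);           // read back a single element
}
// ---------------------------------------------------------------------------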
See bottom of file + +// vector operator << : shift left +static inline Vec16i operator << (Vec16i const a, int32_t b) { + return _mm512_sll_epi32(a, _mm_cvtsi32_si128(b)); +} +// vector operator <<= : shift left +static inline Vec16i & operator <<= (Vec16i & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec16i operator >> (Vec16i const a, int32_t b) { + return _mm512_sra_epi32(a, _mm_cvtsi32_si128(b)); +} +// vector operator >>= : shift right arithmetic +static inline Vec16i & operator >>= (Vec16i & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec16ib operator == (Vec16i const a, Vec16i const b) { + return _mm512_cmpeq_epi32_mask(a, b); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec16ib operator != (Vec16i const a, Vec16i const b) { + return _mm512_cmpneq_epi32_mask(a, b); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec16ib operator > (Vec16i const a, Vec16i const b) { + return _mm512_cmp_epi32_mask(a, b, 6); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec16ib operator < (Vec16i const a, Vec16i const b) { + return _mm512_cmp_epi32_mask(a, b, 1); +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec16ib operator >= (Vec16i const a, Vec16i const b) { + return _mm512_cmp_epi32_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec16ib operator <= (Vec16i const a, Vec16i const b) { + return _mm512_cmp_epi32_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec16i operator & (Vec16i const a, Vec16i const b) { + return _mm512_and_epi32(a, b); +} +// vector operator &= : bitwise and +static inline Vec16i & operator &= (Vec16i & a, Vec16i const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec16i operator | (Vec16i const a, Vec16i const b) { + return _mm512_or_epi32(a, b); +} +// vector operator |= : bitwise or +static inline Vec16i & operator |= (Vec16i & a, Vec16i const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16i operator ^ (Vec16i const a, Vec16i const b) { + return _mm512_xor_epi32(a, b); +} +// vector operator ^= : bitwise xor +static inline Vec16i & operator ^= (Vec16i & a, Vec16i const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec16i operator ~ (Vec16i const a) { + return a ^ Vec16i(-1); + // This is potentially faster, but not on any current compiler: + //return _mm512_ternarylogic_epi32(_mm512_undefined_epi32(), _mm512_undefined_epi32(), a, 0x55); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16i select (Vec16ib const s, Vec16i const a, Vec16i const b) { + return _mm512_mask_mov_epi32(b, s, a); // conditional move may be optimized better by the compiler than blend + // return _mm512_mask_blend_epi32(s, b, a); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
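// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: with AVX512F the
// comparison operators above return a compact per-element mask (Vec16ib),
// which feeds directly into select. Assumes vectorclass.h on an AVX512F
// target; clamp_demo is a hypothetical name.
static inline Vec16i clamp_demo(Vec16i const a) {
    Vec16ib neg = a < 0;               // mask: true where the element is negative
    return select(neg, Vec16i(0), a);  // replace negative elements by zero
}
// ---------------------------------------------------------------------------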
 (a[i] + b[i]) : a[i]
+static inline Vec16i if_add (Vec16ib const f, Vec16i const a, Vec16i const b) {
+    return _mm512_mask_add_epi32(a, f, a, b);
+}
+
+// Conditional subtract
+static inline Vec16i if_sub (Vec16ib const f, Vec16i const a, Vec16i const b) {
+    return _mm512_mask_sub_epi32(a, f, a, b);
+}
+
+// Conditional multiply
+static inline Vec16i if_mul (Vec16ib const f, Vec16i const a, Vec16i const b) {
+    return _mm512_mask_mullo_epi32(a, f, a, b);
+}
+
+// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around
+static inline int32_t horizontal_add (Vec16i const a) {
+#if defined(__INTEL_COMPILER)
+    return _mm512_reduce_add_epi32(a);
+#else
+    return horizontal_add(a.get_low() + a.get_high());
+#endif
+}
+
+// function add_saturated: add element by element, signed with saturation
+// (is it faster to up-convert to 64 bit integers, and then downconvert the sum with saturation?)
+static inline Vec16i add_saturated(Vec16i const a, Vec16i const b) {
+    __m512i sum = _mm512_add_epi32(a, b); // a + b
+    __m512i axb = _mm512_xor_epi32(a, b); // check if a and b have different sign
+    __m512i axs = _mm512_xor_epi32(a, sum); // check if a and sum have different sign
+    __m512i ovf1 = _mm512_andnot_epi32(axb,axs); // check if sum has wrong sign
+    __m512i ovf2 = _mm512_srai_epi32(ovf1,31); // -1 if overflow
+    __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32()); // same, as mask
+    __m512i asign = _mm512_srli_epi32(a,31); // 1 if a < 0
+    __m512i sat1 = _mm512_srli_epi32(ovf2,1); // 7FFFFFFF if overflow
+    __m512i sat2 = _mm512_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow
+    return _mm512_mask_blend_epi32(ovf3, sum, sat2); // sum if not overflow, else sat2
+}
+
+// function sub_saturated: subtract element by element, signed with saturation
+static inline Vec16i sub_saturated(Vec16i const a, Vec16i const b) {
+    __m512i diff = _mm512_sub_epi32(a, b); // a - b
+    __m512i axb = _mm512_xor_si512(a, b); // check if a and b have different sign
+    __m512i axs = _mm512_xor_si512(a, diff); // check if a and diff have different sign
+    __m512i ovf1 = _mm512_and_si512(axb,axs); // check if diff has wrong sign
+    __m512i ovf2 = _mm512_srai_epi32(ovf1,31); // -1 if overflow
+    __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32()); // same, as mask
+    __m512i asign = _mm512_srli_epi32(a,31); // 1 if a < 0
+    __m512i sat1 = _mm512_srli_epi32(ovf2,1); // 7FFFFFFF if overflow
+    __m512i sat2 = _mm512_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow
+    return _mm512_mask_blend_epi32(ovf3, diff, sat2); // diff if not overflow, else sat2
+}
+
+// function max: a > b ? a : b
+static inline Vec16i max(Vec16i const a, Vec16i const b) {
+    return _mm512_max_epi32(a,b);
+}
+
+// function min: a < b ? a : b
+static inline Vec16i min(Vec16i const a, Vec16i const b) {
+    return _mm512_min_epi32(a,b);
+}
+
+// function abs: a >= 0 ?
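// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: contrasting the
// wrapping + operator with add_saturated, and reducing a vector with
// horizontal_add. Assumes vectorclass.h on an AVX512F target; saturate_demo
// is a hypothetical name.
static inline int32_t saturate_demo() {
    Vec16i big(0x7FFFFFF0);                                // close to INT32_MAX
    Vec16i wrapped   = big + 0x100;                        // wraps around to a negative value
    Vec16i saturated = add_saturated(big, Vec16i(0x100));  // clamps to 0x7FFFFFFF
    return horizontal_add(saturated - wrapped);            // sum of the per-element differences (wraps on overflow)
}
// ---------------------------------------------------------------------------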
a : -a +static inline Vec16i abs(Vec16i const a) { + return _mm512_abs_epi32(a); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec16i abs_saturated(Vec16i const a) { + return _mm512_min_epu32(abs(a), Vec16i(0x7FFFFFFF)); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec16i rotate_left(Vec16i const a, int b) { + return _mm512_rolv_epi32(a, Vec16i(b)); +} + + +/***************************************************************************** +* +* Vector of 16 32-bit unsigned integers +* +*****************************************************************************/ + +class Vec16ui : public Vec16i { +public: + // Default constructor: + Vec16ui() { + } + // Constructor to broadcast the same value into all elements: + Vec16ui(uint32_t i) { + zmm = _mm512_set1_epi32((int32_t)i); + } + // Constructor to build from all elements: + Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7, + uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) { + zmm = _mm512_setr_epi32((int32_t)i0, (int32_t)i1, (int32_t)i2, (int32_t)i3, (int32_t)i4, (int32_t)i5, (int32_t)i6, (int32_t)i7, + (int32_t)i8, (int32_t)i9, (int32_t)i10, (int32_t)i11, (int32_t)i12, (int32_t)i13, (int32_t)i14, (int32_t)i15); + } + // Constructor to build from two Vec8ui: + Vec16ui(Vec8ui const a0, Vec8ui const a1) { + zmm = Vec16i(Vec8i(a0), Vec8i(a1)); + } + // Constructor to convert from type __m512i used in intrinsics: + Vec16ui(__m512i const x) { + zmm = x; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec16ui & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec16ui & load(void const * p) { + Vec16i::load(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec16ui & load_a(void const * p) { + Vec16i::load_a(p); + return *this; + } + // Member function to change a single element in vector + Vec16ui const insert(int index, uint32_t value) { + Vec16i::insert(index, (int32_t)value); + return *this; + } + // Member function extract a single element from vector + uint32_t extract(int index) const { + return (uint32_t)Vec16i::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ uint32_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4ui: + Vec8ui get_low() const { + return Vec8ui(Vec16i::get_low()); + } + Vec8ui get_high() const { + return Vec8ui(Vec16i::get_high()); + } + static constexpr int elementtype() { + return 9; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec16ui operator + (Vec16ui const a, Vec16ui const b) { + return Vec16ui (Vec16i(a) + Vec16i(b)); +} + +// vector operator - : subtract +static inline Vec16ui operator - (Vec16ui const a, Vec16ui const b) { + return Vec16ui (Vec16i(a) - Vec16i(b)); +} + +// vector operator * : multiply +static inline Vec16ui operator * (Vec16ui const a, Vec16ui const b) { + return Vec16ui (Vec16i(a) * Vec16i(b)); +} + +// vector operator / : divide +// See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec16ui operator >> (Vec16ui const a, uint32_t b) { + return _mm512_srl_epi32(a, _mm_cvtsi32_si128((int32_t)b)); +} +static inline Vec16ui operator >> (Vec16ui const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right logical +static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >>= : shift right logical +static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec16ui operator << (Vec16ui const a, uint32_t b) { + return Vec16ui ((Vec16i)a << (int32_t)b); +} + +// vector operator << : shift left all elements +static inline Vec16ui operator << (Vec16ui const a, int32_t b) { + return Vec16ui ((Vec16i)a << (int32_t)b); +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec16ib operator < (Vec16ui const a, Vec16ui const b) { + return _mm512_cmp_epu32_mask(a, b, 1); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec16ib operator > (Vec16ui const a, Vec16ui const b) { + return _mm512_cmp_epu32_mask(a, b, 6); +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec16ib operator >= (Vec16ui const a, Vec16ui const b) { + return _mm512_cmp_epu32_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec16ib operator <= (Vec16ui const a, Vec16ui const b) { + return _mm512_cmp_epu32_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec16ui operator & (Vec16ui const a, Vec16ui const b) { + return Vec16ui(Vec16i(a) & Vec16i(b)); +} + +// vector operator | : bitwise or +static inline Vec16ui operator | (Vec16ui const a, Vec16ui const b) { + return Vec16ui(Vec16i(a) | Vec16i(b)); +} + +// vector operator ^ : bitwise xor +static inline Vec16ui operator ^ (Vec16ui const a, Vec16ui const b) { + return Vec16ui(Vec16i(a) ^ Vec16i(b)); +} + +// vector operator ~ : bitwise not +static inline Vec16ui operator ~ (Vec16ui const a) { + return Vec16ui( ~ Vec16i(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16ui select (Vec16ib const s, Vec16ui const a, Vec16ui const b) { + return Vec16ui(select(s, Vec16i(a), Vec16i(b))); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
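// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: the unsigned
// wrapper class changes the meaning of >> (logical instead of arithmetic
// shift) and of the ordered comparisons. Assumes vectorclass.h on an AVX512F
// target; unsigned_demo is a hypothetical name.
static inline Vec16ui unsigned_demo(Vec16ui const a) {
    Vec16ui halved = a >> 1;                    // logical shift: no sign propagation
    Vec16ib big    = a > Vec16ui(0x80000000u);  // unsigned compare, correct for values with the top bit set
    return select(big, halved, a);              // halve only the large values
}
// ---------------------------------------------------------------------------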
(a[i] + b[i]) : a[i] +static inline Vec16ui if_add (Vec16ib const f, Vec16ui const a, Vec16ui const b) { + return Vec16ui(if_add(f, Vec16i(a), Vec16i(b))); +} + +// Conditional subtract +static inline Vec16ui if_sub (Vec16ib const f, Vec16ui const a, Vec16ui const b) { + return Vec16ui(if_sub(f, Vec16i(a), Vec16i(b))); +} + +// Conditional multiply +static inline Vec16ui if_mul (Vec16ib const f, Vec16ui const a, Vec16ui const b) { + return Vec16ui(if_mul(f, Vec16i(a), Vec16i(b))); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint32_t horizontal_add (Vec16ui const a) { + return (uint32_t)horizontal_add((Vec16i)a); +} + +// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec16ui add_saturated(Vec16ui const a, Vec16ui const b) { + Vec16ui sum = a + b; + Vec16ib overflow = sum < (a | b); // overflow if (a + b) < (a | b) + return _mm512_mask_set1_epi32(sum, overflow, -1); // 0xFFFFFFFF if overflow +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec16ui sub_saturated(Vec16ui const a, Vec16ui const b) { + Vec16ui diff = a - b; + return _mm512_maskz_mov_epi32(diff <= a, diff); // underflow if diff > a gives zero +} + +// function max: a > b ? a : b +static inline Vec16ui max(Vec16ui const a, Vec16ui const b) { + return _mm512_max_epu32(a,b); +} + +// function min: a < b ? a : b +static inline Vec16ui min(Vec16ui const a, Vec16ui const b) { + return _mm512_min_epu32(a,b); +} + + +/***************************************************************************** +* +* Vector of 8 64-bit signed integers +* +*****************************************************************************/ + +class Vec8q : public Vec512b { +public: + // Default constructor: + Vec8q() { + } + // Constructor to broadcast the same value into all elements: + Vec8q(int64_t i) { + zmm = _mm512_set1_epi64(i); + } + // Constructor to build from all elements: + Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) { + zmm = _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7); + } + // Constructor to build from two Vec4q: + Vec8q(Vec4q const a0, Vec4q const a1) { + zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); + } + // Constructor to convert from type __m512i used in intrinsics: + Vec8q(__m512i const x) { + zmm = x; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec8q & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Type cast operator to convert to __m512i used in intrinsics + operator __m512i() const { + return zmm; + } + // Member function to load from array (unaligned) + Vec8q & load(void const * p) { + zmm = _mm512_loadu_si512(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec8q & load_a(void const * p) { + zmm = _mm512_load_si512(p); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec8q & load_partial(int n, void const * p) { + zmm = _mm512_maskz_loadu_epi64(__mmask16((1 << n) - 1), p); + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { + _mm512_mask_storeu_epi64(p, __mmask16((1 << n) - 1), zmm); + } + // cut off vector to n elements. 
The last 8-n elements are set to zero + Vec8q & cutoff(int n) { + zmm = _mm512_maskz_mov_epi64(__mmask16((1 << n) - 1), zmm); + return *this; + } + // Member function to change a single element in vector + Vec8q const insert(int index, int64_t value) { +#ifdef __x86_64__ + zmm = _mm512_mask_set1_epi64(zmm, __mmask16(1 << index), value); +#else + __m512i v = Vec8q(value); + zmm = _mm512_mask_mov_epi64(zmm, __mmask16(1 << index), v); +#endif + return *this; + } + // Member function extract a single element from vector + int64_t extract(int index) const { + __m512i x = _mm512_maskz_compress_epi64(__mmask8(1u << index), zmm); + return _emulate_movq(_mm512_castsi512_si128(x)); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int64_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec2q: + Vec4q get_low() const { + return _mm512_castsi512_si256(zmm); + } + Vec4q get_high() const { + return _mm512_extracti64x4_epi64(zmm,1); + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 10; + } +}; + + +// Define operators for Vec8q + +// vector operator + : add element by element +static inline Vec8q operator + (Vec8q const a, Vec8q const b) { + return _mm512_add_epi64(a, b); +} +// vector operator += : add +static inline Vec8q & operator += (Vec8q & a, Vec8q const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec8q operator ++ (Vec8q & a, int) { + Vec8q a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec8q & operator ++ (Vec8q & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8q operator - (Vec8q const a, Vec8q const b) { + return _mm512_sub_epi64(a, b); +} +// vector operator - : unary minus +static inline Vec8q operator - (Vec8q const a) { + return _mm512_sub_epi64(_mm512_setzero_epi32(), a); +} +// vector operator -= : subtract +static inline Vec8q & operator -= (Vec8q & a, Vec8q const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec8q operator -- (Vec8q & a, int) { + Vec8q a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec8q & operator -- (Vec8q & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8q operator * (Vec8q const a, Vec8q const b) { +#if INSTRSET >= 10 // __AVX512DQ__ + return _mm512_mullo_epi64(a, b); +#elif defined (__INTEL_COMPILER) + return _mm512_mullox_epi64(a, b); // _mm512_mullox_epi64 missing in gcc +#else + // instruction does not exist. 
Split into 32-bit multiplications + //__m512i ahigh = _mm512_shuffle_epi32(a, 0xB1); // swap H<->L + __m512i ahigh = _mm512_srli_epi64(a, 32); // high 32 bits of each a + __m512i bhigh = _mm512_srli_epi64(b, 32); // high 32 bits of each b + __m512i prodahb = _mm512_mul_epu32(ahigh, b); // ahigh*b + __m512i prodbha = _mm512_mul_epu32(bhigh, a); // bhigh*a + __m512i prodhl = _mm512_add_epi64(prodahb, prodbha); // sum of high*low products + __m512i prodhi = _mm512_slli_epi64(prodhl, 32); // same, shifted high + __m512i prodll = _mm512_mul_epu32(a, b); // alow*blow = 64 bit unsigned products + __m512i prod = _mm512_add_epi64(prodll, prodhi); // low*low+(high*low)<<32 + return prod; +#endif +} + +// vector operator *= : multiply +static inline Vec8q & operator *= (Vec8q & a, Vec8q const b) { + a = a * b; + return a; +} + +// vector operator << : shift left +static inline Vec8q operator << (Vec8q const a, int32_t b) { + return _mm512_sll_epi64(a, _mm_cvtsi32_si128(b)); +} +// vector operator <<= : shift left +static inline Vec8q & operator <<= (Vec8q & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec8q operator >> (Vec8q const a, int32_t b) { + return _mm512_sra_epi64(a, _mm_cvtsi32_si128(b)); +} +// vector operator >>= : shift right arithmetic +static inline Vec8q & operator >>= (Vec8q & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec8qb operator == (Vec8q const a, Vec8q const b) { + return Vec8qb(_mm512_cmpeq_epi64_mask(a, b)); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec8qb operator != (Vec8q const a, Vec8q const b) { + return Vec8qb(_mm512_cmpneq_epi64_mask(a, b)); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec8qb operator < (Vec8q const a, Vec8q const b) { + return _mm512_cmp_epi64_mask(a, b, 1); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec8qb operator > (Vec8q const a, Vec8q const b) { + return _mm512_cmp_epi64_mask(a, b, 6); +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec8qb operator >= (Vec8q const a, Vec8q const b) { + return _mm512_cmp_epi64_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec8qb operator <= (Vec8q const a, Vec8q const b) { + return _mm512_cmp_epi64_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec8q operator & (Vec8q const a, Vec8q const b) { + return _mm512_and_epi32(a, b); +} +// vector operator &= : bitwise and +static inline Vec8q & operator &= (Vec8q & a, Vec8q const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec8q operator | (Vec8q const a, Vec8q const b) { + return _mm512_or_epi32(a, b); +} +// vector operator |= : bitwise or +static inline Vec8q & operator |= (Vec8q & a, Vec8q const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8q operator ^ (Vec8q const a, Vec8q const b) { + return _mm512_xor_epi32(a, b); +} +// vector operator ^= : bitwise xor +static inline Vec8q & operator ^= (Vec8q & a, Vec8q const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec8q operator ~ (Vec8q const a) { + return Vec8q(~ Vec16i(a)); + //return _mm512_ternarylogic_epi64(_mm512_undefined_epi32(), _mm512_undefined_epi32(), a, 0x55); +} + +// 
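// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: the AVX512F
// fallback in operator * above builds a 64-bit product from 32x32-bit partial
// products,
//   lo64(a*b) = lo(a)*lo(b) + ((hi(a)*lo(b) + lo(a)*hi(b)) << 32)
// (the hi(a)*hi(b) term only affects bits above 64 and is dropped). A scalar
// check of that identity; mul64_demo is a hypothetical name.
static inline bool mul64_demo(uint64_t a, uint64_t b) {
    uint64_t alo = a & 0xFFFFFFFFu, ahi = a >> 32;
    uint64_t blo = b & 0xFFFFFFFFu, bhi = b >> 32;
    uint64_t lo  = alo * blo + ((ahi * blo + alo * bhi) << 32);
    return lo == a * b;   // identical modulo 2^64
}
// ---------------------------------------------------------------------------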
Functions for this class
+
+// Select between two operands. Corresponds to this pseudocode:
+// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
+static inline Vec8q select (Vec8qb const s, Vec8q const a, Vec8q const b) {
+    // avoid warning in MS compiler if INSTRSET = 9 by casting mask to uint8_t, while __mmask8 is not supported in AVX512F
+    return _mm512_mask_mov_epi64(b, (uint8_t)s, a);
+}
+
+// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
+static inline Vec8q if_add (Vec8qb const f, Vec8q const a, Vec8q const b) {
+    return _mm512_mask_add_epi64(a, (uint8_t)f, a, b);
+}
+
+// Conditional subtract
+static inline Vec8q if_sub (Vec8qb const f, Vec8q const a, Vec8q const b) {
+    return _mm512_mask_sub_epi64(a, (uint8_t)f, a, b);
+}
+
+// Conditional multiply
+static inline Vec8q if_mul (Vec8qb const f, Vec8q const a, Vec8q const b) {
+#if INSTRSET >= 10
+    return _mm512_mask_mullo_epi64(a, f, a, b); // AVX512DQ
+#else
+    return select(f, a*b, a);
+#endif
+}
+
+// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around
+static inline int64_t horizontal_add (Vec8q const a) {
+#if defined(__INTEL_COMPILER)
+    return _mm512_reduce_add_epi64(a);
+#else
+    return horizontal_add(a.get_low()+a.get_high());
+#endif
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements
+// Elements are sign extended before adding to avoid overflow
+static inline int64_t horizontal_add_x (Vec16i const x) {
+    Vec8q a = _mm512_cvtepi32_epi64(x.get_low());
+    Vec8q b = _mm512_cvtepi32_epi64(x.get_high());
+    return horizontal_add(a+b);
+}
+
+// Horizontal add extended: Calculates the sum of all vector elements
+// Elements are zero extended before adding to avoid overflow
+static inline uint64_t horizontal_add_x (Vec16ui const x) {
+    Vec8q a = _mm512_cvtepu32_epi64(x.get_low());
+    Vec8q b = _mm512_cvtepu32_epi64(x.get_high());
+    return (uint64_t)horizontal_add(a+b);
+}
+
+// function max: a > b ? a : b
+static inline Vec8q max(Vec8q const a, Vec8q const b) {
+    return _mm512_max_epi64(a, b);
+}
+
+// function min: a < b ? a : b
+static inline Vec8q min(Vec8q const a, Vec8q const b) {
+    return _mm512_min_epi64(a, b);
+}
+
+// function abs: a >= 0 ?
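// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: summing sixteen
// 32-bit elements without 32-bit overflow by letting horizontal_add_x widen
// to 64 bits first. Assumes vectorclass.h on an AVX512F target; sum_demo is a
// hypothetical name.
static inline int64_t sum_demo(const int32_t * p) {
    Vec16i v;
    v.load(p);                     // 16 signed 32-bit values (unaligned load)
    return horizontal_add_x(v);    // elements are sign extended to 64 bits before adding
}
// ---------------------------------------------------------------------------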
a : -a +static inline Vec8q abs(Vec8q const a) { + return _mm512_abs_epi64(a); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec8q abs_saturated(Vec8q const a) { + return _mm512_min_epu64(abs(a), Vec8q(0x7FFFFFFFFFFFFFFF)); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec8q rotate_left(Vec8q const a, int b) { + return _mm512_rolv_epi64(a, Vec8q(b)); +} + + +/***************************************************************************** +* +* Vector of 8 64-bit unsigned integers +* +*****************************************************************************/ + +class Vec8uq : public Vec8q { +public: + // Default constructor: + Vec8uq() { + } + // Constructor to broadcast the same value into all elements: + Vec8uq(uint64_t i) { + zmm = Vec8q((int64_t)i); + } + // Constructor to convert from Vec8q: + Vec8uq(Vec8q const x) { + zmm = x; + } + // Constructor to convert from type __m512i used in intrinsics: + Vec8uq(__m512i const x) { + zmm = x; + } + // Constructor to build from all elements: + Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) { + zmm = Vec8q((int64_t)i0, (int64_t)i1, (int64_t)i2, (int64_t)i3, (int64_t)i4, (int64_t)i5, (int64_t)i6, (int64_t)i7); + } + // Constructor to build from two Vec4uq: + Vec8uq(Vec4uq const a0, Vec4uq const a1) { + zmm = Vec8q(Vec4q(a0), Vec4q(a1)); + } + // Assignment operator to convert from Vec8q: + Vec8uq & operator = (Vec8q const x) { + zmm = x; + return *this; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec8uq & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec8uq & load(void const * p) { + Vec8q::load(p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec8uq & load_a(void const * p) { + Vec8q::load_a(p); + return *this; + } + // Member function to change a single element in vector + Vec8uq const insert(int index, uint64_t value) { + Vec8q::insert(index, (int64_t)value); + return *this; + } + // Member function extract a single element from vector + uint64_t extract(int index) const { + return (uint64_t)Vec8q::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ uint64_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec2uq: + Vec4uq get_low() const { + return Vec4uq(Vec8q::get_low()); + } + Vec4uq get_high() const { + return Vec4uq(Vec8q::get_high()); + } + static constexpr int elementtype() { + return 10; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec8uq operator + (Vec8uq const a, Vec8uq const b) { + return Vec8uq (Vec8q(a) + Vec8q(b)); +} + +// vector operator - : subtract +static inline Vec8uq operator - (Vec8uq const a, Vec8uq const b) { + return Vec8uq (Vec8q(a) - Vec8q(b)); +} + +// vector operator * : multiply element by element +static inline Vec8uq operator * (Vec8uq const a, Vec8uq const b) { + return Vec8uq (Vec8q(a) * Vec8q(b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec8uq operator >> (Vec8uq const a, uint32_t b) { + return _mm512_srl_epi64(a,_mm_cvtsi32_si128((int32_t)b)); +} +static inline Vec8uq operator >> (Vec8uq const a, int32_t b) { + return a >> (uint32_t)b; +} +// vector operator >>= : shift right artihmetic +static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) { + a = a >> b; + return a; +} +// vector operator >>= : shift right logical +static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec8uq operator << (Vec8uq const a, uint32_t b) { + return Vec8uq ((Vec8q)a << (int32_t)b); +} +// vector operator << : shift left all elements +static inline Vec8uq operator << (Vec8uq const a, int32_t b) { + return Vec8uq ((Vec8q)a << b); +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec8qb operator < (Vec8uq const a, Vec8uq const b) { + return _mm512_cmp_epu64_mask(a, b, 1); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec8qb operator > (Vec8uq const a, Vec8uq const b) { + return _mm512_cmp_epu64_mask(a, b, 6); +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec8qb operator >= (Vec8uq const a, Vec8uq const b) { + return _mm512_cmp_epu64_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec8qb operator <= (Vec8uq const a, Vec8uq const b) { + return _mm512_cmp_epu64_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec8uq operator & (Vec8uq const a, Vec8uq const b) { + return Vec8uq(Vec8q(a) & Vec8q(b)); +} + +// vector operator | : bitwise or +static inline Vec8uq operator | (Vec8uq const a, Vec8uq const b) { + return Vec8uq(Vec8q(a) | Vec8q(b)); +} + +// vector operator ^ : bitwise xor +static inline Vec8uq operator ^ (Vec8uq const a, Vec8uq const b) { + return Vec8uq(Vec8q(a) ^ Vec8q(b)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8uq select (Vec8qb const s, Vec8uq const a, Vec8uq const b) { + return Vec8uq(select(s, Vec8q(a), Vec8q(b))); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
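// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch, not part of the patch: unsigned 64-bit
// lanes get a logical right shift and unsigned ordered comparisons from the
// operators above. Assumes vectorclass.h on an AVX512F target; uq_demo is a
// hypothetical name.
static inline Vec8uq uq_demo(Vec8uq const a, Vec8uq const b) {
    Vec8uq rough_avg = (a >> 1) + (b >> 1);  // logical shifts: no sign propagation (floor average, carry lost)
    Vec8qb gt        = a > b;                // unsigned comparison mask
    return select(gt, a, rough_avg);         // keep a where it is the larger value
}
// ---------------------------------------------------------------------------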
(a[i] + b[i]) : a[i] +static inline Vec8uq if_add (Vec8qb const f, Vec8uq const a, Vec8uq const b) { + return _mm512_mask_add_epi64(a, (uint8_t)f, a, b); +} + +// Conditional subtract +static inline Vec8uq if_sub (Vec8qb const f, Vec8uq const a, Vec8uq const b) { + return _mm512_mask_sub_epi64(a, (uint8_t)f, a, b); +} + +// Conditional multiply +static inline Vec8uq if_mul (Vec8qb const f, Vec8uq const a, Vec8uq const b) { +#if INSTRSET >= 10 + return _mm512_mask_mullo_epi64(a, f, a, b); // AVX512DQ +#else + return select(f, a*b, a); +#endif +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint64_t horizontal_add (Vec8uq const a) { + return (uint64_t)horizontal_add(Vec8q(a)); +} + +// function max: a > b ? a : b +static inline Vec8uq max(Vec8uq const a, Vec8uq const b) { + return _mm512_max_epu64(a, b); +} + +// function min: a < b ? a : b +static inline Vec8uq min(Vec8uq const a, Vec8uq const b) { + return _mm512_min_epu64(a, b); +} + + +/***************************************************************************** +* +* Vector permute functions +* +****************************************************************************** +* +* These permute functions can reorder the elements of a vector and optionally +* set some elements to zero. See Vectori128.h for description +* +*****************************************************************************/ + +// Permute vector of 8 64-bit integers. +// Index -1 gives 0, index V_DC means don't care. +template +static inline Vec8q permute8(Vec8q const a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m512i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm512_setzero_si512(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<8>(indexs); // permutation pattern + constexpr uint8_t ppat = (L.a[0] & 3) | (L.a[1]<<2 & 0xC) | (L.a[2]<<4 & 0x30) | (L.a[3]<<6 & 0xC0); + y = _mm512_shuffle_i64x2(a, a, ppat); + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in all lanes + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm512_unpackhi_epi64(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm512_unpacklo_epi64(y, y); + } + else { // general permute + y = _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)uint8_t(flags >> perm_ipattern)); + } + } + else { // different patterns in all lanes + if constexpr ((flags & perm_rotate_big) != 0) {// fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + y = _mm512_alignr_epi64 (y, y, rot); + } + else if constexpr ((flags & perm_broadcast) != 0) { // broadcast one element + constexpr int e = flags >> perm_rot_count; + if constexpr(e != 0) { + y = _mm512_alignr_epi64(y, y, e); + } + y = _mm512_broadcastq_epi64(_mm512_castsi512_si128(y)); + } + else if constexpr ((flags & perm_compress) != 0) { + y = _mm512_maskz_compress_epi64(__mmask8(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = 
_mm512_maskz_expand_epi64(__mmask8(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. Use pshufb + const EList bm = pshufb_mask(indexs); + return _mm512_shuffle_epi8(y, Vec8q().load(bm.a)); + } + else { + // full permute needed + const __m512i pmask = constant16ui < + i0 & 7, 0, i1 & 7, 0, i2 & 7, 0, i3 & 7, 0, i4 & 7, 0, i5 & 7, 0, i6 & 7, 0, i7 & 7, 0>(); + y = _mm512_permutexvar_epi64(pmask, y); + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi64(zero_mask<8>(indexs), y); + } + return y; +} + +template +static inline Vec8uq permute8(Vec8uq const a) { + return Vec8uq (permute8 (Vec8q(a))); +} + + +// Permute vector of 16 32-bit integers. +// Index -1 gives 0, index V_DC means don't care. +template +static inline Vec16i permute16(Vec16i const a) { + int constexpr indexs[16] = { // indexes as array + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; + __m512i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm512_setzero_si512(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<16>(indexs); // permutation pattern + y = permute8 (Vec8q(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in all lanes + if constexpr ((flags & perm_punpckh) != 0) { // fits punpckhi + y = _mm512_unpackhi_epi32(y, y); + } + else if constexpr ((flags & perm_punpckl)!=0){ // fits punpcklo + y = _mm512_unpacklo_epi32(y, y); + } + else { // general permute + y = _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)uint8_t(flags >> perm_ipattern)); + } + } + else { // different patterns in all lanes + if constexpr ((flags & perm_rotate_big) != 0) {// fits big rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotation count + return _mm512_maskz_alignr_epi32 (zero_mask<16>(indexs), y, y, rot); + } + else if constexpr ((flags & perm_broadcast) != 0) { // broadcast one element + constexpr int e = flags >> perm_rot_count; // element index + if constexpr(e != 0) { + y = _mm512_alignr_epi32(y, y, e); + } + y = _mm512_broadcastd_epi32(_mm512_castsi512_si128(y)); + } + else if constexpr ((flags & perm_zext) != 0) { + y = _mm512_cvtepu32_epi64(_mm512_castsi512_si256(y)); // zero extension + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_compress) != 0) { + y = _mm512_maskz_compress_epi32(__mmask16(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm512_maskz_expand_epi32(__mmask16(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. 
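// Illustrative usage sketch (not from the upstream VCL sources): a minimal example
// of the compile-time permute defined above, assuming "vectorclass.h" is included
// and the translation unit targets AVX512F (or falls back to the emulated headers).
// The function names are hypothetical. Index -1 yields a zero element and V_DC
// marks an element whose value does not matter.
static inline Vec8q reverse_lanes_sketch(Vec8q const v) {
    return permute8<7, 6, 5, 4, 3, 2, 1, 0>(v);    // reverse the eight 64-bit elements
}
static inline Vec8q shift_up_insert_zero_sketch(Vec8q const v) {
    return permute8<-1, 0, 1, 2, 3, 4, 5, 6>(v);   // shift elements up by one, zero enters lane 0
}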
Use pshufb + const EList bm = pshufb_mask(indexs); + return _mm512_shuffle_epi8(a, Vec16i().load(bm.a)); + } + else { + // full permute needed + const __m512i pmask = constant16ui < + i0 & 15, i1 & 15, i2 & 15, i3 & 15, i4 & 15, i5 & 15, i6 & 15, i7 & 15, + i8 & 15, i9 & 15, i10 & 15, i11 & 15, i12 & 15, i13 & 15, i14 & 15, i15 & 15>(); + return _mm512_maskz_permutexvar_epi32(zero_mask<16>(indexs), pmask, a); + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi32(zero_mask<16>(indexs), y); + } + return y; +} + +template +static inline Vec16ui permute16(Vec16ui const a) { + return Vec16ui (permute16 (Vec16i(a))); +} + + +/***************************************************************************** +* +* Vector blend functions +* +*****************************************************************************/ + +// permute and blend Vec8q +template +static inline Vec8q blend8(Vec8q const a, Vec8q const b) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + __m512i y = a; // result + constexpr uint64_t flags = blend_flags(indexs); // get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm512_setzero_si512(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute8 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<8, 2>(indexs); // get permutation indexes + return permute8 < L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15] > (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint8_t mb = (uint8_t)make_bit_mask<8, 0x303>(indexs); // blend mask + y = _mm512_mask_mov_epi64 (a, mb, b); + } + else if constexpr ((flags & blend_rotate_big) != 0) { // full rotate + constexpr uint8_t rot = uint8_t(flags >> blend_rotpattern); // rotate count + if constexpr (rot < 8) { + y = _mm512_alignr_epi64(b, a, rot); + } + else { + y = _mm512_alignr_epi64(a, b, rot & 7); + } + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 128-bit blocks + constexpr EList L = largeblock_perm<8>(indexs); // get 128-bit blend pattern + constexpr uint8_t shuf = (L.a[0] & 3) | (L.a[1] & 3) << 2 | (L.a[2] & 3) << 4 | (L.a[3] & 3) << 6; + if constexpr (make_bit_mask<8, 0x103>(indexs) == 0) { // fits vshufi64x2 (a,b) + y = _mm512_shuffle_i64x2(a, b, shuf); + } + else if constexpr (make_bit_mask<8, 0x203>(indexs) == 0) { // fits vshufi64x2 (b,a) + y = _mm512_shuffle_i64x2(b, a, shuf); + } + else { + const EList bm = perm_mask_broad(indexs); // full permute + y = _mm512_permutex2var_epi64(a, Vec8q().load(bm.a), b); + } + } + // check if pattern fits special cases + else if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm512_unpacklo_epi64 (a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm512_unpacklo_epi64 (b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm512_unpackhi_epi64 (a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm512_unpackhi_epi64 (b, a); + } +#if ALLOW_FP_PERMUTE // allow floating point permute instructions on integer vectors + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = 
_mm512_castpd_si512(_mm512_shuffle_pd(_mm512_castsi512_pd(a), _mm512_castsi512_pd(b), uint8_t(flags >> blend_shufpattern))); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm512_castpd_si512(_mm512_shuffle_pd(_mm512_castsi512_pd(b), _mm512_castsi512_pd(a), uint8_t(flags >> blend_shufpattern))); + } +#else + // we might use 2 x _mm512_mask(z)_shuffle_epi32 like in blend16 below +#endif + else { // No special cases + const EList bm = perm_mask_broad(indexs); // full permute + y = _mm512_permutex2var_epi64(a, Vec8q().load(bm.a), b); + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi64(zero_mask<8>(indexs), y); + } + return y; +} + +template +static inline Vec8uq blend8(Vec8uq const a, Vec8uq const b) { + return Vec8uq( blend8 (Vec8q(a),Vec8q(b))); +} + + +// permute and blend Vec16i +template +static inline Vec16i blend16(Vec16i const a, Vec16i const b) { + int constexpr indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15}; // indexes as array + __m512i y = a; // result + constexpr uint64_t flags = blend_flags(indexs);// get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm512_setzero_si512(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute16 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<16, 2>(indexs); // get permutation indexes + return permute16 < + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31] > (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint16_t mb = (uint16_t)make_bit_mask<16, 0x304>(indexs); // blend mask + y = _mm512_mask_mov_epi32 (a, mb, b); + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 64-bit blocks + constexpr EList L = largeblock_perm<16>(indexs); // get 64-bit blend pattern + y = blend8 + (Vec8q(a), Vec8q(b)); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & blend_same_pattern) != 0) { + // same pattern in all 128-bit lanes. check if pattern fits special cases + if constexpr ((flags & blend_punpcklab) != 0) { + y = _mm512_unpacklo_epi32(a, b); + } + else if constexpr ((flags & blend_punpcklba) != 0) { + y = _mm512_unpacklo_epi32(b, a); + } + else if constexpr ((flags & blend_punpckhab) != 0) { + y = _mm512_unpackhi_epi32(a, b); + } + else if constexpr ((flags & blend_punpckhba) != 0) { + y = _mm512_unpackhi_epi32(b, a); + } +#if ALLOW_FP_PERMUTE // allow floating point permute instructions on integer vectors + else if constexpr ((flags & blend_shufab) != 0) { // use floating point instruction shufpd + y = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(a), _mm512_castsi512_ps(b), uint8_t(flags >> blend_shufpattern))); + } + else if constexpr ((flags & blend_shufba) != 0) { // use floating point instruction shufpd + y = _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(b), _mm512_castsi512_ps(a), uint8_t(flags >> blend_shufpattern))); + } +#endif + else { + // Use vpshufd twice. 
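// Illustrative usage sketch (not from the upstream VCL sources): how blend8, defined
// above, combines two vectors, assuming "vectorclass.h" is included. Template indexes
// 0..7 select from the first operand, 8..15 from the second, and -1 yields zero.
// The function name is hypothetical.
static inline Vec8q interleave_low_sketch(Vec8q const a, Vec8q const b) {
    // result = { a0, b0, a1, b1, a2, b2, a3, b3 }
    return blend8<0, 8, 1, 9, 2, 10, 3, 11>(a, b);
}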
This generates two instructions in the dependency chain, + // but we are avoiding the slower lane-crossing instruction, and saving 64 + // bytes of data cache. + auto shuf = [](int const (&a)[16]) constexpr { // get pattern for vpshufd + int pat[4] = {-1,-1,-1,-1}; + for (int i = 0; i < 16; i++) { + int ix = a[i]; + if (ix >= 0 && pat[i&3] < 0) { + pat[i&3] = ix; + } + } + return (pat[0] & 3) | (pat[1] & 3) << 2 | (pat[2] & 3) << 4 | (pat[3] & 3) << 6; + }; + constexpr uint8_t pattern = uint8_t(shuf(indexs)); // permute pattern + constexpr uint16_t froma = (uint16_t)make_bit_mask<16, 0x004>(indexs); // elements from a + constexpr uint16_t fromb = (uint16_t)make_bit_mask<16, 0x304>(indexs); // elements from b + y = _mm512_maskz_shuffle_epi32( froma, a, (_MM_PERM_ENUM) pattern); + y = _mm512_mask_shuffle_epi32 (y, fromb, b, (_MM_PERM_ENUM) pattern); + return y; // we have already zeroed any unused elements + } + } + else if constexpr ((flags & blend_rotate_big) != 0) { // full rotate + constexpr uint8_t rot = uint8_t(flags >> blend_rotpattern); // rotate count + if constexpr (rot < 16) { + y = _mm512_alignr_epi32(b, a, rot); + } + else { + y = _mm512_alignr_epi32(a, b, rot & 0x0F); + } + } + + else { // No special cases + const EList bm = perm_mask_broad(indexs); // full permute + y = _mm512_permutex2var_epi32(a, Vec16i().load(bm.a), b); + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi32(zero_mask<16>(indexs), y); + } + return y; +} + +template +static inline Vec16ui blend16(Vec16ui const a, Vec16ui const b) { + return Vec16ui( blend16 (Vec16i(a),Vec16i(b))); +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. +* The table is given as one or more vectors or as an array. +* +*****************************************************************************/ + +static inline Vec16i lookup16(Vec16i const index, Vec16i const table) { + return _mm512_permutexvar_epi32(index, table); +} + +static inline Vec16i lookup32(Vec16i const index, Vec16i const table1, Vec16i const table2) { + return _mm512_permutex2var_epi32(table1, index, table2); +} + +static inline Vec16i lookup64(Vec16i const index, Vec16i const table1, Vec16i const table2, Vec16i const table3, Vec16i const table4) { + Vec16i d12 = _mm512_permutex2var_epi32(table1, index, table2); + Vec16i d34 = _mm512_permutex2var_epi32(table3, index, table4); + return select((index >> 5) != 0, d34, d12); +} + +template +static inline Vec16i lookup(Vec16i const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 16) { + Vec16i table1 = Vec16i().load(table); + return lookup16(index, table1); + } + if constexpr (n <= 32) { + Vec16i table1 = Vec16i().load(table); + Vec16i table2 = Vec16i().load((int8_t*)table + 64); + return _mm512_permutex2var_epi32(table1, index, table2); + } + // n > 32. 
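// Illustrative usage sketch (not from the upstream VCL sources): a register-resident
// table lookup with lookup16, assuming "vectorclass.h" is included. Each index element
// is expected in the range 0..15; the array-based lookup<n> handles larger tables held
// in memory. The function name is hypothetical.
static inline Vec16i table_lookup_sketch(Vec16i const index, const int32_t (&table)[16]) {
    Vec16i t = Vec16i().load(table);   // load the 16-entry table into one vector
    return lookup16(index, t);         // result[i] = table[index[i]]
}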
Limit index + Vec16ui index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec16ui(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec16ui(index), uint32_t(n-1)); + } + return _mm512_i32gather_epi32(index1, (const int*)table, 4); + // return _mm512_i32gather_epi32(index1, table, _MM_UPCONV_EPI32_NONE, 4, 0); +} + + +static inline Vec8q lookup8(Vec8q const index, Vec8q const table) { + return _mm512_permutexvar_epi64(index, table); +} + +template +static inline Vec8q lookup(Vec8q const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) { + Vec8q table1 = Vec8q().load(table); + return lookup8(index, table1); + } + if constexpr (n <= 16) { + Vec8q table1 = Vec8q().load(table); + Vec8q table2 = Vec8q().load((int8_t*)table + 64); + return _mm512_permutex2var_epi64(table1, index, table2); + } + // n > 16. Limit index + Vec8uq index1; + if constexpr ((n & (n-1)) == 0) { + // n is a power of 2, make index modulo n + index1 = Vec8uq(index) & (n-1); + } + else { + // n is not a power of 2, limit to n-1 + index1 = min(Vec8uq(index), uint32_t(n-1)); + } + return _mm512_i64gather_epi64(index1, (const long long*)table, 8); +} + + +/***************************************************************************** +* +* Gather functions with fixed indexes +* +*****************************************************************************/ +// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15 +template +static inline Vec16i gather16i(void const * a) { + int constexpr indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 }; + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 15) { + // load one contiguous block and permute + if constexpr (imax > 15) { + // make sure we don't read past the end of the array + Vec16i b = Vec16i().load((int32_t const *)a + imax-15); + return permute16 (b); + } + else { + Vec16i b = Vec16i().load((int32_t const *)a + imin); + return permute16 (b); + } + } + if constexpr ((i0imax-16) && (i1imax-16) && (i2imax-16) && (i3imax-16) + && (i4imax-16) && (i5imax-16) && (i6imax-16) && (i7imax-16) + && (i8imax-16) && (i9imax-16) && (i10imax-16) && (i11imax-16) + && (i12imax-16) && (i13imax-16) && (i14imax-16) && (i15imax-16) ) { + // load two contiguous blocks and blend + Vec16i b = Vec16i().load((int32_t const *)a + imin); + Vec16i c = Vec16i().load((int32_t const *)a + imax-15); + const int j0 = i0 (b, c); + } + // use gather instruction + return _mm512_i32gather_epi32(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const int *)a, 4); +} + + +template +static inline Vec8q gather8q(void const * a) { + int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array + constexpr int imin = min_index(indexs); + constexpr int imax = max_index(indexs); + static_assert(imin >= 0, "Negative index in gather function"); + + if constexpr (imax - imin <= 7) { + // load one contiguous block and permute + if constexpr (imax > 7) { + // make sure we don't read past the end of the array + Vec8q b = Vec8q().load((int64_t const *)a + imax-7); + return permute8 (b); + } + else { + Vec8q b = Vec8q().load((int64_t const *)a + imin); + return permute8 (b); + } + } + if constexpr ((i0imax-8) && (i1imax-8) && (i2imax-8) && (i3imax-8) + && (i4imax-8) && (i5imax-8) && 
(i6imax-8) && (i7imax-8)) { + // load two contiguous blocks and blend + Vec8q b = Vec8q().load((int64_t const *)a + imin); + Vec8q c = Vec8q().load((int64_t const *)a + imax-7); + const int j0 = i0(b, c); + } + // use gather instruction + return _mm512_i64gather_epi64(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const long long *)a, 8); +} + +/***************************************************************************** +* +* Vector scatter functions +* +****************************************************************************** +* +* These functions write the elements of a vector to arbitrary positions in an +* array in memory. Each vector element is written to an array position +* determined by an index. An element is not written if the corresponding +* index is out of range. +* The indexes can be specified as constant template parameters or as an +* integer vector. +* +*****************************************************************************/ + +template + static inline void scatter(Vec16i const data, void * array) { + __m512i indx = constant16ui(); + Vec16ib mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0, + i8>=0, i9>=0, i10>=0, i11>=0, i12>=0, i13>=0, i14>=0, i15>=0); + _mm512_mask_i32scatter_epi32((int*)array, mask, indx, data, 4); +} + +template +static inline void scatter(Vec8q const data, void * array) { + __m256i indx = constant8ui(); + Vec8qb mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0); + _mm512_mask_i32scatter_epi64((long long *)array, mask, indx, data, 8); +} + + +/***************************************************************************** +* +* Scatter functions with variable indexes +* +*****************************************************************************/ + +static inline void scatter(Vec16i const index, uint32_t limit, Vec16i const data, void * destination) { + Vec16ib mask = Vec16ui(index) < limit; + _mm512_mask_i32scatter_epi32((int*)destination, mask, index, data, 4); +} + +static inline void scatter(Vec8q const index, uint32_t limit, Vec8q const data, void * destination) { + Vec8qb mask = Vec8uq(index) < uint64_t(limit); + _mm512_mask_i64scatter_epi64((long long *)destination, (uint8_t)mask, index, data, 8); +} + +static inline void scatter(Vec8i const index, uint32_t limit, Vec8q const data, void * destination) { +#if INSTRSET >= 10 // __AVX512VL__ + __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit)); +#else + __mmask16 mask = _mm512_mask_cmplt_epu32_mask(0xFFu, _mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit))); +#endif + _mm512_mask_i32scatter_epi64((long long *)destination, (uint8_t)mask, index, data, 8); +} + + +/***************************************************************************** +* +* Functions for conversion between integer sizes +* +*****************************************************************************/ + +// Extend 32-bit integers to 64-bit integers, signed and unsigned + +// Function extend_low : extends the low 8 elements to 64 bits with sign extension +static inline Vec8q extend_low (Vec16i const a) { + return _mm512_cvtepi32_epi64(a.get_low()); +} + +// Function extend_high : extends the high 8 elements to 64 bits with sign extension +static inline Vec8q extend_high (Vec16i const a) { + return _mm512_cvtepi32_epi64(a.get_high()); +} + +// Function extend_low : extends the low 8 elements to 64 bits with zero extension +static inline Vec8uq extend_low (Vec16ui const a) { + return _mm512_cvtepu32_epi64(a.get_low()); +} + +// Function extend_high : extends the high 8 elements to 
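// Illustrative usage sketch (not from the upstream VCL sources): combining a fixed-index
// gather with a variable-index scatter, assuming "vectorclass.h" is included. gather8q
// reads from compile-time positions; scatter writes each element to dst[index[i]] and
// skips any index at or above the limit. Names are hypothetical; src must provide at
// least 15 readable elements for this index pattern.
static inline void gather_scatter_sketch(const int64_t * src, int64_t * dst) {
    Vec8q v = gather8q<0, 2, 4, 6, 8, 10, 12, 14>(src);   // every second element of src
    Vec8q idx(0, 1, 2, 3, 4, 5, 6, 7);
    scatter(idx, 8, v, dst);                               // dst[idx[i]] = v[i] for idx[i] < 8
}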
64 bits with zero extension +static inline Vec8uq extend_high (Vec16ui const a) { + return _mm512_cvtepu32_epi64(a.get_high()); +} + +// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Overflow wraps around +static inline Vec16i compress (Vec8q const low, Vec8q const high) { + Vec8i low2 = _mm512_cvtepi64_epi32(low); + Vec8i high2 = _mm512_cvtepi64_epi32(high); + return Vec16i(low2, high2); +} +static inline Vec16ui compress (Vec8uq const low, Vec8uq const high) { + return Vec16ui(compress(Vec8q(low), Vec8q(high))); +} + +// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Signed, with saturation +static inline Vec16i compress_saturated (Vec8q const low, Vec8q const high) { + Vec8i low2 = _mm512_cvtsepi64_epi32(low); + Vec8i high2 = _mm512_cvtsepi64_epi32(high); + return Vec16i(low2, high2); +} + +// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Unsigned, with saturation +static inline Vec16ui compress_saturated (Vec8uq const low, Vec8uq const high) { + Vec8ui low2 = _mm512_cvtusepi64_epi32(low); + Vec8ui high2 = _mm512_cvtusepi64_epi32(high); + return Vec16ui(low2, high2); +} + + +/***************************************************************************** +* +* Integer division operators +* +* Please see the file vectori128.h for explanation. +* +*****************************************************************************/ + +// vector operator / : divide each element by divisor + +// vector of 16 32-bit signed integers +static inline Vec16i operator / (Vec16i const a, Divisor_i const d) { + __m512i m = _mm512_broadcast_i32x4(d.getm()); // broadcast multiplier + __m512i sgn = _mm512_broadcast_i32x4(d.getsign()); // broadcast sign of d + __m512i t1 = _mm512_mul_epi32(a,m); // 32x32->64 bit signed multiplication of even elements of a + __m512i t3 = _mm512_srli_epi64(a,32); // get odd elements of a into position for multiplication + __m512i t4 = _mm512_mul_epi32(t3,m); // 32x32->64 bit signed multiplication of odd elements + __m512i t2 = _mm512_srli_epi64(t1,32); // dword of even index results + __m512i t7 = _mm512_mask_mov_epi32(t2, 0xAAAA, t4); // blend two results + __m512i t8 = _mm512_add_epi32(t7,a); // add + __m512i t9 = _mm512_sra_epi32(t8,d.gets1()); // shift right artihmetic + __m512i t10 = _mm512_srai_epi32(a,31); // sign of a + __m512i t11 = _mm512_sub_epi32(t10,sgn); // sign of a - sign of d + __m512i t12 = _mm512_sub_epi32(t9,t11); // + 1 if a < 0, -1 if d < 0 + return _mm512_xor_si512(t12,sgn); // change sign if divisor negative +} + +// vector of 16 32-bit unsigned integers +static inline Vec16ui operator / (Vec16ui const a, Divisor_ui const d) { + __m512i m = _mm512_broadcast_i32x4(d.getm()); // broadcast multiplier + __m512i t1 = _mm512_mul_epu32(a,m); // 32x32->64 bit unsigned multiplication of even elements of a + __m512i t3 = _mm512_srli_epi64(a,32); // get odd elements of a into position for multiplication + __m512i t4 = _mm512_mul_epu32(t3,m); // 32x32->64 bit unsigned multiplication of odd elements + __m512i t2 = _mm512_srli_epi64(t1,32); // high dword of even index results + __m512i t7 = _mm512_mask_mov_epi32(t2, 0xAAAA, t4); // blend two results + __m512i t8 = _mm512_sub_epi32(a,t7); // subtract + __m512i t9 = _mm512_srl_epi32(t8,d.gets1()); // shift right logical + __m512i t10 = 
_mm512_add_epi32(t7,t9); // add + return _mm512_srl_epi32(t10,d.gets2()); // shift right logical +} + +// vector operator /= : divide +static inline Vec16i & operator /= (Vec16i & a, Divisor_i const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const d) { + a = a / d; + return a; +} + + +/***************************************************************************** +* +* Integer division 2: divisor is a compile-time constant +* +*****************************************************************************/ + +// Divide Vec16i by compile-time constant +template +static inline Vec16i divide_by_i(Vec16i const x) { + static_assert(d != 0, "Integer division by zero"); + if constexpr (d == 1) return x; + if constexpr (d == -1) return -x; + if constexpr (uint32_t(d) == 0x80000000u) { + return _mm512_maskz_set1_epi32(x == Vec16i(0x80000000), 1); // avoid overflow of abs(d). return (x == 0x80000000) ? 1 : 0; + } + constexpr uint32_t d1 = d > 0 ? uint32_t(d) : uint32_t(-d); // compile-time abs(d). (force compiler to treat d as 32 bits, not 64 bits) + if constexpr ((d1 & (d1-1)) == 0) { + // d1 is a power of 2. use shift + constexpr int k = bit_scan_reverse_const(d1); + __m512i sign; + if constexpr (k > 1) sign = _mm512_srai_epi32(x, k-1); else sign = x; // k copies of sign bit + __m512i bias = _mm512_srli_epi32(sign, 32-k); // bias = x >= 0 ? 0 : k-1 + __m512i xpbias = _mm512_add_epi32 (x, bias); // x + bias + __m512i q = _mm512_srai_epi32(xpbias, k); // (x + bias) >> k + if (d > 0) return q; // d > 0: return q + return _mm512_sub_epi32(_mm512_setzero_epi32(), q); // d < 0: return -q + } + // general case + constexpr int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1); // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case) + constexpr int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32)); // multiplier + const Divisor_i div(mult, sh, d < 0 ? -1 : 0); + return x / div; +} + +// define Vec8i a / const_int(d) +template +static inline Vec16i operator / (Vec16i const a, Const_int_t) { + return divide_by_i(a); +} + +// define Vec16i a / const_uint(d) +template +static inline Vec16i operator / (Vec16i const a, Const_uint_t) { + static_assert(d < 0x80000000u, "Dividing signed integer by overflowing unsigned"); + return divide_by_i(a); // signed divide +} + +// vector operator /= : divide +template +static inline Vec16i & operator /= (Vec16i & a, Const_int_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec16i & operator /= (Vec16i & a, Const_uint_t b) { + a = a / b; + return a; +} + + +// Divide Vec16ui by compile-time constant +template +static inline Vec16ui divide_by_ui(Vec16ui const x) { + static_assert(d != 0, "Integer division by zero"); + if constexpr (d == 1) return x; // divide by 1 + constexpr int b = bit_scan_reverse_const(d); // floor(log2(d)) + if constexpr ((uint32_t(d) & (uint32_t(d)-1)) == 0) { + // d is a power of 2. use shift + return _mm512_srli_epi32(x, b); // x >> b + } + // general case (d > 2) + constexpr uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d); // multiplier = 2^(32+b) / d + constexpr uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d + constexpr bool round_down = (2*rem < d); // check if fraction is less than 0.5 + constexpr uint32_t mult1 = round_down ? 
mult : mult + 1; + + // do 32*32->64 bit unsigned multiplication and get high part of result + const __m512i multv = _mm512_maskz_set1_epi32(0x5555, mult1); // zero-extend mult and broadcast + __m512i t1 = _mm512_mul_epu32(x,multv); // 32x32->64 bit unsigned multiplication of even elements + if constexpr (round_down) { + t1 = _mm512_add_epi64(t1,multv); // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow + } + __m512i t2 = _mm512_srli_epi64(t1,32); // high dword of result 0 and 2 + __m512i t3 = _mm512_srli_epi64(x,32); // get odd elements into position for multiplication + __m512i t4 = _mm512_mul_epu32(t3,multv); // 32x32->64 bit unsigned multiplication of x[1] and x[3] + if constexpr (round_down) { + t4 = _mm512_add_epi64(t4,multv); // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow + } + __m512i t7 = _mm512_mask_mov_epi32(t2, 0xAAAA, t4); // blend two results + Vec16ui q = _mm512_srli_epi32(t7, b); // shift right by b + return q; // no overflow possible +} + +// define Vec8ui a / const_uint(d) +template +static inline Vec16ui operator / (Vec16ui const a, Const_uint_t) { + return divide_by_ui(a); +} + +// define Vec8ui a / const_int(d) +template +static inline Vec16ui operator / (Vec16ui const a, Const_int_t) { + static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous"); + return divide_by_ui(a); // unsigned divide +} + +// vector operator /= : divide +template +static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t b) { + a = a / b; + return a; +} + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORI512_H diff --git a/DFTTest/VCL2/vectori512e.h b/DFTTest/VCL2/vectori512e.h new file mode 100644 index 0000000..e1dc4d9 --- /dev/null +++ b/DFTTest/VCL2/vectori512e.h @@ -0,0 +1,2342 @@ +/**************************** vectori512e.h ******************************* +* Author: Agner Fog +* Date created: 2014-07-23 +* Last modified: 2020-03-26 +* Version: 2.01.02 +* Project: vector classes +* Description: +* Header file defining 512-bit integer vector classes for 32 and 64 bit integers. +* Emulated for processors without AVX512 instruction set. +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec16i Vector of 16 32-bit signed integers +* Vec16ui Vector of 16 32-bit unsigned integers +* Vec16ib Vector of 16 Booleans for use with Vec16i and Vec16ui +* Vec8q Vector of 8 64-bit signed integers +* Vec8uq Vector of 8 64-bit unsigned integers +* Vec8qb Vector of 8 Booleans for use with Vec8q and Vec8uq +* +* Each vector object is represented internally in the CPU as two 256-bit registers. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. 
+*****************************************************************************/ + +#ifndef VECTORI512E_H +#define VECTORI512E_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +// check combination of header files +#if defined (VECTORI512_H) +#error Two different versions of vectori512.h included +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + + +/***************************************************************************** +* +* Vector of 512 bits +* +*****************************************************************************/ + +class Vec512b { +protected: + Vec256b z0; // low half + Vec256b z1; // high half +public: + // Default constructor: + Vec512b() { + } + // Constructor to build from two Vec256b: + Vec512b(Vec256b const a0, Vec256b const a1) { + z0 = a0; z1 = a1; + } + // Member function to load from array (unaligned) + Vec512b & load(void const * p) { + z0 = Vec8i().load(p); + z1 = Vec8i().load((int32_t const*)p+8); + return *this; + } + // Member function to load from array, aligned by 64 + Vec512b & load_a(void const * p) { + z0 = Vec8i().load_a(p); + z1 = Vec8i().load_a((int32_t const*)p+8); + return *this; + } + // Member function to store into array (unaligned) + void store(void * p) const { + Vec8i(z0).store(p); + Vec8i(z1).store((int32_t*)p+8); + } + // Member function to store into array, aligned by 64 + void store_a(void * p) const { + Vec8i(z0).store_a(p); + Vec8i(z1).store_a((int32_t*)p+8); + } + // Member function storing to aligned uncached memory (non-temporal store). + // This may be more efficient than store_a when storing large blocks of memory if it + // is unlikely that the data will stay in the cache until it is read again. 
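// Illustrative usage sketch (not from the upstream VCL sources): the emulated Vec512b
// stores its data in two 256-bit halves, but its interface matches the AVX512 version,
// so code like the following compiles unchanged against either header. The function
// name is hypothetical; store_a (and the non-temporal store_nt below) require a
// 64-byte aligned destination.
static inline void copy_512_bits_sketch(const void * src, void * dst_aligned64) {
    Vec512b v;
    v.load(src);                // unaligned 512-bit load (two 256-bit loads when emulated)
    v.store_a(dst_aligned64);   // aligned 512-bit store
}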
+ // Note: Will generate runtime error if p is not aligned by 64 + void store_nt(void * p) const { + Vec8i(z0).store_nt(p); + Vec8i(z1).store_nt((int32_t*)p+8); + } + Vec256b get_low() const { // get low half + return z0; + } + Vec256b get_high() const { // get high half + return z1; + } + static constexpr int size() { + return 512; + } +}; + +// Define operators for this class + +// vector operator & : bitwise and +static inline Vec512b operator & (Vec512b const a, Vec512b const b) { + return Vec512b(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} +static inline Vec512b operator && (Vec512b const a, Vec512b const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec512b operator | (Vec512b const a, Vec512b const b) { + return Vec512b(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} +static inline Vec512b operator || (Vec512b const a, Vec512b const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec512b operator ^ (Vec512b const a, Vec512b const b) { + return Vec512b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator ~ : bitwise not +static inline Vec512b operator ~ (Vec512b const a) { + return Vec512b(~a.get_low(), ~a.get_high()); +} + +// vector operator &= : bitwise and +static inline Vec512b & operator &= (Vec512b & a, Vec512b const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec512b & operator |= (Vec512b & a, Vec512b const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec512b & operator ^= (Vec512b & a, Vec512b const b) { + a = a ^ b; + return a; +} + +// Define functions for this class + +// function andnot: a & ~ b +static inline Vec512b andnot (Vec512b const a, Vec512b const b) { + return Vec512b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high())); +} + + +/***************************************************************************** +* +* Boolean vector (broad) base classes +* +*****************************************************************************/ + +class Vec16b : public Vec512b { +public: + // Default constructor: + Vec16b () { + } + // Constructor to build from all elements: + Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, + bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) { + *this = Vec512b(Vec8i(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7), Vec8i(-(int)b8, -(int)b9, -(int)b10, -(int)b11, -(int)b12, -(int)b13, -(int)b14, -(int)b15)); + } + // Constructor to convert from type Vec512b + Vec16b (Vec512b const & x) { // gcc requires const & here + z0 = x.get_low(); + z1 = x.get_high(); + } + // Constructor to make from two halves + Vec16b (Vec8ib const x0, Vec8ib const x1) { + z0 = x0; + z1 = x1; + } + // Constructor to make from two halves + Vec16b (Vec8i const x0, Vec8i const x1) { + z0 = x0; + z1 = x1; + } + // Constructor to broadcast single value: + Vec16b(bool b) { + z0 = z1 = Vec8i(-int32_t(b)); + } + // Assignment operator to broadcast scalar value: + Vec16b & operator = (bool b) { + z0 = z1 = Vec8i(-int32_t(b)); + return *this; + } + // split into two halves + Vec8ib get_low() const { + return Vec8ib(z0); + } + Vec8ib get_high() const { + return Vec8ib(z1); + } + /* + // Assignment operator to convert from type Vec512b + Vec16b & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } */ + // Member function to change a single 
element in vector + // Note: This function is inefficient. Use load function if changing more than one element + Vec16b const insert(int index, bool value) { + if ((uint32_t)index < 8) { + z0 = Vec8ib(z0).insert(index, value); + } + else { + z1 = Vec8ib(z1).insert(index-8, value); + } + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + if ((uint32_t)index < 8) { + return Vec8ib(z0).extract(index); + } + else { + return Vec8ib(z1).extract(index-8); + } + } + // Extract a single element. Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 16; + } + static constexpr int elementtype() { + return 3; + } + // Prevent constructing from int, etc. because of ambiguity + Vec16b(int b) = delete; + // Prevent assigning int because of ambiguity + Vec16b & operator = (int x) = delete; +}; + +// Define operators for this class + +// vector operator & : bitwise and +static inline Vec16b operator & (Vec16b const a, Vec16b const b) { + return Vec16b(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} +static inline Vec16b operator && (Vec16b const a, Vec16b const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec16b operator | (Vec16b const a, Vec16b const b) { + return Vec16b(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} +static inline Vec16b operator || (Vec16b const a, Vec16b const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec16b operator ^ (Vec16b const a, Vec16b const b) { + return Vec16b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator ~ : bitwise not +static inline Vec16b operator ~ (Vec16b const a) { + return Vec16b(~(a.get_low()), ~(a.get_high())); +} + +// vector operator ! : element not +static inline Vec16b operator ! (Vec16b const a) { + return ~a; +} + +// vector operator &= : bitwise and +static inline Vec16b & operator &= (Vec16b & a, Vec16b const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec16b & operator |= (Vec16b & a, Vec16b const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec16b & operator ^= (Vec16b & a, Vec16b const b) { + a = a ^ b; + return a; +} + +/***************************************************************************** +* +* Functions for boolean vectors +* +*****************************************************************************/ + +// function andnot: a & ~ b +static inline Vec16b andnot (Vec16b const a, Vec16b const b) { + return Vec16b(Vec8ib(andnot(a.get_low(),b.get_low())), Vec8ib(andnot(a.get_high(),b.get_high()))); +} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and (Vec16b const a) { + return horizontal_and(a.get_low() & a.get_high()); +} + +// horizontal_or. 
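// Illustrative usage sketch (not from the upstream VCL sources): element-wise comparisons
// of Vec16i (defined further down) produce a Vec16ib mask, and horizontal_and /
// horizontal_or reduce such a mask to a single bool, assuming "vectorclass.h" is
// included. The function name is hypothetical.
static inline bool all_elements_equal_sketch(Vec16i const a, Vec16i const b) {
    return horizontal_and(a == b);   // true only if all 16 elements match
}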
Returns true if at least one bit is 1 +static inline bool horizontal_or (Vec16b const a) { + return horizontal_or(a.get_low() | a.get_high()); +} + + +/***************************************************************************** +* +* Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui +* +*****************************************************************************/ + +class Vec16ib : public Vec16b { +public: + // Default constructor: + Vec16ib () { + } + /* + Vec16ib (Vec16b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } */ + // Constructor to build from all elements: + Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, + bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) { + z0 = Vec8ib(x0, x1, x2, x3, x4, x5, x6, x7); + z1 = Vec8ib(x8, x9, x10, x11, x12, x13, x14, x15); + } + // Constructor to convert from type Vec512b + Vec16ib (Vec512b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Construct from two halves + Vec16ib (Vec8ib const x0, Vec8ib const x1) { + z0 = x0; + z1 = x1; + } + // Assignment operator to convert from type Vec512b + Vec16ib & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Constructor to broadcast scalar value: + Vec16ib(bool b) : Vec16b(b) { + } + // Assignment operator to broadcast scalar value: + Vec16ib & operator = (bool b) { + *this = Vec16b(b); + return *this; + } + // Member function to change a bitfield to a boolean vector + Vec16ib & load_bits(uint16_t a) { + z0 = Vec8ib().load_bits(uint8_t(a)); + z1 = Vec8ib().load_bits(uint8_t(a>>8)); + return *this; + } + // Prevent constructing from int, etc. + Vec16ib(int b) = delete; + Vec16ib & operator = (int x) = delete; +}; + +// Define operators for Vec16ib + +// vector operator & : bitwise and +static inline Vec16ib operator & (Vec16ib const a, Vec16ib const b) { + return Vec16b(a) & Vec16b(b); +} +static inline Vec16ib operator && (Vec16ib const a, Vec16ib const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec16ib operator | (Vec16ib const a, Vec16ib const b) { + return Vec16b(a) | Vec16b(b); +} +static inline Vec16ib operator || (Vec16ib const a, Vec16ib const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec16ib operator ^ (Vec16ib const a, Vec16ib const b) { + return Vec16b(a) ^ Vec16b(b); +} + +// vector operator == : xnor +static inline Vec16ib operator == (Vec16ib const a, Vec16ib const b) { + return Vec16ib(Vec16b(a) ^ Vec16b(~b)); +} + +// vector operator != : xor +static inline Vec16ib operator != (Vec16ib const a, Vec16ib const b) { + return Vec16ib(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec16ib operator ~ (Vec16ib const a) { + return ~Vec16b(a); +} + +// vector operator ! : element not +static inline Vec16ib operator ! 
(Vec16ib const a) { + return ~a; +} + +// vector operator &= : bitwise and +static inline Vec16ib & operator &= (Vec16ib & a, Vec16ib const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec16ib & operator |= (Vec16ib & a, Vec16ib const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec16ib & operator ^= (Vec16ib & a, Vec16ib const b) { + a = a ^ b; + return a; +} + +// vector function andnot +static inline Vec16ib andnot (Vec16ib const a, Vec16ib const b) { + return Vec16ib(andnot(Vec16b(a), Vec16b(b))); +} + + +/***************************************************************************** +* +* Vec8b: Base class vector of 8 Booleans +* +*****************************************************************************/ + +class Vec8b : public Vec16b { +public: + // Default constructor: + Vec8b () { + } + /* + Vec8b (Vec16b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } */ + // Constructor to convert from type Vec512b + Vec8b (Vec512b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // construct from two halves + Vec8b (Vec4qb const x0, Vec4qb const x1) { + z0 = x0; + z1 = x1; + } + // Constructor to broadcast single value: + Vec8b(bool b) { + z0 = z1 = Vec8i(-int32_t(b)); + } + // Assignment operator to broadcast scalar value: + Vec8b & operator = (bool b) { + z0 = z1 = Vec8i(-int32_t(b)); + return *this; + } + // split into two halves + Vec4qb get_low() const { + return Vec4qb(z0); + } + Vec4qb get_high() const { + return Vec4qb(z1); + } + /* + // Assignment operator to convert from type Vec512b + Vec8b & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } */ + // Member function to change a single element in vector + Vec8b const insert(int index, bool value) { + if ((uint32_t)index < 4) { + z0 = Vec4qb(z0).insert(index, value); + } + else { + z1 = Vec4qb(z1).insert(index-4, value); + } + return *this; + } + bool extract(int index) const { + if ((uint32_t)index < 4) { + return Vec4qb(Vec4q(z0)).extract(index); + } + else { + return Vec4qb(Vec4q(z1)).extract(index-4); + } + } + bool operator [] (int index) const { + return extract(index); + } + static constexpr int size() { + return 8; + } + // Prevent constructing from int, etc. 
because of ambiguity + Vec8b(int b) = delete; + // Prevent assigning int because of ambiguity + Vec8b & operator = (int x) = delete; +}; + + +/***************************************************************************** +* +* Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8qu +* +*****************************************************************************/ + +class Vec8qb : public Vec8b { +public: + // Default constructor: + Vec8qb () { + } + Vec8qb (Vec16b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Constructor to build from all elements: + Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) { + z0 = Vec4qb(x0, x1, x2, x3); + z1 = Vec4qb(x4, x5, x6, x7); + } + // Constructor to convert from type Vec512b + Vec8qb (Vec512b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // construct from two halves + Vec8qb (Vec4qb const x0, Vec4qb const x1) { + z0 = x0; + z1 = x1; + } + // Assignment operator to convert from type Vec512b + Vec8qb & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Constructor to broadcast single value: + Vec8qb(bool b) : Vec8b(b) { + } + // Assignment operator to broadcast scalar value: + Vec8qb & operator = (bool b) { + *this = Vec8b(b); + return *this; + } + // Member function to change a bitfield to a boolean vector + Vec8qb & load_bits(uint8_t a) { + z0 = Vec4qb().load_bits(a); + z1 = Vec4qb().load_bits(uint8_t(a>>4u)); + return *this; + } + // Prevent constructing from int, etc. because of ambiguity + Vec8qb(int b) = delete; + // Prevent assigning int because of ambiguity + Vec8qb & operator = (int x) = delete; +}; + +// Define operators for Vec8qb + +// vector operator & : bitwise and +static inline Vec8qb operator & (Vec8qb const a, Vec8qb const b) { + return Vec16b(a) & Vec16b(b); +} +static inline Vec8qb operator && (Vec8qb const a, Vec8qb const b) { + return a & b; +} + +// vector operator | : bitwise or +static inline Vec8qb operator | (Vec8qb const a, Vec8qb const b) { + return Vec16b(a) | Vec16b(b); +} +static inline Vec8qb operator || (Vec8qb const a, Vec8qb const b) { + return a | b; +} + +// vector operator ^ : bitwise xor +static inline Vec8qb operator ^ (Vec8qb const a, Vec8qb const b) { + return Vec16b(a) ^ Vec16b(b); +} + +// vector operator == : xnor +static inline Vec8qb operator == (Vec8qb const a, Vec8qb const b) { + return Vec8qb(Vec16b(a) ^ Vec16b(~b)); +} + +// vector operator != : xor +static inline Vec8qb operator != (Vec8qb const a, Vec8qb const b) { + return Vec8qb(a ^ b); +} + +// vector operator ~ : bitwise not +static inline Vec8qb operator ~ (Vec8qb const a) { + return ~Vec16b(a); +} + +// vector operator ! : element not +static inline Vec8qb operator ! 
(Vec8qb const a) { + return ~a; +} + +// vector operator &= : bitwise and +static inline Vec8qb & operator &= (Vec8qb & a, Vec8qb const b) { + a = a & b; + return a; +} + +// vector operator |= : bitwise or +static inline Vec8qb & operator |= (Vec8qb & a, Vec8qb const b) { + a = a | b; + return a; +} + +// vector operator ^= : bitwise xor +static inline Vec8qb & operator ^= (Vec8qb & a, Vec8qb const b) { + a = a ^ b; + return a; +} + +// vector function andnot +static inline Vec8qb andnot (Vec8qb const a, Vec8qb const b) { + return Vec8qb(andnot(Vec16b(a), Vec16b(b))); +} + + +/***************************************************************************** +* +* Vector of 16 32-bit signed integers +* +*****************************************************************************/ + +class Vec16i: public Vec512b { +public: + // Default constructor: + Vec16i() { + } + // Constructor to broadcast the same value into all elements: + Vec16i(int i) { + z0 = z1 = Vec8i(i); + } + // Constructor to build from all elements: + Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7, + int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) { + z0 = Vec8i(i0, i1, i2, i3, i4, i5, i6, i7); + z1 = Vec8i(i8, i9, i10, i11, i12, i13, i14, i15); + } + // Constructor to build from two Vec8i: + Vec16i(Vec8i const a0, Vec8i const a1) { + *this = Vec512b(a0, a1); + } + // Constructor to convert from type Vec512b + Vec16i(Vec512b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Assignment operator to convert from type Vec512b + Vec16i & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Member function to load from array (unaligned) + Vec16i & load(void const * p) { + Vec512b::load(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec16i & load_a(void const * p) { + Vec512b::load_a(p); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec16i & load_partial(int n, void const * p) { + if (n < 8) { + z0 = Vec8i().load_partial(n, p); + z1 = Vec8i(0); + } + else { + z0 = Vec8i().load(p); + z1 = Vec8i().load_partial(n - 8, (int32_t const*)p + 8); + } + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { + if (n < 8) { + Vec8i(get_low()).store_partial(n, p); + } + else { + Vec8i(get_low()).store(p); + Vec8i(get_high()).store_partial(n - 8, (int32_t *)p + 8); + } + } + // cut off vector to n elements. The last 8-n elements are set to zero + Vec16i & cutoff(int n) { + if (n < 8) { + z0 = Vec8i(z0).cutoff(n); + z1 = Vec8i(0); + } + else { + z1 = Vec8i(z1).cutoff(n - 8); + } + return *this; + } + // Member function to change a single element in vector + Vec16i const insert(int index, int32_t value) { + if ((uint32_t)index < 8) { + z0 = Vec8i(z0).insert(index, value); + } + else { + z1 = Vec8i(z1).insert(index - 8, value); + } + return *this; + } + // Member function extract a single element from vector + int32_t extract(int index) const { + if ((uint32_t)index < 8) { + return Vec8i(z0).extract(index); + } + else { + return Vec8i(z1).extract(index - 8); + } + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
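// Illustrative usage sketch (not from the upstream VCL sources): load_bits turns an
// ordinary bitfield into a boolean vector, which can then drive select() or the if_*
// functions defined below, assuming "vectorclass.h" is included. The function name is
// hypothetical.
static inline Vec16i choose_by_bits_sketch(uint16_t bits, Vec16i const a, Vec16i const b) {
    Vec16ib m;
    m.load_bits(bits);        // bit i of 'bits' controls element i of the mask
    return select(m, a, b);   // per element: m[i] ? a[i] : b[i]
}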
+ int32_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec8i: + Vec8i get_low() const { + return Vec8i(z0); + } + Vec8i get_high() const { + return Vec8i(z1); + } + static constexpr int size() { + return 16; + } + static constexpr int elementtype() { + return 8; + } +}; + + +// Define operators for Vec16i + +// vector operator + : add element by element +static inline Vec16i operator + (Vec16i const a, Vec16i const b) { + return Vec16i(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator += : add +static inline Vec16i & operator += (Vec16i & a, Vec16i const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec16i operator ++ (Vec16i & a, int) { + Vec16i a0 = a; + a = a + 1; + return a0; +} + +// prefix operator ++ +static inline Vec16i & operator ++ (Vec16i & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec16i operator - (Vec16i const a, Vec16i const b) { + return Vec16i(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator - : unary minus +static inline Vec16i operator - (Vec16i const a) { + return Vec16i(-a.get_low(), -a.get_high()); +} + +// vector operator -= : subtract +static inline Vec16i & operator -= (Vec16i & a, Vec16i const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec16i operator -- (Vec16i & a, int) { + Vec16i a0 = a; + a = a - 1; + return a0; +} + +// prefix operator -- +static inline Vec16i & operator -- (Vec16i & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec16i operator * (Vec16i const a, Vec16i const b) { + return Vec16i(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator *= : multiply +static inline Vec16i & operator *= (Vec16i & a, Vec16i const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer. 
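// Illustrative usage sketch (not from the upstream VCL sources): integer division uses
// either a precomputed runtime divisor (Divisor_i) or a compile-time constant wrapped
// by const_int(), both of which these headers turn into multiply and shift sequences,
// assuming "vectorclass.h" is included. The function names are hypothetical.
static inline Vec16i divide_by_seven_sketch(Vec16i const a) {
    return a / const_int(7);    // divisor known at compile time
}
static inline Vec16i divide_by_runtime_sketch(Vec16i const a, int32_t d) {
    Divisor_i div(d);           // precompute multiplier and shift count once
    return a / div;             // then reuse 'div' for many divisions
}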
See bottom of file + +// vector operator << : shift left +static inline Vec16i operator << (Vec16i const a, int32_t b) { + return Vec16i(a.get_low() << b, a.get_high() << b); +} + +// vector operator <<= : shift left +static inline Vec16i & operator <<= (Vec16i & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec16i operator >> (Vec16i const a, int32_t b) { + return Vec16i(a.get_low() >> b, a.get_high() >> b); +} + +// vector operator >>= : shift right arithmetic +static inline Vec16i & operator >>= (Vec16i & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec16ib operator == (Vec16i const a, Vec16i const b) { + return Vec16ib(a.get_low() == b.get_low(), a.get_high() == b.get_high()); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec16ib operator != (Vec16i const a, Vec16i const b) { + return Vec16ib(a.get_low() != b.get_low(), a.get_high() != b.get_high()); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec16ib operator > (Vec16i const a, Vec16i const b) { + return Vec16ib(a.get_low() > b.get_low(), a.get_high() > b.get_high()); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec16ib operator < (Vec16i const a, Vec16i const b) { + return b > a; +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec16ib operator >= (Vec16i const a, Vec16i const b) { + return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec16ib operator <= (Vec16i const a, Vec16i const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec16i operator & (Vec16i const a, Vec16i const b) { + return Vec16i(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} + +// vector operator &= : bitwise and +static inline Vec16i & operator &= (Vec16i & a, Vec16i const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec16i operator | (Vec16i const a, Vec16i const b) { + return Vec16i(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} + +// vector operator |= : bitwise or +static inline Vec16i & operator |= (Vec16i & a, Vec16i const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec16i operator ^ (Vec16i const a, Vec16i const b) { + return Vec16i(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator ^= : bitwise xor +static inline Vec16i & operator ^= (Vec16i & a, Vec16i const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec16i operator ~ (Vec16i const a) { + return Vec16i(~(a.get_low()), ~(a.get_high())); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16i select (Vec16ib const s, Vec16i const a, Vec16i const b) { + return Vec16i(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] +static inline Vec16i if_add (Vec16ib const f, Vec16i const a, Vec16i const b) { + return Vec16i(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec16i if_sub (Vec16ib const f, Vec16i const a, Vec16i const b) { + return Vec16i(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec16i if_mul (Vec16ib const f, Vec16i const a, Vec16i const b) { + return Vec16i(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int32_t horizontal_add (Vec16i const a) { + return horizontal_add(a.get_low() + a.get_high()); +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec16i add_saturated(Vec16i const a, Vec16i const b) { + return Vec16i(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high())); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec16i sub_saturated(Vec16i const a, Vec16i const b) { + return Vec16i(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high())); +} + +// function max: a > b ? a : b +static inline Vec16i max(Vec16i const a, Vec16i const b) { + return Vec16i(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? a : b +static inline Vec16i min(Vec16i const a, Vec16i const b) { + return Vec16i(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} + +// function abs: a >= 0 ? a : -a +static inline Vec16i abs(Vec16i const a) { + return Vec16i(abs(a.get_low()), abs(a.get_high())); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec16i abs_saturated(Vec16i const a) { + return Vec16i(abs_saturated(a.get_low()), abs_saturated(a.get_high())); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec16i rotate_left(Vec16i const a, int b) { + return Vec16i(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); +} + + +/***************************************************************************** +* +* Vector of 16 32-bit unsigned integers +* +*****************************************************************************/ + +class Vec16ui : public Vec16i { +public: + // Default constructor: + Vec16ui() { + }; + // Constructor to broadcast the same value into all elements: + Vec16ui(uint32_t i) { + z0 = z1 = Vec8ui(i); + }; + // Constructor to build from all elements: + Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7, + uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) { + z0 = Vec8ui(i0, i1, i2, i3, i4, i5, i6, i7); + z1 = Vec8ui(i8, i9, i10, i11, i12, i13, i14, i15); + }; + // Constructor to build from two Vec8ui: + Vec16ui(Vec8ui const a0, Vec8ui const a1) { + z0 = a0; + z1 = a1; + } + // Constructor to convert from type Vec512b + Vec16ui(Vec512b const & x) { + *this = x; + } + // Assignment operator to convert from type Vec512b + Vec16ui & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Member function to load from array (unaligned) + Vec16ui & load(void const * p) { + Vec16i::load(p); + 
return *this; + } + // Member function to load from array, aligned by 64 + Vec16ui & load_a(void const * p) { + Vec16i::load_a(p); + return *this; + } + // Member function to change a single element in vector + Vec16ui const insert(int index, uint32_t value) { + Vec16i::insert(index, (int32_t)value); + return *this; + } + // Member function extract a single element from vector + uint32_t extract(int index) const { + return (uint32_t)Vec16i::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint32_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec4ui: + Vec8ui get_low() const { + return Vec8ui(Vec16i::get_low()); + } + Vec8ui get_high() const { + return Vec8ui(Vec16i::get_high()); + } + static constexpr int elementtype() { + return 9; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec16ui operator + (Vec16ui const a, Vec16ui const b) { + return Vec16ui (Vec16i(a) + Vec16i(b)); +} + +// vector operator - : subtract +static inline Vec16ui operator - (Vec16ui const a, Vec16ui const b) { + return Vec16ui (Vec16i(a) - Vec16i(b)); +} + +// vector operator * : multiply +static inline Vec16ui operator * (Vec16ui const a, Vec16ui const b) { + return Vec16ui (Vec16i(a) * Vec16i(b)); +} + +// vector operator / : divide. See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec16ui operator >> (Vec16ui const a, uint32_t b) { + return Vec16ui(a.get_low() >> b, a.get_high() >> b); +} + +// vector operator >> : shift right logical all elements +static inline Vec16ui operator >> (Vec16ui const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right logical +static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >>= : shift right logical +static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec16ui operator << (Vec16ui const a, uint32_t b) { + return Vec16ui ((Vec16i)a << (int32_t)b); +} + +// vector operator << : shift left all elements +static inline Vec16ui operator << (Vec16ui const a, int32_t b) { + return Vec16ui ((Vec16i)a << (int32_t)b); +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec16ib operator < (Vec16ui const a, Vec16ui const b) { + return Vec16ib(a.get_low() < b.get_low(), a.get_high() < b.get_high()); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec16ib operator > (Vec16ui const a, Vec16ui const b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec16ib operator >= (Vec16ui const a, Vec16ui const b) { + return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec16ib operator <= (Vec16ui const a, Vec16ui const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec16ui operator & (Vec16ui const a, Vec16ui const b) { + return Vec16ui(Vec16i(a) & Vec16i(b)); +} + +// vector operator | : bitwise or +static inline Vec16ui operator | (Vec16ui const a, Vec16ui const b) { + return Vec16ui(Vec16i(a) | Vec16i(b)); +} + +// vector operator ^ 
: bitwise xor +static inline Vec16ui operator ^ (Vec16ui const a, Vec16ui const b) { + return Vec16ui(Vec16i(a) ^ Vec16i(b)); +} + +// vector operator ~ : bitwise not +static inline Vec16ui operator ~ (Vec16ui const a) { + return Vec16ui( ~ Vec16i(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec16ui select (Vec16ib const s, Vec16ui const a, Vec16ui const b) { + return Vec16ui(select(s, Vec16i(a), Vec16i(b))); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec16ui if_add (Vec16ib const f, Vec16ui const a, Vec16ui const b) { + return Vec16ui(if_add(f, Vec16i(a), Vec16i(b))); +} + +// Conditional subtract +static inline Vec16ui if_sub (Vec16ib const f, Vec16ui const a, Vec16ui const b) { + return Vec16ui(if_sub(f, Vec16i(a), Vec16i(b))); +} + +// Conditional multiply +static inline Vec16ui if_mul (Vec16ib const f, Vec16ui const a, Vec16ui const b) { + return Vec16ui(if_mul(f, Vec16i(a), Vec16i(b))); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline uint32_t horizontal_add (Vec16ui const a) { + return (uint32_t)horizontal_add((Vec16i)a); +} + +// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec16ui add_saturated(Vec16ui const a, Vec16ui const b) { + return Vec16ui(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high())); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec16ui sub_saturated(Vec16ui const a, Vec16ui const b) { + return Vec16ui(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high())); +} + +// function max: a > b ? a : b +static inline Vec16ui max(Vec16ui const a, Vec16ui const b) { + return Vec16ui(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? 
a : b +static inline Vec16ui min(Vec16ui const a, Vec16ui const b) { + return Vec16ui(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} + + +/***************************************************************************** +* +* Vector of 8 64-bit signed integers +* +*****************************************************************************/ + +class Vec8q : public Vec512b { +public: + // Default constructor: + Vec8q() { + } + // Constructor to broadcast the same value into all elements: + Vec8q(int64_t i) { + z0 = z1 = Vec4q(i); + } + // Constructor to build from all elements: + Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) { + z0 = Vec4q(i0, i1, i2, i3); + z1 = Vec4q(i4, i5, i6, i7); + } + // Constructor to build from two Vec4q: + Vec8q(Vec4q const a0, Vec4q const a1) { + z0 = a0; + z1 = a1; + } + // Constructor to convert from type Vec512b + Vec8q(Vec512b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Assignment operator to convert from type Vec512b + Vec8q & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Member function to load from array (unaligned) + Vec8q & load(void const * p) { + z0 = Vec4q().load(p); + z1 = Vec4q().load((int64_t const*)p+4); + return *this; + } + // Member function to load from array, aligned by 64 + Vec8q & load_a(void const * p) { + z0 = Vec4q().load_a(p); + z1 = Vec4q().load_a((int64_t const*)p+4); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec8q & load_partial(int n, void const * p) { + if (n < 4) { + z0 = Vec4q().load_partial(n, p); + z1 = Vec4q(0); + } + else { + z0 = Vec4q().load(p); + z1 = Vec4q().load_partial(n - 4, (int64_t const*)p + 4); + } + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { + if (n < 4) { + Vec4q(get_low()).store_partial(n, p); + } + else { + Vec4q(get_low()).store(p); + Vec4q(get_high()).store_partial(n - 4, (int64_t *)p + 4); + } + } + // cut off vector to n elements. The last 8-n elements are set to zero + Vec8q & cutoff(int n) { + if (n < 4) { + z0 = Vec4q(z0).cutoff(n); + z1 = Vec4q(0); + } + else { + z1 = Vec4q(z1).cutoff(n - 4); + } + return *this; + } + // Member function to change a single element in vector + Vec8q const insert(int index, int64_t value) { + if ((uint32_t)index < 4) { + z0 = Vec4q(z0).insert(index, value); + } + else { + z1 = Vec4q(z1).insert(index-4, value); + } + return *this; + } + // Member function extract a single element from vector + int64_t extract(int index) const { + if ((uint32_t)index < 4) { + return Vec4q(z0).extract(index); + } + else { + return Vec4q(z1).extract(index - 4); + } + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ int64_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec2q: + Vec4q get_low() const { + return Vec4q(z0); + } + Vec4q get_high() const { + return Vec4q(z1); + } + static constexpr int size() { + return 8; + } + static constexpr int elementtype() { + return 10; + } +}; + +// Define operators for Vec8q + +// vector operator + : add element by element +static inline Vec8q operator + (Vec8q const a, Vec8q const b) { + return Vec8q(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator += : add +static inline Vec8q & operator += (Vec8q & a, Vec8q const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec8q operator ++ (Vec8q & a, int) { + Vec8q a0 = a; + a = a + 1; + return a0; +} + +// prefix operator ++ +static inline Vec8q & operator ++ (Vec8q & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec8q operator - (Vec8q const a, Vec8q const b) { + return Vec8q(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator - : unary minus +static inline Vec8q operator - (Vec8q const a) { + return Vec8q(- a.get_low(), - a.get_high()); +} + +// vector operator -= : subtract +static inline Vec8q & operator -= (Vec8q & a, Vec8q const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec8q operator -- (Vec8q & a, int) { + Vec8q a0 = a; + a = a - 1; + return a0; +} + +// prefix operator -- +static inline Vec8q & operator -- (Vec8q & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec8q operator * (Vec8q const a, Vec8q const b) { + return Vec8q(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator *= : multiply +static inline Vec8q & operator *= (Vec8q & a, Vec8q const b) { + a = a * b; + return a; +} + +// vector operator << : shift left +static inline Vec8q operator << (Vec8q const a, int32_t b) { + return Vec8q(a.get_low() << b, a.get_high() << b); +} + +// vector operator <<= : shift left +static inline Vec8q & operator <<= (Vec8q & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec8q operator >> (Vec8q const a, int32_t b) { + return Vec8q(a.get_low() >> b, a.get_high() >> b); +} + +// vector operator >>= : shift right arithmetic +static inline Vec8q & operator >>= (Vec8q & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec8qb operator == (Vec8q const a, Vec8q const b) { + return Vec8qb(a.get_low() == b.get_low(), a.get_high() == b.get_high()); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec8qb operator != (Vec8q const a, Vec8q const b) { + return Vec8qb(a.get_low() != b.get_low(), a.get_high() != b.get_high()); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec8qb operator < (Vec8q const a, Vec8q const b) { + return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high()); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec8qb operator > (Vec8q const a, Vec8q const b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec8qb operator >= (Vec8q const a, Vec8q const b) { + return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true 
for elements for which a <= b (signed) +static inline Vec8qb operator <= (Vec8q const a, Vec8q const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec8q operator & (Vec8q const a, Vec8q const b) { + return Vec8q(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} + +// vector operator &= : bitwise and +static inline Vec8q & operator &= (Vec8q & a, Vec8q const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec8q operator | (Vec8q const a, Vec8q const b) { + return Vec8q(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} + +// vector operator |= : bitwise or +static inline Vec8q & operator |= (Vec8q & a, Vec8q const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec8q operator ^ (Vec8q const a, Vec8q const b) { + return Vec8q(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} +// vector operator ^= : bitwise xor +static inline Vec8q & operator ^= (Vec8q & a, Vec8q const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec8q operator ~ (Vec8q const a) { + return Vec8q(~(a.get_low()), ~(a.get_high())); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8q select (Vec8qb const s, Vec8q const a, Vec8q const b) { + return Vec8q(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec8q if_add (Vec8qb const f, Vec8q const a, Vec8q const b) { + return Vec8q(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec8q if_sub (Vec8qb const f, Vec8q const a, Vec8q const b) { + return Vec8q(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec8q if_mul (Vec8qb const f, Vec8q const a, Vec8q const b) { + return Vec8q(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int64_t horizontal_add (Vec8q const a) { + return horizontal_add(a.get_low() + a.get_high()); +} + +// Horizontal add extended: Calculates the sum of all vector elements +// Elements are sign extended before adding to avoid overflow +static inline int64_t horizontal_add_x (Vec16i const x) { + return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high()); +} + +// Horizontal add extended: Calculates the sum of all vector elements +// Elements are zero extended before adding to avoid overflow +static inline uint64_t horizontal_add_x (Vec16ui const x) { + return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high()); +} + +// function max: a > b ? a : b +static inline Vec8q max(Vec8q const a, Vec8q const b) { + return Vec8q(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? a : b +static inline Vec8q min(Vec8q const a, Vec8q const b) { + return Vec8q(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} + +// function abs: a >= 0 ? 
a : -a +static inline Vec8q abs(Vec8q const a) { + return Vec8q(abs(a.get_low()), abs(a.get_high())); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec8q abs_saturated(Vec8q const a) { + return Vec8q(abs_saturated(a.get_low()), abs_saturated(a.get_high())); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec8q rotate_left(Vec8q const a, int b) { + return Vec8q(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); +} + + +/***************************************************************************** +* +* Vector of 8 64-bit unsigned integers +* +*****************************************************************************/ + +class Vec8uq : public Vec8q { +public: + // Default constructor: + Vec8uq() { + } + // Constructor to broadcast the same value into all elements: + Vec8uq(uint64_t i) { + z0 = z1 = Vec4uq(i); + } + // Constructor to convert from Vec8q: + Vec8uq(Vec8q const x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Constructor to convert from type Vec512b + Vec8uq(Vec512b const & x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Constructor to build from all elements: + Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) { + z0 = Vec4q((int64_t)i0, (int64_t)i1, (int64_t)i2, (int64_t)i3); + z1 = Vec4q((int64_t)i4, (int64_t)i5, (int64_t)i6, (int64_t)i7); + } + // Constructor to build from two Vec4uq: + Vec8uq(Vec4uq const a0, Vec4uq const a1) { + z0 = a0; + z1 = a1; + } + // Assignment operator to convert from Vec8q: + Vec8uq & operator = (Vec8q const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Assignment operator to convert from type Vec512b + Vec8uq & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Member function to load from array (unaligned) + Vec8uq & load(void const * p) { + Vec8q::load(p); + return *this; + } + // Member function to load from array, aligned by 32 + Vec8uq & load_a(void const * p) { + Vec8q::load_a(p); + return *this; + } + // Member function to change a single element in vector + Vec8uq const insert(int index, uint64_t value) { + Vec8q::insert(index, (int64_t)value); + return *this; + } + // Member function extract a single element from vector + uint64_t extract(int index) const { + return (uint64_t)Vec8q::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ uint64_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec2uq: + Vec4uq get_low() const { + return Vec4uq(Vec8q::get_low()); + } + Vec4uq get_high() const { + return Vec4uq(Vec8q::get_high()); + } + static constexpr int elementtype() { + return 11; + } +}; + +// Define operators for this class + +// vector operator + : add +static inline Vec8uq operator + (Vec8uq const a, Vec8uq const b) { + return Vec8uq (Vec8q(a) + Vec8q(b)); +} + +// vector operator - : subtract +static inline Vec8uq operator - (Vec8uq const a, Vec8uq const b) { + return Vec8uq (Vec8q(a) - Vec8q(b)); +} + +// vector operator * : multiply element by element +static inline Vec8uq operator * (Vec8uq const a, Vec8uq const b) { + return Vec8uq (Vec8q(a) * Vec8q(b)); +} + +// vector operator >> : shift right logical all elements +static inline Vec8uq operator >> (Vec8uq const a, uint32_t b) { + return Vec8uq(a.get_low() >> b, a.get_high() >> b); +} + +// vector operator >> : shift right logical all elements +static inline Vec8uq operator >> (Vec8uq const a, int32_t b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right artihmetic +static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >>= : shift right logical +static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec8uq operator << (Vec8uq const a, uint32_t b) { + return Vec8uq ((Vec8q)a << (int32_t)b); +} + +// vector operator << : shift left all elements +static inline Vec8uq operator << (Vec8uq const a, int32_t b) { + return Vec8uq ((Vec8q)a << b); +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec8qb operator < (Vec8uq const a, Vec8uq const b) { + return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high()); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec8qb operator > (Vec8uq const a, Vec8uq const b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec8qb operator >= (Vec8uq const a, Vec8uq const b) { + return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec8qb operator <= (Vec8uq const a, Vec8uq const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec8uq operator & (Vec8uq const a, Vec8uq const b) { + return Vec8uq(Vec8q(a) & Vec8q(b)); +} + +// vector operator | : bitwise or +static inline Vec8uq operator | (Vec8uq const a, Vec8uq const b) { + return Vec8uq(Vec8q(a) | Vec8q(b)); +} + +// vector operator ^ : bitwise xor +static inline Vec8uq operator ^ (Vec8uq const a, Vec8uq const b) { + return Vec8uq(Vec8q(a) ^ Vec8q(b)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec8uq select (Vec8qb const s, Vec8uq const a, Vec8uq const b) { + return Vec8uq(select(s, Vec8q(a), Vec8q(b))); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i]
+static inline Vec8uq if_add (Vec8qb const f, Vec8uq const a, Vec8uq const b) {
+    return Vec8uq(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional subtract
+static inline Vec8uq if_sub (Vec8qb const f, Vec8uq const a, Vec8uq const b) {
+    return Vec8uq(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Conditional multiply
+static inline Vec8uq if_mul (Vec8qb const f, Vec8uq const a, Vec8uq const b) {
+    return Vec8uq(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high()));
+}
+
+// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around
+static inline uint64_t horizontal_add (Vec8uq const a) {
+    return (uint64_t)horizontal_add(Vec8q(a));
+}
+
+// function max: a > b ? a : b
+static inline Vec8uq max(Vec8uq const a, Vec8uq const b) {
+    return Vec8uq(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
+}
+
+// function min: a < b ? a : b
+static inline Vec8uq min(Vec8uq const a, Vec8uq const b) {
+    return Vec8uq(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
+}
+
+
+/*****************************************************************************
+*
+* Vector permute functions
+*
+******************************************************************************
+*
+* These permute functions can reorder the elements of a vector and optionally
+* set some elements to zero. See Vectori128.h for description
+*
+*****************************************************************************/
+
+// Permute vector of 8 64-bit integers.
+// Index -1 gives 0, index V_DC means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q permute8(Vec8q const a) {
+    return Vec8q(blend4<i0, i1, i2, i3> (a.get_low(), a.get_high()),
+        blend4<i4, i5, i6, i7> (a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq permute8(Vec8uq const& a) {
+    return Vec8uq(permute8<i0, i1, i2, i3, i4, i5, i6, i7>(Vec8q(a)));
+}
+
+// Permute vector of 16 32-bit integers.
+// Index -1 gives 0, index V_DC means don't care.
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16i permute16(Vec16i const a) {
+    return Vec16i(blend8<i0, i1, i2, i3, i4, i5, i6, i7> (a.get_low(), a.get_high()),
+        blend8<i8, i9, i10, i11, i12, i13, i14, i15> (a.get_low(), a.get_high()));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16ui permute16(Vec16ui const a) {
+    return Vec16ui (permute16<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (Vec16i(a)));
+}
+
+
+/*****************************************************************************
+*
+* Vector blend functions
+*
+*****************************************************************************/
+
+// blend vectors Vec8q
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q blend8(Vec8q const a, Vec8q const b) {
+    Vec4q x0 = blend_half<Vec8q, i0, i1, i2, i3>(a, b);
+    Vec4q x1 = blend_half<Vec8q, i4, i5, i6, i7>(a, b);
+    return Vec8q(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8uq blend8(Vec8uq const a, Vec8uq const b) {
+    return Vec8uq( blend8<i0, i1, i2, i3, i4, i5, i6, i7> (Vec8q(a),Vec8q(b)));
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16i blend16(Vec16i const a, Vec16i const b) {
+    Vec8i x0 = blend_half<Vec16i, i0, i1, i2, i3, i4, i5, i6, i7>(a, b);
+    Vec8i x1 = blend_half<Vec16i, i8, i9, i10, i11, i12, i13, i14, i15>(a, b);
+    return Vec16i(x0, x1);
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16ui blend16(Vec16ui const a, Vec16ui const b) {
+    return Vec16ui( blend16<i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15> (Vec16i(a),Vec16i(b)));
+}
+
+
+/*****************************************************************************
+*
+* Vector lookup functions
+*
+******************************************************************************
+*
+* These functions use vector elements as indexes into a table.
+* The table is given as one or more vectors or as an array.
+* +*****************************************************************************/ + +static inline Vec16i lookup16(Vec16i const i1, Vec16i const table) { + int32_t t[16]; + table.store(t); + return Vec16i(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]], + t[i1[8]], t[i1[9]], t[i1[10]], t[i1[11]], t[i1[12]], t[i1[13]], t[i1[14]], t[i1[15]]); +} + +template +static inline Vec16i lookup(Vec16i const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 8) { + Vec8i table1 = Vec8i().load(table); + return Vec16i( + lookup8(index.get_low(), table1), + lookup8(index.get_high(), table1)); + } + if constexpr (n <= 16) return lookup16(index, Vec16i().load(table)); + // n > 16. Limit index + Vec16ui i1; + if constexpr ((n & (n - 1)) == 0) { + // n is a power of 2, make index modulo n + i1 = Vec16ui(index) & (n - 1); + } + else { + // n is not a power of 2, limit to n-1 + i1 = min(Vec16ui(index), n - 1); + } + int32_t const * t = (int32_t const *)table; + return Vec16i(t[i1[0]], t[i1[1]], t[i1[2]], t[i1[3]], t[i1[4]], t[i1[5]], t[i1[6]], t[i1[7]], + t[i1[8]], t[i1[9]], t[i1[10]], t[i1[11]], t[i1[12]], t[i1[13]], t[i1[14]], t[i1[15]]); +} + +static inline Vec16i lookup32(Vec16i const index, Vec16i const table1, Vec16i const table2) { + int32_t tab[32]; + table1.store(tab); table2.store(tab+16); + Vec8i t0 = lookup<32>(index.get_low(), tab); + Vec8i t1 = lookup<32>(index.get_high(), tab); + return Vec16i(t0, t1); +} + +static inline Vec16i lookup64(Vec16i const index, Vec16i const table1, Vec16i const table2, Vec16i const table3, Vec16i const table4) { + int32_t tab[64]; + table1.store(tab); table2.store(tab + 16); table3.store(tab + 32); table4.store(tab + 48); + Vec8i t0 = lookup<64>(index.get_low(), tab); + Vec8i t1 = lookup<64>(index.get_high(), tab); + return Vec16i(t0, t1); +} + + +static inline Vec8q lookup8(Vec8q const index, Vec8q const table) { + int64_t tab[8]; + table.store(tab); + Vec4q t0 = lookup<8>(index.get_low(), tab); + Vec4q t1 = lookup<8>(index.get_high(), tab); + return Vec8q(t0, t1); +} + +template +static inline Vec8q lookup(Vec8q const index, void const * table) { + if constexpr (n <= 0) return 0; + if constexpr (n <= 4) { + Vec4q table1 = Vec4q().load(table); + return Vec8q( + lookup4 (index.get_low(), table1), + lookup4 (index.get_high(), table1)); + } + if constexpr (n <= 8) { + return lookup8(index, Vec8q().load(table)); + } + // n > 8. 
Limit index
+    Vec8uq i1;
+    if constexpr ((n & (n-1)) == 0) {
+        // n is a power of 2, make index modulo n
+        i1 = Vec8uq(index) & (n-1);
+    }
+    else {
+        // n is not a power of 2, limit to n-1
+        i1 = min(Vec8uq(index), n-1);
+    }
+    int64_t const * t = (int64_t const *)table;
+    return Vec8q(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]]);
+}
+
+/*****************************************************************************
+*
+* Vector scatter functions
+*
+*****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+    static inline void scatter(Vec16i const data, void * array) {
+    int32_t* arr = (int32_t*)array;
+    const int index[16] = {i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15};
+    for (int i = 0; i < 16; i++) {
+        if (index[i] >= 0) arr[index[i]] = data[i];
+    }
+}
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline void scatter(Vec8q const data, void * array) {
+    int64_t* arr = (int64_t*)array;
+    const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7};
+    for (int i = 0; i < 8; i++) {
+        if (index[i] >= 0) arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec16i const index, uint32_t limit, Vec16i const data, void * array) {
+    int32_t* arr = (int32_t*)array;
+    for (int i = 0; i < 16; i++) {
+        if (uint32_t(index[i]) < limit) arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8q const index, uint32_t limit, Vec8q const data, void * array) {
+    int64_t* arr = (int64_t*)array;
+    for (int i = 0; i < 8; i++) {
+        if (uint64_t(index[i]) < uint64_t(limit)) arr[index[i]] = data[i];
+    }
+}
+
+static inline void scatter(Vec8i const index, uint32_t limit, Vec8q const data, void * array) {
+    int64_t* arr = (int64_t*)array;
+    for (int i = 0; i < 8; i++) {
+        if (uint32_t(index[i]) < limit) arr[index[i]] = data[i];
+    }
+}
+
+// Scatter functions with variable indexes:
+
+static inline void scatter16i(Vec16i index, uint32_t limit, Vec16i data, void * destination) {
+    uint32_t ix[16]; index.store(ix);
+    for (int i = 0; i < 16; i++) {
+        if (ix[i] < limit) ((int*)destination)[ix[i]] = data[i];
+    }
+}
+
+static inline void scatter8q(Vec8q index, uint32_t limit, Vec8q data, void * destination) {
+    uint64_t ix[8]; index.store(ix);
+    for (int i = 0; i < 8; i++) {
+        if (ix[i] < limit) ((int64_t*)destination)[ix[i]] = data[i];
+    }
+}
+
+static inline void scatter8i(Vec8i index, uint32_t limit, Vec8i data, void * destination) {
+    uint32_t ix[8]; index.store(ix);
+    for (int i = 0; i < 8; i++) {
+        if (ix[i] < limit) ((int*)destination)[ix[i]] = data[i];
+    }
+}
+
+static inline void scatter4q(Vec4q index, uint32_t limit, Vec4q data, void * destination) {
+    uint64_t ix[4]; index.store(ix);
+    for (int i = 0; i < 4; i++) {
+        if (ix[i] < limit) ((int64_t*)destination)[ix[i]] = data[i];
+    }
+}
+
+static inline void scatter4i(Vec4i index, uint32_t limit, Vec4i data, void * destination) {
+    uint32_t ix[4]; index.store(ix);
+    for (int i = 0; i < 4; i++) {
+        if (ix[i] < limit) ((int*)destination)[ix[i]] = data[i];
+    }
+}
+
+/*****************************************************************************
+*
+* Gather functions with fixed indexes
+*
+*****************************************************************************/
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
+static inline Vec16i gather16i(void const * a) {
+    int constexpr indexs[16] = { i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 };
+    constexpr int imin = min_index(indexs);
+    constexpr int imax = max_index(indexs);
+    static_assert(imin >= 0, "Negative index in gather function");
+
+    if constexpr (imax - imin <= 15) {
+        // load one contiguous block and permute
+        if constexpr (imax > 15) {
+            // make sure we don't read past the end of the array
+            Vec16i b = Vec16i().load((int32_t const *)a + imax-15);
+            return permute16<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
+                i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
+        }
+        else {
+            Vec16i b = Vec16i().load((int32_t const *)a + imin);
+            return permute16<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
+                i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
+        }
+    }
+    if constexpr ((i0<imin+16 || i0>imax-16) && (i1<imin+16 || i1>imax-16) && (i2<imin+16 || i2>imax-16) && (i3<imin+16 || i3>imax-16)
+        && (i4<imin+16 || i4>imax-16) && (i5<imin+16 || i5>imax-16) && (i6<imin+16 || i6>imax-16) && (i7<imin+16 || i7>imax-16)
+        && (i8<imin+16 || i8>imax-16) && (i9<imin+16 || i9>imax-16) && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
+        && (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
+        // load two contiguous blocks and blend
+        Vec16i b = Vec16i().load((int32_t const *)a + imin);
+        Vec16i c = Vec16i().load((int32_t const *)a + imax-15);
+        const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;
+        const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
+        const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
+        const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
+        const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
+        const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
+        const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
+        const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
+        const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
+        const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
+        const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
+        const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
+        const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
+        const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
+        const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
+        const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
+        return blend16<j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15>(b, c);
+    }
+    // use lookup function
+    return lookup<imax+1>(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), a);
+}
+
+
+template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
+static inline Vec8q gather8q(void const * a) {
+    int constexpr indexs[8] = { i0, i1, i2, i3, i4, i5, i6, i7 }; // indexes as array
+    constexpr int imin = min_index(indexs);
+    constexpr int imax = max_index(indexs);
+    static_assert(imin >= 0, "Negative index in gather function");
+
+    if constexpr (imax - imin <= 7) {
+        // load one contiguous block and permute
+        if constexpr (imax > 7) {
+            // make sure we don't read past the end of the array
+            Vec8q b = Vec8q().load((int64_t const *)a + imax-7);
+            return permute8<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
+        }
+        else {
+            Vec8q b = Vec8q().load((int64_t const *)a + imin);
+            return permute8<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
+        }
+    }
+    if constexpr ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
+        && (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
+        // load two contiguous blocks and blend
+        Vec8q b = Vec8q().load((int64_t const *)a + imin);
+        Vec8q c = Vec8q().load((int64_t const *)a + imax-7);
+        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
+        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
+        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
+        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
+        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
+        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
+        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
+        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
+        return blend8<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
+    }
+    // use lookup function
+    return lookup<imax+1>(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), a);
+}
+
+
+/*****************************************************************************
+*
+* Functions for conversion between integer sizes
+*
+*****************************************************************************/
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+/*
+// Function extend_to_int : extends Vec16s to Vec16i with sign extension
+static inline Vec16i extend_to_int (Vec16s const a) {
+    return Vec16i(extend_low(a), extend_high(a));
+}
+
+// Function extend_to_int : extends Vec16us to Vec16ui with zero extension
+static inline Vec16ui extend_to_int (Vec16us const a) {
+    return Vec16i(extend_low(a), extend_high(a));
+}
+
+// Function extend_to_int : extends Vec16c to Vec16i with sign extension
+static inline Vec16i extend_to_int (Vec16c const a) {
+    return extend_to_int(Vec16s(extend_low(a), extend_high(a)));
+}
+
+// Function extend_to_int : extends Vec16uc to Vec16ui with zero extension
+static inline Vec16ui extend_to_int (Vec16uc const a) {
+    return extend_to_int(Vec16s(extend_low(a), extend_high(a)));
+}*/
+
+
+// Extend 32-bit integers to 64-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 8 elements to 64 bits with sign extension
+static inline Vec8q extend_low (Vec16i const a) {
+    return Vec8q(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 8 elements to 64 bits with sign extension
+static inline Vec8q extend_high (Vec16i const a) {
+    return Vec8q(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Function extend_low : extends the low 8 elements to 64 bits with zero extension
+static inline Vec8uq
extend_low (Vec16ui const a) { + return Vec8q(extend_low(a.get_low()), extend_high(a.get_low())); +} + +// Function extend_high : extends the high 8 elements to 64 bits with zero extension +static inline Vec8uq extend_high (Vec16ui const a) { + return Vec8q(extend_low(a.get_high()), extend_high(a.get_high())); +} + +// Compress 32-bit integers to 8-bit integers, signed and unsigned, with and without saturation +/* +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Overflow wraps around +static inline Vec16c compress_to_int8 (Vec16i const a) { + Vec16s b = compress(a.get_low(), a.get_high()); + Vec16c c = compress(b.get_low(), b.get_high()); + return c; +} + +static inline Vec16s compress_to_int16 (Vec16i const a) { + return compress(a.get_low(), a.get_high()); +} + +// with signed saturation +static inline Vec16c compress_to_int8_saturated (Vec16i const a) { + Vec16s b = compress_saturated(a.get_low(), a.get_high()); + Vec16c c = compress_saturated(b.get_low(), b.get_high()); + return c; +} + +static inline Vec16s compress_to_int16_saturated (Vec16i const a) { + return compress_saturated(a.get_low(), a.get_high()); +} + +// with unsigned saturation +static inline Vec16uc compress_to_int8_saturated (Vec16ui const a) { + Vec16us b = compress_saturated(a.get_low(), a.get_high()); + Vec16uc c = compress_saturated(b.get_low(), b.get_high()); + return c; +} + +static inline Vec16us compress_to_int16_saturated (Vec16ui const a) { + return compress_saturated(a.get_low(), a.get_high()); +}*/ + +// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Overflow wraps around +static inline Vec16i compress (Vec8q const low, Vec8q const high) { + return Vec16i(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high())); +} + +// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Signed, with saturation +static inline Vec16i compress_saturated (Vec8q const low, Vec8q const high) { + return Vec16i(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); +} + +// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers +// Unsigned, with saturation +static inline Vec16ui compress_saturated (Vec8uq const low, Vec8uq const high) { + return Vec16ui(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); +} + + +/***************************************************************************** +* +* Integer division operators +* Please see the file vectori128.h for explanation. 
+*
+*****************************************************************************/
+
+// vector operator / : divide each element by divisor
+
+// vector operator / : divide all elements by same integer
+static inline Vec16i operator / (Vec16i const a, Divisor_i const d) {
+    return Vec16i(a.get_low() / d, a.get_high() / d);
+}
+
+// vector operator /= : divide
+static inline Vec16i & operator /= (Vec16i & a, Divisor_i const d) {
+    a = a / d;
+    return a;
+}
+
+// vector operator / : divide all elements by same integer
+static inline Vec16ui operator / (Vec16ui const a, Divisor_ui const d) {
+    return Vec16ui(a.get_low() / d, a.get_high() / d);
+}
+
+// vector operator /= : divide
+static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const d) {
+    a = a / d;
+    return a;
+}
+
+
+/*****************************************************************************
+*
+* Integer division 2: divisor is a compile-time constant
+*
+*****************************************************************************/
+
+// Divide Vec16i by compile-time constant
+template <int32_t d>
+static inline Vec16i divide_by_i(Vec16i const a) {
+    return Vec16i(divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
+}
+
+// define Vec16i a / const_int(d)
+template <int32_t d>
+static inline Vec16i operator / (Vec16i const a, Const_int_t<d>) {
+    return divide_by_i<d>(a);
+}
+
+// define Vec16i a / const_uint(d)
+template <uint32_t d>
+static inline Vec16i operator / (Vec16i const a, Const_uint_t<d>) {
+    static_assert(d < 0x80000000u, "Dividing signed integer by overflowing unsigned");
+    return divide_by_i<int32_t(d)>(a); // signed divide
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16i & operator /= (Vec16i & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16i & operator /= (Vec16i & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// Divide Vec16ui by compile-time constant
+template <uint32_t d>
+static inline Vec16ui divide_by_ui(Vec16ui const a) {
+    return Vec16ui( divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
+}
+
+// define Vec16ui a / const_uint(d)
+template <uint32_t d>
+static inline Vec16ui operator / (Vec16ui const a, Const_uint_t<d>) {
+    return divide_by_ui<d>(a);
+}
+
+// define Vec16ui a / const_int(d)
+template <int32_t d>
+static inline Vec16ui operator / (Vec16ui const a, Const_int_t<d>) {
+    static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous");
+    return divide_by_ui<d>(a); // unsigned divide
+}
+
+// vector operator /= : divide
+template <uint32_t d>
+static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+// vector operator /= : divide
+template <int32_t d>
+static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t<d> b) {
+    a = a / b;
+    return a;
+}
+
+
+/*****************************************************************************
+*
+* Boolean <-> bitfield conversion functions
+*
+*****************************************************************************/
+
+// to_bits: convert to integer bitfield
+static inline uint16_t to_bits(Vec16b const a) {
+    return uint16_t(to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8));
+}
+
+// to_bits: convert to integer bitfield
+static inline uint16_t to_bits(Vec16ib const a) {
+    return uint16_t(to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8));
+}
+
+// to_bits: convert to integer bitfield
+static inline uint8_t to_bits(Vec8b const a) {
+    return uint8_t(to_bits(a.get_low()) | (to_bits(a.get_high()) << 4));
+}
+
+
+#ifdef VCL_NAMESPACE
+}
+#endif
+
+#endif // VECTORI512E_H
diff --git a/DFTTest/VCL2/vectori512s.h
b/DFTTest/VCL2/vectori512s.h new file mode 100644 index 0000000..38000a6 --- /dev/null +++ b/DFTTest/VCL2/vectori512s.h @@ -0,0 +1,2315 @@ +/**************************** vectori512s.h ******************************** +* Author: Agner Fog +* Date created: 2019-04-20 +* Last modified: 2020-02-23 +* Version: 2.01.01 +* Project: vector classes +* Description: +* Header file defining 512-bit integer vector classes for 8 and 16 bit integers. +* For x86 microprocessors with AVX512BW and later instruction sets. +* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec64c Vector of 64 8-bit signed integers +* Vec64uc Vector of 64 8-bit unsigned integers +* Vec64cb Vector of 64 booleans for use with Vec64c and Vec64uc +* Vec32s Vector of 32 16-bit signed integers +* Vec32us Vector of 32 16-bit unsigned integers +* Vec32sb Vector of 32 booleans for use with Vec32s and Vec32us +* Other 512-bit integer vectors are defined in Vectori512.h +* +* Each vector object is represented internally in the CPU as a 512-bit register. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +******************************************************************************/ + +#ifndef VECTORI512S_H +#define VECTORI512S_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +// check combination of header files +#ifdef VECTORI512SE_H +#error Two different versions of vectorf256.h included +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + + +/***************************************************************************** +* +* Vector of 64 8-bit signed integers +* +*****************************************************************************/ + +class Vec64c: public Vec512b { +public: + // Default constructor: + Vec64c() { + } + // Constructor to broadcast the same value into all elements: + Vec64c(int8_t i) { + zmm = _mm512_set1_epi8(i); + } + // Constructor to build from all elements: + Vec64c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, + int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, + int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23, + int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31, + int8_t i32, int8_t i33, int8_t i34, int8_t i35, int8_t i36, int8_t i37, int8_t i38, int8_t i39, + int8_t i40, int8_t i41, int8_t i42, int8_t i43, int8_t i44, int8_t i45, int8_t i46, int8_t i47, + int8_t i48, int8_t i49, int8_t i50, int8_t i51, int8_t i52, int8_t i53, int8_t i54, int8_t i55, + int8_t i56, int8_t i57, int8_t i58, int8_t i59, int8_t i60, int8_t i61, int8_t i62, int8_t i63) { + // _mm512_set_epi8 and _mm512_set_epi16 missing in GCC 7.4.0 + int8_t aa[64] = { + i0, i1, i2, i3, i4, i5, i6, i7,i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31, + i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47, + i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63 }; + load(aa); + } + // Constructor to build from two Vec32c: + Vec64c(Vec32c const a0, Vec32c const a1) { + zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); + } + // Constructor to convert from type __m512i used 
in intrinsics: + Vec64c(__m512i const x) { + zmm = x; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec64c & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Type cast operator to convert to __m512i used in intrinsics + operator __m512i() const { + return zmm; + } + // Member function to load from array (unaligned) + Vec64c & load(void const * p) { + zmm = _mm512_loadu_si512(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec64c & load_a(void const * p) { + zmm = _mm512_load_si512(p); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec64c & load_partial(int n, void const * p) { + if (n >= 64) { + zmm = _mm512_loadu_si512(p); + } + else { + zmm = _mm512_maskz_loadu_epi8(__mmask64(((uint64_t)1 << n) - 1), p); + } + return *this; + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { + if (n >= 64) { + // _mm512_storeu_epi8(p, zmm); + _mm512_storeu_si512(p, zmm); + } + else { + _mm512_mask_storeu_epi8(p, __mmask64(((uint64_t)1 << n) - 1), zmm); + } + } + // cut off vector to n elements. The last 64-n elements are set to zero + Vec64c & cutoff(int n) { + if (n < 64) { + zmm = _mm512_maskz_mov_epi8(__mmask64(((uint64_t)1 << n) - 1), zmm); + } + return *this; + } + // Member function to change a single element in vector + Vec64c const insert(int index, int8_t value) { + zmm = _mm512_mask_set1_epi8(zmm, __mmask64((uint64_t)1 << index), value); + return *this; + } + // Member function extract a single element from vector + int8_t extract(int index) const { +#if INSTRSET >= 10 && defined (__AVX512VBMI2__) + __m512i x = _mm512_maskz_compress_epi8(__mmask64((uint64_t)1 << index), zmm); + return (int8_t)_mm_cvtsi128_si32(_mm512_castsi512_si128(x)); +#else + int8_t a[64]; + store(a); + return a[index & 63]; +#endif + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ int8_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec32c: + Vec32c get_low() const { + return _mm512_castsi512_si256(zmm); + } + Vec32c get_high() const { + return _mm512_extracti64x4_epi64(zmm,1); + } + static constexpr int size() { + return 64; + } + static constexpr int elementtype() { + return 4; + } +}; + + +/***************************************************************************** +* +* Vec64b: Vector of 64 Booleans for use with Vec64c and Vec64uc +* +*****************************************************************************/ + +class Vec64b { +protected: + __mmask64 mm; // Boolean vector +public: + // Default constructor: + Vec64b () { + } + // Constructor to build from all elements: + /* + Vec64b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7, + bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15, + bool b16, bool b17, bool b18, bool b19, bool b20, bool b21, bool b22, bool b23, + bool b24, bool b25, bool b26, bool b27, bool b28, bool b29, bool b30, bool b31, + bool b32, bool b33, bool b34, bool b35, bool b36, bool b37, bool b38, bool b39, + bool b40, bool b41, bool b42, bool b43, bool b44, bool b45, bool b46, bool b47, + bool b48, bool b49, bool b50, bool b51, bool b52, bool b53, bool b54, bool b55, + bool b56, bool b57, bool b58, bool b59, bool b60, bool b61, bool b62, bool b63) { + mm = uint64_t( + (uint64_t)b0 | (uint64_t)b1 << 1 | (uint64_t)b2 << 2 | (uint64_t)b3 << 3 | + (uint64_t)b4 << 4 | (uint64_t)b5 << 5 | (uint64_t)b6 << 6 | (uint64_t)b7 << 7 | + (uint64_t)b8 << 8 | (uint64_t)b9 << 9 | (uint64_t)b10 << 10 | (uint64_t)b11 << 11 | + (uint64_t)b12 << 12 | (uint64_t)b13 << 13 | (uint64_t)b14 << 14 | (uint64_t)b15 << 15 | + (uint64_t)b16 << 16 | (uint64_t)b17 << 17 | (uint64_t)b18 << 18 | (uint64_t)b19 << 19 | + (uint64_t)b20 << 20 | (uint64_t)b21 << 21 | (uint64_t)b22 << 22 | (uint64_t)b23 << 23 | + (uint64_t)b24 << 24 | (uint64_t)b25 << 25 | (uint64_t)b26 << 26 | (uint64_t)b27 << 27 | + (uint64_t)b28 << 28 | (uint64_t)b29 << 29 | (uint64_t)b30 << 30 | (uint64_t)b31 << 31 | + (uint64_t)b32 << 32 | (uint64_t)b33 << 33 | (uint64_t)b34 << 34 | (uint64_t)b35 << 35 | + (uint64_t)b36 << 36 | (uint64_t)b37 << 37 | (uint64_t)b38 << 38 | (uint64_t)b39 << 39 | + (uint64_t)b40 << 40 | (uint64_t)b41 << 41 | (uint64_t)b42 << 42 | (uint64_t)b43 << 43 | + (uint64_t)b44 << 44 | (uint64_t)b45 << 45 | (uint64_t)b46 << 46 | (uint64_t)b47 << 47 | + (uint64_t)b48 << 48 | (uint64_t)b49 << 49 | (uint64_t)b50 << 50 | (uint64_t)b51 << 51 | + (uint64_t)b52 << 52 | (uint64_t)b53 << 53 | (uint64_t)b54 << 54 | (uint64_t)b55 << 55 | + (uint64_t)b56 << 56 | (uint64_t)b57 << 57 | (uint64_t)b58 << 58 | (uint64_t)b59 << 59 | + (uint64_t)b60 << 60 | (uint64_t)b61 << 61 | (uint64_t)b62 << 62 | (uint64_t)b63 << 63); + } */ + // Constructor to convert from type __mmask64 used in intrinsics: + Vec64b (__mmask64 x) { + mm = x; + } + // Constructor to broadcast single value: + Vec64b(bool b) { + mm = __mmask64(-int64_t(b)); + } + // Constructor to make from two halves + Vec64b(Vec32b const x0, Vec32b const x1) { + mm = uint32_t(__mmask32(x0)) | uint64_t(__mmask32(x1)) << 32; + } + // Assignment operator to convert from type __mmask64 used in intrinsics: + Vec64b & operator = (__mmask64 x) { + mm = x; + return *this; + } + // Assignment operator to broadcast scalar value: + Vec64b & operator = (bool b) { + mm = Vec64b(b); + return *this; + } + // split into two halves + Vec32b get_low() const { + return 
Vec32b(__mmask32(mm)); + } + Vec32b get_high() const { + return Vec32b(__mmask32(mm >> 32)); + } + // Member function to change a single element in vector + Vec64b & insert (uint32_t index, bool a) { + uint64_t mask = uint64_t(1) << index; + mm = (mm & ~mask) | uint64_t(a) << index; + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return ((mm >> index) & 1) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Type cast operator to convert to __mmask64 used in intrinsics + operator __mmask64() const { + return mm; + } + // Member function to change a bitfield to a boolean vector + Vec64b & load_bits(uint64_t a) { + mm = __mmask64(a); + return *this; + } + static constexpr int size() { + return 64; + } + static constexpr int elementtype() { + return 2; + } +}; + +typedef Vec64b Vec64cb; // compact boolean vector +typedef Vec64b Vec64ucb; // compact boolean vector + + +/***************************************************************************** +* +* Define operators and functions for Vec64cb +* +*****************************************************************************/ + +// vector operator & : bitwise and +static inline Vec64cb operator & (Vec64cb const a, Vec64cb const b) { + //return _kand_mask64(a, b); + return __mmask64(a) & __mmask64(b); +} +static inline Vec64cb operator && (Vec64cb const a, Vec64cb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec64cb & operator &= (Vec64cb & a, Vec64cb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec64cb operator | (Vec64cb const a, Vec64cb const b) { + //return _kor_mask64(a, b); + return __mmask64(a) | __mmask64(b); +} +static inline Vec64cb operator || (Vec64cb const a, Vec64cb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec64cb & operator |= (Vec64cb & a, Vec64cb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec64cb operator ^ (Vec64cb const a, Vec64cb const b) { + //return _kxor_mask64(a, b); + return __mmask64(a) ^ __mmask64(b); +} +// vector operator ^= : bitwise xor +static inline Vec64cb & operator ^= (Vec64cb & a, Vec64cb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec64cb operator == (Vec64cb const a, Vec64cb const b) { + return __mmask64(a) ^ ~ __mmask64(b); + //return _kxnor_mask64(a, b); // not all compilers have this intrinsic +} + +// vector operator != : xor +static inline Vec64cb operator != (Vec64cb const a, Vec64cb const b) { + //return _kxor_mask64(a, b); + return __mmask64(a) ^ __mmask64(b); +} + +// vector operator ~ : bitwise not +static inline Vec64cb operator ~ (Vec64cb const a) { + //return _knot_mask64(a); + return ~ __mmask64(a); +} + +// vector operator ! : element not +static inline Vec64cb operator ! (Vec64cb const a) { + return ~a; +} + +// vector function andnot +static inline Vec64cb andnot (Vec64cb const a, Vec64cb const b) { + //return _kxnor_mask64(b, a); + return __mmask64(a) & ~ __mmask64(b); +} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and (Vec64cb const a) { + return int64_t(__mmask64(a)) == -(int64_t)(1); +} + +// horizontal_or. 
Returns true if at least one bit is 1 +static inline bool horizontal_or (Vec64cb const a) { + return int64_t(__mmask64(a)) != 0; +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint64_t to_bits(Vec64cb x) { + return uint64_t(__mmask64(x)); +} + + +/***************************************************************************** +* +* Define operators for Vec64c +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec64c operator + (Vec64c const a, Vec64c const b) { + return _mm512_add_epi8(a, b); +} +// vector operator += : add +static inline Vec64c & operator += (Vec64c & a, Vec64c const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec64c operator ++ (Vec64c & a, int) { + Vec64c a0 = a; + a = a + 1; + return a0; +} +// prefix operator ++ +static inline Vec64c & operator ++ (Vec64c & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec64c operator - (Vec64c const a, Vec64c const b) { + return _mm512_sub_epi8(a, b); +} +// vector operator - : unary minus +static inline Vec64c operator - (Vec64c const a) { + return _mm512_sub_epi8(_mm512_setzero_epi32(), a); +} +// vector operator -= : subtract +static inline Vec64c & operator -= (Vec64c & a, Vec64c const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec64c operator -- (Vec64c & a, int) { + Vec64c a0 = a; + a = a - 1; + return a0; +} +// prefix operator -- +static inline Vec64c & operator -- (Vec64c & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec64c operator * (Vec64c const a, Vec64c const b) { + // There is no 8-bit multiply. Split into two 16-bit multiplies + __m512i aodd = _mm512_srli_epi16(a,8); // odd numbered elements of a + __m512i bodd = _mm512_srli_epi16(b,8); // odd numbered elements of b + __m512i muleven = _mm512_mullo_epi16(a,b); // product of even numbered elements + __m512i mulodd = _mm512_mullo_epi16(aodd,bodd); // product of odd numbered elements + mulodd = _mm512_slli_epi16(mulodd,8); // put odd numbered elements back in place + __m512i product = _mm512_mask_mov_epi8(muleven, 0xAAAAAAAAAAAAAAAA, mulodd); // interleave even and odd + return product; +} + +// vector operator *= : multiply +static inline Vec64c & operator *= (Vec64c & a, Vec64c const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer. See bottom of file + +// vector operator << : shift left +static inline Vec64c operator << (Vec64c const a, int32_t b) { + uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out + __m512i am = _mm512_and_si512(a,_mm512_set1_epi8((char)mask));// remove bits that will overflow + __m512i res = _mm512_sll_epi16(am,_mm_cvtsi32_si128(b)); // 16-bit shifts + return res; +} + +// vector operator <<= : shift left +static inline Vec64c & operator <<= (Vec64c & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec64c operator >> (Vec64c const a, int32_t b) { + __m512i aeven = _mm512_slli_epi16(a, 8); // even numbered elements of a. 
get sign bit in position + aeven = _mm512_sra_epi16(aeven, _mm_cvtsi32_si128(b + 8)); // shift arithmetic, back to position + __m512i aodd = _mm512_sra_epi16(a, _mm_cvtsi32_si128(b)); // shift odd numbered elements arithmetic + __m512i res = _mm512_mask_mov_epi8(aeven, 0xAAAAAAAAAAAAAAAA, aodd);// interleave even and odd + return res; +} +// vector operator >>= : shift right arithmetic +static inline Vec64c & operator >>= (Vec64c & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec64cb operator == (Vec64c const a, Vec64c const b) { + return _mm512_cmpeq_epi8_mask(a, b); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec64cb operator != (Vec64c const a, Vec64c const b) { + return _mm512_cmpneq_epi8_mask(a, b); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec64cb operator > (Vec64c const a, Vec64c const b) { + return _mm512_cmp_epi8_mask(a, b, 6); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec64cb operator < (Vec64c const a, Vec64c const b) { + return _mm512_cmp_epi8_mask(a, b, 1); +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec64cb operator >= (Vec64c const a, Vec64c const b) { + return _mm512_cmp_epi8_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec64cb operator <= (Vec64c const a, Vec64c const b) { + return _mm512_cmp_epi8_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec64c operator & (Vec64c const a, Vec64c const b) { + return _mm512_and_epi32(a, b); +} + +// vector operator &= : bitwise and +static inline Vec64c & operator &= (Vec64c & a, Vec64c const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec64c operator | (Vec64c const a, Vec64c const b) { + return _mm512_or_epi32(a, b); +} + +// vector operator |= : bitwise or +static inline Vec64c & operator |= (Vec64c & a, Vec64c const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec64c operator ^ (Vec64c const a, Vec64c const b) { + return _mm512_xor_epi32(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec64c & operator ^= (Vec64c & a, Vec64c const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec64c operator ~ (Vec64c const a) { + return Vec64c(~ Vec16i(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec64c select (Vec64cb const s, Vec64c const a, Vec64c const b) { + return _mm512_mask_mov_epi8(b, s, a); // conditional move may be optimized better by the compiler than blend +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec64c if_add (Vec64cb const f, Vec64c const a, Vec64c const b) { + return _mm512_mask_add_epi8(a, f, a, b); +} + +// Conditional subtract +static inline Vec64c if_sub (Vec64cb const f, Vec64c const a, Vec64c const b) { + return _mm512_mask_sub_epi8(a, f, a, b); +} + +// Conditional multiply +static inline Vec64c if_mul (Vec64cb const f, Vec64c const a, Vec64c const b) { + Vec64c m = a * b; + return select(f, m, a); +} + +// Horizontal add: Calculates the sum of all vector elements. 
Overflow will wrap around +static inline int8_t horizontal_add (Vec64c const a) { + __m512i sum1 = _mm512_sad_epu8(a,_mm512_setzero_si512()); + return (int8_t)horizontal_add(Vec8q(sum1)); +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Each element is sign-extended before addition to avoid overflow +static inline int32_t horizontal_add_x (Vec64c const a) { + return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec64c add_saturated(Vec64c const a, Vec64c const b) { + return _mm512_adds_epi8(a, b); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec64c sub_saturated(Vec64c const a, Vec64c const b) { + return _mm512_subs_epi8(a, b); +} + +// function max: a > b ? a : b +static inline Vec64c max(Vec64c const a, Vec64c const b) { + return _mm512_max_epi8(a,b); +} + +// function min: a < b ? a : b +static inline Vec64c min(Vec64c const a, Vec64c const b) { + return _mm512_min_epi8(a,b); + +} + +// function abs: a >= 0 ? a : -a +static inline Vec64c abs(Vec64c const a) { + return _mm512_abs_epi8(a); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec64c abs_saturated(Vec64c const a) { + return _mm512_min_epu8(abs(a), Vec64c(0x7F)); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec64c rotate_left(Vec64c const a, int b) { + uint8_t mask = 0xFFu << b; // mask off overflow bits + __m512i m = _mm512_set1_epi8(mask); + __m128i bb = _mm_cvtsi32_si128(b & 7); // b modulo 8 + __m128i mbb = _mm_cvtsi32_si128((- b) & 7); // 8-b modulo 8 + __m512i left = _mm512_sll_epi16(a, bb); // a << b + __m512i right = _mm512_srl_epi16(a, mbb); // a >> 8-b + left = _mm512_and_si512(m, left); // mask off overflow bits + right = _mm512_andnot_si512(m, right); + return _mm512_or_si512(left, right); // combine left and right shifted bits +} + + +/***************************************************************************** +* +* Vector of 64 8-bit unsigned integers +* +*****************************************************************************/ + +class Vec64uc : public Vec64c { +public: + // Default constructor: + Vec64uc() { + } + // Constructor to broadcast the same value into all elements: + Vec64uc(uint8_t i) { + zmm = _mm512_set1_epi8((int8_t)i); + } + // Constructor to build from all elements: + Vec64uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, + uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, + uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23, + uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31, + uint8_t i32, uint8_t i33, uint8_t i34, uint8_t i35, uint8_t i36, uint8_t i37, uint8_t i38, uint8_t i39, + uint8_t i40, uint8_t i41, uint8_t i42, uint8_t i43, uint8_t i44, uint8_t i45, uint8_t i46, uint8_t i47, + uint8_t i48, uint8_t i49, uint8_t i50, uint8_t i51, uint8_t i52, uint8_t i53, uint8_t i54, uint8_t i55, + uint8_t i56, uint8_t i57, uint8_t i58, uint8_t i59, uint8_t i60, uint8_t i61, uint8_t i62, uint8_t i63) + : Vec64c(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31, + i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, 
i43, i44, i45, i46, i47,
+ i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63) {}
+
+ // Constructor to build from two Vec32uc:
+ Vec64uc(Vec32uc const a0, Vec32uc const a1) {
+ zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
+ }
+ // Constructor to convert from type __m512i used in intrinsics:
+ Vec64uc(__m512i const x) {
+ zmm = x;
+ }
+ // Assignment operator to convert from type __m512i used in intrinsics:
+ Vec64uc & operator = (__m512i const x) {
+ zmm = x;
+ return *this;
+ }
+ // Member function to load from array (unaligned)
+ Vec64uc & load(void const * p) {
+ Vec64c::load(p);
+ return *this;
+ }
+ // Member function to load from array, aligned by 64
+ Vec64uc & load_a(void const * p) {
+ Vec64c::load_a(p);
+ return *this;
+ }
+ // Member function to change a single element in vector
+ Vec64uc const insert(int index, uint8_t value) {
+ Vec64c::insert(index, (int8_t)value);
+ return *this;
+ }
+ // Member function extract a single element from vector
+ uint8_t extract(int index) const {
+ return (uint8_t)Vec64c::extract(index);
+ }
+ // Extract a single element. Use store function if extracting more than one element.
+ // Operator [] can only read an element, not write.
+ uint8_t operator [] (int index) const {
+ return (uint8_t)Vec64c::extract(index);
+ }
+ // Member functions to split into two Vec32uc:
+ Vec32uc get_low() const {
+ return Vec32uc(Vec64c::get_low());
+ }
+ Vec32uc get_high() const {
+ return Vec32uc(Vec64c::get_high());
+ }
+ static constexpr int elementtype() {
+ return 5;
+ }
+};
+
+// Define operators for this class
+
+// vector operator + : add element by element
+static inline Vec64uc operator + (Vec64uc const a, Vec64uc const b) {
+ return _mm512_add_epi8(a, b);
+}
+
+// vector operator - : subtract element by element
+static inline Vec64uc operator - (Vec64uc const a, Vec64uc const b) {
+ return _mm512_sub_epi8(a, b);
+}
+
+// vector operator * : multiply element by element
+static inline Vec64uc operator * (Vec64uc const a, Vec64uc const b) {
+ return Vec64uc(Vec64c(a) * Vec64c(b));
+}
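+
+// Usage sketch (illustrative, not part of this patch): elementwise wrap-around
+// multiply of two 64-byte blocks with the Vec64uc operators defined above.
+// The helper name mul_bytes64 is hypothetical; assumes this header is included
+// and the target supports AVX512BW.
+// static inline void mul_bytes64(const uint8_t * a, const uint8_t * b, uint8_t * r) {
+//     Vec64uc va, vb;
+//     va.load(a);              // unaligned load of 64 bytes
+//     vb.load(b);
+//     Vec64uc vr = va * vb;    // 8-bit multiply, overflow wraps around
+//     vr.store(r);             // store 64 results
+// }
+
+// vector operator / : divide. 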
See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec64uc operator >> (Vec64uc const a, uint32_t b) { + uint32_t mask = (uint32_t)0xFF << (uint32_t)b; // mask to remove bits that are shifted out + __m512i am = _mm512_and_si512(a,_mm512_set1_epi8((char)mask)); // remove bits that will overflow + __m512i res = _mm512_srl_epi16(am,_mm_cvtsi32_si128((int32_t)b));// 16-bit shifts + return res; +} +static inline Vec64uc operator >> (Vec64uc const a, int b) { + return a >> (uint32_t)b; +} + +// vector operator >>= : shift right logical +static inline Vec64uc & operator >>= (Vec64uc & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >>= : shift right logical (signed b) +static inline Vec64uc & operator >>= (Vec64uc & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec64uc operator << (Vec64uc const a, uint32_t b) { + return Vec64uc(Vec64c(a) << int32_t(b)); +} +static inline Vec64uc operator << (Vec64uc const a, int b) { + return a << (uint32_t)b; +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec64cb operator < (Vec64uc const a, Vec64uc const b) { + return _mm512_cmp_epu8_mask(a, b, 1); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec64cb operator > (Vec64uc const a, Vec64uc const b) { + return _mm512_cmp_epu8_mask(a, b, 6); +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec64cb operator >= (Vec64uc const a, Vec64uc const b) { + return _mm512_cmp_epu8_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec64cb operator <= (Vec64uc const a, Vec64uc const b) { + return _mm512_cmp_epu8_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec64uc operator & (Vec64uc const a, Vec64uc const b) { + return Vec64uc(Vec64c(a) & Vec64c(b)); +} + +// vector operator | : bitwise or +static inline Vec64uc operator | (Vec64uc const a, Vec64uc const b) { + return Vec64uc(Vec64c(a) | Vec64c(b)); +} + +// vector operator ^ : bitwise xor +static inline Vec64uc operator ^ (Vec64uc const a, Vec64uc const b) { + return Vec64uc(Vec64c(a) ^ Vec64c(b)); +} + +// vector operator ~ : bitwise not +static inline Vec64uc operator ~ (Vec64uc const a) { + return Vec64uc( ~ Vec64c(a)); +} + + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec64uc select (Vec64cb const s, Vec64uc const a, Vec64uc const b) { + return Vec64uc(select(s, Vec64c(a), Vec64c(b))); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] +static inline Vec64uc if_add (Vec64cb const f, Vec64uc const a, Vec64uc const b) { + return _mm512_mask_add_epi8(a, f, a, b); +} + +// Conditional subtract +static inline Vec64uc if_sub (Vec64cb const f, Vec64uc const a, Vec64uc const b) { + return _mm512_mask_sub_epi8(a, f, a, b); +} + +// Conditional multiply +static inline Vec64uc if_mul (Vec64cb const f, Vec64uc const a, Vec64uc const b) { + Vec64uc m = a * b; + return select(f, m, a); +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec64uc add_saturated(Vec64uc const a, Vec64uc const b) { + return _mm512_adds_epu8(a, b); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec64uc sub_saturated(Vec64uc const a, Vec64uc const b) { + return _mm512_subs_epu8(a, b); +} + +// function max: a > b ? a : b +static inline Vec64uc max(Vec64uc const a, Vec64uc const b) { + return _mm512_max_epu8(a,b); +} + +// function min: a < b ? a : b +static inline Vec64uc min(Vec64uc const a, Vec64uc const b) { + return _mm512_min_epu8(a,b); +} + + +/***************************************************************************** +* +* Vector of 32 16-bit signed integers +* +*****************************************************************************/ + +class Vec32s: public Vec512b { +public: + // Default constructor: + Vec32s() { + } + // Constructor to broadcast the same value into all elements: + Vec32s(int16_t i) { + zmm = _mm512_set1_epi16(i); + } + // Constructor to build from all elements: + Vec32s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7, + int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15, + int16_t i16, int16_t i17, int16_t i18, int16_t i19, int16_t i20, int16_t i21, int16_t i22, int16_t i23, + int16_t i24, int16_t i25, int16_t i26, int16_t i27, int16_t i28, int16_t i29, int16_t i30, int16_t i31) { +#if true + // _mm512_set_epi16 missing in GCC 7.4.0. This may be more efficient after all: + int16_t aa[32] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 }; + load(aa); +#else + zmm = _mm512_set_epi16( + i31, i30, i29, i28, i27, i26, i25, i24, i23, i22, i21, i20, i19, i18, i17, i16, + i15, i14, i13, i12, i11, i10, i9, i8, i7, i6, i5, i4, i3, i2, i1, i0); +#endif + } + // Constructor to build from two Vec16s: + Vec32s(Vec16s const a0, Vec16s const a1) { + zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); + } + // Constructor to convert from type __m512i used in intrinsics: + Vec32s(__m512i const x) { + zmm = x; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec32s & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Type cast operator to convert to __m512i used in intrinsics + operator __m512i() const { + return zmm; + } + // Member function to load from array (unaligned) + Vec32s & load(void const * p) { + zmm = _mm512_loadu_si512(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec32s & load_a(void const * p) { + zmm = _mm512_load_si512(p); + return *this; + } + // Member function to load 32 unsigned 8-bit integers from array + Vec32s & load_32uc(void const * p) { + zmm = _mm512_cvtepu8_epi16(Vec32uc().load(p)); + return *this; + } + // Partial load. 
Load n elements and set the rest to 0
+ Vec32s & load_partial(int n, void const * p) {
+ zmm = _mm512_maskz_loadu_epi16(__mmask32(((uint64_t)1 << n) - 1), p);
+ return *this;
+ }
+ // Partial store. Store n elements
+ void store_partial(int n, void * p) const {
+ _mm512_mask_storeu_epi16(p, __mmask32(((uint64_t)1 << n) - 1), zmm);
+ }
+ // cut off vector to n elements. The last 32-n elements are set to zero
+ Vec32s & cutoff(int n) {
+ zmm = _mm512_maskz_mov_epi16(__mmask32(((uint64_t)1 << n) - 1), zmm);
+ return *this;
+ }
+ // Member function to change a single element in vector
+ Vec32s const insert(int index, int16_t value) {
+ zmm = _mm512_mask_set1_epi16(zmm, __mmask64((uint64_t)1 << index), value);
+ return *this;
+ }
+ // Member function extract a single element from vector
+ int16_t extract(int index) const {
+#if INSTRSET >= 10 && defined (__AVX512VBMI2__)
+ __m512i x = _mm512_maskz_compress_epi16(__mmask32(1u << index), zmm);
+ return (int16_t)_mm_cvtsi128_si32(_mm512_castsi512_si128(x));
+#else
+ int16_t a[32];
+ store(a);
+ return a[index & 31];
+#endif
+ }
+ // Extract a single element. Use store function if extracting more than one element.
+ // Operator [] can only read an element, not write.
+ int16_t operator [] (int index) const {
+ return extract(index);
+ }
+ // Member functions to split into two Vec16s:
+ Vec16s get_low() const {
+ return _mm512_castsi512_si256(zmm);
+ }
+ Vec16s get_high() const {
+ return _mm512_extracti64x4_epi64(zmm,1);
+ }
+ static constexpr int size() {
+ return 32;
+ }
+ static constexpr int elementtype() {
+ return 6;
+ }
+};
+
+
+/*****************************************************************************
+*
+* Vec32sb: Vector of 32 Booleans for use with Vec32s and Vec32us
+*
+*****************************************************************************/
+
+typedef Vec32b Vec32sb; // compact boolean vector
+
+
+/*****************************************************************************
+*
+* Define operators for Vec32s
+*
+*****************************************************************************/
+
+// vector operator + : add element by element
+static inline Vec32s operator + (Vec32s const a, Vec32s const b) {
+ return _mm512_add_epi16(a, b);
+}
+// vector operator += : add
+static inline Vec32s & operator += (Vec32s & a, Vec32s const b) {
+ a = a + b;
+ return a;
+}
+
+// postfix operator ++
+static inline Vec32s operator ++ (Vec32s & a, int) {
+ Vec32s a0 = a;
+ a = a + 1;
+ return a0;
+}
+// prefix operator ++
+static inline Vec32s & operator ++ (Vec32s & a) {
+ a = a + 1;
+ return a;
+}
+
+// vector operator - : subtract element by element
+static inline Vec32s operator - (Vec32s const a, Vec32s const b) {
+ return _mm512_sub_epi16(a, b);
+}
+// vector operator - : unary minus
+static inline Vec32s operator - (Vec32s const a) {
+ return _mm512_sub_epi16(_mm512_setzero_epi32(), a);
+}
+// vector operator -= : subtract
+static inline Vec32s & operator -= (Vec32s & a, Vec32s const b) {
+ a = a - b;
+ return a;
+}
+
+// postfix operator --
+static inline Vec32s operator -- (Vec32s & a, int) {
+ Vec32s a0 = a;
+ a = a - 1;
+ return a0;
+}
+// prefix operator --
+static inline Vec32s & operator -- (Vec32s & a) {
+ a = a - 1;
+ return a;
+}
+
+// vector operator * : multiply element by element
+static inline Vec32s operator * (Vec32s const a, Vec32s const b) {
+ return _mm512_mullo_epi16(a, b);
+}
+
+// vector operator *= : multiply
+static inline Vec32s & operator *= (Vec32s & a, Vec32s const b) {
+ a = a * b;
+ return a;
+}
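+
+// Usage sketch (illustrative, not part of this patch): elementwise a*b + c on
+// 32 int16_t lanes using the Vec32s operators defined above (wrap-around
+// arithmetic). The helper name mul_add_16 is hypothetical; assumes this header
+// is included.
+// static inline void mul_add_16(const int16_t * a, const int16_t * b,
+//                               const int16_t * c, int16_t * r) {
+//     Vec32s va, vb, vc;
+//     va.load(a); vb.load(b); vc.load(c);   // unaligned loads, 32 elements each
+//     (va * vb + vc).store(r);              // mullo_epi16 followed by add_epi16
+// }
+
+// 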
vector operator / : divide all elements by same integer. See bottom of file + +// vector operator << : shift left +static inline Vec32s operator << (Vec32s const a, int32_t b) { + return _mm512_sll_epi16(a, _mm_cvtsi32_si128(b)); +} +// vector operator <<= : shift left +static inline Vec32s & operator <<= (Vec32s & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec32s operator >> (Vec32s const a, int32_t b) { + return _mm512_sra_epi16(a, _mm_cvtsi32_si128(b)); +} +// vector operator >>= : shift right arithmetic +static inline Vec32s & operator >>= (Vec32s & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec32sb operator == (Vec32s const a, Vec32s const b) { + return _mm512_cmpeq_epi16_mask(a, b); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec32sb operator != (Vec32s const a, Vec32s const b) { + return _mm512_cmpneq_epi16_mask(a, b); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec32sb operator > (Vec32s const a, Vec32s const b) { + return _mm512_cmp_epi16_mask(a, b, 6); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec32sb operator < (Vec32s const a, Vec32s const b) { + return _mm512_cmp_epi16_mask(a, b, 1); +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec32sb operator >= (Vec32s const a, Vec32s const b) { + return _mm512_cmp_epi16_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec32sb operator <= (Vec32s const a, Vec32s const b) { + return _mm512_cmp_epi16_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec32s operator & (Vec32s const a, Vec32s const b) { + return _mm512_and_epi32(a, b); +} + +// vector operator &= : bitwise and +static inline Vec32s & operator &= (Vec32s & a, Vec32s const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec32s operator | (Vec32s const a, Vec32s const b) { + return _mm512_or_epi32(a, b); +} + +// vector operator |= : bitwise or +static inline Vec32s & operator |= (Vec32s & a, Vec32s const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec32s operator ^ (Vec32s const a, Vec32s const b) { + return _mm512_xor_epi32(a, b); +} + +// vector operator ^= : bitwise xor +static inline Vec32s & operator ^= (Vec32s & a, Vec32s const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec32s operator ~ (Vec32s const a) { + return Vec32s(~ Vec16i(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec32s select (Vec32sb const s, Vec32s const a, Vec32s const b) { + return _mm512_mask_mov_epi16(b, s, a); // conditional move may be optimized better by the compiler than blend +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] +static inline Vec32s if_add (Vec32sb const f, Vec32s const a, Vec32s const b) { + return _mm512_mask_add_epi16(a, f, a, b); +} + +// Conditional subtract +static inline Vec32s if_sub (Vec32sb const f, Vec32s const a, Vec32s const b) { + return _mm512_mask_sub_epi16(a, f, a, b); +} + +// Conditional multiply +static inline Vec32s if_mul (Vec32sb const f, Vec32s const a, Vec32s const b) { + return _mm512_mask_mullo_epi16(a, f, a, b); +} + +// Horizontal add: Calculates the sum of all vector elements. +// Overflow will wrap around +static inline int16_t horizontal_add (Vec32s const a) { + Vec16s s = a.get_low() + a.get_high(); + return horizontal_add(s); +} + +// Horizontal add extended: Calculates the sum of all vector elements. +// Each element is sign-extended before addition to avoid overflow +static inline int32_t horizontal_add_x (Vec32s const a) { + return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec32s add_saturated(Vec32s const a, Vec32s const b) { + return _mm512_adds_epi16(a, b); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec32s sub_saturated(Vec32s const a, Vec32s const b) { + return _mm512_subs_epi16(a, b); +} + +// function max: a > b ? a : b +static inline Vec32s max(Vec32s const a, Vec32s const b) { + return _mm512_max_epi16(a,b); +} + +// function min: a < b ? a : b +static inline Vec32s min(Vec32s const a, Vec32s const b) { + return _mm512_min_epi16(a,b); +} + +// function abs: a >= 0 ? a : -a +static inline Vec32s abs(Vec32s const a) { + return _mm512_abs_epi16(a); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec32s abs_saturated(Vec32s const a) { + return _mm512_min_epu16(abs(a), Vec32s(0x7FFF)); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec32s rotate_left(Vec32s const a, int b) { + __m512i left = _mm512_sll_epi16(a,_mm_cvtsi32_si128(b & 0xF)); // a << b + __m512i right = _mm512_srl_epi16(a,_mm_cvtsi32_si128((16-b) & 0xF)); // a >> (32 - b) + __m512i rot = _mm512_or_si512(left,right); // or + return rot; +} + + +/***************************************************************************** +* +* Vector of 32 16-bit unsigned integers +* +*****************************************************************************/ + +class Vec32us : public Vec32s { +public: + // Default constructor: + Vec32us() { + } + // Constructor to broadcast the same value into all elements: + Vec32us(uint16_t i) { + zmm = _mm512_set1_epi16((int16_t)i); + } + // Constructor to build from all elements. 
Inherit from Vec32s + Vec32us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7, + uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15, + uint16_t i16, uint16_t i17, uint16_t i18, uint16_t i19, uint16_t i20, uint16_t i21, uint16_t i22, uint16_t i23, + uint16_t i24, uint16_t i25, uint16_t i26, uint16_t i27, uint16_t i28, uint16_t i29, uint16_t i30, uint16_t i31) + : Vec32s(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31) {} + + // Constructor to build from two Vec16us: + Vec32us(Vec16us const a0, Vec16us const a1) { + zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1); + } + // Constructor to convert from type __m512i used in intrinsics: + Vec32us(__m512i const x) { + zmm = x; + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec32us & operator = (__m512i const x) { + zmm = x; + return *this; + } + // Member function to load from array (unaligned) + Vec32us & load(void const * p) { + Vec32s::load(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec32us & load_a(void const * p) { + Vec32s::load_a(p); + return *this; + } + // Member function to change a single element in vector + Vec32us const insert(int index, uint16_t value) { + Vec32s::insert(index, (int16_t)value); + return *this; + } + // Member function extract a single element from vector + uint16_t extract(int index) const { + return (uint16_t)Vec32s::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint16_t operator [] (int index) const { + return (uint16_t)Vec32s::extract(index); + } + // Member functions to split into two Vec16us: + Vec16us get_low() const { + return Vec16us(Vec32s::get_low()); + } + Vec16us get_high() const { + return Vec16us(Vec32s::get_high()); + } + static constexpr int elementtype() { + return 7; + } +}; + +// Define operators for this class + +// vector operator + : add element by element +static inline Vec32us operator + (Vec32us const a, Vec32us const b) { + return _mm512_add_epi16(a, b); +} + +// vector operator - : subtract element by element +static inline Vec32us operator - (Vec32us const a, Vec32us const b) { + return _mm512_sub_epi16(a, b); +} + +// vector operator * : multiply element by element +static inline Vec32us operator * (Vec32us const a, Vec32us const b) { + return _mm512_mullo_epi16(a, b); +} + +// vector operator / : divide +// See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec32us operator >> (Vec32us const a, uint32_t b) { + return _mm512_srl_epi16(a, _mm_cvtsi32_si128((int)b)); +} +static inline Vec32us operator >> (Vec32us const a, int b) { + return a >> uint32_t(b); +} + +// vector operator >>= : shift right logical +static inline Vec32us & operator >>= (Vec32us & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >>= : shift right logical (signed b) +static inline Vec32us & operator >>= (Vec32us & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec32us operator << (Vec32us const a, uint32_t b) { + return _mm512_sll_epi16(a, _mm_cvtsi32_si128((int)b)); +} +static inline Vec32us operator << (Vec32us const a, int b) { + return a << uint32_t(b); +} + +// vector 
operator < : returns true for elements for which a < b (unsigned) +static inline Vec32sb operator < (Vec32us const a, Vec32us const b) { + return _mm512_cmp_epu16_mask(a, b, 1); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec32sb operator > (Vec32us const a, Vec32us const b) { + return _mm512_cmp_epu16_mask(a, b, 6); +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec32sb operator >= (Vec32us const a, Vec32us const b) { + return _mm512_cmp_epu16_mask(a, b, 5); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec32sb operator <= (Vec32us const a, Vec32us const b) { + return _mm512_cmp_epu16_mask(a, b, 2); +} + +// vector operator & : bitwise and +static inline Vec32us operator & (Vec32us const a, Vec32us const b) { + return Vec32us(Vec32s(a) & Vec32s(b)); +} + +// vector operator | : bitwise or +static inline Vec32us operator | (Vec32us const a, Vec32us const b) { + return Vec32us(Vec32s(a) | Vec32s(b)); +} + +// vector operator ^ : bitwise xor +static inline Vec32us operator ^ (Vec32us const a, Vec32us const b) { + return Vec32us(Vec32s(a) ^ Vec32s(b)); +} + +// vector operator ~ : bitwise not +static inline Vec32us operator ~ (Vec32us const a) { + return Vec32us( ~ Vec32s(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec32us select (Vec32sb const s, Vec32us const a, Vec32us const b) { + return Vec32us(select(s, Vec32s(a), Vec32s(b))); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec32us if_add (Vec32sb const f, Vec32us const a, Vec32us const b) { + return _mm512_mask_add_epi16(a, f, a, b); +} + +// Conditional subtract +static inline Vec32us if_sub (Vec32sb const f, Vec32us const a, Vec32us const b) { + return _mm512_mask_sub_epi16(a, f, a, b); +} + +// Conditional multiply +static inline Vec32us if_mul (Vec32sb const f, Vec32us const a, Vec32us const b) { + return _mm512_mask_mullo_epi16(a, f, a, b); +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec32us add_saturated(Vec32us const a, Vec32us const b) { + return _mm512_adds_epu16(a, b); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec32us sub_saturated(Vec32us const a, Vec32us const b) { + return _mm512_subs_epu16(a, b); +} + +// function max: a > b ? a : b +static inline Vec32us max(Vec32us const a, Vec32us const b) { + return _mm512_max_epu16(a,b); +} + +// function min: a < b ? a : b +static inline Vec32us min(Vec32us const a, Vec32us const b) { + return _mm512_min_epu16(a,b); +} + + +/***************************************************************************** +* +* Vector permute functions +* +****************************************************************************** +* +* These permute functions can reorder the elements of a vector and optionally +* set some elements to zero. See Vectori128.h for description +* +*****************************************************************************/ + +// Permute vector of 32 16-bit integers. +// Index -1 gives 0, index V_DC means don't care. +template + static inline Vec32s permute32(Vec32s const a) { + int constexpr indexs[32] = { i0... 
}; + __m512i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert(sizeof... (i0) == 32, "permute32 must have 32 indexes"); + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) return _mm512_setzero_si512(); // just return zero + + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<32>(indexs); // permutation pattern + y = permute16 (Vec16i(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else if constexpr ((flags & perm_same_pattern) != 0) { // same pattern in all lanes + if constexpr ((flags & perm_rotate) != 0) { // fits palignr. rotate within lanes + y = _mm512_alignr_epi8(a, a, (flags >> perm_rot_count) & 0xF); + } + else { // use pshufb + const EList bm = pshufb_mask(indexs); + return _mm512_shuffle_epi8(a, Vec32s().load(bm.a)); + } + } + else { // different patterns in all lanes + if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. Use pshufb + const EList bm = pshufb_mask(indexs); + return _mm512_shuffle_epi8(a, Vec32s().load(bm.a)); + } + else if constexpr ((flags & perm_rotate_big) != 0) {// fits full rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count) * 2; // rotate count + constexpr uint8_t r1 = (rot >> 4 << 1) & 7; + constexpr uint8_t r2 = (r1 + 2) & 7; + __m512i y1 = a, y2 = a; + if constexpr (r1 != 0) y1 = _mm512_alignr_epi64 (a, a, r1); // rotate 128-bit blocks + if constexpr (r2 != 0) y2 = _mm512_alignr_epi64 (a, a, r2); // rotate 128-bit blocks + y = _mm512_alignr_epi8(y2, y1, rot & 15); + } + else if constexpr ((flags & perm_broadcast) != 0 && (flags >> perm_rot_count) == 0) { + y = _mm512_broadcastw_epi16(_mm512_castsi512_si128(y)); // broadcast first element + } + else if constexpr ((flags & perm_zext) != 0) { // fits zero extension + y = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(y)); // zero extension + if constexpr ((flags & perm_addz2) == 0) return y; + } +#if defined (__AVX512VBMI2__) + else if constexpr ((flags & perm_compress) != 0) { + y = _mm512_maskz_compress_epi16(__mmask32(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm512_maskz_expand_epi16(__mmask32(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif // AVX512VBMI2 + else { // full permute needed + const EList bm = perm_mask_broad(indexs); + y = _mm512_permutexvar_epi16 (Vec32s().load(bm.a), y); + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi16(zero_mask<32>(indexs), y); + } + return y; +} + +template + static inline Vec32us permute32(Vec32us const a) { + return Vec32us (permute32 (Vec32s(a))); +} + + +// Permute vector of 64 8-bit integers. +// Index -1 gives 0, index V_DC means don't care. +template +static inline Vec64c permute64(Vec64c const a) { + int constexpr indexs[64] = { i0... }; + __m512i y = a; // result + // get flags for possibilities that fit the permutation pattern + constexpr uint64_t flags = perm_flags(indexs); + + static_assert(sizeof... 
(i0) == 64, "permute64 must have 64 indexes"); + static_assert((flags & perm_outofrange) == 0, "Index out of range in permute function"); + + if constexpr ((flags & perm_allzero) != 0) { + return _mm512_setzero_si512(); // just return zero + } + if constexpr ((flags & perm_perm) != 0) { // permutation needed + + if constexpr ((flags & perm_largeblock) != 0) { // use larger permutation + constexpr EList L = largeblock_perm<64>(indexs); // permutation pattern + y = permute32 < + L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], + L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15], + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31]> + (Vec32s(a)); + if (!(flags & perm_addz)) return y; // no remaining zeroing + } + else { + if constexpr ((flags & perm_cross_lane) == 0) { // no lane crossing. Use pshufb + const EList bm = pshufb_mask(indexs); + return _mm512_shuffle_epi8(a, Vec64c().load(bm.a)); + } + else if constexpr ((flags & perm_rotate_big) != 0) { // fits full rotate + constexpr uint8_t rot = uint8_t(flags >> perm_rot_count); // rotate count + constexpr uint8_t r1 = (rot >> 4 << 1) & 7; + constexpr uint8_t r2 = (r1 + 2) & 7; + __m512i y1 = a, y2 = a; + if constexpr (r1 != 0) y1 = _mm512_alignr_epi64(y, y, r1);// rotate 128-bit blocks + if constexpr (r2 != 0) y2 = _mm512_alignr_epi64(a, a, r2);// rotate 128-bit blocks + y = _mm512_alignr_epi8(y2, y1, rot & 15); + } + else if constexpr ((flags & perm_broadcast) != 0 && (flags >> perm_rot_count) == 0) { + y = _mm512_broadcastb_epi8(_mm512_castsi512_si128(y)); // broadcast first element + } + else if constexpr ((flags & perm_zext) != 0) { // fits zero extension + y = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(y)); // zero extension + if constexpr ((flags & perm_addz2) == 0) return y; + } +#if defined (__AVX512VBMI2__) + else if constexpr ((flags & perm_compress) != 0) { + y = _mm512_maskz_compress_epi8(__mmask64(compress_mask(indexs)), y); // compress + if constexpr ((flags & perm_addz2) == 0) return y; + } + else if constexpr ((flags & perm_expand) != 0) { + y = _mm512_maskz_expand_epi8(__mmask64(expand_mask(indexs)), y); // expand + if constexpr ((flags & perm_addz2) == 0) return y; + } +#endif // AVX512VBMI2 + else { // full permute needed +#ifdef __AVX512VBMI__ // full permute instruction available + const EList bm = perm_mask_broad(indexs); + y = _mm512_permutexvar_epi8(Vec64c().load(bm.a), y); +#else + // There is no 8-bit full permute. Use 16-bit permute + // getevenmask: get permutation mask for destination bytes with even position + auto getevenmask = [](int const (&indexs)[64]) constexpr { + EList u = {{0}}; // list to return + for (int i = 0; i < 64; i += 2) { // loop through even indexes + uint16_t ix = indexs[i] & 63; + // source bytes with odd position are in opposite 16-bit word becase of 32-bit rotation + u.a[i>>1] = ((ix >> 1) ^ (ix & 1)) | (((ix & 1) ^ 1) << 5); + } + return u; + }; + // getoddmask: get permutation mask for destination bytes with odd position + auto getoddmask = [](int const (&indexs)[64]) constexpr { + EList u = {{0}}; // list to return + for (int i = 1; i < 64; i += 2) { // loop through odd indexes + uint16_t ix = indexs[i] & 63; + u.a[i>>1] = (ix >> 1) | ((ix & 1) << 5); + } + return u; + }; + EList evenmask = getevenmask(indexs); + EList oddmask = getoddmask (indexs); + // Rotate to get odd bytes into even position, and vice versa. 
+ // There is no 16-bit rotate, use 32-bit rotate. + // The wrong position of the odd bytes is compensated for in getevenmask + __m512i ro = _mm512_rol_epi32 (a, 8); // rotate + __m512i yeven = _mm512_permutex2var_epi16(ro, Vec32s().load(evenmask.a), a); // destination bytes with even position + __m512i yodd = _mm512_permutex2var_epi16(ro, Vec32s().load(oddmask.a), a); // destination bytes with odd position + __mmask64 maske = 0x5555555555555555; // mask for even position + y = _mm512_mask_mov_epi8(yodd, maske, yeven); // interleave even and odd position bytes +#endif + } + } + } + if constexpr ((flags & perm_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi8(zero_mask<64>(indexs), y); + } + return y; +} + +template +static inline Vec64uc permute64(Vec64uc const a) { + return Vec64uc(permute64(Vec64c(a))); +} + + +/***************************************************************************** +* +* Vector blend functions +* +*****************************************************************************/ + +// permute and blend Vec32s +template +static inline Vec32s blend32(Vec32s const a, Vec32s const b) { + int constexpr indexs[32] = { i0 ... }; // indexes as array + static_assert(sizeof... (i0) == 32, "blend32 must have 32 indexes"); + __m512i y = a; // result + constexpr uint64_t flags = blend_flags(indexs);// get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm512_setzero_si512(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute32 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<32, 2>(indexs); // get permutation indexes + return permute32 < + L.a[32], L.a[33], L.a[34], L.a[35], L.a[36], L.a[37], L.a[38], L.a[39], + L.a[40], L.a[41], L.a[42], L.a[43], L.a[44], L.a[45], L.a[46], L.a[47], + L.a[48], L.a[49], L.a[50], L.a[51], L.a[52], L.a[53], L.a[54], L.a[55], + L.a[56], L.a[57], L.a[58], L.a[59], L.a[60], L.a[61], L.a[62], L.a[63] > (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint32_t mb = (uint32_t)make_bit_mask<32, 0x305>(indexs); // blend mask + y = _mm512_mask_mov_epi16(a, mb, b); + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 32-bit blocks + constexpr EList L = largeblock_perm<32>(indexs); // get 32-bit blend pattern + y = blend16 + (Vec16i(a), Vec16i(b)); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else { // No special cases + const EList bm = perm_mask_broad(indexs); // full permute + y = _mm512_permutex2var_epi16(a, Vec32s().load(bm.a), b); + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi16(zero_mask<32>(indexs), y); + } + return y; +} + +template + static inline Vec32us blend32(Vec32us const a, Vec32us const b) { + return Vec32us(blend32 (Vec32s(a),Vec32s(b))); +} + + // permute and blend Vec64c +template +static inline Vec64c blend64(Vec64c const a, Vec64c const b) { + int constexpr indexs[64] = { i0 ... }; // indexes as array + static_assert(sizeof... 
(i0) == 64, "blend64 must have 64 indexes"); + __m512i y = a; // result + constexpr uint64_t flags = blend_flags(indexs);// get flags for possibilities that fit the index pattern + + static_assert((flags & blend_outofrange) == 0, "Index out of range in blend function"); + + if constexpr ((flags & blend_allzero) != 0) return _mm512_setzero_si512(); // just return zero + + if constexpr ((flags & blend_b) == 0) { // nothing from b. just permute a + return permute64 (a); + } + if constexpr ((flags & blend_a) == 0) { // nothing from a. just permute b + constexpr EList L = blend_perm_indexes<64, 2>(indexs); // get permutation indexes + return permute64 < + L.a[64], L.a[65], L.a[66], L.a[67], L.a[68], L.a[69], L.a[70], L.a[71], + L.a[72], L.a[73], L.a[74], L.a[75], L.a[76], L.a[77], L.a[78], L.a[79], + L.a[80], L.a[81], L.a[82], L.a[83], L.a[84], L.a[85], L.a[86], L.a[87], + L.a[88], L.a[89], L.a[90], L.a[91], L.a[92], L.a[93], L.a[94], L.a[95], + L.a[96], L.a[97], L.a[98], L.a[99], L.a[100], L.a[101], L.a[102], L.a[103], + L.a[104], L.a[105], L.a[106], L.a[107], L.a[108], L.a[109], L.a[110], L.a[111], + L.a[112], L.a[113], L.a[114], L.a[115], L.a[116], L.a[117], L.a[118], L.a[119], + L.a[120], L.a[121], L.a[122], L.a[123], L.a[124], L.a[125], L.a[126], L.a[127] + > (b); + } + if constexpr ((flags & (blend_perma | blend_permb)) == 0) { // no permutation, only blending + constexpr uint64_t mb = make_bit_mask<64, 0x306>(indexs); // blend mask + y = _mm512_mask_mov_epi8(a, mb, b); + } + else if constexpr ((flags & blend_largeblock) != 0) { // blend and permute 16-bit blocks + constexpr EList L = largeblock_perm<64>(indexs); // get 16-bit blend pattern + y = blend32 < + L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], + L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15], + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31] + > (Vec32s(a), Vec32s(b)); + if (!(flags & blend_addz)) return y; // no remaining zeroing + } + else { // No special cases +#ifdef __AVX512VBMI__ // AVX512VBMI + const EList bm = perm_mask_broad(indexs); // full permute + y = _mm512_permutex2var_epi8(a, Vec64c().load(bm.a), b); +#else // split into two permutes + constexpr EList L = blend_perm_indexes<64, 0> (indexs); + __m512i ya = permute64 < + L.a[0], L.a[1], L.a[2], L.a[3], L.a[4], L.a[5], L.a[6], L.a[7], + L.a[8], L.a[9], L.a[10], L.a[11], L.a[12], L.a[13], L.a[14], L.a[15], + L.a[16], L.a[17], L.a[18], L.a[19], L.a[20], L.a[21], L.a[22], L.a[23], + L.a[24], L.a[25], L.a[26], L.a[27], L.a[28], L.a[29], L.a[30], L.a[31], + L.a[32], L.a[33], L.a[34], L.a[35], L.a[36], L.a[37], L.a[38], L.a[39], + L.a[40], L.a[41], L.a[42], L.a[43], L.a[44], L.a[45], L.a[46], L.a[47], + L.a[48], L.a[49], L.a[50], L.a[51], L.a[52], L.a[53], L.a[54], L.a[55], + L.a[56], L.a[57], L.a[58], L.a[59], L.a[60], L.a[61], L.a[62], L.a[63] + > (a); + __m512i yb = permute64 < + L.a[64], L.a[65], L.a[66], L.a[67], L.a[68], L.a[69], L.a[70], L.a[71], + L.a[72], L.a[73], L.a[74], L.a[75], L.a[76], L.a[77], L.a[78], L.a[79], + L.a[80], L.a[81], L.a[82], L.a[83], L.a[84], L.a[85], L.a[86], L.a[87], + L.a[88], L.a[89], L.a[90], L.a[91], L.a[92], L.a[93], L.a[94], L.a[95], + L.a[96], L.a[97], L.a[98], L.a[99], L.a[100], L.a[101], L.a[102], L.a[103], + L.a[104], L.a[105], L.a[106], L.a[107], L.a[108], L.a[109], L.a[110], L.a[111], + L.a[112], L.a[113], L.a[114], L.a[115], L.a[116], L.a[117], L.a[118], L.a[119], + L.a[120], L.a[121], 
L.a[122], L.a[123], L.a[124], L.a[125], L.a[126], L.a[127] + > (b); + uint64_t bm = make_bit_mask<64, 0x306> (indexs); + y = _mm512_mask_mov_epi8(ya, bm, yb); +#endif + } + if constexpr ((flags & blend_zeroing) != 0) { // additional zeroing needed + y = _mm512_maskz_mov_epi8(zero_mask<64>(indexs), y); + } + return y; +} + +template +static inline Vec64uc blend64(Vec64uc const a, Vec64uc const b) { + return Vec64uc(blend64 (Vec64c(a), Vec64c(b))); +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. +* The table is given as one or more vectors +* +*****************************************************************************/ + +// lookup in table of 64 int8_t values +static inline Vec64c lookup64(Vec64c const index, Vec64c const table) { +#ifdef __AVX512VBMI__ // AVX512VBMI instruction set not supported yet (April 2019) + return _mm512_permutexvar_epi8(index, table); +#else + // broadcast each 128-bit lane, because int8_t shuffle is only within 128-bit lanes + __m512i lane0 = _mm512_broadcast_i32x4(_mm512_castsi512_si128(table)); + __m512i lane1 = _mm512_shuffle_i64x2(table, table, 0x55); + __m512i lane2 = _mm512_shuffle_i64x2(table, table, 0xAA); + __m512i lane3 = _mm512_shuffle_i64x2(table, table, 0xFF); + Vec64c laneselect = index >> 4; // upper part of index selects lane + // select and permute from each lane + Vec64c dat0 = _mm512_maskz_shuffle_epi8( laneselect==0, lane0, index); + Vec64c dat1 = _mm512_mask_shuffle_epi8 (dat0, laneselect==1, lane1, index); + Vec64c dat2 = _mm512_maskz_shuffle_epi8( laneselect==2, lane2, index); + Vec64c dat3 = _mm512_mask_shuffle_epi8 (dat2, laneselect==3, lane3, index); + return dat1 | dat3; +#endif +} + +// lookup in table of 128 int8_t values +static inline Vec64c lookup128(Vec64c const index, Vec64c const table1, Vec64c const table2) { +#ifdef __AVX512VBMI__ // AVX512VBMI instruction set not supported yet (April 2019) + return _mm512_permutex2var_epi8(table1, index, table2); + +#else + // use 16-bits permute, which is included in AVX512BW + __m512i ieven2 = _mm512_srli_epi16 (index, 1); // even pos bytes of index / 2 (extra bits will be ignored) + __m512i e1 = _mm512_permutex2var_epi16(table1, ieven2, table2); // 16-bits results for even pos index + __mmask32 me1 = (Vec32s(index) & 1) != 0; // even pos indexes are odd value + __m512i e2 = _mm512_mask_srli_epi16(e1, me1, e1, 8); // combined results for even pos index. get upper 8 bits down if index was odd + __m512i iodd2 = _mm512_srli_epi16 (index, 9); // odd pos bytes of index / 2 + __m512i o1 = _mm512_permutex2var_epi16(table1, iodd2, table2); // 16-bits results for odd pos index + __mmask32 mo1 = (Vec32s(index) & 0x100) == 0; // odd pos indexes have even value + __m512i o2 = _mm512_mask_slli_epi16(o1, mo1, o1, 8); // combined results for odd pos index. get lower 8 bits up if index was even + __mmask64 maske = 0x5555555555555555; // mask for even position + return _mm512_mask_mov_epi8(o2, maske, e2); // interleave even and odd position result +#endif +} + +// lookup in table of 256 int8_t values. 
+// The complete table of all possible 256 byte values is contained in four vectors +// The index is treated as unsigned +static inline Vec64c lookup256(Vec64c const index, Vec64c const table1, Vec64c const table2, Vec64c const table3, Vec64c const table4) { +#ifdef __AVX512VBMI__ // AVX512VBMI instruction set not supported yet (April 2019) + Vec64c d12 = _mm512_permutex2var_epi8(table1, index, table2); + Vec64c d34 = _mm512_permutex2var_epi8(table3, index, table4); + return select(index < 0, d34, d12); // use sign bit to select +#else + // the AVX512BW version of lookup128 ignores upper bytes of index + // (the compiler will optimize away common subexpressions of the two lookup128) + Vec64c d12 = lookup128(index, table1, table2); + Vec64c d34 = lookup128(index, table3, table4); + return select(index < 0, d34, d12); +#endif +} + + +// lookup in table of 32 values +static inline Vec32s lookup32(Vec32s const index, Vec32s const table) { + return _mm512_permutexvar_epi16(index, table); +} + +// lookup in table of 64 values +static inline Vec32s lookup64(Vec32s const index, Vec32s const table1, Vec32s const table2) { + return _mm512_permutex2var_epi16(table1, index, table2); +} + +// lookup in table of 128 values +static inline Vec32s lookup128(Vec32s const index, Vec32s const table1, Vec32s const table2, Vec32s const table3, Vec32s const table4) { + Vec32s d12 = _mm512_permutex2var_epi16(table1, index, table2); + Vec32s d34 = _mm512_permutex2var_epi16(table3, index, table4); + return select((index >> 6) != 0, d34, d12); +} + + +/***************************************************************************** +* +* Byte shifts +* +*****************************************************************************/ + +// Function shift_bytes_up: shift whole vector left by b bytes. 
+template +static inline Vec64c shift_bytes_up(Vec64c const a) { + __m512i ahi, alo; + if constexpr (b == 0) return a; + else if constexpr ((b & 3) == 0) { // b is divisible by 4 + return _mm512_alignr_epi32(a, _mm512_setzero_si512(), (16 - (b >> 2)) & 15); + } + else if constexpr (b < 16) { + alo = a; + ahi = _mm512_maskz_shuffle_i64x2(0xFC, a, a, 0x90); // shift a 16 bytes up, zero lower part + } + else if constexpr (b < 32) { + alo = _mm512_maskz_shuffle_i64x2(0xFC, a, a, 0x90); // shift a 16 bytes up, zero lower part + ahi = _mm512_maskz_shuffle_i64x2(0xF0, a, a, 0x40); // shift a 32 bytes up, zero lower part + } + else if constexpr (b < 48) { + alo = _mm512_maskz_shuffle_i64x2(0xF0, a, a, 0x40); // shift a 32 bytes up, zero lower part + ahi = _mm512_maskz_shuffle_i64x2(0xC0, a, a, 0x00); // shift a 48 bytes up, zero lower part + } + else if constexpr (b < 64) { + alo = _mm512_maskz_shuffle_i64x2(0xC0, a, a, 0x00); // shift a 48 bytes up, zero lower part + ahi = _mm512_setzero_si512(); // zero + } + else { + return _mm512_setzero_si512(); // zero + } + return _mm512_alignr_epi8(alo, ahi, 16-(b & 0xF)); // shift within 16-bytes lane +} + +// Function shift_bytes_down: shift whole vector right by b bytes +template +static inline Vec64c shift_bytes_down(Vec64c const a) { + if constexpr ((b & 3) == 0) { // b is divisible by 4 + return _mm512_alignr_epi32(_mm512_setzero_si512(), a, ((b >> 2) & 15)); + } + __m512i ahi, alo; + if constexpr (b < 16) { + alo = _mm512_maskz_shuffle_i64x2(0x3F, a, a, 0x39); // shift a 16 bytes down, zero upper part + ahi = a; + } + else if constexpr (b < 32) { + alo = _mm512_maskz_shuffle_i64x2(0x0F, a, a, 0x0E); // shift a 32 bytes down, zero upper part + ahi = _mm512_maskz_shuffle_i64x2(0x3F, a, a, 0x39); // shift a 16 bytes down, zero upper part + } + else if constexpr (b < 48) { + alo = _mm512_maskz_shuffle_i64x2(0x03, a, a, 0x03); // shift a 48 bytes down, zero upper part + ahi = _mm512_maskz_shuffle_i64x2(0x0F, a, a, 0x0E); // shift a 32 bytes down, zero upper part + } + else if constexpr (b < 64) { + alo = _mm512_setzero_si512(); + ahi = _mm512_maskz_shuffle_i64x2(0x03, a, a, 0x03); // shift a 48 bytes down, zero upper part + } + else { + return _mm512_setzero_si512(); // zero + } + return _mm512_alignr_epi8(alo, ahi, b & 0xF); // shift within 16-bytes lane +} + + +/***************************************************************************** +* +* Functions for conversion between integer sizes +* +*****************************************************************************/ + +// Extend 8-bit integers to 16-bit integers, signed and unsigned + +// Function extend_low : extends the low 32 elements to 16 bits with sign extension +static inline Vec32s extend_low (Vec64c const a) { + __m512i a2 = permute8<0,V_DC,1,V_DC,2,V_DC,3,V_DC>(Vec8q(a)); // get low 64-bit blocks + Vec64cb sign = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(),a2);// 0 > a2 + __m512i ss = _mm512_maskz_set1_epi8(sign, -1); + return _mm512_unpacklo_epi8(a2, ss); // interleave with sign extensions +} + +// Function extend_high : extends the high 16 elements to 16 bits with sign extension +static inline Vec32s extend_high (Vec64c const a) { + __m512i a2 = permute8<4,V_DC,5,V_DC,6,V_DC,7,V_DC>(Vec8q(a)); // get low 64-bit blocks + Vec64cb sign = _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(),a2);// 0 > a2 + __m512i ss = _mm512_maskz_set1_epi8(sign, -1); + return _mm512_unpacklo_epi8(a2, ss); // interleave with sign extensions +} + +// Function extend_low : extends the low 16 elements to 16 bits 
with zero extension
+static inline Vec32us extend_low (Vec64uc const a) {
+ __m512i a2 = permute8<0,V_DC,1,V_DC,2,V_DC,3,V_DC>(Vec8q(a)); // get low 64-bit blocks
+ return _mm512_unpacklo_epi8(a2, _mm512_setzero_si512()); // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 32 elements to 16 bits with zero extension
+static inline Vec32us extend_high (Vec64uc const a) {
+ __m512i a2 = permute8<4,V_DC,5,V_DC,6,V_DC,7,V_DC>(Vec8q(a)); // get high 64-bit blocks
+ return _mm512_unpacklo_epi8(a2, _mm512_setzero_si512()); // interleave with zero extensions
+}
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 16 elements to 32 bits with sign extension
+static inline Vec16i extend_low (Vec32s const a) {
+ __m512i a2 = permute8<0,V_DC,1,V_DC,2,V_DC,3,V_DC>(Vec8q(a)); // get low 64-bit blocks
+ Vec32sb sign = _mm512_cmpgt_epi16_mask(_mm512_setzero_si512(),a2);// 0 > a2
+ __m512i ss = _mm512_maskz_set1_epi16(sign, -1);
+ return _mm512_unpacklo_epi16(a2, ss); // interleave with sign extensions
+}
+
+// Function extend_high : extends the high 16 elements to 32 bits with sign extension
+static inline Vec16i extend_high (Vec32s const a) {
+ __m512i a2 = permute8<4,V_DC,5,V_DC,6,V_DC,7,V_DC>(Vec8q(a)); // get high 64-bit blocks
+ Vec32sb sign = _mm512_cmpgt_epi16_mask(_mm512_setzero_si512(),a2);// 0 > a2
+ __m512i ss = _mm512_maskz_set1_epi16(sign, -1);
+ return _mm512_unpacklo_epi16(a2, ss); // interleave with sign extensions
+}
+
+// Function extend_low : extends the low 16 elements to 32 bits with zero extension
+static inline Vec16ui extend_low (Vec32us const a) {
+ __m512i a2 = permute8<0,V_DC,1,V_DC,2,V_DC,3,V_DC>(Vec8q(a)); // get low 64-bit blocks
+ return _mm512_unpacklo_epi16(a2, _mm512_setzero_si512()); // interleave with zero extensions
+}
+
+// Function extend_high : extends the high 16 elements to 32 bits with zero extension
+static inline Vec16ui extend_high (Vec32us const a) {
+ __m512i a2 = permute8<4,V_DC,5,V_DC,6,V_DC,7,V_DC>(Vec8q(a)); // get high 64-bit blocks
+ return _mm512_unpacklo_epi16(a2, _mm512_setzero_si512()); // interleave with zero extensions
+}
+
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Overflow wraps around
+static inline Vec64c compress (Vec32s const low, Vec32s const high) {
+ __mmask64 mask = 0x5555555555555555;
+ __m512i lowm = _mm512_maskz_mov_epi8 (mask, low); // bytes of low
+ __m512i highm = _mm512_maskz_mov_epi8 (mask, high); // bytes of high
+ __m512i pk = _mm512_packus_epi16(lowm, highm); // unsigned pack
+ __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>();
+ return _mm512_permutexvar_epi64(in, pk); // put in right place
+}
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, with saturation
+static inline Vec64c compress_saturated (Vec32s const low, Vec32s const high) {
+ __m512i pk = _mm512_packs_epi16(low,high); // packed with signed saturation
+ __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>();
+ return _mm512_permutexvar_epi64(in, pk); // put in right place
+}
+
+// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers
+// Unsigned, overflow wraps around
+static inline Vec64uc compress (Vec32us const low, Vec32us const high) {
+ return Vec64uc (compress((Vec32s)low, (Vec32s)high));
+}
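+
+// Usage sketch (illustrative, not part of this patch): average two vectors of
+// 64 unsigned bytes without intermediate overflow, by widening to 16 bits with
+// extend_low/extend_high, adding, halving, and narrowing again with compress.
+// The helper name average_bytes is hypothetical.
+// static inline Vec64uc average_bytes(Vec64uc const a, Vec64uc const b) {
+//     Vec32us lo = (extend_low(a)  + extend_low(b))  >> 1;   // low 32 elements
+//     Vec32us hi = (extend_high(a) + extend_high(b)) >> 1;   // high 32 elements
+//     return compress(lo, hi);                               // back to 64 x 8 bits
+// }
+
+// Function 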
compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Unsigned, with saturation +static inline Vec64uc compress_saturated (Vec32us const low, Vec32us const high) { + __m512i maxval = _mm512_set1_epi32(0x00FF00FF); // maximum value + __m512i low1 = _mm512_min_epu16(low,maxval); // upper limit + __m512i high1 = _mm512_min_epu16(high,maxval); // upper limit + __m512i pk = _mm512_packus_epi16(low1,high1); // this instruction saturates from signed 32 bit to unsigned 16 bit + __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>(); + return _mm512_permutexvar_epi64(in, pk); // put in right place +} + +// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers +// Signed to unsigned, with saturation +static inline Vec64uc compress_saturated_s2u (Vec32s const low, Vec32s const high) { + __m512i pk = _mm512_packus_epi16(low,high); // this instruction saturates from signed 16 bit to unsigned 8 bit + __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>(); + return _mm512_permutexvar_epi64(in, pk); // put in right place +} + +// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec32s compress (Vec16i const low, Vec16i const high) { + __mmask32 mask = 0x55555555; + __m512i lowm = _mm512_maskz_mov_epi16 (mask, low); // words of low + __m512i highm = _mm512_maskz_mov_epi16 (mask, high); // words of high + __m512i pk = _mm512_packus_epi32(lowm, highm); // unsigned pack + __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>(); + return _mm512_permutexvar_epi64(in, pk); // put in right place +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Signed with saturation +static inline Vec32s compress_saturated (Vec16i const low, Vec16i const high) { + __m512i pk = _mm512_packs_epi32(low,high); // pack with signed saturation + __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>(); + return _mm512_permutexvar_epi64(in, pk); // put in right place +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec32us compress (Vec16ui const low, Vec16ui const high) { + return Vec32us (compress((Vec16i)low, (Vec16i)high)); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Unsigned, with saturation +static inline Vec32us compress_saturated (Vec16ui const low, Vec16ui const high) { + __m512i maxval = _mm512_set1_epi32(0x0000FFFF); // maximum value + __m512i low1 = _mm512_min_epu32(low,maxval); // upper limit + __m512i high1 = _mm512_min_epu32(high,maxval); // upper limit + __m512i pk = _mm512_packus_epi32(low1,high1); // this instruction saturates from signed 32 bit to unsigned 16 bit + __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>(); + return _mm512_permutexvar_epi64(in, pk); // put in right place +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Signed to unsigned, with saturation +static inline Vec32us compress_saturated_s2u (Vec16i const low, Vec16i const high) { + __m512i pk = _mm512_packus_epi32(low,high); // this instruction saturates from signed 32 bit to unsigned 16 bit + __m512i in = constant16ui<0,0,2,0,4,0,6,0,1,0,3,0,5,0,7,0>(); + return _mm512_permutexvar_epi64(in, pk); // put in right 
place +} + + +/***************************************************************************** +* +* Integer division operators +* +* Please see the file vectori128.h for explanation. +* +*****************************************************************************/ + +// vector operator / : divide each element by divisor + +// vector of 32 16-bit signed integers +static inline Vec32s operator / (Vec32s const a, Divisor_s const d) { + __m512i m = _mm512_broadcastq_epi64(d.getm()); // broadcast multiplier + __m512i sgn = _mm512_broadcastq_epi64(d.getsign()); // broadcast sign of d + __m512i t1 = _mm512_mulhi_epi16(a, m); // multiply high signed words + __m512i t2 = _mm512_add_epi16(t1,a); // + a + __m512i t3 = _mm512_sra_epi16(t2,d.gets1()); // shift right arithmetic + __m512i t4 = _mm512_srai_epi16(a,15); // sign of a + __m512i t5 = _mm512_sub_epi16(t4,sgn); // sign of a - sign of d + __m512i t6 = _mm512_sub_epi16(t3,t5); // + 1 if a < 0, -1 if d < 0 + return _mm512_xor_si512(t6,sgn); // change sign if divisor negative +} + +// vector of 32 16-bit unsigned integers +static inline Vec32us operator / (Vec32us const a, Divisor_us const d) { + __m512i m = _mm512_broadcastq_epi64(d.getm()); // broadcast multiplier + __m512i t1 = _mm512_mulhi_epu16(a, m); // multiply high unsigned words + __m512i t2 = _mm512_sub_epi16(a,t1); // subtract + __m512i t3 = _mm512_srl_epi16(t2,d.gets1()); // shift right logical + __m512i t4 = _mm512_add_epi16(t1,t3); // add + return _mm512_srl_epi16(t4,d.gets2()); // shift right logical +} + +// vector of 64 8-bit signed integers +static inline Vec64c operator / (Vec64c const a, Divisor_s const d) { + // sign-extend even-numbered and odd-numbered elements to 16 bits + Vec32s even = _mm512_srai_epi16(_mm512_slli_epi16(a, 8),8); + Vec32s odd = _mm512_srai_epi16(a, 8); + Vec32s evend = even / d; // divide even-numbered elements + Vec32s oddd = odd / d; // divide odd-numbered elements + oddd = _mm512_slli_epi16(oddd, 8); // shift left to put back in place + __m512i res = _mm512_mask_mov_epi8(evend, 0xAAAAAAAAAAAAAAAA, oddd); // interleave even and odd + return res; +} + +// vector of 64 8-bit unsigned integers +static inline Vec64uc operator / (Vec64uc const a, Divisor_us const d) { + // zero-extend even-numbered and odd-numbered elements to 16 bits + Vec32us even = _mm512_maskz_mov_epi8(__mmask64(0x5555555555555555), a); + Vec32us odd = _mm512_srli_epi16(a, 8); + Vec32us evend = even / d; // divide even-numbered elements + Vec32us oddd = odd / d; // divide odd-numbered elements + oddd = _mm512_slli_epi16(oddd, 8); // shift left to put back in place + __m512i res = _mm512_mask_mov_epi8(evend, 0xAAAAAAAAAAAAAAAA, oddd); // interleave even and odd + return res; +} + +// vector operator /= : divide +static inline Vec32s & operator /= (Vec32s & a, Divisor_s const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec32us & operator /= (Vec32us & a, Divisor_us const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec64c & operator /= (Vec64c & a, Divisor_s const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec64uc & operator /= (Vec64uc & a, Divisor_us const d) { + a = a / d; + return a; +} + +/***************************************************************************** +* +* Integer division 2: divisor is a compile-time constant +* +*****************************************************************************/ + +// Divide Vec32s by compile-time constant +template +static 
inline Vec32s divide_by_i(Vec32s const x) { + constexpr int16_t d0 = int16_t(d); // truncate d to 16 bits + static_assert(d0 != 0, "Integer division by zero"); + if constexpr (d0 == 1) return x; // divide by 1 + if constexpr (d0 == -1) return -x; // divide by -1 + if constexpr (uint16_t(d0) == 0x8000u) { + return _mm512_maskz_set1_epi16(x == Vec32s((int16_t)0x8000u), 1); // avoid overflow of abs(d). return (x == 0x80000000) ? 1 : 0; + } + constexpr uint16_t d1 = d0 > 0 ? d0 : -d0; // compile-time abs(d0) + if constexpr ((d1 & (d1-1)) == 0) { + // d is a power of 2. use shift + constexpr int k = bit_scan_reverse_const(uint32_t(d1)); + __m512i sign; + if constexpr (k > 1) sign = _mm512_srai_epi16(x, k-1); else sign = x; // k copies of sign bit + __m512i bias = _mm512_srli_epi16(sign, 16-k); // bias = x >= 0 ? 0 : k-1 + __m512i xpbias = _mm512_add_epi16 (x, bias); // x + bias + __m512i q = _mm512_srai_epi16(xpbias, k); // (x + bias) >> k + if (d0 > 0) return q; // d0 > 0: return q + return _mm512_sub_epi16(_mm512_setzero_si512(), q); // d0 < 0: return -q + } + // general case + constexpr int L = bit_scan_reverse_const(uint16_t(d1-1)) + 1; // ceil(log2(d)). (d < 2 handled above) + constexpr int16_t mult = int16_t(1 + (1u << (15+L)) / uint32_t(d1) - 0x10000);// multiplier + constexpr int shift1 = L - 1; + const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1); + return x / div; +} + +// define Vec32s a / const_int(d) +template +static inline Vec32s operator / (Vec32s const a, Const_int_t) { + return divide_by_i(a); +} + +// define Vec32s a / const_uint(d) +template +static inline Vec32s operator / (Vec32s const a, Const_uint_t) { + static_assert(d < 0x8000u, "Dividing signed integer by overflowing unsigned"); + return divide_by_i(a); // signed divide +} + +// vector operator /= : divide +template +static inline Vec32s & operator /= (Vec32s & a, Const_int_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec32s & operator /= (Vec32s & a, Const_uint_t b) { + a = a / b; + return a; +} + +// Divide Vec32us by compile-time constant +template +static inline Vec32us divide_by_ui(Vec32us const x) { + constexpr uint16_t d0 = uint16_t(d); // truncate d to 16 bits + static_assert(d0 != 0, "Integer division by zero"); + if constexpr (d0 == 1) return x; // divide by 1 + constexpr int b = bit_scan_reverse_const(d0); // floor(log2(d)) + if constexpr ((d0 & (d0-1)) == 0) { + // d is a power of 2. use shift + return _mm512_srli_epi16(x, b); // x >> b + } + // general case (d > 2) + constexpr uint16_t mult = uint16_t((uint32_t(1) << (b+16)) / d0);// multiplier = 2^(32+b) / d + constexpr uint32_t rem = (uint32_t(1) << (b+16)) - uint32_t(d0)*mult;// remainder 2^(32+b) % d + constexpr bool round_down = (2*rem < d0); // check if fraction is less than 0.5 + Vec32us x1 = x; + if constexpr (round_down) { + x1 = x1 + 1; // round down mult and compensate by adding 1 to x + } + constexpr uint16_t mult1 = round_down ? 
mult : mult + 1; + const __m512i multv = _mm512_set1_epi16(mult1); // broadcast mult + __m512i xm = _mm512_mulhi_epu16(x1, multv); // high part of 16x16->32 bit unsigned multiplication + Vec32us q = _mm512_srli_epi16(xm, b); // shift right by b + if constexpr (round_down) { + Vec32sb overfl = (x1 == Vec32us(_mm512_setzero_si512())); // check for overflow of x+1 + return select(overfl, Vec32us(uint32_t(mult1 >> b)), q); // deal with overflow (rarely needed) + } + else { + return q; // no overflow possible + } +} + +// define Vec32us a / const_uint(d) +template +static inline Vec32us operator / (Vec32us const a, Const_uint_t) { + return divide_by_ui(a); +} + +// define Vec32us a / const_int(d) +template +static inline Vec32us operator / (Vec32us const a, Const_int_t) { + static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous"); + return divide_by_ui(a); // unsigned divide +} + +// vector operator /= : divide +template +static inline Vec32us & operator /= (Vec32us & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec32us & operator /= (Vec32us & a, Const_int_t b) { + a = a / b; + return a; +} + + +// define Vec64c a / const_int(d) +template +static inline Vec64c operator / (Vec64c const a, Const_int_t) { + // expand into two Vec32s + Vec32s low = extend_low(a) / Const_int_t(); + Vec32s high = extend_high(a) / Const_int_t(); + return compress(low,high); +} + +// define Vec64c a / const_uint(d) +template +static inline Vec64c operator / (Vec64c const a, Const_uint_t) { + static_assert(uint8_t(d) < 0x80u, "Dividing signed integer by overflowing unsigned"); + return a / Const_int_t(); // signed divide +} + +// vector operator /= : divide +template +static inline Vec64c & operator /= (Vec64c & a, Const_int_t b) { + a = a / b; + return a; +} +// vector operator /= : divide +template +static inline Vec64c & operator /= (Vec64c & a, Const_uint_t b) { + a = a / b; + return a; +} + +// define Vec64uc a / const_uint(d) +template +static inline Vec64uc operator / (Vec64uc const a, Const_uint_t) { + // expand into two Vec32us + Vec32us low = extend_low(a) / Const_uint_t(); + Vec32us high = extend_high(a) / Const_uint_t(); + return compress(low,high); +} + +// define Vec64uc a / const_int(d) +template +static inline Vec64uc operator / (Vec64uc const a, Const_int_t) { + static_assert(int8_t(d) >= 0, "Dividing unsigned integer by negative is ambiguous"); + return a / Const_uint_t(); // unsigned divide +} + +// vector operator /= : divide +template +static inline Vec64uc & operator /= (Vec64uc & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec64uc & operator /= (Vec64uc & a, Const_int_t b) { + a = a / b; + return a; +} + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORI512S_H diff --git a/DFTTest/VCL2/vectori512se.h b/DFTTest/VCL2/vectori512se.h new file mode 100644 index 0000000..83a38a5 --- /dev/null +++ b/DFTTest/VCL2/vectori512se.h @@ -0,0 +1,2076 @@ +/**************************** vectori512se.h ******************************* +* Author: Agner Fog +* Date created: 2019-04-20 +* Last modified: 2020-02-23 +* Version: 2.01.01 +* Project: vector class library +* Description: +* Header file defining 512-bit integer vector classes for 8 and 16 bit integers. +* Emulated for processors without AVX512BW instruction set. 
+* +* Instructions: see vcl_manual.pdf +* +* The following vector classes are defined here: +* Vec64c Vector of 64 8-bit signed integers +* Vec64uc Vector of 64 8-bit unsigned integers +* Vec64cb Vector of 64 booleans for use with Vec64c and Vec64uc +* Vec32s Vector of 32 16-bit signed integers +* Vec32us Vector of 32 16-bit unsigned integers +* Vec32sb Vector of 32 booleans for use with Vec32s and Vec32us +* Other 512-bit integer vectors are defined in Vectori512.h +* +* Each vector object is represented internally in the CPU as two 256-bit registers. +* This header file defines operators and functions for these vectors. +* +* (c) Copyright 2012-2020 Agner Fog. +* Apache License version 2.0 or later. +******************************************************************************/ + +#ifndef VECTORI512SE_H +#define VECTORI512SE_H + +#ifndef VECTORCLASS_H +#include "vectorclass.h" +#endif + +#if VECTORCLASS_H < 20100 +#error Incompatible versions of vector class library mixed +#endif + +// check combination of header files +#ifdef VECTORI512S_H +#error Two different versions of vectorf256.h included +#endif + + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + + +/***************************************************************************** +* +* Vector of 64 8-bit signed integers +* +*****************************************************************************/ + +class Vec64c { +protected: + Vec256b z0; // lower 256 bits + Vec256b z1; // higher 256 bits +public: + // Default constructor: + Vec64c() { + } + // Constructor to build from two Vec32c: + Vec64c(Vec32c const a0, Vec32c const a1) { + z0 = a0; + z1 = a1; + } + // Constructor to broadcast the same value into all elements: + Vec64c(int8_t i) { + z0 = z1 = Vec32c(i); + } + // Constructor to build from all elements: + Vec64c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, + int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, + int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23, + int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31, + int8_t i32, int8_t i33, int8_t i34, int8_t i35, int8_t i36, int8_t i37, int8_t i38, int8_t i39, + int8_t i40, int8_t i41, int8_t i42, int8_t i43, int8_t i44, int8_t i45, int8_t i46, int8_t i47, + int8_t i48, int8_t i49, int8_t i50, int8_t i51, int8_t i52, int8_t i53, int8_t i54, int8_t i55, + int8_t i56, int8_t i57, int8_t i58, int8_t i59, int8_t i60, int8_t i61, int8_t i62, int8_t i63) { + // _mm512_set_epi8 and _mm512_set_epi16 missing in GCC 7.4.0 + int8_t aa[64] = { + i0, i1, i2, i3, i4, i5, i6, i7,i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31, + i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47, + i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63 }; + load(aa); + } +#ifdef VECTORI512_H + // Constructor to convert from type __m512i used in intrinsics: + Vec64c(__m512i const x) { + z0 = Vec16i(x).get_low(); + z1 = Vec16i(x).get_high(); + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec64c & operator = (__m512i const x) { + return *this = Vec64c(x); + } + // Type cast operator to convert to __m512i used in intrinsics + operator __m512i() const { + return Vec16i(Vec8i(z0),Vec8i(z1)); + } +#else + // Assignment operator to convert from type __m512i used in intrinsics: + 
Vec64c & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } +#endif + // Constructor to convert from type Vec512b + Vec64c(Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Type cast operator to convert to Vec512b used in emulation + operator Vec512b() const { + return Vec512b(z0,z1); + } + // Member function to load from array (unaligned) + Vec64c & load(void const * p) { + Vec16i x = Vec16i().load(p); + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Member function to load from array, aligned by 64 + Vec64c & load_a(void const * p) { + Vec16i x = Vec16i().load_a(p); + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec64c & load_partial(int n, void const * p) { + Vec32c lo, hi; + if ((uint32_t)n < 32) { + lo = Vec32c().load_partial(n,p); + hi = Vec32c(0); + } + else { + lo = Vec32c().load(p); + hi = Vec32c().load_partial(n-32, ((int8_t*)p)+32); + } + *this = Vec64c(lo, hi); + return *this; + } + // store + void store(void * p) const { + Vec16i x = Vec16i(Vec8i(z0),Vec8i(z1)); + x.store(p); + } + // store aligned + void store_a(void * p) const { + Vec16i x = Vec16i(Vec8i(z0),Vec8i(z1)); + x.store_a(p); + } + // Member function storing to aligned uncached memory (non-temporal store). + // Note: Will generate runtime error if p is not aligned by 64 + void store_nt(void * p) const { + Vec16i x = Vec16i(Vec8i(z0),Vec8i(z1)); + x.store_nt(p); + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { + if ((uint32_t)n < 32) { + get_low().store_partial(n, p); + } + else { + get_low().store(p); + get_high().store_partial(n-32, ((int8_t*)p)+32); + } + } + // cut off vector to n elements. The last 64-n elements are set to zero + Vec64c & cutoff(int n) { + Vec32c lo, hi; + if ((uint32_t)n < 32) { + lo = Vec32c(get_low()).cutoff(n); + hi = Vec32c(0); + } + else { + lo = get_low(); + hi = Vec32c(get_high()).cutoff(n-32); + } + *this = Vec64c(lo, hi); + return *this; + } + // Member function to change a single element in vector + Vec64c const insert(int index, int8_t value) { + Vec32c lo, hi; + if ((uint32_t)index < 32) { + lo = Vec32c(get_low()).insert(index, value); + hi = get_high(); + } + else { + lo = get_low(); + hi = Vec32c(get_high()).insert(index-32, value); + } + *this = Vec64c(lo, hi); + return *this; + } + // Member function extract a single element from vector + int8_t extract(int index) const { + if ((uint32_t)index < 32) { + return Vec32c(get_low()).extract(index); + } + else { + return Vec32c(get_high()).extract(index-32); + } + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. 
+ int8_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec32c: + Vec32c get_low() const { + return z0; + } + Vec32c get_high() const { + return z1; + } + static constexpr int size() { + return 64; + } + static constexpr int elementtype() { + return 4; + } +}; + + +/***************************************************************************** +* +* Vec64cb: Vector of 64 Booleans for use with Vec64c and Vec64uc +* +*****************************************************************************/ + +class Vec64cb : public Vec64c { +public: + // Default constructor: + Vec64cb () { + } + Vec64cb (Vec64c const a) : Vec64c(a) {} + + // Constructor to build from all elements: Not implemented + + // Constructor to convert from type __mmask64 used in intrinsics: not possible + // Vec64cb (__mmask64 x); + + // Constructor to broadcast single value: + Vec64cb(bool b) { + z0 = z1 = Vec32c(-int8_t(b)); + } + // Constructor to make from two halves (big booleans) + Vec64cb (Vec32cb const x0, Vec32cb const x1) : Vec64c(x0,x1) {} + + // Assignment operator to convert from type __mmask64 used in intrinsics: not possible + //Vec64cb & operator = (__mmask64 x); + + // Member functions to split into two Vec32cb: + Vec32cb get_low() const { + return Vec32c(z0); + } + Vec32cb get_high() const { + return Vec32c(z1); + } + // Assignment operator to broadcast scalar value: + Vec64cb & operator = (bool b) { + *this = Vec64cb(b); + return *this; + } + // Member function to change a single element in vector + Vec64cb & insert (int index, bool a) { + if ((uint32_t)index < 32) { + z0 = get_low().insert(index, a); + } + else { + z1 = get_high().insert(index-32, a); + } + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + if (index < 32) { + return get_low().extract(index); + } + else { + return get_high().extract(index-32); + } + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Type cast operator to convert to __mmask64 used in intrinsics. not possible + //operator __mmask64() const; + + // Member function to change a bitfield to a boolean vector + Vec64cb & load_bits(uint64_t a) { + Vec32cb x0 = Vec32cb().load_bits(uint32_t(a)); + Vec32cb x1 = Vec32cb().load_bits(uint32_t(a>>32)); + *this = Vec64cb(x0,x1); + return *this; + } + static constexpr int size() { + return 64; + } + static constexpr int elementtype() { + return 3; + } + Vec64cb(int b) = delete; // Prevent constructing from int, etc. 
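A rough usage sketch of the emulated Vec64c/Vec64cb pair (illustrative only, not part of the patched header; src_a and src_b are hypothetical byte buffers, and the comparison operator, select(), to_bits() and load_bits() calls rely on the definitions that appear further down in this file):

    int8_t src_a[64], src_b[64];               // hypothetical input buffers
    Vec64c a, b;
    a.load(src_a);                             // unaligned load of 64 signed bytes
    b.load(src_b);
    Vec64cb m = a > b;                         // per-element comparison yields a boolean vector
    Vec64c  r = select(m, a, b);               // per-element maximum via select
    uint64_t bits = to_bits(m);                // pack the 64 booleans into a bitfield
    Vec64cb m2 = Vec64cb().load_bits(bits);    // expand a bitfield back into a mask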
+ Vec64cb & operator = (int x) = delete; // Prevent assigning int because of ambiguity +}; + + +/***************************************************************************** +* +* Define operators and functions for Vec64cb +* +*****************************************************************************/ + +// vector operator & : bitwise and +static inline Vec64cb operator & (Vec64cb const a, Vec64cb const b) { + return Vec64cb(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} +static inline Vec64cb operator && (Vec64cb const a, Vec64cb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec64cb & operator &= (Vec64cb & a, Vec64cb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec64cb operator | (Vec64cb const a, Vec64cb const b) { + return Vec64cb(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} +static inline Vec64cb operator || (Vec64cb const a, Vec64cb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec64cb & operator |= (Vec64cb & a, Vec64cb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec64cb operator ^ (Vec64cb const a, Vec64cb const b) { + return Vec64cb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} +// vector operator ^= : bitwise xor +static inline Vec64cb & operator ^= (Vec64cb & a, Vec64cb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec64cb operator == (Vec64cb const a, Vec64cb const b) { + return Vec64cb(a.get_low() == b.get_low(), a.get_high() == b.get_high()); +} + +// vector operator != : xor +static inline Vec64cb operator != (Vec64cb const a, Vec64cb const b) { + return a ^ b; +} + +// vector operator ~ : bitwise not +static inline Vec64cb operator ~ (Vec64cb const a) { + return Vec64cb(~a.get_low(), ~a.get_high());} + +// vector operator ! : element not +static inline Vec64cb operator ! (Vec64cb const a) { + return ~a; +} + +// vector function andnot +static inline Vec64cb andnot (Vec64cb const a, Vec64cb const b) { + return Vec64cb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and (Vec64cb const a) { + return horizontal_and(a.get_low()) && horizontal_and(a.get_high()); +} + +// horizontal_or. 
Returns true if at least one bit is 1 +static inline bool horizontal_or (Vec64cb const a) { + return horizontal_or(a.get_low()) || horizontal_or(a.get_high()); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint64_t to_bits(Vec64cb x) { + return (uint64_t(to_bits(x.get_high())) << 32) | to_bits(x.get_low()); +} + + +/***************************************************************************** +* +* Define operators for Vec64c +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec64c operator + (Vec64c const a, Vec64c const b) { + return Vec64c(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator += : add +static inline Vec64c & operator += (Vec64c & a, Vec64c const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec64c operator ++ (Vec64c & a, int) { + Vec64c a0 = a; + a = a + 1; + return a0; +} + +// prefix operator ++ +static inline Vec64c & operator ++ (Vec64c & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec64c operator - (Vec64c const a, Vec64c const b) { + return Vec64c(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator - : unary minus +static inline Vec64c operator - (Vec64c const a) { + return Vec64c(-a.get_low(), -a.get_high()); +} + +// vector operator -= : subtract +static inline Vec64c & operator -= (Vec64c & a, Vec64c const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec64c operator -- (Vec64c & a, int) { + Vec64c a0 = a; + a = a - 1; + return a0; +} + +// prefix operator -- +static inline Vec64c & operator -- (Vec64c & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec64c operator * (Vec64c const a, Vec64c const b) { + return Vec64c(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator *= : multiply +static inline Vec64c & operator *= (Vec64c & a, Vec64c const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +// See bottom of file + +// vector operator << : shift left +static inline Vec64c operator << (Vec64c const a, int32_t b) { + return Vec64c(a.get_low() << b, a.get_high() << b); +} + +// vector operator <<= : shift left +static inline Vec64c & operator <<= (Vec64c & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec64c operator >> (Vec64c const a, int32_t b) { + return Vec64c(a.get_low() >> b, a.get_high() >> b); +} + +// vector operator >>= : shift right arithmetic +static inline Vec64c & operator >>= (Vec64c & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec64cb operator == (Vec64c const a, Vec64c const b) { + return Vec64cb(a.get_low() == b.get_low(), a.get_high() == b.get_high()); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec64cb operator != (Vec64c const a, Vec64c const b) { + return Vec64cb(a.get_low() != b.get_low(), a.get_high() != b.get_high()); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec64cb operator > (Vec64c const a, Vec64c const b) { + return Vec64cb(a.get_low() > b.get_low(), a.get_high() > b.get_high()); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec64cb 
operator < (Vec64c const a, Vec64c const b) { + return b > a; +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec64cb operator >= (Vec64c const a, Vec64c const b) { + return Vec64cb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec64cb operator <= (Vec64c const a, Vec64c const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec64c operator & (Vec64c const a, Vec64c const b) { + return Vec64c(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} + +// vector operator &= : bitwise and +static inline Vec64c & operator &= (Vec64c & a, Vec64c const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec64c operator | (Vec64c const a, Vec64c const b) { + return Vec64c(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} + +// vector operator |= : bitwise or +static inline Vec64c & operator |= (Vec64c & a, Vec64c const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec64c operator ^ (Vec64c const a, Vec64c const b) { + return Vec64c(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator ^= : bitwise xor +static inline Vec64c & operator ^= (Vec64c & a, Vec64c const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec64c operator ~ (Vec64c const a) { + return Vec64c(~a.get_low(), ~a.get_high()); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec64c select (Vec64cb const s, Vec64c const a, Vec64c const b) { + return Vec64c(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec64c if_add (Vec64cb const f, Vec64c const a, Vec64c const b) { + return Vec64c(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec64c if_sub (Vec64cb const f, Vec64c const a, Vec64c const b) { + return Vec64c(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec64c if_mul (Vec64cb const f, Vec64c const a, Vec64c const b) { + return Vec64c(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int8_t horizontal_add (Vec64c const a) { + return (int8_t)horizontal_add(a.get_low() + a.get_high()); +} + +// Horizontal add extended: Calculates the sum of all vector elements. 
+// Each element is sign-extended before addition to avoid overflow +static inline int32_t horizontal_add_x (Vec64c const a) { + return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec64c add_saturated(Vec64c const a, Vec64c const b) { + return Vec64c(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high())); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec64c sub_saturated(Vec64c const a, Vec64c const b) { + return Vec64c(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high())); +} + +// function max: a > b ? a : b +static inline Vec64c max(Vec64c const a, Vec64c const b) { + return Vec64c(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? a : b +static inline Vec64c min(Vec64c const a, Vec64c const b) { + return Vec64c(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} + +// function abs: a >= 0 ? a : -a +static inline Vec64c abs(Vec64c const a) { + return Vec64c(abs(a.get_low()), abs(a.get_high())); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec64c abs_saturated(Vec64c const a) { + return Vec64c(abs_saturated(a.get_low()), abs_saturated(a.get_high())); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec64c rotate_left(Vec64c const a, int b) { + return Vec64c(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); +} + + +/***************************************************************************** +* +* Vector of 64 8-bit unsigned integers +* +*****************************************************************************/ + +class Vec64uc : public Vec64c { +public: + // Default constructor: + Vec64uc() { + } + // Construct from Vec64c + Vec64uc(Vec64c const a) : Vec64c(a) { + } + // Constructor to broadcast the same value into all elements: + Vec64uc(uint8_t i) : Vec64c(int8_t(i)) { + } + // Constructor to build from two Vec32uc: + Vec64uc(Vec32uc const a0, Vec32uc const a1) : Vec64c(a0,a1) { + } + // Constructor to build from all elements: + Vec64uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, + uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, + uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23, + uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31, + uint8_t i32, uint8_t i33, uint8_t i34, uint8_t i35, uint8_t i36, uint8_t i37, uint8_t i38, uint8_t i39, + uint8_t i40, uint8_t i41, uint8_t i42, uint8_t i43, uint8_t i44, uint8_t i45, uint8_t i46, uint8_t i47, + uint8_t i48, uint8_t i49, uint8_t i50, uint8_t i51, uint8_t i52, uint8_t i53, uint8_t i54, uint8_t i55, + uint8_t i56, uint8_t i57, uint8_t i58, uint8_t i59, uint8_t i60, uint8_t i61, uint8_t i62, uint8_t i63) { + // _mm512_set_epi8 and _mm512_set_epi16 missing in GCC 7.4.0 + uint8_t aa[64] = { + i0, i1, i2, i3, i4, i5, i6, i7,i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31, + i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47, + i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63 }; + load(aa); + } + +#ifdef VECTORI512_H + // 
Constructor to convert from type __m512i used in intrinsics: + Vec64uc(__m512i const x) : Vec64c(x) {}; + + // Assignment operator to convert from type __m512i used in intrinsics: + Vec64uc & operator = (__m512i const x) { + return *this = Vec64uc(x); + } +#else + // Constructor to convert from type Vec512b + Vec64uc(Vec512b const x) : Vec64c(x) {} + + // Assignment operator to convert from type __m512i used in intrinsics: + Vec64uc & operator = (Vec512b const x) { + return *this = Vec64uc(x); + } +#endif + // Member function to load from array (unaligned) + Vec64uc & load(void const * p) { + Vec64c::load(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec64uc & load_a(void const * p) { + Vec64c::load_a(p); + return *this; + } + // Member function to change a single element in vector + // Note: This function is inefficient. Use load function if changing more than one element + Vec64uc const insert(int index, uint8_t value) { + Vec64c::insert(index, (int8_t)value); + return *this; + } + // Member function extract a single element from vector + uint8_t extract(int index) const { + return (uint8_t)Vec64c::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint8_t operator [] (int index) const { + return (uint8_t)Vec64c::extract(index); + } + // Member functions to split into two Vec32uc: + Vec32uc get_low() const { + return Vec32uc(Vec64c::get_low()); + } + Vec32uc get_high() const { + return Vec32uc(Vec64c::get_high()); + } + static constexpr int elementtype() { + return 5; + } +}; + +// Define operators for this class + +// vector operator + : add element by element +static inline Vec64uc operator + (Vec64uc const a, Vec64uc const b) { + return Vec64uc(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator - : subtract element by element +static inline Vec64uc operator - (Vec64uc const a, Vec64uc const b) { + return Vec64uc(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator ' : multiply element by element +static inline Vec64uc operator * (Vec64uc const a, Vec64uc const b) { + return Vec64uc(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator / : divide +// See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec64uc operator >> (Vec64uc const a, uint32_t b) { + return Vec64uc(a.get_low() >> b, a.get_high() >> b); +} +static inline Vec64uc operator >> (Vec64uc const a, int b) { + return a >> uint32_t(b); +} + +// vector operator >>= : shift right logical +static inline Vec64uc & operator >>= (Vec64uc & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >>= : shift right logical (signed b) +static inline Vec64uc & operator >>= (Vec64uc & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec64uc operator << (Vec64uc const a, uint32_t b) { + return Vec64uc(a.get_low() << b, a.get_high() << b); +} +static inline Vec64uc operator << (Vec64uc const a, int b) { + return a << uint32_t(b); +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec64cb operator < (Vec64uc const a, Vec64uc const b) { + return Vec64cb(a.get_low() < b.get_low(), a.get_high() < b.get_high()); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec64cb operator > (Vec64uc const a, 
Vec64uc const b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec64cb operator >= (Vec64uc const a, Vec64uc const b) { + return Vec64cb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec64cb operator <= (Vec64uc const a, Vec64uc const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec64uc operator & (Vec64uc const a, Vec64uc const b) { + return Vec64uc(Vec64c(a) & Vec64c(b)); +} + +// vector operator | : bitwise or +static inline Vec64uc operator | (Vec64uc const a, Vec64uc const b) { + return Vec64uc(Vec64c(a) | Vec64c(b)); +} + +// vector operator ^ : bitwise xor +static inline Vec64uc operator ^ (Vec64uc const a, Vec64uc const b) { + return Vec64uc(Vec64c(a) ^ Vec64c(b)); +} + +// vector operator ~ : bitwise not +static inline Vec64uc operator ~ (Vec64uc const a) { + return Vec64uc( ~ Vec64c(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec64uc select (Vec64cb const s, Vec64uc const a, Vec64uc const b) { + return Vec64uc(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec64uc if_add (Vec64cb const f, Vec64uc const a, Vec64uc const b) { + return Vec64uc(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec64uc if_sub (Vec64cb const f, Vec64uc const a, Vec64uc const b) { + return Vec64uc(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec64uc if_mul (Vec64cb const f, Vec64uc const a, Vec64uc const b) { + return Vec64uc(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec64uc add_saturated(Vec64uc const a, Vec64uc const b) { + return Vec64uc(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high())); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec64uc sub_saturated(Vec64uc const a, Vec64uc const b) { + return Vec64uc(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high())); +} + +// function max: a > b ? a : b +static inline Vec64uc max(Vec64uc const a, Vec64uc const b) { + return Vec64uc(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? 
a : b +static inline Vec64uc min(Vec64uc const a, Vec64uc const b) { + return Vec64uc(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} + + +/***************************************************************************** +* +* Vector of 32 16-bit signed integers +* +*****************************************************************************/ + +class Vec32s : public Vec64c { +public: + // Default constructor: + Vec32s() { + } + // Constructor to broadcast the same value into all elements: + Vec32s(int16_t i) { + z0 = z1 = Vec16s(i); + } + // Constructor to build from all elements: + Vec32s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7, + int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15, + int16_t i16, int16_t i17, int16_t i18, int16_t i19, int16_t i20, int16_t i21, int16_t i22, int16_t i23, + int16_t i24, int16_t i25, int16_t i26, int16_t i27, int16_t i28, int16_t i29, int16_t i30, int16_t i31) { + Vec16s x0 = Vec16s(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); + Vec16s x1 = Vec16s(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31); + *this = Vec32s(x0,x1); + } + // Constructor to build from two Vec16s: + Vec32s(Vec16s const a0, Vec16s const a1) { + z0 = a0; z1 = a1; + } +#ifdef VECTORI512_H + // Constructor to convert from type __m512i used in intrinsics: + Vec32s(__m512i const x) { + Vec16i zz(x); + z0 = zz.get_low(); + z1 = zz.get_high(); + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec32s & operator = (__m512i const x) { + Vec16i zz(x); + z0 = zz.get_low(); + z1 = zz.get_high(); + return *this; + } +#else + // Constructor to convert from type Vec512b + Vec32s(Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + } + // Assignment operator to convert from type Vec512b + Vec32s & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } +#endif + // Member function to load from array (unaligned) + Vec32s & load(void const * p) { + z0 = Vec16s().load(p); + z1 = Vec16s().load((int16_t*)p + 16); + return *this; + } + // Member function to load from array, aligned by 64 + Vec32s & load_a(void const * p) { + z0 = Vec16s().load_a(p); + z1 = Vec16s().load_a((int16_t*)p + 16); + return *this; + } + // Partial load. Load n elements and set the rest to 0 + Vec32s & load_partial(int n, void const * p) { + if (uint32_t(n) < 16) { + z0 = Vec16s().load_partial(n, p); + z1 = Vec16s(0); + } + else { + z0 = Vec16s().load(p); + z1 = Vec16s().load_partial(n-16, (int16_t*)p + 16); + } + return *this; + } + // store + void store(void * p) const { + Vec16s(z0).store(p); + Vec16s(z1).store((int16_t*)p + 16); + } + // store aligned + void store_a(void * p) const { + Vec16s(z0).store_a(p); + Vec16s(z1).store_a((int16_t*)p + 16); + } + // Partial store. Store n elements + void store_partial(int n, void * p) const { + if (uint32_t(n) < 16) { + Vec16s(z0).store_partial(n, p); + } + else { + Vec16s(z0).store(p); + Vec16s(z1).store_partial(n-16, (int16_t*)p + 16); + } + } + // cut off vector to n elements. 
The last 32-n elements are set to zero + Vec32s & cutoff(int n) { + if (uint32_t(n) < 16) { + z0 = Vec16s(z0).cutoff(n); + z1 = Vec16s(0); + } + else { + z1 = Vec16s(z1).cutoff(n-16); + } + return *this; + } + // Member function to change a single element in vector + Vec32s const insert(int index, int16_t value) { + if ((uint32_t)index < 16) { + z0 = Vec16s(z0).insert(index, value); + } + else { + z1 = Vec16s(z1).insert(index-16, value); + } + return *this; + } + // Member function extract a single element from vector + int16_t extract(int index) const { + if (index < 16) { + return Vec16s(z0).extract(index); + } + else { + return Vec16s(z1).extract(index-16); + } + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + int16_t operator [] (int index) const { + return extract(index); + } + // Member functions to split into two Vec16s: + Vec16s get_low() const { + return z0; + } + Vec16s get_high() const { + return z1; + } + static constexpr int size() { + return 32; + } + static constexpr int elementtype() { + return 6; + } +}; + + +/***************************************************************************** +* +* Vec32sb: Vector of 64 Booleans for use with Vec32s and Vec32us +* +*****************************************************************************/ + +class Vec32sb : public Vec32s { +public: + // Default constructor: + Vec32sb () { + } + // Constructor to build from all elements: Not implemented + + // Constructor to convert from type __mmask32 used in intrinsics: not possible + + // Constructor to broadcast single value: + Vec32sb(bool b) { + z0 = z1 = Vec16s(-int16_t(b)); + } + // Constructor to make from two halves + Vec32sb (Vec16sb const x0, Vec16sb const x1) { + z0 = x0; z1 = x1; + } + // Assignment operator to convert from type __mmask32 used in intrinsics: not possible + + // Assignment operator to broadcast scalar value: + Vec32sb & operator = (bool b) { + *this = Vec32sb(b); + return *this; + } + // Member functions to split into two Vec16sb: + Vec16sb get_low() const { + return z0; + } + Vec16sb get_high() const { + return z1; + } + // Member function to change a single element in vector + Vec32sb & insert(int index, bool a) { + Vec32s::insert(index, -(int16_t)a); + return *this; + } + // Member function extract a single element from vector + bool extract(int index) const { + return Vec32s::extract(index) != 0; + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + bool operator [] (int index) const { + return extract(index); + } + // Type cast operator to convert to __mmask64 used in intrinsics. Not possible + + // Member function to change a bitfield to a boolean vector + Vec32sb & load_bits(uint32_t a) { + z0 = Vec16sb().load_bits(uint16_t(a)); + z1 = Vec16sb().load_bits(uint16_t(a>>16)); + return *this; + } + static constexpr int elementtype() { + return 3; + } + Vec32sb(int b) = delete; // Prevent constructing from int, etc. 
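A similar sketch for the 16-bit types (illustrative only; pix and ref are hypothetical arrays, and the comparison operator, if_add(), add_saturated() and horizontal_add_x() used here are defined later in this file):

    int16_t pix[32], ref[32];                  // hypothetical input arrays
    Vec32s p, q;
    p.load(pix);
    q.load(ref);
    Vec32sb big = p > Vec32s(100);             // mask of elements above a threshold
    p = if_add(big, p, q);                     // add q only where the mask is true
    p = add_saturated(p, Vec32s(1000));        // saturating add avoids 16-bit wrap-around
    int32_t sum = horizontal_add_x(p);         // sum of all 32 elements, widened to 32 bits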
+ Vec32sb & operator = (int x) = delete; // Prevent assigning int because of ambiguity +}; + + +/***************************************************************************** +* +* Define operators and functions for Vec32sb +* +*****************************************************************************/ + +// vector operator & : bitwise and +static inline Vec32sb operator & (Vec32sb const a, Vec32sb const b) { + return Vec32sb(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} +static inline Vec32sb operator && (Vec32sb const a, Vec32sb const b) { + return a & b; +} +// vector operator &= : bitwise and +static inline Vec32sb & operator &= (Vec32sb & a, Vec32sb const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec32sb operator | (Vec32sb const a, Vec32sb const b) { + return Vec32sb(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} +static inline Vec32sb operator || (Vec32sb const a, Vec32sb const b) { + return a | b; +} +// vector operator |= : bitwise or +static inline Vec32sb & operator |= (Vec32sb & a, Vec32sb const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec32sb operator ^ (Vec32sb const a, Vec32sb const b) { + return Vec32sb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} +// vector operator ^= : bitwise xor +static inline Vec32sb & operator ^= (Vec32sb & a, Vec32sb const b) { + a = a ^ b; + return a; +} + +// vector operator == : xnor +static inline Vec32sb operator == (Vec32sb const a, Vec32sb const b) { + return Vec32sb(a.get_low() == b.get_low(), a.get_high() == b.get_high());} + +// vector operator != : xor +static inline Vec32sb operator != (Vec32sb const a, Vec32sb const b) { + return Vec32sb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());} + +// vector operator ~ : bitwise not +static inline Vec32sb operator ~ (Vec32sb const a) { + return Vec32sb(~a.get_low(), ~a.get_high());} + +// vector operator ! : element not +static inline Vec32sb operator ! (Vec32sb const a) { + return ~a; +} + +// vector function andnot +static inline Vec32sb andnot (Vec32sb const a, Vec32sb const b) { + return Vec32sb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));} + +// horizontal_and. Returns true if all bits are 1 +static inline bool horizontal_and (Vec32sb const a) { + return horizontal_and(a.get_low()) && horizontal_and(a.get_high()); +} + +// horizontal_or. 
Returns true if at least one bit is 1 +static inline bool horizontal_or (Vec32sb const a) { + return horizontal_or(a.get_low()) || horizontal_or(a.get_high()); +} + +// to_bits: convert boolean vector to integer bitfield +static inline uint32_t to_bits(Vec32sb a) { + return uint32_t(to_bits(a.get_high())) << 16 | to_bits(a.get_low()); +} + + +/***************************************************************************** +* +* Define operators for Vec32s +* +*****************************************************************************/ + +// vector operator + : add element by element +static inline Vec32s operator + (Vec32s const a, Vec32s const b) { + return Vec32s(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator += : add +static inline Vec32s & operator += (Vec32s & a, Vec32s const b) { + a = a + b; + return a; +} + +// postfix operator ++ +static inline Vec32s operator ++ (Vec32s & a, int) { + Vec32s a0 = a; + a = a + 1; + return a0; +} + +// prefix operator ++ +static inline Vec32s & operator ++ (Vec32s & a) { + a = a + 1; + return a; +} + +// vector operator - : subtract element by element +static inline Vec32s operator - (Vec32s const a, Vec32s const b) { + return Vec32s(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator - : unary minus +static inline Vec32s operator - (Vec32s const a) { + return Vec32s(-a.get_low(), -a.get_high()); +} + +// vector operator -= : subtract +static inline Vec32s & operator -= (Vec32s & a, Vec32s const b) { + a = a - b; + return a; +} + +// postfix operator -- +static inline Vec32s operator -- (Vec32s & a, int) { + Vec32s a0 = a; + a = a - 1; + return a0; +} + +// prefix operator -- +static inline Vec32s & operator -- (Vec32s & a) { + a = a - 1; + return a; +} + +// vector operator * : multiply element by element +static inline Vec32s operator * (Vec32s const a, Vec32s const b) { + return Vec32s(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator *= : multiply +static inline Vec32s & operator *= (Vec32s & a, Vec32s const b) { + a = a * b; + return a; +} + +// vector operator / : divide all elements by same integer +// See bottom of file + +// vector operator << : shift left +static inline Vec32s operator << (Vec32s const a, int32_t b) { + return Vec32s(a.get_low() << b, a.get_high() << b); +} + +// vector operator <<= : shift left +static inline Vec32s & operator <<= (Vec32s & a, int32_t b) { + a = a << b; + return a; +} + +// vector operator >> : shift right arithmetic +static inline Vec32s operator >> (Vec32s const a, int32_t b) { + return Vec32s(a.get_low() >> b, a.get_high() >> b); +} + +// vector operator >>= : shift right arithmetic +static inline Vec32s & operator >>= (Vec32s & a, int32_t b) { + a = a >> b; + return a; +} + +// vector operator == : returns true for elements for which a == b +static inline Vec32sb operator == (Vec32s const a, Vec32s const b) { + return Vec32sb(a.get_low() == b.get_low(), a.get_high() == b.get_high()); +} + +// vector operator != : returns true for elements for which a != b +static inline Vec32sb operator != (Vec32s const a, Vec32s const b) { + return Vec32sb(a.get_low() != b.get_low(), a.get_high() != b.get_high()); +} + +// vector operator > : returns true for elements for which a > b +static inline Vec32sb operator > (Vec32s const a, Vec32s const b) { + return Vec32sb(a.get_low() > b.get_low(), a.get_high() > b.get_high()); +} + +// vector operator < : returns true for elements for which a < b +static inline Vec32sb 
operator < (Vec32s const a, Vec32s const b) { + return b > a; +} + +// vector operator >= : returns true for elements for which a >= b (signed) +static inline Vec32sb operator >= (Vec32s const a, Vec32s const b) { + return Vec32sb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b (signed) +static inline Vec32sb operator <= (Vec32s const a, Vec32s const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec32s operator & (Vec32s const a, Vec32s const b) { + return Vec32s(a.get_low() & b.get_low(), a.get_high() & b.get_high()); +} + +// vector operator &= : bitwise and +static inline Vec32s & operator &= (Vec32s & a, Vec32s const b) { + a = a & b; + return a; +} + +// vector operator | : bitwise or +static inline Vec32s operator | (Vec32s const a, Vec32s const b) { + return Vec32s(a.get_low() | b.get_low(), a.get_high() | b.get_high()); +} + +// vector operator |= : bitwise or +static inline Vec32s & operator |= (Vec32s & a, Vec32s const b) { + a = a | b; + return a; +} + +// vector operator ^ : bitwise xor +static inline Vec32s operator ^ (Vec32s const a, Vec32s const b) { + return Vec32s(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high()); +} + +// vector operator ^= : bitwise xor +static inline Vec32s & operator ^= (Vec32s & a, Vec32s const b) { + a = a ^ b; + return a; +} + +// vector operator ~ : bitwise not +static inline Vec32s operator ~ (Vec32s const a) { + return Vec32s(~a.get_low(), ~a.get_high()); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec32s select (Vec32sb const s, Vec32s const a, Vec32s const b) { + return Vec32s(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec32s if_add (Vec32sb const f, Vec32s const a, Vec32s const b) { + return Vec32s(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec32s if_sub (Vec32sb const f, Vec32s const a, Vec32s const b) { + return Vec32s(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec32s if_mul (Vec32sb const f, Vec32s const a, Vec32s const b) { + return Vec32s(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// Horizontal add: Calculates the sum of all vector elements. Overflow will wrap around +static inline int16_t horizontal_add (Vec32s const a) { + Vec16s s = a.get_low() + a.get_high(); + return (int16_t)horizontal_add(s); +} + +// Horizontal add extended: Calculates the sum of all vector elements. 
+// Each element is sign-extended before addition to avoid overflow +static inline int32_t horizontal_add_x (Vec32s const a) { + return horizontal_add_x(a.get_low()) + horizontal_add_x(a.get_high()); +} + +// function add_saturated: add element by element, signed with saturation +static inline Vec32s add_saturated(Vec32s const a, Vec32s const b) { + return Vec32s(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high())); +} + +// function sub_saturated: subtract element by element, signed with saturation +static inline Vec32s sub_saturated(Vec32s const a, Vec32s const b) { + return Vec32s(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high())); +} + +// function max: a > b ? a : b +static inline Vec32s max(Vec32s const a, Vec32s const b) { + return Vec32s(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? a : b +static inline Vec32s min(Vec32s const a, Vec32s const b) { + return Vec32s(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} + +// function abs: a >= 0 ? a : -a +static inline Vec32s abs(Vec32s const a) { + return Vec32s(abs(a.get_low()), abs(a.get_high())); +} + +// function abs_saturated: same as abs, saturate if overflow +static inline Vec32s abs_saturated(Vec32s const a) { + return Vec32s(abs_saturated(a.get_low()), abs_saturated(a.get_high())); +} + +// function rotate_left all elements +// Use negative count to rotate right +static inline Vec32s rotate_left(Vec32s const a, int b) { + return Vec32s(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b)); +} + + +/***************************************************************************** +* +* Vector of 32 16-bit unsigned integers +* +*****************************************************************************/ + +class Vec32us : public Vec32s { +public: + // Default constructor: + Vec32us() { + } + // Construct from Vec32s + Vec32us(Vec32s const a) { + z0 = a.get_low(); z1 = a.get_high(); + } + // Constructor to broadcast the same value into all elements: + Vec32us(uint16_t i) { + z0 = z1 = Vec16us(i); + } + // Constructor to build from all elements: + Vec32us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7, + uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15, + uint16_t i16, uint16_t i17, uint16_t i18, uint16_t i19, uint16_t i20, uint16_t i21, uint16_t i22, uint16_t i23, + uint16_t i24, uint16_t i25, uint16_t i26, uint16_t i27, uint16_t i28, uint16_t i29, uint16_t i30, uint16_t i31) { + Vec16us x0 = Vec16us(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); + Vec16us x1 = Vec16us(i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31); + *this = Vec32us(x0,x1); + } + // Constructor to build from two Vec16us: + Vec32us(Vec16us const a0, Vec16us const a1) { + z0 = a0; z1 = a1; + } +#ifdef VECTORI512_H + // Constructor to convert from type __m512i used in intrinsics: + Vec32us(__m512i const x) : Vec32s(x) { + } + // Assignment operator to convert from type __m512i used in intrinsics: + Vec32us & operator = (__m512i const x) { + return *this = Vec32us(x); + } +#else + // Constructor to convert from type Vec512b + Vec32us(Vec512b const x) : Vec32s(x) {} + // Assignment operator to convert from type Vec512b + Vec32us & operator = (Vec512b const x) { + z0 = x.get_low(); + z1 = x.get_high(); + return *this; + } +#endif + // Member function to load from 
array (unaligned) + Vec32us & load(void const * p) { + Vec32s::load(p); + return *this; + } + // Member function to load from array, aligned by 64 + Vec32us & load_a(void const * p) { + Vec32s::load_a(p); + return *this; + } + // Member function to change a single element in vector + Vec32us const insert(int index, uint16_t value) { + Vec32s::insert(index, (int16_t)value); + return *this; + } + // Member function extract a single element from vector + uint16_t extract(int index) const { + return (uint16_t)Vec32s::extract(index); + } + // Extract a single element. Use store function if extracting more than one element. + // Operator [] can only read an element, not write. + uint16_t operator [] (int index) const { + return (uint16_t)Vec32s::extract(index); + } + // Member functions to split into two Vec16us: + Vec16us get_low() const { + return Vec16us(Vec32s::get_low()); + } + Vec16us get_high() const { + return Vec16us(Vec32s::get_high()); + } + static constexpr int elementtype() { + return 7; + } +}; + +// Define operators for this class + +// vector operator + : add element by element +static inline Vec32us operator + (Vec32us const a, Vec32us const b) { + return Vec32us(a.get_low() + b.get_low(), a.get_high() + b.get_high()); +} + +// vector operator - : subtract element by element +static inline Vec32us operator - (Vec32us const a, Vec32us const b) { + return Vec32us(a.get_low() - b.get_low(), a.get_high() - b.get_high()); +} + +// vector operator * : multiply element by element +static inline Vec32us operator * (Vec32us const a, Vec32us const b) { + return Vec32us(a.get_low() * b.get_low(), a.get_high() * b.get_high()); +} + +// vector operator / : divide. See bottom of file + +// vector operator >> : shift right logical all elements +static inline Vec32us operator >> (Vec32us const a, uint32_t b) { + return Vec32us(a.get_low() >> b, a.get_high() >> b); +} +static inline Vec32us operator >> (Vec32us const a, int b) { + return a >> uint32_t(b); +} + +// vector operator >>= : shift right logical +static inline Vec32us & operator >>= (Vec32us & a, uint32_t b) { + a = a >> b; + return a; +} + +// vector operator >>= : shift right logical (signed b) +static inline Vec32us & operator >>= (Vec32us & a, int32_t b) { + a = a >> uint32_t(b); + return a; +} + +// vector operator << : shift left all elements +static inline Vec32us operator << (Vec32us const a, uint32_t b) { + return Vec32us(a.get_low() << b, a.get_high() << b); +} +static inline Vec32us operator << (Vec32us const a, int b) { + return a << uint32_t(b); +} + +// vector operator < : returns true for elements for which a < b (unsigned) +static inline Vec32sb operator < (Vec32us const a, Vec32us const b) { + return Vec32sb(a.get_low() < b.get_low(), a.get_high() < b.get_high()); +} + +// vector operator > : returns true for elements for which a > b (unsigned) +static inline Vec32sb operator > (Vec32us const a, Vec32us const b) { + return b < a; +} + +// vector operator >= : returns true for elements for which a >= b (unsigned) +static inline Vec32sb operator >= (Vec32us const a, Vec32us const b) { + return Vec32sb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high()); +} + +// vector operator <= : returns true for elements for which a <= b (unsigned) +static inline Vec32sb operator <= (Vec32us const a, Vec32us const b) { + return b >= a; +} + +// vector operator & : bitwise and +static inline Vec32us operator & (Vec32us const a, Vec32us const b) { + return Vec32us(Vec32s(a) & Vec32s(b)); +} + +// vector operator | : bitwise or 
+static inline Vec32us operator | (Vec32us const a, Vec32us const b) { + return Vec32us(Vec32s(a) | Vec32s(b)); +} + +// vector operator ^ : bitwise xor +static inline Vec32us operator ^ (Vec32us const a, Vec32us const b) { + return Vec32us(Vec32s(a) ^ Vec32s(b)); +} + +// vector operator ~ : bitwise not +static inline Vec32us operator ~ (Vec32us const a) { + return Vec32us( ~ Vec32s(a)); +} + +// Functions for this class + +// Select between two operands. Corresponds to this pseudocode: +// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; +static inline Vec32us select (Vec32sb const s, Vec32us const a, Vec32us const b) { + return Vec32us(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high())); +} + +// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] +static inline Vec32us if_add (Vec32sb const f, Vec32us const a, Vec32us const b) { + return Vec32us(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional subtract +static inline Vec32us if_sub (Vec32sb const f, Vec32us const a, Vec32us const b) { + return Vec32us(if_sub(f.get_low(), a.get_low(), b.get_low()), if_sub(f.get_high(), a.get_high(), b.get_high())); +} + +// Conditional multiply +static inline Vec32us if_mul (Vec32sb const f, Vec32us const a, Vec32us const b) { + return Vec32us(if_mul(f.get_low(), a.get_low(), b.get_low()), if_mul(f.get_high(), a.get_high(), b.get_high())); +} + +// function add_saturated: add element by element, unsigned with saturation +static inline Vec32us add_saturated(Vec32us const a, Vec32us const b) { + return Vec32us(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high())); +} + +// function sub_saturated: subtract element by element, unsigned with saturation +static inline Vec32us sub_saturated(Vec32us const a, Vec32us const b) { + return Vec32us(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high())); +} + +// function max: a > b ? a : b +static inline Vec32us max(Vec32us const a, Vec32us const b) { + return Vec32us(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high())); +} + +// function min: a < b ? a : b +static inline Vec32us min(Vec32us const a, Vec32us const b) { + return Vec32us(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high())); +} + + +/***************************************************************************** +* +* Vector permute and blend functions +* +*****************************************************************************/ + +// Permute vector of 32 16-bit integers. +template + static inline Vec32s permute32(Vec32s const a) { + return Vec32s( + blend16 (a.get_low(), a.get_high()), + blend16 (a.get_low(), a.get_high())); +} + +template + static inline Vec32us permute32(Vec32us const a) { + return Vec32us (permute32 (Vec32s(a))); +} + +// Permute vector of 64 8-bit integers. 
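A brief usage sketch (an illustrative assumption, not taken from the patch): permute32 takes 32 compile-time element indices, so reversing the element order of a Vec32s looks like the helper below; the 64-element byte permute that follows works the same way.

    // Minimal sketch: reverse the order of 32 packed int16_t values via permute32.
    static inline void reverse_i16x32(const int16_t * in, int16_t * out) {
        Vec32s v;
        v.load(in);                                 // unaligned load of 32 int16_t
        Vec32s rev = permute32<31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
                               15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>(v);
        rev.store(out);                             // out[i] == in[31 - i]
    }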
+template < + int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, + int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15, + int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, + int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31, + int i32, int i33, int i34, int i35, int i36, int i37, int i38, int i39, + int i40, int i41, int i42, int i43, int i44, int i45, int i46, int i47, + int i48, int i49, int i50, int i51, int i52, int i53, int i54, int i55, + int i56, int i57, int i58, int i59, int i60, int i61, int i62, int i63 > + static inline Vec64c permute64(Vec64c const a) { + return Vec64c( + blend32 < + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 + > (a.get_low(), a.get_high()), + blend32 < + i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47, + i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63 + > (a.get_low(), a.get_high())); +} + +template +static inline Vec64uc permute64(Vec64uc const a) { + return Vec64uc (permute64 (Vec64c(a))); +} + +// Blend vector of 32 16-bit integers +template + static inline Vec32s blend32(Vec32s const& a, Vec32s const& b) { + Vec16s x0 = blend_half(a, b); + Vec16s x1 = blend_half(a, b); + return Vec32s(x0, x1); +} + +template +static inline Vec32us blend32(Vec32us const a, Vec32us const b) { + return Vec32us(blend32 (Vec32s(a),Vec32s(b))); +} + +// Blend vector of 64 8-bit integers +template < + int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, + int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15, + int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, + int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31, + int i32, int i33, int i34, int i35, int i36, int i37, int i38, int i39, + int i40, int i41, int i42, int i43, int i44, int i45, int i46, int i47, + int i48, int i49, int i50, int i51, int i52, int i53, int i54, int i55, + int i56, int i57, int i58, int i59, int i60, int i61, int i62, int i63 > + static inline Vec64c blend64(Vec64c const a, Vec64c const b) { + Vec32c x0 = blend_half < Vec64c, + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 > (a, b); + Vec32c x1 = blend_half < Vec64c, + i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47, + i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63 > (a, b); + return Vec64c(x0, x1); +} + +template +static inline Vec64uc blend64(Vec64uc const a, Vec64uc const b) { + return Vec64uc(blend64 (Vec64c(a), Vec64c(b))); +} + + +/***************************************************************************** +* +* Vector lookup functions +* +****************************************************************************** +* +* These functions use vector elements as indexes into a table. 
+* The table is given as one or more vectors +* +*****************************************************************************/ + +// lookup in table of 64 int8_t values +static inline Vec64c lookup64(Vec64c const index, Vec64c const table1) { + int8_t table[64], result[64]; + table1.store(table); + for (int i=0; i<64; i++) result[i] = table[index[i] & 63]; + return Vec64c().load(result); +} + +// lookup in table of 128 int8_t values +static inline Vec64c lookup128(Vec64c const index, Vec64c const table1, Vec64c const table2) { + int8_t table[128], result[64]; + table1.store(table); table2.store(table+64); + for (int i=0; i<64; i++) result[i] = table[index[i] & 127]; + return Vec64c().load(result); +} + +// lookup in table of 256 int8_t values. +// The complete table of all possible 256 byte values is contained in four vectors +// The index is treated as unsigned +static inline Vec64c lookup256(Vec64c const index, Vec64c const table1, Vec64c const table2, Vec64c const table3, Vec64c const table4) { + int8_t table[256], result[64]; + table1.store(table); table2.store(table+64); table3.store(table+128); table4.store(table+192); + for (int i=0; i<64; i++) result[i] = table[index[i] & 255]; + return Vec64c().load(result); +} + +// lookup in table of 32 values +static inline Vec32s lookup32(Vec32s const index, Vec32s const table1) { + int16_t table[32], result[32]; + table1.store(table); + for (int i=0; i<32; i++) result[i] = table[index[i] & 31]; + return Vec32s().load(result); +} + +// lookup in table of 64 values +static inline Vec32s lookup64(Vec32s const index, Vec32s const table1, Vec32s const table2) { + int16_t table[64], result[32]; + table1.store(table); table2.store(table+32); + for (int i=0; i<32; i++) result[i] = table[index[i] & 63]; + return Vec32s().load(result); +} + +// lookup in table of 128 values +static inline Vec32s lookup128(Vec32s const index, Vec32s const table1, Vec32s const table2, Vec32s const table3, Vec32s const table4) { + int16_t table[128], result[32]; + table1.store(table); table2.store(table+32); table3.store(table+64); table4.store(table+96); + for (int i=0; i<32; i++) result[i] = table[index[i] & 127]; + return Vec32s().load(result); +} + + +/***************************************************************************** +* +* Byte shifts +* +*****************************************************************************/ + +// Function shift_bytes_up: shift whole vector left by b bytes. 
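Before the byte-shift helpers that follow, a short usage sketch for the lookup functions above (illustrative only; note that the & masks in their bodies wrap out-of-range indices rather than rejecting them).

    // Minimal sketch: gather 16-bit values through a 32-entry table with lookup32.
    static inline Vec32s square_via_table(Vec32s const idx) {
        int16_t tab[32];
        for (int i = 0; i < 32; i++) tab[i] = (int16_t)(i * i);
        Vec32s table;
        table.load(tab);
        return lookup32(idx, table);                // element j becomes tab[idx[j] & 31]
    }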
+template <unsigned int b>
+static inline Vec64c shift_bytes_up(Vec64c const a) {
+ int8_t dat[128];
+ if (b < 64) {
+ Vec64c(0).store(dat);
+ a.store(dat+b);
+ return Vec64c().load(dat);
+ }
+ else return 0;
+}
+
+// Function shift_bytes_down: shift whole vector right by b bytes
+template <unsigned int b>
+static inline Vec64c shift_bytes_down(Vec64c const a) {
+ int8_t dat[128];
+ if (b < 64) {
+ a.store(dat);
+ Vec64c(0).store(dat+64);
+ return Vec64c().load(dat+b);
+ }
+ else return 0;
+}
+
+
+/*****************************************************************************
+*
+* Functions for conversion between integer sizes
+*
+*****************************************************************************/
+
+// Extend 8-bit integers to 16-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 32 elements to 16 bits with sign extension
+static inline Vec32s extend_low (Vec64c const a) {
+ return Vec32s(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 32 elements to 16 bits with sign extension
+static inline Vec32s extend_high (Vec64c const a) {
+ return Vec32s(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Function extend_low : extends the low 32 elements to 16 bits with zero extension
+static inline Vec32us extend_low (Vec64uc const a) {
+ return Vec32us(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 32 elements to 16 bits with zero extension
+static inline Vec32us extend_high (Vec64uc const a) {
+ return Vec32us(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Extend 16-bit integers to 32-bit integers, signed and unsigned
+
+// Function extend_low : extends the low 16 elements to 32 bits with sign extension
+static inline Vec16i extend_low (Vec32s const a) {
+ return Vec16i(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 16 elements to 32 bits with sign extension
+static inline Vec16i extend_high (Vec32s const a) {
+ return Vec16i(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+// Function extend_low : extends the low 16 elements to 32 bits with zero extension
+static inline Vec16ui extend_low (Vec32us const a) {
+ return Vec16ui(extend_low(a.get_low()), extend_high(a.get_low()));
+}
+
+// Function extend_high : extends the high 16 elements to 32 bits with zero extension
+static inline Vec16ui extend_high (Vec32us const a) {
+ return Vec16ui(extend_low(a.get_high()), extend_high(a.get_high()));
+}
+
+
+// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Overflow wraps around
+static inline Vec64c compress (Vec32s const low, Vec32s const high) {
+ return Vec64c(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
+}
+
+// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
+// Signed, with saturation
+static inline Vec64c compress_saturated (Vec32s const low, Vec32s const high) {
+ return Vec64c(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
+}
+
+// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers
+// Unsigned, overflow wraps around
+static inline Vec64uc compress (Vec32us const low, Vec32us const high) {
+ return Vec64uc(compress((Vec32s)low, (Vec32s)high));
+}
+
+// Function compress : packs two
vectors of 16-bit integers into one vector of 8-bit integers +// Unsigned, with saturation +static inline Vec64uc compress_saturated (Vec32us const low, Vec32us const high) { + return Vec64uc(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); +} + +// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec32s compress (Vec16i const low, Vec16i const high) { + return Vec32s(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high())); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Signed with saturation +static inline Vec32s compress_saturated (Vec16i const low, Vec16i const high) { + return Vec32s(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Overflow wraps around +static inline Vec32us compress (Vec16ui const low, Vec16ui const high) { + return Vec32us (compress((Vec16i)low, (Vec16i)high)); +} + +// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers +// Unsigned, with saturation +static inline Vec32us compress_saturated (Vec16ui const low, Vec16ui const high) { + return Vec32us(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high())); +} + + +/***************************************************************************** +* +* Integer division operators +* +* Please see the file vectori128.h for explanation. 
+* +*****************************************************************************/ + +// vector operator / : divide each element by divisor + +// vector of 32 16-bit signed integers +static inline Vec32s operator / (Vec32s const a, Divisor_s const d) { + return Vec32s(a.get_low() / d, a.get_high() / d); +} + +// vector of 16 16-bit unsigned integers +static inline Vec32us operator / (Vec32us const a, Divisor_us const d) { + return Vec32us(a.get_low() / d, a.get_high() / d); +} + +// vector of 32 8-bit signed integers +static inline Vec64c operator / (Vec64c const a, Divisor_s const d) { + return Vec64c(a.get_low() / d, a.get_high() / d); +} + +// vector of 32 8-bit unsigned integers +static inline Vec64uc operator / (Vec64uc const a, Divisor_us const d) { + return Vec64uc(a.get_low() / d, a.get_high() / d); +} + +// vector operator /= : divide +static inline Vec32s & operator /= (Vec32s & a, Divisor_s const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec32us & operator /= (Vec32us & a, Divisor_us const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec64c & operator /= (Vec64c & a, Divisor_s const d) { + a = a / d; + return a; +} + +// vector operator /= : divide +static inline Vec64uc & operator /= (Vec64uc & a, Divisor_us const d) { + a = a / d; + return a; +} + + +/***************************************************************************** +* +* Integer division 2: divisor is a compile-time constant +* +*****************************************************************************/ + + +// Divide Vec32s by compile-time constant +template +static inline Vec32s divide_by_i(Vec32s const a) { + return Vec32s(divide_by_i(a.get_low()), divide_by_i(a.get_high())); +} + +// define Vec32s a / const_int(d) +template +static inline Vec32s operator / (Vec32s const a, Const_int_t) { + return Vec32s(divide_by_i(a.get_low()), divide_by_i(a.get_high())); +} + +// define Vec32s a / const_uint(d) +template +static inline Vec32s operator / (Vec32s const a, Const_uint_t) { + return Vec32s(divide_by_i(a.get_low()), divide_by_i(a.get_high())); +} + +// vector operator /= : divide +template +static inline Vec32s & operator /= (Vec32s & a, Const_int_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec32s & operator /= (Vec32s & a, Const_uint_t b) { + a = a / b; + return a; +} + +// Divide Vec32us by compile-time constant +template +static inline Vec32us divide_by_ui(Vec32us const a) { + return Vec32us( divide_by_ui(a.get_low()), divide_by_ui(a.get_high())); +} + +// define Vec32us a / const_uint(d) +template +static inline Vec32us operator / (Vec32us const a, Const_uint_t) { + return divide_by_ui(a); +} + +// define Vec32us a / const_int(d) +template +static inline Vec32us operator / (Vec32us const a, Const_int_t) { + static_assert(d >= 0, "Dividing unsigned integer by negative is ambiguous"); + return divide_by_ui(a); // unsigned divide +} + +// vector operator /= : divide +template +static inline Vec32us & operator /= (Vec32us & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec32us & operator /= (Vec32us & a, Const_int_t b) { + a = a / b; + return a; +} + +// define Vec64c a / const_int(d) +template +static inline Vec64c operator / (Vec64c const a, Const_int_t b) { + return Vec64c( a.get_low() / b, a.get_high() / b); +} + +// define Vec64c a / const_uint(d) +template +static inline Vec64c operator / (Vec64c const a, Const_uint_t 
b) { + return Vec64c( a.get_low() / b, a.get_high() / b); +} + +// vector operator /= : divide +template +static inline Vec64c & operator /= (Vec64c & a, Const_int_t b) { + a = a / b; + return a; +} +// vector operator /= : divide +template +static inline Vec64c & operator /= (Vec64c & a, Const_uint_t b) { + a = a / b; + return a; +} + +// define Vec64uc a / const_uint(d) +template +static inline Vec64uc operator / (Vec64uc const a, Const_uint_t b) { + return Vec64uc( a.get_low() / b, a.get_high() / b); +} + +// define Vec64uc a / const_int(d) +template +static inline Vec64uc operator / (Vec64uc const a, Const_int_t b) { + return Vec64uc( a.get_low() / b, a.get_high() / b); +} + +// vector operator /= : divide +template +static inline Vec64uc & operator /= (Vec64uc & a, Const_uint_t b) { + a = a / b; + return a; +} + +// vector operator /= : divide +template +static inline Vec64uc & operator /= (Vec64uc & a, Const_int_t b) { + a = a / b; + return a; +} + +#ifdef VCL_NAMESPACE +} +#endif + +#endif // VECTORI512S_H diff --git a/DFTTest/vectorclass/vectormath_common.h b/DFTTest/VCL2/vectormath_common.h similarity index 68% rename from DFTTest/vectorclass/vectormath_common.h rename to DFTTest/VCL2/vectormath_common.h index ded82bd..8e082b6 100644 --- a/DFTTest/vectorclass/vectormath_common.h +++ b/DFTTest/VCL2/vectormath_common.h @@ -1,43 +1,46 @@ /*************************** vectormath_common.h **************************** * Author: Agner Fog * Date created: 2014-04-18 -* Last modified: 2016-11-25 -* Version: 1.25 +* Last modified: 2020-06-08 +* Version: 2.01.03 * Project: vector classes * Description: * Header file containing common code for inline version of mathematical functions. * -* Theory, methods and inspiration based partially on these sources: -* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. -* Ellis Horwood, 1989. -* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and -* Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt -* > Cephes math library by Stephen L. Moshier 1992, -* http://www.netlib.org/cephes/ -* -* Calculation methods: -* Some functions are using Padé approximations f(x) = P(x)/Q(x) -* Most single precision functions are using Taylor expansions -* * For detailed instructions, see VectorClass.pdf * -* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses +* (c) Copyright 2014-2020 Agner Fog. +* Apache License version 2.0 or later. ******************************************************************************/ #ifndef VECTORMATH_COMMON_H -#define VECTORMATH_COMMON_H 1 +#define VECTORMATH_COMMON_H 2 #ifdef VECTORMATH_LIB_H -#error conflicting header files: vectormath_lib.h for external math functions, other vectormath_xxx.h for inline math functions +#error conflicting header files. 
More than one implementation of mathematical functions included #endif -#include +#include + +#ifndef VECTORCLASS_H #include "vectorclass.h" +#endif +#if VECTORCLASS_H < 20000 +#error Incompatible versions of vector class library mixed +#endif /****************************************************************************** - define mathematical constants + Define NAN payload values +******************************************************************************/ +#define NAN_LOG 0x101 // logarithm for x<0 +#define NAN_POW 0x102 // negative number raised to non-integer power +#define NAN_HYP 0x104 // acosh for x<1 and atanh for abs(x)>1 + + +/****************************************************************************** + Define mathematical constants ******************************************************************************/ #define VM_PI 3.14159265358979323846 // pi #define VM_PI_2 1.57079632679489661923 // pi / 2 @@ -51,6 +54,7 @@ #define VM_SMALLEST_NORMAL 2.2250738585072014E-308 // smallest normal number, double #define VM_SMALLEST_NORMALF 1.17549435E-38f // smallest normal number, float + #ifdef VCL_NAMESPACE namespace VCL_NAMESPACE { #endif @@ -100,53 +104,58 @@ inline Vec16f infinite_vec() { #endif // MAX_VECTOR_SIZE >= 512 -// template for producing quiet NAN -template -static inline VTYPE nan_vec(int n = 0x100); -template <> -inline Vec2d nan_vec(int n) { - return nan2d(n); +/****************************************************************************** +* Detect NAN codes +* +* These functions return the code hidden in a NAN. The sign bit is ignored +******************************************************************************/ + +static inline Vec4ui nan_code(Vec4f const x) { + Vec4ui a = Vec4ui(reinterpret_i(x)); + Vec4ui const n = 0x007FFFFF; + return select(Vec4ib(is_nan(x)), a & n, 0); } -template <> -inline Vec4f nan_vec(int n) { - return nan4f(n); +// This function returns the code hidden in a NAN. The sign bit is ignored +static inline Vec2uq nan_code(Vec2d const x) { + Vec2uq a = Vec2uq(reinterpret_i(x)); + return select(Vec2qb(is_nan(x)), a << 12 >> (12+29), 0); } #if MAX_VECTOR_SIZE >= 256 -template <> -inline Vec4d nan_vec(int n) { - return nan4d(n); +// This function returns the code hidden in a NAN. The sign bit is ignored +static inline Vec8ui nan_code(Vec8f const x) { + Vec8ui a = Vec8ui(reinterpret_i(x)); + Vec8ui const n = 0x007FFFFF; + return select(Vec8ib(is_nan(x)), a & n, 0); } -template <> -inline Vec8f nan_vec(int n) { - return nan8f(n); +// This function returns the code hidden in a NAN. The sign bit is ignored +static inline Vec4uq nan_code(Vec4d const x) { + Vec4uq a = Vec4uq(reinterpret_i(x)); + return select(Vec4qb(is_nan(x)), a << 12 >> (12+29), 0); } #endif // MAX_VECTOR_SIZE >= 256 - #if MAX_VECTOR_SIZE >= 512 -template <> -inline Vec8d nan_vec(int n) { - return nan8d(n); +// This function returns the code hidden in a NAN. The sign bit is ignored +static inline Vec16ui nan_code(Vec16f const x) { + Vec16ui a = Vec16ui(reinterpret_i(x)); + Vec16ui const n = 0x007FFFFF; + return select(Vec16ib(is_nan(x)), a & n, 0); } -template <> -inline Vec16f nan_vec(int n) { - return nan16f(n); +// This function returns the code hidden in a NAN. 
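As a usage sketch (an assumption for illustration, not part of the patch): a caller can recover the payload planted by these NAN-coded results, e.g. to tell whether a NAN came from log of a negative argument.

    // Minimal sketch, assuming the vector log() defined later in this library and the NAN_LOG code above.
    static inline bool first_lane_is_log_nan(Vec4f const x) {
        Vec4f y = log(x);                           // log of a negative element returns a NAN tagged with NAN_LOG
        Vec4ui code = nan_code(y);                  // 0 for ordinary elements, the NAN mantissa bits otherwise
        return (code[0] & 0x1FF) == NAN_LOG;        // low payload bits identify the source
    }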
The sign bit is ignored +static inline Vec8uq nan_code(Vec8d const x) { + Vec8uq a = Vec8uq(reinterpret_i(x)); + return select(Vec8qb(is_nan(x)), a << 12 >> (12+29), 0); } #endif // MAX_VECTOR_SIZE >= 512 -// Define NAN trace values -#define NAN_LOG 0x101 // logarithm for x<0 -#define NAN_POW 0x102 // negative number raised to non-integer power -#define NAN_HYP 0x104 // acosh for x<1 and atanh for abs(x)>1 - /****************************************************************************** templates for polynomials @@ -154,9 +163,9 @@ Using Estrin's scheme to make shorter dependency chains and use FMA, starting longest dependency chains first. ******************************************************************************/ -// template +// template template -static inline VTYPE polynomial_2(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2) { +static inline VTYPE polynomial_2(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2) { // calculates polynomial c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -165,7 +174,7 @@ static inline VTYPE polynomial_2(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2) } template -static inline VTYPE polynomial_3(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { +static inline VTYPE polynomial_3(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { // calculates polynomial c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -174,7 +183,7 @@ static inline VTYPE polynomial_3(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_4(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { +static inline VTYPE polynomial_4(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { // calculates polynomial c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -184,7 +193,7 @@ static inline VTYPE polynomial_4(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_4n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { +static inline VTYPE polynomial_4n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3) { // calculates polynomial 1*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -194,7 +203,7 @@ static inline VTYPE polynomial_4n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_5(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { +static inline VTYPE polynomial_5(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { // calculates polynomial c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -204,7 +213,7 @@ static inline VTYPE polynomial_5(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_5n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { +static inline VTYPE polynomial_5n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4) { // calculates polynomial 1*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -214,7 +223,7 @@ static inline VTYPE polynomial_5n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_6(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) { +static inline VTYPE 
polynomial_6(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6) { // calculates polynomial c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -224,7 +233,7 @@ static inline VTYPE polynomial_6(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_6n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { +static inline VTYPE polynomial_6n(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5) { // calculates polynomial 1*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -234,7 +243,7 @@ static inline VTYPE polynomial_6n(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_7(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) { +static inline VTYPE polynomial_7(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7) { // calculates polynomial c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -244,7 +253,7 @@ static inline VTYPE polynomial_7(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_8(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) { +static inline VTYPE polynomial_8(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8) { // calculates polynomial c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -256,7 +265,7 @@ static inline VTYPE polynomial_8(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_9(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) { +static inline VTYPE polynomial_9(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9) { // calculates polynomial c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -269,7 +278,7 @@ static inline VTYPE polynomial_9(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_10(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) { +static inline VTYPE polynomial_10(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10) { // calculates polynomial c10*x^10 + c9*x^9 + c8*x^8 + c7*x^7 + c6*x^6 + c5*x^5 + c4*x^4 + c3*x^3 + c2*x^2 + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -282,7 +291,7 @@ static inline VTYPE polynomial_10(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, } template -static inline VTYPE polynomial_13(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { +static inline VTYPE polynomial_13(VTYPE const x, CTYPE c0, CTYPE c1, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE 
c12, CTYPE c13) { // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; @@ -299,7 +308,7 @@ static inline VTYPE polynomial_13(VTYPE const & x, CTYPE c0, CTYPE c1, CTYPE c2, template -static inline VTYPE polynomial_13m(VTYPE const & x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { +static inline VTYPE polynomial_13m(VTYPE const x, CTYPE c2, CTYPE c3, CTYPE c4, CTYPE c5, CTYPE c6, CTYPE c7, CTYPE c8, CTYPE c9, CTYPE c10, CTYPE c11, CTYPE c12, CTYPE c13) { // calculates polynomial c13*x^13 + c12*x^12 + ... + x + 0 // VTYPE may be a vector type, CTYPE is a scalar type VTYPE x2 = x * x; diff --git a/DFTTest/vectorclass/vectormath_exp.h b/DFTTest/VCL2/vectormath_exp.h similarity index 50% rename from DFTTest/vectorclass/vectormath_exp.h rename to DFTTest/VCL2/vectormath_exp.h index 91f4ee7..124b1a1 100644 --- a/DFTTest/vectorclass/vectormath_exp.h +++ b/DFTTest/VCL2/vectormath_exp.h @@ -1,11 +1,11 @@ /**************************** vectormath_exp.h ****************************** * Author: Agner Fog * Date created: 2014-04-18 -* Last modified: 2016-12-26 -* Version: 1.26 -* Project: vector classes +* Last modified: 2020-06-08 +* Version: 2.00.03 +* Project: vector class library * Description: -* Header file containing inline vector functions of logarithms, exponential +* Header file containing inline vector functions of logarithms, exponential * and power functions: * exp exponential function * exp2 exponential function base 2 @@ -22,20 +22,21 @@ * Theory, methods and inspiration based partially on these sources: * > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. * Ellis Horwood, 1989. -* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and -* Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt +* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and Vincenzo Innocente, +* 2012, https://root.cern.ch/doc/v606_/md_math_vdt_ReadMe.html * > Cephes math library by Stephen L. Moshier 1992, * http://www.netlib.org/cephes/ * -* For detailed instructions, see vectormath_common.h and VectorClass.pdf +* For detailed instructions see vcl_manual.pdf * -* (c) Copyright 2014-2016 GNU General Public License http://www.gnu.org/licenses +* (c) Copyright 2014-2020 Agner Fog. +* Apache License version 2.0 or later. ******************************************************************************/ #ifndef VECTORMATH_EXP_H -#define VECTORMATH_EXP_H 1 +#define VECTORMATH_EXP_H 1 -#include "vectormath_common.h" +#include "vectormath_common.h" #ifdef VCL_NAMESPACE namespace VCL_NAMESPACE { @@ -48,7 +49,7 @@ namespace VCL_NAMESPACE { // Helper functions, used internally: // This function calculates pow(2,n) where n must be an integer. 
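The vm_pow2n helpers below build 2^n directly in the exponent field; a scalar sketch of the same trick (an illustrative assumption: IEEE-754 double layout, integral n within the normal exponent range) may make the vector code easier to follow.

    #include <cstdint>
    #include <cstring>
    static inline double pow2n_scalar(double n) {
        const double pow2_52 = 4503599627370496.0;  // 2^52
        const double bias = 1023.0;                 // exponent bias of double
        double a = n + (bias + pow2_52);            // n + bias now occupies the low mantissa bits of a
        uint64_t b;
        std::memcpy(&b, &a, sizeof b);
        b <<= 52;                                   // move n + bias into the exponent field
        double d;
        std::memcpy(&d, &b, sizeof d);
        return d;                                   // equals 2^n for integral n; no overflow checks
    }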
Does not check for overflow or underflow -static inline Vec2d vm_pow2n (Vec2d const & n) { +static inline Vec2d vm_pow2n (Vec2d const n) { const double pow2_52 = 4503599627370496.0; // 2^52 const double bias = 1023.0; // bias in exponent Vec2d a = n + (bias + pow2_52); // put n + bias in least significant bits @@ -58,7 +59,7 @@ static inline Vec2d vm_pow2n (Vec2d const & n) { return d; } -static inline Vec4f vm_pow2n (Vec4f const & n) { +static inline Vec4f vm_pow2n (Vec4f const n) { const float pow2_23 = 8388608.0; // 2^23 const float bias = 127.0; // bias in exponent Vec4f a = n + (bias + pow2_23); // put n + bias in least significant bits @@ -70,7 +71,7 @@ static inline Vec4f vm_pow2n (Vec4f const & n) { #if MAX_VECTOR_SIZE >= 256 -static inline Vec4d vm_pow2n (Vec4d const & n) { +static inline Vec4d vm_pow2n (Vec4d const n) { const double pow2_52 = 4503599627370496.0; // 2^52 const double bias = 1023.0; // bias in exponent Vec4d a = n + (bias + pow2_52); // put n + bias in least significant bits @@ -80,7 +81,7 @@ static inline Vec4d vm_pow2n (Vec4d const & n) { return d; } -static inline Vec8f vm_pow2n (Vec8f const & n) { +static inline Vec8f vm_pow2n (Vec8f const n) { const float pow2_23 = 8388608.0; // 2^23 const float bias = 127.0; // bias in exponent Vec8f a = n + (bias + pow2_23); // put n + bias in least significant bits @@ -94,7 +95,7 @@ static inline Vec8f vm_pow2n (Vec8f const & n) { #if MAX_VECTOR_SIZE >= 512 -static inline Vec8d vm_pow2n (Vec8d const & n) { +static inline Vec8d vm_pow2n (Vec8d const n) { #ifdef __AVX512ER__ return _mm512_exp2a23_round_pd(n, _MM_FROUND_NO_EXC); // this is exact only for integral n #else @@ -108,7 +109,7 @@ static inline Vec8d vm_pow2n (Vec8d const & n) { #endif } -static inline Vec16f vm_pow2n (Vec16f const & n) { +static inline Vec16f vm_pow2n (Vec16f const n) { #ifdef __AVX512ER__ return _mm512_exp2a23_round_ps(n, _MM_FROUND_NO_EXC); #else @@ -130,30 +131,29 @@ static inline Vec16f vm_pow2n (Vec16f const & n) { // This function does not produce denormals // Template parameters: // VTYPE: double vector type -// BVTYPE: boolean vector type // M1: 0 for exp, 1 for expm1 // BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x) -#if 1 // choose method +#if true // choose method // Taylor expansion -template -static inline VTYPE exp_d(VTYPE const & initial_x) { +template +static inline VTYPE exp_d(VTYPE const initial_x) { // Taylor coefficients, 1/n! // Not using minimax approximation because we prioritize precision close to x = 0 const double p2 = 1./2.; const double p3 = 1./6.; const double p4 = 1./24.; - const double p5 = 1./120.; - const double p6 = 1./720.; - const double p7 = 1./5040.; - const double p8 = 1./40320.; - const double p9 = 1./362880.; - const double p10 = 1./3628800.; - const double p11 = 1./39916800.; - const double p12 = 1./479001600.; - const double p13 = 1./6227020800.; + const double p5 = 1./120.; + const double p6 = 1./720.; + const double p7 = 1./5040.; + const double p8 = 1./40320.; + const double p9 = 1./362880.; + const double p10 = 1./3628800.; + const double p11 = 1./39916800.; + const double p12 = 1./479001600.; + const double p13 = 1./6227020800.; // maximum abs(x), value depends on BA, defined below // The lower limit of x is slightly more restrictive than the upper limit. @@ -162,10 +162,9 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { // data vectors VTYPE x, r, z, n2; - BVTYPE inrange; // boolean vector - if (BA <= 1) { // exp(x) - max_x = BA == 0 ? 
708.39 : 709.7; // lower limit for 0.5*exp(x) is -707.6, but we are using 0.5*exp(x) only for positive x in hyperbolic functions + if constexpr (BA <= 1) { // exp(x) + max_x = BA == 0 ? 708.39 : 709.7; // lower limit for 0.5*exp(x) is -707.6, but we are using 0.5*exp(x) only for positive x in hyperbolic functions const double ln2d_hi = 0.693145751953125; const double ln2d_lo = 1.42860682030941723212E-6; x = initial_x; @@ -174,15 +173,15 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { x = nmul_add(r, ln2d_hi, x); // x -= r * ln2d_hi; x = nmul_add(r, ln2d_lo, x); // x -= r * ln2d_lo; } - else if (BA == 2) { // pow(2,x) + else if constexpr (BA == 2) { // pow(2,x) max_x = 1022.0; r = round(initial_x); x = initial_x - r; x *= VM_LN2; } - else if (BA == 10) { // pow(10,x) + else if constexpr (BA == 10) { // pow(10,x) max_x = 307.65; - const double log10_2_hi = 0.30102999554947019; // log10(2) in two parts + const double log10_2_hi = 0.30102999554947019; // log10(2) in two parts const double log10_2_lo = 1.1451100899212592E-10; x = initial_x; r = round(initial_x*(VM_LOG2E*VM_LN10)); @@ -197,22 +196,25 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { z = polynomial_13m(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13); - if (BA == 1) r--; // 0.5 * exp(x) + if constexpr (BA == 1) r--; // 0.5 * exp(x) - // multiply by power of 2 + // multiply by power of 2 n2 = vm_pow2n(r); - if (M1 == 0) { + if constexpr (M1 == 0) { // exp z = (z + 1.0) * n2; } else { // expm1 z = mul_add(z, n2, n2 - 1.0); // z = z * n2 + (n2 - 1.0); +#ifdef SIGNED_ZERO // pedantic preservation of signed zero + z = select(initial_x == 0., initial_x, z); +#endif } // check for overflow - inrange = abs(initial_x) < max_x; + auto inrange = abs(initial_x) < max_x; // boolean vector // check for INF and NAN inrange &= is_finite(initial_x); @@ -222,9 +224,9 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { } else { // overflow, underflow and NAN - r = select(sign_bit(initial_x), 0.-M1, infinite_vec()); // value in case of +/- overflow or INF - z = select(inrange, z, r); // +/- underflow - z = select(is_nan(initial_x), initial_x, z); // NAN goes through + r = select(sign_bit(initial_x), 0.-(M1&1), infinite_vec()); // value in case of +/- overflow or INF + z = select(inrange, z, r); // +/- underflow + z = select(is_nan(initial_x), initial_x, z); // NAN goes through return z; } } @@ -232,8 +234,8 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { #else // Pade expansion uses less code and fewer registers, but is slower -template -static inline VTYPE exp_d(VTYPE const & initial_x) { +template +static inline VTYPE exp_d(VTYPE const initial_x) { // define constants const double ln2p1 = 0.693145751953125; @@ -250,7 +252,6 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { const double Q3exp = 3.00198505138664455042E-6; VTYPE x, r, xx, px, qx, y, n2; // data vectors - BVTYPE inrange; // boolean vector x = initial_x; r = round(initial_x*log2e); @@ -275,7 +276,7 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { // n2 = exp2(n); n2 = vm_pow2n(r); // this is faster - if (M1 == 0) { + if constexpr (M1 == 0) { // exp y = (y + 1.0) * n2; } @@ -285,7 +286,7 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { } // overflow - inrange = abs(initial_x) < max_exp; + auto inrange = abs(initial_x) < max_exp; // check for INF and NAN inrange &= is_finite(initial_x); @@ -304,91 +305,90 @@ static inline VTYPE exp_d(VTYPE const & initial_x) { #endif // instances of exp_d template -static inline Vec2d exp(Vec2d const & 
x) { - return exp_d(x); +static inline Vec2d exp(Vec2d const x) { + return exp_d(x); } -static inline Vec2d expm1(Vec2d const & x) { - return exp_d(x); +static inline Vec2d expm1(Vec2d const x) { + return exp_d(x); } -static inline Vec2d exp2(Vec2d const & x) { - return exp_d(x); +static inline Vec2d exp2(Vec2d const x) { + return exp_d(x); } -static inline Vec2d exp10(Vec2d const & x) { - return exp_d(x); +static inline Vec2d exp10(Vec2d const x) { + return exp_d(x); } #if MAX_VECTOR_SIZE >= 256 -static inline Vec4d exp(Vec4d const & x) { - return exp_d(x); +static inline Vec4d exp(Vec4d const x) { + return exp_d(x); } -static inline Vec4d expm1(Vec4d const & x) { - return exp_d(x); +static inline Vec4d expm1(Vec4d const x) { + return exp_d(x); } -static inline Vec4d exp2(Vec4d const & x) { - return exp_d(x); +static inline Vec4d exp2(Vec4d const x) { + return exp_d(x); } -static inline Vec4d exp10(Vec4d const & x) { - return exp_d(x); +static inline Vec4d exp10(Vec4d const x) { + return exp_d(x); } #endif // MAX_VECTOR_SIZE >= 256 #if MAX_VECTOR_SIZE >= 512 -static inline Vec8d exp(Vec8d const & x) { - return exp_d(x); +static inline Vec8d exp(Vec8d const x) { + return exp_d(x); } -static inline Vec8d expm1(Vec8d const & x) { - return exp_d(x); +static inline Vec8d expm1(Vec8d const x) { + return exp_d(x); } -static inline Vec8d exp2(Vec8d const & x) { - return exp_d(x); +static inline Vec8d exp2(Vec8d const x) { + return exp_d(x); } -static inline Vec8d exp10(Vec8d const & x) { - return exp_d(x); +static inline Vec8d exp10(Vec8d const x) { + return exp_d(x); } #endif // MAX_VECTOR_SIZE >= 512 + // Template for exp function, single precision // The limit of abs(x) is defined by max_x below // This function does not produce denormals // Template parameters: // VTYPE: float vector type -// BVTYPE: boolean vector type // M1: 0 for exp, 1 for expm1 // BA: 0 for exp, 1 for 0.5*exp, 2 for pow(2,x), 10 for pow(10,x) -template -static inline VTYPE exp_f(VTYPE const & initial_x) { +template +static inline VTYPE exp_f(VTYPE const initial_x) { // Taylor coefficients const float P0expf = 1.f/2.f; const float P1expf = 1.f/6.f; const float P2expf = 1.f/24.f; - const float P3expf = 1.f/120.f; - const float P4expf = 1.f/720.f; - const float P5expf = 1.f/5040.f; + const float P3expf = 1.f/120.f; + const float P4expf = 1.f/720.f; + const float P5expf = 1.f/5040.f; - VTYPE x, r, x2, z, n2; // data vectors - BVTYPE inrange; // boolean vector + VTYPE x, r, x2, z, n2; // data vectors // maximum abs(x), value depends on BA, defined below // The lower limit of x is slightly more restrictive than the upper limit. // We are specifying the lower limit, except for BA = 1 because it is not used for negative x float max_x; - if (BA <= 1) { // exp(x) + if constexpr (BA <= 1) { // exp(x) const float ln2f_hi = 0.693359375f; const float ln2f_lo = -2.12194440e-4f; max_x = (BA == 0) ? 
87.3f : 89.0f; @@ -398,13 +398,13 @@ static inline VTYPE exp_f(VTYPE const & initial_x) { x = nmul_add(r, VTYPE(ln2f_hi), x); // x -= r * ln2f_hi; x = nmul_add(r, VTYPE(ln2f_lo), x); // x -= r * ln2f_lo; } - else if (BA == 2) { // pow(2,x) + else if constexpr (BA == 2) { // pow(2,x) max_x = 126.f; r = round(initial_x); x = initial_x - r; x = x * (float)VM_LN2; } - else if (BA == 10) { // pow(10,x) + else if constexpr (BA == 10) { // pow(10,x) max_x = 37.9f; const float log10_2_hi = 0.301025391f; // log10(2) in two parts const float log10_2_lo = 4.60503907E-6f; @@ -419,25 +419,28 @@ static inline VTYPE exp_f(VTYPE const & initial_x) { } x2 = x * x; - z = polynomial_5(x,P0expf,P1expf,P2expf,P3expf,P4expf,P5expf); + z = polynomial_5(x,P0expf,P1expf,P2expf,P3expf,P4expf,P5expf); z = mul_add(z, x2, x); // z *= x2; z += x; - if (BA == 1) r--; // 0.5 * exp(x) + if constexpr (BA == 1) r--; // 0.5 * exp(x) - // multiply by power of 2 + // multiply by power of 2 n2 = vm_pow2n(r); - if (M1 == 0) { + if constexpr (M1 == 0) { // exp z = (z + 1.0f) * n2; } else { // expm1 z = mul_add(z, n2, n2 - 1.0f); // z = z * n2 + (n2 - 1.0f); +#ifdef SIGNED_ZERO // pedantic preservation of signed zero + z = select(initial_x == 0.f, initial_x, z); +#endif } // check for overflow - inrange = abs(initial_x) < max_x; + auto inrange = abs(initial_x) < max_x; // boolean vector // check for INF and NAN inrange &= is_finite(initial_x); @@ -447,75 +450,75 @@ static inline VTYPE exp_f(VTYPE const & initial_x) { } else { // overflow, underflow and NAN - r = select(sign_bit(initial_x), 0.f-M1, infinite_vec()); // value in case of +/- overflow or INF - z = select(inrange, z, r); // +/- underflow - z = select(is_nan(initial_x), initial_x, z); // NAN goes through + r = select(sign_bit(initial_x), 0.f-(M1&1), infinite_vec()); // value in case of +/- overflow or INF + z = select(inrange, z, r); // +/- underflow + z = select(is_nan(initial_x), initial_x, z); // NAN goes through return z; } } #if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // forward declarations of fast 512 bit versions -static Vec16f exp(Vec16f const & x); -static Vec16f exp2(Vec16f const & x); -static Vec16f exp10(Vec16f const & x); +static Vec16f exp(Vec16f const x); +static Vec16f exp2(Vec16f const x); +static Vec16f exp10(Vec16f const x); #endif // instances of exp_f template -static inline Vec4f exp(Vec4f const & x) { -#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version +static inline Vec4f exp(Vec4f const x) { +#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version return _mm512_castps512_ps128(exp(Vec16f(_mm512_castps128_ps512(x)))); #else - return exp_f(x); + return exp_f(x); #endif } -static inline Vec4f expm1(Vec4f const & x) { - return exp_f(x); +static inline Vec4f expm1(Vec4f const x) { + return exp_f(x); } -static inline Vec4f exp2(Vec4f const & x) { -#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version +static inline Vec4f exp2(Vec4f const x) { +#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version return _mm512_castps512_ps128(exp2(Vec16f(_mm512_castps128_ps512(x)))); #else - return exp_f(x); + return exp_f(x); #endif } -static inline Vec4f exp10(Vec4f const & x) { -#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version +static inline Vec4f exp10(Vec4f const x) { +#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version return 
_mm512_castps512_ps128(exp10(Vec16f(_mm512_castps128_ps512(x)))); #else - return exp_f(x); + return exp_f(x); #endif } #if MAX_VECTOR_SIZE >= 256 -static inline Vec8f exp(Vec8f const & x) { -#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version +static inline Vec8f exp(Vec8f const x) { +#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version return _mm512_castps512_ps256(exp(Vec16f(_mm512_castps256_ps512(x)))); #else - return exp_f(x); + return exp_f(x); #endif } -static inline Vec8f expm1(Vec8f const & x) { - return exp_f(x); +static inline Vec8f expm1(Vec8f const x) { + return exp_f(x); } -static inline Vec8f exp2(Vec8f const & x) { -#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version +static inline Vec8f exp2(Vec8f const x) { +#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version return _mm512_castps512_ps256(exp2(Vec16f(_mm512_castps256_ps512(x)))); #else - return exp_f(x); + return exp_f(x); #endif } -static inline Vec8f exp10(Vec8f const & x) { -#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version +static inline Vec8f exp10(Vec8f const x) { +#if defined(__AVX512ER__) && MAX_VECTOR_SIZE >= 512 // use faster 512 bit version return _mm512_castps512_ps256(exp10(Vec16f(_mm512_castps256_ps512(x)))); #else - return exp_f(x); + return exp_f(x); #endif } @@ -523,7 +526,7 @@ static inline Vec8f exp10(Vec8f const & x) { #if MAX_VECTOR_SIZE >= 512 -static inline Vec16f exp(Vec16f const & x) { +static inline Vec16f exp(Vec16f const x) { #ifdef __AVX512ER__ // AVX512ER instruction set includes fast exponential function #ifdef VCL_FASTEXP // very fast, but less precise for large x: @@ -535,62 +538,63 @@ static inline Vec16f exp(Vec16f const & x) { const float ln2f_lo = -2.12194440e-4f; Vec16f x1 = x, r, y; r = round(x1*log2e); - x1 = nmul_add(r, Vec16f(ln2f_hi), x1); // x -= r * ln2f_hi; - x1 = nmul_add(r, Vec16f(ln2f_lo), x1); // x -= r * ln2f_lo; + x1 = nmul_add(r, Vec16f(ln2f_hi), x1); // x -= r * ln2f_hi; + x1 = nmul_add(r, Vec16f(ln2f_lo), x1); // x -= r * ln2f_lo; x1 = x1 * log2e; y = _mm512_exp2a23_round_ps(r, _MM_FROUND_NO_EXC); // y = vm_pow2n(r); return y * _mm512_exp2a23_round_ps(x1, _MM_FROUND_NO_EXC); #endif // VCL_FASTEXP #else // no AVX512ER, use above template - return exp_f(x); + return exp_f(x); #endif -} +} -static inline Vec16f expm1(Vec16f const & x) { - return exp_f(x); +static inline Vec16f expm1(Vec16f const x) { + return exp_f(x); } -static inline Vec16f exp2(Vec16f const & x) { +static inline Vec16f exp2(Vec16f const x) { #ifdef __AVX512ER__ return Vec16f(_mm512_exp2a23_round_ps(x, _MM_FROUND_NO_EXC)); #else - return exp_f(x); + return exp_f(x); #endif } -static inline Vec16f exp10(Vec16f const & x) { +static inline Vec16f exp10(Vec16f const x) { #ifdef __AVX512ER__ // AVX512ER instruction set includes fast exponential function #ifdef VCL_FASTEXP // very fast, but less precise for large x: return _mm512_exp2a23_round_ps(x*float(VM_LOG210), _MM_FROUND_NO_EXC); #else // best precision, also for large x: - const float log10_2_hi = 0.301025391f; // log10(2) in two parts + const float log10_2_hi = 0.301025391f; // log10(2) in two parts const float log10_2_lo = 4.60503907E-6f; Vec16f x1 = x, r, y; Vec16f log210 = float(VM_LOG210); r = round(x1*log210); - x1 = nmul_add(r, Vec16f(log10_2_hi), x1); // x -= r * log10_2_hi - x1 = nmul_add(r, Vec16f(log10_2_lo), x1); // x -= r * log10_2_lo + x1 = nmul_add(r, Vec16f(log10_2_hi), x1); // x -= r * 
log10_2_hi + x1 = nmul_add(r, Vec16f(log10_2_lo), x1); // x -= r * log10_2_lo x1 = x1 * log210; // y = vm_pow2n(r); y = _mm512_exp2a23_round_ps(r, _MM_FROUND_NO_EXC); return y * _mm512_exp2a23_round_ps(x1, _MM_FROUND_NO_EXC); #endif // VCL_FASTEXP #else // no AVX512ER, use above template - return exp_f(x); + return exp_f(x); #endif } #endif // MAX_VECTOR_SIZE >= 512 + /****************************************************************************** * Logarithm functions ******************************************************************************/ -// Helper functions: fraction_2(x) = fraction(x)*0.5 +// Helper function: fraction_2(x) = fraction(x)*0.5 // Modified fraction function: // Extract the fraction part of a floating point number, and divide by 2 @@ -598,33 +602,33 @@ static inline Vec16f exp10(Vec16f const & x) { // fraction_2(x) = fraction(x)*0.5 // This version gives half the fraction without extra delay // Does not work for x = 0 -static inline Vec4f fraction_2(Vec4f const & a) { - Vec4ui t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F000000); // set exponent to 0 + bias +static inline Vec4f fraction_2(Vec4f const a) { + Vec4ui t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer + Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F000000); // set exponent to 0 + bias return _mm_castsi128_ps(t2); } -static inline Vec2d fraction_2(Vec2d const & a) { - Vec2uq t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer +static inline Vec2d fraction_2(Vec2d const a) { + Vec2uq t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // set exponent to 0 + bias return _mm_castsi128_pd(t2); } #if MAX_VECTOR_SIZE >= 256 -static inline Vec8f fraction_2(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 2 // 256 bit integer vectors are available, AVX2 - Vec8ui t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer - Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F000000; // set exponent to 0 + bias +static inline Vec8f fraction_2(Vec8f const a) { +#if defined (VECTORI256_H) && VECTORI256_H > 2 // 256 bit integer vectors are available, AVX2 + Vec8ui t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer + Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F000000; // set exponent to 0 + bias return _mm256_castsi256_ps(t2); #else return Vec8f(fraction_2(a.get_low()), fraction_2(a.get_high())); #endif } -static inline Vec4d fraction_2(Vec4d const & a) { +static inline Vec4d fraction_2(Vec4d const a) { #if VECTORI256_H > 1 // AVX2 - Vec4uq t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer + Vec4uq t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FE0000000000000ll); // set exponent to 0 + bias return _mm256_castsi256_pd(t2); #else @@ -636,8 +640,8 @@ static inline Vec4d fraction_2(Vec4d const & a) { #if MAX_VECTOR_SIZE >= 512 -static inline Vec16f fraction_2(Vec16f const & a) { -#if INSTRSET >= 9 // 512 bit integer vectors are available, AVX512 +static inline Vec16f fraction_2(Vec16f const a) { +#if INSTRSET >= 9 // 512 bit integer vectors are available, AVX512 return _mm512_getmant_ps(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); //return Vec16f(_mm512_getmant_ps(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5f; #else @@ -645,8 +649,8 @@ static inline Vec16f fraction_2(Vec16f const & a) { #endif } -static inline Vec8d fraction_2(Vec8d const & a) { -#if INSTRSET >= 9 // 512 bit integer 
vectors are available, AVX512 +static inline Vec8d fraction_2(Vec8d const a) { +#if INSTRSET >= 9 // 512 bit integer vectors are available, AVX512 return _mm512_getmant_pd(a, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_zero); //return Vec8d(_mm512_getmant_pd(a, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero)) * 0.5; #else @@ -657,7 +661,7 @@ static inline Vec8d fraction_2(Vec8d const & a) { #endif // MAX_VECTOR_SIZE >= 512 -// Helper functions: exponent_f(x) = exponent(x) as floating point number +// Helper function: exponent_f(x) = exponent(x) as floating point number union vm_ufi { float f; @@ -670,11 +674,10 @@ union vm_udi { }; // extract exponent of a positive number x as a floating point number -//Note: the AVX512 version return -inf for x=0, the non-AVX versions return a negative number -static inline Vec4f exponent_f(Vec4f const & x) { +static inline Vec4f exponent_f(Vec4f const x) { #ifdef __AVX512VL__ // AVX512VL - // note: this version returns -inf for x=0 - return _mm_getexp_ps(x); + // prevent returning -inf for x=0 + return _mm_maskz_getexp_ps(_mm_cmp_ps_mask(x,Vec4f(0.f),4), x); #else const float pow2_23 = 8388608.0f; // 2^23 const float bias = 127.f; // bias in exponent @@ -688,10 +691,12 @@ static inline Vec4f exponent_f(Vec4f const & x) { #endif } -static inline Vec2d exponent_f(Vec2d const & x) { +static inline Vec2d exponent_f(Vec2d const x) { #ifdef __AVX512VL__ // AVX512VL - // note: this version returns -inf for x=0 - return _mm_getexp_pd(x); + // prevent returning -inf for x=0 + //return _mm_maskz_getexp_pd(x != 0., x); + return _mm_maskz_getexp_pd(_mm_cmp_pd_mask(x,Vec2d(0.),4), x); + #else const double pow2_52 = 4503599627370496.0; // 2^52 const double bias = 1023.0; // bias in exponent @@ -708,10 +713,11 @@ static inline Vec2d exponent_f(Vec2d const & x) { #if MAX_VECTOR_SIZE >= 256 -static inline Vec8f exponent_f(Vec8f const & x) { +static inline Vec8f exponent_f(Vec8f const x) { #ifdef __AVX512VL__ // AVX512VL - // note: this version returns -inf for x=0 - return _mm256_getexp_ps(x); + // prevent returning -inf for x=0 + //return _mm256_maskz_getexp_ps(x != 0.f, x); + return _mm256_maskz_getexp_ps(_mm256_cmp_ps_mask(x,Vec8f(0.f),4), x); #else const float pow2_23 = 8388608.0f; // 2^23 const float bias = 127.f; // bias in exponent @@ -723,17 +729,18 @@ static inline Vec8f exponent_f(Vec8f const & x) { Vec8f e = d - (pow2_23 + bias); // subtract magic number and bias return e; #endif -} +} // extract exponent of a positive number x as a floating point number -static inline Vec4d exponent_f(Vec4d const & x) { +static inline Vec4d exponent_f(Vec4d const x) { #ifdef __AVX512VL__ // AVX512VL - return _mm256_getexp_pd(x); + // prevent returning -inf for x=0 + //return _mm256_maskz_getexp_pd(x != 0., x); + return _mm256_maskz_getexp_pd(_mm256_cmp_pd_mask(x,Vec4d(0.),4), x); #else const double pow2_52 = 4503599627370496.0; // 2^52 const double bias = 1023.0; // bias in exponent const vm_udi upow2_52 = {pow2_52}; - Vec4uq a = reinterpret_i(x); // bit-cast x to integer Vec4uq b = a >> 52; // shift down exponent to low bits Vec4uq c = b | Vec4uq(upow2_52.i); // insert new exponent @@ -747,20 +754,20 @@ static inline Vec4d exponent_f(Vec4d const & x) { #if MAX_VECTOR_SIZE >= 512 -static inline Vec16f exponent_f(Vec16f const & x) { +static inline Vec16f exponent_f(Vec16f const x) { #if INSTRSET >= 9 // AVX512 - // note: this version returns -inf for x=0 - return _mm512_getexp_ps(x); + // prevent returning -inf for x=0 + return _mm512_maskz_getexp_ps(x != 0.f, x); #else return 
Vec16f(exponent_f(x.get_low()), exponent_f(x.get_high())); #endif -} +} // extract exponent of a positive number x as a floating point number -static inline Vec8d exponent_f(Vec8d const & x) { +static inline Vec8d exponent_f(Vec8d const x) { #if INSTRSET >= 9 // AVX512 - // note: this returns -inf for x=0 - return _mm512_getexp_pd(x); + // prevent returning -inf for x=0 + return _mm512_maskz_getexp_pd(uint8_t(x != 0.), x); #else return Vec8d(exponent_f(x.get_low()), exponent_f(x.get_high())); #endif @@ -768,14 +775,176 @@ static inline Vec8d exponent_f(Vec8d const & x) { #endif // MAX_VECTOR_SIZE >= 512 +// Helper function: log_special_cases(x,r). Handle special cases for log function +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d log_special_cases(Vec8d const x1, Vec8d const r) { + Vec8d res = r; +#if INSTRSET >= 10 // AVX512DQ + Vec8db specialcases = _mm512_fpclass_pd_mask(x1, 0x7E);// zero, subnormal, negative, +-inf + if (!horizontal_or(specialcases)) { + return res; // normal path + } + res = _mm512_fixupimm_pd(res, x1, Vec8q(0x03530411),0);// handle most cases + res = select(Vec8db(_mm512_fpclass_pd_mask(x1, 0x26)),-infinite_vec(),res); // subnormal -> -INF + res = select(Vec8db(_mm512_fpclass_pd_mask(x1, 0x50)),nan_vec(NAN_LOG),res); // negative -> specific NAN + return res; +#else + Vec8db overflow = !is_finite(x1); + Vec8db underflow = x1 < VM_SMALLEST_NORMAL; // denormals not supported by this functions + if (!horizontal_or(overflow | underflow)) { + return res; // normal path + } + // overflow and underflow + res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN + res = select(is_zero_or_subnormal(x1), -infinite_vec(), res); // x1 == 0 gives -INF + res = select(overflow, x1, res); // INF or NAN goes through + res = select(is_inf(x1) & sign_bit(x1), nan_vec(NAN_LOG), res);// -INF gives NAN + return res; +#endif // INSTRSET +} + +static inline Vec16f log_special_cases(Vec16f const x1, Vec16f const r) { + Vec16f res = r; +#if INSTRSET >= 10 // AVX512DQ + Vec16fb specialcases = _mm512_fpclass_ps_mask(x1, 0x7E); // zero, subnormal, negative, +-inf + if (!horizontal_or(specialcases)) { + return res; // normal path + } + res = _mm512_fixupimm_ps(res, x1, Vec16i(0x03530411), 0); // handle most cases + res = select(Vec16fb(_mm512_fpclass_ps_mask(x1, 0x26)),-infinite_vec(),res); // subnormal -> -INF + res = select(Vec16fb(_mm512_fpclass_ps_mask(x1, 0x50)),nan_vec(NAN_LOG),res); // negative -> specific NAN + return res; +#else + Vec16fb overflow = !is_finite(x1); + Vec16fb underflow = x1 < VM_SMALLEST_NORMALF;// denormals not supported by this functions + if (!horizontal_or(overflow | underflow)) { + return res; // normal path + } + // overflow and underflow + res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN + res = select(is_zero_or_subnormal(x1), -infinite_vec(), res); // x1 == 0 gives -INF + res = select(overflow, x1, res); // INF or NAN goes through + res = select(is_inf(x1) & sign_bit(x1), nan_vec(NAN_LOG), res);// -INF gives NAN + return res; +#endif // INSTRSET +} + +#endif // MAX_VECTOR_SIZE >= 512 + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d log_special_cases(Vec4d const x1, Vec4d const r) { + Vec4d res = r; +#if INSTRSET >= 10 // AVX512DQ AVX512VL + __mmask8 specialcases = _mm256_fpclass_pd_mask(x1, 0x7E); // zero, subnormal, negative, +-inf + if (specialcases == 0) { + return res; // normal path + } + res = _mm256_fixupimm_pd(res, x1, Vec4q(0x03530411), 0); // handle most cases + res = _mm256_mask_mov_pd(res, _mm256_fpclass_pd_mask(x1, 
0x26), -infinite_vec()); // subnormal -> -INF + res = _mm256_mask_mov_pd(res, _mm256_fpclass_pd_mask(x1, 0x50), nan_vec(NAN_LOG)); // negative -> specific NAN + return res; +#else + Vec4db overflow = !is_finite(x1); + Vec4db underflow = x1 < VM_SMALLEST_NORMAL; // denormals not supported by this functions + if (!horizontal_or(overflow | underflow)) { + return res; // normal path + } + // overflow and underflow + res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN + res = select(is_zero_or_subnormal(x1), -infinite_vec(), res); // x1 == 0 gives -INF + res = select(overflow, x1, res); // INF or NAN goes through + res = select(is_inf(x1) & sign_bit(x1), nan_vec(NAN_LOG), res);// -INF gives NAN + return res; +#endif // INSTRSET +} + +static inline Vec8f log_special_cases(Vec8f const x1, Vec8f const r) { + Vec8f res = r; +#if INSTRSET >= 10 // AVX512DQ AVX512VL + __mmask8 specialcases = _mm256_fpclass_ps_mask(x1, 0x7E); // zero, subnormal, negative, +-inf + if (specialcases == 0) { + return res; // normal path + } + res = _mm256_fixupimm_ps(res, x1, Vec8i(0x03530411), 0); // handle most cases + res = _mm256_mask_mov_ps(res, _mm256_fpclass_ps_mask(x1, 0x26), -infinite_vec()); // subnormal -> -INF + res = _mm256_mask_mov_ps(res, _mm256_fpclass_ps_mask(x1, 0x50), nan_vec(NAN_LOG)); // negative -> specific NAN + return res; +#else + Vec8fb overflow = !is_finite(x1); + Vec8fb underflow = x1 < VM_SMALLEST_NORMALF; // denormals not supported by this functions + if (!horizontal_or(overflow | underflow)) { + return res; // normal path + } + // overflow and underflow + res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN + res = select(is_zero_or_subnormal(x1), -infinite_vec(), res); // x1 == 0 gives -INF + res = select(overflow, x1, res); // INF or NAN goes through + res = select(is_inf(x1) & sign_bit(x1), nan_vec(NAN_LOG), res);// -INF gives NAN + return res; +#endif // INSTRSET +} + +#endif // MAX_VECTOR_SIZE >= 256 + +static inline Vec2d log_special_cases(Vec2d const x1, Vec2d const r) { + Vec2d res = r; +#if INSTRSET >= 10 // AVX512DQ AVX512VL + __mmask8 specialcases = _mm_fpclass_pd_mask(x1, 0x7E); // zero, subnormal, negative, +-inf + if (specialcases == 0) { + return res; // normal path + } + res = _mm_fixupimm_pd(res, x1, Vec2q(0x03530411), 0); // handle most cases + res = _mm_mask_mov_pd(res, _mm_fpclass_pd_mask(x1, 0x26), -infinite_vec()); // subnormal -> -INF + res = _mm_mask_mov_pd(res, _mm_fpclass_pd_mask(x1, 0x50), nan_vec(NAN_LOG)); // negative -> specific NAN + return res; +#else + Vec2db overflow = !is_finite(x1); + Vec2db underflow = x1 < VM_SMALLEST_NORMAL; // denormals not supported by this functions + if (!horizontal_or(overflow | underflow)) { + return res; // normal path + } + // overflow and underflow + res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN + res = select(is_zero_or_subnormal(x1), -infinite_vec(), res); // x1 == 0 gives -INF + res = select(overflow, x1, res); // INF or NAN goes through + res = select(is_inf(x1) & sign_bit(x1), nan_vec(NAN_LOG), res);// -INF gives NAN + return res; +#endif // INSTRSET +} + +static inline Vec4f log_special_cases(Vec4f const x1, Vec4f const r) { + Vec4f res = r; +#if INSTRSET >= 10 // AVX512DQ AVX512VL + __mmask8 specialcases = _mm_fpclass_ps_mask(x1, 0x7E); // zero, subnormal, negative, +-inf + if (specialcases == 0) { + return res; // normal path + } + res = _mm_fixupimm_ps(res, x1, Vec4i(0x03530411), 0); // handle most cases + res = _mm_mask_mov_ps(res, _mm_fpclass_ps_mask(x1, 0x26), 
-infinite_vec()); // subnormal -> -INF + res = _mm_mask_mov_ps(res, _mm_fpclass_ps_mask(x1, 0x50), nan_vec(NAN_LOG)); // negative -> specific NAN + return res; +#else + Vec4fb overflow = !is_finite(x1); + Vec4fb underflow = x1 < VM_SMALLEST_NORMALF; // denormals not supported by this functions + if (!horizontal_or(overflow | underflow)) { + return res; // normal path + } + // overflow and underflow + res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN + res = select(is_zero_or_subnormal(x1), -infinite_vec(), res); // x1 == 0 gives -INF + res = select(overflow, x1, res); // INF or NAN goes through + res = select(is_inf(x1) & sign_bit(x1), nan_vec(NAN_LOG), res);// -INF gives NAN + return res; +#endif // INSTRSET +} + // log function, double precision // template parameters: // VTYPE: f.p. vector type -// BVTYPE: boolean vector type // M1: 0 for log, 1 for log1p -template -static inline VTYPE log_d(VTYPE const & initial_x) { +template +static inline VTYPE log_d(VTYPE const initial_x) { // define constants const double ln2_hi = 0.693359375; @@ -793,24 +962,23 @@ static inline VTYPE log_d(VTYPE const & initial_x) { const double Q4log = 1.12873587189167450590E1; VTYPE x1, x, x2, px, qx, res, fe; // data vectors - BVTYPE blend, overflow, underflow; // boolean vectors - if (M1 == 0) { + if constexpr (M1 == 0) { x1 = initial_x; // log(x) } else { x1 = initial_x + 1.0; // log(x+1) } - // separate mantissa from exponent + // separate mantissa from exponent // VTYPE x = fraction(x1) * 0.5; x = fraction_2(x1); fe = exponent_f(x1); - blend = x > VM_SQRT2*0.5; + auto blend = x > VM_SQRT2*0.5; // boolean vector x = if_add(!blend, x, x); // conditional add fe = if_add(blend, fe, 1.); // conditional add - if (M1 == 0) { + if constexpr (M1 == 0) { // log(x). Expand around 1.0 x -= 1.0; } @@ -819,7 +987,7 @@ static inline VTYPE log_d(VTYPE const & initial_x) { x = select(fe==0., initial_x, x - 1.0); } - // rational form + // rational form px = polynomial_5 (x, P0log, P1log, P2log, P3log, P4log, P5log); x2 = x * x; px *= x * x2; @@ -830,92 +998,77 @@ static inline VTYPE log_d(VTYPE const & initial_x) { res = mul_add(fe, ln2_lo, res); // res += fe * ln2_lo; res += nmul_add(x2, 0.5, x); // res += x - 0.5 * x2; res = mul_add(fe, ln2_hi, res); // res += fe * ln2_hi; - - overflow = !is_finite(x1); - underflow = x1 < VM_SMALLEST_NORMAL; // denormals not supported by this functions - - if (!horizontal_or(overflow | underflow)) { - // normal path - return res; - } - else { - // overflow and underflow - res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN - res = select(x1 == 0. 
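The reduction just above writes x = m * 2^fe with m in (sqrt(2)/2, sqrt(2)] and evaluates log(m) with the rational approximation; log(2) is split into a short high part, so that fe * ln2_hi is exact, and a small low part that restores full precision. A scalar illustration of the same identity, with std::frexp and std::log standing in for the vector fraction/exponent helpers and the polynomial (not the library's evaluation):

#include <cmath>
#include <cstdio>

int main() {
    double x = 123.456;
    int e;
    double m = std::frexp(x, &e);                     // x = m * 2^e, m in [0.5, 1)
    if (m <= 0.70710678118654757) { m *= 2.0; --e; }  // keep m in (sqrt(2)/2, sqrt(2)]
    const double ln2_hi = 0.693359375;                // few mantissa bits, so e * ln2_hi is exact
    const double ln2_lo = std::log(2.0) - ln2_hi;     // small correction term
    double r = std::log(m) + e * ln2_lo + e * ln2_hi; // equals log(x) to double precision
    std::printf("%.17g\n%.17g\n", r, std::log(x));
}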
|| is_subnormal(x1), -infinite_vec(), res); // x1 == 0 gives -INF - res = select(overflow, x1, res); // INF or NAN goes through - res = select(is_inf(x1)&sign_bit(x1), nan_vec(NAN_LOG), res); // -INF gives NAN - return res; - } +#ifdef SIGNED_ZERO // pedantic preservation of signed zero + res = select(initial_x == 0., initial_x, res); +#endif + // handle special cases, or return res + return log_special_cases(x1, res); } -static inline Vec2d log(Vec2d const & x) { - return log_d(x); +static inline Vec2d log(Vec2d const x) { + return log_d(x); } -static inline Vec2d log1p(Vec2d const & x) { - return log_d(x); +static inline Vec2d log1p(Vec2d const x) { + return log_d(x); } -static inline Vec2d log2(Vec2d const & x) { - return VM_LOG2E * log_d(x); +static inline Vec2d log2(Vec2d const x) { + return VM_LOG2E * log_d(x); } -static inline Vec2d log10(Vec2d const & x) { - return VM_LOG10E * log_d(x); +static inline Vec2d log10(Vec2d const x) { + return VM_LOG10E * log_d(x); } #if MAX_VECTOR_SIZE >= 256 -static inline Vec4d log(Vec4d const & x) { - return log_d(x); +static inline Vec4d log(Vec4d const x) { + return log_d(x); } -static inline Vec4d log1p(Vec4d const & x) { - return log_d(x); +static inline Vec4d log1p(Vec4d const x) { + return log_d(x); } -static inline Vec4d log2(Vec4d const & x) { - return VM_LOG2E * log_d(x); +static inline Vec4d log2(Vec4d const x) { + return VM_LOG2E * log_d(x); } -static inline Vec4d log10(Vec4d const & x) { - return VM_LOG10E * log_d(x); +static inline Vec4d log10(Vec4d const x) { + return VM_LOG10E * log_d(x); } #endif // MAX_VECTOR_SIZE >= 256 #if MAX_VECTOR_SIZE >= 512 -static inline Vec8d log(Vec8d const & x) { - return log_d(x); +static inline Vec8d log(Vec8d const x) { + return log_d(x); } -static inline Vec8d log1p(Vec8d const & x) { - return log_d(x); +static inline Vec8d log1p(Vec8d const x) { + return log_d(x); } -static inline Vec8d log2(Vec8d const & x) { - return VM_LOG2E * log_d(x); +static inline Vec8d log2(Vec8d const x) { + return VM_LOG2E * log_d(x); } -static inline Vec8d log10(Vec8d const & x) { - return VM_LOG10E * log_d(x); +static inline Vec8d log10(Vec8d const x) { + return VM_LOG10E * log_d(x); } #endif // MAX_VECTOR_SIZE >= 512 - // log function, single precision // template parameters: // VTYPE: f.p. vector type -// ITYPE: integer vector type with same element size -// BVTYPE: boolean vector type -// BTYPEI: boolean vector type for ITYPE // M1: 0 for log, 1 for log1p -template -static inline VTYPE log_f(VTYPE const & initial_x) { +template +static inline VTYPE log_f(VTYPE const initial_x) { // define constants const float ln2f_hi = 0.693359375f; @@ -931,32 +1084,30 @@ static inline VTYPE log_f(VTYPE const & initial_x) { const float P8logf = 7.0376836292E-2f; VTYPE x1, x, res, x2, fe; // data vectors - ITYPE e; // integer vector - BVTYPE blend, overflow, underflow; // boolean vectors - if (M1 == 0) { + if constexpr (M1 == 0) { x1 = initial_x; // log(x) } else { x1 = initial_x + 1.0f; // log(x+1) } - // separate mantissa from exponent + // separate mantissa from exponent x = fraction_2(x1); - e = exponent(x1); + auto e = exponent(x1); // integer vector - blend = x > float(VM_SQRT2*0.5); + auto blend = x > float(VM_SQRT2*0.5); // boolean vector x = if_add(!blend, x, x); // conditional add - e = if_add(BTYPEI(blend), e, ITYPE(1)); // conditional add + e = if_add(decltype(e>e)(blend), e, decltype(e)(1)); // conditional add fe = to_float(e); - if (M1 == 0) { + if constexpr (M1 == 0) { // log(x). 
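The separate log1p entry points above matter for small arguments: forming 1 + x first rounds away everything below the rounding unit, while log1p keeps it, which is why the M1 = 1 path reuses the original argument when the exponent is zero instead of simply computing log(x + 1). A scalar illustration with the standard library:

#include <cmath>
#include <cstdio>

int main() {
    double x = 1e-17;
    std::printf("log(1 + x) = %.17g\n", std::log(1.0 + x));   // 0: 1 + x has already rounded to 1
    std::printf("log1p(x)   = %.17g\n", std::log1p(x));       // ~1e-17
}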
Expand around 1.0 x -= 1.0f; } else { // log(x+1). Avoid loss of precision when adding 1 and later subtracting 1 if exponent = 0 - x = select(BVTYPE(e==0), initial_x, x - 1.0f); + x = select(decltype(x>x)(e==0), initial_x, x - 1.0f); } // Taylor expansion @@ -968,76 +1119,65 @@ static inline VTYPE log_f(VTYPE const & initial_x) { res = mul_add(fe, ln2f_lo, res); // res += ln2f_lo * fe; res += nmul_add(x2, 0.5f, x); // res += x - 0.5f * x2; res = mul_add(fe, ln2f_hi, res); // res += ln2f_hi * fe; - - overflow = !is_finite(x1); - underflow = x1 < VM_SMALLEST_NORMALF; // denormals not supported by this functions - - if (!horizontal_or(overflow | underflow)) { - // normal path - return res; - } - else { - // overflow and underflow - res = select(underflow, nan_vec(NAN_LOG), res); // x1 < 0 gives NAN - res = select(x1 == 0.f || is_subnormal(x1), -infinite_vec(), res); // x1 == 0 or denormal gives -INF - res = select(overflow, x1, res); // INF or NAN goes through - res = select(is_inf(x1)&sign_bit(x1), nan_vec(NAN_LOG), res); // -INF gives NAN - return res; - } +#ifdef SIGNED_ZERO // pedantic preservation of signed zero + res = select(initial_x == 0.f, initial_x, res); +#endif + // handle special cases, or return res + return log_special_cases(x1, res); } -static inline Vec4f log(Vec4f const & x) { - return log_f(x); +static inline Vec4f log(Vec4f const x) { + return log_f(x); } -static inline Vec4f log1p(Vec4f const & x) { - return log_f(x); +static inline Vec4f log1p(Vec4f const x) { + return log_f(x); } -static inline Vec4f log2(Vec4f const & x) { - return float(VM_LOG2E) * log_f(x); +static inline Vec4f log2(Vec4f const x) { + return float(VM_LOG2E) * log_f(x); } -static inline Vec4f log10(Vec4f const & x) { - return float(VM_LOG10E) * log_f(x); +static inline Vec4f log10(Vec4f const x) { + return float(VM_LOG10E) * log_f(x); } #if MAX_VECTOR_SIZE >= 256 -static inline Vec8f log(Vec8f const & x) { - return log_f(x); +static inline Vec8f log(Vec8f const x) { + return log_f(x); } -static inline Vec8f log1p(Vec8f const & x) { - return log_f(x); +static inline Vec8f log1p(Vec8f const x) { + return log_f(x); } -static inline Vec8f log2(Vec8f const & x) { - return float(VM_LOG2E) * log_f(x); +static inline Vec8f log2(Vec8f const x) { + return float(VM_LOG2E) * log_f(x); } -static inline Vec8f log10(Vec8f const & x) { - return float(VM_LOG10E) * log_f(x); +static inline Vec8f log10(Vec8f const x) { + return float(VM_LOG10E) * log_f(x); } #endif // MAX_VECTOR_SIZE >= 256 #if MAX_VECTOR_SIZE >= 512 -static inline Vec16f log(Vec16f const & x) { - return log_f(x); +static inline Vec16f log(Vec16f const x) { + return log_f(x); } -static inline Vec16f log1p(Vec16f const & x) { - return log_f(x); +static inline Vec16f log1p(Vec16f const x) { + return log_f(x); } -static inline Vec16f log2(Vec16f const & x) { - return float(VM_LOG2E) * log_f(x); +static inline Vec16f log2(Vec16f const x) { + return float(VM_LOG2E) * log_f(x); } -static inline Vec16f log10(Vec16f const & x) { - return float(VM_LOG10E) * log_f(x); +static inline Vec16f log10(Vec16f const x) { + return float(VM_LOG10E) * log_f(x); } #endif // MAX_VECTOR_SIZE >= 512 @@ -1050,20 +1190,22 @@ static inline Vec16f log10(Vec16f const & x) { // cube root template, double precision // template parameters: // VTYPE: f.p. 
vector type -// ITYPE: uint32_t integer vector type with same total number of bits -// ITYPE2: uint64_t integer vector type with same total number of bits -// BVTYPE: boolean vector type // CR: -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared -template -static inline VTYPE cbrt_d(VTYPE const & x) { +template +static inline VTYPE cbrt_d(VTYPE const x) { const int iter = 7; // iteration count of x^(-1/3) loop int i; - VTYPE xa, xa3, a, a2; - ITYPE m1, m2; + typedef decltype(x < x) BVTYPE; // boolean vector type + typedef decltype(roundi(x)) ITYPE64; // 64 bit integer vector type + typedef decltype(roundi(compress(x,x))) ITYPE32; // 32 bit integer vector type + + ITYPE32 m1, m2; BVTYPE underflow; - ITYPE2 q1(0x5540000000000000ULL); // exponent bias - ITYPE2 q2(0x0005555500000000ULL); // exponent multiplier for 1/3 - ITYPE2 q3(0x0010000000000000ULL); // denormal limit + ITYPE64 q1(0x5540000000000000ULL); // exponent bias + ITYPE64 q2(0x0005555500000000ULL); // exponent multiplier for 1/3 + ITYPE64 q3(0x0010000000000000ULL); // denormal limit + + VTYPE xa, xa3, a, a2; const double one_third = 1./3.; const double four_third = 4./3.; @@ -1072,36 +1214,36 @@ static inline VTYPE cbrt_d(VTYPE const & x) { // multiply exponent by -1/3 m1 = reinterpret_i(xa); - m2 = ITYPE(q1) - (m1 >> 20) * ITYPE(q2); + m2 = ITYPE32(q1) - (m1 >> 20) * ITYPE32(q2); a = reinterpret_d(m2); - underflow = BVTYPE(ITYPE2(m1) < q3); // true if denormal or zero + underflow = BVTYPE(ITYPE64(m1) <= q3); // true if denormal or zero - // Newton Raphson iteration + // Newton Raphson iteration. Warning: may overflow! for (i = 0; i < iter-1; i++) { a2 = a * a; a = nmul_add(xa3, a2*a2, four_third*a); // a = four_third*a - xa3*a2*a2; } // last iteration with better precision - a2 = a * a; + a2 = a * a; a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); // a = a + one_third*(a - xa*a2*a2); - if (CR == -1) { // reciprocal cube root - // (note: gives wrong sign when input is INF) - // generate INF if underflow - a = select(underflow, infinite_vec(), a); - // get sign - a = sign_combine(a, x); + if constexpr (CR == -1) { // reciprocal cube root + a = select(underflow, infinite_vec(), a); // generate INF if underflow + a = select(is_inf(x), VTYPE(0), a); // special case for INF // get sign + a = sign_combine(a, x); // get sign } - else if (CR == 1) { // cube root + else if constexpr (CR == 1) { // cube root a = a * a * x; - // generate 0 if underflow - a = select(underflow, 0., a); + a = select(underflow, 0., a); // generate 0 if underflow + a = select(is_inf(x), x, a); // special case for INF +#ifdef SIGNED_ZERO + a = a | (x & VTYPE(-0.0)); // get sign of x +#endif } - else if (CR == 2) { // cube root squared - // (note: gives wrong sign when input is INF) + else if constexpr (CR == 2) { // cube root squared a = a * xa; - // generate 0 if underflow - a = select(underflow, 0., a); + a = select(underflow, 0., a); // generate 0 if underflow + a = select(is_inf(x), xa, a); // special case for INF } return a; } @@ -1109,48 +1251,48 @@ static inline VTYPE cbrt_d(VTYPE const & x) { // template instances for cbrt and reciprocal_cbrt // cube root -static inline Vec2d cbrt(Vec2d const & x) { - return cbrt_d (x); +static inline Vec2d cbrt(Vec2d const x) { + return cbrt_d (x); } // reciprocal cube root -static inline Vec2d reciprocal_cbrt(Vec2d const & x) { - return cbrt_d (x); +static inline Vec2d reciprocal_cbrt(Vec2d const x) { + return cbrt_d (x); } // square cube root -static inline Vec2d square_cbrt(Vec2d const & x) { - 
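The loop above is the classical Newton iteration for the reciprocal cube root, a -> (4/3)a - (x/3)a^4, which converges quadratically to x^(-1/3); the cube root is then recovered as a*a*x and the squared cube root as a*|x|. A scalar sketch, assuming xa3 denotes x/3 as in the Cephes-style formulation (the library builds its start value from the exponent bits, a rough constant is enough here):

#include <cmath>
#include <cstdio>

int main() {
    double x = 27.0;
    double a = 0.3;                                                  // rough guess for x^(-1/3)
    for (int i = 0; i < 7; ++i)
        a = (4.0 / 3.0) * a - (x / 3.0) * (a * a) * (a * a);         // Newton step towards x^(-1/3)
    std::printf("recip cbrt: %.17g (exact %.17g)\n", a, 1.0 / std::cbrt(x));
    std::printf("cbrt:       %.17g (exact %.17g)\n", a * a * x, std::cbrt(x));
}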
return cbrt_d (x); +static inline Vec2d square_cbrt(Vec2d const x) { + return cbrt_d (x); } #if MAX_VECTOR_SIZE >= 256 -static inline Vec4d cbrt(Vec4d const & x) { - return cbrt_d (x); +static inline Vec4d cbrt(Vec4d const x) { + return cbrt_d (x); } -static inline Vec4d reciprocal_cbrt(Vec4d const & x) { - return cbrt_d (x); +static inline Vec4d reciprocal_cbrt(Vec4d const x) { + return cbrt_d (x); } -static inline Vec4d square_cbrt(Vec4d const & x) { - return cbrt_d (x); +static inline Vec4d square_cbrt(Vec4d const x) { + return cbrt_d (x); } #endif // MAX_VECTOR_SIZE >= 256 #if MAX_VECTOR_SIZE >= 512 -static inline Vec8d cbrt(Vec8d const & x) { - return cbrt_d (x); +static inline Vec8d cbrt(Vec8d const x) { + return cbrt_d (x); } -static inline Vec8d reciprocal_cbrt(Vec8d const & x) { - return cbrt_d (x); +static inline Vec8d reciprocal_cbrt(Vec8d const x) { + return cbrt_d (x); } -static inline Vec8d square_cbrt(Vec8d const & x) { - return cbrt_d (x); +static inline Vec8d square_cbrt(Vec8d const x) { + return cbrt_d (x); } #endif // MAX_VECTOR_SIZE >= 512 @@ -1159,14 +1301,16 @@ static inline Vec8d square_cbrt(Vec8d const & x) { // cube root template, single precision // template parameters: // VTYPE: f.p. vector type -// ITYPE: uint32_t integer vector type -// BVTYPE: boolean vector type // CR: -1 for reciprocal cube root, 1 for cube root, 2 for cube root squared -template -static inline VTYPE cbrt_f(VTYPE const & x) { +template +static inline VTYPE cbrt_f(VTYPE const x) { - const int iter = 6; // iteration count of x^(-1/3) loop + const int iter = 4; // iteration count of x^(-1/3) loop int i; + + typedef decltype(roundi(x)) ITYPE; // integer vector type + typedef decltype(x < x) BVTYPE; // boolean vector type + VTYPE xa, xa3, a, a2; ITYPE m1, m2; BVTYPE underflow; @@ -1184,32 +1328,35 @@ static inline VTYPE cbrt_f(VTYPE const & x) { m2 = q1 - (m1 >> 23) * q2; a = reinterpret_f(m2); - underflow = BVTYPE(m1 < q3); // true if denormal or zero + underflow = BVTYPE(m1 <= q3); // true if denormal or zero // Newton Raphson iteration for (i = 0; i < iter-1; i++) { - a2 = a*a; + a2 = a*a; a = nmul_add(xa3, a2*a2, four_third*a); // a = four_third*a - xa3*a2*a2; } // last iteration with better precision - a2 = a*a; + a2 = a*a; a = mul_add(one_third, nmul_add(xa, a2*a2, a), a); //a = a + one_third*(a - xa*a2*a2); - if (CR == -1) { // reciprocal cube root + if constexpr (CR == -1) { // reciprocal cube root // generate INF if underflow a = select(underflow, infinite_vec(), a); - // get sign + a = select(is_inf(x), VTYPE(0), a); // special case for INF // get sign a = sign_combine(a, x); } - else if (CR == 1) { // cube root + else if constexpr (CR == 1) { // cube root a = a * a * x; - // generate 0 if underflow - a = select(underflow, 0., a); + a = select(underflow, 0.f, a); // generate 0 if underflow + a = select(is_inf(x), x, a); // special case for INF +#ifdef SIGNED_ZERO + a = a | (x & VTYPE(-0.0f)); // get sign of x +#endif } - else if (CR == 2) { // cube root squared - a = a * xa; - // generate 0 if underflow - a = select(underflow, 0., a); + else if constexpr (CR == 2) { // cube root squared + a = a * xa; // abs only to fix -INF + a = select(underflow, 0., a); // generate 0 if underflow + a = select(is_inf(x), xa, a); // special case for INF } return a; } @@ -1217,77 +1364,119 @@ static inline VTYPE cbrt_f(VTYPE const & x) { // template instances for cbrt and reciprocal_cbrt // cube root -static inline Vec4f cbrt(Vec4f const & x) { - return cbrt_f (x); +static inline Vec4f cbrt(Vec4f const 
x) { + return cbrt_f (x); } // reciprocal cube root -static inline Vec4f reciprocal_cbrt(Vec4f const & x) { - return cbrt_f (x); +static inline Vec4f reciprocal_cbrt(Vec4f const x) { + return cbrt_f (x); } // square cube root -static inline Vec4f square_cbrt(Vec4f const & x) { - return cbrt_f (x); +static inline Vec4f square_cbrt(Vec4f const x) { + return cbrt_f (x); } #if MAX_VECTOR_SIZE >= 256 -static inline Vec8f cbrt(Vec8f const & x) { - return cbrt_f (x); +static inline Vec8f cbrt(Vec8f const x) { + return cbrt_f (x); } -static inline Vec8f reciprocal_cbrt(Vec8f const & x) { - return cbrt_f (x); +static inline Vec8f reciprocal_cbrt(Vec8f const x) { + return cbrt_f (x); } -static inline Vec8f square_cbrt(Vec8f const & x) { - return cbrt_f (x); +static inline Vec8f square_cbrt(Vec8f const x) { + return cbrt_f (x); } #endif // MAX_VECTOR_SIZE >= 256 #if MAX_VECTOR_SIZE >= 512 -static inline Vec16f cbrt(Vec16f const & x) { - return cbrt_f (x); +static inline Vec16f cbrt(Vec16f const x) { + return cbrt_f (x); } -static inline Vec16f reciprocal_cbrt(Vec16f const & x) { - return cbrt_f (x); +static inline Vec16f reciprocal_cbrt(Vec16f const x) { + return cbrt_f (x); } -static inline Vec16f square_cbrt(Vec16f const & x) { - return cbrt_f (x); +static inline Vec16f square_cbrt(Vec16f const x) { + return cbrt_f (x); } +#endif // MAX_VECTOR_SIZE >= 512 + + + +/* **************************************************************************** + pow functions +******************************************************************************* +Note about standard conformance: +This implementation of a pow function differs from the IEEE 754-2008 floating +point standard regarding nan propagation. +The standard has pow(nan,0) = 1, and pow(1,nan) = 1, probably for historic reasons. +The present implementation is guaranteed to always propagate nan's for reasons +explained in this report: +Agner Fog: "NAN propagation versus fault trapping in floating point code", 2019, +https://www.agner.org/optimize/nan_propagation.pdf + +The standard defines another function, powr, which propagates NAN's, but powr +will be less useful to programmers because it does not allow integer powers of +negative x. 
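To see the difference in practice: a host libm that follows the C standard returns 1 in both of the cases mentioned, while the vector pow below returns NaN (those branches compute x + y, so the NaN payload survives). Illustration, assuming a conforming std::pow:

#include <cmath>
#include <cstdio>

int main() {
    double n = std::nan("");
    std::printf("std::pow(NaN, 0) = %g\n", std::pow(n, 0.0));   // 1 on a conforming libm
    std::printf("std::pow(1, NaN) = %g\n", std::pow(1.0, n));   // 1 on a conforming libm
}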
+ +******************************************************************************/ + +// Helper functions: + +#if MAX_VECTOR_SIZE >= 512 + // Helper function for power function: insert special values of pow(x,y) when x=0: // y<0 -> inf, y=0 -> 1, y>0 -> 0, y=nan -> nan -static inline Vec8d wm_pow_case_x0(Vec8db const & xiszero, Vec8d const & y, Vec8d const & z) { +static inline Vec8d wm_pow_case_x0(Vec8db const xiszero, Vec8d const y, Vec8d const z) { #if INSTRSET >= 9 const __m512i table = Vec8q(0x85858A00); - return _mm512_mask_fixupimm_pd(z, xiszero, y, table, 0); + return _mm512_mask_fixupimm_pd(z, uint8_t(xiszero), y, table, 0); #else return select(xiszero, select(y < 0., infinite_vec(), select(y == 0., Vec8d(1.), Vec8d(0.))), z); #endif } -#endif // MAX_VECTOR_SIZE >= 512 +// Helper function for power function: insert special values of pow(x,y) when x=0: +// y<0 -> inf, y=0 -> 1, y>0 -> 0, y=nan -> nan +static inline Vec16f wm_pow_case_x0(Vec16fb const xiszero, Vec16f const y, Vec16f const z) { +#if INSTRSET >= 9 + const __m512i table = Vec16ui(0x85858A00); + return _mm512_mask_fixupimm_ps(z, xiszero, y, table, 0); +#else + return select(xiszero, select(y < 0.f, infinite_vec(), select(y == 0.f, Vec16f(1.f), Vec16f(0.f))), z); +#endif +} + +#endif #if MAX_VECTOR_SIZE >= 256 -static inline Vec4d wm_pow_case_x0(Vec4db const & xiszero, Vec4d const & y, Vec4d const & z) { -//#if defined __AVX512VL__ && defined ? -// const __m256i table = Vec4q(0x85858A00); -// return _mm256_mask_fixupimm_pd(z, xiszero, y, table, 0); +static inline Vec4d wm_pow_case_x0(Vec4db const xiszero, Vec4d const y, Vec4d const z) { +//#if INSTRSET >= 10 + //const __m256i table = Vec4q(0x85858A00); + //return _mm256_mask_fixupimm_pd(z, xiszero, y, table, 0); //#else return select(xiszero, select(y < 0., infinite_vec(), select(y == 0., Vec4d(1.), Vec4d(0.))), z); //#endif } + +static inline Vec8f wm_pow_case_x0(Vec8fb const xiszero, Vec8f const y, Vec8f const z) { + return select(xiszero, select(y < 0.f, infinite_vec(), select(y == 0.f, Vec8f(1.f), Vec8f(0.f))), z); +} + #endif -static inline Vec2d wm_pow_case_x0(Vec2db const & xiszero, Vec2d const & y, Vec2d const & z) { -//#if defined __AVX512VL__ && defined ? +static inline Vec2d wm_pow_case_x0(Vec2db const xiszero, Vec2d const y, Vec2d const z) { +//#if INSTRSET >= 10 // const __m128i table = Vec2q(0x85858A00); // return _mm_mask_fixupimm_pd(z, xiszero, y, table, 0); //#else @@ -1295,33 +1484,35 @@ static inline Vec2d wm_pow_case_x0(Vec2db const & xiszero, Vec2d const & y, Vec2 //#endif } +static inline Vec4f wm_pow_case_x0(Vec4fb const xiszero, Vec4f const y, Vec4f const z) { + return select(xiszero, select(y < 0.f, infinite_vec(), select(y == 0.f, Vec4f(1.f), Vec4f(0.f))), z); +} + + // **************************************************************************** // pow template, double precision // **************************************************************************** // Calculate x to the power of y. // Precision is important here because rounding errors get multiplied by y. -// The logarithm is calculated with extra precision, and the exponent is +// The logarithm is calculated with extra precision, and the exponent is // calculated separately. -// The logarithm is calculated by Pad\E9 approximation with 6'th degree +// The logarithm is calculated by Pade approximation with 6'th degree // polynomials. A 7'th degree would be preferred for best precision by high y. 
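The precision remark above is easy to demonstrate: a relative error eps in log(x) is scaled by y before the exponential, so the result is off by roughly a factor exp(y*eps). A deliberately crude scalar illustration (single-precision log feeding a large exponent; this is exactly what the hi/lo splitting in the template avoids):

#include <cmath>
#include <cstdio>

int main() {
    double x = 1.0000001, y = 1.0e6;
    float  lg = std::log(static_cast<float>(x));             // low-precision logarithm
    double crude = std::exp(static_cast<double>(lg) * y);    // its error is multiplied by y
    std::printf("crude:    %.17g\nstd::pow: %.17g\n", crude, std::pow(x, y));
}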
// The alternative method: log(x) = z + z^3*R(z)/S(z), where z = 2(x-1)/(x+1) // did not give better precision. // Template parameters: // VTYPE: data vector type -// ITYPE: signed integer vector type -// BVTYPE: boolean vector type -template -static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) { +template +static inline VTYPE pow_template_d(VTYPE const x0, VTYPE const y) { // define constants const double ln2d_hi = 0.693145751953125; // log(2) in extra precision, high bits const double ln2d_lo = 1.42860682030941723212E-6; // low bits of log(2) const double log2e = VM_LOG2E; // 1/log(2) - const double pow2_52 = 4503599627370496.0; // 2^52 - // coefficients for Pad\E9 polynomials + // coefficients for Pade polynomials const double P0logl = 2.0039553499201281259648E1; const double P1logl = 5.7112963590585538103336E1; const double P2logl = 6.0949667980987787057556E1; @@ -1340,32 +1531,37 @@ static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) { const double p2 = 1./2.; const double p3 = 1./6.; const double p4 = 1./24.; - const double p5 = 1./120.; - const double p6 = 1./720.; - const double p7 = 1./5040.; - const double p8 = 1./40320.; - const double p9 = 1./362880.; - const double p10 = 1./3628800.; - const double p11 = 1./39916800.; - const double p12 = 1./479001600.; - const double p13 = 1./6227020800.; + const double p5 = 1./120.; + const double p6 = 1./720.; + const double p7 = 1./5040.; + const double p8 = 1./40320.; + const double p9 = 1./362880.; + const double p10 = 1./3628800.; + const double p11 = 1./39916800.; + const double p12 = 1./479001600.; + const double p13 = 1./6227020800.; + + typedef decltype(roundi(x0)) ITYPE; // integer vector type + typedef decltype(x0 < x0) BVTYPE; // boolean vector type // data vectors - VTYPE x, x1, x2; - VTYPE px, qx, ef, yr, v, z, z1; + VTYPE x, x1, x2; // x variable + VTYPE px, qx, ef, yr, v; // calculation of logarithm VTYPE lg, lg1, lg2; VTYPE lgerr, x2err; - VTYPE e1, e2, e3, ee; + VTYPE e1, e2, ee; + VTYPE e3, z, z1; // calculation of exp and pow + VTYPE yodd(0); // has sign bit set if y is an odd integer // integer vectors - ITYPE ei, ej, yodd; + ITYPE ei, ej; // boolean vectors - BVTYPE blend, xzero, xnegative; - BVTYPE overflow, underflow, xfinite, yfinite, efinite; + BVTYPE blend, xzero, xsign; // x conditions + BVTYPE overflow, underflow, xfinite, yfinite, efinite; // error conditions // remove sign x1 = abs(x0); - // Separate mantissa from exponent + // Separate mantissa from exponent // This gives the mantissa * 0.5 x = fraction_2(x1); @@ -1381,7 +1577,7 @@ static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) { px *= x * x2; qx = polynomial_6n (x, Q0logl, Q1logl, Q2logl, Q3logl, Q4logl, Q5logl); lg1 = px / qx; - + // extract exponent ef = exponent_f(x1); ef = if_add(blend, ef, 1.); // conditional add @@ -1424,7 +1620,8 @@ static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) { // contributions to exponent ee = e1 + e2 + e3; - ei = round_to_int64_limited(ee); + //ei = round_to_int64_limited(ee); + ei = roundi(ee); // biased exponent of result: ej = ei + (ITYPE(reinterpret_i(z)) >> 52); // check exponent for overflow and underflow @@ -1439,7 +1636,7 @@ static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) { yfinite = is_finite(y); efinite = is_finite(ee); xzero = is_zero_or_subnormal(x0); - xnegative = x0 < 0.; + xsign = sign_bit(x0); // sign of x0. include -0. 
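The xsign test feeds the negative-base handling further down: a negative (or negative-zero) base is only meaningful for an integer exponent, and the sign of the result comes from the parity of y. The reference behaviour the template reproduces, shown with the scalar standard library under the usual IEEE-754 semantics:

#include <cmath>
#include <cstdio>

int main() {
    std::printf("pow(-2, 3)   = %g\n", std::pow(-2.0, 3.0));   // -8: y odd, result negative
    std::printf("pow(-2, 4)   = %g\n", std::pow(-2.0, 4.0));   // 16: y even, result positive
    std::printf("pow(-2, 0.5) = %g\n", std::pow(-2.0, 0.5));   // NaN: y is not an integer
}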
// check for overflow and underflow if (horizontal_or(overflow | underflow)) { @@ -1452,13 +1649,17 @@ static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) { z = wm_pow_case_x0(xzero, y, z); //z = select(xzero, select(y < 0., infinite_vec(), select(y == 0., VTYPE(1.), VTYPE(0.))), z); - // check for x < 0. y must be integer - if (horizontal_or(xnegative)) { - // test if y odd - yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63; // convert y to integer and shift bit 0 to position of sign bit - z1 = z | (x0 & VTYPE(reinterpret_d(yodd))); // apply sign if y odd - z1 = select(y == round(y), z1, nan_vec(NAN_POW)); // NAN if y not integer - z = select(xnegative, z1, z); + // check for sign of x (include -0.). y must be integer + if (horizontal_or(xsign)) { + // test if y is an integer + BVTYPE yinteger = y == round(y); + // test if y is odd: convert to int and shift bit 0 into position of sign bit. + // this will be 0 if overflow + yodd = reinterpret_d(roundi(y) << 63); + z1 = select(yinteger, z | yodd, // y is integer. get sign if y is odd + select(x0 == 0., z, nan_vec(NAN_POW))); // NAN unless x0 == -0. + yodd = select(yinteger, yodd, 0.); // yodd used below. only if y is integer + z = select(xsign, z1, z); } // check for range errors @@ -1467,48 +1668,58 @@ static inline VTYPE pow_template_d(VTYPE const & x0, VTYPE const & y) { return z; } - // handle special error cases - z = select(yfinite & efinite, z, select(x1 == 1., VTYPE(1.), select((x1 > 1.) ^ sign_bit(y), infinite_vec(), 0.))); - yodd = ITYPE(reinterpret_i(abs(y) + pow2_52)) << 63; // same as above - z = select(xfinite, z, select(y == 0., VTYPE(1.), select(y < 0., VTYPE(0.), infinite_vec() | ( VTYPE(reinterpret_d(yodd)) & x0)))); - z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z)); - return z; + // handle special error cases: y infinite + z1 = select(yfinite & efinite, z, + select(x1 == 1., VTYPE(1.), + select((x1 > 1.) 
^ sign_bit(y), infinite_vec(), 0.))); + + // handle x infinite + z1 = select(xfinite, z1, + select(y == 0., VTYPE(1.), + select(y < 0., yodd & z, // 0.0 with the sign of z from above + abs(x0) | (x0 & yodd)))); // get sign of x0 only if y is odd integer + + // Always propagate nan: + // Deliberately differing from the IEEE-754 standard which has pow(0,nan)=1, and pow(1,nan)=1 + z1 = select(is_nan(x0)|is_nan(y), x0+y, z1); + + return z1; } //This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined: -//template static Vec2d pow(Vec2d const & a, TT n); +//template static Vec2d pow(Vec2d const a, TT n); // instantiations of pow_template_d: template <> -inline Vec2d pow(Vec2d const & x, Vec2d const & y) { - return pow_template_d(x, y); +inline Vec2d pow(Vec2d const x, Vec2d const y) { + return pow_template_d(x, y); } template <> -inline Vec2d pow(Vec2d const & x, double const & y) { - return pow_template_d(x, y); +inline Vec2d pow(Vec2d const x, double const y) { + return pow_template_d(x, y); } template <> -inline Vec2d pow(Vec2d const & x, float const & y) { - return pow_template_d(x, (double)y); +inline Vec2d pow(Vec2d const x, float const y) { + return pow_template_d(x, (double)y); } #if MAX_VECTOR_SIZE >= 256 template <> -inline Vec4d pow(Vec4d const & x, Vec4d const & y) { - return pow_template_d(x, y); +inline Vec4d pow(Vec4d const x, Vec4d const y) { + return pow_template_d(x, y); } template <> -inline Vec4d pow(Vec4d const & x, double const & y) { - return pow_template_d(x, y); +inline Vec4d pow(Vec4d const x, double const y) { + return pow_template_d(x, y); } template <> -inline Vec4d pow(Vec4d const & x, float const & y) { - return pow_template_d(x, (double)y); +inline Vec4d pow(Vec4d const x, float const y) { + return pow_template_d(x, (double)y); } #endif // MAX_VECTOR_SIZE >= 256 @@ -1516,42 +1727,22 @@ inline Vec4d pow(Vec4d const & x, float const & y) { #if MAX_VECTOR_SIZE >= 512 template <> -inline Vec8d pow(Vec8d const & x, Vec8d const & y) { - return pow_template_d(x, y); +inline Vec8d pow(Vec8d const x, Vec8d const y) { + return pow_template_d(x, y); } template <> -inline Vec8d pow(Vec8d const & x, double const & y) { - return pow_template_d(x, y); +inline Vec8d pow(Vec8d const x, double const y) { + return pow_template_d(x, y); } template <> -inline Vec8d pow(Vec8d const & x, float const & y) { - return pow_template_d(x, (double)y); -} - -// Helper function for power function: insert special values of pow(x,y) when x=0: -// y<0 -> inf, y=0 -> 1, y>0 -> 0, y=nan -> nan -static inline Vec16f wm_pow_case_x0(Vec16fb const & xiszero, Vec16f const & y, Vec16f const & z) { -#if INSTRSET >= 9 - const __m512i table = Vec16ui(0x85858A00); - return _mm512_mask_fixupimm_ps(z, xiszero, y, table, 0); -#else - return select(xiszero, select(y < 0.f, infinite_vec(), select(y == 0.f, Vec16f(1.f), Vec16f(0.f))), z); -#endif +inline Vec8d pow(Vec8d const x, float const y) { + return pow_template_d(x, (double)y); } #endif // MAX_VECTOR_SIZE >= 512 -#if MAX_VECTOR_SIZE >= 256 -static inline Vec8f wm_pow_case_x0(Vec8fb const & xiszero, Vec8f const & y, Vec8f const & z) { - return select(xiszero, select(y < 0.f, infinite_vec(), select(y == 0.f, Vec8f(1.f), Vec8f(0.f))), z); -} -#endif - -static inline Vec4f wm_pow_case_x0(Vec4fb const & xiszero, Vec4f const & y, Vec4f const & z) { - return select(xiszero, select(y < 0.f, infinite_vec(), select(y == 0.f, Vec4f(1.f), Vec4f(0.f))), z); -} // 
**************************************************************************** // pow template, single precision @@ -1559,20 +1750,16 @@ static inline Vec4f wm_pow_case_x0(Vec4fb const & xiszero, Vec4f const & y, Vec4 // Template parameters: // VTYPE: data vector type -// ITYPE: signed integer vector type -// BVTYPE: boolean vector type // Calculate x to the power of y -template -static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) { +template +static inline VTYPE pow_template_f(VTYPE const x0, VTYPE const y) { // define constants - const float ln2f_hi = 0.693359375f; + const float ln2f_hi = 0.693359375f; // log(2), split in two for extended precision const float ln2f_lo = -2.12194440e-4f; - //const float max_expf = 87.3f; const float log2e = float(VM_LOG2E); // 1/log(2) - const float pow2_23 = 8388608.0f; // 2^23 - const float P0logf = 3.3333331174E-1f; + const float P0logf = 3.3333331174E-1f; // coefficients for logarithm expansion const float P1logf = -2.4999993993E-1f; const float P2logf = 2.0000714765E-1f; const float P3logf = -1.6668057665E-1f; @@ -1582,30 +1769,33 @@ static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) { const float P7logf = -1.1514610310E-1f; const float P8logf = 7.0376836292E-2f; - // Taylor coefficients for exp function, 1/n! - const float p2expf = 1.f/2.f; + const float p2expf = 1.f/2.f; // coefficients for Taylor expansion of exp const float p3expf = 1.f/6.f; const float p4expf = 1.f/24.f; - const float p5expf = 1.f/120.f; - const float p6expf = 1.f/720.f; - const float p7expf = 1.f/5040.f; + const float p5expf = 1.f/120.f; + const float p6expf = 1.f/720.f; + const float p7expf = 1.f/5040.f; + + typedef decltype(roundi(x0)) ITYPE; // integer vector type + typedef decltype(x0 < x0) BVTYPE; // boolean vector type // data vectors - VTYPE x, x1, x2; - VTYPE ef, yr, v, z, z1; - VTYPE lg, lg1; - VTYPE lgerr, x2err; - VTYPE e1, e2, e3, ee; + VTYPE x, x1, x2; // x variable + VTYPE ef, e1, e2, e3, ee; // exponent + VTYPE yr; // remainder + VTYPE lg, lg1, lgerr, x2err, v; // logarithm + VTYPE z, z1; // pow(x,y) + VTYPE yodd(0); // has sign bit set if y is an odd integer // integer vectors - ITYPE ei, ej, yodd; + ITYPE ei, ej; // exponent // boolean vectors - BVTYPE blend, xzero, xnegative; - BVTYPE overflow, underflow, xfinite, yfinite, efinite; + BVTYPE blend, xzero, xsign; // x conditions + BVTYPE overflow, underflow, xfinite, yfinite, efinite; // error conditions // remove sign x1 = abs(x0); - // Separate mantissa from exponent + // Separate mantissa from exponent // This gives the mantissa * 0.5 x = fraction_2(x1); @@ -1617,8 +1807,8 @@ static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) { x -= 1.0f; x2 = x * x; lg1 = polynomial_8(x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf); - lg1 *= x2 * x; - + lg1 *= x2 * x; + // extract exponent ef = exponent_f(x1); ef = if_add(blend, ef, 1.0f); // conditional add @@ -1660,7 +1850,7 @@ static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) { // contributions to exponent ee = e1 + e2 + e3; - ei = round_to_int(ee); + ei = roundi(ee); // biased exponent of result: ej = ei + (ITYPE(reinterpret_i(z)) >> 23); // check exponent for overflow and underflow @@ -1676,7 +1866,7 @@ static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) { efinite = is_finite(ee); xzero = is_zero_or_subnormal(x0); - xnegative = x0 < 0.f; + xsign = sign_bit(x0); // x is negative or -0. 
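ln2f_hi above has only a handful of significant bits, so multiplying it by the integer exponent is exact, and ln2f_lo restores the remaining digits of log(2); the double-precision template uses the same scheme. A quick check of the split:

#include <cmath>
#include <cstdio>

int main() {
    const float hi = 0.693359375f;        // exactly representable with 10 mantissa bits
    const float lo = -2.12194440e-4f;
    std::printf("hi + lo = %.9g\n", static_cast<double>(hi) + static_cast<double>(lo));
    std::printf("log(2)  = %.9g\n", std::log(2.0));
}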
// check for overflow and underflow if (horizontal_or(overflow | underflow)) { @@ -1689,61 +1879,73 @@ static inline VTYPE pow_template_f(VTYPE const & x0, VTYPE const & y) { z = wm_pow_case_x0(xzero, y, z); //z = select(xzero, select(y < 0.f, infinite_vec(), select(y == 0.f, VTYPE(1.f), VTYPE(0.f))), z); - // check for x < 0. y must be integer - if (horizontal_or(xnegative)) { - // test if y odd - yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31; // convert y to integer and shift bit 0 to position of sign bit - z1 = z | (x0 & VTYPE(reinterpret_f(yodd))); // apply sign if y odd - z1 = select(y == round(y), z1, nan_vec(NAN_POW)); // NAN if y not integer - z = select(xnegative, z1, z); + // check for sign of x (include -0.). y must be integer + if (horizontal_or(xsign)) { + // test if y is an integer + BVTYPE yinteger = y == round(y); + // test if y is odd: convert to int and shift bit 0 into position of sign bit. + // this will be 0 if overflow + yodd = reinterpret_f(roundi(y) << 31); + z1 = select(yinteger, z | yodd, // y is integer. get sign if y is odd + select(x0 == 0.f, z, nan_vec(NAN_POW)));// NAN unless x0 == -0. + yodd = select(yinteger, yodd, 0); // yodd used below. only if y is integer + z = select(xsign, z1, z); } // check for range errors if (horizontal_and(xfinite & yfinite & (efinite | xzero))) { - // fast return if no special cases - return z; + return z; // fast return if no special cases } - // handle special error cases - z = select(yfinite & efinite, z, select(x1 == 1.f, VTYPE(1.f), select((x1 > 1.f) ^ sign_bit(y), infinite_vec(), 0.f))); - yodd = ITYPE(reinterpret_i(abs(y) + pow2_23)) << 31; // same as above - z = select(xfinite, z, select(y == 0.f, VTYPE(1.f), select(y < 0.f, VTYPE(0.f), infinite_vec() | (VTYPE(reinterpret_f(yodd)) & x0)))); - z = select(is_nan(x0), select(is_nan(y), x0 | y, x0), select(is_nan(y), y, z)); - return z; + // handle special error cases: y infinite + z1 = select(yfinite & efinite, z, + select(x1 == 1.f, VTYPE(1.f), + select((x1 > 1.f) ^ sign_bit(y), infinite_vec(), 0.f))); + + // handle x infinite + z1 = select(xfinite, z1, + select(y == 0.f, VTYPE(1.f), + select(y < 0.f, yodd & z, // 0.0 with the sign of z from above + abs(x0) | (x0 & yodd)))); // get sign of x0 only if y is odd integer + + // Always propagate nan: + // Deliberately differing from the IEEE-754 standard which has pow(0,nan)=1, and pow(1,nan)=1 + z1 = select(is_nan(x0)|is_nan(y), x0+y, z1); + return z1; } //This template is in vectorf128.h to prevent implicit conversion of float y to int when float version is not defined: -//template static Vec4f pow(Vec4f const & a, TT n); +//template static Vec4f pow(Vec4f const a, TT n); template <> -inline Vec4f pow(Vec4f const & x, Vec4f const & y) { - return pow_template_f(x, y); +inline Vec4f pow(Vec4f const x, Vec4f const y) { + return pow_template_f(x, y); } template <> -inline Vec4f pow(Vec4f const & x, float const & y) { - return pow_template_f(x, y); +inline Vec4f pow(Vec4f const x, float const y) { + return pow_template_f(x, y); } template <> -inline Vec4f pow(Vec4f const & x, double const & y) { - return pow_template_f(x, (float)y); +inline Vec4f pow(Vec4f const x, double const y) { + return pow_template_f(x, (float)y); } #if MAX_VECTOR_SIZE >= 256 template <> -inline Vec8f pow(Vec8f const & x, Vec8f const & y) { - return pow_template_f(x, y); +inline Vec8f pow(Vec8f const x, Vec8f const y) { + return pow_template_f(x, y); } template <> -inline Vec8f pow(Vec8f const & x, float const & y) { - return pow_template_f(x, y); 
+inline Vec8f pow(Vec8f const x, float const y) { + return pow_template_f(x, y); } template <> -inline Vec8f pow(Vec8f const & x, double const & y) { - return pow_template_f(x, (float)y); +inline Vec8f pow(Vec8f const x, double const y) { + return pow_template_f(x, (float)y); } #endif // MAX_VECTOR_SIZE >= 256 @@ -1751,18 +1953,18 @@ inline Vec8f pow(Vec8f const & x, double const & y) { #if MAX_VECTOR_SIZE >= 512 template <> -inline Vec16f pow(Vec16f const & x, Vec16f const & y) { - return pow_template_f(x, y); +inline Vec16f pow(Vec16f const x, Vec16f const y) { + return pow_template_f(x, y); } template <> -inline Vec16f pow(Vec16f const & x, float const & y) { - return pow_template_f(x, y); +inline Vec16f pow(Vec16f const x, float const y) { + return pow_template_f(x, y); } template <> -inline Vec16f pow(Vec16f const & x, double const & y) { - return pow_template_f(x, (float)y); +inline Vec16f pow(Vec16f const x, double const y) { + return pow_template_f(x, (float)y); } #endif // MAX_VECTOR_SIZE >= 512 @@ -1771,387 +1973,198 @@ inline Vec16f pow(Vec16f const & x, double const & y) { // ************************************************************* // power function with rational exponent // ************************************************************* -// Power function with rational exponent: x^(a/b) -// Template must be defined as class to allow partial template specialization -template -class Power_rational { -public: - // overloaded member function for each vector type - Vec4f pow(Vec4f const & x) { - Vec4f y = x; - // negative x allowed when b odd or a even - // (if a is even then either b is odd or a/b can be reduced, - // but we can check a even anyway at no cost to be sure) - if (a == 0) return 1.f; - if ((b | ~a) & 1) y = abs(y); - y = pow(y, float(double(a)/double(b))); - if (a & b & 1) y = sign_combine(y, x); // apply sign if a and b both odd - if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y); // zero allowed for positive a and b - return y; - } - Vec2d pow(Vec2d const & x) { - Vec2d y = x; - if (a == 0) return 1.; - if ((b | ~a) & 1) y = abs(y); - y = pow(y, double((long double)a/(long double)b)); - if (a & b & 1) y = sign_combine(y, x); - if ((a ^ b) >= 0) y = select(x == 0., 0., y); - return y; - } -#if MAX_VECTOR_SIZE >= 256 - Vec8f pow(Vec8f const & x) { - Vec8f y = x; - if (a == 0) return 1.f; - if ((b | ~a) & 1) y = abs(y); - y = pow(y, float(double(a)/double(b))); - if (a & b & 1) y = sign_combine(y, x); - if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y); - return y; - } - Vec4d pow(Vec4d const & x) { - Vec4d y = x; - if (a == 0) return 1.; - if ((b | ~a) & 1) y = abs(y); - y = pow(y, double((long double)a/(long double)b)); - if (a & b & 1) y = sign_combine(y, x); - if ((a ^ b) >= 0) y = select(x == 0., 0., y); - return y; - } -#endif // MAX_VECTOR_SIZE >= 256 -#if MAX_VECTOR_SIZE >= 512 - Vec16f pow(Vec16f const & x) { - Vec16f y = x; - if (a == 0) return 1.f; - if ((b | ~a) & 1) y = abs(y); - y = pow(y, float(double(a)/double(b))); - if (a & b & 1) y = sign_combine(y, x); - if ((a ^ b) >= 0) y = select(x == 0.f, 0.f, y); - return y; - } - Vec8d pow(Vec8d const & x) { - Vec8d y = x; - if (a == 0) return 1.; - if ((b | ~a) & 1) y = abs(y); - y = pow(y, double((long double)a/(long double)b)); - if (a & b & 1) y = sign_combine(y, x); - if ((a ^ b) >= 0) y = select(x == 0., 0., y); - return y; - } -#endif // MAX_VECTOR_SIZE >= 512 -}; - -// partial specialization for b = 1 -template -class Power_rational { -public: - template - VTYPE pow(VTYPE const & x) {return pow_n(x);} -}; 
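The rational-power machinery that follows never calls the general pow for the supported denominators; it composes square roots and cube roots instead. What those compositions compute, sketched with the scalar standard library (std::sqrt and std::cbrt standing in for the vector versions):

#include <cmath>
#include <cstdio>

int main() {
    double x = 5.0;
    double r34 = std::sqrt(x) * std::sqrt(std::sqrt(x));   // x^(1/2) * x^(1/4) = x^(3/4)
    double r23 = std::cbrt(x) * std::cbrt(x);              // x^(1/3) * x^(1/3) = x^(2/3)
    std::printf("%.17g vs %.17g\n", r34, std::pow(x, 3.0 / 4.0));
    std::printf("%.17g vs %.17g\n", r23, std::pow(x, 2.0 / 3.0));
}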
-// partial specialization for b = 2 -template -class Power_rational { -public: - template - VTYPE pow(VTYPE const & x) { - VTYPE y = pow_n<(a > 0 ? a/2 : (a-1)/2)>(x); - if (a & 1) y *= sqrt(x); - return y; - } -}; +// macro to call template power_rational +#define pow_ratio(x, a, b) (power_rational (x)) -// full specialization for a = 1, b = 2 -template<> -class Power_rational<1,2> { -public: - template - VTYPE pow(VTYPE const & x) { - return sqrt(x); - } -}; +// Power function with rational exponent: pow(x,a/b) +template +V power_rational (V const x) { -// full specialization for a = -1, b = 2 -template<> -class Power_rational<-1,2> { -public: - template - VTYPE pow(VTYPE const & x) { - // (this is faster than iteration method on modern CPUs) - return VTYPE(1.f) / sqrt(x); + // constexpr lambda to reduce rational number a/b + auto reduce_rational = [](int const aa, int const bb) constexpr { + int a = aa, b = bb; + if (b < 0) { + a = -a; b = -b; // make b positive + } + while ((((a | b) & 1) == 0) && b > 0) { // prime factor 2 + a /= 2; b /= 2; + } + while (a % 3 == 0 && b % 3 == 0 && b > 0) { // prime factor 3 + a /= 3; b /= 3; + } + while (a % 5 == 0 && b % 5 == 0 && b > 0) { // prime factor 5 + a /= 5; b /= 5; + } + return bb / b; // return common denominator + }; + constexpr int d = reduce_rational(a0, b0); + constexpr int a = a0 / d; + constexpr int b = b0 / d; + + // special cases + if constexpr (a == 0) return V(1.f); + + else if constexpr (b == 1) return pow_n(x); + + else if constexpr (b == 2) { + V y, t = sqrt(x); + if constexpr (a == 1) y = t; + else if constexpr (a == -1) y = V(1.f) / t; + else { + constexpr int a2 = a > 0 ? a / 2 : (a - 1) / 2; + y = pow_n(x) * t; + } +#ifdef SIGNED_ZERO + y = abs(y); // pow(-0., a/2.) must be +0. +#endif + return y; } -}; -// partial specialization for b = 3 -template -class Power_rational { -public: - template - VTYPE pow(VTYPE const & x) { - VTYPE t; - switch (a % 3) { - case -2: - t = reciprocal_cbrt(x); + else if constexpr (b == 3) { + V y; + constexpr int a3 = a % 3; + if constexpr (a3 == -2) { + V t = reciprocal_cbrt(x); t *= t; - if (a == -2) return t; - t = t / pow_n<(-a-2)/3>(x); - break; - case -1: - t = reciprocal_cbrt(x); - if (a == -1) return t; - t = t / pow_n<(-a-1)/3>(x); - break; - case 0: - t = pow_n(x); - break; - case 1: - t = cbrt(x); - if (a == 1) return t; - t = t * pow_n(x); - break; - case 2: - t = square_cbrt(x); - if (a == 2) return t; - t = t * pow_n(x); - break; + if constexpr (a == -2) y = t; + else y = t / pow_n(x); } - return t; + else if constexpr (a3 == -1) { + V t = reciprocal_cbrt(x); + if constexpr (a == -1) y = t; + else y = t / pow_n(x); // fail if INF + } + else if constexpr (a3 == 1) { + V t = cbrt(x); + if constexpr (a == 1) y = t; + else y = t * pow_n(x); + } + else if constexpr (a3 == 2) { + V t = square_cbrt(x); + if constexpr (a == 2) y = t; + else y = t * pow_n(x); + } + return y; } -}; -// partial specialization for b = 4 -template -class Power_rational { -public: - template - VTYPE pow(VTYPE const & x) { - VTYPE t, s1, s2; + else if constexpr (b == 4) { + constexpr int a4 = a % 4; + V s1, s2, y; s1 = sqrt(x); - if (a & 1) s2 = sqrt(s1); - switch (a % 4) { - case -3: - t = s2 / pow_n<1+(-a)/4>(x); - break; - case -2: - t = s1 / pow_n<1+(-a)/4>(x); - break; - case -1: - if (a != -1) s2 *= pow_n<(-a)/4>(x); - t = VTYPE(1.f) / s2; - break; - case 0: default: - t = pow_n(x); - break; - case 1: - t = s2; - if (a != 1) t *= pow_n(x); - break; - case 2: - t = s1; - if (a != 2) t *= pow_n(x); - break; 
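The constexpr reduction above only strips the common factors 2, 3 and 5, which is enough to map every denominator that has a dedicated branch (2, 3, 4, 6, 8) onto its reduced form at compile time. A standalone sketch of the same logic, restricted to the positive-denominator case (the lambda itself also normalises b < 0 first):

// returns the common factor that was removed from a/b
constexpr int reduce(int a, int b) {
    const int b0 = b;
    while ((((a | b) & 1) == 0) && b > 0) { a /= 2; b /= 2; }     // prime factor 2
    while (a % 3 == 0 && b % 3 == 0 && b > 0) { a /= 3; b /= 3; } // prime factor 3
    while (a % 5 == 0 && b % 5 == 0 && b > 0) { a /= 5; b /= 5; } // prime factor 5
    return b0 / b;
}
static_assert(reduce(2, 6) == 2, "2/6 reduces to 1/3");
static_assert(reduce(-4, 8) == 4, "-4/8 reduces to -1/2");

So, for example, pow_ratio(x, 2, 6) dispatches to the b == 3 branch and simply computes cbrt(x).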
- case 3: - t = s1 * s2; - if (a != 3) t *= pow_n(x); - break; + if ((a & 1) == 1) s2 = sqrt(s1); + + if constexpr (a4 == -3) { + y = s2 / pow_n(x); + } + else if constexpr (a4 == -1) { + if constexpr (a != -1) s2 *= pow_n(x); + y = V(1.f) / s2; } - return t; + else if constexpr (a4 == 1) { + if constexpr (a == 1) y = s2; + else y = s2 * pow_n(x); + } + else if constexpr (a4 == 3) { + V t = s1 * s2; + if constexpr (a != 3) t *= pow_n(x); + y = t; + } +#ifdef SIGNED_ZERO + y = abs(y); +#endif + return y; } -}; -// partial specialization for b = 6 -template -class Power_rational { -public: - template - VTYPE pow(VTYPE const & x) { - VTYPE t, s1, s2, s3; - switch (a % 6) { - case -5: - t = reciprocal_cbrt(x); - t = t * t * sqrt(t); - if (a != -5) t /= pow_n<(-a)/6>(x); - break; - case -4: - t = reciprocal_cbrt(x); - t *= t; - if (a != -4) t /= pow_n<(-a)/6>(x); - break; - case -3: - t = pow_n(x); - t /= sqrt(x); - break; - case -2: - t = reciprocal_cbrt(x); - if (a != -2) t /= pow_n<(-a)/6>(x); - break; - case -1: - t = sqrt(reciprocal_cbrt(x)); - if (a != -1) t /= pow_n<(-a)/6>(x); - break; - case 0: default: - t = pow_n(x); - break; - case 1: - t = sqrt(cbrt(x)); - if (a != 1) t *= pow_n(x); - break; - case 2: - t = cbrt(x); - if (a != 2) t *= pow_n(x); - break; - case 3: - t = sqrt(x); - if (a != 3) t *= pow_n(x); - break; - case 4: - t = square_cbrt(x); - if (a != 4) t *= pow_n(x); - break; - case 5: - t = cbrt(x); - t = t * t * sqrt(t); - if (a != 5) t *= pow_n(x); - break; + else if constexpr (b == 6) { + constexpr int a6 = a % 6; + V y; + if constexpr (a6 == -5) { + V t = cbrt(sqrt(x)) / x; + if constexpr (a != -5) t /= pow_n(x); + y = t; + } + else if constexpr (a6 == -1) { + V t = reciprocal_cbrt(sqrt(x)); + if constexpr (a != -1) t /= pow_n(x); + y = t; } - return t; + else if constexpr (a6 == 1) { + V t = cbrt(sqrt(x)); + if constexpr (a != 1) t *= pow_n(x); + y = t; + } + else if constexpr (a6 == 5) { + V s1 = sqrt(x); + V t = cbrt(s1); + t = t*t*s1; + if constexpr (a != 5) t *= pow_n(x); + y = t; + } +#ifdef SIGNED_ZERO + y = abs(y); +#endif + return y; } -}; -// partial specialization for b = 8 -template -class Power_rational { -public: - template - VTYPE pow(VTYPE const & x) { - VTYPE t, s1, s2, s3; - s1 = sqrt(x); // x^(1/2) - if (a & 3) s2 = sqrt(s1); // x^(1/4) - if (a & 1) s3 = sqrt(s2); // x^(1/8) - switch (a % 8) { - case -7: - t = s3 / pow_n<1+(-a)/8>(x); - break; - case -6: - t = s2 / pow_n<1+(-a)/8>(x); - break; - case -5: - t = s3 * (s2 / pow_n<1+(-a)/8>(x)); - break; - case -4: - t = s1 / pow_n<1+(-a)/8>(x); - break; - case -3: - t = s3 * (s1 / pow_n<1+(-a)/8>(x)); - break; - case -2: - if (a != -2) s2 *= pow_n<(-a)/8>(x); - t = VTYPE(1.f) / s2; - break; - case -1: - if (a != -1) s3 *= pow_n<(-a)/8>(x); - t = VTYPE(1.f) / s3; - break; - case 0: default: - t = pow_n(x); - break; - case 1: - t = s3; - if (a != 1) t *= pow_n(x); - break; - case 2: - t = s2; - if (a != 2) t *= pow_n(x); - break; - case 3: - t = s2 * s3; - if (a != 3) t *= pow_n(x); - break; - case 4: - t = s1; - if (a != 4) t *= pow_n(x); - break; - case 5: - t = s1 * s3; - if (a != 5) t *= pow_n(x); - break; - case 6: - t = s1 * s2; - if (a != 6) t *= pow_n(x); - break; - case 7: - t = s2 * s3; - if (a != 7) s1 *= pow_n(x); + else if constexpr (b == 8) { + V s1 = sqrt(x); // x^(1/2) + V s2 = sqrt(s1); // x^(1/4) + V s3 = sqrt(s2); // x^(1/8) + V y; + constexpr int a8 = a % 8; + if constexpr (a8 == -7) { + y = s3 / pow_n(x); + } + else if constexpr (a8 == -5) { + y = s3 * (s2 / pow_n(x)); + } + else if 
constexpr (a8 == -3) { + y = s3 * (s1 / pow_n(x)); + } + else if constexpr (a8 == -1) { + if constexpr (a != -1) s3 *= pow_n(x); + y = V(1.f) / s3; + } + else if constexpr (a8 == 1) { + if constexpr (a == 1) y = s3; + else y = s3 * pow_n(x); + } + else if constexpr (a8 == 3) { + V t = s2 * s3; + if constexpr (a != 3) t *= pow_n(x); + y = t; + } + else if constexpr (a8 == 5) { + V t = s1 * s3; + if constexpr (a != 5) t *= pow_n(x); + y = t; + } + else if constexpr (a8 == 7) { + V t = s2 * s3; + if constexpr (a != 7) s1 *= pow_n(x); t *= s1; - break; - + y = t; } - return t; +#ifdef SIGNED_ZERO + y = abs(y); +#endif + return y; } -}; - -// macro to call template class member function pow -#define pow_ratio(x, a, b) (Power_rational<(b)<0 ? -(a):(a), (b)<0 ? -(b):(b)> ().pow(x)) - - -/****************************************************************************** -* Detect NAN codes -* -* These functions return the code hidden in a NAN. The sign bit is ignored -******************************************************************************/ - -static inline Vec4i nan_code(Vec4f const & x) { - Vec4i a = reinterpret_i(x); - Vec4ib b = (a & 0x7F800000) == 0x7F800000; // check if NAN/INF - return a & 0x007FFFFF & Vec4i(b); // isolate NAN code bits -} - -// This function returns the code hidden in a NAN. The sign bit is ignored -static inline Vec2q nan_code(Vec2d const & x) { - Vec2q a = reinterpret_i(x); - Vec2q const m = 0x7FF0000000000000; - Vec2q const n = 0x000FFFFFFFFFFFFF; - Vec2qb b = (a & m) == m; // check if NAN/INF - return a & n & Vec2q(b); // isolate NAN code bits -} - -#if MAX_VECTOR_SIZE >= 256 -// This function returns the code hidden in a NAN. The sign bit is ignored -static inline Vec8i nan_code(Vec8f const & x) { - Vec8i a = reinterpret_i(x); - Vec8ib b = (a & 0x7F800000) == 0x7F800000; // check if NAN/INF - return a & 0x007FFFFF & Vec8i(b); // isolate NAN code bits -} - -// This function returns the code hidden in a NAN. The sign bit is ignored -static inline Vec4q nan_code(Vec4d const & x) { - Vec4q a = reinterpret_i(x); - Vec4q const m = 0x7FF0000000000000; - Vec4q const n = 0x000FFFFFFFFFFFFF; - Vec4qb b = (a & m) == m; // check if NAN/INF - return a & n & Vec4q(b); // isolate NAN code bits -} - -#endif // MAX_VECTOR_SIZE >= 256 -#if MAX_VECTOR_SIZE >= 512 - -// This function returns the code hidden in a NAN. The sign bit is ignored -static inline Vec16i nan_code(Vec16f const & x) { - Vec16i a = Vec16i(reinterpret_i(x)); - Vec16ib b = (a & 0x7F800000) == 0x7F800000; // check if NAN/INF - return a & 0x007FFFFF & Vec16i(b); // isolate NAN code bits -} - -// This function returns the code hidden in a NAN. 
The sign bit is ignored -static inline Vec8q nan_code(Vec8d const & x) { - Vec8q a = Vec8q(reinterpret_i(x)); - Vec8q const m = 0x7FF0000000000000; - Vec8q const n = 0x000FFFFFFFFFFFFF; - Vec8qb b = (a & m) == m; // check if NAN/INF - return a & n & Vec8q(b); // isolate NAN code bits + else { + // general case + V y = x; + // negative x allowed when b odd or a even + // (if a is even then either b is odd or a/b can be reduced, + // but we can check a even anyway at no cost to be sure) + if constexpr (((b | ~a) & 1) == 1) y = abs(y); + y = pow(y, (double(a) / double(b))); + if constexpr ((a & b & 1) == 1) y = sign_combine(y, x); // apply sign if a and b both odd + return y; + } } -#endif // MAX_VECTOR_SIZE >= 512 #ifdef VCL_NAMESPACE } diff --git a/DFTTest/VCL2/vectormath_hyp.h b/DFTTest/VCL2/vectormath_hyp.h new file mode 100644 index 0000000..6c0efa8 --- /dev/null +++ b/DFTTest/VCL2/vectormath_hyp.h @@ -0,0 +1,717 @@ +/**************************** vectormath_hyp.h ****************************** +* Author: Agner Fog +* Date created: 2014-07-09 +* Last modified: 2019-08-01 +* Version: 2.00.00 +* Project: vector class library +* Description: +* Header file containing inline vector functions of hyperbolic and inverse +* hyperbolic functions: +* sinh hyperbolic sine +* cosh hyperbolic cosine +* tanh hyperbolic tangent +* asinh inverse hyperbolic sine +* acosh inverse hyperbolic cosine +* atanh inverse hyperbolic tangent +* +* Theory, methods and inspiration based partially on these sources: +* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. +* Ellis Horwood, 1989. +* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and +* Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt +* > Cephes math library by Stephen L. Moshier 1992, +* http://www.netlib.org/cephes/ +* +* For detailed instructions, see vectormath_common.h and vcl_manual.pdf +* +* (c) Copyright 2014-2019 Agner Fog. +* Apache License version 2.0 or later. +******************************************************************************/ + +#ifndef VECTORMATH_HYP_H +#define VECTORMATH_HYP_H 1 + +#include "vectormath_exp.h" + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + +/****************************************************************************** +* Hyperbolic functions +******************************************************************************/ + +// Template for sinh function, double precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE sinh_d(VTYPE const x0) { +// The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 
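A minimal scalar sketch of the two-branch scheme used by sinh_d (hedged, assuming only <cmath>): for |x| <= 1 a rational approximation of the form x + x^3*P(x^2)/Q(x^2) is used, otherwise sinh(x) = 0.5*exp(x) - 0.5*exp(-x), reusing the half-scaled exponential mentioned in the comment above:

#include <cmath>
// Large-argument branch: with h = 0.5*exp(|x|), h - 0.25/h == 0.5*(exp(|x|) - exp(-|x|)) == sinh(|x|).
inline double sinh_big_sketch(double x) {
    double h = 0.5 * std::exp(std::fabs(x));
    return std::copysign(h - 0.25 / h, x);   // restore the sign of x, as sign_combine does in the vector code
}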
+ + // Coefficients + const double p0 = -3.51754964808151394800E5; + const double p1 = -1.15614435765005216044E4; + const double p2 = -1.63725857525983828727E2; + const double p3 = -7.89474443963537015605E-1; + + const double q0 = -2.11052978884890840399E6; + const double q1 = 3.61578279834431989373E4; + const double q2 = -2.77711081420602794433E2; + const double q3 = 1.0; + + // data vectors + VTYPE x, x2, y1, y2; + + x = abs(x0); + auto x_small = x <= 1.0; // use Pade approximation if abs(x) <= 1 + + if (horizontal_or(x_small)) { + // At least one element needs small method + x2 = x*x; + y1 = polynomial_3(x2, p0, p1, p2, p3) / polynomial_3(x2, q0, q1, q2, q3); + y1 = mul_add(y1, x*x2, x); // y1 = x + x2*(x*y1); + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = exp_d(x); // 0.5 * exp(x) + y2 -= 0.25 / y2; // - 0.5 * exp(-x) + } + y1 = select(x_small, y1, y2); // choose method + y1 = sign_combine(y1, x0); // get original sign + // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision + + return y1; +} + +// instances of sinh_d template +static inline Vec2d sinh(Vec2d const x) { + return sinh_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d sinh(Vec4d const x) { + return sinh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d sinh(Vec8d const x) { + return sinh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for sinh function, single precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE sinh_f(VTYPE const x0) { +// The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). + + // Coefficients + const float r0 = 1.66667160211E-1f; + const float r1 = 8.33028376239E-3f; + const float r2 = 2.03721912945E-4f; + + // data vectors + VTYPE x, x2, y1, y2; + + x = abs(x0); + auto x_small = x <= 1.0f; // use polynomial approximation if abs(x) <= 1 + + if (horizontal_or(x_small)) { + // At least one element needs small method + x2 = x*x; + y1 = polynomial_2(x2, r0, r1, r2); + y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = exp_f(x); // 0.5 * exp(x) + y2 -= 0.25f / y2; // - 0.5 * exp(-x) + } + y1 = select(x_small, y1, y2); // choose method + y1 = sign_combine(y1, x0); // get original sign + // you can avoid the sign_combine by replacing x by x0 above, but at a loss of precision + + return y1; +} + +// instances of sinh_f template +static inline Vec4f sinh(Vec4f const x) { + return sinh_f(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f sinh(Vec8f const x) { + return sinh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f sinh(Vec16f const x) { + return sinh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for cosh function, double precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE cosh_d(VTYPE const x0) { +// The limit of abs(x) is 709.7, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 
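cosh_d uses the same half-scaled exponential with a plus sign; a scalar equivalent of the computation that follows (illustrative sketch, <cmath> only):

#include <cmath>
// cosh(x) = 0.5*exp(|x|) + 0.5*exp(-|x|); with h = 0.5*exp(|x|), the second term is 0.25/h.
inline double cosh_sketch(double x) {
    double h = 0.5 * std::exp(std::fabs(x));
    return h + 0.25 / h;
}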
+ + // data vectors + VTYPE x, y; + x = abs(x0); + y = exp_d(x); // 0.5 * exp(x) + y += 0.25 / y; // + 0.5 * exp(-x) + return y; +} + +// instances of sinh_d template +static inline Vec2d cosh(Vec2d const x) { + return cosh_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d cosh(Vec4d const x) { + return cosh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d cosh(Vec8d const x) { + return cosh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for cosh function, single precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE cosh_f(VTYPE const x0) { +// The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). + + // data vectors + VTYPE x, y; + x = abs(x0); + y = exp_f(x); // 0.5 * exp(x) + y += 0.25f / y; // + 0.5 * exp(-x) + return y; +} + +// instances of sinh_d template +static inline Vec4f cosh(Vec4f const x) { + return cosh_f(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f cosh(Vec8f const x) { + return cosh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f cosh(Vec16f const x) { + return cosh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for tanh function, double precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE tanh_d(VTYPE const x0) { + + // Coefficients + const double p0 = -1.61468768441708447952E3; + const double p1 = -9.92877231001918586564E1; + const double p2 = -9.64399179425052238628E-1; + + const double q0 = 4.84406305325125486048E3; + const double q1 = 2.23548839060100448583E3; + const double q2 = 1.12811678491632931402E2; + const double q3 = 1.0; + + // data vectors + VTYPE x, x2, y1, y2; + + x = abs(x0); + auto x_small = x <= 0.625; // use Pade approximation if abs(x) <= 5/8 + + if (horizontal_or(x_small)) { + // At least one element needs small method + x2 = x*x; + y1 = polynomial_2(x2, p0, p1, p2) / polynomial_3(x2, q0, q1, q2, q3); + y1 = mul_add(y1, x2*x, x); // y1 = x + x2*(x*y1); + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = exp(x+x); // exp(2*x) + y2 = 1.0 - 2.0 / (y2 + 1.0); // tanh(x) + } + auto x_big = x > 350.; + y1 = select(x_small, y1, y2); // choose method + y1 = select(x_big, 1.0, y1); // avoid overflow + y1 = sign_combine(y1, x0); // get original sign + return y1; +} + +// instances of tanh_d template +static inline Vec2d tanh(Vec2d const x) { + return tanh_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d tanh(Vec4d const x) { + return tanh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d tanh(Vec8d const x) { + return tanh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for tanh function, single precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE tanh_f(VTYPE const x0) { +// The limit of abs(x) is 89.0, as defined by max_x in vectormath_exp.h for 0.5*exp(x). 
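For tanh_f the large-argument branch evaluates tanh(|x|) = 1 - 2/(exp(2|x|) + 1) and saturates to 1 beyond the x_big threshold; a scalar sketch (assumptions: <cmath>, the 44.4f cutoff taken from the code that follows):

#include <cmath>
inline float tanh_big_sketch(float x) {
    float ax = std::fabs(x);
    float t  = 1.0f - 2.0f / (std::exp(2.0f * ax) + 1.0f);  // tanh(|x|)
    if (ax > 44.4f) t = 1.0f;                                // avoid overflow for huge exp(2*ax)
    return std::copysign(t, x);                              // restore the sign of x
}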
+ + // Coefficients + const float r0 = -3.33332819422E-1f; + const float r1 = 1.33314422036E-1f; + const float r2 = -5.37397155531E-2f; + const float r3 = 2.06390887954E-2f; + const float r4 = -5.70498872745E-3f; + + // data vectors + VTYPE x, x2, y1, y2; + + x = abs(x0); + auto x_small = x <= 0.625f; // use polynomial approximation if abs(x) <= 5/8 + + if (horizontal_or(x_small)) { + // At least one element needs small method + x2 = x*x; + y1 = polynomial_4(x2, r0, r1, r2, r3, r4); + y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = exp(x+x); // exp(2*x) + y2 = 1.0f - 2.0f / (y2 + 1.0f); // tanh(x) + } + auto x_big = x > 44.4f; + y1 = select(x_small, y1, y2); // choose method + y1 = select(x_big, 1.0f, y1); // avoid overflow + y1 = sign_combine(y1, x0); // get original sign + return y1; +} + +// instances of tanh_f template +static inline Vec4f tanh(Vec4f const x) { + return tanh_f(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f tanh(Vec8f const x) { + return tanh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f tanh(Vec16f const x) { + return tanh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + + +/****************************************************************************** +* Inverse hyperbolic functions +******************************************************************************/ + +// Template for asinh function, double precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE asinh_d(VTYPE const x0) { + + // Coefficients + const double p0 = -5.56682227230859640450E0; + const double p1 = -9.09030533308377316566E0; + const double p2 = -4.37390226194356683570E0; + const double p3 = -5.91750212056387121207E-1; + const double p4 = -4.33231683752342103572E-3; + + const double q0 = 3.34009336338516356383E1; + const double q1 = 6.95722521337257608734E1; + const double q2 = 4.86042483805291788324E1; + const double q3 = 1.28757002067426453537E1; + const double q4 = 1.0; + + // data vectors + VTYPE x, x2, y1, y2; + + x2 = x0 * x0; + x = abs(x0); + auto x_small = x <= 0.533; // use Pade approximation if abs(x) <= 0.5 + // Both methods give the highest error close to 0.5. 
+ // This limit is adjusted for minimum error + auto x_huge = x > 1.E20; // simple approximation, avoid overflow + + if (horizontal_or(x_small)) { + // At least one element needs small method + y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_4(x2, q0, q1, q2, q3, q4); + y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = log(x + sqrt(x2 + 1.0)); + if (horizontal_or(x_huge)) { + // At least one element needs huge method to avoid overflow + y2 = select(x_huge, log(x) + VM_LN2, y2); + } + } + y1 = select(x_small, y1, y2); // choose method + y1 = sign_combine(y1, x0); // get original sign + return y1; +} + +// instances of asinh_d template +static inline Vec2d asinh(Vec2d const x) { + return asinh_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d asinh(Vec4d const x) { + return asinh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d asinh(Vec8d const x) { + return asinh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for asinh function, single precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE asinh_f(VTYPE const x0) { + + // Coefficients + const float r0 = -1.6666288134E-1f; + const float r1 = 7.4847586088E-2f; + const float r2 = -4.2699340972E-2f; + const float r3 = 2.0122003309E-2f; + + // data vectors + VTYPE x, x2, y1, y2; + + x2 = x0 * x0; + x = abs(x0); + auto x_small = x <= 0.51f; // use polynomial approximation if abs(x) <= 0.5 + auto x_huge = x > 1.E10f; // simple approximation, avoid overflow + + if (horizontal_or(x_small)) { + // At least one element needs small method + y1 = polynomial_3(x2, r0, r1, r2, r3); + y1 = mul_add(y1, x2*x, x); // y1 = x + (x2*x)*y1; + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = log(x + sqrt(x2 + 1.0f)); + if (horizontal_or(x_huge)) { + // At least one element needs huge method to avoid overflow + y2 = select(x_huge, log(x) + (float)VM_LN2, y2); + } + } + y1 = select(x_small, y1, y2); // choose method + y1 = sign_combine(y1, x0); // get original sign + return y1; +} + +// instances of asinh_f template +static inline Vec4f asinh(Vec4f const x) { + return asinh_f(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f asinh(Vec8f const x) { + return asinh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f asinh(Vec16f const x) { + return asinh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for acosh function, double precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE acosh_d(VTYPE const x0) { + + // Coefficients + const double p0 = 1.10855947270161294369E5; + const double p1 = 1.08102874834699867335E5; + const double p2 = 3.43989375926195455866E4; + const double p3 = 3.94726656571334401102E3; + const double p4 = 1.18801130533544501356E2; + + const double q0 = 7.83869920495893927727E4; + const double q1 = 8.29725251988426222434E4; + const double q2 = 2.97683430363289370382E4; + const double q3 = 4.15352677227719831579E3; + const double q4 = 1.86145380837903397292E2; + const double q5 = 1.0; + + // data vectors + VTYPE x1, y1, y2; + + x1 = x0 - 1.0; + auto undef = x0 < 1.0; // result is NAN + auto x_small = x1 < 0.49; // use Pade approximation if abs(x-1) < 0.5 + auto x_huge = x1 > 1.E20; // simple approximation, avoid overflow 
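Outside the small-argument range, asinh and acosh fall back to their logarithmic identities, switching to log(x) + ln 2 for huge x so that x*x cannot overflow; an illustrative scalar version of the asinh case (sketch only, the VM_LN2 constant written out):

#include <cmath>
inline double asinh_sketch(double x) {
    double ax = std::fabs(x);
    double y  = (ax > 1e20) ? std::log(ax) + 0.69314718055994530942      // log(x) + ln 2
                            : std::log(ax + std::sqrt(ax * ax + 1.0));   // log(x + sqrt(x^2 + 1))
    return std::copysign(y, x);                                          // asinh is an odd function
}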
+ + if (horizontal_or(x_small)) { + // At least one element needs small method + y1 = sqrt(x1) * (polynomial_4(x1, p0, p1, p2, p3, p4) / polynomial_5(x1, q0, q1, q2, q3, q4, q5)); + // x < 1 generates NAN + y1 = select(undef, nan_vec(NAN_HYP), y1); + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); + if (horizontal_or(x_huge)) { + // At least one element needs huge method to avoid overflow + y2 = select(x_huge, log(x0) + VM_LN2, y2); + } + } + y1 = select(x_small, y1, y2); // choose method + return y1; +} + +// instances of acosh_d template +static inline Vec2d acosh(Vec2d const x) { + return acosh_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d acosh(Vec4d const x) { + return acosh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d acosh(Vec8d const x) { + return acosh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for acosh function, single precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE acosh_f(VTYPE const x0) { + + // Coefficients + const float r0 = 1.4142135263E0f; + const float r1 = -1.1784741703E-1f; + const float r2 = 2.6454905019E-2f; + const float r3 = -7.5272886713E-3f; + const float r4 = 1.7596881071E-3f; + + // data vectors + VTYPE x1, y1, y2; + + x1 = x0 - 1.0f; + auto undef = x0 < 1.0f; // result is NAN + auto x_small = x1 < 0.49f; // use Pade approximation if abs(x-1) < 0.5 + auto x_huge = x1 > 1.E10f; // simple approximation, avoid overflow + + if (horizontal_or(x_small)) { + // At least one element needs small method + y1 = sqrt(x1) * polynomial_4(x1, r0, r1, r2, r3, r4); + // x < 1 generates NAN + y1 = select(undef, nan_vec(NAN_HYP), y1); + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = log(x0 + sqrt(mul_sub(x0,x0,1.0))); + if (horizontal_or(x_huge)) { + // At least one element needs huge method to avoid overflow + y2 = select(x_huge, log(x0) + (float)VM_LN2, y2); + } + } + y1 = select(x_small, y1, y2); // choose method + return y1; +} + +// instances of acosh_f template +static inline Vec4f acosh(Vec4f const x) { + return acosh_f(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f acosh(Vec8f const x) { + return acosh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f acosh(Vec16f const x) { + return acosh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for atanh function, double precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE atanh_d(VTYPE const x0) { + + // Coefficients + const double p0 = -3.09092539379866942570E1; + const double p1 = 6.54566728676544377376E1; + const double p2 = -4.61252884198732692637E1; + const double p3 = 1.20426861384072379242E1; + const double p4 = -8.54074331929669305196E-1; + + const double q0 = -9.27277618139601130017E1; + const double q1 = 2.52006675691344555838E2; + const double q2 = -2.49839401325893582852E2; + const double q3 = 1.08938092147140262656E2; + const double q4 = -1.95638849376911654834E1; + const double q5 = 1.0; + + // data vectors + VTYPE x, x2, y1, y2, y3; + + x = abs(x0); + auto x_small = x < 0.5; // use Pade approximation if abs(x) < 0.5 + + if (horizontal_or(x_small)) { + // At least one element needs small method + x2 = x * x; + y1 = polynomial_4(x2, p0, p1, p2, p3, p4) / polynomial_5(x2, q0, q1, q2, 
q3, q4, q5); + y1 = mul_add(y1, x2*x, x); + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = log((1.0+x)/(1.0-x)) * 0.5; + // check if out of range + y3 = select(x == 1.0, infinite_vec(), nan_vec(NAN_HYP)); + y2 = select(x >= 1.0, y3, y2); + } + y1 = select(x_small, y1, y2); // choose method + y1 = sign_combine(y1, x0); // get original sign + return y1; +} + +// instances of atanh_d template +static inline Vec2d atanh(Vec2d const x) { + return atanh_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d atanh(Vec4d const x) { + return atanh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d atanh(Vec8d const x) { + return atanh_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// Template for atanh function, single precision +// This function does not produce denormals +// Template parameters: +// VTYPE: double vector type +template +static inline VTYPE atanh_f(VTYPE const x0) { + + // Coefficients + const float r0 = 3.33337300303E-1f; + const float r1 = 1.99782164500E-1f; + const float r2 = 1.46691431730E-1f; + const float r3 = 8.24370301058E-2f; + const float r4 = 1.81740078349E-1f; + + // data vectors + VTYPE x, x2, y1, y2, y3; + + x = abs(x0); + auto x_small = x < 0.5f; // use polynomial approximation if abs(x) < 0.5 + + if (horizontal_or(x_small)) { + // At least one element needs small method + x2 = x * x; + y1 = polynomial_4(x2, r0, r1, r2, r3, r4); + y1 = mul_add(y1, x2*x, x); + } + if (!horizontal_and(x_small)) { + // At least one element needs big method + y2 = log((1.0f+x)/(1.0f-x)) * 0.5f; + // check if out of range + y3 = select(x == 1.0f, infinite_vec(), nan_vec(NAN_HYP)); + y2 = select(x >= 1.0f, y3, y2); + } + y1 = select(x_small, y1, y2); // choose method + y1 = sign_combine(y1, x0); // get original sign + return y1; +} + +// instances of atanh_f template +static inline Vec4f atanh(Vec4f const x) { + return atanh_f(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f atanh(Vec8f const x) { + return atanh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f atanh(Vec16f const x) { + return atanh_f(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + +#ifdef VCL_NAMESPACE +} +#endif + +#endif diff --git a/DFTTest/VCL2/vectormath_lib.h b/DFTTest/VCL2/vectormath_lib.h new file mode 100644 index 0000000..172fbbb --- /dev/null +++ b/DFTTest/VCL2/vectormath_lib.h @@ -0,0 +1,2026 @@ +/**************************** vectormath_lib.h ***************************** +* Author: Agner Fog +* Date created: 2012-05-30 +* Last modified: 2019-08-01 +* Version: 2.00.00 +* Project: vector class library +* Description: +* Header file defining mathematical functions on floating point vectors +* using Intel SVML library +* +* Instructions to use SVML library: +* Include this file and link with svml +* +* Alternatively, use the inline math functions by including +* vectormath_exp.h for power and exponential functions +* vectormath_trig.h for trigonometric functions +* vectormath_hyp.h for hyperbolic functions +* +* For detailed instructions, see vcl_manual.pdf +* +* (c) Copyright 2012-2019 Agner Fog. +* Apache License version 2.0 or later. +\*****************************************************************************/ + +// check combination of header files +#ifndef VECTORMATH_LIB_H +#define VECTORMATH_LIB_H 1 + +#ifdef VECTORMATH_COMMON_H +#error conflicting header files. 
More than one implementation of mathematical functions included +#else + +#include "vectorclass.h" // make sure vector classes are defined first + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { // optional name space +#endif + + +#ifdef __INTEL_COMPILER +/***************************************************************************** +* +* 128-bit vector functions using Intel compiler +* +*****************************************************************************/ + +// exponential and power functions +static inline Vec4f exp(Vec4f const x) { // exponential function + return _mm_exp_ps(x); +} +static inline Vec2d exp(Vec2d const x) { // exponential function + return _mm_exp_pd(x); +} +static inline Vec4f expm1(Vec4f const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return _mm_expm1_ps(x); +} +static inline Vec2d expm1(Vec2d const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return _mm_expm1_pd(x); +} +static inline Vec4f exp2(Vec4f const x) { // pow(2,x) + return _mm_exp2_ps(x); +} +static inline Vec2d exp2(Vec2d const x) { // pow(2,x) + return _mm_exp2_pd(x); +} +static inline Vec4f exp10(Vec4f const x) { // pow(10,x) + return _mm_exp10_ps(x); +} +static inline Vec2d exp10(Vec2d const x) { // pow(10,x) + return _mm_exp10_pd(x); +} +static inline Vec4f pow(Vec4f const a, Vec4f const b) { // pow(a,b) = a to the power of b + return _mm_pow_ps(a, b); +} +static inline Vec4f pow(Vec4f const a, float const b) { // pow(a,b) = a to the power of b + return _mm_pow_ps(a, Vec4f(b)); +} +static inline Vec2d pow(Vec2d const a, Vec2d const b) { // pow(a,b) = a to the power of b + return _mm_pow_pd(a, b); +} +static inline Vec2d pow(Vec2d const a, double const b) { // pow(a,b) = a to the power of b + return _mm_pow_pd(a, Vec2d(b)); +} +static inline Vec4f cbrt(Vec4f const x) { // pow(x,1/3) + return _mm_cbrt_ps(x); +} +static inline Vec2d cbrt(Vec2d const x) { // pow(x,1/3) + return _mm_cbrt_pd(x); +} +// logarithms +static inline Vec4f log(Vec4f const x) { // natural logarithm + return _mm_log_ps(x); +} +static inline Vec2d log(Vec2d const x) { // natural logarithm + return _mm_log_pd(x); +} +static inline Vec4f log1p(Vec4f const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return _mm_log1p_ps(x); +} +static inline Vec2d log1p(Vec2d const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return _mm_log1p_pd(x); +} +static inline Vec4f log2(Vec4f const x) { // logarithm base 2 + return _mm_log2_ps(x); +} +static inline Vec2d log2(Vec2d const x) { // logarithm base 2 + return _mm_log2_pd(x); +} +static inline Vec4f log10(Vec4f const x) { // logarithm base 10 + return _mm_log10_ps(x); +} +static inline Vec2d log10(Vec2d const x) { // logarithm base 10 + return _mm_log10_pd(x); +} + +// trigonometric functions +static inline Vec4f sin(Vec4f const x) { // sine + return _mm_sin_ps(x); +} +static inline Vec2d sin(Vec2d const x) { // sine + return _mm_sin_pd(x); +} +static inline Vec4f cos(Vec4f const x) { // cosine + return _mm_cos_ps(x); +} +static inline Vec2d cos(Vec2d const x) { // cosine + return _mm_cos_pd(x); +} +static inline Vec4f sincos(Vec4f * pcos, Vec4f const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m128 r_sin, r_cos; + r_sin = _mm_sincos_ps(&r_cos, x); + *pcos = r_cos; + return r_sin; +} +static inline Vec2d sincos(Vec2d * pcos, Vec2d const x) { // sine and cosine. 
sin(x) returned, cos(x) in pcos + __m128d r_sin, r_cos; + r_sin = _mm_sincos_pd(&r_cos, x); + *pcos = r_cos; + return r_sin; +} +static inline Vec4f tan(Vec4f const x) { // tangent + return _mm_tan_ps(x); +} +static inline Vec2d tan(Vec2d const x) { // tangent + return _mm_tan_pd(x); +} + +// inverse trigonometric functions +static inline Vec4f asin(Vec4f const x) { // inverse sine + return _mm_asin_ps(x); +} +static inline Vec2d asin(Vec2d const x) { // inverse sine + return _mm_asin_pd(x); +} + +static inline Vec4f acos(Vec4f const x) { // inverse cosine + return _mm_acos_ps(x); +} +static inline Vec2d acos(Vec2d const x) { // inverse cosine + return _mm_acos_pd(x); +} + +static inline Vec4f atan(Vec4f const x) { // inverse tangent + return _mm_atan_ps(x); +} +static inline Vec2d atan(Vec2d const x) { // inverse tangent + return _mm_atan_pd(x); +} +static inline Vec4f atan2(Vec4f const a, Vec4f const b) { // inverse tangent of a/b + return _mm_atan2_ps(a, b); +} +static inline Vec2d atan2(Vec2d const a, Vec2d const b) { // inverse tangent of a/b + return _mm_atan2_pd(a, b); +} + +// hyperbolic functions and inverse hyperbolic functions +static inline Vec4f sinh(Vec4f const x) { // hyperbolic sine + return _mm_sinh_ps(x); +} +static inline Vec2d sinh(Vec2d const x) { // hyperbolic sine + return _mm_sinh_pd(x); +} +static inline Vec4f cosh(Vec4f const x) { // hyperbolic cosine + return _mm_cosh_ps(x); +} +static inline Vec2d cosh(Vec2d const x) { // hyperbolic cosine + return _mm_cosh_pd(x); +} +static inline Vec4f tanh(Vec4f const x) { // hyperbolic tangent + return _mm_tanh_ps(x); +} +static inline Vec2d tanh(Vec2d const x) { // hyperbolic tangent + return _mm_tanh_pd(x); +} +static inline Vec4f asinh(Vec4f const x) { // inverse hyperbolic sine + return _mm_asinh_ps(x); +} +static inline Vec2d asinh(Vec2d const x) { // inverse hyperbolic sine + return _mm_asinh_pd(x); +} +static inline Vec4f acosh(Vec4f const x) { // inverse hyperbolic cosine + return _mm_acosh_ps(x); +} +static inline Vec2d acosh(Vec2d const x) { // inverse hyperbolic cosine + return _mm_acosh_pd(x); +} +static inline Vec4f atanh(Vec4f const x) { // inverse hyperbolic tangent + return _mm_atanh_ps(x); +} +static inline Vec2d atanh(Vec2d const x) { // inverse hyperbolic tangent + return _mm_atanh_pd(x); +} + +// error function +static inline Vec4f erf(Vec4f const x) { // error function + return _mm_erf_ps(x); +} +static inline Vec2d erf(Vec2d const x) { // error function + return _mm_erf_pd(x); +} +static inline Vec4f erfc(Vec4f const x) { // error function complement + return _mm_erfc_ps(x); +} +static inline Vec2d erfc(Vec2d const x) { // error function complement + return _mm_erfc_pd(x); +} +static inline Vec4f erfinv(Vec4f const x) { // inverse error function + return _mm_erfinv_ps(x); +} +static inline Vec2d erfinv(Vec2d const x) { // inverse error function + return _mm_erfinv_pd(x); +} + +static inline Vec4f cdfnorm(Vec4f const x) { // cumulative normal distribution function + return _mm_cdfnorm_ps(x); +} +static inline Vec2d cdfnorm(Vec2d const x) { // cumulative normal distribution function + return _mm_cdfnorm_pd(x); +} +static inline Vec4f cdfnorminv(Vec4f const x) { // inverse cumulative normal distribution function + return _mm_cdfnorminv_ps(x); +} +static inline Vec2d cdfnorminv(Vec2d const x) { // inverse cumulative normal distribution function + return _mm_cdfnorminv_pd(x); +} + +#else +/***************************************************************************** +* +* 128-bit vector functions using other 
compiler than Intel +* +*****************************************************************************/ + +#if (defined(_WIN64) || defined(__CYGWIN__)) && defined(__x86_64__) +// fix incompatible calling convention in Win64 +#if defined(_MSC_VER) || defined(__clang__) +#define V_VECTORCALL __vectorcall +#else +// gcc. Change this if future gcc version supports __vectorcall +#define V_VECTORCALL __attribute__((sysv_abi)) // this is inefficient but it works +#endif +#else // not Win64. Vectors are transferred in registers by default +#define V_VECTORCALL +#endif + +// External function prototypes, 128-bit vectors +extern "C" { + extern __m128 V_VECTORCALL __svml_expf4 (__m128); + extern __m128d V_VECTORCALL __svml_exp2 (__m128d); + extern __m128 V_VECTORCALL __svml_expm1f4 (__m128); + extern __m128d V_VECTORCALL __svml_expm12 (__m128d); + extern __m128 V_VECTORCALL __svml_exp2f4 (__m128); + extern __m128d V_VECTORCALL __svml_exp22 (__m128d); + extern __m128 V_VECTORCALL __svml_exp10f4 (__m128); + extern __m128d V_VECTORCALL __svml_exp102 (__m128d); + extern __m128 V_VECTORCALL __svml_powf4 (__m128, __m128); + extern __m128d V_VECTORCALL __svml_pow2 (__m128d, __m128d); + extern __m128 V_VECTORCALL __svml_cbrtf4 (__m128); + extern __m128d V_VECTORCALL __svml_cbrt2 (__m128d); + extern __m128 V_VECTORCALL __svml_invsqrtf4 (__m128); + extern __m128d V_VECTORCALL __svml_invsqrt2 (__m128d); + extern __m128 V_VECTORCALL __svml_logf4 (__m128); + extern __m128d V_VECTORCALL __svml_log2 (__m128d); + extern __m128 V_VECTORCALL __svml_log1pf4 (__m128); + extern __m128d V_VECTORCALL __svml_log1p2 (__m128d); + extern __m128 V_VECTORCALL __svml_log2f4 (__m128); + extern __m128d V_VECTORCALL __svml_log22 (__m128d); + extern __m128 V_VECTORCALL __svml_log10f4 (__m128); + extern __m128d V_VECTORCALL __svml_log102 (__m128d); + extern __m128 V_VECTORCALL __svml_sinf4 (__m128); + extern __m128d V_VECTORCALL __svml_sin2 (__m128d); + extern __m128 V_VECTORCALL __svml_cosf4 (__m128); + extern __m128d V_VECTORCALL __svml_cos2 (__m128d); + extern __m128 V_VECTORCALL __svml_sincosf4 (__m128); // cos returned in xmm1 + extern __m128d V_VECTORCALL __svml_sincos2 (__m128d); // cos returned in xmm1 + extern __m128 V_VECTORCALL __svml_tanf4 (__m128); + extern __m128d V_VECTORCALL __svml_tan2 (__m128d); + extern __m128 V_VECTORCALL __svml_asinf4 (__m128); + extern __m128d V_VECTORCALL __svml_asin2 (__m128d); + extern __m128 V_VECTORCALL __svml_acosf4 (__m128); + extern __m128d V_VECTORCALL __svml_acos2 (__m128d); + extern __m128 V_VECTORCALL __svml_atanf4 (__m128); + extern __m128d V_VECTORCALL __svml_atan2 (__m128d); + extern __m128 V_VECTORCALL __svml_atan2f4 (__m128, __m128); + extern __m128d V_VECTORCALL __svml_atan22 (__m128d, __m128d); + extern __m128 V_VECTORCALL __svml_sinhf4 (__m128); + extern __m128d V_VECTORCALL __svml_sinh2 (__m128d); + extern __m128 V_VECTORCALL __svml_coshf4 (__m128); + extern __m128d V_VECTORCALL __svml_cosh2 (__m128d); + extern __m128 V_VECTORCALL __svml_tanhf4 (__m128); + extern __m128d V_VECTORCALL __svml_tanh2 (__m128d); + extern __m128 V_VECTORCALL __svml_asinhf4 (__m128); + extern __m128d V_VECTORCALL __svml_asinh2 (__m128d); + extern __m128 V_VECTORCALL __svml_acoshf4 (__m128); + extern __m128d V_VECTORCALL __svml_acosh2 (__m128d); + extern __m128 V_VECTORCALL __svml_atanhf4 (__m128); + extern __m128d V_VECTORCALL __svml_atanh2 (__m128d); + extern __m128 V_VECTORCALL __svml_erff4 (__m128); + extern __m128d V_VECTORCALL __svml_erf2 (__m128d); + extern __m128 V_VECTORCALL __svml_erfcf4 
(__m128); + extern __m128d V_VECTORCALL __svml_erfc2 (__m128d); + extern __m128 V_VECTORCALL __svml_erfinvf4 (__m128); + extern __m128d V_VECTORCALL __svml_erfinv2 (__m128d); + extern __m128 V_VECTORCALL __svml_cdfnormf4 (__m128); + extern __m128d V_VECTORCALL __svml_cdfnorm2 (__m128d); + extern __m128 V_VECTORCALL __svml_cdfnorminvf4(__m128); + extern __m128d V_VECTORCALL __svml_cdfnorminv2 (__m128d); + extern __m128 V_VECTORCALL __svml_cexpf4 (__m128); + extern __m128d V_VECTORCALL __svml_cexp2 (__m128d); +} + + +/***************************************************************************** +* +* Function definitions +* +*****************************************************************************/ + +// exponential and power functions +static inline Vec4f exp (Vec4f const x) { // exponential function + return __svml_expf4(x); +} +static inline Vec2d exp (Vec2d const x) { // exponential function + return __svml_exp2(x); +} + +static inline Vec4f expm1 (Vec4f const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return __svml_expm1f4(x); +} +static inline Vec2d expm1 (Vec2d const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return __svml_expm12(x); +} + +static inline Vec4f exp2 (Vec4f const x) { // pow(2,x) + return __svml_exp2f4(x); +} +static inline Vec2d exp2 (Vec2d const x) { // pow(2,x) + return __svml_exp22(x); +} + +static inline Vec4f exp10 (Vec4f const x) { // pow(10,x) + return __svml_exp10f4(x); +} +static inline Vec2d exp10 (Vec2d const x) { // pow(10,x) + return __svml_exp102(x); +} + +static inline Vec4f pow (Vec4f const a, Vec4f const b) { // pow(a,b) = a to the power of b + return __svml_powf4(a,b); +} + +static inline Vec4f pow (Vec4f const a, float const b) { // pow(a,b) = a to the power of b + return __svml_powf4(a,Vec4f(b)); +} +static inline Vec2d pow (Vec2d const a, Vec2d const b) { // pow(a,b) = a to the power of b + return __svml_pow2(a,b); +} +static inline Vec2d pow (Vec2d const a, double const b) { // pow(a,b) = a to the power of b + return __svml_pow2(a,Vec2d(b)); +} + +static inline Vec4f cbrt (Vec4f const x) { // pow(x,1/3) + return __svml_cbrtf4(x); +} +static inline Vec2d cbrt (Vec2d const x) { // pow(x,1/3) + return __svml_cbrt2(x); +} + +// logarithms +static inline Vec4f log (Vec4f const x) { // natural logarithm + return __svml_logf4(x); +} +static inline Vec2d log (Vec2d const x) { // natural logarithm + return __svml_log2(x); +} + +static inline Vec4f log1p (Vec4f const x) { // log(1+x) + return __svml_log1pf4(x); +} +static inline Vec2d log1p (Vec2d const x) { // log(1+x) + return __svml_log1p2(x); +} + +static inline Vec4f log2 (Vec4f const x) { // logarithm base 2 + return __svml_log2f4(x); +} +static inline Vec2d log2 (Vec2d const x) { // logarithm base 2 + return __svml_log22(x); +} + +static inline Vec4f log10 (Vec4f const x) { // logarithm base 10 + return __svml_log10f4(x); +} +static inline Vec2d log10 (Vec2d const x) { // logarithm base 10 + return __svml_log102(x); +} + +// trigonometric functions (angles in radians) +static inline Vec4f sin (Vec4f const x) { // sine + return __svml_sinf4(x); +} +static inline Vec2d sin (Vec2d const x) { // sine + return __svml_sin2(x); +} + +static inline Vec4f cos (Vec4f const x) { // cosine + return __svml_cosf4(x); +} +static inline Vec2d cos (Vec2d const x) { // cosine + return __svml_cos2(x); +} + +#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER) +// no inline assembly in 64 bit MS compiler +static inline Vec4f sincos 
(Vec4f * pcos, Vec4f const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m128 r_sin, r_cos; + r_sin = __svml_sincosf4(x); +#if defined(__unix__) || defined(__GNUC__) + // __asm__ ( "call V_VECTORCALL __svml_sincosf4 \n movaps %%xmm0, %0 \n movaps %%xmm1, %1" : "=m"(r_sin), "=m"(r_cos) : "xmm0"(x) ); + __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(r_cos)); +#else // Windows + _asm movaps r_cos, xmm1; +#endif + *pcos = r_cos; + return r_sin; +} +static inline Vec2d sincos (Vec2d * pcos, Vec2d const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m128d r_sin, r_cos; + r_sin = __svml_sincos2(x); +#if defined(__unix__) || defined(__GNUC__) + __asm__ __volatile__ ( "movaps %%xmm1, %0":"=m"(r_cos)); +#else // Windows + _asm movapd r_cos, xmm1; +#endif + *pcos = r_cos; + return r_sin; +} +#endif // inline assembly available + +static inline Vec4f tan (Vec4f const x) { // tangent + return __svml_tanf4(x); +} +static inline Vec2d tan (Vec2d const x) { // tangent + return __svml_tan2(x); +} + +// inverse trigonometric functions +static inline Vec4f asin (Vec4f const x) { // inverse sine + return __svml_asinf4(x); +} +static inline Vec2d asin (Vec2d const x) { // inverse sine + return __svml_asin2(x); +} + +static inline Vec4f acos (Vec4f const x) { // inverse cosine + return __svml_acosf4(x); +} +static inline Vec2d acos (Vec2d const x) { // inverse cosine + return __svml_acos2(x); +} + +static inline Vec4f atan (Vec4f const x) { // inverse tangent + return __svml_atanf4(x); +} +static inline Vec2d atan (Vec2d const x) { // inverse tangent + return __svml_atan2(x); +} + +static inline Vec4f atan2 (Vec4f const a, Vec4f const b) { // inverse tangent of a/b + return __svml_atan2f4(a,b); +} +static inline Vec2d atan2 (Vec2d const a, Vec2d const b) { // inverse tangent of a/b + return __svml_atan22(a,b); +} + +// hyperbolic functions and inverse hyperbolic functions +static inline Vec4f sinh (Vec4f const x) { // hyperbolic sine + return __svml_sinhf4(x); +} +static inline Vec2d sinh (Vec2d const x) { // hyperbolic sine + return __svml_sinh2(x); +} + +static inline Vec4f cosh (Vec4f const x) { // hyperbolic cosine + return __svml_coshf4(x); +} +static inline Vec2d cosh (Vec2d const x) { // hyperbolic cosine + return __svml_cosh2(x); +} + +static inline Vec4f tanh (Vec4f const x) { // hyperbolic tangent + return __svml_tanhf4(x); +} +static inline Vec2d tanh (Vec2d const x) { // hyperbolic tangent + return __svml_tanh2(x); +} + +static inline Vec4f asinh (Vec4f const x) { // inverse hyperbolic sine + return __svml_asinhf4(x); +} +static inline Vec2d asinh (Vec2d const x) { // inverse hyperbolic sine + return __svml_asinh2(x); +} + +static inline Vec4f acosh (Vec4f const x) { // inverse hyperbolic cosine + return __svml_acoshf4(x); +} +static inline Vec2d acosh (Vec2d const x) { // inverse hyperbolic cosine + return __svml_acosh2(x); +} + +static inline Vec4f atanh (Vec4f const x) { // inverse hyperbolic tangent + return __svml_atanhf4(x); +} +static inline Vec2d atanh (Vec2d const x) { // inverse hyperbolic tangent + return __svml_atanh2(x); +} + +// error function +static inline Vec4f erf (Vec4f const x) { // error function + return __svml_erff4(x); +} +static inline Vec2d erf (Vec2d const x) { // error function + return __svml_erf2(x); +} + +static inline Vec4f erfc (Vec4f const x) { // error function complement + return __svml_erfcf4(x); +} +static inline Vec2d erfc (Vec2d const x) { // error function complement + return __svml_erfc2(x); +} + +static inline Vec4f erfinv 
(Vec4f const x) { // inverse error function + return __svml_erfinvf4(x); +} +static inline Vec2d erfinv (Vec2d const x) { // inverse error function + return __svml_erfinv2(x); +} + +static inline Vec4f cdfnorm (Vec4f const x) { // cumulative normal distribution function + return __svml_cdfnormf4(x); +} +static inline Vec2d cdfnorm (Vec2d const x) { // cumulative normal distribution function + return __svml_cdfnorm2(x); +} + +static inline Vec4f cdfnorminv (Vec4f const x) { // inverse cumulative normal distribution function + return __svml_cdfnorminvf4(x); +} +static inline Vec2d cdfnorminv (Vec2d const x) { // inverse cumulative normal distribution function + return __svml_cdfnorminv2(x); +} + +#endif // __INTEL_COMPILER + +#if defined (MAX_VECTOR_SIZE) && MAX_VECTOR_SIZE >= 256 // 256 bit vectors + +#if defined (VECTORF256_H) // 256-bit vector registers supported + +#ifdef __INTEL_COMPILER +/***************************************************************************** +* +* 256-bit vector functions using Intel compiler +* +*****************************************************************************/ +// exponential and power functions +static inline Vec8f exp(Vec8f const x) { // exponential function + return _mm256_exp_ps(x); +} +static inline Vec4d exp(Vec4d const x) { // exponential function + return _mm256_exp_pd(x); +} +static inline Vec8f expm1(Vec8f const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return _mm256_expm1_ps(x); +} +static inline Vec4d expm1(Vec4d const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return _mm256_expm1_pd(x); +} +static inline Vec8f exp2(Vec8f const x) { // pow(2,x) + return _mm256_exp2_ps(x); +} +static inline Vec4d exp2(Vec4d const x) { // pow(2,x) + return _mm256_exp2_pd(x); +} +static inline Vec8f exp10(Vec8f const x) { // pow(10,x) + return _mm256_exp10_ps(x); +} +static inline Vec4d exp10(Vec4d const x) { // pow(10,x) + return _mm256_exp10_pd(x); +} +static inline Vec8f pow(Vec8f const a, Vec8f const b) { // pow(a,b) = a to the power of b + return _mm256_pow_ps(a, b); +} +static inline Vec8f pow(Vec8f const a, float const b) { // pow(a,b) = a to the power of b + return _mm256_pow_ps(a, Vec8f(b)); +} +static inline Vec4d pow(Vec4d const a, Vec4d const b) { // pow(a,b) = a to the power of b + return _mm256_pow_pd(a, b); +} +static inline Vec4d pow(Vec4d const a, double const b) { // pow(a,b) = a to the power of b + return _mm256_pow_pd(a, Vec4d(b)); +} +static inline Vec8f cbrt(Vec8f const x) { // pow(x,1/3) + return _mm256_cbrt_ps(x); +} +static inline Vec4d cbrt(Vec4d const x) { // pow(x,1/3) + return _mm256_cbrt_pd(x); +} +// logarithms +static inline Vec8f log(Vec8f const x) { // natural logarithm + return _mm256_log_ps(x); +} +static inline Vec4d log(Vec4d const x) { // natural logarithm + return _mm256_log_pd(x); +} +static inline Vec8f log1p(Vec8f const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return _mm256_log1p_ps(x); +} +static inline Vec4d log1p(Vec4d const x) { // log(1+x). 
Avoids loss of precision if 1+x is close to 1 + return _mm256_log1p_pd(x); +} +static inline Vec8f log2(Vec8f const x) { // logarithm base 2 + return _mm256_log2_ps(x); +} +static inline Vec4d log2(Vec4d const x) { // logarithm base 2 + return _mm256_log2_pd(x); +} +static inline Vec8f log10(Vec8f const x) { // logarithm base 10 + return _mm256_log10_ps(x); +} +static inline Vec4d log10(Vec4d const x) { // logarithm base 10 + return _mm256_log10_pd(x); +} + +// trigonometric functions +static inline Vec8f sin(Vec8f const x) { // sine + return _mm256_sin_ps(x); +} +static inline Vec4d sin(Vec4d const x) { // sine + return _mm256_sin_pd(x); +} +static inline Vec8f cos(Vec8f const x) { // cosine + return _mm256_cos_ps(x); +} +static inline Vec4d cos(Vec4d const x) { // cosine + return _mm256_cos_pd(x); +} +static inline Vec8f sincos(Vec8f * pcos, Vec8f const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m256 r_sin, r_cos; + r_sin = _mm256_sincos_ps(&r_cos, x); + *pcos = r_cos; + return r_sin; +} +static inline Vec4d sincos(Vec4d * pcos, Vec4d const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m256d r_sin, r_cos; + r_sin = _mm256_sincos_pd(&r_cos, x); + *pcos = r_cos; + return r_sin; +} +static inline Vec8f tan(Vec8f const x) { // tangent + return _mm256_tan_ps(x); +} +static inline Vec4d tan(Vec4d const x) { // tangent + return _mm256_tan_pd(x); +} + +// inverse trigonometric functions +static inline Vec8f asin(Vec8f const x) { // inverse sine + return _mm256_asin_ps(x); +} +static inline Vec4d asin(Vec4d const x) { // inverse sine + return _mm256_asin_pd(x); +} + +static inline Vec8f acos(Vec8f const x) { // inverse cosine + return _mm256_acos_ps(x); +} +static inline Vec4d acos(Vec4d const x) { // inverse cosine + return _mm256_acos_pd(x); +} + +static inline Vec8f atan(Vec8f const x) { // inverse tangent + return _mm256_atan_ps(x); +} +static inline Vec4d atan(Vec4d const x) { // inverse tangent + return _mm256_atan_pd(x); +} +static inline Vec8f atan2(Vec8f const a, Vec8f const b) { // inverse tangent of a/b + return _mm256_atan2_ps(a, b); +} +static inline Vec4d atan2(Vec4d const a, Vec4d const b) { // inverse tangent of a/b + return _mm256_atan2_pd(a, b); +} + +// hyperbolic functions and inverse hyperbolic functions +static inline Vec8f sinh(Vec8f const x) { // hyperbolic sine + return _mm256_sinh_ps(x); +} +static inline Vec4d sinh(Vec4d const x) { // hyperbolic sine + return _mm256_sinh_pd(x); +} +static inline Vec8f cosh(Vec8f const x) { // hyperbolic cosine + return _mm256_cosh_ps(x); +} +static inline Vec4d cosh(Vec4d const x) { // hyperbolic cosine + return _mm256_cosh_pd(x); +} +static inline Vec8f tanh(Vec8f const x) { // hyperbolic tangent + return _mm256_tanh_ps(x); +} +static inline Vec4d tanh(Vec4d const x) { // hyperbolic tangent + return _mm256_tanh_pd(x); +} +static inline Vec8f asinh(Vec8f const x) { // inverse hyperbolic sine + return _mm256_asinh_ps(x); +} +static inline Vec4d asinh(Vec4d const x) { // inverse hyperbolic sine + return _mm256_asinh_pd(x); +} +static inline Vec8f acosh(Vec8f const x) { // inverse hyperbolic cosine + return _mm256_acosh_ps(x); +} +static inline Vec4d acosh(Vec4d const x) { // inverse hyperbolic cosine + return _mm256_acosh_pd(x); +} +static inline Vec8f atanh(Vec8f const x) { // inverse hyperbolic tangent + return _mm256_atanh_ps(x); +} +static inline Vec4d atanh(Vec4d const x) { // inverse hyperbolic tangent + return _mm256_atanh_pd(x); +} + +// error function +static inline Vec8f erf(Vec8f const x) { // 
error function + return _mm256_erf_ps(x); +} +static inline Vec4d erf(Vec4d const x) { // error function + return _mm256_erf_pd(x); +} +static inline Vec8f erfc(Vec8f const x) { // error function complement + return _mm256_erfc_ps(x); +} +static inline Vec4d erfc(Vec4d const x) { // error function complement + return _mm256_erfc_pd(x); +} +static inline Vec8f erfinv(Vec8f const x) { // inverse error function + return _mm256_erfinv_ps(x); +} +static inline Vec4d erfinv(Vec4d const x) { // inverse error function + return _mm256_erfinv_pd(x); +} + +static inline Vec8f cdfnorm(Vec8f const x) { // cumulative normal distribution function + return _mm256_cdfnorm_ps(x); +} +static inline Vec4d cdfnorm(Vec4d const x) { // cumulative normal distribution function + return _mm256_cdfnorm_pd(x); +} +static inline Vec8f cdfnorminv(Vec8f const x) { // inverse cumulative normal distribution function + return _mm256_cdfnorminv_ps(x); +} +static inline Vec4d cdfnorminv(Vec4d const x) { // inverse cumulative normal distribution function + return _mm256_cdfnorminv_pd(x); +} + + +#else // __INTEL_COMPILER +/***************************************************************************** +* +* 256-bit vector functions using other compiler than Intel +* +*****************************************************************************/ +// External function prototypes, 256-bit vectors +extern "C" { + extern __m256 V_VECTORCALL __svml_expf8 (__m256); + extern __m256d V_VECTORCALL __svml_exp4 (__m256d); + extern __m256 V_VECTORCALL __svml_expm1f8 (__m256); + extern __m256d V_VECTORCALL __svml_expm14 (__m256d); + extern __m256 V_VECTORCALL __svml_exp2f8 (__m256); + extern __m256d V_VECTORCALL __svml_exp24 (__m256d); + extern __m256 V_VECTORCALL __svml_exp10f8 (__m256); + extern __m256d V_VECTORCALL __svml_exp104 (__m256d); + extern __m256 V_VECTORCALL __svml_powf8 (__m256, __m256); + extern __m256d V_VECTORCALL __svml_pow4 (__m256d, __m256d); + extern __m256 V_VECTORCALL __svml_cbrtf8 (__m256); + extern __m256d V_VECTORCALL __svml_cbrt4 (__m256d); + extern __m256 V_VECTORCALL __svml_invsqrtf8 (__m256); + extern __m256d V_VECTORCALL __svml_invsqrt4 (__m256d); + extern __m256 V_VECTORCALL __svml_logf8 (__m256); + extern __m256d V_VECTORCALL __svml_log4 (__m256d); + extern __m256 V_VECTORCALL __svml_log1pf8 (__m256); + extern __m256d V_VECTORCALL __svml_log1p4 (__m256d); + extern __m256 V_VECTORCALL __svml_log2f8 (__m256); + extern __m256d V_VECTORCALL __svml_log24 (__m256d); + extern __m256 V_VECTORCALL __svml_log10f8 (__m256); + extern __m256d V_VECTORCALL __svml_log104 (__m256d); + extern __m256 V_VECTORCALL __svml_sinf8 (__m256); + extern __m256d V_VECTORCALL __svml_sin4 (__m256d); + extern __m256 V_VECTORCALL __svml_cosf8 (__m256); + extern __m256d V_VECTORCALL __svml_cos4 (__m256d); + extern __m256 V_VECTORCALL __svml_sincosf8 (__m256); // cos returned in ymm1 + extern __m256d V_VECTORCALL __svml_sincos4 (__m256d); // cos returned in ymm1 + extern __m256 V_VECTORCALL __svml_tanf8 (__m256); + extern __m256d V_VECTORCALL __svml_tan4 (__m256d); + extern __m256 V_VECTORCALL __svml_asinf8 (__m256); + extern __m256d V_VECTORCALL __svml_asin4 (__m256d); + extern __m256 V_VECTORCALL __svml_acosf8 (__m256); + extern __m256d V_VECTORCALL __svml_acos4 (__m256d); + extern __m256 V_VECTORCALL __svml_atanf8 (__m256); + extern __m256d V_VECTORCALL __svml_atan4 (__m256d); + extern __m256 V_VECTORCALL __svml_atan2f8 (__m256, __m256); + extern __m256d V_VECTORCALL __svml_atan24 (__m256d, __m256d); + extern __m256 V_VECTORCALL 
__svml_sinhf8 (__m256); + extern __m256d V_VECTORCALL __svml_sinh4 (__m256d); + extern __m256 V_VECTORCALL __svml_coshf8 (__m256); + extern __m256d V_VECTORCALL __svml_cosh4 (__m256d); + extern __m256 V_VECTORCALL __svml_tanhf8 (__m256); + extern __m256d V_VECTORCALL __svml_tanh4 (__m256d); + extern __m256 V_VECTORCALL __svml_asinhf8 (__m256); + extern __m256d V_VECTORCALL __svml_asinh4 (__m256d); + extern __m256 V_VECTORCALL __svml_acoshf8 (__m256); + extern __m256d V_VECTORCALL __svml_acosh4 (__m256d); + extern __m256 V_VECTORCALL __svml_atanhf8 (__m256); + extern __m256d V_VECTORCALL __svml_atanh4 (__m256d); + extern __m256 V_VECTORCALL __svml_erff8 (__m256); + extern __m256d V_VECTORCALL __svml_erf4 (__m256d); + extern __m256 V_VECTORCALL __svml_erfcf8 (__m256); + extern __m256d V_VECTORCALL __svml_erfc4 (__m256d); + extern __m256 V_VECTORCALL __svml_erfinvf8 (__m256); + extern __m256d V_VECTORCALL __svml_erfinv4 (__m256d); + extern __m256 V_VECTORCALL __svml_cdfnorminvf8(__m256); + extern __m256d V_VECTORCALL __svml_cdfnorminv4 (__m256d); + extern __m256 V_VECTORCALL __svml_cdfnormf8 (__m256); + extern __m256d V_VECTORCALL __svml_cdfnorm4 (__m256d); + //extern __m256 V_VECTORCALL __svml_cexpf8 (__m256); + //extern __m256d V_VECTORCALL __svml_cexp4 (__m256d); +} + + +// exponential and power functions +static inline Vec8f exp (Vec8f const x) { // exponential function + return __svml_expf8(x); +} +static inline Vec4d exp (Vec4d const x) { // exponential function + return __svml_exp4(x); +} +static inline Vec8f expm1 (Vec8f const x) { // exp(x)-1 + return __svml_expm1f8(x); +} +static inline Vec4d expm1 (Vec4d const x) { // exp(x)-1 + return __svml_expm14(x); +} +static inline Vec8f exp2 (Vec8f const x) { // pow(2,x) + return __svml_exp2f8(x); +} +static inline Vec4d exp2 (Vec4d const x) { // pow(2,x) + return __svml_exp24(x); +} +static inline Vec8f exp10 (Vec8f const x) { // pow(10,x) + return __svml_exp10f8(x); +} +static inline Vec4d exp10 (Vec4d const x) { // pow(10,x) + return __svml_exp104(x); +} +static inline Vec8f pow (Vec8f const a, Vec8f const b) { // pow(a,b) = a to the power of b + return __svml_powf8(a,b); +} +static inline Vec8f pow (Vec8f const a, float const b) { // pow(a,b) = a to the power of b + return __svml_powf8(a,Vec8f(b)); +} +static inline Vec4d pow (Vec4d const a, Vec4d const b) { // pow(a,b) = a to the power of b + return __svml_pow4(a,b); +} +static inline Vec4d pow (Vec4d const a, double const b) { // pow(a,b) = a to the power of b + return __svml_pow4(a,Vec4d(b)); +} +static inline Vec8f cbrt (Vec8f const x) { // pow(x,1/3) + return __svml_cbrtf8(x); +} +static inline Vec4d cbrt (Vec4d const x) { // pow(x,1/3) + return __svml_cbrt4(x); +} + +// logarithms +static inline Vec8f log (Vec8f const x) { // natural logarithm + return __svml_logf8(x); +} +static inline Vec4d log (Vec4d const x) { // natural logarithm + return __svml_log4(x); +} +static inline Vec8f log1p (Vec8f const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return __svml_log1pf8(x); +} +static inline Vec4d log1p (Vec4d const x) { // log(1+x). 
Avoids loss of precision if 1+x is close to 1 + return __svml_log1p4(x); +} +static inline Vec8f log2 (Vec8f const x) { // logarithm base 2 + return __svml_log2f8(x); +} +static inline Vec4d log2 (Vec4d const x) { // logarithm base 2 + return __svml_log24(x); +} +static inline Vec8f log10 (Vec8f const x) { // logarithm base 10 + return __svml_log10f8(x); +} +static inline Vec4d log10 (Vec4d const x) { // logarithm base 10 + return __svml_log104(x); +} + +// trigonometric functions (angles in radians) +static inline Vec8f sin (Vec8f const x) { // sine + return __svml_sinf8(x); +} +static inline Vec4d sin (Vec4d const x) { // sine + return __svml_sin4(x); +} +static inline Vec8f cos (Vec8f const x) { // cosine + return __svml_cosf8(x); +} +static inline Vec4d cos (Vec4d const x) { // cosine + return __svml_cos4(x); +} + +#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER) +// no inline assembly in 64 bit MS compiler +static inline Vec8f sincos (Vec8f * pcos, Vec8f const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m256 r_sin, r_cos; + r_sin = __svml_sincosf8(x); +#if defined(__unix__) || defined(__GNUC__) + __asm__ __volatile__ ( "vmovaps %%ymm1, %0":"=m"(r_cos)); +#else // Windows + _asm vmovaps r_cos, ymm1; +#endif + *pcos = r_cos; + return r_sin; +} +static inline Vec4d sincos (Vec4d * pcos, Vec4d const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m256d r_sin, r_cos; + r_sin = __svml_sincos4(x); +#if defined(__unix__) || defined(__GNUC__) + __asm__ __volatile__ ( "vmovaps %%ymm1, %0":"=m"(r_cos)); +#else // Windows + _asm vmovapd r_cos, ymm1; +#endif + *pcos = r_cos; + return r_sin; +} +#endif // inline assembly available + +static inline Vec8f tan (Vec8f const x) { // tangent + return __svml_tanf8(x); +} +static inline Vec4d tan (Vec4d const x) { // tangent + return __svml_tan4(x); +} + +// inverse trigonometric functions +static inline Vec8f asin (Vec8f const x) { // inverse sine + return __svml_asinf8(x); +} +static inline Vec4d asin (Vec4d const x) { // inverse sine + return __svml_asin4(x); +} +static inline Vec8f acos (Vec8f const x) { // inverse cosine + return __svml_acosf8(x); +} +static inline Vec4d acos (Vec4d const x) { // inverse cosine + return __svml_acos4(x); +} +static inline Vec8f atan (Vec8f const x) { // inverse tangent + return __svml_atanf8(x); +} +static inline Vec4d atan (Vec4d const x) { // inverse tangent + return __svml_atan4(x); +} +static inline Vec8f atan2 (Vec8f const a, Vec8f const b) { // inverse tangent of a/b + return __svml_atan2f8(a,b); +} +static inline Vec4d atan2 (Vec4d const a, Vec4d const b) { // inverse tangent of a/b + return __svml_atan24(a,b); +} + +// hyperbolic functions and inverse hyperbolic functions +static inline Vec8f sinh (Vec8f const x) { // hyperbolic sine + return __svml_sinhf8(x); +} +static inline Vec4d sinh (Vec4d const x) { // hyperbolic sine + return __svml_sinh4(x); +} +static inline Vec8f cosh (Vec8f const x) { // hyperbolic cosine + return __svml_coshf8(x); +} +static inline Vec4d cosh (Vec4d const x) { // hyperbolic cosine + return __svml_cosh4(x); +} +static inline Vec8f tanh (Vec8f const x) { // hyperbolic tangent + return __svml_tanhf8(x); +} +static inline Vec4d tanh (Vec4d const x) { // hyperbolic tangent + return __svml_tanh4(x); +} +static inline Vec8f asinh (Vec8f const x) { // inverse hyperbolic sine + return __svml_asinhf8(x); +} +static inline Vec4d asinh (Vec4d const x) { // inverse hyperbolic sine + return __svml_asinh4(x); +} +static 
inline Vec8f acosh (Vec8f const x) { // inverse hyperbolic cosine + return __svml_acoshf8(x); +} +static inline Vec4d acosh (Vec4d const x) { // inverse hyperbolic cosine + return __svml_acosh4(x); +} + +static inline Vec8f atanh (Vec8f const x) { // inverse hyperbolic tangent + return __svml_atanhf8(x); +} +static inline Vec4d atanh (Vec4d const x) { // inverse hyperbolic tangent + return __svml_atanh4(x); +} + +// error function +static inline Vec8f erf (Vec8f const x) { // error function + return __svml_erff8(x); +} +static inline Vec4d erf (Vec4d const x) { // error function + return __svml_erf4(x); +} +static inline Vec8f erfc (Vec8f const x) { // error function complement + return __svml_erfcf8(x); +} +static inline Vec4d erfc (Vec4d const x) { // error function complement + return __svml_erfc4(x); +} +static inline Vec8f erfinv (Vec8f const x) { // inverse error function + return __svml_erfinvf8(x); +} +static inline Vec4d erfinv (Vec4d const x) { // inverse error function + return __svml_erfinv4(x); +} + +static inline Vec8f cdfnorm (Vec8f const x) { // cumulative normal distribution function + return __svml_cdfnormf8(x); +} +static inline Vec4d cdfnorm (Vec4d const x) { // cumulative normal distribution function + return __svml_cdfnorm4(x); +} +static inline Vec8f cdfnorminv (Vec8f const x) { // inverse cumulative normal distribution function + return __svml_cdfnorminvf8(x); +} +static inline Vec4d cdfnorminv (Vec4d const x) { // inverse cumulative normal distribution function + return __svml_cdfnorminv4(x); +} + +#endif // __INTEL_COMPILER + +#else // VECTORF256_H + +/***************************************************************************** +* +* 256-bit vector functions emulated with 128-bit vectors +* +*****************************************************************************/ +// exponential and power functions +static inline Vec8f exp (Vec8f const x) { // exponential function + return Vec8f(exp(x.get_low()), exp(x.get_high())); +} +static inline Vec4d exp (Vec4d const x) { // exponential function + return Vec4d(exp(x.get_low()), exp(x.get_high())); +} +static inline Vec8f expm1 (Vec8f const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return Vec8f(expm1(x.get_low()), expm1(x.get_high())); +} +static inline Vec4d expm1 (Vec4d const x) { // exp(x)-1. 
Avoids loss of precision if x is close to 1 + return Vec4d(expm1(x.get_low()), expm1(x.get_high())); +} +static inline Vec8f exp2 (Vec8f const x) { // pow(2,x) + return Vec8f(exp2(x.get_low()), exp2(x.get_high())); +} +static inline Vec4d exp2 (Vec4d const x) { // pow(2,x) + return Vec4d(exp2(x.get_low()), exp2(x.get_high())); +} +static inline Vec8f exp10 (Vec8f const x) { // pow(10,x) + return Vec8f(exp10(x.get_low()), exp10(x.get_high())); +} +static inline Vec4d exp10 (Vec4d const x) { // pow(10,x) + return Vec4d(exp10(x.get_low()), exp10(x.get_high())); +} +static inline Vec8f pow (Vec8f const a, Vec8f const b) { // pow(a,b) = a to the power of b + return Vec8f(pow(a.get_low(),b.get_low()), pow(a.get_high(),b.get_high())); +} +static inline Vec8f pow (Vec8f const a, float const b) { // pow(a,b) = a to the power of b + return Vec8f(pow(a.get_low(),b), pow(a.get_high(),b)); +} +static inline Vec4d pow (Vec4d const a, Vec4d const b) { // pow(a,b) = a to the power of b + return Vec4d(pow(a.get_low(),b.get_low()), pow(a.get_high(),b.get_high())); +} +static inline Vec4d pow (Vec4d const a, double const b) { // pow(a,b) = a to the power of b + return Vec4d(pow(a.get_low(),b), pow(a.get_high(),b)); +} +static inline Vec8f cbrt (Vec8f const x) { // pow(x,1/3) + return Vec8f(cbrt(x.get_low()), cbrt(x.get_high())); +} +static inline Vec4d cbrt (Vec4d const x) { // pow(x,1/3) + return Vec4d(cbrt(x.get_low()), cbrt(x.get_high())); +} + +// logarithms +static inline Vec8f log (Vec8f const x) { // natural logarithm + return Vec8f(log(x.get_low()), log(x.get_high())); +} +static inline Vec4d log (Vec4d const x) { // natural logarithm + return Vec4d(log(x.get_low()), log(x.get_high())); +} +static inline Vec8f log1p (Vec8f const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return Vec8f(log1p(x.get_low()), log1p(x.get_high())); +} +static inline Vec4d log1p (Vec4d const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return Vec4d(log1p(x.get_low()), log1p(x.get_high())); +} +static inline Vec8f log2 (Vec8f const x) { // logarithm base 2 + return Vec8f(log2(x.get_low()), log2(x.get_high())); +} +static inline Vec4d log2 (Vec4d const x) { // logarithm base 2 + return Vec4d(log2(x.get_low()), log2(x.get_high())); +} +static inline Vec8f log10 (Vec8f const x) { // logarithm base 10 + return Vec8f(log10(x.get_low()), log10(x.get_high())); +} +static inline Vec4d log10 (Vec4d const x) { // logarithm base 10 + return Vec4d(log10(x.get_low()), log10(x.get_high())); +} + +// trigonometric functions (angles in radians) +static inline Vec8f sin (Vec8f const x) { // sine + return Vec8f(sin(x.get_low()), sin(x.get_high())); +} +static inline Vec4d sin (Vec4d const x) { // sine + return Vec4d(sin(x.get_low()), sin(x.get_high())); +} +static inline Vec8f cos (Vec8f const x) { // cosine + return Vec8f(cos(x.get_low()), cos(x.get_high())); +} +static inline Vec4d cos (Vec4d const x) { // cosine + return Vec4d(cos(x.get_low()), cos(x.get_high())); +} + +#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER) +// no inline assembly in 64 bit MS compiler +static inline Vec8f sincos (Vec8f * pcos, Vec8f const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + Vec4f r_sin0, r_sin1, r_cos0, r_cos1; + r_sin0 = sincos(&r_cos0, x.get_low()); + r_sin1 = sincos(&r_cos1, x.get_high()); + *pcos = Vec8f(r_cos0, r_cos1); + return Vec8f(r_sin0, r_sin1); +} +static inline Vec4d sincos (Vec4d * pcos, Vec4d const x) { // sine and cosine. 
sin(x) returned, cos(x) in pcos + Vec2d r_sin0, r_sin1, r_cos0, r_cos1; + r_sin0 = sincos(&r_cos0, x.get_low()); + r_sin1 = sincos(&r_cos1, x.get_high()); + *pcos = Vec4d(r_cos0, r_cos1); + return Vec4d(r_sin0, r_sin1); +} +#endif // inline assembly available + +static inline Vec8f tan (Vec8f const x) { // tangent + return Vec8f(tan(x.get_low()), tan(x.get_high())); +} +static inline Vec4d tan (Vec4d const x) { // tangent + return Vec4d(tan(x.get_low()), tan(x.get_high())); +} + +// inverse trigonometric functions +static inline Vec8f asin (Vec8f const x) { // inverse sine + return Vec8f(asin(x.get_low()), asin(x.get_high())); +} +static inline Vec4d asin (Vec4d const x) { // inverse sine + return Vec4d(asin(x.get_low()), asin(x.get_high())); +} +static inline Vec8f acos (Vec8f const x) { // inverse cosine + return Vec8f(acos(x.get_low()), acos(x.get_high())); +} +static inline Vec4d acos (Vec4d const x) { // inverse cosine + return Vec4d(acos(x.get_low()), acos(x.get_high())); +} +static inline Vec8f atan (Vec8f const x) { // inverse tangent + return Vec8f(atan(x.get_low()), atan(x.get_high())); +} +static inline Vec4d atan (Vec4d const x) { // inverse tangent + return Vec4d(atan(x.get_low()), atan(x.get_high())); +} +static inline Vec8f atan2 (Vec8f const a, Vec8f const b) { // inverse tangent of a/b + return Vec8f(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high())); +} +static inline Vec4d atan2 (Vec4d const a, Vec4d const b) { // inverse tangent of a/b + return Vec4d(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high())); +} + +// hyperbolic functions +static inline Vec8f sinh (Vec8f const x) { // hyperbolic sine + return Vec8f(sinh(x.get_low()), sinh(x.get_high())); +} +static inline Vec4d sinh (Vec4d const x) { // hyperbolic sine + return Vec4d(sinh(x.get_low()), sinh(x.get_high())); +} +static inline Vec8f cosh (Vec8f const x) { // hyperbolic cosine + return Vec8f(cosh(x.get_low()), cosh(x.get_high())); +} +static inline Vec4d cosh (Vec4d const x) { // hyperbolic cosine + return Vec4d(cosh(x.get_low()), cosh(x.get_high())); +} +static inline Vec8f tanh (Vec8f const x) { // hyperbolic tangent + return Vec8f(tanh(x.get_low()), tanh(x.get_high())); +} +static inline Vec4d tanh (Vec4d const x) { // hyperbolic tangent + return Vec4d(tanh(x.get_low()), tanh(x.get_high())); +} + +// inverse hyperbolic functions +static inline Vec8f asinh (Vec8f const x) { // inverse hyperbolic sine + return Vec8f(asinh(x.get_low()), asinh(x.get_high())); +} +static inline Vec4d asinh (Vec4d const x) { // inverse hyperbolic sine + return Vec4d(asinh(x.get_low()), asinh(x.get_high())); +} +static inline Vec8f acosh (Vec8f const x) { // inverse hyperbolic cosine + return Vec8f(acosh(x.get_low()), acosh(x.get_high())); +} +static inline Vec4d acosh (Vec4d const x) { // inverse hyperbolic cosine + return Vec4d(acosh(x.get_low()), acosh(x.get_high())); +} +static inline Vec8f atanh (Vec8f const x) { // inverse hyperbolic tangent + return Vec8f(atanh(x.get_low()), atanh(x.get_high())); +} +static inline Vec4d atanh (Vec4d const x) { // inverse hyperbolic tangent + return Vec4d(atanh(x.get_low()), atanh(x.get_high())); +} + +// error function +static inline Vec8f erf (Vec8f const x) { // error function + return Vec8f(erf(x.get_low()), erf(x.get_high())); +} +static inline Vec4d erf (Vec4d const x) { // error function + return Vec4d(erf(x.get_low()), erf(x.get_high())); +} +static inline Vec8f erfc (Vec8f const x) { // error function complement + return Vec8f(erfc(x.get_low()), 
erfc(x.get_high())); +} +static inline Vec4d erfc (Vec4d const x) { // error function complement + return Vec4d(erfc(x.get_low()), erfc(x.get_high())); +} +static inline Vec8f erfinv (Vec8f const x) { // inverse error function + return Vec8f(erfinv(x.get_low()), erfinv(x.get_high())); +} +static inline Vec4d erfinv (Vec4d const x) { // inverse error function + return Vec4d(erfinv(x.get_low()), erfinv(x.get_high())); +} + +static inline Vec8f cdfnorm (Vec8f const x) { // cumulative normal distribution function + return Vec8f(cdfnorm(x.get_low()), cdfnorm(x.get_high())); +} +static inline Vec4d cdfnorm (Vec4d const x) { // cumulative normal distribution function + return Vec4d(cdfnorm(x.get_low()), cdfnorm(x.get_high())); +} +static inline Vec8f cdfnorminv (Vec8f const x) { // inverse cumulative normal distribution function + return Vec8f(cdfnorminv(x.get_low()), cdfnorminv(x.get_high())); +} +static inline Vec4d cdfnorminv (Vec4d const x) { // inverse cumulative normal distribution function + return Vec4d(cdfnorminv(x.get_low()), cdfnorminv(x.get_high())); +} + +#endif // VECTORF256_H + +#endif // MAX_VECTOR_SIZE >= 256 + +#if defined (MAX_VECTOR_SIZE) && MAX_VECTOR_SIZE >= 512 // 512 bit vectors + +#if defined (VECTORF512_H) // 512-bit vector registers supported + +#ifdef __INTEL_COMPILER +/***************************************************************************** +* +* 512-bit vector functions using Intel compiler +* +*****************************************************************************/ + +// exponential and power functions +static inline Vec16f exp(Vec16f const x) { // exponential function + return _mm512_exp_ps(x); +} +static inline Vec8d exp(Vec8d const x) { // exponential function + return _mm512_exp_pd(x); +} +static inline Vec16f expm1(Vec16f const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return _mm512_expm1_ps(x); +} +static inline Vec8d expm1(Vec8d const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return _mm512_expm1_pd(x); +} +static inline Vec16f exp2(Vec16f const x) { // pow(2,x) + return _mm512_exp2_ps(x); +} +static inline Vec8d exp2(Vec8d const x) { // pow(2,x) + return _mm512_exp2_pd(x); +} +static inline Vec16f exp10(Vec16f const x) { // pow(10,x) + return _mm512_exp10_ps(x); +} +static inline Vec8d exp10(Vec8d const x) { // pow(10,x) + return _mm512_exp10_pd(x); +} +static inline Vec16f pow(Vec16f const a, Vec16f const b) { // pow(a,b) = a to the power of b + return _mm512_pow_ps(a, b); +} +static inline Vec16f pow(Vec16f const a, float const b) { // pow(a,b) = a to the power of b + return _mm512_pow_ps(a, Vec16f(b)); +} +static inline Vec8d pow(Vec8d const a, Vec8d const b) { // pow(a,b) = a to the power of b + return _mm512_pow_pd(a, b); +} +static inline Vec8d pow(Vec8d const a, double const b) { // pow(a,b) = a to the power of b + return _mm512_pow_pd(a, Vec8d(b)); +} +static inline Vec16f cbrt(Vec16f const x) { // pow(x,1/3) + return _mm512_cbrt_ps(x); +} +static inline Vec8d cbrt(Vec8d const x) { // pow(x,1/3) + return _mm512_cbrt_pd(x); +} +// logarithms +static inline Vec16f log(Vec16f const x) { // natural logarithm + return _mm512_log_ps(x); +} +static inline Vec8d log(Vec8d const x) { // natural logarithm + return _mm512_log_pd(x); +} +static inline Vec16f log1p(Vec16f const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return _mm512_log1p_ps(x); +} +static inline Vec8d log1p(Vec8d const x) { // log(1+x). 
Avoids loss of precision if 1+x is close to 1 + return _mm512_log1p_pd(x); +} +static inline Vec16f log2(Vec16f const x) { // logarithm base 2 + return _mm512_log2_ps(x); +} +static inline Vec8d log2(Vec8d const x) { // logarithm base 2 + return _mm512_log2_pd(x); +} +static inline Vec16f log10(Vec16f const x) { // logarithm base 10 + return _mm512_log10_ps(x); +} +static inline Vec8d log10(Vec8d const x) { // logarithm base 10 + return _mm512_log10_pd(x); +} + +// trigonometric functions +static inline Vec16f sin(Vec16f const x) { // sine + return _mm512_sin_ps(x); +} +static inline Vec8d sin(Vec8d const x) { // sine + return _mm512_sin_pd(x); +} +static inline Vec16f cos(Vec16f const x) { // cosine + return _mm512_cos_ps(x); +} +static inline Vec8d cos(Vec8d const x) { // cosine + return _mm512_cos_pd(x); +} +static inline Vec16f sincos(Vec16f * pcos, Vec16f const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m512 r_sin, r_cos; + r_sin = _mm512_sincos_ps(&r_cos, x); + *pcos = r_cos; + return r_sin; +} +static inline Vec8d sincos(Vec8d * pcos, Vec8d const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + __m512d r_sin, r_cos; + r_sin = _mm512_sincos_pd(&r_cos, x); + *pcos = r_cos; + return r_sin; +} +static inline Vec16f tan(Vec16f const x) { // tangent + return _mm512_tan_ps(x); +} +static inline Vec8d tan(Vec8d const x) { // tangent + return _mm512_tan_pd(x); +} + +// inverse trigonometric functions +static inline Vec16f asin(Vec16f const x) { // inverse sine + return _mm512_asin_ps(x); +} +static inline Vec8d asin(Vec8d const x) { // inverse sine + return _mm512_asin_pd(x); +} + +static inline Vec16f acos(Vec16f const x) { // inverse cosine + return _mm512_acos_ps(x); +} +static inline Vec8d acos(Vec8d const x) { // inverse cosine + return _mm512_acos_pd(x); +} + +static inline Vec16f atan(Vec16f const x) { // inverse tangent + return _mm512_atan_ps(x); +} +static inline Vec8d atan(Vec8d const x) { // inverse tangent + return _mm512_atan_pd(x); +} +static inline Vec16f atan2(Vec16f const a, Vec16f const b) { // inverse tangent of a/b + return _mm512_atan2_ps(a, b); +} +static inline Vec8d atan2(Vec8d const a, Vec8d const b) { // inverse tangent of a/b + return _mm512_atan2_pd(a, b); +} + +// hyperbolic functions and inverse hyperbolic functions +static inline Vec16f sinh(Vec16f const x) { // hyperbolic sine + return _mm512_sinh_ps(x); +} +static inline Vec8d sinh(Vec8d const x) { // hyperbolic sine + return _mm512_sinh_pd(x); +} +static inline Vec16f cosh(Vec16f const x) { // hyperbolic cosine + return _mm512_cosh_ps(x); +} +static inline Vec8d cosh(Vec8d const x) { // hyperbolic cosine + return _mm512_cosh_pd(x); +} +static inline Vec16f tanh(Vec16f const x) { // hyperbolic tangent + return _mm512_tanh_ps(x); +} +static inline Vec8d tanh(Vec8d const x) { // hyperbolic tangent + return _mm512_tanh_pd(x); +} +static inline Vec16f asinh(Vec16f const x) { // inverse hyperbolic sine + return _mm512_asinh_ps(x); +} +static inline Vec8d asinh(Vec8d const x) { // inverse hyperbolic sine + return _mm512_asinh_pd(x); +} +static inline Vec16f acosh(Vec16f const x) { // inverse hyperbolic cosine + return _mm512_acosh_ps(x); +} +static inline Vec8d acosh(Vec8d const x) { // inverse hyperbolic cosine + return _mm512_acosh_pd(x); +} +static inline Vec16f atanh(Vec16f const x) { // inverse hyperbolic tangent + return _mm512_atanh_ps(x); +} +static inline Vec8d atanh(Vec8d const x) { // inverse hyperbolic tangent + return _mm512_atanh_pd(x); +} + +// error function +static 
inline Vec16f erf(Vec16f const x) { // error function + return _mm512_erf_ps(x); +} +static inline Vec8d erf(Vec8d const x) { // error function + return _mm512_erf_pd(x); +} +static inline Vec16f erfc(Vec16f const x) { // error function complement + return _mm512_erfc_ps(x); +} +static inline Vec8d erfc(Vec8d const x) { // error function complement + return _mm512_erfc_pd(x); +} +static inline Vec16f erfinv(Vec16f const x) { // inverse error function + return _mm512_erfinv_ps(x); +} +static inline Vec8d erfinv(Vec8d const x) { // inverse error function + return _mm512_erfinv_pd(x); +} + +static inline Vec16f cdfnorm(Vec16f const x) { // cumulative normal distribution function + return _mm512_cdfnorm_ps(x); +} +static inline Vec8d cdfnorm(Vec8d const x) { // cumulative normal distribution function + return _mm512_cdfnorm_pd(x); +} +static inline Vec16f cdfnorminv(Vec16f const x) {// inverse cumulative normal distribution function + return _mm512_cdfnorminv_ps(x); +} +static inline Vec8d cdfnorminv(Vec8d const x) { // inverse cumulative normal distribution function + return _mm512_cdfnorminv_pd(x); +} + +#else // __INTEL_COMPILER +/***************************************************************************** +* +* 512-bit vector functions using other compiler than Intel +* +*****************************************************************************/ + +// External function prototypes, 512-bit vectors +extern "C" { + extern __m512 V_VECTORCALL __svml_expf16 (__m512); + extern __m512d V_VECTORCALL __svml_exp8 (__m512d); + extern __m512 V_VECTORCALL __svml_expm1f16 (__m512); + extern __m512d V_VECTORCALL __svml_expm18 (__m512d); + extern __m512 V_VECTORCALL __svml_exp2f16 (__m512); + extern __m512d V_VECTORCALL __svml_exp28 (__m512d); + extern __m512 V_VECTORCALL __svml_exp10f16 (__m512); + extern __m512d V_VECTORCALL __svml_exp108 (__m512d); + extern __m512 V_VECTORCALL __svml_powf16 (__m512, __m512); + extern __m512d V_VECTORCALL __svml_pow8 (__m512d, __m512d); + extern __m512 V_VECTORCALL __svml_cbrtf16 (__m512); + extern __m512d V_VECTORCALL __svml_cbrt8 (__m512d); + extern __m512 V_VECTORCALL __svml_invsqrtf16 (__m512); + extern __m512d V_VECTORCALL __svml_invsqrt8 (__m512d); + extern __m512 V_VECTORCALL __svml_logf16 (__m512); + extern __m512d V_VECTORCALL __svml_log8 (__m512d); + extern __m512 V_VECTORCALL __svml_log1pf16 (__m512); + extern __m512d V_VECTORCALL __svml_log1p8 (__m512d); + extern __m512 V_VECTORCALL __svml_log2f16 (__m512); + extern __m512d V_VECTORCALL __svml_log28 (__m512d); + extern __m512 V_VECTORCALL __svml_log10f16 (__m512); + extern __m512d V_VECTORCALL __svml_log108 (__m512d); + extern __m512 V_VECTORCALL __svml_sinf16 (__m512); + extern __m512d V_VECTORCALL __svml_sin8 (__m512d); + extern __m512 V_VECTORCALL __svml_cosf16 (__m512); + extern __m512d V_VECTORCALL __svml_cos8 (__m512d); + extern __m512 V_VECTORCALL __svml_sincosf16 (__m512); // cos returned in ymm1 + extern __m512d V_VECTORCALL __svml_sincos8 (__m512d); // cos returned in ymm1 + extern __m512 V_VECTORCALL __svml_tanf16 (__m512); + extern __m512d V_VECTORCALL __svml_tan8 (__m512d); + extern __m512 V_VECTORCALL __svml_asinf16 (__m512); + extern __m512d V_VECTORCALL __svml_asin8 (__m512d); + extern __m512 V_VECTORCALL __svml_acosf16 (__m512); + extern __m512d V_VECTORCALL __svml_acos8 (__m512d); + extern __m512 V_VECTORCALL __svml_atanf16 (__m512); + extern __m512d V_VECTORCALL __svml_atan8 (__m512d); + extern __m512 V_VECTORCALL __svml_atan2f16 (__m512, __m512); + extern __m512d V_VECTORCALL 
__svml_atan28 (__m512d, __m512d); + extern __m512 V_VECTORCALL __svml_sinhf16 (__m512); + extern __m512d V_VECTORCALL __svml_sinh8 (__m512d); + extern __m512 V_VECTORCALL __svml_coshf16 (__m512); + extern __m512d V_VECTORCALL __svml_cosh8 (__m512d); + extern __m512 V_VECTORCALL __svml_tanhf16 (__m512); + extern __m512d V_VECTORCALL __svml_tanh8 (__m512d); + extern __m512 V_VECTORCALL __svml_asinhf16 (__m512); + extern __m512d V_VECTORCALL __svml_asinh8 (__m512d); + extern __m512 V_VECTORCALL __svml_acoshf16 (__m512); + extern __m512d V_VECTORCALL __svml_acosh8 (__m512d); + extern __m512 V_VECTORCALL __svml_atanhf16 (__m512); + extern __m512d V_VECTORCALL __svml_atanh8 (__m512d); + extern __m512 V_VECTORCALL __svml_erff16 (__m512); + extern __m512d V_VECTORCALL __svml_erf8 (__m512d); + extern __m512 V_VECTORCALL __svml_erfcf16 (__m512); + extern __m512d V_VECTORCALL __svml_erfc8 (__m512d); + extern __m512 V_VECTORCALL __svml_erfinvf16 (__m512); + extern __m512d V_VECTORCALL __svml_erfinv8 (__m512d); + extern __m512 V_VECTORCALL __svml_cdfnorminvf16(__m512); + extern __m512d V_VECTORCALL __svml_cdfnorminv8 (__m512d); + extern __m512 V_VECTORCALL __svml_cdfnormf16 (__m512); + extern __m512d V_VECTORCALL __svml_cdfnorm8 (__m512d); + //extern __m512 V_VECTORCALL __svml_cexpf16 (__m512); + //extern __m512d V_VECTORCALL __svml_cexp8 (__m512d); +} + + +// exponential and power functions +static inline Vec16f exp (Vec16f const x) { // exponential function + return __svml_expf16(x); +} +static inline Vec8d exp (Vec8d const x) { // exponential function + return __svml_exp8(x); +} +static inline Vec16f expm1 (Vec16f const x) { // exp(x)-1 + return __svml_expm1f16(x); +} +static inline Vec8d expm1 (Vec8d const x) { // exp(x)-1 + return __svml_expm18(x); +} +static inline Vec16f exp2 (Vec16f const x) { // pow(2,x) + return __svml_exp2f16(x); +} +static inline Vec8d exp2 (Vec8d const x) { // pow(2,x) + return __svml_exp28(x); +} +static inline Vec16f exp10 (Vec16f const x) { // pow(10,x) + return __svml_exp10f16(x); +} +static inline Vec8d exp10 (Vec8d const x) { // pow(10,x) + return __svml_exp108(x); +} +static inline Vec16f pow (Vec16f const a, Vec16f const b) { // pow(a,b) = a to the power of b + return __svml_powf16(a,b); +} +static inline Vec16f pow (Vec16f const a, float const b) { // pow(a,b) = a to the power of b + return __svml_powf16(a,Vec16f(b)); +} +static inline Vec8d pow (Vec8d const a, Vec8d const b) { // pow(a,b) = a to the power of b + return __svml_pow8(a,b); +} +static inline Vec8d pow (Vec8d const a, double const b) { // pow(a,b) = a to the power of b + return __svml_pow8(a,Vec8d(b)); +} +static inline Vec16f cbrt (Vec16f const x) { // pow(x,1/3) + return __svml_cbrtf16(x); +} +static inline Vec8d cbrt (Vec8d const x) { // pow(x,1/3) + return __svml_cbrt8(x); +} + +// logarithms +static inline Vec16f log (Vec16f const x) { // natural logarithm + return __svml_logf16(x); +} +static inline Vec8d log (Vec8d const x) { // natural logarithm + return __svml_log8(x); +} +static inline Vec16f log1p (Vec16f const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return __svml_log1pf16(x); +} +static inline Vec8d log1p (Vec8d const x) { // log(1+x). 
Avoids loss of precision if 1+x is close to 1 + return __svml_log1p8(x); +} +static inline Vec16f log2 (Vec16f const x) { // logarithm base 2 + return __svml_log2f16(x); +} +static inline Vec8d log2 (Vec8d const x) { // logarithm base 2 + return __svml_log28(x); +} +static inline Vec16f log10 (Vec16f const x) { // logarithm base 10 + return __svml_log10f16(x); +} +static inline Vec8d log10 (Vec8d const x) { // logarithm base 10 + return __svml_log108(x); +} + +// trigonometric functions (angles in radians) +static inline Vec16f sin (Vec16f const x) { // sine + return __svml_sinf16(x); +} +static inline Vec8d sin (Vec8d const x) { // sine + return __svml_sin8(x); +} +static inline Vec16f cos (Vec16f const x) { // cosine + return __svml_cosf16(x); +} +static inline Vec8d cos (Vec8d const x) { // cosine + return __svml_cos8(x); +} + +#if defined(__unix__) || defined(__INTEL_COMPILER) //|| !defined(__x86_64__) || !defined(_MSC_VER) +// no inline assembly in 64 bit MS compiler +// sine and cosine. sin(x) returned, cos(x) in pcos +static inline Vec16f sincos (Vec16f * pcos, Vec16f const x) { + __m512 r_sin, r_cos; + r_sin = __svml_sincosf16(x); +#if defined(__unix__) || defined(__GNUC__) + __asm__ __volatile__ ( "vmovaps %%zmm1, %0":"=m"(r_cos)); +#else // Windows + // _asm vmovaps r_cos, zmm1; // does not work in VS 2019 +#endif + *pcos = r_cos; + return r_sin; +} +// sine and cosine. sin(x) returned, cos(x) in pcos +static inline Vec8d sincos (Vec8d * pcos, Vec8d const x) { + __m512d r_sin, r_cos; + r_sin = __svml_sincos8(x); +#if defined(__unix__) || defined(__GNUC__) + __asm__ __volatile__ ( "vmovaps %%zmm1, %0":"=m"(r_cos)); +#else // Windows + // _asm vmovapd r_cos, zmm1; // does not work in VS 2019 +#endif + *pcos = r_cos; + return r_sin; +} +#endif // inline assembly available + +static inline Vec16f tan (Vec16f const x) { // tangent + return __svml_tanf16(x); +} +static inline Vec8d tan (Vec8d const x) { // tangent + return __svml_tan8(x); +} + +// inverse trigonometric functions +static inline Vec16f asin (Vec16f const x) { // inverse sine + return __svml_asinf16(x); +} +static inline Vec8d asin (Vec8d const x) { // inverse sine + return __svml_asin8(x); +} +static inline Vec16f acos (Vec16f const x) { // inverse cosine + return __svml_acosf16(x); +} +static inline Vec8d acos (Vec8d const x) { // inverse cosine + return __svml_acos8(x); +} +static inline Vec16f atan (Vec16f const x) { // inverse tangent + return __svml_atanf16(x); +} +static inline Vec8d atan (Vec8d const x) { // inverse tangent + return __svml_atan8(x); +} +static inline Vec16f atan2 (Vec16f const a, Vec16f const b) {// inverse tangent of a/b + return __svml_atan2f16(a,b); +} +static inline Vec8d atan2 (Vec8d const a, Vec8d const b) { // inverse tangent of a/b + return __svml_atan28(a,b); +} + +// hyperbolic functions and inverse hyperbolic functions +static inline Vec16f sinh (Vec16f const x) { // hyperbolic sine + return __svml_sinhf16(x); +} +static inline Vec8d sinh (Vec8d const x) { // hyperbolic sine + return __svml_sinh8(x); +} +static inline Vec16f cosh (Vec16f const x) { // hyperbolic cosine + return __svml_coshf16(x); +} +static inline Vec8d cosh (Vec8d const x) { // hyperbolic cosine + return __svml_cosh8(x); +} +static inline Vec16f tanh (Vec16f const x) { // hyperbolic tangent + return __svml_tanhf16(x); +} +static inline Vec8d tanh (Vec8d const x) { // hyperbolic tangent + return __svml_tanh8(x); +} +static inline Vec16f asinh (Vec16f const x) { // inverse hyperbolic sine + return __svml_asinhf16(x); +} 
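// Usage sketch (illustration only, not part of the patch): the wrappers above
// expose a sincos() that returns sin(x) and writes cos(x) through a pointer,
// so both results come from a single SVML call. The sketch assumes an AVX-512
// target with MAX_VECTOR_SIZE >= 512, an SVML library linked in (e.g. Intel's
// libsvml), and that this file is the assumed "vectormath_lib.h" header.
#include "vectorclass.h"
#include "vectormath_lib.h"

float sum_sin_plus_cos(const float * angles) {   // 16 angles in radians
    Vec16f x;
    x.load(angles);               // load 16 packed floats
    Vec16f c;
    Vec16f s = sincos(&c, x);     // sin returned, cos stored via the pointer
    return horizontal_add(s + c);
}
// Note that sincos() is compiled out on some Windows/MSVC configurations
// (see the #if guards above); plain sin() and cos() remain available there.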
+static inline Vec8d asinh (Vec8d const x) { // inverse hyperbolic sine + return __svml_asinh8(x); +} +static inline Vec16f acosh (Vec16f const x) { // inverse hyperbolic cosine + return __svml_acoshf16(x); +} +static inline Vec8d acosh (Vec8d const x) { // inverse hyperbolic cosine + return __svml_acosh8(x); +} +static inline Vec16f atanh (Vec16f const x) { // inverse hyperbolic tangent + return __svml_atanhf16(x); +} +static inline Vec8d atanh (Vec8d const x) { // inverse hyperbolic tangent + return __svml_atanh8(x); +} + +// error function +static inline Vec16f erf (Vec16f const x) { // error function + return __svml_erff16(x); +} +static inline Vec8d erf (Vec8d const x) { // error function + return __svml_erf8(x); +} +static inline Vec16f erfc (Vec16f const x) { // error function complement + return __svml_erfcf16(x); +} +static inline Vec8d erfc (Vec8d const x) { // error function complement + return __svml_erfc8(x); +} +static inline Vec16f erfinv (Vec16f const x) { // inverse error function + return __svml_erfinvf16(x); +} +static inline Vec8d erfinv (Vec8d const x) { // inverse error function + return __svml_erfinv8(x); +} + +static inline Vec16f cdfnorm (Vec16f const x) { // cumulative normal distribution function + return __svml_cdfnormf16(x); +} +static inline Vec8d cdfnorm (Vec8d const x) { // cumulative normal distribution function + return __svml_cdfnorm8(x); +} +static inline Vec16f cdfnorminv (Vec16f const x) { // inverse cumulative normal distribution function + return __svml_cdfnorminvf16(x); +} +static inline Vec8d cdfnorminv (Vec8d const x) { // inverse cumulative normal distribution function + return __svml_cdfnorminv8(x); +} + +#endif // __INTEL_COMPILER + +#else // VECTORF512_H +/***************************************************************************** +* +* 512-bit vector functions emulated with 256-bit vectors +* +*****************************************************************************/ + +// exponential and power functions +static inline Vec16f exp (Vec16f const x) { // exponential function + return Vec16f(exp(x.get_low()), exp(x.get_high())); +} +static inline Vec8d exp (Vec8d const x) { // exponential function + return Vec8d(exp(x.get_low()), exp(x.get_high())); +} +static inline Vec16f expm1 (Vec16f const x) { // exp(x)-1. Avoids loss of precision if x is close to 1 + return Vec16f(expm1(x.get_low()), expm1(x.get_high())); +} +static inline Vec8d expm1 (Vec8d const x) { // exp(x)-1. 
Avoids loss of precision if x is close to 1 + return Vec8d(expm1(x.get_low()), expm1(x.get_high())); +} +static inline Vec16f exp2 (Vec16f const x) { // pow(2,x) + return Vec16f(exp2(x.get_low()), exp2(x.get_high())); +} +static inline Vec8d exp2 (Vec8d const x) { // pow(2,x) + return Vec8d(exp2(x.get_low()), exp2(x.get_high())); +} +static inline Vec16f exp10 (Vec16f const x) { // pow(10,x) + return Vec16f(exp10(x.get_low()), exp10(x.get_high())); +} +static inline Vec8d exp10 (Vec8d const x) { // pow(10,x) + return Vec8d(exp10(x.get_low()), exp10(x.get_high())); +} +static inline Vec16f pow (Vec16f const a, Vec16f const b) { // pow(a,b) = a to the power of b + return Vec16f(pow(a.get_low(),b.get_low()), pow(a.get_high(),b.get_high())); +} +static inline Vec16f pow (Vec16f const a, float const b) { // pow(a,b) = a to the power of b + return Vec16f(pow(a.get_low(),b), pow(a.get_high(),b)); +} +static inline Vec8d pow (Vec8d const a, Vec8d const b) { // pow(a,b) = a to the power of b + return Vec8d(pow(a.get_low(),b.get_low()), pow(a.get_high(),b.get_high())); +} +static inline Vec8d pow (Vec8d const a, double const b) { // pow(a,b) = a to the power of b + return Vec8d(pow(a.get_low(),b), pow(a.get_high(),b)); +} +static inline Vec16f cbrt (Vec16f const x) { // pow(x,1/3) + return Vec16f(cbrt(x.get_low()), cbrt(x.get_high())); +} +static inline Vec8d cbrt (Vec8d const x) { // pow(x,1/3) + return Vec8d(cbrt(x.get_low()), cbrt(x.get_high())); +} + +// logarithms +static inline Vec16f log (Vec16f const x) { // natural logarithm + return Vec16f(log(x.get_low()), log(x.get_high())); +} +static inline Vec8d log (Vec8d const x) { // natural logarithm + return Vec8d(log(x.get_low()), log(x.get_high())); +} +static inline Vec16f log1p (Vec16f const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return Vec16f(log1p(x.get_low()), log1p(x.get_high())); +} +static inline Vec8d log1p (Vec8d const x) { // log(1+x). Avoids loss of precision if 1+x is close to 1 + return Vec8d(log1p(x.get_low()), log1p(x.get_high())); +} +static inline Vec16f log2 (Vec16f const x) { // logarithm base 2 + return Vec16f(log2(x.get_low()), log2(x.get_high())); +} +static inline Vec8d log2 (Vec8d const x) { // logarithm base 2 + return Vec8d(log2(x.get_low()), log2(x.get_high())); +} +static inline Vec16f log10 (Vec16f const x) { // logarithm base 10 + return Vec16f(log10(x.get_low()), log10(x.get_high())); +} +static inline Vec8d log10 (Vec8d const x) { // logarithm base 10 + return Vec8d(log10(x.get_low()), log10(x.get_high())); +} + +// trigonometric functions (angles in radians) +static inline Vec16f sin (Vec16f const x) { // sine + return Vec16f(sin(x.get_low()), sin(x.get_high())); +} +static inline Vec8d sin (Vec8d const x) { // sine + return Vec8d(sin(x.get_low()), sin(x.get_high())); +} +static inline Vec16f cos (Vec16f const x) { // cosine + return Vec16f(cos(x.get_low()), cos(x.get_high())); +} +static inline Vec8d cos (Vec8d const x) { // cosine + return Vec8d(cos(x.get_low()), cos(x.get_high())); +} + +#if defined(__unix__) || defined(__INTEL_COMPILER) || !defined(__x86_64__) || !defined(_MSC_VER) +// no inline assembly in 64 bit MS compiler +static inline Vec16f sincos (Vec16f * pcos, Vec16f const x) { // sine and cosine. 
sin(x) returned, cos(x) in pcos + Vec8f r_sin0, r_sin1, r_cos0, r_cos1; + r_sin0 = sincos(&r_cos0, x.get_low()); + r_sin1 = sincos(&r_cos1, x.get_high()); + *pcos = Vec16f(r_cos0, r_cos1); + return Vec16f(r_sin0, r_sin1); +} +static inline Vec8d sincos (Vec8d * pcos, Vec8d const x) { // sine and cosine. sin(x) returned, cos(x) in pcos + Vec4d r_sin0, r_sin1, r_cos0, r_cos1; + r_sin0 = sincos(&r_cos0, x.get_low()); + r_sin1 = sincos(&r_cos1, x.get_high()); + *pcos = Vec8d(r_cos0, r_cos1); + return Vec8d(r_sin0, r_sin1); +} +#endif // inline assembly available + + +static inline Vec16f tan (Vec16f const x) { // tangent + return Vec16f(tan(x.get_low()), tan(x.get_high())); +} +static inline Vec8d tan (Vec8d const x) { // tangent + return Vec8d(tan(x.get_low()), tan(x.get_high())); +} + +// inverse trigonometric functions +static inline Vec16f asin (Vec16f const x) { // inverse sine + return Vec16f(asin(x.get_low()), asin(x.get_high())); +} +static inline Vec8d asin (Vec8d const x) { // inverse sine + return Vec8d(asin(x.get_low()), asin(x.get_high())); +} +static inline Vec16f acos (Vec16f const x) { // inverse cosine + return Vec16f(acos(x.get_low()), acos(x.get_high())); +} +static inline Vec8d acos (Vec8d const x) { // inverse cosine + return Vec8d(acos(x.get_low()), acos(x.get_high())); +} +static inline Vec16f atan (Vec16f const x) { // inverse tangent + return Vec16f(atan(x.get_low()), atan(x.get_high())); +} +static inline Vec8d atan (Vec8d const x) { // inverse tangent + return Vec8d(atan(x.get_low()), atan(x.get_high())); +} +static inline Vec16f atan2 (Vec16f const a, Vec16f const b) { // inverse tangent of a/b + return Vec16f(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high())); +} +static inline Vec8d atan2 (Vec8d const a, Vec8d const b) { // inverse tangent of a/b + return Vec8d(atan2(a.get_low(),b.get_low()), atan2(a.get_high(),b.get_high())); +} + +// hyperbolic functions +static inline Vec16f sinh (Vec16f const x) { // hyperbolic sine + return Vec16f(sinh(x.get_low()), sinh(x.get_high())); +} +static inline Vec8d sinh (Vec8d const x) { // hyperbolic sine + return Vec8d(sinh(x.get_low()), sinh(x.get_high())); +} +static inline Vec16f cosh (Vec16f const x) { // hyperbolic cosine + return Vec16f(cosh(x.get_low()), cosh(x.get_high())); +} +static inline Vec8d cosh (Vec8d const x) { // hyperbolic cosine + return Vec8d(cosh(x.get_low()), cosh(x.get_high())); +} +static inline Vec16f tanh (Vec16f const x) { // hyperbolic tangent + return Vec16f(tanh(x.get_low()), tanh(x.get_high())); +} +static inline Vec8d tanh (Vec8d const x) { // hyperbolic tangent + return Vec8d(tanh(x.get_low()), tanh(x.get_high())); +} + +// inverse hyperbolic functions +static inline Vec16f asinh (Vec16f const x) { // inverse hyperbolic sine + return Vec16f(asinh(x.get_low()), asinh(x.get_high())); +} +static inline Vec8d asinh (Vec8d const x) { // inverse hyperbolic sine + return Vec8d(asinh(x.get_low()), asinh(x.get_high())); +} +static inline Vec16f acosh (Vec16f const x) { // inverse hyperbolic cosine + return Vec16f(acosh(x.get_low()), acosh(x.get_high())); +} +static inline Vec8d acosh (Vec8d const x) { // inverse hyperbolic cosine + return Vec8d(acosh(x.get_low()), acosh(x.get_high())); +} +static inline Vec16f atanh (Vec16f const x) { // inverse hyperbolic tangent + return Vec16f(atanh(x.get_low()), atanh(x.get_high())); +} +static inline Vec8d atanh (Vec8d const x) { // inverse hyperbolic tangent + return Vec8d(atanh(x.get_low()), atanh(x.get_high())); +} + +// error function +static 
inline Vec16f erf (Vec16f const x) { // error function + return Vec16f(erf(x.get_low()), erf(x.get_high())); +} +static inline Vec8d erf (Vec8d const x) { // error function + return Vec8d(erf(x.get_low()), erf(x.get_high())); +} +static inline Vec16f erfc (Vec16f const x) { // error function complement + return Vec16f(erfc(x.get_low()), erfc(x.get_high())); +} +static inline Vec8d erfc (Vec8d const x) { // error function complement + return Vec8d(erfc(x.get_low()), erfc(x.get_high())); +} +static inline Vec16f erfinv (Vec16f const x) { // inverse error function + return Vec16f(erfinv(x.get_low()), erfinv(x.get_high())); +} +static inline Vec8d erfinv (Vec8d const x) { // inverse error function + return Vec8d(erfinv(x.get_low()), erfinv(x.get_high())); +} + +static inline Vec16f cdfnorm (Vec16f const x) { // cumulative normal distribution function + return Vec16f(cdfnorm(x.get_low()), cdfnorm(x.get_high())); +} +static inline Vec8d cdfnorm (Vec8d const x) { // cumulative normal distribution function + return Vec8d(cdfnorm(x.get_low()), cdfnorm(x.get_high())); +} +static inline Vec16f cdfnorminv (Vec16f const x) { // inverse cumulative normal distribution function + return Vec16f(cdfnorminv(x.get_low()), cdfnorminv(x.get_high())); +} +static inline Vec8d cdfnorminv (Vec8d const x) { // inverse cumulative normal distribution function + return Vec8d(cdfnorminv(x.get_low()), cdfnorminv(x.get_high())); +} + +#endif // VECTORF512_H + +#endif // MAX_VECTOR_SIZE >= 512 + +#ifdef VCL_NAMESPACE +} +#endif // VCL_NAMESPACE + +#endif // VECTORMATH_COMMON_H + +#endif // VECTORMATH_LIB_H diff --git a/DFTTest/VCL2/vectormath_trig.h b/DFTTest/VCL2/vectormath_trig.h new file mode 100644 index 0000000..7b33886 --- /dev/null +++ b/DFTTest/VCL2/vectormath_trig.h @@ -0,0 +1,898 @@ +/**************************** vectormath_trig.h ****************************** +* Author: Agner Fog +* Date created: 2014-04-18 +* Last modified: 2020-06-08 +* Version: 2.00.03 +* Project: vector class library +* Description: +* Header file containing inline version of trigonometric functions +* and inverse trigonometric functions +* sin, cos, sincos, tan +* asin, acos, atan, atan2 +* +* Theory, methods and inspiration based partially on these sources: +* > Moshier, Stephen Lloyd Baluk: Methods and programs for mathematical functions. +* Ellis Horwood, 1989. +* > VDT library developed on CERN by Danilo Piparo, Thomas Hauth and +* Vincenzo Innocente, 2012, https://svnweb.cern.ch/trac/vdt +* > Cephes math library by Stephen L. Moshier 1992, +* http://www.netlib.org/cephes/ +* +* For detailed instructions, see vectormath_common.h and vcl_manual.pdf +* +* (c) Copyright 2014-2020 Agner Fog. +* Apache License version 2.0 or later. +******************************************************************************/ + +#ifndef VECTORMATH_TRIG_H +#define VECTORMATH_TRIG_H 1 + +#include "vectormath_common.h" + +#ifdef VCL_NAMESPACE +namespace VCL_NAMESPACE { +#endif + + +// ************************************************************* +// sin/cos template, double precision +// ************************************************************* +// Template parameters: +// VTYPE: f.p. 
vector type +// SC: 1 = sin, 2 = cos, 3 = sincos +// Paramterers: +// xx = input x (radians) +// cosret = return pointer (only if SC = 3) +template +static inline VTYPE sincos_d(VTYPE * cosret, VTYPE const xx) { + + // define constants + const double P0sin = -1.66666666666666307295E-1; + const double P1sin = 8.33333333332211858878E-3; + const double P2sin = -1.98412698295895385996E-4; + const double P3sin = 2.75573136213857245213E-6; + const double P4sin = -2.50507477628578072866E-8; + const double P5sin = 1.58962301576546568060E-10; + + const double P0cos = 4.16666666666665929218E-2; + const double P1cos = -1.38888888888730564116E-3; + const double P2cos = 2.48015872888517045348E-5; + const double P3cos = -2.75573141792967388112E-7; + const double P4cos = 2.08757008419747316778E-9; + const double P5cos = -1.13585365213876817300E-11; + + const double DP1 = 7.853981554508209228515625E-1 * 2.; + const double DP2 = 7.94662735614792836714E-9 * 2.; + const double DP3 = 3.06161699786838294307E-17 * 2.; + /* + const double DP1sc = 7.85398125648498535156E-1; + const double DP2sc = 3.77489470793079817668E-8; + const double DP3sc = 2.69515142907905952645E-15; + */ + typedef decltype(roundi(xx)) ITYPE; // integer vector type + typedef decltype(nan_code(xx)) UITYPE; // unsigned integer vector type + typedef decltype(xx < xx) BVTYPE; // boolean vector type + + VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors + ITYPE q, qq, signsin, signcos; // integer vectors, 64 bit + + BVTYPE swap, overflow; // boolean vectors + + xa = abs(xx); + + // Find quadrant + y = round(xa * (double)(2. / VM_PI)); // quadrant, as float + q = roundi(y); // quadrant, as integer + // Find quadrant + // 0 - pi/4 => 0 + // pi/4 - 3*pi/4 => 1 + // 3*pi/4 - 5*pi/4 => 2 + // 5*pi/4 - 7*pi/4 => 3 + // 7*pi/4 - 8*pi/4 => 4 + + // Reduce by extended precision modular arithmetic + x = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa))); // x = ((xa - y * DP1) - y * DP2) - y * DP3; + + // Expansion of sin and cos, valid for -pi/4 <= x <= pi/4 + x2 = x * x; + s = polynomial_5(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin); + c = polynomial_5(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos); + s = mul_add(x * x2, s, x); // s = x + (x * x2) * s; + c = mul_add(x2 * x2, c, nmul_add(x2, 0.5, 1.0)); // c = 1.0 - x2 * 0.5 + (x2 * x2) * c; + + // swap sin and cos if odd quadrant + swap = BVTYPE((q & 1) != 0); + + // check for overflow + overflow = BVTYPE(UITYPE(q) > 0x80000000000000); // q big if overflow + overflow &= is_finite(xa); + s = select(overflow, 0.0, s); + c = select(overflow, 1.0, c); + + if constexpr ((SC & 1) != 0) { // calculate sin + sin1 = select(swap, c, s); + signsin = ((q << 62) ^ ITYPE(reinterpret_i(xx))); + sin1 = sign_combine(sin1, reinterpret_d(signsin)); + } + if constexpr ((SC & 2) != 0) { // calculate cos + cos1 = select(swap, s, c); + signcos = ((q + 1) & 2) << 62; + cos1 ^= reinterpret_d(signcos); + } + if constexpr (SC == 3) { // calculate both. 
cos returned through pointer + *cosret = cos1; + } + if constexpr ((SC & 1) != 0) return sin1; else return cos1; +} + +// instantiations of sincos_d template: + +static inline Vec2d sin(Vec2d const x) { + return sincos_d(0, x); +} + +static inline Vec2d cos(Vec2d const x) { + return sincos_d(0, x); +} + +static inline Vec2d sincos(Vec2d * cosret, Vec2d const x) { + return sincos_d(cosret, x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d sin(Vec4d const x) { + return sincos_d(0, x); +} + +static inline Vec4d cos(Vec4d const x) { + return sincos_d(0, x); +} + +static inline Vec4d sincos(Vec4d * cosret, Vec4d const x) { + return sincos_d(cosret, x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d sin(Vec8d const x) { + return sincos_d(0, x); +} + +static inline Vec8d cos(Vec8d const x) { + return sincos_d(0, x); +} + +static inline Vec8d sincos(Vec8d * cosret, Vec8d const x) { + return sincos_d(cosret, x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// ************************************************************* +// sincos template, single precision +// ************************************************************* +// Template parameters: +// VTYPE: f.p. vector type +// SC: 1 = sin, 2 = cos, 3 = sincos, 4 = tan +// Paramterers: +// xx = input x (radians) +// cosret = return pointer (only if SC = 3) +template +static inline VTYPE sincos_f(VTYPE * cosret, VTYPE const xx) { + + // define constants + const float DP1F = 0.78515625f * 2.f; + const float DP2F = 2.4187564849853515625E-4f * 2.f; + const float DP3F = 3.77489497744594108E-8f * 2.f; + + const float P0sinf = -1.6666654611E-1f; + const float P1sinf = 8.3321608736E-3f; + const float P2sinf = -1.9515295891E-4f; + + const float P0cosf = 4.166664568298827E-2f; + const float P1cosf = -1.388731625493765E-3f; + const float P2cosf = 2.443315711809948E-5f; + + typedef decltype(roundi(xx)) ITYPE; // integer vector type + typedef decltype(nan_code(xx)) UITYPE; // unsigned integer vector type + typedef decltype(xx < xx) BVTYPE; // boolean vector type + + VTYPE xa, x, y, x2, s, c, sin1, cos1; // data vectors + ITYPE q, signsin, signcos; // integer vectors + BVTYPE swap, overflow; // boolean vectors + + xa = abs(xx); + + // Find quadrant + y = round(xa * (float)(2. 
/ VM_PI)); // quadrant, as float + q = roundi(y); // quadrant, as integer + // 0 - pi/4 => 0 + // pi/4 - 3*pi/4 => 1 + // 3*pi/4 - 5*pi/4 => 2 + // 5*pi/4 - 7*pi/4 => 3 + // 7*pi/4 - 8*pi/4 => 4 + + // Reduce by extended precision modular arithmetic + // x = ((xa - y * DP1F) - y * DP2F) - y * DP3F; + x = nmul_add(y, DP3F, nmul_add(y, DP2F, nmul_add(y, DP1F, xa))); + + // A two-step reduction saves time at the cost of precision for very big x: + //x = (xa - y * DP1F) - y * (DP2F+DP3F); + + // Taylor expansion of sin and cos, valid for -pi/4 <= x <= pi/4 + x2 = x * x; + s = polynomial_2(x2, P0sinf, P1sinf, P2sinf) * (x*x2) + x; + c = polynomial_2(x2, P0cosf, P1cosf, P2cosf) * (x2*x2) + nmul_add(0.5f, x2, 1.0f); + + // swap sin and cos if odd quadrant + swap = BVTYPE((q & 1) != 0); + + // check for overflow + overflow = BVTYPE(UITYPE(q) > 0x2000000); // q big if overflow + overflow &= is_finite(xa); + s = select(overflow, 0.0f, s); + c = select(overflow, 1.0f, c); + + if constexpr ((SC & 5) != 0) { // calculate sin + sin1 = select(swap, c, s); + signsin = ((q << 30) ^ ITYPE(reinterpret_i(xx))); + sin1 = sign_combine(sin1, reinterpret_f(signsin)); + } + if constexpr ((SC & 6) != 0) { // calculate cos + cos1 = select(swap, s, c); + signcos = ((q + 1) & 2) << 30; + cos1 ^= reinterpret_f(signcos); + } + if constexpr (SC == 1) return sin1; + else if constexpr (SC == 2) return cos1; + else if constexpr (SC == 3) { // calculate both. cos returned through pointer + *cosret = cos1; + return sin1; + } + else { // SC == 4. tan + return sin1 / cos1; + } +} + +// instantiations of sincos_f template: + +static inline Vec4f sin(Vec4f const x) { + return sincos_f(0, x); +} + +static inline Vec4f cos(Vec4f const x) { + return sincos_f(0, x); +} + +static inline Vec4f sincos(Vec4f * cosret, Vec4f const x) { + return sincos_f(cosret, x); +} + +static inline Vec4f tan(Vec4f const x) { + return sincos_f(0, x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f sin(Vec8f const x) { + return sincos_f(0, x); +} + +static inline Vec8f cos(Vec8f const x) { + return sincos_f(0, x); +} + +static inline Vec8f sincos(Vec8f * cosret, Vec8f const x) { + return sincos_f(cosret, x); +} + +static inline Vec8f tan(Vec8f const x) { + return sincos_f(0, x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f sin(Vec16f const x) { + return sincos_f(0, x); +} + +static inline Vec16f cos(Vec16f const x) { + return sincos_f(0, x); +} + +static inline Vec16f sincos(Vec16f * cosret, Vec16f const x) { + return sincos_f(cosret, x); +} + +static inline Vec16f tan(Vec16f const x) { + return sincos_f(0, x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// ************************************************************* +// tan template, double precision +// ************************************************************* +// Template parameters: +// VTYPE: f.p. 
vector type +// Paramterers: +// x = input x (radians) +template +static inline VTYPE tan_d(VTYPE const x) { + + // define constants + const double DP1 = 7.853981554508209228515625E-1 * 2.;; + const double DP2 = 7.94662735614792836714E-9 * 2.;; + const double DP3 = 3.06161699786838294307E-17 * 2.;; + + const double P2tan = -1.30936939181383777646E4; + const double P1tan = 1.15351664838587416140E6; + const double P0tan = -1.79565251976484877988E7; + + const double Q3tan = 1.36812963470692954678E4; + const double Q2tan = -1.32089234440210967447E6; + const double Q1tan = 2.50083801823357915839E7; + const double Q0tan = -5.38695755929454629881E7; + + typedef decltype(x > x) BVTYPE; // boolean vector type + VTYPE xa, y, z, zz, px, qx, tn, recip; // data vectors + BVTYPE doinvert, xzero, overflow; // boolean vectors + typedef decltype(nan_code(x)) UITYPE; // unsigned integer vector type + + + xa = abs(x); + + // Find quadrant + y = round(xa * (double)(2. / VM_PI)); // quadrant, as float + auto q = roundi(y); // quadrant, as integer + // Find quadrant + // 0 - pi/4 => 0 + // pi/4 - 3*pi/4 => 1 + // 3*pi/4 - 5*pi/4 => 2 + // 5*pi/4 - 7*pi/4 => 3 + // 7*pi/4 - 8*pi/4 => 4 + + // Reduce by extended precision modular arithmetic + // z = ((xa - y * DP1) - y * DP2) - y * DP3; + z = nmul_add(y, DP3, nmul_add(y, DP2, nmul_add(y, DP1, xa))); + + // Pade expansion of tan, valid for -pi/4 <= x <= pi/4 + zz = z * z; + px = polynomial_2(zz, P0tan, P1tan, P2tan); + qx = polynomial_4n(zz, Q0tan, Q1tan, Q2tan, Q3tan); + + // qx cannot be 0 for x <= pi/4 + tn = mul_add(px / qx, z * zz, z); // tn = z + z * zz * px / qx; + + // if (q&2) tn = -1/tn + doinvert = BVTYPE((q & 1) != 0); + xzero = (xa == 0.); + // avoid division by 0. We will not be using recip anyway if xa == 0. + // tn never becomes exactly 0 when x = pi/2 so we only have to make + // a special case for x == 0. + recip = (-1.) / select(xzero, VTYPE(-1.), tn); + tn = select(doinvert, recip, tn); + tn = sign_combine(tn, x); // get original sign + + overflow = BVTYPE(UITYPE(q) > 0x80000000000000) & is_finite(xa); + tn = select(overflow, 0., tn); + + return tn; +} + +// instantiations of tan_d template: + +static inline Vec2d tan(Vec2d const x) { + return tan_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d tan(Vec4d const x) { + return tan_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d tan(Vec8d const x) { + return tan_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// ************************************************************* +// tan template, single precision +// ************************************************************* +// This is removed for the single precision version. +// It is faster to use tan(x) = sin(x)/cos(x) + + + +// ************************************************************* +// asin/acos template, double precision +// ************************************************************* +// Template parameters: +// VTYPE: f.p. 
vector type +// AC: 0 = asin, 1 = acos +// Paramterers: +// x = input x +template +static inline VTYPE asin_d(VTYPE const x) { + + // define constants + const double R4asin = 2.967721961301243206100E-3; + const double R3asin = -5.634242780008963776856E-1; + const double R2asin = 6.968710824104713396794E0; + const double R1asin = -2.556901049652824852289E1; + const double R0asin = 2.853665548261061424989E1; + + const double S3asin = -2.194779531642920639778E1; + const double S2asin = 1.470656354026814941758E2; + const double S1asin = -3.838770957603691357202E2; + const double S0asin = 3.424398657913078477438E2; + + const double P5asin = 4.253011369004428248960E-3; + const double P4asin = -6.019598008014123785661E-1; + const double P3asin = 5.444622390564711410273E0; + const double P2asin = -1.626247967210700244449E1; + const double P1asin = 1.956261983317594739197E1; + const double P0asin = -8.198089802484824371615E0; + + const double Q4asin = -1.474091372988853791896E1; + const double Q3asin = 7.049610280856842141659E1; + const double Q2asin = -1.471791292232726029859E2; + const double Q1asin = 1.395105614657485689735E2; + const double Q0asin = -4.918853881490881290097E1; + + VTYPE xa, xb, x1, x2, x3, x4, x5, px, qx, rx, sx, vx, wx, y1, yb, z, z1, z2; + bool dobig, dosmall; + + xa = abs(x); + auto big = xa >= 0.625; // boolean vector + + /* + Small: xa < 0.625 + ------------------ + x = xa * xa; + px = PX(x); + qx = QX(x); + y1 = x*px/qx; + y1 = xa * y1 + xa; + + Big: xa >= 0.625 + ------------------ + x = 1.0 - xa; + rx = RX(x); + sx = SX(x); + y1 = x * rx/sx; + x3 = sqrt(x+x); + y3 = x3 * y1 - MOREBITS; + z = pi/2 - x3 - y3 + */ + + // select a common x for all polynomials + // This allows sharing of powers of x through common subexpression elimination + x1 = select(big, 1.0 - xa, xa * xa); + + // calculate powers of x1 outside branches to make sure they are only calculated once + x2 = x1 * x1; + x4 = x2 * x2; + x5 = x4 * x1; + x3 = x2 * x1; + + dosmall = !horizontal_and(big); // at least one element is small + dobig = horizontal_or(big); // at least one element is big + + // calculate polynomials (reuse powers of x) + if (dosmall) { + // px = polynomial_5 (x1, P0asin, P1asin, P2asin, P3asin, P4asin, P5asin); + // qx = polynomial_5n(x1, Q0asin, Q1asin, Q2asin, Q3asin, Q4asin); + px = mul_add(x3, P3asin, P0asin) + mul_add(x4, P4asin, x1*P1asin) + mul_add(x5, P5asin, x2*P2asin); + qx = mul_add(x4, Q4asin, x5) + mul_add(x3, Q3asin, x1*Q1asin) + mul_add(x2, Q2asin, Q0asin); + } + if (dobig) { + // rx = polynomial_4 (x1, R0asin, R1asin, R2asin, R3asin, R4asin); + // sx = polynomial_4n(x1, S0asin, S1asin, S2asin, S3asin); + rx = mul_add(x3, R3asin, x2*R2asin) + mul_add(x4, R4asin, mul_add(x1, R1asin, R0asin)); + sx = mul_add(x3, S3asin, x4) + mul_add(x2, S2asin, mul_add(x1, S1asin, S0asin)); + } + + // select and divide outside branches to avoid dividing twice + vx = select(big, rx, px); + wx = select(big, sx, qx); + y1 = vx / wx * x1; + + // results for big + if (dobig) { // avoid square root if all are small + xb = sqrt(x1 + x1); // this produces NAN if xa > 1 so we don't need a special case for xa > 1 + z1 = mul_add(xb, y1, xb); // yb = xb * y1; z1 = xb + yb; + } + + // results for small + z2 = mul_add(xa, y1, xa); // z2 = xa * y1 + xa; + + // correct for sign + if constexpr (AC == 1) { // acos + z1 = select(x < 0., VM_PI - z1, z1); + z2 = VM_PI_2 - sign_combine(z2, x); + z = select(big, z1, z2); + } + else { // asin + z1 = VM_PI_2 - z1; + z = select(big, z1, z2); + z = sign_combine(z, x); 
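// Worked example for the sign handling above: asin(-0.5) takes the small
// branch (xa = 0.5 < 0.625), the rational approximation gives
// z2 = xa*y1 + xa ≈ 0.5236 (π/6), and sign_combine(z, x) then copies the
// sign of x, so the final result is approximately -π/6.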
+ } + return z; +} + +// instantiations of asin_d template: + +static inline Vec2d asin(Vec2d const x) { + return asin_d(x); +} + +static inline Vec2d acos(Vec2d const x) { + return asin_d(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d asin(Vec4d const x) { + return asin_d(x); +} + +static inline Vec4d acos(Vec4d const x) { + return asin_d(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d asin(Vec8d const x) { + return asin_d(x); +} + +static inline Vec8d acos(Vec8d const x) { + return asin_d(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// ************************************************************* +// asin/acos template, single precision +// ************************************************************* +// Template parameters: +// VTYPE: f.p. vector type +// AC: 0 = asin, 1 = acos +// Paramterers: +// x = input x +template +static inline VTYPE asin_f(VTYPE const x) { + + // define constants + const float P4asinf = 4.2163199048E-2f; + const float P3asinf = 2.4181311049E-2f; + const float P2asinf = 4.5470025998E-2f; + const float P1asinf = 7.4953002686E-2f; + const float P0asinf = 1.6666752422E-1f; + + VTYPE xa, x1, x2, x3, x4, xb, z, z1, z2; + + xa = abs(x); + auto big = xa > 0.5f; // boolean vector + + x1 = 0.5f * (1.0f - xa); + x2 = xa * xa; + x3 = select(big, x1, x2); + + //if (horizontal_or(big)) + { + xb = sqrt(x1); + } + x4 = select(big, xb, xa); + + z = polynomial_4(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); + z = mul_add(z, x3*x4, x4); // z = z * (x3*x4) + x4; + z1 = z + z; + + // correct for sign + if constexpr (AC == 1) { // acos + z1 = select(x < 0., float(VM_PI) - z1, z1); + z2 = float(VM_PI_2) - sign_combine(z, x); + z = select(big, z1, z2); + } + else { // asin + z1 = float(VM_PI_2) - z1; + z = select(big, z1, z); + z = sign_combine(z, x); + } + + return z; +} + +// instantiations of asin_f template: + +static inline Vec4f asin(Vec4f const x) { + return asin_f(x); +} + +static inline Vec4f acos(Vec4f const x) { + return asin_f(x); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f asin(Vec8f const x) { + return asin_f(x); +} +static inline Vec8f acos(Vec8f const x) { + return asin_f(x); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f asin(Vec16f const x) { + return asin_f(x); +} +static inline Vec16f acos(Vec16f const x) { + return asin_f(x); +} +#endif // MAX_VECTOR_SIZE >= 512 + + +// ************************************************************* +// atan template, double precision +// ************************************************************* +// Template parameters: +// VTYPE: f.p. vector type +// T2: 0 = atan, 1 = atan2 +// Paramterers: +// y, x. 
calculate tan(y/x) +// result is between -pi/2 and +pi/2 when x > 0 +// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2 +template +static inline VTYPE atan_d(VTYPE const y, VTYPE const x) { + + // define constants + //const double ONEOPIO4 = 4./VM_PI; + const double MOREBITS = 6.123233995736765886130E-17; + const double MOREBITSO2 = MOREBITS * 0.5; + const double T3PO8 = VM_SQRT2 + 1.; // 2.41421356237309504880; + + const double P4atan = -8.750608600031904122785E-1; + const double P3atan = -1.615753718733365076637E1; + const double P2atan = -7.500855792314704667340E1; + const double P1atan = -1.228866684490136173410E2; + const double P0atan = -6.485021904942025371773E1; + + const double Q4atan = 2.485846490142306297962E1; + const double Q3atan = 1.650270098316988542046E2; + const double Q2atan = 4.328810604912902668951E2; + const double Q1atan = 4.853903996359136964868E2; + const double Q0atan = 1.945506571482613964425E2; + + typedef decltype (x > x) BVTYPE; // boolean vector type + VTYPE t, x1, x2, y1, y2, s, fac, a, b, z, zz, px, qx, re; // data vectors + BVTYPE swapxy, notbig, notsmal; // boolean vectors + + if constexpr (T2 == 1) { // atan2(y,x) + // move in first octant + x1 = abs(x); + y1 = abs(y); + swapxy = (y1 > x1); + // swap x and y if y1 > x1 + x2 = select(swapxy, y1, x1); + y2 = select(swapxy, x1, y1); + + // check for special case: x and y are both +/- INF + BVTYPE both_infinite = is_inf(x) & is_inf(y); // x and Y are both infinite + if (horizontal_or(both_infinite)) { // at least one element has both infinite + VTYPE mone = VTYPE(-1.0); + x2 = select(both_infinite, x2 & mone, x2); // get 1.0 with the sign of x + y2 = select(both_infinite, y2 & mone, y2); // get 1.0 with the sign of y + } + + t = y2 / x2; // x = y = 0 gives NAN here + } + else { // atan(y) + t = abs(y); + } + + // small: t < 0.66 + // medium: 0.66 <= t <= 2.4142 (1+sqrt(2)) + // big: t > 2.4142 + notbig = t <= T3PO8; // t <= 2.4142 + notsmal = t >= 0.66; // t >= 0.66 + + s = select(notbig, VTYPE(VM_PI_4), VTYPE(VM_PI_2)); + s = notsmal & s; // select(notsmal, s, 0.); + fac = select(notbig, VTYPE(MOREBITSO2), VTYPE(MOREBITS)); + fac = notsmal & fac; //select(notsmal, fac, 0.); + + // small: z = t / 1.0; + // medium: z = (t-1.0) / (t+1.0); + // big: z = -1.0 / t; + a = notbig & t; // select(notbig, t, 0.); + a = if_add(notsmal, a, -1.); + b = notbig & VTYPE(1.); // select(notbig, 1., 0.); + b = if_add(notsmal, b, t); + z = a / b; // division by 0 will not occur unless x and y are both 0 + + zz = z * z; + + px = polynomial_4(zz, P0atan, P1atan, P2atan, P3atan, P4atan); + qx = polynomial_5n(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan); + + re = mul_add(px / qx, z * zz, z); // re = (px / qx) * (z * zz) + z; + re += s + fac; + + if constexpr (T2 == 1) { // atan2(y,x) + // move back in place + re = select(swapxy, VM_PI_2 - re, re); + re = select((x | y) == 0., 0., re); // atan2(0,0) = 0 by convention + re = select(sign_bit(x), VM_PI - re, re);// also for x = -0. 
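// Worked example for the fixups above: atan2(1., -1.) reduces to t = 1 in the
// medium range, so z = (t-1)/(t+1) = 0 and re ≈ VM_PI_4; because sign_bit(x)
// is set, re becomes VM_PI - VM_PI_4 = 3π/4, and the sign_combine below keeps
// it positive since y > 0, i.e. atan2(1, -1) ≈ 2.356.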
+ } + // get sign bit + re = sign_combine(re, y); + + return re; +} + +// instantiations of atan_d template: + +static inline Vec2d atan2(Vec2d const y, Vec2d const x) { + return atan_d(y, x); +} + +static inline Vec2d atan(Vec2d const y) { + return atan_d(y, 0.); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec4d atan2(Vec4d const y, Vec4d const x) { + return atan_d(y, x); +} + +static inline Vec4d atan(Vec4d const y) { + return atan_d(y, 0.); +} +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec8d atan2(Vec8d const y, Vec8d const x) { + return atan_d(y, x); +} + +static inline Vec8d atan(Vec8d const y) { + return atan_d(y, 0.); +} +#endif // MAX_VECTOR_SIZE >= 512 + + + +// ************************************************************* +// atan template, single precision +// ************************************************************* +// Template parameters: +// VTYPE: f.p. vector type +// T2: 0 = atan, 1 = atan2 +// Paramterers: +// y, x. calculate tan(y/x) +// result is between -pi/2 and +pi/2 when x > 0 +// result is between -pi and -pi/2 or between pi/2 and pi when x < 0 for atan2 +template +static inline VTYPE atan_f(VTYPE const y, VTYPE const x) { + + // define constants + const float P3atanf = 8.05374449538E-2f; + const float P2atanf = -1.38776856032E-1f; + const float P1atanf = 1.99777106478E-1f; + const float P0atanf = -3.33329491539E-1f; + + typedef decltype (x > x) BVTYPE; // boolean vector type + VTYPE t, x1, x2, y1, y2, s, a, b, z, zz, re;// data vectors + BVTYPE swapxy, notbig, notsmal; // boolean vectors + + if constexpr (T2 == 1) { // atan2(y,x) + // move in first octant + x1 = abs(x); + y1 = abs(y); + swapxy = (y1 > x1); + // swap x and y if y1 > x1 + x2 = select(swapxy, y1, x1); + y2 = select(swapxy, x1, y1); + + // check for special case: x and y are both +/- INF + BVTYPE both_infinite = is_inf(x) & is_inf(y); // x and Y are both infinite + if (horizontal_or(both_infinite)) { // at least one element has both infinite + VTYPE mone = VTYPE(-1.0f); + x2 = select(both_infinite, x2 & mone, x2); // get 1.0 with the sign of x + y2 = select(both_infinite, y2 & mone, y2); // get 1.0 with the sign of y + } + + // x = y = 0 will produce NAN. 
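The atan2 wrappers instantiated above fold the both-infinite case to +/-1 before dividing and force atan2(0, 0) to 0 by convention, as the comments note. A small usage sketch comparing a couple of lanes against std::atan2; the include path below is an assumption, substitute whatever header this hunk actually lives in:

#include <cmath>
#include <cstdio>
#include "vectormath_trig.h"   // assumed path of the header added by this hunk

int main() {
    Vec2d y(1.0, -3.0), x(-2.0, 0.5);
    double ry[2], rx[2], r[2];
    y.store(ry);
    x.store(rx);
    atan2(y, x).store(r);
    for (int i = 0; i < 2; ++i)
        std::printf("lane %d: %+.17g  (std::atan2: %+.17g)\n",
                    i, r[i], std::atan2(ry[i], rx[i]));

    double z[2];
    atan2(Vec2d(0.0), Vec2d(0.0)).store(z);   // 0 by convention, per the comment above
    std::printf("atan2(0, 0) = %g\n", z[0]);
    return 0;
}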
No problem, fixed below + t = y2 / x2; + } + else { // atan(y) + t = abs(y); + } + + // small: t < 0.4142 + // medium: 0.4142 <= t <= 2.4142 + // big: t > 2.4142 (not for atan2) + if constexpr (T2 == 0) { // atan(y) + notsmal = t >= float(VM_SQRT2 - 1.); // t >= tan pi/8 + notbig = t <= float(VM_SQRT2 + 1.); // t <= tan 3pi/8 + + s = select(notbig, VTYPE(float(VM_PI_4)), VTYPE(float(VM_PI_2))); + s = notsmal & s; // select(notsmal, s, 0.); + + // small: z = t / 1.0; + // medium: z = (t-1.0) / (t+1.0); + // big: z = -1.0 / t; + a = notbig & t; // select(notbig, t, 0.); + a = if_add(notsmal, a, -1.f); + b = notbig & VTYPE(1.f); // select(notbig, 1., 0.); + b = if_add(notsmal, b, t); + z = a / b; // division by 0 will not occur unless x and y are both 0 + } + else { // atan2(y,x) + // small: z = t / 1.0; + // medium: z = (t-1.0) / (t+1.0); + notsmal = t >= float(VM_SQRT2 - 1.); + a = if_add(notsmal, t, -1.f); + b = if_add(notsmal, 1.f, t); + s = notsmal & VTYPE(float(VM_PI_4)); + z = a / b; + } + + zz = z * z; + + // Taylor expansion + re = polynomial_3(zz, P0atanf, P1atanf, P2atanf, P3atanf); + re = mul_add(re, zz * z, z) + s; + + if constexpr (T2 == 1) { // atan2(y,x) + // move back in place + re = select(swapxy, float(VM_PI_2) - re, re); + re = select((x | y) == 0.f, 0.f, re); // atan2(0,+0) = 0 by convention + re = select(sign_bit(x), float(VM_PI) - re, re); // also for x = -0. + } + // get sign bit + re = sign_combine(re, y); + + return re; +} + +// instantiations of atan_f template: + +static inline Vec4f atan2(Vec4f const y, Vec4f const x) { + return atan_f(y, x); +} + +static inline Vec4f atan(Vec4f const y) { + return atan_f(y, 0.); +} + +#if MAX_VECTOR_SIZE >= 256 +static inline Vec8f atan2(Vec8f const y, Vec8f const x) { + return atan_f(y, x); +} + +static inline Vec8f atan(Vec8f const y) { + return atan_f(y, 0.); +} + +#endif // MAX_VECTOR_SIZE >= 256 + +#if MAX_VECTOR_SIZE >= 512 +static inline Vec16f atan2(Vec16f const y, Vec16f const x) { + return atan_f(y, x); +} + +static inline Vec16f atan(Vec16f const y) { + return atan_f(y, 0.); +} + +#endif // MAX_VECTOR_SIZE >= 512 + +#ifdef VCL_NAMESPACE +} +#endif + +#endif diff --git a/DFTTest/vectorclass/instrset.h b/DFTTest/vectorclass/instrset.h deleted file mode 100644 index 9578147..0000000 --- a/DFTTest/vectorclass/instrset.h +++ /dev/null @@ -1,216 +0,0 @@ -/**************************** instrset.h ********************************** -* Author: Agner Fog -* Date created: 2012-05-30 -* Last modified: 2016-11-25 -* Version: 1.25 -* Project: vector classes -* Description: -* Header file for various compiler-specific tasks and other common tasks to -* vector class library: -* > selects the supported instruction set -* > defines integer types -* > defines compiler version macros -* > undefines certain macros that prevent function overloading -* > defines template class to represent compile-time integer constant -* > defines template for compile-time error messages -* -* (c) Copyright 2012-2016 GNU General Public License www.gnu.org/licenses -******************************************************************************/ - -#ifndef INSTRSET_H -#define INSTRSET_H 125 - -// Detect 64 bit mode -#if (defined(_M_AMD64) || defined(_M_X64) || defined(__amd64) ) && ! 
defined(__x86_64__) -#define __x86_64__ 1 // There are many different macros for this, decide on only one -#endif - -// Find instruction set from compiler macros if INSTRSET not defined -// Note: Most of these macros are not defined in Microsoft compilers -#ifndef INSTRSET -#if defined ( __AVX512F__ ) || defined ( __AVX512__ ) -#define INSTRSET 9 -#elif defined ( __AVX2__ ) -#define INSTRSET 8 -#elif defined ( __AVX__ ) -#define INSTRSET 7 -#elif defined ( __SSE4_2__ ) -#define INSTRSET 6 -#elif defined ( __SSE4_1__ ) -#define INSTRSET 5 -#elif defined ( __SSSE3__ ) -#define INSTRSET 4 -#elif defined ( __SSE3__ ) -#define INSTRSET 3 -#elif defined ( __SSE2__ ) || defined ( __x86_64__ ) -#define INSTRSET 2 -#elif defined ( __SSE__ ) -#define INSTRSET 1 -#elif defined ( _M_IX86_FP ) // Defined in MS compiler. 1: SSE, 2: SSE2 -#define INSTRSET _M_IX86_FP -#else -#define INSTRSET 0 -#endif // instruction set defines -#endif // INSTRSET - -// Include the appropriate header file for intrinsic functions -#if INSTRSET > 7 // AVX2 and later -#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) -#include // x86intrin.h includes header files for whatever instruction - // sets are specified on the compiler command line, such as: - // xopintrin.h, fma4intrin.h -#else -#include // MS version of immintrin.h covers AVX, AVX2 and FMA3 -#endif // __GNUC__ -#elif INSTRSET == 7 -#include // AVX -#elif INSTRSET == 6 -#include // SSE4.2 -#elif INSTRSET == 5 -#include // SSE4.1 -#elif INSTRSET == 4 -#include // SSSE3 -#elif INSTRSET == 3 -#include // SSE3 -#elif INSTRSET == 2 -#include // SSE2 -#elif INSTRSET == 1 -#include // SSE -#endif // INSTRSET - -#if INSTRSET >= 8 && !defined(__FMA__) -// Assume that all processors that have AVX2 also have FMA3 -#if defined (__GNUC__) && ! defined (__INTEL_COMPILER) && ! defined (__clang__) -// Prevent error message in g++ when using FMA intrinsics with avx2: -#pragma message "It is recommended to specify also option -mfma when using -mavx2 or higher" -#else -#define __FMA__ 1 -#endif -#endif - -// AMD instruction sets -#if defined (__XOP__) || defined (__FMA4__) -#ifdef __GNUC__ -#include // AMD XOP (Gnu) -#else -#include // AMD XOP (Microsoft) -#endif // __GNUC__ -#elif defined (__SSE4A__) // AMD SSE4A -#include -#endif // __XOP__ - -// FMA3 instruction set -#if defined (__FMA__) && (defined(__GNUC__) || defined(__clang__)) && ! 
defined (__INTEL_COMPILER) -#include -#endif // __FMA__ - -// FMA4 instruction set -#if defined (__FMA4__) && (defined(__GNUC__) || defined(__clang__)) -#include // must have both x86intrin.h and fma4intrin.h, don't know why -#endif // __FMA4__ - - -// Define integer types with known size -#if defined(__GNUC__) || defined(__clang__) || (defined(_MSC_VER) && _MSC_VER >= 1600) - // Compilers supporting C99 or C++0x have stdint.h defining these integer types - #include -#elif defined(_MSC_VER) - // Older Microsoft compilers have their own definitions - typedef signed __int8 int8_t; - typedef unsigned __int8 uint8_t; - typedef signed __int16 int16_t; - typedef unsigned __int16 uint16_t; - typedef signed __int32 int32_t; - typedef unsigned __int32 uint32_t; - typedef signed __int64 int64_t; - typedef unsigned __int64 uint64_t; - #ifndef _INTPTR_T_DEFINED - #define _INTPTR_T_DEFINED - #ifdef __x86_64__ - typedef int64_t intptr_t; - #else - typedef int32_t intptr_t; - #endif - #endif -#else - // This works with most compilers - typedef signed char int8_t; - typedef unsigned char uint8_t; - typedef signed short int int16_t; - typedef unsigned short int uint16_t; - typedef signed int int32_t; - typedef unsigned int uint32_t; - typedef long long int64_t; - typedef unsigned long long uint64_t; - #ifdef __x86_64__ - typedef int64_t intptr_t; - #else - typedef int32_t intptr_t; - #endif -#endif - -#include // define abs(int) - -#ifdef _MSC_VER // Microsoft compiler or compatible Intel compiler -#include // define _BitScanReverse(int), __cpuid(int[4],int), _xgetbv(int) -#endif // _MSC_VER - -// functions in instrset_detect.cpp -#ifdef VCL_NAMESPACE -namespace VCL_NAMESPACE { -#endif - int instrset_detect(void); // tells which instruction sets are supported - bool hasFMA3(void); // true if FMA3 instructions supported - bool hasFMA4(void); // true if FMA4 instructions supported - bool hasXOP(void); // true if XOP instructions supported - bool hasAVX512ER(void); // true if AVX512ER instructions supported -#ifdef VCL_NAMESPACE -} -#endif - -// GCC version -#if defined(__GNUC__) && !defined (GCC_VERSION) && !defined (__clang__) -#define GCC_VERSION ((__GNUC__) * 10000 + (__GNUC_MINOR__) * 100 + (__GNUC_PATCHLEVEL__)) -#endif - -// Clang version -#if defined (__clang__) -#define CLANG_VERSION ((__clang_major__) * 10000 + (__clang_minor__) * 100 + (__clang_patchlevel__)) -// Problem: The version number is not consistent across platforms -// http://llvm.org/bugs/show_bug.cgi?id=12643 -// Apple bug 18746972 -#endif - -// Fix problem with non-overloadable macros named min and max in WinDef.h -#ifdef _MSC_VER -#if defined (_WINDEF_) && defined(min) && defined(max) -#undef min -#undef max -#endif -#ifndef NOMINMAX -#define NOMINMAX -#endif -#endif - -#ifdef VCL_NAMESPACE -namespace VCL_NAMESPACE { -#endif - // Template class to represent compile-time integer constant - template class Const_int_t {}; // represent compile-time signed integer constant - template class Const_uint_t {}; // represent compile-time unsigned integer constant - #define const_int(n) (Const_int_t ()) // n must be compile-time integer constant - #define const_uint(n) (Const_uint_t()) // n must be compile-time unsigned integer constant - - // Template for compile-time error messages - template class Static_error_check { - public: Static_error_check() {}; - }; - template <> class Static_error_check { // generate compile-time error if false - private: Static_error_check() {}; - }; -#ifdef VCL_NAMESPACE -} -#endif - - -#endif // INSTRSET_H diff 
--git a/DFTTest/vectorclass/license.txt b/DFTTest/vectorclass/license.txt deleted file mode 100644 index bc08fe2..0000000 --- a/DFTTest/vectorclass/license.txt +++ /dev/null @@ -1,619 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. - - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. 
-States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. - - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. 
- - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. 
- - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. 
- - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. - - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. 
- - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. 
- - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. - - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. 
If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). 
To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. - - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. 
- - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. 
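Returning to the instrset.h removal above: that header mapped compiler macros to a single INSTRSET level (0 = generic through 9 = AVX-512F) and declared runtime probes such as instrset_detect(). A compile-time sketch of the same ladder, under an assumed macro name so it cannot collide with the real header; note that most of these macros are not defined by MSVC, where the original falls back to _M_IX86_FP:

#include <cstdio>

// DETECTED_ISA is a hypothetical name; the removed header defines INSTRSET
// with the same meaning (2 = SSE2, 5 = SSE4.1, 7 = AVX, 8 = AVX2, 9 = AVX-512F).
#if defined(__AVX512F__) || defined(__AVX512__)
  #define DETECTED_ISA 9
#elif defined(__AVX2__)
  #define DETECTED_ISA 8
#elif defined(__AVX__)
  #define DETECTED_ISA 7
#elif defined(__SSE4_2__)
  #define DETECTED_ISA 6
#elif defined(__SSE4_1__)
  #define DETECTED_ISA 5
#elif defined(__SSSE3__)
  #define DETECTED_ISA 4
#elif defined(__SSE3__)
  #define DETECTED_ISA 3
#elif defined(__SSE2__) || defined(__x86_64__)
  #define DETECTED_ISA 2
#elif defined(__SSE__)
  #define DETECTED_ISA 1
#else
  #define DETECTED_ISA 0
#endif

int main() {
    std::printf("compile-time instruction set level: %d\n", DETECTED_ISA);
    return 0;
}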
diff --git a/DFTTest/vectorclass/vectorclass.h b/DFTTest/vectorclass/vectorclass.h deleted file mode 100644 index 3f687bd..0000000 --- a/DFTTest/vectorclass/vectorclass.h +++ /dev/null @@ -1,69 +0,0 @@ -/**************************** vectorclass.h ******************************** -* Author: Agner Fog -* Date created: 2012-05-30 -* Last modified: 2017-05-10 -* Version: 1.29 -* Project: vector classes -* Description: -* Header file defining vector classes as interface to intrinsic functions -* in x86 microprocessors with SSE2 and later instruction sets up to AVX512. -* -* Instructions: -* Use Gnu, Clang, Intel or Microsoft C++ compiler. Compile for the desired -* instruction set, which must be at least SSE2. Specify the supported -* instruction set by a command line define, e.g. __SSE4_1__ if the -* compiler does not automatically do so. -* -* Each vector object is represented internally in the CPU as a vector -* register with 128, 256 or 512 bits. -* -* This header file includes the appropriate header files depending on the -* supported instruction set -* -* For detailed instructions, see VectorClass.pdf -* -* (c) Copyright 2012-2017 GNU General Public License www.gnu.org/licenses -******************************************************************************/ -#ifndef VECTORCLASS_H -#define VECTORCLASS_H 129 - -// Maximum vector size, bits. Allowed values are 128, 256, 512 -#ifndef MAX_VECTOR_SIZE -#define MAX_VECTOR_SIZE 256 -#endif - -#include "instrset.h" // Select supported instruction set - -#if INSTRSET < 2 // SSE2 required - #error Please compile for the SSE2 instruction set or higher -#else - -#include "vectori128.h" // 128-bit integer vectors -#include "vectorf128.h" // 128-bit floating point vectors - -#if MAX_VECTOR_SIZE >= 256 -#if INSTRSET >= 8 - #include "vectori256.h" // 256-bit integer vectors, requires AVX2 instruction set -#else - #include "vectori256e.h" // 256-bit integer vectors, emulated -#endif // INSTRSET >= 8 -#if INSTRSET >= 7 - #include "vectorf256.h" // 256-bit floating point vectors, requires AVX instruction set -#else - #include "vectorf256e.h" // 256-bit floating point vectors, emulated -#endif // INSTRSET >= 7 -#endif // MAX_VECTOR_SIZE >= 256 - -#if MAX_VECTOR_SIZE >= 512 -#if INSTRSET >= 9 - #include "vectori512.h" // 512-bit integer vectors, requires AVX512 instruction set - #include "vectorf512.h" // 512-bit floating point vectors, requires AVX512 instruction set -#else - #include "vectori512e.h" // 512-bit integer vectors, emulated - #include "vectorf512e.h" // 512-bit floating point vectors, emulated -#endif // INSTRSET >= 9 -#endif // MAX_VECTOR_SIZE >= 512 - -#endif // INSTRSET >= 2 - -#endif // VECTORCLASS_H diff --git a/DFTTest/vectorclass/vectorf128.h b/DFTTest/vectorclass/vectorf128.h deleted file mode 100644 index 7cfec52..0000000 --- a/DFTTest/vectorclass/vectorf128.h +++ /dev/null @@ -1,2779 +0,0 @@ -/**************************** vectorf128.h ******************************* -* Author: Agner Fog -* Date created: 2012-05-30 -* Last modified: 2017-05-10 -* Version: 1.29 -* Project: vector classes -* Description: -* Header file defining floating point vector classes as interface to -* intrinsic functions in x86 microprocessors with SSE2 and later instruction -* sets up to AVX. -* -* Instructions: -* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired -* instruction set, which must be at least SSE2. Specify the supported -* instruction set by a command line define, e.g. __SSE4_1__ if the -* compiler does not automatically do so. 
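The removed vectorclass.h above selects the 128-, 256- and 512-bit headers from MAX_VECTOR_SIZE (default 256) and INSTRSET, falling back to the emulated variants when the instruction set is too low. A minimal sketch of opting into the 512-bit types by defining the macro before the include; the include path is an assumption:

#define MAX_VECTOR_SIZE 512     // must come before the include; the header defaults to 256
#include "vectorclass.h"        // the removed header shown above; path is an assumption
#include <cstdio>

int main() {
    Vec16f v(1.0f);             // 512-bit float vector: native with AVX-512, otherwise emulated
    std::printf("Vec16f carries %d elements\n", Vec16f::size());
    return 0;
}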
-* -* The following vector classes are defined here: -* Vec4f Vector of 4 single precision floating point numbers -* Vec4fb Vector of 4 Booleans for use with Vec4f -* Vec2d Vector of 2 double precision floating point numbers -* Vec2db Vector of 2 Booleans for use with Vec2d -* -* Each vector object is represented internally in the CPU as a 128-bit register. -* This header file defines operators and functions for these vectors. -* -* For example: -* Vec2d a(1.0, 2.0), b(3.0, 4.0), c; -* c = a + b; // now c contains (4.0, 6.0) -* -* For detailed instructions, see VectorClass.pdf -* -* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses -*****************************************************************************/ -#ifndef VECTORF128_H -#define VECTORF128_H - -#if defined _MSC_VER && _MSC_VER >= 1800 -// solve problem with ambiguous overloading of pow function in Microsoft math.h: -// make sure math.h is included first rather than last -#include -#endif - -#include "vectori128.h" // Define integer vectors - -#ifdef VCL_NAMESPACE -namespace VCL_NAMESPACE { -#endif - -/***************************************************************************** -* -* select functions -* -*****************************************************************************/ -// Select between two __m128 sources, element by element. Used in various functions -// and operators. Corresponds to this pseudocode: -// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; -// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are -// allowed. The implementation depends on the instruction set: -// If SSE4.1 is supported then only bit 31 in each dword of s is checked, -// otherwise all bits in s are used. -static inline __m128 selectf (__m128 const & s, __m128 const & a, __m128 const & b) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_blendv_ps (b, a, s); -#else - return _mm_or_ps( - _mm_and_ps(s,a), - _mm_andnot_ps(s,b)); -#endif -} - -// Same, with two __m128d sources. -// and operators. Corresponds to this pseudocode: -// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; -// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other -// values are allowed. The implementation depends on the instruction set: -// If SSE4.1 is supported then only bit 63 in each dword of s is checked, -// otherwise all bits in s are used. 
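selectf above blends two __m128 values lane by lane: a single _mm_blendv_ps when SSE4.1 is available, otherwise the and/andnot/or combination. A scalar sketch of that fallback on one 32-bit lane (select_scalar is an illustrative name, not part of the library):

#include <cstdint>
#include <cstdio>
#include <cstring>

// One lane of the SSE2 fallback in selectf: result = (s & a) | (~s & b),
// where s is an all-ones or all-zeros mask.
static float select_scalar(bool s, float a, float b) {
    uint32_t ua, ub, mask = s ? 0xFFFFFFFFu : 0u;
    std::memcpy(&ua, &a, sizeof(float));
    std::memcpy(&ub, &b, sizeof(float));
    uint32_t r = (mask & ua) | (~mask & ub);   // _mm_and_ps / _mm_andnot_ps / _mm_or_ps
    float f;
    std::memcpy(&f, &r, sizeof(float));
    return f;
}

int main() {
    std::printf("%g %g\n", select_scalar(true, 1.5f, -2.5f),    // 1.5
                           select_scalar(false, 1.5f, -2.5f));  // -2.5
    return 0;
}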
-static inline __m128d selectd (__m128d const & s, __m128d const & a, __m128d const & b) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_blendv_pd (b, a, s); -#else - return _mm_or_pd( - _mm_and_pd(s,a), - _mm_andnot_pd(s,b)); -#endif -} - - -/***************************************************************************** -* -* Vec4fb: Vector of 4 Booleans for use with Vec4f -* -*****************************************************************************/ - -class Vec4fb { -protected: - __m128 xmm; // Float vector -public: - // Default constructor: - Vec4fb() { - } - // Constructor to build from all elements: - Vec4fb(bool b0, bool b1, bool b2, bool b3) { - xmm = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3)); - } - // Constructor to convert from type __m128 used in intrinsics: - Vec4fb(__m128 const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128 used in intrinsics: - Vec4fb & operator = (__m128 const & x) { - xmm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec4fb(bool b) { - xmm = _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b))); - } - // Assignment operator to broadcast scalar value: - Vec4fb & operator = (bool b) { - *this = Vec4fb(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec4fb(int b); - Vec4fb & operator = (int x); -public: - // Constructor to convert from type Vec4ib used as Boolean for integer vectors - Vec4fb(Vec4ib const & x) { - xmm = _mm_castsi128_ps(x); - } - // Assignment operator to convert from type Vec4ib used as Boolean for integer vectors - Vec4fb & operator = (Vec4ib const & x) { - xmm = _mm_castsi128_ps(x); - return *this; - } - // Type cast operator to convert to __m128 used in intrinsics - operator __m128() const { - return xmm; - } - /* Clang problem: - The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128f as identical. - I have reported this problem in 2013 but it is still not fixed in 2017! - See the bug report at http://llvm.org/bugs/show_bug.cgi?id=17164 - Additional problem: The version number is not consistent across platforms. The Apple build has - different version numbers. We have to rely on __apple_build_version__ on the Mac platform: - http://llvm.org/bugs/show_bug.cgi?id=12643 - I have received reports that there was no aliasing of vector types on __apple_build_version__ = 6020053 - but apparently the problem has come back. The aliasing of vector types has been reported on - __apple_build_version__ = 8000042 - We have to make switches here when - hopefully - the error some day has been fixed. - We need different version checks with and whithout __apple_build_version__ - */ - -//#if (defined (__clang__) && !defined(__apple_build_version__)) || (defined(__apple_build_version__) && __apple_build_version__ < 6020000) -#if defined (__clang__) /* && CLANG_VERSION < xxxxx */ || defined(__apple_build_version__) -#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY -#else - // Type cast operator to convert to type Vec4ib used as Boolean for integer vectors - operator Vec4ib() const { - return _mm_castps_si128(xmm); - } -#endif - // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec4fb const & insert(uint32_t index, bool value) { - static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0}; - __m128 mask = _mm_loadu_ps((float const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position - if (value) { - xmm = _mm_or_ps(xmm,mask); - } - else { - xmm = _mm_andnot_ps(mask,xmm); - } - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - //return Vec4ib(*this).extract(index); - return Vec4ib(_mm_castps_si128(xmm)).extract(index); - } - // Extract a single element. Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 4; - } -}; - - -/***************************************************************************** -* -* Operators for Vec4fb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec4fb operator & (Vec4fb const & a, Vec4fb const & b) { - return _mm_and_ps(a, b); -} -static inline Vec4fb operator && (Vec4fb const & a, Vec4fb const & b) { - return a & b; -} - -// vector operator &= : bitwise and -static inline Vec4fb & operator &= (Vec4fb & a, Vec4fb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec4fb operator | (Vec4fb const & a, Vec4fb const & b) { - return _mm_or_ps(a, b); -} -static inline Vec4fb operator || (Vec4fb const & a, Vec4fb const & b) { - return a | b; -} - -// vector operator |= : bitwise or -static inline Vec4fb & operator |= (Vec4fb & a, Vec4fb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4fb operator ^ (Vec4fb const & a, Vec4fb const & b) { - return _mm_xor_ps(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec4fb & operator ^= (Vec4fb & a, Vec4fb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec4fb operator ~ (Vec4fb const & a) { - return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1))); -} - -// vector operator ! : logical not -// (operator ! is less efficient than operator ~. Use only where not -// all bits in an element are the same) -static inline Vec4fb operator ! (Vec4fb const & a) { - return Vec4fb( ! Vec4ib(a)); -} - -// Functions for Vec4fb - -// andnot: a & ~ b -static inline Vec4fb andnot(Vec4fb const & a, Vec4fb const & b) { - return _mm_andnot_ps(b, a); -} - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - -// horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec4fb const & a) { - return _mm_movemask_ps(a) == 0x0F; - //return horizontal_and(Vec128b(_mm_castps_si128(a))); -} - -// horizontal_or. 
Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec4fb const & a) { - return _mm_movemask_ps(a) != 0; - //return horizontal_or(Vec128b(_mm_castps_si128(a))); -} - - -/***************************************************************************** -* -* Vec2db: Vector of 2 Booleans for use with Vec2d -* -*****************************************************************************/ - -class Vec2db { -protected: - __m128d xmm; // Double vector -public: - // Default constructor: - Vec2db() { - } - // Constructor to broadcast the same value into all elements: - // Constructor to build from all elements: - Vec2db(bool b0, bool b1) { - xmm = _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); - } - // Constructor to convert from type __m128d used in intrinsics: - Vec2db(__m128d const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128d used in intrinsics: - Vec2db & operator = (__m128d const & x) { - xmm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec2db(bool b) { - xmm = _mm_castsi128_pd(_mm_set1_epi32(-int32_t(b))); - } - // Assignment operator to broadcast scalar value: - Vec2db & operator = (bool b) { - *this = Vec2db(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec2db(int b); - Vec2db & operator = (int x); -public: - // Constructor to convert from type Vec2qb used as Boolean for integer vectors - Vec2db(Vec2qb const & x) { - xmm = _mm_castsi128_pd(x); - } - // Assignment operator to convert from type Vec2qb used as Boolean for integer vectors - Vec2db & operator = (Vec2qb const & x) { - xmm = _mm_castsi128_pd(x); - return *this; - } - // Type cast operator to convert to __m128d used in intrinsics - operator __m128d() const { - return xmm; - } -#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY - // Type cast operator to convert to type Vec2qb used as Boolean for integer vectors - operator Vec2qb() const { - return _mm_castpd_si128(xmm); - } -#endif - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec2db const & insert(uint32_t index, bool value) { - static const int32_t maskl[8] = {0,0,0,0,-1,-1,0,0}; - __m128 mask = _mm_loadu_ps((float const*)(maskl+4-(index&1)*2)); // mask with FFFFFFFFFFFFFFFF at index position - if (value) { - xmm = _mm_or_pd(xmm,_mm_castps_pd(mask)); - } - else { - xmm = _mm_andnot_pd(_mm_castps_pd(mask),xmm); - } - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec2qb(*this).extract(index); - } - // Extract a single element. Operator [] can only read an element, not write. 
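horizontal_and and horizontal_or above collapse a boolean vector to a single flag via the movemask intrinsics; the math templates earlier use the same horizontal_or(...) test to skip the both-infinite fix-up when no lane needs it. A small usage sketch, assuming the vector headers are on the include path:

#include <cstdio>
#include "vectorclass.h"   // path is an assumption

int main() {
    Vec4f x(0.25f, 0.75f, -0.1f, 0.6f);
    Vec4fb big = abs(x) > 0.5f;        // per-lane boolean vector
    if (horizontal_or(big))            // at least one lane exceeds 0.5
        std::printf("some lane takes the |x| > 0.5 path\n");
    if (!horizontal_and(big))          // not every lane exceeds 0.5
        std::printf("but not every lane does\n");
    return 0;
}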
- bool operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 2; - } -}; - - -/***************************************************************************** -* -* Operators for Vec2db -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec2db operator & (Vec2db const & a, Vec2db const & b) { - return _mm_and_pd(a, b); -} -static inline Vec2db operator && (Vec2db const & a, Vec2db const & b) { - return a & b; -} - -// vector operator &= : bitwise and -static inline Vec2db & operator &= (Vec2db & a, Vec2db const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec2db operator | (Vec2db const & a, Vec2db const & b) { - return _mm_or_pd(a, b); -} -static inline Vec2db operator || (Vec2db const & a, Vec2db const & b) { - return a | b; -} - -// vector operator |= : bitwise or -static inline Vec2db & operator |= (Vec2db & a, Vec2db const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec2db operator ^ (Vec2db const & a, Vec2db const & b) { - return _mm_xor_pd(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec2db & operator ^= (Vec2db & a, Vec2db const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec2db operator ~ (Vec2db const & a) { - return _mm_xor_pd(a, _mm_castsi128_pd(_mm_set1_epi32(-1))); -} - -// vector operator ! : logical not -// (operator ! is less efficient than operator ~. Use only where not -// all bits in an element are the same) -static inline Vec2db operator ! (Vec2db const & a) { - return Vec2db (! Vec2qb(a)); -} - -// Functions for Vec2db - -// andnot: a & ~ b -static inline Vec2db andnot(Vec2db const & a, Vec2db const & b) { - return _mm_andnot_pd(b, a); -} - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - -// horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec2db const & a) { - return _mm_movemask_pd(a) == 3; - //return horizontal_and(Vec128b(_mm_castpd_si128(a))); -} - -// horizontal_or. 
Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec2db const & a) { - return _mm_movemask_pd(a) != 0; - //return horizontal_or(Vec128b(_mm_castpd_si128(a))); -} - - - -/***************************************************************************** -* -* Vec4f: Vector of 4 single precision floating point values -* -*****************************************************************************/ - -class Vec4f { -protected: - __m128 xmm; // Float vector -public: - // Default constructor: - Vec4f() { - } - // Constructor to broadcast the same value into all elements: - Vec4f(float f) { - xmm = _mm_set1_ps(f); - } - // Constructor to build from all elements: - Vec4f(float f0, float f1, float f2, float f3) { - xmm = _mm_setr_ps(f0, f1, f2, f3); - } - // Constructor to convert from type __m128 used in intrinsics: - Vec4f(__m128 const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128 used in intrinsics: - Vec4f & operator = (__m128 const & x) { - xmm = x; - return *this; - } - // Type cast operator to convert to __m128 used in intrinsics - operator __m128() const { - return xmm; - } - // Member function to load from array (unaligned) - Vec4f & load(void const * p) { - xmm = _mm_loadu_ps((float const*)p); - return *this; - } - // Member function to load from array, aligned by 16 - // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1, - // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. - // You may use load_a instead of load if you are certain that p points to an address - // divisible by 16. - Vec4f & load_a(void const * p) { - xmm = _mm_load_ps((float const*)p); - return *this; - } - // Member function to store into array (unaligned) - void store(float * p) const { - _mm_storeu_ps(p, xmm); - } - // Member function to store into array, aligned by 16 - // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1, - // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. - // You may use store_a instead of store if you are certain that p points to an address - // divisible by 16. - void store_a(float * p) const { - _mm_store_ps(p, xmm); - } - // Member function to store into array using a non-temporal memory hint, aligned by 16 - void stream(float * p) const { - _mm_stream_ps(p, xmm); - } - // Partial load. Load n elements and set the rest to 0 - Vec4f & load_partial(int n, float const * p) { - __m128 t1, t2; - switch (n) { - case 1: - xmm = _mm_load_ss(p); break; - case 2: - xmm = _mm_castpd_ps(_mm_load_sd((double const*)p)); break; - case 3: - t1 = _mm_castpd_ps(_mm_load_sd((double const*)p)); - t2 = _mm_load_ss(p + 2); - xmm = _mm_movelh_ps(t1, t2); break; - case 4: - load(p); break; - default: - xmm = _mm_setzero_ps(); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, float * p) const { - __m128 t1; - switch (n) { - case 1: - _mm_store_ss(p, xmm); break; - case 2: - _mm_store_sd((double*)p, _mm_castps_pd(xmm)); break; - case 3: - _mm_store_sd((double*)p, _mm_castps_pd(xmm)); - t1 = _mm_movehl_ps(xmm,xmm); - _mm_store_ss(p + 2, t1); break; - case 4: - store(p); break; - default:; - } - } - // cut off vector to n elements. 
The last 4-n elements are set to zero - Vec4f & cutoff(int n) { - if (uint32_t(n) >= 4) return *this; - static const union { - int32_t i[8]; - float f[8]; - } mask = {{1,-1,-1,-1,0,0,0,0}}; - xmm = _mm_and_ps(xmm, Vec4f().load(mask.f + 4 - n)); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4f const & insert(uint32_t index, float value) { -#if INSTRSET >= 5 // SSE4.1 supported - switch (index & 3) { - case 0: - xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 0 << 4); break; - case 1: - xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 1 << 4); break; - case 2: - xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 2 << 4); break; - default: - xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 3 << 4); break; - } -#else - static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0}; - __m128 broad = _mm_set1_ps(value); // broadcast value into all elements - __m128 mask = _mm_loadu_ps((float const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position - xmm = selectf(mask,broad,xmm); -#endif - return *this; - }; - // Member function extract a single element from vector - float extract(uint32_t index) const { - float x[4]; - store(x); - return x[index & 3]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - float operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 4; - } -}; - - -/***************************************************************************** -* -* Operators for Vec4f -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec4f operator + (Vec4f const & a, Vec4f const & b) { - return _mm_add_ps(a, b); -} - -// vector operator + : add vector and scalar -static inline Vec4f operator + (Vec4f const & a, float b) { - return a + Vec4f(b); -} -static inline Vec4f operator + (float a, Vec4f const & b) { - return Vec4f(a) + b; -} - -// vector operator += : add -static inline Vec4f & operator += (Vec4f & a, Vec4f const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec4f operator ++ (Vec4f & a, int) { - Vec4f a0 = a; - a = a + 1.0f; - return a0; -} - -// prefix operator ++ -static inline Vec4f & operator ++ (Vec4f & a) { - a = a + 1.0f; - return a; -} - -// vector operator - : subtract element by element -static inline Vec4f operator - (Vec4f const & a, Vec4f const & b) { - return _mm_sub_ps(a, b); -} - -// vector operator - : subtract vector and scalar -static inline Vec4f operator - (Vec4f const & a, float b) { - return a - Vec4f(b); -} -static inline Vec4f operator - (float a, Vec4f const & b) { - return Vec4f(a) - b; -} - -// vector operator - : unary minus -// Change sign bit, even for 0, INF and NAN -static inline Vec4f operator - (Vec4f const & a) { - return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); -} - -// vector operator -= : subtract -static inline Vec4f & operator -= (Vec4f & a, Vec4f const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec4f operator -- (Vec4f & a, int) { - Vec4f a0 = a; - a = a - 1.0f; - return a0; -} - -// prefix operator -- -static inline Vec4f & operator -- (Vec4f & a) { - a = a - 1.0f; - return a; -} - -// vector operator * : multiply element by element -static inline Vec4f operator * (Vec4f const & a, Vec4f const & b) { - return _mm_mul_ps(a, b); 
-} - -// vector operator * : multiply vector and scalar -static inline Vec4f operator * (Vec4f const & a, float b) { - return a * Vec4f(b); -} -static inline Vec4f operator * (float a, Vec4f const & b) { - return Vec4f(a) * b; -} - -// vector operator *= : multiply -static inline Vec4f & operator *= (Vec4f & a, Vec4f const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -static inline Vec4f operator / (Vec4f const & a, Vec4f const & b) { - return _mm_div_ps(a, b); -} - -// vector operator / : divide vector and scalar -static inline Vec4f operator / (Vec4f const & a, float b) { - return a / Vec4f(b); -} -static inline Vec4f operator / (float a, Vec4f const & b) { - return Vec4f(a) / b; -} - -// vector operator /= : divide -static inline Vec4f & operator /= (Vec4f & a, Vec4f const & b) { - a = a / b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec4fb operator == (Vec4f const & a, Vec4f const & b) { - return _mm_cmpeq_ps(a, b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec4fb operator != (Vec4f const & a, Vec4f const & b) { - return _mm_cmpneq_ps(a, b); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec4fb operator < (Vec4f const & a, Vec4f const & b) { - return _mm_cmplt_ps(a, b); -} - -// vector operator <= : returns true for elements for which a <= b -static inline Vec4fb operator <= (Vec4f const & a, Vec4f const & b) { - return _mm_cmple_ps(a, b); -} - -// vector operator > : returns true for elements for which a > b -static inline Vec4fb operator > (Vec4f const & a, Vec4f const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline Vec4fb operator >= (Vec4f const & a, Vec4f const & b) { - return b <= a; -} - -// Bitwise logical operators - -// vector operator & : bitwise and -static inline Vec4f operator & (Vec4f const & a, Vec4f const & b) { - return _mm_and_ps(a, b); -} - -// vector operator &= : bitwise and -static inline Vec4f & operator &= (Vec4f & a, Vec4f const & b) { - a = a & b; - return a; -} - -// vector operator & : bitwise and of Vec4f and Vec4fb -static inline Vec4f operator & (Vec4f const & a, Vec4fb const & b) { - return _mm_and_ps(a, b); -} -static inline Vec4f operator & (Vec4fb const & a, Vec4f const & b) { - return _mm_and_ps(a, b); -} - -// vector operator | : bitwise or -static inline Vec4f operator | (Vec4f const & a, Vec4f const & b) { - return _mm_or_ps(a, b); -} - -// vector operator |= : bitwise or -static inline Vec4f & operator |= (Vec4f & a, Vec4f const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4f operator ^ (Vec4f const & a, Vec4f const & b) { - return _mm_xor_ps(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec4f & operator ^= (Vec4f & a, Vec4f const & b) { - a = a ^ b; - return a; -} - -// vector operator ! : logical not. Returns Boolean vector -static inline Vec4fb operator ! (Vec4f const & a) { - return a == Vec4f(0.0f); -} - - -/***************************************************************************** -* -* Functions for Vec4f -* -*****************************************************************************/ - -static inline Vec4f zero_4f() { - return _mm_setzero_ps(); -} - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 4; i++) result[i] = s[i] ? 
a[i] : b[i]; -// Each byte in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed. -static inline Vec4f select (Vec4fb const & s, Vec4f const & a, Vec4f const & b) { - return selectf(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4f if_add (Vec4fb const & f, Vec4f const & a, Vec4f const & b) { - return a + (Vec4f(f) & b); -} - -// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] -static inline Vec4f if_mul (Vec4fb const & f, Vec4f const & a, Vec4f const & b) { - return a * select(f, b, 1.f); -} - - -// General arithmetic functions, etc. - -// Horizontal add: Calculates the sum of all vector elements. -static inline float horizontal_add (Vec4f const & a) { -#if INSTRSET >= 3 // SSE3 - __m128 t1 = _mm_hadd_ps(a,a); - __m128 t2 = _mm_hadd_ps(t1,t1); - return _mm_cvtss_f32(t2); -#else - __m128 t1 = _mm_movehl_ps(a,a); - __m128 t2 = _mm_add_ps(a,t1); - __m128 t3 = _mm_shuffle_ps(t2,t2,1); - __m128 t4 = _mm_add_ss(t2,t3); - return _mm_cvtss_f32(t4); -#endif -} - -// function max: a > b ? a : b -static inline Vec4f max(Vec4f const & a, Vec4f const & b) { - return _mm_max_ps(a,b); -} - -// function min: a < b ? a : b -static inline Vec4f min(Vec4f const & a, Vec4f const & b) { - return _mm_min_ps(a,b); -} - -// function abs: absolute value -// Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec4f abs(Vec4f const & a) { - __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF)); - return _mm_and_ps(a,mask); -} - -// function sqrt: square root -static inline Vec4f sqrt(Vec4f const & a) { - return _mm_sqrt_ps(a); -} - -// function square: a * a -static inline Vec4f square(Vec4f const & a) { - return a * a; -} - -// pow(vector,int) function template -template -static inline VTYPE pow_template_i(VTYPE const & x0, int n) { - VTYPE x = x0; // a^(2^i) - VTYPE y(1.0f); // accumulator - if (n >= 0) { // make sure n is not negative - while (true) { // loop for each bit in n - if (n & 1) y *= x; // multiply if bit = 1 - n >>= 1; // get next bit of n - if (n == 0) return y; // finished - x *= x; // x = a^2, a^4, a^8, etc. 
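A short, hedged sketch of the mask helpers defined just above (the comparison operators, `select`, and bitwise and with a mask); the `safe_divide` name and the include path are illustrative assumptions.

```cpp
#include "vectorclass.h"   // assumed include path for this header

// Per-lane a/b where lanes with a zero divisor are forced to 0.0f
// instead of producing Inf/NaN. Name and signature are illustrative.
static Vec4f safe_divide(Vec4f const & a, Vec4f const & b) {
    Vec4fb nonzero = (b != Vec4f(0.0f));            // all-ones mask where b[i] != 0
    Vec4f  q = a / select(nonzero, b, Vec4f(1.0f)); // substitute 1.0f in zero lanes
    return q & nonzero;                             // clear lanes that had b[i] == 0
}
```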
- } - } - else { // n < 0 - return VTYPE(1.0f)/pow_template_i(x0,-n); // reciprocal - } -} - -// pow(Vec4f, int): -// The purpose of this template is to prevent implicit conversion of a float -// exponent to int when calling pow(vector, float) and vectormath_exp.h is -// not included - -template static Vec4f pow(Vec4f const & a, TT const & n); - -// Raise floating point numbers to integer power n -template <> -inline Vec4f pow(Vec4f const & x0, int const & n) { - return pow_template_i(x0, n); -} - -// allow conversion from unsigned int -template <> -inline Vec4f pow(Vec4f const & x0, uint32_t const & n) { - return pow_template_i(x0, (int)n); -} - -// Raise floating point numbers to integer power n, where n is a compile-time constant -template -static inline Vec4f pow_n(Vec4f const & a) { - if (n < 0) return Vec4f(1.0f) / pow_n<-n>(a); - if (n == 0) return Vec4f(1.0f); - if (n >= 256) return pow(a, n); - Vec4f x = a; // a^(2^i) - Vec4f y; // accumulator - const int lowest = n - (n & (n-1));// lowest set bit in n - if (n & 1) y = x; - if (n < 2) return y; - x = x*x; // x^2 - if (n & 2) { - if (lowest == 2) y = x; else y *= x; - } - if (n < 4) return y; - x = x*x; // x^4 - if (n & 4) { - if (lowest == 4) y = x; else y *= x; - } - if (n < 8) return y; - x = x*x; // x^8 - if (n & 8) { - if (lowest == 8) y = x; else y *= x; - } - if (n < 16) return y; - x = x*x; // x^16 - if (n & 16) { - if (lowest == 16) y = x; else y *= x; - } - if (n < 32) return y; - x = x*x; // x^32 - if (n & 32) { - if (lowest == 32) y = x; else y *= x; - } - if (n < 64) return y; - x = x*x; // x^64 - if (n & 64) { - if (lowest == 64) y = x; else y *= x; - } - if (n < 128) return y; - x = x*x; // x^128 - if (n & 128) { - if (lowest == 128) y = x; else y *= x; - } - return y; -} - -// implement as function pow(vector, const_int) -template -static inline Vec4f pow(Vec4f const & a, Const_int_t) { - return pow_n(a); -} - -// implement the same as macro pow_const(vector, int) -#define pow_const(x,n) pow_n(x) - - -// avoid unsafe optimization in function round -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5 -static inline Vec4f round(Vec4f const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations"))); -#elif defined(__clang__) && INSTRSET < 5 -// static inline Vec4f round(Vec4f const & a) __attribute__ ((optnone)); -// This doesn't work, but current versions of Clang (3.5) don't optimize away signedmagic, even with -funsafe-math-optimizations -// Add volatile to b if future versions fail -#elif defined (_MSC_VER) || defined(__INTEL_COMPILER) && INSTRSET < 5 -#pragma float_control(push) -#pragma float_control(precise,on) -#define FLOAT_CONTROL_PRECISE_FOR_ROUND -#endif -// function round: round to nearest integer (even). (result as float vector) -static inline Vec4f round(Vec4f const & a) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_ps(a, 8); -#else // SSE2. 
Use magic number method - // Note: assume MXCSR control register is set to rounding - // (don't use conversion to int, it will limit the value to +/- 2^31) - Vec4f signmask = _mm_castsi128_ps(constant4ui<0x80000000,0x80000000,0x80000000,0x80000000>()); // -0.0 - Vec4f magic = _mm_castsi128_ps(constant4ui<0x4B000000,0x4B000000,0x4B000000,0x4B000000>()); // magic number = 2^23 - Vec4f sign = _mm_and_ps(a, signmask); // signbit of a - Vec4f signedmagic = _mm_or_ps(magic, sign); // magic number with sign of a - // volatile - Vec4f b = a + signedmagic; // round by adding magic number - return b - signedmagic; // .. and subtracting it again -#endif -} -#ifdef FLOAT_CONTROL_PRECISE_FOR_ROUND -#pragma float_control(pop) -#endif - -// function truncate: round towards zero. (result as float vector) -static inline Vec4f truncate(Vec4f const & a) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_ps(a, 3+8); -#else // SSE2. Use magic number method (conversion to int would limit the value to 2^31) - uint32_t t1 = _mm_getcsr(); // MXCSR - uint32_t t2 = t1 | (3 << 13); // bit 13-14 = 11 - _mm_setcsr(t2); // change MXCSR - Vec4f r = round(a); // use magic number method - _mm_setcsr(t1); // restore MXCSR - return r; -#endif -} - -// function floor: round towards minus infinity. (result as float vector) -static inline Vec4f floor(Vec4f const & a) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_ps(a, 1+8); -#else // SSE2. Use magic number method (conversion to int would limit the value to 2^31) - uint32_t t1 = _mm_getcsr(); // MXCSR - uint32_t t2 = t1 | (1 << 13); // bit 13-14 = 01 - _mm_setcsr(t2); // change MXCSR - Vec4f r = round(a); // use magic number method - _mm_setcsr(t1); // restore MXCSR - return r; -#endif -} - -// function ceil: round towards plus infinity. (result as float vector) -static inline Vec4f ceil(Vec4f const & a) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_ps(a, 2+8); -#else // SSE2. Use magic number method (conversion to int would limit the value to 2^31) - uint32_t t1 = _mm_getcsr(); // MXCSR - uint32_t t2 = t1 | (2 << 13); // bit 13-14 = 10 - _mm_setcsr(t2); // change MXCSR - Vec4f r = round(a); // use magic number method - _mm_setcsr(t1); // restore MXCSR - return r; -#endif -} - -// function round_to_int: round to nearest integer (even). (result as integer vector) -static inline Vec4i round_to_int(Vec4f const & a) { - // Note: assume MXCSR control register is set to rounding - return _mm_cvtps_epi32(a); -} - -// function truncate_to_int: round towards zero. (result as integer vector) -static inline Vec4i truncate_to_int(Vec4f const & a) { - return _mm_cvttps_epi32(a); -} - -// function to_float: convert integer vector to float vector -static inline Vec4f to_float(Vec4i const & a) { - return _mm_cvtepi32_ps(a); -} - -// function to_float: convert unsigned integer vector to float vector -static inline Vec4f to_float(Vec4ui const & a) { -#ifdef __AVX512VL__ - return _mm_cvtepu32_ps(a); -#else - Vec4f b = to_float(Vec4i(a & 0x7FFFFFFF)); // 31 bits - Vec4i c = Vec4i(a) >> 31; // generate mask from highest bit - Vec4f d = Vec4f(2147483648.f) & Vec4f(_mm_castsi128_ps(c));// mask floating point constant 2^31 - return b + d; -#endif -} - - -// Approximate math functions - -// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11) -static inline Vec4f approx_recipr(Vec4f const & a) { -#if INSTRSET >= 9 // use more accurate version if available. 
(none of these will raise exceptions on zero) -#ifdef __AVX512ER__ // AVX512ER: full precision - // todo: if future processors have both AVX512ER and AVX512VL: _mm128_rcp28_round_ps(a, _MM_FROUND_NO_EXC); - return _mm512_castps512_ps128(_mm512_rcp28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC)); -#elif defined __AVX512VL__ // AVX512VL: 14 bit precision - return _mm_rcp14_ps(a); -#else // AVX512F: 14 bit precision - return _mm512_castps512_ps128(_mm512_rcp14_ps(_mm512_castps128_ps512(a))); -#endif -#else // AVX: 11 bit precision - return _mm_rcp_ps(a); -#endif -} - -// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11) -static inline Vec4f approx_rsqrt(Vec4f const & a) { -#if INSTRSET >= 9 // use more accurate version if available. (none of these will raise exceptions on zero) -#ifdef __AVX512ER__ // AVX512ER: full precision - // todo: if future processors have both AVX512ER and AVX521VL: _mm128_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC); - return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC)); -#elif defined __AVX512VL__ // AVX512VL: 14 bit precision - return _mm_rsqrt14_ps(a); -#else // AVX512F: 14 bit precision - return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a))); -#endif -#else // AVX: 11 bit precision - return _mm_rsqrt_ps(a); -#endif -} - -// Fused multiply and add functions - -// Multiply and add -static inline Vec4f mul_add(Vec4f const & a, Vec4f const & b, Vec4f const & c) { -#ifdef __FMA__ - return _mm_fmadd_ps(a, b, c); -#elif defined (__FMA4__) - return _mm_macc_ps(a, b, c); -#else - return a * b + c; -#endif -} - -// Multiply and subtract -static inline Vec4f mul_sub(Vec4f const & a, Vec4f const & b, Vec4f const & c) { -#ifdef __FMA__ - return _mm_fmsub_ps(a, b, c); -#elif defined (__FMA4__) - return _mm_msub_ps(a, b, c); -#else - return a * b - c; -#endif -} - -// Multiply and inverse subtract -static inline Vec4f nmul_add(Vec4f const & a, Vec4f const & b, Vec4f const & c) { -#ifdef __FMA__ - return _mm_fnmadd_ps(a, b, c); -#elif defined (__FMA4__) - return _mm_nmacc_ps(a, b, c); -#else - return c - a * b; -#endif -} - - -// Multiply and subtract with extra precision on the intermediate calculations, -// even if FMA instructions not supported, using Veltkamp-Dekker split -static inline Vec4f mul_sub_x(Vec4f const & a, Vec4f const & b, Vec4f const & c) { -#ifdef __FMA__ - return _mm_fmsub_ps(a, b, c); -#elif defined (__FMA4__) - return _mm_msub_ps(a, b, c); -#else - // calculate a * b - c with extra precision - Vec4i upper_mask = -(1 << 12); // mask to remove lower 12 bits - Vec4f a_high = a & Vec4f(_mm_castsi128_ps(upper_mask));// split into high and low parts - Vec4f b_high = b & Vec4f(_mm_castsi128_ps(upper_mask)); - Vec4f a_low = a - a_high; - Vec4f b_low = b - b_high; - Vec4f r1 = a_high * b_high; // this product is exact - Vec4f r2 = r1 - c; // subtract c from high product - Vec4f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product - return r3; // + ((r2 - r1) + c); -#endif -} - -// Math functions using fast bit manipulation - -// Extract the exponent as an integer -// exponent(a) = floor(log2(abs(a))); -// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 -static inline Vec4i exponent(Vec4f const & a) { - Vec4ui t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4ui t2 = t1 << 1; // shift out sign bit - Vec4ui t3 = t2 >> 24; // shift down logical to position 0 - Vec4i 
t4 = Vec4i(t3) - 0x7F; // subtract bias from exponent - return t4; -} - -// Extract the fraction part of a floating point number -// a = 2^exponent(a) * fraction(a), except for a = 0 -// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f -// NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h ! -static inline Vec4f fraction(Vec4f const & a) { - Vec4ui t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F800000); // set exponent to 0 + bias - return _mm_castsi128_ps(t2); -} - -// Fast calculation of pow(2,n) with n integer -// n = 0 gives 1.0f -// n >= 128 gives +INF -// n <= -127 gives 0.0f -// This function will never produce denormals, and never raise exceptions -static inline Vec4f exp2(Vec4i const & n) { - Vec4i t1 = max(n, -0x7F); // limit to allowed range - Vec4i t2 = min(t1, 0x80); - Vec4i t3 = t2 + 0x7F; // add bias - Vec4i t4 = t3 << 23; // put exponent into position 23 - return _mm_castsi128_ps(t4); // reinterpret as float -} -//static Vec4f exp2(Vec4f const & x); // defined in vectormath_exp.h - - -// Control word manipulaton -// ------------------------ -// The MXCSR control word has the following bits: -// 0: Invalid Operation Flag -// 1: Denormal Flag (=subnormal) -// 2: Divide-by-Zero Flag -// 3: Overflow Flag -// 4: Underflow Flag -// 5: Precision Flag -// 6: Denormals Are Zeros (=subnormals) -// 7: Invalid Operation Mask -// 8: Denormal Operation Mask (=subnormal) -// 9: Divide-by-Zero Mask -// 10: Overflow Mask -// 11: Underflow Mask -// 12: Precision Mask -// 13-14: Rounding control -// 00: round to nearest or even -// 01: round down towards -infinity -// 10: round up towards +infinity -// 11: round towards zero (truncate) -// 15: Flush to Zero - -// Function get_control_word: -// Read the MXCSR control word -static inline uint32_t get_control_word() { - return _mm_getcsr(); -} - -// Function set_control_word: -// Write the MXCSR control word -static inline void set_control_word(uint32_t w) { - _mm_setcsr(w); -} - -// Function no_subnormals: -// Set "Denormals Are Zeros" and "Flush to Zero" mode to avoid the extremely -// time-consuming denormals in case of underflow -static inline void no_subnormals() { - uint32_t t1 = get_control_word(); - t1 |= (1 << 6) | (1 << 15); // set bit 6 and 15 in MXCSR - set_control_word(t1); -} - -// Function reset_control_word: -// Set the MXCSR control word to the default value 0x1F80. -// This will mask floating point exceptions, set rounding mode to nearest (or even), -// and allow denormals. 
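As a rough usage sketch of the MXCSR helpers above: save the control word, enable flush-to-zero and denormals-are-zero around a denormal-prone loop, then restore the caller's state. The kernel name and include path are assumptions; `get_control_word`, `set_control_word`, `no_subnormals` and `horizontal_add` are as defined in this header.

```cpp
#include <cstdint>
#include "vectorclass.h"   // assumed include path for this header

// Illustrative kernel: sum of squares over a float array, 4 lanes at a time.
static float sum_of_squares_no_denormals(const float * data, int n) {
    const uint32_t saved = get_control_word();  // remember the caller's MXCSR
    no_subnormals();                            // set DAZ + FTZ (bits 6 and 15)

    Vec4f acc(0.0f);
    for (int i = 0; i + 4 <= n; i += 4) {
        Vec4f v;
        v.load(data + i);                       // unaligned load of 4 floats
        acc += v * v;                           // elementwise square and accumulate
    }                                           // (tail elements omitted for brevity)
    float total = horizontal_add(acc);          // reduce the 4 lanes to a scalar

    set_control_word(saved);                    // restore the caller's MXCSR
    return total;
}
```

Restoring the saved word, rather than calling reset_control_word, keeps whatever exception masks and rounding mode the caller had in effect.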
-static inline void reset_control_word() { - set_control_word(0x1F80); -} - - -// Categorization functions - -// Function sign_bit: gives true for elements that have the sign bit set -// even for -0.0f, -INF and -NAN -// Note that sign_bit(Vec4f(-0.0f)) gives true, while Vec4f(-0.0f) < Vec4f(0.0f) gives false -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec4fb sign_bit(Vec4f const & a) { - Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4i t2 = t1 >> 31; // extend sign bit - return _mm_castsi128_ps(t2); // reinterpret as 32-bit Boolean -} - -// Function sign_combine: changes the sign of a when b has the sign bit set -// same as select(sign_bit(b), -a, a) -static inline Vec4f sign_combine(Vec4f const & a, Vec4f const & b) { - Vec4f signmask = _mm_castsi128_ps(constant4ui<0x80000000,0x80000000,0x80000000,0x80000000>()); // -0.0 - return a ^ (b & signmask); -} - -// Function is_finite: gives true for elements that are normal, denormal or zero, -// false for INF and NAN -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec4fb is_finite(Vec4f const & a) { - Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4i t2 = t1 << 1; // shift out sign bit - Vec4i t3 = Vec4i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s - return Vec4ib(t3); -} - -// Function is_inf: gives true for elements that are +INF or -INF -// false for finite numbers and NAN -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec4fb is_inf(Vec4f const & a) { - Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4i t2 = t1 << 1; // shift out sign bit - return t2 == Vec4i(0xFF000000); // exponent is all 1s, fraction is 0 -} - -// Function is_nan: gives true for elements that are +NAN or -NAN -// false for finite numbers and +/-INF -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec4fb is_nan(Vec4f const & a) { - Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4i t2 = t1 << 1; // shift out sign bit - Vec4i t3 = 0xFF000000; // exponent mask - Vec4i t4 = t2 & t3; // exponent - Vec4i t5 = _mm_andnot_si128(t3,t2);// fraction - return Vec4ib((t4 == t3) & (t5 != 0));// exponent = all 1s and fraction != 0 -} - -// Function is_subnormal: gives true for elements that are denormal (subnormal) -// false for finite numbers, zero, NAN and INF -static inline Vec4fb is_subnormal(Vec4f const & a) { - Vec4i t1 = _mm_castps_si128(a); // reinterpret as 32-bit integer - Vec4i t2 = t1 << 1; // shift out sign bit - Vec4i t3 = 0xFF000000; // exponent mask - Vec4i t4 = t2 & t3; // exponent - Vec4i t5 = _mm_andnot_si128(t3,t2);// fraction - return Vec4ib((t4 == 0) & (t5 != 0));// exponent = 0 and fraction != 0 -} - -// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) -// false for finite numbers, NAN and INF -static inline Vec4fb is_zero_or_subnormal(Vec4f const & a) { - Vec4i t = _mm_castps_si128(a); // reinterpret as 32-bit integer - t &= 0x7F800000; // isolate exponent - return t == 0; // exponent = 0 -} - -// Function infinite4f: returns a vector where all elements are +INF -static inline Vec4f infinite4f() { - return _mm_castsi128_ps(_mm_set1_epi32(0x7F800000)); -} - -// Function nan4f: returns a vector where all elements are NAN (quiet) -static inline Vec4f nan4f(int n = 0x10) { - return 
_mm_castsi128_ps(_mm_set1_epi32(0x7FC00000 + n)); -} - - -/***************************************************************************** -* -* Vector Vec4f permute and blend functions -* -****************************************************************************** -* -* The permute function can reorder the elements of a vector and optionally -* set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select. A negative index will generate zero. -* -* Example: -* Vec4f a(10.f,11.f,12.f,13.f); // a is (10,11,12,13) -* Vec4f b, c; -* b = permute4f<0,0,2,2>(a); // b is (10,10,12,12) -* c = permute4f<3,2,-1,-1>(a); // c is (13,12, 0, 0) -* -* -* The blend function can mix elements from two different vectors and -* optionally set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select, where indexes 0 - 3 indicate an element from the first source -* vector and indexes 4 - 7 indicate an element from the second source vector. -* A negative index will generate zero. -* -* -* Example: -* Vec4f a(10.f,11.f,12.f,13.f); // a is (10, 11, 12, 13) -* Vec4f b(20.f,21.f,22.f,23.f); // b is (20, 21, 22, 23) -* Vec4f c; -* c = blend4f<1,4,-1,7> (a,b); // c is (11, 20, 0, 23) -* -* Don't worry about the complicated code for these functions. Most of the -* code is resolved at compile time to generate only a few instructions. -*****************************************************************************/ - -// permute vector Vec4f -template -static inline Vec4f permute4f(Vec4f const & a) { - // is shuffling needed - const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0) || (i2 != 2 && i2 >= 0) || (i3 != 3 && i3 >= 0); - // is zeroing needed - const bool do_zero = (i0 | i1 | i2 | i3) < 0 && ((i0 | i1 | i2 | i3) & 0x80); - - if (!do_shuffle && !do_zero) { - return a; // trivial case: do nothing - } - if (do_zero && !do_shuffle) { // zeroing, not shuffling - if ((i0 & i1 & i2 & i3) < 0) return _mm_setzero_ps(); // zero everything - // zero some elements - __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >(); - return _mm_and_ps(a,_mm_castsi128_ps(mask1)); // zero with AND mask - } - if (do_shuffle && !do_zero) { // shuffling, not zeroing - return _mm_shuffle_ps(a, a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6); - } - // both shuffle and zero - if ((i0 & i1) < 0 && (i2 | i3) >= 0) { // zero low half, shuffle high half - return _mm_shuffle_ps(_mm_setzero_ps(), a, (i2&3)<<4 | (i3&3)<<6); - } - if ((i0 | i1) >= 0 && (i2 & i3) < 0) { // shuffle low half, zero high half - return _mm_shuffle_ps(a, _mm_setzero_ps(), (i0&3) | (i1&3)<<2); - } -#if INSTRSET >= 4 // SSSE3 - // With SSSE3 we can do both with the PSHUFB instruction - const int j0 = (i0 & 3) << 2; - const int j1 = (i1 & 3) << 2; - const int j2 = (i2 & 3) << 2; - const int j3 = (i3 & 3) << 2; - __m128i mask2 = constant4i < - i0 < 0 ? -1 : j0 | (j0+1)<<8 | (j0+2)<<16 | (j0+3) << 24, - i1 < 0 ? -1 : j1 | (j1+1)<<8 | (j1+2)<<16 | (j1+3) << 24, - i2 < 0 ? -1 : j2 | (j2+1)<<8 | (j2+2)<<16 | (j2+3) << 24, - i3 < 0 ? 
-1 : j3 | (j3+1)<<8 | (j3+2)<<16 | (j3+3) << 24 > (); - return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a),mask2)); -#else - __m128 t1 = _mm_shuffle_ps(a, a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6); // shuffle - __m128i mask3 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >(); - return _mm_and_ps(t1,_mm_castsi128_ps(mask3)); // zero with AND mask -#endif -} - - -// blend vectors Vec4f -template -static inline Vec4f blend4f(Vec4f const & a, Vec4f const & b) { - - // Combine all the indexes into a single bitfield, with 8 bits for each - const int m1 = (i0&7) | (i1&7)<<8 | (i2&7)<<16 | (i3&7)<<24; - - // Mask to zero out negative indexes - const int m2 = (i0<0?0:0xFF) | (i1<0?0:0xFF)<<8 | (i2<0?0:0xFF)<<16 | (i3<0?0:0xFF)<<24; - - if ((m1 & 0x04040404 & m2) == 0) { - // no elements from b - return permute4f(a); - } - if (((m1^0x04040404) & 0x04040404 & m2) == 0) { - // no elements from a - return permute4f(b); - } - if (((m1 & ~0x04040404) ^ 0x03020100) == 0 && m2 == -1) { - // selecting without shuffling or zeroing - __m128i sel = constant4i (); - return selectf(_mm_castsi128_ps(sel), a, b); - } -#ifdef __XOP__ // Use AMD XOP instruction PPERM - __m128i maska = constant4i < - i0 < 0 ? 0x80808080 : (i0*4 & 31) + (((i0*4 & 31) + 1) << 8) + (((i0*4 & 31) + 2) << 16) + (((i0*4 & 31) + 3) << 24), - i1 < 0 ? 0x80808080 : (i1*4 & 31) + (((i1*4 & 31) + 1) << 8) + (((i1*4 & 31) + 2) << 16) + (((i1*4 & 31) + 3) << 24), - i2 < 0 ? 0x80808080 : (i2*4 & 31) + (((i2*4 & 31) + 1) << 8) + (((i2*4 & 31) + 2) << 16) + (((i2*4 & 31) + 3) << 24), - i3 < 0 ? 0x80808080 : (i3*4 & 31) + (((i3*4 & 31) + 1) << 8) + (((i3*4 & 31) + 2) << 16) + (((i3*4 & 31) + 3) << 24) > (); - return _mm_castsi128_ps(_mm_perm_epi8(_mm_castps_si128(a), _mm_castps_si128(b), maska)); -#else - if ((((m1 & ~0x04040404) ^ 0x03020100) & m2) == 0) { - // selecting and zeroing, not shuffling - __m128i sel1 = constant4i (); - __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >(); - __m128 t1 = selectf(_mm_castsi128_ps(sel1), a, b); // select - return _mm_and_ps(t1, _mm_castsi128_ps(mask1)); // zero - } - // special cases unpckhps, unpcklps, shufps - Vec4f t; - if (((m1 ^ 0x05010400) & m2) == 0) { - t = _mm_unpacklo_ps(a, b); - goto DOZERO; - } - if (((m1 ^ 0x01050004) & m2) == 0) { - t = _mm_unpacklo_ps(b, a); - goto DOZERO; - } - if (((m1 ^ 0x07030602) & m2) == 0) { - t = _mm_unpackhi_ps(a, b); - goto DOZERO; - } - if (((m1 ^ 0x03070206) & m2) == 0) { - t = _mm_unpackhi_ps(b, a); - goto DOZERO; - } - // first two elements from a, last two from b - if (((m1^0x04040000) & 0x04040404 & m2) == 0) { - t = _mm_shuffle_ps(a, b, (i0&3) + ((i1&3)<<2) + ((i2&3)<<4) + ((i3&3)<<6)); - goto DOZERO; - } - // first two elements from b, last two from a - if (((m1^0x00000404) & 0x04040404 & m2) == 0) { - t = _mm_shuffle_ps(b, a, (i0&3) + ((i1&3)<<2) + ((i2&3)<<4) + ((i3&3)<<6)); - goto DOZERO; - } - { // general case. combine two permutes - __m128 a1 = permute4f < - (uint32_t)i0 < 4 ? i0 : -1, - (uint32_t)i1 < 4 ? i1 : -1, - (uint32_t)i2 < 4 ? i2 : -1, - (uint32_t)i3 < 4 ? i3 : -1 > (a); - __m128 b1 = permute4f < - (uint32_t)(i0^4) < 4 ? (i0^4) : -1, - (uint32_t)(i1^4) < 4 ? (i1^4) : -1, - (uint32_t)(i2^4) < 4 ? (i2^4) : -1, - (uint32_t)(i3^4) < 4 ? 
(i3^4) : -1 > (b); - return _mm_or_ps(a1,b1); - } -DOZERO: - if ((i0|i1|i2|i3) & 0x80) { - // zero some elements - __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >(); - t = _mm_and_ps(t,_mm_castsi128_ps(mask1)); // zero with AND mask - } - return t; - -#endif // __XOP__ -} - -// change signs on vectors Vec4f -// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change -template -static inline Vec4f change_sign(Vec4f const & a) { - if ((i0 | i1 | i2 | i3) == 0) return a; - __m128i mask = constant4ui(); - return _mm_xor_ps(a, _mm_castsi128_ps(mask)); // flip sign bits -} - - -/***************************************************************************** -* -* Vec2d: Vector of 2 double precision floating point values -* -*****************************************************************************/ - -class Vec2d { -protected: - __m128d xmm; // double vector -public: - // Default constructor: - Vec2d() { - } - // Constructor to broadcast the same value into all elements: - Vec2d(double d) { - xmm = _mm_set1_pd(d); - } - // Constructor to build from all elements: - Vec2d(double d0, double d1) { - xmm = _mm_setr_pd(d0, d1); - } - // Constructor to convert from type __m128d used in intrinsics: - Vec2d(__m128d const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128d used in intrinsics: - Vec2d & operator = (__m128d const & x) { - xmm = x; - return *this; - } - // Type cast operator to convert to __m128d used in intrinsics - operator __m128d() const { - return xmm; - } - // Member function to load from array (unaligned) - Vec2d & load(double const * p) { - xmm = _mm_loadu_pd(p); - return *this; - } - // Member function to load from array, aligned by 16 - // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1, - // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. - // You may use load_a instead of load if you are certain that p points to an address - // divisible by 16. - Vec2d const & load_a(double const * p) { - xmm = _mm_load_pd(p); - return *this; - } - // Member function to store into array (unaligned) - void store(double * p) const { - _mm_storeu_pd(p, xmm); - } - // Member function to store into array, aligned by 16 - // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1, - // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. - // You may use store_a instead of store if you are certain that p points to an address - // divisible by 16. - void store_a(double * p) const { - _mm_store_pd(p, xmm); - } - // Partial load. Load n elements and set the rest to 0 - Vec2d & load_partial(int n, double const * p) { - if (n == 1) { - xmm = _mm_load_sd(p); - } - else if (n == 2) { - load(p); - } - else { - xmm = _mm_setzero_pd(); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, double * p) const { - if (n == 1) { - _mm_store_sd(p, xmm); - } - else if (n == 2) { - store(p); - } - } - // cut off vector to n elements. The last 4-n elements are set to zero - Vec2d & cutoff(int n) { - xmm = _mm_castps_pd(Vec4f(_mm_castpd_ps(xmm)).cutoff(n*2)); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. 
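A small sketch of the partial load/store pattern above, useful when an array length is not a multiple of the two-lane vector size; the `scale` helper and the include path are illustrative assumptions.

```cpp
#include "vectorclass.h"   // assumed include path for this header

// Illustrative helper: multiply an arbitrary-length double array by a constant.
static void scale(double * p, int n, double factor) {
    int i = 0;
    for (; i + 2 <= n; i += 2) {
        Vec2d v;
        v.load(p + i);                  // full unaligned load of two doubles
        (v * factor).store(p + i);      // elementwise multiply, then store
    }
    if (i < n) {                        // at most one leftover element
        Vec2d v;
        v.load_partial(n - i, p + i);             // load the tail, rest zeroed
        (v * factor).store_partial(n - i, p + i); // write back only the tail
    }
}
```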
Use load function if changing more than one element - Vec2d const & insert(uint32_t index, double value) { - __m128d v2 = _mm_set_sd(value); - if (index == 0) { - xmm = _mm_shuffle_pd(v2,xmm,2); - } - else { - xmm = _mm_shuffle_pd(xmm,v2,0); - } - return *this; - }; - // Member function extract a single element from vector - double extract(uint32_t index) const { - double x[2]; - store(x); - return x[index & 1]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - double operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 2; - } -}; - - -/***************************************************************************** -* -* Operators for Vec2d -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec2d operator + (Vec2d const & a, Vec2d const & b) { - return _mm_add_pd(a, b); -} - -// vector operator + : add vector and scalar -static inline Vec2d operator + (Vec2d const & a, double b) { - return a + Vec2d(b); -} -static inline Vec2d operator + (double a, Vec2d const & b) { - return Vec2d(a) + b; -} - -// vector operator += : add -static inline Vec2d & operator += (Vec2d & a, Vec2d const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec2d operator ++ (Vec2d & a, int) { - Vec2d a0 = a; - a = a + 1.0; - return a0; -} - -// prefix operator ++ -static inline Vec2d & operator ++ (Vec2d & a) { - a = a + 1.0; - return a; -} - -// vector operator - : subtract element by element -static inline Vec2d operator - (Vec2d const & a, Vec2d const & b) { - return _mm_sub_pd(a, b); -} - -// vector operator - : subtract vector and scalar -static inline Vec2d operator - (Vec2d const & a, double b) { - return a - Vec2d(b); -} -static inline Vec2d operator - (double a, Vec2d const & b) { - return Vec2d(a) - b; -} - -// vector operator - : unary minus -// Change sign bit, even for 0, INF and NAN -static inline Vec2d operator - (Vec2d const & a) { - return _mm_xor_pd(a, _mm_castsi128_pd(_mm_setr_epi32(0,0x80000000,0,0x80000000))); -} - -// vector operator -= : subtract -static inline Vec2d & operator -= (Vec2d & a, Vec2d const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec2d operator -- (Vec2d & a, int) { - Vec2d a0 = a; - a = a - 1.0; - return a0; -} - -// prefix operator -- -static inline Vec2d & operator -- (Vec2d & a) { - a = a - 1.0; - return a; -} - -// vector operator * : multiply element by element -static inline Vec2d operator * (Vec2d const & a, Vec2d const & b) { - return _mm_mul_pd(a, b); -} - -// vector operator * : multiply vector and scalar -static inline Vec2d operator * (Vec2d const & a, double b) { - return a * Vec2d(b); -} -static inline Vec2d operator * (double a, Vec2d const & b) { - return Vec2d(a) * b; -} - -// vector operator *= : multiply -static inline Vec2d & operator *= (Vec2d & a, Vec2d const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -static inline Vec2d operator / (Vec2d const & a, Vec2d const & b) { - return _mm_div_pd(a, b); -} - -// vector operator / : divide vector and scalar -static inline Vec2d operator / (Vec2d const & a, double b) { - return a / Vec2d(b); -} -static inline Vec2d operator / (double a, Vec2d const & b) { - return Vec2d(a) / b; -} - -// vector operator /= : divide -static inline Vec2d & operator /= (Vec2d & a, Vec2d const 
& b) { - a = a / b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec2db operator == (Vec2d const & a, Vec2d const & b) { - return _mm_cmpeq_pd(a, b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec2db operator != (Vec2d const & a, Vec2d const & b) { - return _mm_cmpneq_pd(a, b); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec2db operator < (Vec2d const & a, Vec2d const & b) { - return _mm_cmplt_pd(a, b); -} - -// vector operator <= : returns true for elements for which a <= b -static inline Vec2db operator <= (Vec2d const & a, Vec2d const & b) { - return _mm_cmple_pd(a, b); -} - -// vector operator > : returns true for elements for which a > b -static inline Vec2db operator > (Vec2d const & a, Vec2d const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline Vec2db operator >= (Vec2d const & a, Vec2d const & b) { - return b <= a; -} - -// Bitwise logical operators - -// vector operator & : bitwise and -static inline Vec2d operator & (Vec2d const & a, Vec2d const & b) { - return _mm_and_pd(a, b); -} - -// vector operator &= : bitwise and -static inline Vec2d & operator &= (Vec2d & a, Vec2d const & b) { - a = a & b; - return a; -} - -// vector operator & : bitwise and of Vec2d and Vec2db -static inline Vec2d operator & (Vec2d const & a, Vec2db const & b) { - return _mm_and_pd(a, b); -} -static inline Vec2d operator & (Vec2db const & a, Vec2d const & b) { - return _mm_and_pd(a, b); -} - -// vector operator | : bitwise or -static inline Vec2d operator | (Vec2d const & a, Vec2d const & b) { - return _mm_or_pd(a, b); -} - -// vector operator |= : bitwise or -static inline Vec2d & operator |= (Vec2d & a, Vec2d const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec2d operator ^ (Vec2d const & a, Vec2d const & b) { - return _mm_xor_pd(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec2d & operator ^= (Vec2d & a, Vec2d const & b) { - a = a ^ b; - return a; -} - -// vector operator ! : logical not. Returns Boolean vector -static inline Vec2db operator ! (Vec2d const & a) { - return a == Vec2d(0.0); -} - - -/***************************************************************************** -* -* Functions for Vec2d -* -*****************************************************************************/ - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). -// No other values are allowed. -static inline Vec2d select (Vec2db const & s, Vec2d const & a, Vec2d const & b) { - return selectd(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec2d if_add (Vec2db const & f, Vec2d const & a, Vec2d const & b) { - return a + (Vec2d(f) & b); -} - -// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] -static inline Vec2d if_mul (Vec2db const & f, Vec2d const & a, Vec2d const & b) { - return a * select(f, b, 1.); -} - - -// General arithmetic functions, etc. - -// Horizontal add: Calculates the sum of all vector elements. 
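To make `select`/`if_add` concrete, a hedged sketch of a masked reduction; `masked_dot` and the include path are assumptions, everything else uses the operators and helpers defined here.

```cpp
#include "vectorclass.h"   // assumed include path for this header

// Illustrative helper: sum x[i]*y[i] over lanes where x[i] > 0, two lanes at a time.
static double masked_dot(const double * x, const double * y, int n) {
    Vec2d acc(0.0);
    for (int i = 0; i + 2 <= n; i += 2) {
        Vec2d a, b;
        a.load(x + i);
        b.load(y + i);
        Vec2db use = a > Vec2d(0.0);    // per-lane condition as an all-ones/zeros mask
        acc = if_add(use, acc, a * b);  // acc += a*b only where the mask is true
    }                                   // (tail element, if any, omitted for brevity)
    return horizontal_add(acc);         // reduce the two lanes to a scalar
}
```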
-static inline double horizontal_add (Vec2d const & a) { -#if INSTRSET >= 3 // SSE3 - __m128d t1 = _mm_hadd_pd(a,a); - return _mm_cvtsd_f64(t1); -#else - __m128 t0 = _mm_castpd_ps(a); - __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0)); - __m128d t2 = _mm_add_sd(a,t1); - return _mm_cvtsd_f64(t2); -#endif -} - -// function max: a > b ? a : b -static inline Vec2d max(Vec2d const & a, Vec2d const & b) { - return _mm_max_pd(a,b); -} - -// function min: a < b ? a : b -static inline Vec2d min(Vec2d const & a, Vec2d const & b) { - return _mm_min_pd(a,b); -} - -// function abs: absolute value -// Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec2d abs(Vec2d const & a) { - __m128d mask = _mm_castsi128_pd(_mm_setr_epi32(-1,0x7FFFFFFF,-1,0x7FFFFFFF)); - return _mm_and_pd(a,mask); -} - -// function sqrt: square root -static inline Vec2d sqrt(Vec2d const & a) { - return _mm_sqrt_pd(a); -} - -// function square: a * a -static inline Vec2d square(Vec2d const & a) { - return a * a; -} - -// pow(Vec2d, int): -// The purpose of this template is to prevent implicit conversion of a float -// exponent to int when calling pow(vector, float) and vectormath_exp.h is -// not included - -template static Vec2d pow(Vec2d const & a, TT const & n); - -// Raise floating point numbers to integer power n -template <> -inline Vec2d pow(Vec2d const & x0, int const & n) { - return pow_template_i(x0, n); -} - -// allow conversion from unsigned int -template <> -inline Vec2d pow(Vec2d const & x0, uint32_t const & n) { - return pow_template_i(x0, (int)n); -} - - -// Raise floating point numbers to integer power n, where n is a compile-time constant -template -static inline Vec2d pow_n(Vec2d const & a) { - if (n < 0) return Vec2d(1.0) / pow_n<-n>(a); - if (n == 0) return Vec2d(1.0); - if (n >= 256) return pow(a, n); - Vec2d x = a; // a^(2^i) - Vec2d y; // accumulator - const int lowest = n - (n & (n-1));// lowest set bit in n - if (n & 1) y = x; - if (n < 2) return y; - x = x*x; // x^2 - if (n & 2) { - if (lowest == 2) y = x; else y *= x; - } - if (n < 4) return y; - x = x*x; // x^4 - if (n & 4) { - if (lowest == 4) y = x; else y *= x; - } - if (n < 8) return y; - x = x*x; // x^8 - if (n & 8) { - if (lowest == 8) y = x; else y *= x; - } - if (n < 16) return y; - x = x*x; // x^16 - if (n & 16) { - if (lowest == 16) y = x; else y *= x; - } - if (n < 32) return y; - x = x*x; // x^32 - if (n & 32) { - if (lowest == 32) y = x; else y *= x; - } - if (n < 64) return y; - x = x*x; // x^64 - if (n & 64) { - if (lowest == 64) y = x; else y *= x; - } - if (n < 128) return y; - x = x*x; // x^128 - if (n & 128) { - if (lowest == 128) y = x; else y *= x; - } - return y; -} - -template -static inline Vec2d pow(Vec2d const & a, Const_int_t) { - return pow_n(a); -} - - -// avoid unsafe optimization in function round -#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5 -static inline Vec2d round(Vec2d const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations"))); -#elif defined (FLOAT_CONTROL_PRECISE_FOR_ROUND) -#pragma float_control(push) -#pragma float_control(precise,on) -#endif -// function round: round to nearest integer (even). (result as double vector) -static inline Vec2d round(Vec2d const & a) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_pd(a, 0+8); -#else // SSE2. 
Use magic number method - // Note: assume MXCSR control register is set to rounding - // (don't use conversion to int, it will limit the value to +/- 2^31) - Vec2d signmask = _mm_castsi128_pd(constant4ui<0,0x80000000,0,0x80000000>()); // -0.0 - Vec2d magic = _mm_castsi128_pd(constant4ui<0,0x43300000,0,0x43300000>()); // magic number = 2^52 - Vec2d sign = _mm_and_pd(a, signmask); // signbit of a - Vec2d signedmagic = _mm_or_pd(magic, sign); // magic number with sign of a - return a + signedmagic - signedmagic; // round by adding magic number -#endif -} -#if defined (FLOAT_CONTROL_PRECISE_FOR_ROUND) -#pragma float_control(pop) -#endif - -// function truncate: round towards zero. (result as double vector) -static inline Vec2d truncate(Vec2d const & a) { -// (note: may fail on MS Visual Studio 2008, works in later versions) -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_pd(a, 3+8); -#else // SSE2. Use magic number method (conversion to int would limit the value to 2^31) - uint32_t t1 = _mm_getcsr(); // MXCSR - uint32_t t2 = t1 | (3 << 13); // bit 13-14 = 11 - _mm_setcsr(t2); // change MXCSR - Vec2d r = round(a); // use magic number method - _mm_setcsr(t1); // restore MXCSR - return r; -#endif -} - -// function floor: round towards minus infinity. (result as double vector) -// (note: may fail on MS Visual Studio 2008, works in later versions) -static inline Vec2d floor(Vec2d const & a) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_pd(a, 1+8); -#else // SSE2. Use magic number method (conversion to int would limit the value to 2^31) - uint32_t t1 = _mm_getcsr(); // MXCSR - uint32_t t2 = t1 | (1 << 13); // bit 13-14 = 01 - _mm_setcsr(t2); // change MXCSR - Vec2d r = round(a); // use magic number method - _mm_setcsr(t1); // restore MXCSR - return r; -#endif -} - -// function ceil: round towards plus infinity. (result as double vector) -static inline Vec2d ceil(Vec2d const & a) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_round_pd(a, 2+8); -#else // SSE2. Use magic number method (conversion to int would limit the value to 2^31) - uint32_t t1 = _mm_getcsr(); // MXCSR - uint32_t t2 = t1 | (2 << 13); // bit 13-14 = 10 - _mm_setcsr(t2); // change MXCSR - Vec2d r = round(a); // use magic number method - _mm_setcsr(t1); // restore MXCSR - return r; -#endif -} - -// function truncate_to_int: round towards zero. -static inline Vec4i truncate_to_int(Vec2d const & a, Vec2d const & b) { - Vec4i t1 = _mm_cvttpd_epi32(a); - Vec4i t2 = _mm_cvttpd_epi32(b); - return blend4i<0,1,4,5> (t1, t2); -} - -// function round_to_int: round to nearest integer (even). -// result as 32-bit integer vector -static inline Vec4i round_to_int(Vec2d const & a, Vec2d const & b) { - // Note: assume MXCSR control register is set to rounding - Vec4i t1 = _mm_cvtpd_epi32(a); - Vec4i t2 = _mm_cvtpd_epi32(b); - return blend4i<0,1,4,5> (t1, t2); -} -// function round_to_int: round to nearest integer (even). -// result as 32-bit integer vector. Upper two values of result are 0 -static inline Vec4i round_to_int(Vec2d const & a) { - Vec4i t1 = _mm_cvtpd_epi32(a); - return t1; -} - -// function truncate_to_int64: round towards zero. (inefficient) -static inline Vec2q truncate_to_int64(Vec2d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - //return _mm_maskz_cvttpd_epi64( __mmask8(0xFF), a); - return _mm_cvttpd_epi64(a); -#else - double aa[2]; - a.store(aa); - return Vec2q(int64_t(aa[0]), int64_t(aa[1])); -#endif -} - -// function truncate_to_int64_limited: round towards zero. 
(inefficient) -// result as 64-bit integer vector, but with limited range. Deprecated! -static inline Vec2q truncate_to_int64_limited(Vec2d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return truncate_to_int64(a); -#else - // Note: assume MXCSR control register is set to rounding - Vec4i t1 = _mm_cvttpd_epi32(a); - return extend_low(t1); -#endif -} - -// function round_to_int64: round to nearest or even. (inefficient) -static inline Vec2q round_to_int64(Vec2d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return _mm_cvtpd_epi64(a); -#else - return truncate_to_int64(round(a)); -#endif -} - -// function round_to_int: round to nearest integer (even) -// result as 64-bit integer vector, but with limited range. Deprecated! -static inline Vec2q round_to_int64_limited(Vec2d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return round_to_int64(a); -#else - // Note: assume MXCSR control register is set to rounding - Vec4i t1 = _mm_cvtpd_epi32(a); - return extend_low(t1); -#endif -} - -// function to_double: convert integer vector elements to double vector (inefficient) -static inline Vec2d to_double(Vec2q const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return _mm_maskz_cvtepi64_pd( __mmask8(0xFF), a); -#else - int64_t aa[2]; - a.store(aa); - return Vec2d(double(aa[0]), double(aa[1])); -#endif -} - -// function to_double_limited: convert integer vector elements to double vector -// limited to abs(x) < 2^31. Deprecated! -static inline Vec2d to_double_limited(Vec2q const & x) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return to_double(x); -#else - Vec4i compressed = permute4i<0,2,-256,-256>(Vec4i(x)); - return _mm_cvtepi32_pd(compressed); -#endif -} - -// function to_double_low: convert integer vector elements [0] and [1] to double vector -static inline Vec2d to_double_low(Vec4i const & a) { - return _mm_cvtepi32_pd(a); -} - -// function to_double_high: convert integer vector elements [2] and [3] to double vector -static inline Vec2d to_double_high(Vec4i const & a) { - return to_double_low(_mm_srli_si128(a,8)); -} - -// function compress: convert two Vec2d to one Vec4f -static inline Vec4f compress (Vec2d const & low, Vec2d const & high) { - Vec4f t1 = _mm_cvtpd_ps(low); - Vec4f t2 = _mm_cvtpd_ps(high); - return blend4f<0,1,4,5> (t1, t2); -} - -// Function extend_low : convert Vec4f vector elements [0] and [1] to Vec2d -static inline Vec2d extend_low (Vec4f const & a) { - return _mm_cvtps_pd(a); -} - -// Function extend_high : convert Vec4f vector elements [2] and [3] to Vec2d -static inline Vec2d extend_high (Vec4f const & a) { - return _mm_cvtps_pd(_mm_movehl_ps(a,a)); -} - - -// Fused multiply and add functions - -// Multiply and add -static inline Vec2d mul_add(Vec2d const & a, Vec2d const & b, Vec2d const & c) { -#ifdef __FMA__ - return _mm_fmadd_pd(a, b, c); -#elif defined (__FMA4__) - return _mm_macc_pd(a, b, c); -#else - return a * b + c; -#endif -} - -// Multiply and subtract -static inline Vec2d mul_sub(Vec2d const & a, Vec2d const & b, Vec2d const & c) { -#ifdef __FMA__ - return _mm_fmsub_pd(a, b, c); -#elif defined (__FMA4__) - return _mm_msub_pd(a, b, c); -#else - return a * b - c; -#endif -} - -// Multiply and inverse subtract -static inline Vec2d nmul_add(Vec2d const & a, Vec2d const & b, Vec2d const & c) { -#ifdef __FMA__ - return _mm_fnmadd_pd(a, b, c); -#elif defined (__FMA4__) - return _mm_nmacc_pd(a, b, c); -#else - return c - a * b; -#endif -} - - -// Multiply and subtract with 
extra precision on the intermediate calculations, -// even if FMA instructions not supported, using Veltkamp-Dekker split -static inline Vec2d mul_sub_x(Vec2d const & a, Vec2d const & b, Vec2d const & c) { -#ifdef __FMA__ - return _mm_fmsub_pd(a, b, c); -#elif defined (__FMA4__) - return _mm_msub_pd(a, b, c); -#else - // calculate a * b - c with extra precision - Vec2q upper_mask = -(1LL << 27); // mask to remove lower 27 bits - Vec2d a_high = a & Vec2d(_mm_castsi128_pd(upper_mask));// split into high and low parts - Vec2d b_high = b & Vec2d(_mm_castsi128_pd(upper_mask)); - Vec2d a_low = a - a_high; - Vec2d b_low = b - b_high; - Vec2d r1 = a_high * b_high; // this product is exact - Vec2d r2 = r1 - c; // subtract c from high product - Vec2d r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product - return r3; // + ((r2 - r1) + c); -#endif -} - - -// Math functions using fast bit manipulation - -// Extract the exponent as an integer -// exponent(a) = floor(log2(abs(a))); -// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 -static inline Vec2q exponent(Vec2d const & a) { - Vec2uq t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer - Vec2uq t2 = t1 << 1; // shift out sign bit - Vec2uq t3 = t2 >> 53; // shift down logical to position 0 - Vec2q t4 = Vec2q(t3) - 0x3FF; // subtract bias from exponent - return t4; -} - -// Extract the fraction part of a floating point number -// a = 2^exponent(a) * fraction(a), except for a = 0 -// fraction(1.0) = 1.0, fraction(5.0) = 1.25 -// NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h ! -static inline Vec2d fraction(Vec2d const & a) { - Vec2uq t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer - Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FF0000000000000ll); // set exponent to 0 + bias - return _mm_castsi128_pd(t2); -} - -// Fast calculation of pow(2,n) with n integer -// n = 0 gives 1.0 -// n >= 1024 gives +INF -// n <= -1023 gives 0.0 -// This function will never produce denormals, and never raise exceptions -static inline Vec2d exp2(Vec2q const & n) { - Vec2q t1 = max(n, -0x3FF); // limit to allowed range - Vec2q t2 = min(t1, 0x400); - Vec2q t3 = t2 + 0x3FF; // add bias - Vec2q t4 = t3 << 52; // put exponent into position 52 - return _mm_castsi128_pd(t4); // reinterpret as double -} -//static Vec2d exp2(Vec2d const & x); // defined in vectormath_exp.h - - -// Categorization functions - -// Function sign_bit: gives true for elements that have the sign bit set -// even for -0.0, -INF and -NAN -// Note that sign_bit(Vec2d(-0.0)) gives true, while Vec2d(-0.0) < Vec2d(0.0) gives false -static inline Vec2db sign_bit(Vec2d const & a) { - Vec2q t1 = _mm_castpd_si128(a); // reinterpret as 64-bit integer - Vec2q t2 = t1 >> 63; // extend sign bit - return _mm_castsi128_pd(t2); // reinterpret as 64-bit Boolean -} - -// Function sign_combine: changes the sign of a when b has the sign bit set -// same as select(sign_bit(b), -a, a) -static inline Vec2d sign_combine(Vec2d const & a, Vec2d const & b) { - Vec2d signmask = _mm_castsi128_pd(constant4ui<0,0x80000000,0,0x80000000>()); // -0.0 - return a ^ (b & signmask); -} - -// Function is_finite: gives true for elements that are normal, denormal or zero, -// false for INF and NAN -static inline Vec2db is_finite(Vec2d const & a) { - Vec2q t1 = _mm_castpd_si128(a); // reinterpret as integer - Vec2q t2 = t1 << 1; // shift out sign bit - Vec2q t3 = 0xFFE0000000000000ll; // exponent mask - Vec2qb t4 = Vec2q(t2 & 
t3) != t3; // exponent field is not all 1s - return t4; -} - -// Function is_inf: gives true for elements that are +INF or -INF -// false for finite numbers and NAN -static inline Vec2db is_inf(Vec2d const & a) { - Vec2q t1 = _mm_castpd_si128(a); // reinterpret as integer - Vec2q t2 = t1 << 1; // shift out sign bit - return t2 == 0xFFE0000000000000ll; // exponent is all 1s, fraction is 0 -} - -// Function is_nan: gives true for elements that are +NAN or -NAN -// false for finite numbers and +/-INF -static inline Vec2db is_nan(Vec2d const & a) { - Vec2q t1 = _mm_castpd_si128(a); // reinterpret as integer - Vec2q t2 = t1 << 1; // shift out sign bit - Vec2q t3 = 0xFFE0000000000000ll; // exponent mask - Vec2q t4 = t2 & t3; // exponent - Vec2q t5 = _mm_andnot_si128(t3,t2);// fraction - return Vec2qb((t4==t3) & (t5!=0)); // exponent = all 1s and fraction != 0 -} - -// Function is_subnormal: gives true for elements that are subnormal (denormal) -// false for finite numbers, zero, NAN and INF -static inline Vec2db is_subnormal(Vec2d const & a) { - Vec2q t1 = _mm_castpd_si128(a); // reinterpret as 32-bit integer - Vec2q t2 = t1 << 1; // shift out sign bit - Vec2q t3 = 0xFFE0000000000000ll; // exponent mask - Vec2q t4 = t2 & t3; // exponent - Vec2q t5 = _mm_andnot_si128(t3,t2);// fraction - return Vec2qb((t4==0) & (t5!=0)); // exponent = 0 and fraction != 0 -} - -// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) -// false for finite numbers, NAN and INF -static inline Vec2db is_zero_or_subnormal(Vec2d const & a) { - Vec2q t = _mm_castpd_si128(a); // reinterpret as 32-bit integer - t &= 0x7FF0000000000000ll; // isolate exponent - return t == 0; // exponent = 0 -} - -// Function infinite2d: returns a vector where all elements are +INF -static inline Vec2d infinite2d() { - return _mm_castsi128_pd(_mm_setr_epi32(0,0x7FF00000,0,0x7FF00000)); -} - -// Function nan2d: returns a vector where all elements are +NAN (quiet) -static inline Vec2d nan2d(int n = 0x10) { - return _mm_castsi128_pd(_mm_setr_epi32(n, 0x7FF80000, n, 0x7FF80000)); -} - - -/***************************************************************************** -* -* Functions for reinterpretation between vector types -* -*****************************************************************************/ - -static inline __m128i reinterpret_i (__m128i const & x) { - return x; -} - -static inline __m128i reinterpret_i (__m128 const & x) { - return _mm_castps_si128(x); -} - -static inline __m128i reinterpret_i (__m128d const & x) { - return _mm_castpd_si128(x); -} - -static inline __m128 reinterpret_f (__m128i const & x) { - return _mm_castsi128_ps(x); -} - -static inline __m128 reinterpret_f (__m128 const & x) { - return x; -} - -static inline __m128 reinterpret_f (__m128d const & x) { - return _mm_castpd_ps(x); -} - -static inline __m128d reinterpret_d (__m128i const & x) { - return _mm_castsi128_pd(x); -} - -static inline __m128d reinterpret_d (__m128 const & x) { - return _mm_castps_pd(x); -} - -static inline __m128d reinterpret_d (__m128d const & x) { - return x; -} - - -/***************************************************************************** -* -* Vector permute and blend functions -* -****************************************************************************** -* -* The permute function can reorder the elements of a vector and optionally -* set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. 
Each template parameter is an index to the element you want to -* select. An index of -1 will generate zero. An index of -256 means don't care. -* -* Example: -* Vec2d a(10., 11.); // a is (10, 11) -* Vec2d b, c; -* b = permute2d<1,1>(a); // b is (11, 11) -* c = permute2d<-1,0>(a); // c is ( 0, 10) -* -* -* The blend function can mix elements from two different vectors and -* optionally set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select, where indexes 0 - 1 indicate an element from the first source -* vector and indexes 2 - 3 indicate an element from the second source vector. -* An index of -1 will generate zero. -* -* -* Example: -* Vec2d a(10., 11.); // a is (10, 11) -* Vec2d b(20., 21.); // b is (20, 21) -* Vec2d c; -* c = blend2d<0,3> (a,b); // c is (10, 21) -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. -*****************************************************************************/ - -// permute vector Vec2d -template -static inline Vec2d permute2d(Vec2d const & a) { - // is shuffling needed - const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0); - // is zeroing needed - const bool do_zero = ((i0 | i1) < 0 && (i0 | i1) & 0x80); - - if (do_zero && !do_shuffle) { // zeroing, not shuffling - if ((i0 & i1) < 0) return _mm_setzero_pd(); // zero everything - // zero some elements - __m128i mask1 = constant4i< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0) >(); - return _mm_and_pd(a,_mm_castsi128_pd(mask1)); // zero with AND mask - } - else if (do_shuffle && !do_zero) { // shuffling, not zeroing - return _mm_shuffle_pd(a, a, (i0&1) | (i1&1)<<1); - } - else if (do_shuffle && do_zero) { // shuffling and zeroing - // both shuffle and zero - if (i0 < 0 && i1 >= 0) { // zero low half, shuffle high half - return _mm_shuffle_pd(_mm_setzero_pd(), a, (i1 & 1) << 1); - } - if (i0 >= 0 && i1 < 0) { // shuffle low half, zero high half - return _mm_shuffle_pd(a, _mm_setzero_pd(), i0 & 1); - } - } - return a; // trivial case: do nothing -} - - -// blend vectors Vec2d -template -static inline Vec2d blend2d(Vec2d const & a, Vec2d const & b) { - - // Combine all the indexes into a single bitfield, with 8 bits for each - const int m1 = (i0 & 3) | (i1 & 3) << 8; - - // Mask to zero out negative indexes - const int m2 = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8; - - if ((m1 & 0x0202 & m2) == 0) { - // no elements from b, only elements from a and possibly zero - return permute2d (a); - } - if (((m1^0x0202) & 0x0202 & m2) == 0) { - // no elements from a, only elements from b and possibly zero - return permute2d (b); - } - // selecting from both a and b without zeroing - if ((i0 & 2) == 0) { // first element from a, second element from b - return _mm_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1); - } - else { // first element from b, second element from a - return _mm_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1); - } -} - -// change signs on vectors Vec4f -// Each index i0 - i1 is 1 for changing sign on the corresponding element, 0 for no change -template -static inline Vec2d change_sign(Vec2d const & a) { - if ((i0 | i1) == 0) return a; - __m128i mask = constant4ui<0, i0 ? 0x80000000 : 0, 0, i1 ? 
0x80000000 : 0> (); - return _mm_xor_pd(a, _mm_castsi128_pd(mask)); // flip sign bits -} - - -/***************************************************************************** -* -* Vector lookup functions -* -****************************************************************************** -* -* These functions use vector elements as indexes into a table. -* The table is given as one or more vectors or as an array. -* -* This can be used for several purposes: -* - table lookup -* - permute or blend with variable indexes -* - blend from more than two sources -* - gather non-contiguous data -* -* An index out of range may produce any value - the actual value produced is -* implementation dependent and may be different for different instruction -* sets. An index out of range does not produce an error message or exception. -* -* Example: -* Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3) -* Vec4f b(1.0f,1.1f,1.2f,1.3f); // table b is (1.0, 1.1, 1.2, 1.3) -* Vec4f c; -* c = lookup4 (a,b); // result c is (1.2, 1.0, 1.0, 1.3) -* -*****************************************************************************/ - -static inline Vec4f lookup4(Vec4i const & index, Vec4f const & table) { -#if INSTRSET >= 7 // AVX - return _mm_permutevar_ps(table, index); -#else - int32_t ii[4]; - float tt[6]; - table.store(tt); (index & 3).store(ii); - __m128 r01 = _mm_loadh_pi(_mm_load_ss(&tt[ii[0]]), (const __m64 *)&tt[ii[1]]); - __m128 r23 = _mm_loadh_pi(_mm_load_ss(&tt[ii[2]]), (const __m64 *)&tt[ii[3]]); - return _mm_shuffle_ps(r01, r23, 0x88); -#endif -} - -static inline Vec4f lookup8(Vec4i const & index, Vec4f const & table0, Vec4f const & table1) { -#if INSTRSET >= 8 // AVX2 - __m256 tt = _mm256_insertf128_ps(_mm256_castps128_ps256(table0), table1, 1); // combine tables - -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order - __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt))); - r = _mm_and_ps(r,r); // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16) -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 has wrong parameter type and operands in wrong order - __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt)); -#else - // no bug version - __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index))); -#endif - return r; - -#elif INSTRSET >= 7 // AVX - __m128 r0 = _mm_permutevar_ps(table0, index); - __m128 r1 = _mm_permutevar_ps(table1, index); - __m128i i4 = _mm_slli_epi32(index, 29); - return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4)); - -#elif INSTRSET >= 5 // SSE4.1 - Vec4f r0 = lookup4(index, table0); - Vec4f r1 = lookup4(index, table1); - __m128i i4 = _mm_slli_epi32(index, 29); - return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4)); - -#else // SSE2 - Vec4f r0 = lookup4(index, table0); - Vec4f r1 = lookup4(index, table1); - __m128i i4 = _mm_srai_epi32(_mm_slli_epi32(index, 29), 31); - return selectf(_mm_castsi128_ps(i4), r1, r0); -#endif -} - -template -static inline Vec4f lookup(Vec4i const & index, float const * table) { - if (n <= 0) return 0.0f; - if (n <= 4) return lookup4(index, Vec4f().load(table)); - if (n <= 8) { -#if INSTRSET >= 8 // AVX2 - __m256 tt = _mm256_loadu_ps(table); -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! 
defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order - __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt))); - r = _mm_and_ps(r,r); // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16) -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 has wrong parameter type and operands in wrong order - __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt)); -#else - // no bug version - __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index))); -#endif - return r; -#else // not AVX2 - return lookup8(index, Vec4f().load(table), Vec4f().load(table+4)); -#endif // INSTRSET - } - // n > 8. Limit index - Vec4ui index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec4ui(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec4ui(index), n-1); - } -#if INSTRSET >= 8 // AVX2 - return _mm_i32gather_ps(table, index1, 4); -#else - uint32_t ii[4]; index1.store(ii); - return Vec4f(table[ii[0]], table[ii[1]], table[ii[2]], table[ii[3]]); -#endif -} - -static inline Vec2d lookup2(Vec2q const & index, Vec2d const & table) { -#if INSTRSET >= 7 // AVX - return _mm_permutevar_pd(table, index + index); -#else - int32_t ii[4]; - double tt[2]; - table.store(tt); (index & 1).store(ii); - return Vec2d(tt[ii[0]], tt[ii[2]]); -#endif -} - -static inline Vec2d lookup4(Vec2q const & index, Vec2d const & table0, Vec2d const & table1) { -#if INSTRSET >= 7 // AVX - Vec2q index2 = index + index; // index << 1 - __m128d r0 = _mm_permutevar_pd(table0, index2); - __m128d r1 = _mm_permutevar_pd(table1, index2); - __m128i i4 = _mm_slli_epi64(index, 62); - return _mm_blendv_pd(r0, r1, _mm_castsi128_pd(i4)); -#else - int32_t ii[4]; - double tt[4]; - table0.store(tt); table1.store(tt + 2); - (index & 3).store(ii); - return Vec2d(tt[ii[0]], tt[ii[2]]); -#endif -} - -template -static inline Vec2d lookup(Vec2q const & index, double const * table) { - if (n <= 0) return 0.0; - if (n <= 2) return lookup2(index, Vec2d().load(table)); -#if INSTRSET < 8 // not AVX2 - if (n <= 4) return lookup4(index, Vec2d().load(table), Vec2d().load(table + 2)); -#endif - // Limit index - Vec2uq index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec2uq(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec2uq(index), n-1); - } -#if INSTRSET >= 8 // AVX2 - return _mm_i64gather_pd(table, index1, 8); -#else - uint32_t ii[4]; index1.store(ii); - return Vec2d(table[ii[0]], table[ii[2]]); -#endif -} - - -/***************************************************************************** -* -* Gather functions with fixed indexes -* -*****************************************************************************/ -// Load elements from array a with indices i0, i1, i2, i3 -template -static inline Vec4f gather4f(void const * a) { - return reinterpret_f(gather4i(a)); -} - -// Load elements from array a with indices i0, i1 -template -static inline Vec2d gather2d(void const * a) { - return reinterpret_d(gather2q(a)); -} - -/***************************************************************************** -* -* Vector scatter functions -* -****************************************************************************** -* -* These functions write the elements of 
a vector to arbitrary positions in an -* array in memory. Each vector element is written to an array position -* determined by an index. An element is not written if the corresponding -* index is out of range. -* The indexes can be specified as constant template parameters or as an -* integer vector. -* -* The scatter functions are useful if the data are distributed in a sparce -* manner into the array. If the array is dense then it is more efficient -* to permute the data into the right positions and then write the whole -* permuted vector into the array. -* -* Example: -* Vec8d a(10,11,12,13,14,15,16,17); -* double b[16] = {0}; -* scatter<0,2,14,10,1,-1,5,9>(a,b); -* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0} -* -*****************************************************************************/ - -template -static inline void scatter(Vec4f const & data, float * array) { -#if defined (__AVX512VL__) - __m128i indx = constant4i(); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3); - _mm_mask_i32scatter_ps(array, mask, indx, data, 4); -#else - const int index[4] = {i0,i1,i2,i3}; - for (int i = 0; i < 4; i++) { - if (index[i] >= 0) array[index[i]] = data[i]; - } -#endif -} - -template -static inline void scatter(Vec2d const & data, double * array) { - if (i0 >= 0) array[i0] = data[0]; - if (i1 >= 0) array[i1] = data[1]; -} - -static inline void scatter(Vec4i const & index, uint32_t limit, Vec4f const & data, float * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit)); - _mm_mask_i32scatter_ps(array, mask, index, data, 4); -#else - for (int i = 0; i < 4; i++) { - if (uint32_t(index[i]) < limit) array[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec2q const & index, uint32_t limit, Vec2d const & data, double * array) { - if (uint64_t(index[0]) < uint64_t(limit)) array[index[0]] = data[0]; - if (uint64_t(index[1]) < uint64_t(limit)) array[index[1]] = data[1]; -} - -static inline void scatter(Vec4i const & index, uint32_t limit, Vec2d const & data, double * array) { - if (uint32_t(index[0]) < limit) array[index[0]] = data[0]; - if (uint32_t(index[1]) < limit) array[index[1]] = data[1]; -} - -/***************************************************************************** -* -* Horizontal scan functions -* -*****************************************************************************/ - -// Get index to the first element that is true. 
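A small usage sketch of these horizontal scan wrappers, assuming the vectorclass headers are on the include path; the input values are illustrative only:

#include "vectorclass.h"
#include <cstdio>

int main() {
    Vec4f a(0.5f, 0.9f, 1.5f, 2.0f);
    Vec4fb mask = a > Vec4f(1.0f);             // (false, false, true, true)
    int first = horizontal_find_first(mask);   // index of the first true element: 2
    uint32_t count = horizontal_count(mask);   // number of true elements: 2
    std::printf("%d %u\n", first, count);
    return 0;
}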
Return -1 if all are false -static inline int horizontal_find_first(Vec4fb const & x) { - return horizontal_find_first(Vec4ib(x)); -} - -static inline int horizontal_find_first(Vec2db const & x) { - return horizontal_find_first(Vec2qb(x)); -} - -// Count the number of elements that are true -static inline uint32_t horizontal_count(Vec4fb const & x) { - return horizontal_count(Vec4ib(x)); -} - -static inline uint32_t horizontal_count(Vec2db const & x) { - return horizontal_count(Vec2qb(x)); -} - -/***************************************************************************** -* -* Boolean <-> bitfield conversion functions -* -*****************************************************************************/ - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4fb const & x) { - return to_bits(Vec4ib(x)); -} - -// to_Vec4fb: convert integer bitfield to boolean vector -static inline Vec4fb to_Vec4fb(uint8_t x) { - return Vec4fb(to_Vec4ib(x)); -} - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec2db const & x) { - return to_bits(Vec2qb(x)); -} - -// to_Vec2db: convert integer bitfield to boolean vector -static inline Vec2db to_Vec2db(uint8_t x) { - return Vec2db(to_Vec2qb(x)); -} - -#ifdef VCL_NAMESPACE -} -#endif - -#endif // VECTORF128_H diff --git a/DFTTest/vectorclass/vectorf256.h b/DFTTest/vectorclass/vectorf256.h deleted file mode 100644 index feeeda5..0000000 --- a/DFTTest/vectorclass/vectorf256.h +++ /dev/null @@ -1,3349 +0,0 @@ -/**************************** vectorf256.h ******************************* -* Author: Agner Fog -* Date created: 2012-05-30 -* Last modified: 2017-07-27 -* Version: 1.30 -* Project: vector classes -* Description: -* Header file defining 256-bit floating point vector classes as interface -* to intrinsic functions in x86 microprocessors with AVX instruction set. -* -* Instructions: -* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired -* instruction set, which must be at least AVX. -* -* The following vector classes are defined here: -* Vec8f Vector of 8 single precision floating point numbers -* Vec8fb Vector of 8 Booleans for use with Vec8f -* Vec4d Vector of 4 double precision floating point numbers -* Vec4db Vector of 4 Booleans for use with Vec4d -* -* Each vector object is represented internally in the CPU as a 256-bit register. -* This header file defines operators and functions for these vectors. -* -* For example: -* Vec4d a(1., 2., 3., 4.), b(5., 6., 7., 8.), c; -* c = a + b; // now c contains (6., 8., 10., 12.) -* -* For detailed instructions, see VectorClass.pdf -* -* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses -*****************************************************************************/ - -// check combination of header files -#if defined (VECTORF256_H) -#if VECTORF256_H != 2 -#error Two different versions of vectorf256.h included -#endif -#else -#define VECTORF256_H 2 - -#if INSTRSET < 7 // AVX required -#error Please compile for the AVX instruction set or higher -#endif - -#include "vectorf128.h" // Define 128-bit vectors - -#ifdef VCL_NAMESPACE -namespace VCL_NAMESPACE { -#endif - -/***************************************************************************** -* -* select functions -* -*****************************************************************************/ -// Select between two __m256 sources, element by element. Used in various functions -// and operators. 
Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; -// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). -static inline __m256 selectf (__m256 const & s, __m256 const & a, __m256 const & b) { - return _mm256_blendv_ps (b, a, s); -} - -// Same, with two __m256d sources. -// and operators. Corresponds to this pseudocode: -// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; -// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other -// values are allowed. -static inline __m256d selectd (__m256d const & s, __m256d const & a, __m256d const & b) { - return _mm256_blendv_pd (b, a, s); -} - - - -/***************************************************************************** -* -* Generate compile-time constant vector -* -*****************************************************************************/ -// Generate a constant vector of 8 integers stored in memory, -// load as __m256 -template -static inline __m256 constant8f() { - static const union { - int i[8]; - __m256 ymm; - } u = {{i0,i1,i2,i3,i4,i5,i6,i7}}; - return u.ymm; -} - - -/***************************************************************************** -* -* Join two 128-bit vectors -* -*****************************************************************************/ -#define set_m128r(lo,hi) _mm256_insertf128_ps(_mm256_castps128_ps256(lo),(hi),1) - // _mm256_set_m128(hi,lo); // not defined in all versions of immintrin.h - - -/***************************************************************************** -* -* Vec8fb: Vector of 8 Booleans for use with Vec8f -* -*****************************************************************************/ - -class Vec8fb { -protected: - __m256 ymm; // Float vector -public: - // Default constructor: - Vec8fb() { - } - // Constructor to build from all elements: - Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) { -#if INSTRSET >= 8 // AVX2 - ymm = _mm256_castsi256_ps(_mm256_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7)); -#else - __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3)); - __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b4, -(int)b5, -(int)b6, -(int)b7)); - ymm = set_m128r(blo,bhi); -#endif - } - // Constructor to build from two Vec4fb: - Vec8fb(Vec4fb const & a0, Vec4fb const & a1) { - ymm = set_m128r(a0, a1); - } - // Constructor to convert from type __m256 used in intrinsics: - Vec8fb(__m256 const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256 used in intrinsics: - Vec8fb & operator = (__m256 const & x) { - ymm = x; - return *this; - } - // Constructor to broadcast the same value into all elements: - Vec8fb(bool b) { -#if INSTRSET >= 8 // AVX2 - ymm = _mm256_castsi256_ps(_mm256_set1_epi32(-(int)b)); -#else - __m128 b1 = _mm_castsi128_ps(_mm_set1_epi32(-(int)b)); - //ymm = _mm256_set_m128(b1,b1); - ymm = set_m128r(b1,b1); -#endif - } - // Assignment operator to broadcast scalar value: - Vec8fb & operator = (bool b) { - *this = Vec8fb(b); - return *this; - } -private: // Prevent constructing from int, etc. 
- Vec8fb(int b); - Vec8fb & operator = (int x); -public: - // Type cast operator to convert to __m256 used in intrinsics - operator __m256() const { - return ymm; - } -#if defined (VECTORI256_H) -#if VECTORI256_H >= 2 // AVX2 version - // Constructor to convert from type Vec8ib used as Boolean for integer vectors - Vec8fb(Vec8ib const & x) { - ymm = _mm256_castsi256_ps(x); - } - // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors - Vec8fb & operator = (Vec8ib const & x) { - ymm = _mm256_castsi256_ps(x); - return *this; - } -#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY - // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors - operator Vec8ib() const { - return _mm256_castps_si256(ymm); - } -#endif -#else - // Constructor to convert from type Vec8ib used as Boolean for integer vectors - Vec8fb(Vec8ib const & x) { - ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high())); - } - // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors - Vec8fb & operator = (Vec8ib const & x) { - ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high())); - return *this; - } - // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors - operator Vec8ib() const { - return Vec8i(_mm_castps_si128(get_low()), _mm_castps_si128(get_high())); - } -#endif -#endif // VECTORI256_H - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8fb const & insert(uint32_t index, bool value) { - static const int32_t maskl[16] = {0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0}; - __m256 mask = _mm256_loadu_ps((float const*)(maskl+8-(index & 7))); // mask with FFFFFFFF at index position - if (value) { - ymm = _mm256_or_ps(ymm,mask); - } - else { - ymm = _mm256_andnot_ps(mask,ymm); - } - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - union { - float f[8]; - int32_t i[8]; - } u; - _mm256_storeu_ps(u.f, ymm); - return u.i[index & 7] != 0; - } - // Extract a single element. Operator [] can only read an element, not write. 
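A brief usage sketch of these element-access members of Vec8fb, assuming the headers are compiled with AVX enabled; the values are illustrative only:

#include "vectorclass.h"
#include <cstdio>

int main() {
    Vec8fb b(false);        // broadcast false into all 8 elements
    b.insert(3, true);      // set element 3 (inefficient for many elements; prefer a full load)
    bool e = b.extract(3);  // read element 3 back: true
    bool f = b[5];          // operator[] is read-only element access: false
    std::printf("%d %d\n", e, f);
    return 0;
}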
- bool operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec4fb: - Vec4fb get_low() const { - return _mm256_castps256_ps128(ymm); - } - Vec4fb get_high() const { - return _mm256_extractf128_ps(ymm,1); - } - static int size () { - return 8; - } -}; - - -/***************************************************************************** -* -* Operators for Vec8fb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) { - return _mm256_and_ps(a, b); -} -static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) { - return a & b; -} - -// vector operator &= : bitwise and -static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) { - return _mm256_or_ps(a, b); -} -static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) { - return a | b; -} - -// vector operator |= : bitwise or -static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) { - return _mm256_xor_ps(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec8fb operator ~ (Vec8fb const & a) { - return _mm256_xor_ps(a, constant8f<-1,-1,-1,-1,-1,-1,-1,-1>()); -} - -// vector operator ! : logical not -// (operator ! is less efficient than operator ~. Use only where not -// all bits in an element are the same) -static inline Vec8fb operator ! (Vec8fb const & a) { -return Vec8fb( !Vec8ib(a)); -} - -// Functions for Vec8fb - -// andnot: a & ~ b -static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) { - return _mm256_andnot_ps(b, a); -} - - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - -// horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec8fb const & a) { - return _mm256_testc_ps(a,constant8f<-1,-1,-1,-1,-1,-1,-1,-1>()) != 0; -} - -// horizontal_or. Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec8fb const & a) { - return ! 
_mm256_testz_ps(a,a); -} - - -/***************************************************************************** -* -* Vec4db: Vector of 4 Booleans for use with Vec4d -* -*****************************************************************************/ - -class Vec4db { -protected: - __m256d ymm; // double vector -public: - // Default constructor: - Vec4db() { - } - // Constructor to build from all elements: - Vec4db(bool b0, bool b1, bool b2, bool b3) { -#if INSTRSET >= 8 // AVX2 - ymm = _mm256_castsi256_pd(_mm256_setr_epi64x(-(int64_t)b0, -(int64_t)b1, -(int64_t)b2, -(int64_t)b3)); -#else - __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1)); - __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b2, -(int)b2, -(int)b3, -(int)b3)); - ymm = _mm256_castps_pd(set_m128r(blo, bhi)); -#endif - } - // Constructor to build from two Vec2db: - Vec4db(Vec2db const & a0, Vec2db const & a1) { - ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0),_mm_castpd_ps(a1))); - //ymm = _mm256_set_m128d(a1, a0); - } - // Constructor to convert from type __m256d used in intrinsics: - Vec4db(__m256d const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256d used in intrinsics: - Vec4db & operator = (__m256d const & x) { - ymm = x; - return *this; - } - // Constructor to broadcast the same value into all elements: - Vec4db(bool b) { -#if INSTRSET >= 8 // AVX2 - ymm = _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64_t)b)); -#else - __m128 b1 = _mm_castsi128_ps(_mm_set1_epi32(-(int)b)); - ymm = _mm256_castps_pd(set_m128r(b1,b1)); -#endif - } - // Assignment operator to broadcast scalar value: - Vec4db & operator = (bool b) { - ymm = _mm256_castsi256_pd(_mm256_set1_epi32(-int32_t(b))); - return *this; - } -private: // Prevent constructing from int, etc. - Vec4db(int b); - Vec4db & operator = (int x); -public: - // Type cast operator to convert to __m256d used in intrinsics - operator __m256d() const { - return ymm; - } -#ifdef VECTORI256_H -#if VECTORI256_H == 2 // 256 bit integer vectors are available, AVX2 - // Constructor to convert from type Vec4qb used as Boolean for integer vectors - Vec4db(Vec4qb const & x) { - ymm = _mm256_castsi256_pd(x); - } - // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors - Vec4db & operator = (Vec4qb const & x) { - ymm = _mm256_castsi256_pd(x); - return *this; - } -#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY - // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors - operator Vec4qb() const { - return _mm256_castpd_si256(ymm); - } -#endif -#else // 256 bit integer vectors emulated without AVX2 - // Constructor to convert from type Vec4qb used as Boolean for integer vectors - Vec4db(Vec4qb const & x) { - *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high())); - } - // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors - Vec4db & operator = (Vec4qb const & x) { - *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high())); - return *this; - } - // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors - operator Vec4qb() const { - return Vec4q(_mm_castpd_si128(get_low()), _mm_castpd_si128(get_high())); - } -#endif -#endif // VECTORI256_H - // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec4db const & insert(uint32_t index, bool value) { - static const int32_t maskl[16] = {0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0}; - __m256d mask = _mm256_loadu_pd((double const*)(maskl+8-(index&3)*2)); // mask with FFFFFFFFFFFFFFFF at index position - if (value) { - ymm = _mm256_or_pd(ymm,mask); - } - else { - ymm = _mm256_andnot_pd(mask,ymm); - } - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - union { - double f[8]; - int32_t i[16]; - } u; - _mm256_storeu_pd(u.f, ymm); - return u.i[(index & 3) * 2 + 1] != 0; - } - // Extract a single element. Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec4fb: - Vec2db get_low() const { - return _mm256_castpd256_pd128(ymm); - } - Vec2db get_high() const { - return _mm256_extractf128_pd(ymm,1); - } - static int size () { - return 4; - } -}; - - -/***************************************************************************** -* -* Operators for Vec4db -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) { - return _mm256_and_pd(a, b); -} -static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) { - return a & b; -} - -// vector operator &= : bitwise and -static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) { - return _mm256_or_pd(a, b); -} -static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) { - return a | b; -} - -// vector operator |= : bitwise or -static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) { - return _mm256_xor_pd(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec4db operator ~ (Vec4db const & a) { - return _mm256_xor_pd(a, _mm256_castps_pd (constant8f<-1,-1,-1,-1,-1,-1,-1,-1>())); -} - -// vector operator ! : logical not -// (operator ! is less efficient than operator ~. Use only where not -// all bits in an element are the same) -static inline Vec4db operator ! (Vec4db const & a) { -return Vec4db( ! Vec4qb(a)); -} - -// Functions for Vec8fb - -// andnot: a & ~ b -static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) { - return _mm256_andnot_pd(b, a); -} - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - -// horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec4db const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - return horizontal_and(Vec256b(_mm256_castpd_si256(a))); -#else // split into 128 bit vectors - return horizontal_and(a.get_low() & a.get_high()); -#endif -} - -// horizontal_or. 
Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec4db const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - return horizontal_or(Vec256b(_mm256_castpd_si256(a))); -#else // split into 128 bit vectors - return horizontal_or(a.get_low() | a.get_high()); -#endif -} - - - /***************************************************************************** -* -* Vec8f: Vector of 8 single precision floating point values -* -*****************************************************************************/ - -class Vec8f { -protected: - __m256 ymm; // Float vector -public: - // Default constructor: - Vec8f() { - } - // Constructor to broadcast the same value into all elements: - Vec8f(float f) { - ymm = _mm256_set1_ps(f); - } - // Constructor to build from all elements: - Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) { - ymm = _mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7); - } - // Constructor to build from two Vec4f: - Vec8f(Vec4f const & a0, Vec4f const & a1) { - ymm = set_m128r(a0, a1); - //ymm = _mm256_set_m128(a1, a0); - } - // Constructor to convert from type __m256 used in intrinsics: - Vec8f(__m256 const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256 used in intrinsics: - Vec8f & operator = (__m256 const & x) { - ymm = x; - return *this; - } - // Type cast operator to convert to __m256 used in intrinsics - operator __m256() const { - return ymm; - } - // Member function to load from array (unaligned) - Vec8f & load(void const * p) { - ymm = _mm256_loadu_ps((float const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - // You may use load_a instead of load if you are certain that p points to an address - // divisible by 32. - Vec8f & load_a(void const * p) { - ymm = _mm256_load_ps((float const*)p); - return *this; - } - // Member function to store into array (unaligned) - void store(float * p) const { - _mm256_storeu_ps(p, ymm); - } - // Member function to store into array, aligned by 32 - // You may use store_a instead of store if you are certain that p points to an address - // divisible by 32. - void store_a(float * p) const { - _mm256_store_ps(p, ymm); - } - // Member function to store into array using a non-temporal memory hint, aligned by 32 - void stream(float * p) const { - _mm256_stream_ps(p, ymm); - } - // Partial load. Load n elements and set the rest to 0 - Vec8f & load_partial(int n, float const * p) { - if (n > 0 && n <= 4) { - *this = Vec8f(Vec4f().load_partial(n, p), _mm_setzero_ps()); - // ymm = _mm256_castps128_ps256(Vec4f().load_partial(p)); (this doesn't work on MS compiler due to sloppy definition of the cast) - } - else if (n > 4 && n <= 8) { - *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4)); - } - else { - ymm = _mm256_setzero_ps(); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, float * p) const { - if (n <= 4) { - get_low().store_partial(n, p); - } - else if (n <= 8) { - get_low().store(p); - get_high().store_partial(n - 4, p + 4); - } - } - // cut off vector to n elements. 
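A short usage sketch of the partial load/store members defined above, assuming the headers are compiled with AVX enabled; buffer contents are illustrative only:

#include "vectorclass.h"
#include <cstdio>

int main() {
    float in[6]  = {1, 2, 3, 4, 5, 6};
    float out[6] = {0};

    Vec8f v;
    v.load_partial(6, in);     // elements 0..5 come from in, elements 6..7 are set to 0
    v *= 2.0f;                 // ordinary full-width arithmetic on the whole register
    v.store_partial(6, out);   // write back only the first 6 elements

    for (float x : out)
        std::printf("%g ", x); // 2 4 6 8 10 12
    std::printf("\n");
    return 0;
}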
The last 8-n elements are set to zero - Vec8f & cutoff(int n) { - if (uint32_t(n) >= 8) return *this; - static const union { - int32_t i[16]; - float f[16]; - } mask = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}}; - *this = Vec8fb(*this) & Vec8fb(Vec8f().load(mask.f + 8 - n)); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8f const & insert(uint32_t index, float value) { - __m256 v0 = _mm256_broadcast_ss(&value); - switch (index) { - case 0: - ymm = _mm256_blend_ps (ymm, v0, 1); break; - case 1: - ymm = _mm256_blend_ps (ymm, v0, 2); break; - case 2: - ymm = _mm256_blend_ps (ymm, v0, 4); break; - case 3: - ymm = _mm256_blend_ps (ymm, v0, 8); break; - case 4: - ymm = _mm256_blend_ps (ymm, v0, 0x10); break; - case 5: - ymm = _mm256_blend_ps (ymm, v0, 0x20); break; - case 6: - ymm = _mm256_blend_ps (ymm, v0, 0x40); break; - default: - ymm = _mm256_blend_ps (ymm, v0, 0x80); break; - } - return *this; - } - // Member function extract a single element from vector - float extract(uint32_t index) const { - float x[8]; - store(x); - return x[index & 7]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - float operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec4f: - Vec4f get_low() const { - return _mm256_castps256_ps128(ymm); - } - Vec4f get_high() const { - return _mm256_extractf128_ps(ymm,1); - } - static int size () { - return 8; - } -}; - - -/***************************************************************************** -* -* Operators for Vec8f -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) { - return _mm256_add_ps(a, b); -} - -// vector operator + : add vector and scalar -static inline Vec8f operator + (Vec8f const & a, float b) { - return a + Vec8f(b); -} -static inline Vec8f operator + (float a, Vec8f const & b) { - return Vec8f(a) + b; -} - -// vector operator += : add -static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec8f operator ++ (Vec8f & a, int) { - Vec8f a0 = a; - a = a + 1.0f; - return a0; -} - -// prefix operator ++ -static inline Vec8f & operator ++ (Vec8f & a) { - a = a + 1.0f; - return a; -} - -// vector operator - : subtract element by element -static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) { - return _mm256_sub_ps(a, b); -} - -// vector operator - : subtract vector and scalar -static inline Vec8f operator - (Vec8f const & a, float b) { - return a - Vec8f(b); -} -static inline Vec8f operator - (float a, Vec8f const & b) { - return Vec8f(a) - b; -} - -// vector operator - : unary minus -// Change sign bit, even for 0, INF and NAN -static inline Vec8f operator - (Vec8f const & a) { - return _mm256_xor_ps(a, constant8f<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000> ()); -} - -// vector operator -= : subtract -static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec8f operator -- (Vec8f & a, int) { - Vec8f a0 = a; - a = a - 1.0f; - return a0; -} - -// prefix operator -- -static inline Vec8f & 
operator -- (Vec8f & a) { - a = a - 1.0f; - return a; -} - -// vector operator * : multiply element by element -static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) { - return _mm256_mul_ps(a, b); -} - -// vector operator * : multiply vector and scalar -static inline Vec8f operator * (Vec8f const & a, float b) { - return a * Vec8f(b); -} -static inline Vec8f operator * (float a, Vec8f const & b) { - return Vec8f(a) * b; -} - -// vector operator *= : multiply -static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) { - return _mm256_div_ps(a, b); -} - -// vector operator / : divide vector and scalar -static inline Vec8f operator / (Vec8f const & a, float b) { - return a / Vec8f(b); -} -static inline Vec8f operator / (float a, Vec8f const & b) { - return Vec8f(a) / b; -} - -// vector operator /= : divide -static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) { - a = a / b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) { - return _mm256_cmp_ps(a, b, 0); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) { - return _mm256_cmp_ps(a, b, 4); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) { - return _mm256_cmp_ps(a, b, 1); -} - -// vector operator <= : returns true for elements for which a <= b -static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) { - return _mm256_cmp_ps(a, b, 2); -} - -// vector operator > : returns true for elements for which a > b -static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) { - return b <= a; -} - -// Bitwise logical operators - -// vector operator & : bitwise and -static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) { - return _mm256_and_ps(a, b); -} - -// vector operator &= : bitwise and -static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) { - a = a & b; - return a; -} - -// vector operator & : bitwise and of Vec8f and Vec8fb -static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) { - return _mm256_and_ps(a, b); -} -static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) { - return _mm256_and_ps(a, b); -} - -// vector operator | : bitwise or -static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) { - return _mm256_or_ps(a, b); -} - -// vector operator |= : bitwise or -static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) { - return _mm256_xor_ps(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) { - a = a ^ b; - return a; -} - -// vector operator ! : logical not. Returns Boolean vector -static inline Vec8fb operator ! 
(Vec8f const & a) { - return a == Vec8f(0.0f); -} - - -/***************************************************************************** -* -* Functions for Vec8f -* -*****************************************************************************/ - -static inline Vec8f zero_8f() { - return _mm256_setzero_ps(); -} - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed. -static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) { - return _mm256_blendv_ps (b, a, s); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) { - return a + (Vec8f(f) & b); -} - -// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i] -static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) { - return a * select(f, b, 1.f); -} - - -// General arithmetic functions, etc. - -// Horizontal add: Calculates the sum of all vector elements. -static inline float horizontal_add (Vec8f const & a) { - __m256 t1 = _mm256_hadd_ps(a,a); - __m256 t2 = _mm256_hadd_ps(t1,t1); - __m128 t3 = _mm256_extractf128_ps(t2,1); - __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3); - return _mm_cvtss_f32(t4); -} - -// function max: a > b ? a : b -static inline Vec8f max(Vec8f const & a, Vec8f const & b) { - return _mm256_max_ps(a,b); -} - -// function min: a < b ? a : b -static inline Vec8f min(Vec8f const & a, Vec8f const & b) { - return _mm256_min_ps(a,b); -} - -// function abs: absolute value -// Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec8f abs(Vec8f const & a) { - __m256 mask = constant8f<0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF> (); - return _mm256_and_ps(a,mask); -} - -// function sqrt: square root -static inline Vec8f sqrt(Vec8f const & a) { - return _mm256_sqrt_ps(a); -} - -// function square: a * a -static inline Vec8f square(Vec8f const & a) { - return a * a; -} - -// pow(Vec8f, int): -template static Vec8f pow(Vec8f const & a, TT const & n); - -// Raise floating point numbers to integer power n -template <> -inline Vec8f pow(Vec8f const & x0, int const & n) { - return pow_template_i(x0, n); -} - -// allow conversion from unsigned int -template <> -inline Vec8f pow(Vec8f const & x0, uint32_t const & n) { - return pow_template_i(x0, (int)n); -} - - -// Raise floating point numbers to integer power n, where n is a compile-time constant -template -static inline Vec8f pow_n(Vec8f const & a) { - if (n < 0) return Vec8f(1.0f) / pow_n<-n>(a); - if (n == 0) return Vec8f(1.0f); - if (n >= 256) return pow(a, n); - Vec8f x = a; // a^(2^i) - Vec8f y; // accumulator - const int lowest = n - (n & (n-1));// lowest set bit in n - if (n & 1) y = x; - if (n < 2) return y; - x = x*x; // x^2 - if (n & 2) { - if (lowest == 2) y = x; else y *= x; - } - if (n < 4) return y; - x = x*x; // x^4 - if (n & 4) { - if (lowest == 4) y = x; else y *= x; - } - if (n < 8) return y; - x = x*x; // x^8 - if (n & 8) { - if (lowest == 8) y = x; else y *= x; - } - if (n < 16) return y; - x = x*x; // x^16 - if (n & 16) { - if (lowest == 16) y = x; else y *= x; - } - if (n < 32) return y; - x = x*x; // x^32 - if (n & 32) { - if (lowest == 32) y = x; else y *= x; - } - if (n < 64) return y; - x = x*x; // x^64 - if (n & 64) { - if (lowest == 
64) y = x; else y *= x; - } - if (n < 128) return y; - x = x*x; // x^128 - if (n & 128) { - if (lowest == 128) y = x; else y *= x; - } - return y; -} - -template -static inline Vec8f pow(Vec8f const & a, Const_int_t) { - return pow_n(a); -} - - -// function round: round to nearest integer (even). (result as float vector) -static inline Vec8f round(Vec8f const & a) { - return _mm256_round_ps(a, 0+8); -} - -// function truncate: round towards zero. (result as float vector) -static inline Vec8f truncate(Vec8f const & a) { - return _mm256_round_ps(a, 3+8); -} - -// function floor: round towards minus infinity. (result as float vector) -static inline Vec8f floor(Vec8f const & a) { - return _mm256_round_ps(a, 1+8); -} - -// function ceil: round towards plus infinity. (result as float vector) -static inline Vec8f ceil(Vec8f const & a) { - return _mm256_round_ps(a, 2+8); -} - -#ifdef VECTORI256_H // 256 bit integer vectors are available -#if VECTORI256_H > 1 // AVX2 -// function round_to_int: round to nearest integer (even). (result as integer vector) -static inline Vec8i round_to_int(Vec8f const & a) { - // Note: assume MXCSR control register is set to rounding - return _mm256_cvtps_epi32(a); -} - -// function truncate_to_int: round towards zero. (result as integer vector) -static inline Vec8i truncate_to_int(Vec8f const & a) { - return _mm256_cvttps_epi32(a); -} - -// function to_float: convert integer vector to float vector -static inline Vec8f to_float(Vec8i const & a) { - return _mm256_cvtepi32_ps(a); -} - -// function to_float: convert unsigned integer vector to float vector -static inline Vec8f to_float(Vec8ui const & a) { -#ifdef __AVX512VL__ - return _mm256_cvtepu32_ps(a); -#else - Vec8f b = to_float(Vec8i(a & 0x7FFFFFFF)); // 31 bits - Vec8i c = Vec8i(a) >> 31; // generate mask from highest bit - Vec8f d = Vec8f(2147483648.f) & Vec8f(_mm256_castsi256_ps(c));// mask floating point constant 2^31 - return b + d; -#endif -} - -#else // no AVX2 - -// function round_to_int: round to nearest integer (even). (result as integer vector) -static inline Vec8i round_to_int(Vec8f const & a) { - // Note: assume MXCSR control register is set to rounding - return Vec8i(_mm_cvtps_epi32(a.get_low()), _mm_cvtps_epi32(a.get_high())); -} - -// function truncate_to_int: round towards zero. 
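A standalone scalar sketch of the unsigned-to-float conversion technique used above when _mm256_cvtepu32_ps is unavailable: the low 31 bits are converted as a signed value and 2^31 is added back for inputs whose top bit was set; the helper name u32_to_float is illustrative only:

#include <cstdint>
#include <cstdio>

static float u32_to_float(uint32_t a) {
    float b = static_cast<float>(static_cast<int32_t>(a & 0x7FFFFFFFu)); // low 31 bits, always non-negative
    float d = (a & 0x80000000u) ? 2147483648.0f : 0.0f;                  // add 2^31 if the top bit was set
    return b + d;
}

int main() {
    // prints 5 and 2147483648 (2147483653 rounded to the nearest float)
    std::printf("%f %f\n", u32_to_float(5u), u32_to_float(0x80000005u));
    return 0;
}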
(result as integer vector) -static inline Vec8i truncate_to_int(Vec8f const & a) { - return Vec8i(_mm_cvttps_epi32(a.get_low()), _mm_cvttps_epi32(a.get_high())); -} - -// function to_float: convert integer vector to float vector -static inline Vec8f to_float(Vec8i const & a) { - return Vec8f(_mm_cvtepi32_ps(a.get_low()), _mm_cvtepi32_ps(a.get_high())); -} - -// function to_float: convert unsigned integer vector to float vector -static inline Vec8f to_float(Vec8ui const & a) { - return Vec8f(to_float(a.get_low()), to_float(a.get_high())); -} -#endif -#endif // VECTORI256_H - - -// Fused multiply and add functions - -// Multiply and add -static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { -#ifdef __FMA__ - return _mm256_fmadd_ps(a, b, c); -#elif defined (__FMA4__) - return _mm256_macc_ps(a, b, c); -#else - return a * b + c; -#endif - -} - -// Multiply and subtract -static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) { -#ifdef __FMA__ - return _mm256_fmsub_ps(a, b, c); -#elif defined (__FMA4__) - return _mm256_msub_ps(a, b, c); -#else - return a * b - c; -#endif -} - -// Multiply and inverse subtract -static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) { -#ifdef __FMA__ - return _mm256_fnmadd_ps(a, b, c); -#elif defined (__FMA4__) - return _mm256_nmacc_ps(a, b, c); -#else - return c - a * b; -#endif -} - - -// Multiply and subtract with extra precision on the intermediate calculations, -// even if FMA instructions not supported, using Veltkamp-Dekker split -static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) { -#ifdef __FMA__ - return _mm256_fmsub_ps(a, b, c); -#elif defined (__FMA4__) - return _mm256_msub_ps(a, b, c); -#else - // calculate a * b - c with extra precision - const int b12 = -(1 << 12); // mask to remove lower 12 bits - Vec8f upper_mask = constant8f(); - Vec8f a_high = a & upper_mask; // split into high and low parts - Vec8f b_high = b & upper_mask; - Vec8f a_low = a - a_high; - Vec8f b_low = b - b_high; - Vec8f r1 = a_high * b_high; // this product is exact - Vec8f r2 = r1 - c; // subtract c from high product - Vec8f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product - return r3; // + ((r2 - r1) + c); -#endif -} - - -// Approximate math functions - -// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11) -static inline Vec8f approx_recipr(Vec8f const & a) { -#if INSTRSET >= 9 // use more accurate version if available. (none of these will raise exceptions on zero) -#ifdef __AVX512ER__ // AVX512ER: full precision - // todo: if future processors have both AVX512ER and AVX512VL: _mm256_rcp28_round_ps(a, _MM_FROUND_NO_EXC); - return _mm512_castps512_ps256(_mm512_rcp28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC)); -#elif defined __AVX512VL__ // AVX512VL: 14 bit precision - return _mm256_rcp14_ps(a); -#else // AVX512F: 14 bit precision - return _mm512_castps512_ps256(_mm512_rcp14_ps(_mm512_castps256_ps512(a))); -#endif -#else // AVX: 11 bit precision - return _mm256_rcp_ps(a); -#endif -} - -// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11) -static inline Vec8f approx_rsqrt(Vec8f const & a) { -#if INSTRSET >= 9 // use more accurate version if available. 
(none of these will raise exceptions on zero) -#ifdef __AVX512ER__ // AVX512ER: full precision - // todo: if future processors have both AVX512ER and AVX521VL: _mm256_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC); - return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC)); -#elif defined __AVX512VL__ // AVX512VL: 14 bit precision - return _mm256_rsqrt14_ps(a); -#else // AVX512F: 14 bit precision - return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a))); -#endif -#else // AVX: 11 bit precision - return _mm256_rsqrt_ps(a); -#endif -} - - -// Math functions using fast bit manipulation - -#ifdef VECTORI256_H // 256 bit integer vectors are available, AVX2 -// Extract the exponent as an integer -// exponent(a) = floor(log2(abs(a))); -// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128 -static inline Vec8i exponent(Vec8f const & a) { -#if VECTORI256_H > 1 // AVX2 - Vec8ui t1 = _mm256_castps_si256(a);// reinterpret as 32-bit integer - Vec8ui t2 = t1 << 1; // shift out sign bit - Vec8ui t3 = t2 >> 24; // shift down logical to position 0 - Vec8i t4 = Vec8i(t3) - 0x7F; // subtract bias from exponent - return t4; -#else // no AVX2 - return Vec8i(exponent(a.get_low()), exponent(a.get_high())); -#endif -} -#endif - -// Extract the fraction part of a floating point number -// a = 2^exponent(a) * fraction(a), except for a = 0 -// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f -static inline Vec8f fraction(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 2 // 256 bit integer vectors are available, AVX2 - Vec8ui t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer - Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F800000; // set exponent to 0 + bias - return _mm256_castsi256_ps(t2); -#else - return Vec8f(fraction(a.get_low()), fraction(a.get_high())); -#endif -} - -#ifdef VECTORI256_H // 256 bit integer vectors are available, AVX2 -// Fast calculation of pow(2,n) with n integer -// n = 0 gives 1.0f -// n >= 128 gives +INF -// n <= -127 gives 0.0f -// This function will never produce denormals, and never raise exceptions -static inline Vec8f exp2(Vec8i const & n) { -#if VECTORI256_H > 1 // AVX2 - Vec8i t1 = max(n, -0x7F); // limit to allowed range - Vec8i t2 = min(t1, 0x80); - Vec8i t3 = t2 + 0x7F; // add bias - Vec8i t4 = t3 << 23; // put exponent into position 23 - return _mm256_castsi256_ps(t4); // reinterpret as float -#else - return Vec8f(exp2(n.get_low()), exp2(n.get_high())); -#endif -} -//static inline Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h - -#endif // VECTORI256_H - - -// Categorization functions - -// Function sign_bit: gives true for elements that have the sign bit set -// even for -0.0f, -INF and -NAN -// Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb sign_bit(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer - Vec8i t2 = t1 >> 31; // extend sign bit - return _mm256_castsi256_ps(t2); // reinterpret as 32-bit Boolean -#else - return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high())); -#endif -} - -// Function sign_combine: changes the sign of a when b has the sign bit set -// same as select(sign_bit(b), -a, a) -static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) { - Vec8f 
signmask = constant8f<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000>(); // -0.0 - return a ^ (b & signmask); -} - -// Function is_finite: gives true for elements that are normal, denormal or zero, -// false for INF and NAN -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb is_finite(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer - Vec8i t2 = t1 << 1; // shift out sign bit - Vec8ib t3 = Vec8i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s - return t3; -#else - return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high())); -#endif -} - -// Function is_inf: gives true for elements that are +INF or -INF -// false for finite numbers and NAN -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb is_inf(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer - Vec8i t2 = t1 << 1; // shift out sign bit - return t2 == 0xFF000000; // exponent is all 1s, fraction is 0 -#else - return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high())); -#endif -} - -// Function is_nan: gives true for elements that are +NAN or -NAN -// false for finite numbers and +/-INF -// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h) -static inline Vec8fb is_nan(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer - Vec8i t2 = t1 << 1; // shift out sign bit - Vec8i t3 = 0xFF000000; // exponent mask - Vec8i t4 = t2 & t3; // exponent - Vec8i t5 = _mm256_andnot_si256(t3,t2);// fraction - return Vec8ib(t4 == t3 && t5 != 0);// exponent = all 1s and fraction != 0 -#else - return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high())); -#endif -} - -// Function is_subnormal: gives true for elements that are denormal (subnormal) -// false for finite numbers, zero, NAN and INF -static inline Vec8fb is_subnormal(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer - Vec8i t2 = t1 << 1; // shift out sign bit - Vec8i t3 = 0xFF000000; // exponent mask - Vec8i t4 = t2 & t3; // exponent - Vec8i t5 = _mm256_andnot_si256(t3,t2);// fraction - return Vec8ib(t4 == 0 && t5 != 0); // exponent = 0 and fraction != 0 -#else - return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high())); -#endif -} - -// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) -// false for finite numbers, NAN and INF -static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec8i t = _mm256_castps_si256(a); // reinterpret as 32-bit integer - t &= 0x7F800000; // isolate exponent - return t == 0; // exponent = 0 -#else - return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high())); -#endif -} - -// Function infinite4f: returns a vector where all elements are +INF -static inline Vec8f infinite8f() { - return 
constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>(); -} - -// Function nan4f: returns a vector where all elements are +NAN (quiet) -static inline Vec8f nan8f(int n = 0x10) { - return _mm256_castsi256_ps(_mm256_set1_epi32(0x7FC00000 + n)); -} - -// change signs on vectors Vec8f -// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change -template -static inline Vec8f change_sign(Vec8f const & a) { - if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a; - __m256 mask = constant8f (); - return _mm256_xor_ps(a, mask); -} - - -/***************************************************************************** -* -* Vec4d: Vector of 4 double precision floating point values -* -*****************************************************************************/ - -class Vec4d { -protected: - __m256d ymm; // double vector -public: - // Default constructor: - Vec4d() { - } - // Constructor to broadcast the same value into all elements: - Vec4d(double d) { - ymm = _mm256_set1_pd(d); - } - // Constructor to build from all elements: - Vec4d(double d0, double d1, double d2, double d3) { - ymm = _mm256_setr_pd(d0, d1, d2, d3); - } - // Constructor to build from two Vec2d: - Vec4d(Vec2d const & a0, Vec2d const & a1) { - ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0), _mm_castpd_ps(a1))); - //ymm = _mm256_set_m128d(a1, a0); - } - // Constructor to convert from type __m256d used in intrinsics: - Vec4d(__m256d const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256d used in intrinsics: - Vec4d & operator = (__m256d const & x) { - ymm = x; - return *this; - } - // Type cast operator to convert to __m256d used in intrinsics - operator __m256d() const { - return ymm; - } - // Member function to load from array (unaligned) - Vec4d & load(double const * p) { - ymm = _mm256_loadu_pd(p); - return *this; - } - // Member function to load from array, aligned by 32 - // You may use load_a instead of load if you are certain that p points to an address - // divisible by 32 - Vec4d & load_a(double const * p) { - ymm = _mm256_load_pd(p); - return *this; - } - // Member function to store into array (unaligned) - void store(double * p) const { - _mm256_storeu_pd(p, ymm); - } - // Member function to store into array, aligned by 32 - // You may use store_a instead of store if you are certain that p points to an address - // divisible by 32 - void store_a(double * p) const { - _mm256_store_pd(p, ymm); - } - // Partial load. Load n elements and set the rest to 0 - Vec4d & load_partial(int n, double const * p) { - if (n > 0 && n <= 2) { - *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd()); - } - else if (n > 2 && n <= 4) { - *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2)); - } - else { - ymm = _mm256_setzero_pd(); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, double * p) const { - if (n <= 2) { - get_low().store_partial(n, p); - } - else if (n <= 4) { - get_low().store(p); - get_high().store_partial(n - 2, p + 2); - } - } - // cut off vector to n elements. The last 4-n elements are set to zero - Vec4d & cutoff(int n) { - ymm = _mm256_castps_pd(Vec8f(_mm256_castpd_ps(ymm)).cutoff(n*2)); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. 
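// (A sketch of the trade-off, using only the Vec4d members defined in this class;
//  the local buffer below is purely illustrative:
//    double buf[4] = { 1.0, 2.0, 3.0, 4.0 };
//    Vec4d v;  v.load(buf);   // one unaligned load fills all four elements
//    v.insert(2, 9.0);        // broadcast + blend, acceptable for a single element)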
Use load function if changing more than one element - Vec4d const & insert(uint32_t index, double value) { - __m256d v0 = _mm256_broadcast_sd(&value); - switch (index) { - case 0: - ymm = _mm256_blend_pd (ymm, v0, 1); break; - case 1: - ymm = _mm256_blend_pd (ymm, v0, 2); break; - case 2: - ymm = _mm256_blend_pd (ymm, v0, 4); break; - default: - ymm = _mm256_blend_pd (ymm, v0, 8); break; - } - return *this; - } - // Member function extract a single element from vector - double extract(uint32_t index) const { - double x[4]; - store(x); - return x[index & 3]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - double operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec2d: - Vec2d get_low() const { - return _mm256_castpd256_pd128(ymm); - } - Vec2d get_high() const { - return _mm256_extractf128_pd(ymm,1); - } - static int size () { - return 4; - } -}; - - - -/***************************************************************************** -* -* Operators for Vec4d -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) { - return _mm256_add_pd(a, b); -} - -// vector operator + : add vector and scalar -static inline Vec4d operator + (Vec4d const & a, double b) { - return a + Vec4d(b); -} -static inline Vec4d operator + (double a, Vec4d const & b) { - return Vec4d(a) + b; -} - -// vector operator += : add -static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec4d operator ++ (Vec4d & a, int) { - Vec4d a0 = a; - a = a + 1.0; - return a0; -} - -// prefix operator ++ -static inline Vec4d & operator ++ (Vec4d & a) { - a = a + 1.0; - return a; -} - -// vector operator - : subtract element by element -static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) { - return _mm256_sub_pd(a, b); -} - -// vector operator - : subtract vector and scalar -static inline Vec4d operator - (Vec4d const & a, double b) { - return a - Vec4d(b); -} -static inline Vec4d operator - (double a, Vec4d const & b) { - return Vec4d(a) - b; -} - -// vector operator - : unary minus -// Change sign bit, even for 0, INF and NAN -static inline Vec4d operator - (Vec4d const & a) { - return _mm256_xor_pd(a, _mm256_castps_pd(constant8f<0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000> ())); -} - -// vector operator -= : subtract -static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec4d operator -- (Vec4d & a, int) { - Vec4d a0 = a; - a = a - 1.0; - return a0; -} - -// prefix operator -- -static inline Vec4d & operator -- (Vec4d & a) { - a = a - 1.0; - return a; -} - -// vector operator * : multiply element by element -static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) { - return _mm256_mul_pd(a, b); -} - -// vector operator * : multiply vector and scalar -static inline Vec4d operator * (Vec4d const & a, double b) { - return a * Vec4d(b); -} -static inline Vec4d operator * (double a, Vec4d const & b) { - return Vec4d(a) * b; -} - -// vector operator *= : multiply -static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -static inline Vec4d 
operator / (Vec4d const & a, Vec4d const & b) { - return _mm256_div_pd(a, b); -} - -// vector operator / : divide vector and scalar -static inline Vec4d operator / (Vec4d const & a, double b) { - return a / Vec4d(b); -} -static inline Vec4d operator / (double a, Vec4d const & b) { - return Vec4d(a) / b; -} - -// vector operator /= : divide -static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) { - a = a / b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) { - return _mm256_cmp_pd(a, b, 0); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) { - return _mm256_cmp_pd(a, b, 4); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) { - return _mm256_cmp_pd(a, b, 1); -} - -// vector operator <= : returns true for elements for which a <= b -static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) { - return _mm256_cmp_pd(a, b, 2); -} - -// vector operator > : returns true for elements for which a > b -static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b -static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) { - return b <= a; -} - -// Bitwise logical operators - -// vector operator & : bitwise and -static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) { - return _mm256_and_pd(a, b); -} - -// vector operator &= : bitwise and -static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) { - a = a & b; - return a; -} - -// vector operator & : bitwise and of Vec4d and Vec4db -static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) { - return _mm256_and_pd(a, b); -} -static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) { - return _mm256_and_pd(a, b); -} - -// vector operator | : bitwise or -static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) { - return _mm256_or_pd(a, b); -} - -// vector operator |= : bitwise or -static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) { - return _mm256_xor_pd(a, b); -} - -// vector operator ^= : bitwise xor -static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) { - a = a ^ b; - return a; -} - -// vector operator ! : logical not. Returns Boolean vector -static inline Vec4db operator ! (Vec4d const & a) { - return a == Vec4d(0.0); -} - - -/***************************************************************************** -* -* Functions for Vec4d -* -*****************************************************************************/ - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). -// No other values are allowed. -static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) { - return _mm256_blendv_pd(b, a, s); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) { - return a + (Vec4d(f) & b); -} - -// Conditional multiply: For all vector elements i: result[i] = f[i] ? 
(a[i] * b[i]) : a[i] -static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) { - return a * select(f, b, 1.); -} - - -// General arithmetic functions, etc. - -// Horizontal add: Calculates the sum of all vector elements. -static inline double horizontal_add (Vec4d const & a) { - __m256d t1 = _mm256_hadd_pd(a,a); - __m128d t2 = _mm256_extractf128_pd(t1,1); - __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2); - return _mm_cvtsd_f64(t3); -} - -// function max: a > b ? a : b -static inline Vec4d max(Vec4d const & a, Vec4d const & b) { - return _mm256_max_pd(a,b); -} - -// function min: a < b ? a : b -static inline Vec4d min(Vec4d const & a, Vec4d const & b) { - return _mm256_min_pd(a,b); -} - -// function abs: absolute value -// Removes sign bit, even for -0.0f, -INF and -NAN -static inline Vec4d abs(Vec4d const & a) { - __m256d mask = _mm256_castps_pd(constant8f<-1,0x7FFFFFFF,-1,0x7FFFFFFF,-1,0x7FFFFFFF,-1,0x7FFFFFFF> ()); - return _mm256_and_pd(a,mask); -} - -// function sqrt: square root -static inline Vec4d sqrt(Vec4d const & a) { - return _mm256_sqrt_pd(a); -} - -// function square: a * a -static inline Vec4d square(Vec4d const & a) { - return a * a; -} - -// pow(Vec4d, int): -template static Vec4d pow(Vec4d const & a, TT const & n); - -// Raise floating point numbers to integer power n -template <> -inline Vec4d pow(Vec4d const & x0, int const & n) { - return pow_template_i(x0, n); -} - -// allow conversion from unsigned int -template <> -inline Vec4d pow(Vec4d const & x0, uint32_t const & n) { - return pow_template_i(x0, (int)n); -} - - -// Raise floating point numbers to integer power n, where n is a compile-time constant -template -static inline Vec4d pow_n(Vec4d const & a) { - if (n < 0) return Vec4d(1.0) / pow_n<-n>(a); - if (n == 0) return Vec4d(1.0); - if (n >= 256) return pow(a, n); - Vec4d x = a; // a^(2^i) - Vec4d y; // accumulator - const int lowest = n - (n & (n-1));// lowest set bit in n - if (n & 1) y = x; - if (n < 2) return y; - x = x*x; // x^2 - if (n & 2) { - if (lowest == 2) y = x; else y *= x; - } - if (n < 4) return y; - x = x*x; // x^4 - if (n & 4) { - if (lowest == 4) y = x; else y *= x; - } - if (n < 8) return y; - x = x*x; // x^8 - if (n & 8) { - if (lowest == 8) y = x; else y *= x; - } - if (n < 16) return y; - x = x*x; // x^16 - if (n & 16) { - if (lowest == 16) y = x; else y *= x; - } - if (n < 32) return y; - x = x*x; // x^32 - if (n & 32) { - if (lowest == 32) y = x; else y *= x; - } - if (n < 64) return y; - x = x*x; // x^64 - if (n & 64) { - if (lowest == 64) y = x; else y *= x; - } - if (n < 128) return y; - x = x*x; // x^128 - if (n & 128) { - if (lowest == 128) y = x; else y *= x; - } - return y; -} - -template -static inline Vec4d pow(Vec4d const & a, Const_int_t) { - return pow_n(a); -} - - -// function round: round to nearest integer (even). (result as double vector) -static inline Vec4d round(Vec4d const & a) { - return _mm256_round_pd(a, 0+8); -} - -// function truncate: round towards zero. (result as double vector) -static inline Vec4d truncate(Vec4d const & a) { - return _mm256_round_pd(a, 3+8); -} - -// function floor: round towards minus infinity. (result as double vector) -static inline Vec4d floor(Vec4d const & a) { - return _mm256_round_pd(a, 1+8); -} - -// function ceil: round towards plus infinity. (result as double vector) -static inline Vec4d ceil(Vec4d const & a) { - return _mm256_round_pd(a, 2+8); -} - -// function round_to_int: round to nearest integer (even). 
(result as integer vector) -static inline Vec4i round_to_int(Vec4d const & a) { - // Note: assume MXCSR control register is set to rounding - return _mm256_cvtpd_epi32(a); -} - -// function truncate_to_int: round towards zero. (result as integer vector) -static inline Vec4i truncate_to_int(Vec4d const & a) { - return _mm256_cvttpd_epi32(a); -} - -#ifdef VECTORI256_H // 256 bit integer vectors are available - -// function truncate_to_int64: round towards zero. (inefficient) -static inline Vec4q truncate_to_int64(Vec4d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - //return _mm256_maskz_cvttpd_epi64( __mmask8(0xFF), a); - return _mm256_cvttpd_epi64(a); -#else - double aa[4]; - a.store(aa); - return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3])); -#endif -} - -// function truncate_to_int64_limited: round towards zero. -// result as 64-bit integer vector, but with limited range. Deprecated! -static inline Vec4q truncate_to_int64_limited(Vec4d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return truncate_to_int64(a); -#elif VECTORI256_H > 1 - // Note: assume MXCSR control register is set to rounding - Vec2q b = _mm256_cvttpd_epi32(a); // round to 32-bit integers - __m256i c = permute4q<0,-256,1,-256>(Vec4q(b,b)); // get bits 64-127 to position 128-191 - __m256i s = _mm256_srai_epi32(c, 31); // sign extension bits - return _mm256_unpacklo_epi32(c, s); // interleave with sign extensions -#else - return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high())); -#endif -} - -// function round_to_int64: round to nearest or even. (inefficient) -static inline Vec4q round_to_int64(Vec4d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return _mm256_cvtpd_epi64(a); -#else - return truncate_to_int64(round(a)); -#endif -} - -// function round_to_int64_limited: round to nearest integer (even) -// result as 64-bit integer vector, but with limited range. Deprecated! -static inline Vec4q round_to_int64_limited(Vec4d const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return round_to_int64(a); -#elif VECTORI256_H > 1 - // Note: assume MXCSR control register is set to rounding - Vec2q b = _mm256_cvtpd_epi32(a); // round to 32-bit integers - __m256i c = permute4q<0,-256,1,-256>(Vec4q(b,b)); // get bits 64-127 to position 128-191 - __m256i s = _mm256_srai_epi32(c, 31); // sign extension bits - return _mm256_unpacklo_epi32(c, s); // interleave with sign extensions -#else - return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high())); -#endif -} - -// function to_double: convert integer vector elements to double vector (inefficient) -static inline Vec4d to_double(Vec4q const & a) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return _mm256_maskz_cvtepi64_pd( __mmask16(0xFF), a); -#else - int64_t aa[4]; - a.store(aa); - return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3])); -#endif -} - -// function to_double_limited: convert integer vector elements to double vector -// limited to abs(x) < 2^31. Deprecated! 
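// A minimal, self-contained sketch of the conversions above (the function name below
// is made up for illustration and is not part of the original header; it assumes the
// default MXCSR round-to-nearest mode, as noted in the comments above):
static inline Vec4d conversion_sketch() {
    Vec4d d(1.4, 1.5, 2.5, -1.7);
    Vec4i r = round_to_int(d);      // nearest-even rounding -> ( 1, 2, 2, -2)
    Vec4i t = truncate_to_int(d);   // rounds toward zero    -> ( 1, 1, 2, -1)
    Vec4q q = truncate_to_int64(d); // 64-bit variant; slow without AVX512DQ+VL
    (void)r; (void)t;
    return to_double(q);            // back to double: (1.0, 1.0, 2.0, -1.0)
}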
-static inline Vec4d to_double_limited(Vec4q const & x) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return to_double(x); -#else - Vec8i compressed = permute8i<0,2,4,6,-256,-256,-256,-256>(Vec8i(x)); - return _mm256_cvtepi32_pd(compressed.get_low()); // AVX -#endif -} - -#endif // VECTORI256_H - -// function to_double: convert integer vector to double vector -static inline Vec4d to_double(Vec4i const & a) { - return _mm256_cvtepi32_pd(a); -} - -// function compress: convert two Vec4d to one Vec8f -static inline Vec8f compress (Vec4d const & low, Vec4d const & high) { - __m128 t1 = _mm256_cvtpd_ps(low); - __m128 t2 = _mm256_cvtpd_ps(high); - return Vec8f(t1, t2); -} - -// Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d -static inline Vec4d extend_low(Vec8f const & a) { - return _mm256_cvtps_pd(_mm256_castps256_ps128(a)); -} - -// Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d -static inline Vec4d extend_high (Vec8f const & a) { - return _mm256_cvtps_pd(_mm256_extractf128_ps(a,1)); -} - -// Fused multiply and add functions - -// Multiply and add -static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { -#ifdef __FMA__ - return _mm256_fmadd_pd(a, b, c); -#elif defined (__FMA4__) - return _mm256_macc_pd(a, b, c); -#else - return a * b + c; -#endif - -} - - -// Multiply and subtract -static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) { -#ifdef __FMA__ - return _mm256_fmsub_pd(a, b, c); -#elif defined (__FMA4__) - return _mm256_msub_pd(a, b, c); -#else - return a * b - c; -#endif - -} - -// Multiply and inverse subtract -static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) { -#ifdef __FMA__ - return _mm256_fnmadd_pd(a, b, c); -#elif defined (__FMA4__) - return _mm256_nmacc_pd(a, b, c); -#else - return c - a * b; -#endif -} - -// Multiply and subtract with extra precision on the intermediate calculations, -// even if FMA instructions not supported, using Veltkamp-Dekker split -static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) { -#ifdef __FMA__ - return _mm256_fmsub_pd(a, b, c); -#elif defined (__FMA4__) - return _mm256_msub_pd(a, b, c); -#else - // calculate a * b - c with extra precision - // mask to remove lower 27 bits - Vec4d upper_mask = _mm256_castps_pd(constant8f<(int)0xF8000000,-1,(int)0xF8000000,-1,(int)0xF8000000,-1,(int)0xF8000000,-1>()); - Vec4d a_high = a & upper_mask; // split into high and low parts - Vec4d b_high = b & upper_mask; - Vec4d a_low = a - a_high; - Vec4d b_low = b - b_high; - Vec4d r1 = a_high * b_high; // this product is exact - Vec4d r2 = r1 - c; // subtract c from high product - Vec4d r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product - return r3; // + ((r2 - r1) + c); -#endif -} - - -// Math functions using fast bit manipulation - -#ifdef VECTORI256_H // 256 bit integer vectors are available -// Extract the exponent as an integer -// exponent(a) = floor(log2(abs(a))); -// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024 -static inline Vec4q exponent(Vec4d const & a) { -#if VECTORI256_H > 1 // AVX2 - Vec4uq t1 = _mm256_castpd_si256(a);// reinterpret as 64-bit integer - Vec4uq t2 = t1 << 1; // shift out sign bit - Vec4uq t3 = t2 >> 53; // shift down logical to position 0 - Vec4q t4 = Vec4q(t3) - 0x3FF; // subtract bias from exponent - return t4; -#else - return Vec4q(exponent(a.get_low()), exponent(a.get_high())); -#endif -} - -// 
Extract the fraction part of a floating point number -// a = 2^exponent(a) * fraction(a), except for a = 0 -// fraction(1.0) = 1.0, fraction(5.0) = 1.25 -static inline Vec4d fraction(Vec4d const & a) { -#if VECTORI256_H > 1 // AVX2 - Vec4uq t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer - Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFF) | 0x3FF0000000000000); // set exponent to 0 + bias - return _mm256_castsi256_pd(t2); -#else - return Vec4d(fraction(a.get_low()), fraction(a.get_high())); -#endif -} - -// Fast calculation of pow(2,n) with n integer -// n = 0 gives 1.0 -// n >= 1024 gives +INF -// n <= -1023 gives 0.0 -// This function will never produce denormals, and never raise exceptions -static inline Vec4d exp2(Vec4q const & n) { -#if VECTORI256_H > 1 // AVX2 - Vec4q t1 = max(n, -0x3FF); // limit to allowed range - Vec4q t2 = min(t1, 0x400); - Vec4q t3 = t2 + 0x3FF; // add bias - Vec4q t4 = t3 << 52; // put exponent into position 52 - return _mm256_castsi256_pd(t4); // reinterpret as double -#else - return Vec4d(exp2(n.get_low()), exp2(n.get_high())); -#endif -} -//static inline Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h -#endif - - -// Categorization functions - -// Function sign_bit: gives true for elements that have the sign bit set -// even for -0.0, -INF and -NAN -// Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false -static inline Vec4db sign_bit(Vec4d const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer - Vec4q t2 = t1 >> 63; // extend sign bit - return _mm256_castsi256_pd(t2); // reinterpret as 64-bit Boolean -#else - return Vec4db(sign_bit(a.get_low()),sign_bit(a.get_high())); -#endif -} - -// Function sign_combine: changes the sign of a when b has the sign bit set -// same as select(sign_bit(b), -a, a) -static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) { - Vec4d signmask = _mm256_castps_pd(constant8f<0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000>()); // -0.0 - return a ^ (b & signmask); -} - -// Function is_finite: gives true for elements that are normal, denormal or zero, -// false for INF and NAN -static inline Vec4db is_finite(Vec4d const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer - Vec4q t2 = t1 << 1; // shift out sign bit - Vec4q t3 = 0xFFE0000000000000; // exponent mask - Vec4qb t4 = Vec4q(t2 & t3) != t3; // exponent field is not all 1s - return t4; -#else - return Vec4db(is_finite(a.get_low()),is_finite(a.get_high())); -#endif -} - -// Function is_inf: gives true for elements that are +INF or -INF -// false for finite numbers and NAN -static inline Vec4db is_inf(Vec4d const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer - Vec4q t2 = t1 << 1; // shift out sign bit - return t2 == 0xFFE0000000000000; // exponent is all 1s, fraction is 0 -#else - return Vec4db(is_inf(a.get_low()),is_inf(a.get_high())); -#endif -} - -// Function is_nan: gives true for elements that are +NAN or -NAN -// false for finite numbers and +/-INF -static inline Vec4db is_nan(Vec4d const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec4q t1 = 
_mm256_castpd_si256(a); // reinterpret as 64-bit integer - Vec4q t2 = t1 << 1; // shift out sign bit - Vec4q t3 = 0xFFE0000000000000; // exponent mask - Vec4q t4 = t2 & t3; // exponent - Vec4q t5 = _mm256_andnot_si256(t3,t2);// fraction - return Vec4qb(t4 == t3 && t5 != 0);// exponent = all 1s and fraction != 0 -#else - return Vec4db(is_nan(a.get_low()),is_nan(a.get_high())); -#endif -} - -// Function is_subnormal: gives true for elements that are denormal (subnormal) -// false for finite numbers, zero, NAN and INF -static inline Vec4db is_subnormal(Vec4d const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer - Vec4q t2 = t1 << 1; // shift out sign bit - Vec4q t3 = 0xFFE0000000000000; // exponent mask - Vec4q t4 = t2 & t3; // exponent - Vec4q t5 = _mm256_andnot_si256(t3,t2);// fraction - return Vec4qb(t4 == 0 && t5 != 0); // exponent = 0 and fraction != 0 -#else - return Vec4db(is_subnormal(a.get_low()),is_subnormal(a.get_high())); -#endif -} - -// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal) -// false for finite numbers, NAN and INF -static inline Vec4db is_zero_or_subnormal(Vec4d const & a) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - Vec4q t = _mm256_castpd_si256(a); // reinterpret as 32-bit integer - t &= 0x7FF0000000000000ll; // isolate exponent - return t == 0; // exponent = 0 -#else - return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high())); -#endif -} - -// Function infinite2d: returns a vector where all elements are +INF -static inline Vec4d infinite4d() { - return _mm256_castps_pd(constant8f<0,0x7FF00000,0,0x7FF00000,0,0x7FF00000,0,0x7FF00000>()); -} - -// Function nan4d: returns a vector where all elements are +NAN (quiet) -static inline Vec4d nan4d(int n = 0x10) { -#if defined (VECTORI256_H) && VECTORI256_H > 1 // 256 bit integer vectors are available, AVX2 - return _mm256_castsi256_pd(Vec4q(0x7FF8000000000000 + n)); -#else - return Vec4d(nan2d(n),nan2d(n)); -#endif -} - -// change signs on vectors Vec4d -// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change -template -static inline Vec4d change_sign(Vec4d const & a) { - if ((i0 | i1 | i2 | i3) == 0) return a; - __m256 mask = constant8f<0, i0 ? (int)0x80000000 : 0, 0, i1 ? (int)0x80000000 : 0, 0, i2 ? (int)0x80000000 : 0, 0, i3 ? (int)0x80000000 : 0> (); - return _mm256_xor_pd(a, _mm256_castps_pd(mask)); -} - - -/***************************************************************************** -* -* Functions for reinterpretation between vector types -* -*****************************************************************************/ - -#if defined (VECTORI256_H) && VECTORI256_H >= 2 -// AVX2 vectors defined - - -// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors. 
-// It is recommended to compile with -fabi-version=0 to get the latest abi version -#if !defined (GCC_VERSION) || (defined (__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004) -static inline __m256i reinterpret_i (__m256i const & x) { - return x; -} - -static inline __m256i reinterpret_i (__m256 const & x) { - return _mm256_castps_si256(x); -} - -static inline __m256i reinterpret_i (__m256d const & x) { - return _mm256_castpd_si256(x); -} - -static inline __m256 reinterpret_f (__m256i const & x) { - return _mm256_castsi256_ps(x); -} - -static inline __m256 reinterpret_f (__m256 const & x) { - return x; -} - -static inline __m256 reinterpret_f (__m256d const & x) { - return _mm256_castpd_ps(x); -} - -static inline __m256d reinterpret_d (__m256i const & x) { - return _mm256_castsi256_pd(x); -} - -static inline __m256d reinterpret_d (__m256 const & x) { - return _mm256_castps_pd(x); -} - -static inline __m256d reinterpret_d (__m256d const & x) { - return x; -} - -#else // __GXX_ABI_VERSION < 1004 - -static inline __m256i reinterpret_i (Vec32c const & x) { - return x; -} - -static inline __m256i reinterpret_i (Vec16s const & x) { - return x; -} - -static inline __m256i reinterpret_i (Vec8i const & x) { - return x; -} - -static inline __m256i reinterpret_i (Vec4q const & x) { - return x; -} - -static inline __m256i reinterpret_i (Vec8f const & x) { - return _mm256_castps_si256(x); -} - -static inline __m256i reinterpret_i (Vec4d const & x) { - return _mm256_castpd_si256(x); -} - -static inline __m256 reinterpret_f (Vec32c const & x) { - return _mm256_castsi256_ps(x); -} - -static inline __m256 reinterpret_f (Vec16s const & x) { - return _mm256_castsi256_ps(x); -} - -static inline __m256 reinterpret_f (Vec8i const & x) { - return _mm256_castsi256_ps(x); -} - -static inline __m256 reinterpret_f (Vec4q const & x) { - return _mm256_castsi256_ps(x); -} - -static inline __m256 reinterpret_f (Vec8f const & x) { - return x; -} - -static inline __m256 reinterpret_f (Vec4d const & x) { - return _mm256_castpd_ps(x); -} - -static inline __m256d reinterpret_d (Vec32c const & x) { - return _mm256_castsi256_pd(x); -} - -static inline __m256d reinterpret_d (Vec16s const & x) { - return _mm256_castsi256_pd(x); -} - -static inline __m256d reinterpret_d (Vec8i const & x) { - return _mm256_castsi256_pd(x); -} - -static inline __m256d reinterpret_d (Vec4q const & x) { - return _mm256_castsi256_pd(x); -} - -static inline __m256d reinterpret_d (Vec8f const & x) { - return _mm256_castps_pd(x); -} - -static inline __m256d reinterpret_d (Vec4d const & x) { - return x; -} - -#endif // __GXX_ABI_VERSION - -#else -// AVX2 emulated in vectori256e.h, AVX supported - -// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors. 
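// Usage sketch for the reinterpret_* helpers (pure bit-pattern casts, no value
// conversion; the exact parameter and return types depend on which branch of this
// #if block is compiled in):
//   Vec8f f(1.0f);
//   auto  bits = reinterpret_i(f);    // each 1.0f element viewed as 0x3F800000
//   auto  back = reinterpret_f(bits); // the same bits viewed as floats again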
-// It is recommended to compile with -fabi-version=0 to get the latest abi version -#if !defined (GCC_VERSION) || (defined (__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004) - -static inline Vec256ie reinterpret_i (__m256 const & x) { - Vec8f xx(x); - return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high())); -} - -static inline Vec256ie reinterpret_i (__m256d const & x) { - Vec4d xx(x); - return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high())); -} - -static inline __m256 reinterpret_f (__m256 const & x) { - return x; -} - -static inline __m256 reinterpret_f (__m256d const & x) { - return _mm256_castpd_ps(x); -} - -static inline __m256d reinterpret_d (__m256 const & x) { - return _mm256_castps_pd(x); -} - -static inline __m256d reinterpret_d (__m256d const & x) { - return x; -} - -#else // __GXX_ABI_VERSION < 1004 - -static inline Vec256ie reinterpret_i (Vec8f const & x) { - Vec8f xx(x); - return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high())); -} - -static inline Vec256ie reinterpret_i (Vec4d const & x) { - Vec4d xx(x); - return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high())); -} - -static inline __m256 reinterpret_f (Vec8f const & x) { - return x; -} - -static inline __m256 reinterpret_f (Vec4d const & x) { - return _mm256_castpd_ps(x); -} - -static inline __m256d reinterpret_d (Vec8f const & x) { - return _mm256_castps_pd(x); -} - -static inline __m256d reinterpret_d (Vec4d const & x) { - return x; -} - -#endif // __GXX_ABI_VERSION - -static inline Vec256ie reinterpret_i (Vec256ie const & x) { - return x; -} - -static inline __m256 reinterpret_f (Vec256ie const & x) { - return Vec8f(Vec4f(reinterpret_f(x.get_low())), Vec4f(reinterpret_f(x.get_high()))); -} - -static inline __m256d reinterpret_d (Vec256ie const & x) { - return Vec4d(Vec2d(reinterpret_d(x.get_low())), Vec2d(reinterpret_d(x.get_high()))); -} - -#endif // VECTORI256_H - - -/***************************************************************************** -* -* Vector permute and blend functions -* -****************************************************************************** -* -* The permute function can reorder the elements of a vector and optionally -* set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select. An index of -1 will generate zero. An index of -256 means don't care. -* -* Example: -* Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13) -* Vec4d b; -* b = permute4d<1,0,-1,3>(a); // b is (11, 10, 0, 13) -* -* -* The blend function can mix elements from two different vectors and -* optionally set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select, where indexes 0 - 3 indicate an element from the first source -* vector and indexes 4 - 7 indicate an element from the second source vector. -* A negative index will generate zero. -* -* -* Example: -* Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13) -* Vec4d b(20., 21., 22., 23.); // a is (20, 21, 22, 23) -* Vec4d c; -* c = blend4d<4,3,7,-1> (a,b); // c is (20, 13, 23, 0) -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. 
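* A further usage sketch, following the index conventions described above
* (indexes are template arguments; -1 zeroes an element, -256 means don't care):
* Vec8f x(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f);
* Vec8f r = permute8f<7,6,5,4,-1,-1,-256,-256>(x); // r is (7, 6, 5, 4, 0, 0, ?, ?)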
The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. -*****************************************************************************/ - -// permute vector Vec4d -template -static inline Vec4d permute4d(Vec4d const & a) { - - const int ior = i0 | i1 | i2 | i3; // OR indexes - - // is zeroing needed - const bool do_zero = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100 - - // is shuffling needed - const bool do_shuffle = (i0>0) || (i1!=1 && i1>=0) || (i2!=2 && i2>=0) || (i3!=3 && i3>=0); - - if (!do_shuffle) { // no shuffling needed - if (do_zero) { // zeroing - if ((i0 & i1 & i2 & i3) < 0) { - return _mm256_setzero_pd(); // zero everything - } - // zero some elements - __m256d const mask = _mm256_castps_pd ( - constant8f< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0), -int(i2>=0), -int(i2>=0), -int(i3>=0), -int(i3>=0) > ()); - return _mm256_and_pd(a, mask); // zero with AND mask - } - else { - return a; // do nothing - } - } -#if INSTRSET >= 8 // AVX2: use VPERMPD - __m256d x = _mm256_permute4x64_pd(a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6); - if (do_zero) { // zeroing - // zero some elements - __m256d const mask2 = _mm256_castps_pd ( - constant8f< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0), -int(i2>=0), -int(i2>=0), -int(i3>=0), -int(i3>=0) > ()); - x = _mm256_and_pd(x, mask2); // zero with AND mask - } - return x; -#else // AVX - - // Needed contents of low/high part of each source register in VSHUFPD - // 0: a.low, 1: a.high, 3: zero - const int s1 = (i0 < 0 ? 3 : (i0 & 2) >> 1) | (i2 < 0 ? 0x30 : (i2 & 2) << 3); - const int s2 = (i1 < 0 ? 3 : (i1 & 2) >> 1) | (i3 < 0 ? 0x30 : (i3 & 2) << 3); - // permute mask - const int sm = (i0 < 0 ? 0 : (i0 & 1)) | (i1 < 0 ? 1 : (i1 & 1)) << 1 | (i2 < 0 ? 0 : (i2 & 1)) << 2 | (i3 < 0 ? 1 : (i3 & 1)) << 3; - - if (s1 == 0x01 || s1 == 0x11 || s2 == 0x01 || s2 == 0x11) { - // too expensive to use 256 bit permute, split into two 128 bit permutes - Vec2d alo = a.get_low(); - Vec2d ahi = a.get_high(); - Vec2d rlo = blend2d (alo, ahi); - Vec2d rhi = blend2d (alo, ahi); - return Vec4d(rlo, rhi); - } - - // make operands for VSHUFPD - __m256d r1, r2; - - switch (s1) { - case 0x00: // LL - r1 = _mm256_insertf128_pd(a,_mm256_castpd256_pd128(a),1); break; - case 0x03: // LZ - r1 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1); - break; - case 0x10: // LH - r1 = a; break; - case 0x13: // ZH - r1 = do_zero ? _mm256_and_pd(a, _mm256_castps_pd(constant8f<0,0,0,0,-1,-1,-1,-1>())) : __m256d(a); break; - case 0x30: // LZ - if (do_zero) { - __m128d t = _mm256_castpd256_pd128(a); - t = _mm_and_pd(t,t); - r1 = _mm256_castpd128_pd256(t); - } - else r1 = a; - break; - case 0x31: // HZ - r1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(a,1)); break; - case 0x33: // ZZ - r1 = do_zero ? _mm256_setzero_pd() : __m256d(a); break; - default:; // Not needed. Avoid warning in Clang - } - - if (s2 == s1) { - if (sm == 0x0A) return r1; - r2 = r1; - } - else { - switch (s2) { - case 0x00: // LL - r2 = _mm256_insertf128_pd(a,_mm256_castpd256_pd128(a),1); break; - case 0x03: // ZL - r2 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1); - break; - case 0x10: // LH - r2 = a; break; - case 0x13: // ZH - r2 = do_zero ? 
_mm256_and_pd(a,_mm256_castps_pd(constant8f<0,0,0,0,-1,-1,-1,-1>())) : __m256d(a); break; - case 0x30: // LZ - if (do_zero) { - __m128d t = _mm256_castpd256_pd128(a); - t = _mm_and_pd(t,t); - r2 = _mm256_castpd128_pd256(t); - } - else r2 = a; - break; - case 0x31: // HZ - r2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(a,1)); break; - case 0x33: // ZZ - r2 = do_zero ? _mm256_setzero_pd() : __m256d(a); break; - default:; // Not needed. Avoid warning in Clang - } - } - return _mm256_shuffle_pd(r1, r2, sm); - -#endif // INSTRSET >= 8 -} - - -// blend vectors Vec4d -template -static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) { - - // Combine all the indexes into a single bitfield, with 8 bits for each - const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24; - - // Mask to zero out negative indexes - const uint32_t mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24; - - if (mz == 0) return _mm256_setzero_pd(); // all zero - - __m256d t1; - if ((((m1 & 0xFEFEFEFE) ^ 0x06020400) & mz) == 0) { - // fits VSHUFPD(a,b) - t1 = _mm256_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3); - if (mz == 0xFFFFFFFF) return t1; - return permute4d (t1); - } - if ((((m1 & 0xFEFEFEFE) ^0x02060004) & mz) == 0) { - // fits VSHUFPD(b,a) - t1 = _mm256_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3); - if (mz == 0xFFFFFFFF) return t1; - return permute4d (t1); - } - if ((((m1 & 0x03030303) ^ 0x03020100) & mz) == 0) { - // blend and zero, no permute - if ((m1 & 0x04040404 & mz) == 0) { - t1 = a; - } - else if (((m1 ^ 0x04040404) & 0x04040404 & mz) == 0) { - t1 = b; - } - else { - t1 = _mm256_blend_pd(a, b, (i0&4)>>2 | (i1&4)>>1 | (i2&4) | (i3&4) << 1); - } - if (mz == 0xFFFFFFFF) return t1; - return permute4d (t1); - } - if ((m1 & 0x04040404 & mz) == 0) { - // all from a - return permute4d (a); - } - if (((m1 ^ 0x04040404) & 0x04040404 & mz) == 0) { - // all from b - return permute4d (b); - } - // check if we can do 128-bit blend/permute - if (((m1 ^ 0x01000100) & 0x01010101 & mz) == 0) { - const uint32_t j0 = uint32_t((i0 >= 0 ? i0 : i1 >= 0 ? i1 : -1) >> 1); - const uint32_t j1 = uint32_t((i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 1); - if (((m1 ^ ((j0 & 3) * 0x00000202 | (j1 & 3) * 0x02020000)) & 0x06060606 & mz) == 0) { - t1 = _mm256_permute2f128_pd(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4); - const bool partialzero = (((i0 | i1) ^ j0) & 0x80) != 0 || (((i2 | i3) ^ j1) & 0x80) != 0; - if (partialzero) { - // zero some elements - __m256d mask = _mm256_castps_pd (constant8f < - i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, - i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ()); - return _mm256_and_pd(t1, mask); - } - else return t1; - } - } - // general case. combine two permutes - Vec4d a1 = permute4d < - (uint32_t)i0 < 4 ? i0 : -0x100, - (uint32_t)i1 < 4 ? i1 : -0x100, - (uint32_t)i2 < 4 ? i2 : -0x100, - (uint32_t)i3 < 4 ? i3 : -0x100 > (a); - Vec4d b1 = permute4d < - (uint32_t)(i0^4) < 4 ? (i0^4) : -0x100, - (uint32_t)(i1^4) < 4 ? (i1^4) : -0x100, - (uint32_t)(i2^4) < 4 ? (i2^4) : -0x100, - (uint32_t)(i3^4) < 4 ? 
(i3^4) : -0x100 > (b); - t1 = _mm256_blend_pd(a1, b1, (i0&4)>>2 | (i1&4)>>1 | (i2&4) | (i3&4) << 1); - if (mz == 0xFFFFFFFF) return t1; - return permute4d (t1); -} - -/***************************************************************************** -* -* Vector Vec8f permute and blend functions -* -*****************************************************************************/ - -// permute vector Vec8f -template -static inline Vec8f permute8f(Vec8f const & a) { - - __m256 t1, mask; - - const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7; // OR indexes - - // is zeroing needed - const bool do_zero = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100 - - // is shuffling needed - const bool do_shuffle = (i0>0) || (i1!=1 && i1>=0) || (i2!=2 && i2>=0) || (i3!=3 && i3>=0) || - (i4!=4 && i4>=0) || (i5!=5 && i5>=0) || (i6!=6 && i6>=0) || (i7!=7 && i7>=0); - - if (!do_shuffle) { // no shuffling needed - if (do_zero) { // zeroing - if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) { - return _mm256_setzero_ps(); // zero everything - } - // zero some elements - mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > (); - return _mm256_and_ps(a, mask); // zero with AND mask - } - else { - return a; // do nothing - } - } - -#if INSTRSET >= 8 // AVX2: use VPERMPS - if (do_shuffle) { // shuffling - mask = constant8f< i0 & 7, i1 & 7, i2 & 7, i3 & 7, i4 & 7, i5 & 7, i6 & 7, i7 & 7 > (); -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0 - t1 = _mm256_permutevar8x32_ps(mask, _mm256_castps_si256(a)); // problem in immintrin.h -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 has wrong parameter type and operands in wrong order. fixed in version 4.7.1 - t1 = _mm256_permutevar8x32_ps(mask, a); -#else // no bug version - t1 = _mm256_permutevar8x32_ps(a, _mm256_castps_si256(mask)); -#endif - } - else { - t1 = a; // no shuffling - } - if (do_zero) { // zeroing - if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) { - return _mm256_setzero_ps(); // zero everything - } - // zero some elements - mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > (); - t1 = _mm256_and_ps(t1, mask); // zero with AND mask - } - return t1; -#else // AVX - - // Combine all the indexes into a single bitfield, with 4 bits for each - const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28; - - // Mask to zero out negative indexes - const int m2 = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - // Check if it is possible to use VSHUFPS. Index n must match index n+4 on bit 0-1, and even index n must match odd index n+1 on bit 2 - const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & m2 & (m2 >> 16)) == 0 && ((m1 ^ (m1 >> 4)) & 0x04040404 & m2 & m2 >> 4) == 0; - - if (sps) { // can use VSHUFPS - - // Index of each pair (i[n],i[n+1]) - const int j0 = i0 >= 0 ? i0 : i1; - const int j1 = i2 >= 0 ? i2 : i3; - const int j2 = i4 >= 0 ? i4 : i5; - const int j3 = i6 >= 0 ? i6 : i7; - - // Index of each pair (i[n],i[n+4]) - const int k0 = i0 >= 0 ? i0 : i4; - const int k1 = i1 >= 0 ? i1 : i5; - const int k2 = i2 >= 0 ? i2 : i6; - const int k3 = i3 >= 0 ? 
i3 : i7; - - // Needed contents of low/high part of each source register in VSHUFPS - // 0: a.low, 1: a.high, 3: zero or don't care - const int s1 = (j0 < 0 ? 3 : (j0 & 4) >> 2) | (j2 < 0 ? 0x30 : (j2 & 4) << 2); - const int s2 = (j1 < 0 ? 3 : (j1 & 4) >> 2) | (j3 < 0 ? 0x30 : (j3 & 4) << 2); - - // calculate cost of using VSHUFPS - const int cost1 = (s1 == 0x01 || s1 == 0x11) ? 2 : (s1 == 0x00 || s1 == 0x03 || s1 == 0x31) ? 1 : 0; - const int cost2 = (s2 == s1) ? 0 : (s2 == 0x01 || s2 == 0x11) ? 2 : (s2 == 0x00 || (s2 == 0x03 && (s1 & 0xF0) != 0x00) || (s2 == 0x31 && (s1 & 0x0F) != 0x01)) ? 1 : 0; - - if (cost1 + cost2 <= 3) { - - // permute mask - const int sm = (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6; - - // make operands for VSHUFPS - __m256 r1, r2; - - switch (s1) { - case 0x00: // LL - case 0x03: // ZL - r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1); break; - case 0x01: // HL - r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1)); - r1 = _mm256_insertf128_ps(r1,_mm256_castps256_ps128(a),1); break; - case 0x10: // LH - case 0x13: // ZH - case 0x30: // LZ - case 0x33: // ZZ - r1 = a; break; - case 0x11: // HH - r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1)); - r1 = _mm256_insertf128_ps(r1,_mm256_castps256_ps128(r1),1); break; - case 0x31: // HZ - r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1)); break; - } - - if (s2 == s1) { - if (sm == 0xE4) return r1; - r2 = r1; - } - else { - switch (s2) { - case 0x00: // LL - r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1); break; - case 0x03: // ZL - if ((s1 & 0xF0) == 0x00) r2 = r1; - else { - r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1); - } - break; - case 0x01: // HL - r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1)); - r2 = _mm256_insertf128_ps(r2,_mm256_castps256_ps128(a),1); break; - case 0x10: // LH - case 0x13: // ZH - case 0x30: // LZ - case 0x33: // ZZ - r2 = a; break; - case 0x11: // HH - r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1)); - r2 = _mm256_insertf128_ps(r2,_mm256_castps256_ps128(r2),1); break; - case 0x31: // HZ - if ((s1 & 0x0F) == 0x01) r2 = r1; - else { - r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1)); - } - break; - } - } - - // now the permute instruction - t1 = _mm256_shuffle_ps(r1, r2, sm); - - if (do_zero) { - // zero some elements - mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > (); - t1 = _mm256_and_ps(t1, mask); // zero with AND mask - } - return t1; - } - } - // not using VSHUFPS. 
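// (Worked example of the sps test above, as a sketch: permute8f<1,0,3,2,5,4,7,6>
//  keeps every element inside its 128-bit lane, sps evaluates true, and a single
//  VSHUFPS with mask 0xB1 covers it; permute8f<0,2,4,6,1,3,5,7> crosses lanes,
//  sps evaluates false, and the split below is used instead.)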
Split into low and high part - Vec4f alo = a.get_low(); - Vec4f ahi = a.get_high(); - Vec4f rlo = blend4f (alo, ahi); - Vec4f rhi = blend4f (alo, ahi); - return Vec8f(rlo, rhi); -#endif -} - - -// blend vectors Vec8f -template -static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) { - - const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7; // OR indexes - - // is zeroing needed - const bool do_zero = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100 - - // Combine all the indexes into a single bitfield, with 4 bits for each - const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; - - // Mask to zero out negative indexes - const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - __m256 t1, mask; - - if (mz == 0) return _mm256_setzero_ps(); // all zero - - if ((m1 & 0x88888888 & mz) == 0) { - // all from a - return permute8f (a); - } - - if (((m1 ^ 0x88888888) & 0x88888888 & mz) == 0) { - // all from b - return permute8f (b); - } - - if ((((m1 & 0x77777777) ^ 0x76543210) & mz) == 0) { - // blend and zero, no permute - mask = constant8f<(i0&8)?0:-1, (i1&8)?0:-1, (i2&8)?0:-1, (i3&8)?0:-1, (i4&8)?0:-1, (i5&8)?0:-1, (i6&8)?0:-1, (i7&8)?0:-1> (); - t1 = select(mask, a, b); - if (!do_zero) return t1; - // zero some elements - mask = constant8f< (i0<0&&(i0&8)) ? 0 : -1, (i1<0&&(i1&8)) ? 0 : -1, (i2<0&&(i2&8)) ? 0 : -1, (i3<0&&(i3&8)) ? 0 : -1, - (i4<0&&(i4&8)) ? 0 : -1, (i5<0&&(i5&8)) ? 0 : -1, (i6<0&&(i6&8)) ? 0 : -1, (i7<0&&(i7&8)) ? 0 : -1 > (); - return _mm256_and_ps(t1, mask); - } - - // check if we can do 128-bit blend/permute - if (((m1 ^ 0x32103210) & 0x33333333 & mz) == 0) { - const uint32_t j0 = (i0 >= 0 ? i0 : i1 >= 0 ? i1 : i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 2; - const uint32_t j1 = (i4 >= 0 ? i4 : i5 >= 0 ? i5 : i6 >= 0 ? i6 : i7 >= 0 ? i7 : -1) >> 2; - if (((m1 ^ ((j0 & 3) * 0x00004444 | (j1 & 3) * 0x44440000)) & 0xCCCCCCCC & mz) == 0) { - t1 = _mm256_permute2f128_ps(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4); - const bool partialzero = (((i0 | i1 | i2 | i3) ^ j0) & 0x80) != 0 || (((i4 | i5 | i6 | i7) ^ j1) & 0x80) != 0; - if (partialzero) { - // zero some elements - mask = constant8f< i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, - i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > (); - return _mm256_and_ps(t1, mask); - } - else return t1; - } - } - // Not checking special cases for vunpckhps, vunpcklps: they are too rare - - // Check if it is possible to use VSHUFPS. - // Index n must match index n+4 on bit 0-1, and even index n must match odd index n+1 on bit 2-3 - const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & mz & (mz >> 16)) == 0 && ((m1 ^ (m1 >> 4)) & 0x0C0C0C0C & mz & mz >> 4) == 0; - - if (sps) { // can use VSHUFPS - - // Index of each pair (i[n],i[n+1]) - const int j0 = i0 >= 0 ? i0 : i1; - const int j1 = i2 >= 0 ? i2 : i3; - const int j2 = i4 >= 0 ? i4 : i5; - const int j3 = i6 >= 0 ? i6 : i7; - - // Index of each pair (i[n],i[n+4]) - const int k0 = i0 >= 0 ? i0 : i4; - const int k1 = i1 >= 0 ? i1 : i5; - const int k2 = i2 >= 0 ? i2 : i6; - const int k3 = i3 >= 0 ? i3 : i7; - - // Needed contents of low/high part of each source register in VSHUFPS - // 0: a.low, 1: a.high, 2: b.low, 3: b.high, 4: zero or don't care - const int s1 = (j0 < 0 ? 4 : (j0 & 0xC) >> 2) | (j2 < 0 ? 0x30 : (j2 & 0xC) << 2); - const int s2 = (j1 < 0 ? 
3 : (j1 & 0xC) >> 2) | (j3 < 0 ? 0x30 : (j3 & 0xC) << 2); - - // permute mask - const int sm = (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6; - - __m256 r1, r2; - __m128 ahi = _mm256_extractf128_ps(a,1); // 1 - __m128 bhi = _mm256_extractf128_ps(b,1); // 3 - - switch (s1) { - case 0x00: case 0x04: - r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1); break; - case 0x01: case 0x41: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(a),1); break; - case 0x02: - r1 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(a),1); break; - case 0x03: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(a),1); break; - case 0x10: case 0x14: case 0x40: case 0x44: - r1 = a; break; - case 0x11: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),ahi,1); break; - case 0x12: - r1 = _mm256_insertf128_ps(b,ahi,1); break; - case 0x13: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),ahi,1); break; - case 0x20: - r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(b),1); break; - case 0x21: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(b),1); break; - case 0x22: case 0x24: case 0x42: - r1 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(b),1); break; - case 0x23: case 0x43: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(b),1); break; - case 0x30: - r1 = _mm256_insertf128_ps(a,bhi,1); break; - case 0x31: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),bhi,1); break; - case 0x32: case 0x34: - r1 = b; break; - case 0x33: - r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),bhi,1); break; - } - if (s2 == s1 || ((s2 & 0x04) && ((s1 ^ s2) & 0xF0) == 0) || ((s2 & 0x40) && ((s1 ^ s2) & 0x0F) == 0)) { - // can use r2 = r1 - if (sm == 0xE4) return r1; // no shuffling needed - r2 = r1; - } - else { - switch (s2) { - case 0x00: case 0x04: - r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1); break; - case 0x01: case 0x41: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(a),1); break; - case 0x02: - r2 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(a),1); break; - case 0x03: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(a),1); break; - case 0x10: case 0x14: case 0x40: case 0x44: - r2 = a; break; - case 0x11: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),ahi,1); break; - case 0x12: - r2 = _mm256_insertf128_ps(b,ahi,1); break; - case 0x13: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),ahi,1); break; - case 0x20: - r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(b),1); break; - case 0x21: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(b),1); break; - case 0x22: case 0x24: case 0x42: - r2 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(b),1); break; - case 0x23: case 0x43: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(b),1); break; - case 0x30: - r2 = _mm256_insertf128_ps(a,bhi,1); break; - case 0x31: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),bhi,1); break; - case 0x32: case 0x34: - r2 = b; break; - case 0x33: - r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),bhi,1); break; - } - } - - // now the shuffle instruction - t1 = _mm256_shuffle_ps(r1, r2, sm); - - if (do_zero) { - // zero some elements - mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > (); - t1 = 
_mm256_and_ps(t1, mask); // zero with AND mask - } - return t1; - } - - // Check if we can use 64-bit blend. Even numbered indexes must be even and odd numbered - // indexes must be equal to the preceding index + 1, except for negative indexes. - if (((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ m1 >> 4) & 0x0E0E0E0E & mz & mz >> 4) == 0) { - - const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7)) < 0; // part of a 64-bit block is zeroed - const int blank1 = partialzero ? -0x100 : -1; // ignore or zero - const int n0 = i0 > 0 ? i0/2 : i1 > 0 ? i1/2 : blank1; // indexes for 64 bit blend - const int n1 = i2 > 0 ? i2/2 : i3 > 0 ? i3/2 : blank1; - const int n2 = i4 > 0 ? i4/2 : i5 > 0 ? i5/2 : blank1; - const int n3 = i6 > 0 ? i6/2 : i7 > 0 ? i7/2 : blank1; - t1 = _mm256_castpd_ps (blend4d (_mm256_castps_pd(a), _mm256_castps_pd(b))); - if (blank1 == -1 || !do_zero) { - return t1; - } - // need more zeroing - mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > (); - return _mm256_and_ps(t1, mask); // zero with AND mask - } - - // general case: permute and blend and possible zero - const int blank2 = do_zero ? -1 : -0x100; // ignore or zero - - Vec8f ta = permute8f < - (uint32_t)i0 < 8 ? i0 : blank2, - (uint32_t)i1 < 8 ? i1 : blank2, - (uint32_t)i2 < 8 ? i2 : blank2, - (uint32_t)i3 < 8 ? i3 : blank2, - (uint32_t)i4 < 8 ? i4 : blank2, - (uint32_t)i5 < 8 ? i5 : blank2, - (uint32_t)i6 < 8 ? i6 : blank2, - (uint32_t)i7 < 8 ? i7 : blank2 > (a); - Vec8f tb = permute8f < - (uint32_t)(i0^8) < 8 ? (i0^8) : blank2, - (uint32_t)(i1^8) < 8 ? (i1^8) : blank2, - (uint32_t)(i2^8) < 8 ? (i2^8) : blank2, - (uint32_t)(i3^8) < 8 ? (i3^8) : blank2, - (uint32_t)(i4^8) < 8 ? (i4^8) : blank2, - (uint32_t)(i5^8) < 8 ? (i5^8) : blank2, - (uint32_t)(i6^8) < 8 ? (i6^8) : blank2, - (uint32_t)(i7^8) < 8 ? (i7^8) : blank2 > (b); - - if (blank2 == -1) { - return _mm256_or_ps(ta, tb); - } - // no zeroing, need to blend - const int maskb = ((i0 >> 3) & 1) | ((i1 >> 2) & 2) | ((i2 >> 1) & 4) | (i3 & 8) | - ((i4 << 1) & 0x10) | ((i5 << 2) & 0x20) | ((i6 << 3) & 0x40) | ((i7 << 4) & 0x80); - return _mm256_blend_ps(ta, tb, maskb); // blend -} - - -/***************************************************************************** -* -* Vector lookup functions -* -****************************************************************************** -* -* These functions use vector elements as indexes into a table. -* The table is given as one or more vectors or as an array. -* -* This can be used for several purposes: -* - table lookup -* - permute or blend with variable indexes -* - blend from more than two sources -* - gather non-contiguous data -* -* An index out of range may produce any value - the actual value produced is -* implementation dependent and may be different for different instruction -* sets. An index out of range does not produce an error message or exception. -* -* Example: -* Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3) -* Vec4f b(1.0f,1.1f,1.2f,1.3f); // table b is (1.0, 1.1, 1.2, 1.3) -* Vec4f c; -* c = lookup4 (a,b); // result c is (1.2, 1.0, 1.0, 1.3) -* -*****************************************************************************/ - -#ifdef VECTORI256_H // Vec8i and Vec4q must be defined - -static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) { -#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2 -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! 
defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0 - return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), _mm256_castps_si256(table)); -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 has wrong parameter type and operands in wrong order. fixed in version 4.7.1 - return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), table); -#else - // no bug version - return _mm256_permutevar8x32_ps(table, index); -#endif - -#else // AVX - // swap low and high part of table - __m256 t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(table, 1)); - __m256 t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(table), 1); - // join index parts - __m256i index2 = _mm256_insertf128_si256(_mm256_castsi128_si256(index.get_low()), index.get_high(), 1); - // permute within each 128-bit part - __m256 r0 = _mm256_permutevar_ps(table, index2); - __m256 r1 = _mm256_permutevar_ps(t2, index2); - // high index bit for blend - __m128i k1 = _mm_slli_epi32(index.get_high() ^ 4, 29); - __m128i k0 = _mm_slli_epi32(index.get_low(), 29); - __m256 kk = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), _mm_castsi128_ps(k1), 1); - // blend the two permutes - return _mm256_blendv_ps(r0, r1, kk); -#endif -} - -template -static inline Vec8f lookup(Vec8i const & index, float const * table) { - if (n <= 0) return 0; - if (n <= 4) { - Vec4f table1 = Vec4f().load(table); - return Vec8f( - lookup4 (index.get_low(), table1), - lookup4 (index.get_high(), table1)); - } -#if INSTRSET < 8 // not AVX2 - if (n <= 8) { - return lookup8(index, Vec8f().load(table)); - } -#endif - // Limit index - Vec8ui index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec8ui(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec8ui(index), n-1); - } -#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2 - return _mm256_i32gather_ps(table, index1, 4); -#else // AVX - return Vec8f(table[index1[0]],table[index1[1]],table[index1[2]],table[index1[3]], - table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]); -#endif -} - -static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) { -#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2 - // We can't use VPERMPD because it has constant indexes. - // Convert the index to fit VPERMPS - Vec8i index1 = permute8i<0,0,2,2,4,4,6,6> (Vec8i(index+index)); - Vec8i index2 = index1 + Vec8i(constant8i<0,1,0,1,0,1,0,1>()); -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order. 
fixed in 11.0 - return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(index2), _mm256_castpd_si256(table))); -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 has wrong parameter type and operands in wrong order - return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(index2), _mm256_castpd_ps(table))); -#else - // no bug version - return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castpd_ps(table), index2)); -#endif - -#else // AVX - // swap low and high part of table - __m256d t1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(table, 1)); - __m256d t2 = _mm256_insertf128_pd(t1, _mm256_castpd256_pd128(table), 1); - // index << 1 - __m128i index2lo = index.get_low() + index.get_low(); - __m128i index2hi = index.get_high() + index.get_high(); - // join index parts - __m256i index3 = _mm256_insertf128_si256(_mm256_castsi128_si256(index2lo), index2hi, 1); - // permute within each 128-bit part - __m256d r0 = _mm256_permutevar_pd(table, index3); - __m256d r1 = _mm256_permutevar_pd(t2, index3); - // high index bit for blend - __m128i k1 = _mm_slli_epi64(index.get_high() ^ 2, 62); - __m128i k0 = _mm_slli_epi64(index.get_low(), 62); - __m256d kk = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_castsi128_pd(k0)), _mm_castsi128_pd(k1), 1); - // blend the two permutes - return _mm256_blendv_pd(r0, r1, kk); -#endif -} - -template -static inline Vec4d lookup(Vec4q const & index, double const * table) { - if (n <= 0) return 0; - if (n <= 2) { - Vec2d table1 = Vec2d().load(table); - return Vec4d( - lookup2 (index.get_low(), table1), - lookup2 (index.get_high(), table1)); - } -#if INSTRSET < 8 // not AVX2 - if (n <= 4) { - return lookup4(index, Vec4d().load(table)); - } -#endif - // Limit index - Vec8ui index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec8ui(index) & constant8i(); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec8ui(index), constant8i() ); - } -#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2 - return _mm256_i64gather_pd(table, index1, 8); -#else // AVX - Vec4q index2 = Vec4q(index1); - return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]); -#endif -} -#endif // VECTORI256_H - -/***************************************************************************** -* -* Gather functions with fixed indexes -* -*****************************************************************************/ -// Load elements from array a with indices i0, i1, i2, i3, .. -template -static inline Vec8f gather8f(void const * a) { - return reinterpret_f(gather8i(a)); -} - -// Load elements from array a with indices i0, i1, i2, i3 -template -static inline Vec4d gather4d(void const * a) { - return reinterpret_d(gather4q(a)); -} - -/***************************************************************************** -* -* Vector scatter functions -* -****************************************************************************** -* -* These functions write the elements of a vector to arbitrary positions in an -* array in memory. Each vector element is written to an array position -* determined by an index. An element is not written if the corresponding -* index is out of range. -* The indexes can be specified as constant template parameters or as an -* integer vector. -* -* The scatter functions are useful if the data are distributed in a sparce -* manner into the array. 
If the array is dense then it is more efficient -* to permute the data into the right positions and then write the whole -* permuted vector into the array. -* -* Example: -* Vec8d a(10,11,12,13,14,15,16,17); -* double b[16] = {0}; -* scatter<0,2,14,10,1,-1,5,9>(a,b); -* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0} -* -*****************************************************************************/ - -template -static inline void scatter(Vec8f const & data, float * array) { -#if defined (__AVX512VL__) - __m256i indx = constant8i(); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3| (i4>=0)<<4| (i5>=0)<<5| (i6>=0)<<6| (i7>=0)<<7); - _mm256_mask_i32scatter_ps(array, mask, indx, data, 4); -#elif defined (__AVX512F__) - __m512i indx = _mm512_castsi256_si512(constant8i()); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3| (i4>=0)<<4| (i5>=0)<<5| (i6>=0)<<6| (i7>=0)<<7); - _mm512_mask_i32scatter_ps(array, mask, indx, _mm512_castps256_ps512(data), 4); -#else - const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7}; - for (int i = 0; i < 8; i++) { - if (index[i] >= 0) array[index[i]] = data[i]; - } -#endif -} - -template -static inline void scatter(Vec4d const & data, double * array) { -#if defined (__AVX512VL__) - __m128i indx = constant4i(); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3); - _mm256_mask_i32scatter_pd(array, mask, indx, data, 8); -#elif defined (__AVX512F__) - __m256i indx = _mm256_castsi128_si256(constant4i()); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3); - _mm512_mask_i32scatter_pd(array, mask, indx, _mm512_castpd256_pd512(data), 8); -#else - const int index[4] = {i0,i1,i2,i3}; - for (int i = 0; i < 4; i++) { - if (index[i] >= 0) array[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec8i const & index, uint32_t limit, Vec8f const & data, float * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit)); - _mm256_mask_i32scatter_ps(array, mask, index, data, 4); -#elif defined (__AVX512F__) - // 16 bit mask. upper 8 bits are (0<0) = false - __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit))); - _mm512_mask_i32scatter_ps(array, mask, _mm512_castsi256_si512(index), _mm512_castps256_ps512(data), 4); -#else - for (int i = 0; i < 8; i++) { - if (uint32_t(index[i]) < limit) array[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec4q const & index, uint32_t limit, Vec4d const & data, double * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm256_cmplt_epu64_mask(index, Vec4uq(uint64_t(limit))); - _mm256_mask_i64scatter_pd(array, mask, index, data, 8); -#elif defined (__AVX512F__) - // 16 bit mask. upper 8 bits are (0<0) = false - __mmask16 mask = _mm512_cmplt_epu64_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec4uq(uint64_t(limit)))); - _mm512_mask_i64scatter_pd(array, mask, _mm512_castsi256_si512(index), _mm512_castpd256_pd512(data), 8); -#else - for (int i = 0; i < 4; i++) { - if (uint64_t(index[i]) < uint64_t(limit)) array[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec4i const & index, uint32_t limit, Vec4d const & data, double * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit)); - _mm256_mask_i32scatter_pd(array, mask, index, data, 8); -#elif defined (__AVX512F__) - // 16 bit mask. 
upper 12 bits are (0<0) = false - __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit))); - _mm512_mask_i32scatter_pd(array, mask, _mm256_castsi128_si256(index), _mm512_castpd256_pd512(data), 8); -#else - for (int i = 0; i < 4; i++) { - if (uint32_t(index[i]) < limit) array[index[i]] = data[i]; - } -#endif -} - - -/***************************************************************************** -* -* Horizontal scan functions -* -*****************************************************************************/ - -// Get index to the first element that is true. Return -1 if all are false -static inline int horizontal_find_first(Vec8fb const & x) { - return horizontal_find_first(Vec8ib(x)); -} - -static inline int horizontal_find_first(Vec4db const & x) { - return horizontal_find_first(Vec4qb(x)); -} - -// Count the number of elements that are true -static inline uint32_t horizontal_count(Vec8fb const & x) { - return horizontal_count(Vec8ib(x)); -} - -static inline uint32_t horizontal_count(Vec4db const & x) { - return horizontal_count(Vec4qb(x)); -} - -/***************************************************************************** -* -* Boolean <-> bitfield conversion functions -* -*****************************************************************************/ - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec8fb const & x) { - return to_bits(Vec8ib(x)); -} - -// to_Vec8fb: convert integer bitfield to boolean vector -static inline Vec8fb to_Vec8fb(uint8_t x) { - return Vec8fb(to_Vec8ib(x)); -} - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4db const & x) { - return to_bits(Vec4qb(x)); -} - -// to_Vec4db: convert integer bitfield to boolean vector -static inline Vec4db to_Vec4db(uint8_t x) { - return Vec4db(to_Vec4qb(x)); -} - -#ifdef VCL_NAMESPACE -} -#endif - -#endif // VECTORF256_H diff --git a/DFTTest/vectorclass/vectori128.h b/DFTTest/vectorclass/vectori128.h deleted file mode 100644 index 17f5746..0000000 --- a/DFTTest/vectorclass/vectori128.h +++ /dev/null @@ -1,6394 +0,0 @@ -/**************************** vectori128.h ******************************* -* Author: Agner Fog -* Date created: 2012-05-30 -* Last modified: 2017-05-02 -* Version: 1.28 -* Project: vector classes -* Description: -* Header file defining integer vector classes as interface to intrinsic -* functions in x86 microprocessors with SSE2 and later instruction sets -* up to AVX. -* -* Instructions: -* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired -* instruction set, which must be at least SSE2. Specify the supported -* instruction set by a command line define, e.g. __SSE4_1__ if the -* compiler does not automatically do so. 
-* -* The following vector classes are defined here: -* Vec128b Vector of 128 1-bit unsigned integers or Booleans -* Vec16c Vector of 16 8-bit signed integers -* Vec16uc Vector of 16 8-bit unsigned integers -* Vec16cb Vector of 16 Booleans for use with Vec16c and Vec16uc -* Vec8s Vector of 8 16-bit signed integers -* Vec8us Vector of 8 16-bit unsigned integers -* Vec8sb Vector of 8 Booleans for use with Vec8s and Vec8us -* Vec4i Vector of 4 32-bit signed integers -* Vec4ui Vector of 4 32-bit unsigned integers -* Vec4ib Vector of 4 Booleans for use with Vec4i and Vec4ui -* Vec2q Vector of 2 64-bit signed integers -* Vec2uq Vector of 2 64-bit unsigned integers -* Vec2qb Vector of 2 Booleans for use with Vec2q and Vec2uq -* -* Each vector object is represented internally in the CPU as a 128-bit register. -* This header file defines operators and functions for these vectors. -* -* For example: -* Vec4i a(1,2,3,4), b(5,6,7,8), c; -* c = a + b; // now c contains (6,8,10,12) -* -* For detailed instructions, see VectorClass.pdf -* -* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses -*****************************************************************************/ -#ifndef VECTORI128_H -#define VECTORI128_H - -#include "instrset.h" // Select supported instruction set - -#if INSTRSET < 2 // SSE2 required -#error Please compile for the SSE2 instruction set or higher -#endif - -#ifdef VCL_NAMESPACE -namespace VCL_NAMESPACE { -#endif - -/***************************************************************************** -* -* Vector of 128 1-bit unsigned integers or Booleans -* -*****************************************************************************/ -class Vec128b { -protected: - __m128i xmm; // Integer vector -public: - // Default constructor: - Vec128b() { - } - // Constructor to broadcast the same value into all elements - // Removed because of undesired implicit conversions - // Vec128b(int i) { - // xmm = _mm_set1_epi32(-(i & 1));} - - // Constructor to convert from type __m128i used in intrinsics: - Vec128b(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec128b & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Type cast operator to convert to __m128i used in intrinsics - operator __m128i() const { - return xmm; - } - // Member function to load from array (unaligned) - Vec128b & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array, aligned by 16 - // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1, - // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. - // You may use load_a instead of load if you are certain that p points to an address - // divisible by 16. - Vec128b & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Member function to store 64-bit integer into array - void storel(void * p) const { - _mm_storel_epi64((__m128i*)p, xmm); - } - // Member function to store into array (unaligned) - void store(void * p) const { - _mm_storeu_si128((__m128i*)p, xmm); - } - // Member function to store into array, aligned by 16 - // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1, - // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA. 
- // You may use store_a instead of store if you are certain that p points to an address - // divisible by 16. - void store_a(void * p) const { - _mm_store_si128((__m128i*)p, xmm); - } - // Member function to store into array using a non-temporal memory hint, aligned by 16 - void stream(void * p) const { - _mm_stream_si128((__m128i*)p, xmm); - } - // Member function to change a single bit - // Note: This function is inefficient. Use load function if changing more than one bit - Vec128b const & set_bit(uint32_t index, int value) { - static const union { - uint64_t i[4]; - __m128i x[2]; - } u = {{1,0,0,1}}; // 2 vectors with bit 0 and 64 set, respectively - int w = (index >> 6) & 1; // qword index - int bi = index & 0x3F; // bit index within qword w - __m128i mask = u.x[w]; - mask = _mm_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number b set - if (value & 1) { - xmm = _mm_or_si128(mask,xmm); - } - else { - xmm = _mm_andnot_si128(mask,xmm); - } - return *this; - } - // Member function to get a single bit - // Note: This function is inefficient. Use store function if reading more than one bit - int get_bit(uint32_t index) const { - union { - __m128i x; - uint8_t i[16]; - } u; - u.x = xmm; - int w = (index >> 3) & 0xF; // byte index - int bi = index & 7; // bit index within byte w - return (u.i[w] >> bi) & 1; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { - return get_bit(index) != 0; - } - static int size() { - return 128; - } -}; - - -// Define operators for this class - -// vector operator & : bitwise and -static inline Vec128b operator & (Vec128b const & a, Vec128b const & b) { - return _mm_and_si128(a, b); -} -static inline Vec128b operator && (Vec128b const & a, Vec128b const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec128b operator | (Vec128b const & a, Vec128b const & b) { - return _mm_or_si128(a, b); -} -static inline Vec128b operator || (Vec128b const & a, Vec128b const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec128b operator ^ (Vec128b const & a, Vec128b const & b) { - return _mm_xor_si128(a, b); -} - -// vector operator ~ : bitwise not -static inline Vec128b operator ~ (Vec128b const & a) { - return _mm_xor_si128(a, _mm_set1_epi32(-1)); -} - -// vector operator &= : bitwise and -static inline Vec128b & operator &= (Vec128b & a, Vec128b const & b) { - a = a & b; - return a; -} - -// vector operator |= : bitwise or -static inline Vec128b & operator |= (Vec128b & a, Vec128b const & b) { - a = a | b; - return a; -} - -// vector operator ^= : bitwise xor -static inline Vec128b & operator ^= (Vec128b & a, Vec128b const & b) { - a = a ^ b; - return a; -} - -// Define functions for this class - -static inline __m128i zero_128b() { - return _mm_setzero_si128(); -} - -// function andnot: a & ~ b -static inline Vec128b andnot (Vec128b const & a, Vec128b const & b) { - return _mm_andnot_si128(b, a); -} - - -/***************************************************************************** -* -* Generate compile-time constant vector -* -*****************************************************************************/ -// Generate a constant vector of 4 integers stored in memory. 
-// Can be converted to any integer vector type -template -static inline __m128i constant4i() { - static const union { - int i[4]; - __m128i xmm; - } u = {{i0,i1,i2,i3}}; - return u.xmm; -} - -template -static inline __m128i constant4ui() { - return constant4i(); -} - -/***************************************************************************** -* -* selectb function -* -*****************************************************************************/ -// Select between two sources, byte by byte. Used in various functions and operators -// Corresponds to this pseudocode: -// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed. -// The implementation depends on the instruction set: -// If SSE4.1 is supported then only bit 7 in each byte of s is checked, -// otherwise all bits in s are used. -static inline __m128i selectb (__m128i const & s, __m128i const & a, __m128i const & b) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_blendv_epi8 (b, a, s); -#else - return _mm_or_si128( - _mm_and_si128(s,a), - _mm_andnot_si128(s,b)); -#endif -} - - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - -// horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec128b const & a) { -#if INSTRSET >= 5 // SSE4.1 supported. Use PTEST - return _mm_testc_si128(a,constant4i<-1,-1,-1,-1>()) != 0; -#else - __m128i t1 = _mm_unpackhi_epi64(a,a); // get 64 bits down - __m128i t2 = _mm_and_si128(a,t1); // and 64 bits -#ifdef __x86_64__ - int64_t t5 = _mm_cvtsi128_si64(t2); // transfer 64 bits to integer - return t5 == int64_t(-1); -#else - __m128i t3 = _mm_srli_epi64(t2,32); // get 32 bits down - __m128i t4 = _mm_and_si128(t2,t3); // and 32 bits - int t5 = _mm_cvtsi128_si32(t4); // transfer 32 bits to integer - return t5 == -1; -#endif // __x86_64__ -#endif // INSTRSET -} - -// horizontal_or. Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec128b const & a) { -#if INSTRSET >= 5 // SSE4.1 supported. Use PTEST - return ! 
_mm_testz_si128(a,a); -#else - __m128i t1 = _mm_unpackhi_epi64(a,a); // get 64 bits down - __m128i t2 = _mm_or_si128(a,t1); // and 64 bits -#ifdef __x86_64__ - int64_t t5 = _mm_cvtsi128_si64(t2); // transfer 64 bits to integer - return t5 != int64_t(0); -#else - __m128i t3 = _mm_srli_epi64(t2,32); // get 32 bits down - __m128i t4 = _mm_or_si128(t2,t3); // and 32 bits - int t5 = _mm_cvtsi128_si32(t4); // transfer to integer - return t5 != 0; -#endif // __x86_64__ -#endif // INSTRSET -} - - - -/***************************************************************************** -* -* Vector of 16 8-bit signed integers -* -*****************************************************************************/ - -class Vec16c : public Vec128b { -public: - // Default constructor: - Vec16c() { - } - // Constructor to broadcast the same value into all elements: - Vec16c(int i) { - xmm = _mm_set1_epi8((char)i); - } - // Constructor to build from all elements: - Vec16c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, - int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15) { - xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec16c(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec16c & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Type cast operator to convert to __m128i used in intrinsics - operator __m128i() const { - return xmm; - } - // Member function to load 64-bit integer from array - Vec16c & loadl(void const * p) { - xmm = _mm_loadl_epi64((__m128i const*)p); - return *this; - } - // Member function to load from array (unaligned) - Vec16c & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array (aligned) - Vec16c & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec16c & load_partial(int n, void const * p) { - if (n >= 16) load(p); - else if (n <= 0) *this = 0; - else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) { - // p is at least 16 bytes from a page boundary. OK to read 16 bytes - load(p); - } - else { - // worst case. read 1 byte at a time and suffer store forwarding penalty - char x[16]; - for (int i = 0; i < n; i++) x[i] = ((char const *)p)[i]; - load(x); - } - cutoff(n); - return *this; - } - // Partial store. Store n elements - void store_partial(int n, void * p) const { - if (n >= 16) { - store(p); - return; - } - if (n <= 0) return; - // we are not using _mm_maskmoveu_si128 because it is too slow on many processors - union { - int8_t c[16]; - int16_t s[8]; - int32_t i[4]; - int64_t q[2]; - } u; - store(u.c); - int j = 0; - if (n & 8) { - *(int64_t*)p = u.q[0]; - j += 8; - } - if (n & 4) { - ((int32_t*)p)[j/4] = u.i[j/4]; - j += 4; - } - if (n & 2) { - ((int16_t*)p)[j/2] = u.s[j/2]; - j += 2; - } - if (n & 1) { - ((int8_t*)p)[j] = u.c[j]; - } - } - // cut off vector to n elements. The last 16-n elements are set to zero - Vec16c & cutoff(int n) { - if (uint32_t(n) >= 16) return *this; - static const char mask[32] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - *this &= Vec16c().load(mask+16-n); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec16c const & insert(uint32_t index, int8_t value) { - static const int8_t maskl[32] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - -1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - __m128i broad = _mm_set1_epi8(value); // broadcast value into all elements - __m128i mask = _mm_loadu_si128((__m128i const*)(maskl+16-(index & 0x0F))); // mask with FF at index position - xmm = selectb(mask,broad,xmm); - return *this; - } - // Member function extract a single element from vector - int8_t extract(uint32_t index) const { - int8_t x[16]; - store(x); - return x[index & 0x0F]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - int8_t operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 16; - } -}; - -/***************************************************************************** -* -* Vec16cb: Vector of 16 Booleans for use with Vec16c and Vec16uc -* -*****************************************************************************/ - -class Vec16cb : public Vec16c { -public: - // Default constructor - Vec16cb() {} - // Constructor to build from all elements: - Vec16cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, - bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) { - xmm = Vec16c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), - -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15)); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec16cb(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec16cb & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec16cb(bool b) : Vec16c(-int8_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec16cb & operator = (bool b) { - *this = Vec16cb(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec16cb(int b); - Vec16cb & operator = (int x); -public: - Vec16cb & insert (int index, bool a) { - Vec16c::insert(index, -(int)a); - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec16c::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec16cb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec16cb operator & (Vec16cb const & a, Vec16cb const & b) { - return Vec16cb(Vec128b(a) & Vec128b(b)); -} -static inline Vec16cb operator && (Vec16cb const & a, Vec16cb const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec16cb & operator &= (Vec16cb & a, Vec16cb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec16cb operator | (Vec16cb const & a, Vec16cb const & b) { - return Vec16cb(Vec128b(a) | Vec128b(b)); -} -static inline Vec16cb operator || (Vec16cb const & a, Vec16cb const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec16cb & operator |= (Vec16cb & a, Vec16cb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec16cb operator ^ (Vec16cb const & a, Vec16cb const & b) { - return Vec16cb(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec16cb & operator ^= (Vec16cb & a, Vec16cb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec16cb operator ~ (Vec16cb const & a) { - return Vec16cb( ~ Vec128b(a)); -} - -// vector operator ! : element not -static inline Vec16cb operator ! (Vec16cb const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec16cb andnot (Vec16cb const & a, Vec16cb const & b) { - return Vec16cb(andnot(Vec128b(a), Vec128b(b))); -} - -// Horizontal Boolean functions for Vec16cb - -// horizontal_and. Returns true if all elements are true -static inline bool horizontal_and(Vec16cb const & a) { - return _mm_movemask_epi8(a) == 0xFFFF; -} - -// horizontal_or. Returns true if at least one element is true -static inline bool horizontal_or(Vec16cb const & a) { -#if INSTRSET >= 5 // SSE4.1 supported. 
Use PTEST - return !_mm_testz_si128(a, a); -#else - return _mm_movemask_epi8(a) != 0; -#endif -} - - -/***************************************************************************** -* -* Define operators for Vec16c -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec16c operator + (Vec16c const & a, Vec16c const & b) { - return _mm_add_epi8(a, b); -} - -// vector operator += : add -static inline Vec16c & operator += (Vec16c & a, Vec16c const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec16c operator ++ (Vec16c & a, int) { - Vec16c a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec16c & operator ++ (Vec16c & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec16c operator - (Vec16c const & a, Vec16c const & b) { - return _mm_sub_epi8(a, b); -} - -// vector operator - : unary minus -static inline Vec16c operator - (Vec16c const & a) { - return _mm_sub_epi8(_mm_setzero_si128(), a); -} - -// vector operator -= : add -static inline Vec16c & operator -= (Vec16c & a, Vec16c const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec16c operator -- (Vec16c & a, int) { - Vec16c a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec16c & operator -- (Vec16c & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec16c operator * (Vec16c const & a, Vec16c const & b) { - // There is no 8-bit multiply in SSE2. Split into two 16-bit multiplies - __m128i aodd = _mm_srli_epi16(a,8); // odd numbered elements of a - __m128i bodd = _mm_srli_epi16(b,8); // odd numbered elements of b - __m128i muleven = _mm_mullo_epi16(a,b); // product of even numbered elements - __m128i mulodd = _mm_mullo_epi16(aodd,bodd); // product of odd numbered elements - mulodd = _mm_slli_epi16(mulodd,8); // put odd numbered elements back in place - __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for even positions - __m128i product = selectb(mask,muleven,mulodd); // interleave even and odd - return product; -} - -// vector operator *= : multiply -static inline Vec16c & operator *= (Vec16c & a, Vec16c const & b) { - a = a * b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec16c operator << (Vec16c const & a, int b) { - uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out - __m128i am = _mm_and_si128(a,_mm_set1_epi8((char)mask)); // remove bits that will overflow - __m128i res = _mm_sll_epi16(am,_mm_cvtsi32_si128(b));// 16-bit shifts - return res; -} - -// vector operator <<= : shift left -static inline Vec16c & operator <<= (Vec16c & a, int b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic all elements -static inline Vec16c operator >> (Vec16c const & a, int b) { - __m128i aeven = _mm_slli_epi16(a,8); // even numbered elements of a. 
get sign bit in position - aeven = _mm_sra_epi16(aeven,_mm_cvtsi32_si128(b+8)); // shift arithmetic, back to position - __m128i aodd = _mm_sra_epi16(a,_mm_cvtsi32_si128(b)); // shift odd numbered elements arithmetic - __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for even positions - __m128i res = selectb(mask,aeven,aodd); // interleave even and odd - return res; -} - -// vector operator >>= : shift right arithmetic -static inline Vec16c & operator >>= (Vec16c & a, int b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec16cb operator == (Vec16c const & a, Vec16c const & b) { - return _mm_cmpeq_epi8(a,b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec16cb operator != (Vec16c const & a, Vec16c const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec16cb)_mm_comneq_epi8(a,b); -#else // SSE2 instruction set - return Vec16cb(Vec16c(~(a == b))); -#endif -} - -// vector operator > : returns true for elements for which a > b (signed) -static inline Vec16cb operator > (Vec16c const & a, Vec16c const & b) { - return _mm_cmpgt_epi8(a,b); -} - -// vector operator < : returns true for elements for which a < b (signed) -static inline Vec16cb operator < (Vec16c const & a, Vec16c const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec16cb operator >= (Vec16c const & a, Vec16c const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec16cb)_mm_comge_epi8(a,b); -#else // SSE2 instruction set - return Vec16cb(Vec16c(~(b > a))); -#endif -} - -// vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec16cb operator <= (Vec16c const & a, Vec16c const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec16c operator & (Vec16c const & a, Vec16c const & b) { - return Vec16c(Vec128b(a) & Vec128b(b)); -} -static inline Vec16c operator && (Vec16c const & a, Vec16c const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec16c & operator &= (Vec16c & a, Vec16c const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec16c operator | (Vec16c const & a, Vec16c const & b) { - return Vec16c(Vec128b(a) | Vec128b(b)); -} -static inline Vec16c operator || (Vec16c const & a, Vec16c const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec16c & operator |= (Vec16c & a, Vec16c const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec16c operator ^ (Vec16c const & a, Vec16c const & b) { - return Vec16c(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec16c & operator ^= (Vec16c & a, Vec16c const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec16c operator ~ (Vec16c const & a) { - return Vec16c( ~ Vec128b(a)); -} - -// vector operator ! : logical not, returns true for elements == 0 -static inline Vec16cb operator ! (Vec16c const & a) { - return _mm_cmpeq_epi8(a,_mm_setzero_si128()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. 
-static inline Vec16c select (Vec16cb const & s, Vec16c const & a, Vec16c const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec16c if_add (Vec16cb const & f, Vec16c const & a, Vec16c const & b) { - return a + (Vec16c(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec16c const & a) { - __m128i sum1 = _mm_sad_epu8(a,_mm_setzero_si128()); - __m128i sum2 = _mm_shuffle_epi32(sum1,2); - __m128i sum3 = _mm_add_epi16(sum1,sum2); - int8_t sum4 = (int8_t)_mm_cvtsi128_si32(sum3); // truncate to 8 bits - return sum4; // sign extend to 32 bits -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Each element is sign-extended before addition to avoid overflow -static inline int32_t horizontal_add_x (Vec16c const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epi8(a); - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element - __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum - return _mm_cvtsi128_si32(sum3); -#elif INSTRSET >= 4 // SSSE3 - __m128i aeven = _mm_slli_epi16(a,8); // even numbered elements of a. get sign bit in position - aeven = _mm_srai_epi16(aeven,8); // sign extend even numbered elements - __m128i aodd = _mm_srai_epi16(a,8); // sign extend odd numbered elements - __m128i sum1 = _mm_add_epi16(aeven,aodd); // add even and odd elements - __m128i sum2 = _mm_hadd_epi16(sum1,sum1); // horizontally add 8 elements in 3 steps - __m128i sum3 = _mm_hadd_epi16(sum2,sum2); - __m128i sum4 = _mm_hadd_epi16(sum3,sum3); - int16_t sum5 = (int16_t)_mm_cvtsi128_si32(sum4); // 16 bit sum - return sum5; // sign extend to 32 bits -#else // SSE2 - __m128i aeven = _mm_slli_epi16(a,8); // even numbered elements of a. get sign bit in position - aeven = _mm_srai_epi16(aeven,8); // sign extend even numbered elements - __m128i aodd = _mm_srai_epi16(a,8); // sign extend odd numbered elements - __m128i sum1 = _mm_add_epi16(aeven,aodd); // add even and odd elements - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // 4 high elements - __m128i sum3 = _mm_add_epi16(sum1,sum2); // 4 sums - __m128i sum4 = _mm_shuffle_epi32(sum3,0x01); // 2 high elements - __m128i sum5 = _mm_add_epi16(sum3,sum4); // 2 sums - __m128i sum6 = _mm_shufflelo_epi16(sum5,0x01); // 1 high element - __m128i sum7 = _mm_add_epi16(sum5,sum6); // 1 sum - int16_t sum8 = _mm_cvtsi128_si32(sum7); // 16 bit sum - return sum8; // sign extend to 32 bits -#endif -} - - -// function add_saturated: add element by element, signed with saturation -static inline Vec16c add_saturated(Vec16c const & a, Vec16c const & b) { - return _mm_adds_epi8(a, b); -} - -// function sub_saturated: subtract element by element, signed with saturation -static inline Vec16c sub_saturated(Vec16c const & a, Vec16c const & b) { - return _mm_subs_epi8(a, b); -} - -// function max: a > b ? a : b -static inline Vec16c max(Vec16c const & a, Vec16c const & b) { -#if INSTRSET >= 5 // SSE4.1 - return _mm_max_epi8(a,b); -#else // SSE2 - __m128i signbit = _mm_set1_epi32(0x80808080); - __m128i a1 = _mm_xor_si128(a,signbit); // add 0x80 - __m128i b1 = _mm_xor_si128(b,signbit); // add 0x80 - __m128i m1 = _mm_max_epu8(a1,b1); // unsigned max - return _mm_xor_si128(m1,signbit); // sub 0x80 -#endif -} - -// function min: a < b ? 
a : b -static inline Vec16c min(Vec16c const & a, Vec16c const & b) { -#if INSTRSET >= 5 // SSE4.1 - return _mm_min_epi8(a,b); -#else // SSE2 - __m128i signbit = _mm_set1_epi32(0x80808080); - __m128i a1 = _mm_xor_si128(a,signbit); // add 0x80 - __m128i b1 = _mm_xor_si128(b,signbit); // add 0x80 - __m128i m1 = _mm_min_epu8(a1,b1); // unsigned min - return _mm_xor_si128(m1,signbit); // sub 0x80 -#endif -} - -// function abs: a >= 0 ? a : -a -static inline Vec16c abs(Vec16c const & a) { -#if INSTRSET >= 4 // SSSE3 supported - return _mm_sign_epi8(a,a); -#else // SSE2 - __m128i nega = _mm_sub_epi8(_mm_setzero_si128(), a); - return _mm_min_epu8(a, nega); // unsigned min (the negative value is bigger when compared as unsigned) -#endif -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec16c abs_saturated(Vec16c const & a) { - __m128i absa = abs(a); // abs(a) - __m128i overfl = _mm_cmpgt_epi8(_mm_setzero_si128(),absa);// 0 > a - return _mm_add_epi8(absa,overfl); // subtract 1 if 0x80 -} - -// function rotate_left: rotate each element left by b bits -// Use negative count to rotate right -static inline Vec16c rotate_left(Vec16c const & a, int b) { -#ifdef __XOP__ // AMD XOP instruction set - return _mm_rot_epi8(a,_mm_set1_epi8(b)); -#else // SSE2 instruction set - __m128i bb = _mm_cvtsi32_si128(b & 7); // b modulo 8 - __m128i mbb = _mm_cvtsi32_si128((8-b) & 7); // 8-b modulo 8 - __m128i maskeven = _mm_set1_epi32(0x00FF00FF); // mask for even numbered bytes - __m128i even = _mm_and_si128(a,maskeven); // even numbered bytes of a - __m128i odd = _mm_andnot_si128(maskeven,a); // odd numbered bytes of a - __m128i evenleft = _mm_sll_epi16(even,bb); // even bytes of a << b - __m128i oddleft = _mm_sll_epi16(odd,bb); // odd bytes of a << b - __m128i evenright = _mm_srl_epi16(even,mbb); // even bytes of a >> 8-b - __m128i oddright = _mm_srl_epi16(odd,mbb); // odd bytes of a >> 8-b - __m128i evenrot = _mm_or_si128(evenleft,evenright); // even bytes of a rotated - __m128i oddrot = _mm_or_si128(oddleft,oddright); // odd bytes of a rotated - __m128i allrot = selectb(maskeven,evenrot,oddrot); // all bytes rotated - return allrot; -#endif -} - - -/***************************************************************************** -* -* Vector of 16 8-bit unsigned integers -* -*****************************************************************************/ - -class Vec16uc : public Vec16c { -public: - // Default constructor: - Vec16uc() { - } - // Constructor to broadcast the same value into all elements: - Vec16uc(uint32_t i) { - xmm = _mm_set1_epi8((char)i); - } - // Constructor to build from all elements: - Vec16uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, - uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15) { - xmm = _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec16uc(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec16uc & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Member function to load 64-bit integer from array - Vec16uc & loadl(void const * p) { - xmm = _mm_loadl_epi64((__m128i const*)p); - return *this; - } - // Member function to load from array (unaligned) - Vec16uc & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function 
to load from array (aligned) - Vec16uc & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec16uc const & insert(uint32_t index, uint8_t value) { - Vec16c::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint8_t extract(uint32_t index) const { - return Vec16c::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - uint8_t operator [] (uint32_t index) const { - return extract(index); - } -}; - -// Define operators for this class - -// vector operator << : shift left all elements -static inline Vec16uc operator << (Vec16uc const & a, uint32_t b) { - uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out - __m128i am = _mm_and_si128(a,_mm_set1_epi8((char)mask)); // remove bits that will overflow - __m128i res = _mm_sll_epi16(am,_mm_cvtsi32_si128(b));// 16-bit shifts - return res; -} - -// vector operator << : shift left all elements -static inline Vec16uc operator << (Vec16uc const & a, int32_t b) { - return a << (uint32_t)b; -} - -// vector operator >> : shift right logical all elements -static inline Vec16uc operator >> (Vec16uc const & a, uint32_t b) { - uint32_t mask = (uint32_t)0xFF << (uint32_t)b; // mask to remove bits that are shifted out - __m128i am = _mm_and_si128(a,_mm_set1_epi8((char)mask)); // remove bits that will overflow - __m128i res = _mm_srl_epi16(am,_mm_cvtsi32_si128(b));// 16-bit shifts - return res; -} - -// vector operator >> : shift right logical all elements -static inline Vec16uc operator >> (Vec16uc const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right logical -static inline Vec16uc & operator >>= (Vec16uc & a, int b) { - a = a >> b; - return a; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec16cb operator >= (Vec16uc const & a, Vec16uc const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec16cb)_mm_comge_epu8(a,b); -#else // SSE2 instruction set - return (Vec16cb)_mm_cmpeq_epi8(_mm_max_epu8(a,b),a); // a == max(a,b) -#endif -} - -// vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec16cb operator <= (Vec16uc const & a, Vec16uc const & b) { - return b >= a; -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec16cb operator > (Vec16uc const & a, Vec16uc const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec16cb)_mm_comgt_epu8(a,b); -#else // SSE2 instruction set - return Vec16cb(Vec16c(~(b >= a))); -#endif -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec16cb operator < (Vec16uc const & a, Vec16uc const & b) { - return b > a; -} - -// vector operator + : add -static inline Vec16uc operator + (Vec16uc const & a, Vec16uc const & b) { - return Vec16uc (Vec16c(a) + Vec16c(b)); -} - -// vector operator - : subtract -static inline Vec16uc operator - (Vec16uc const & a, Vec16uc const & b) { - return Vec16uc (Vec16c(a) - Vec16c(b)); -} - -// vector operator * : multiply -static inline Vec16uc operator * (Vec16uc const & a, Vec16uc const & b) { - return Vec16uc (Vec16c(a) * Vec16c(b)); -} - -// vector operator & : bitwise and -static 
inline Vec16uc operator & (Vec16uc const & a, Vec16uc const & b) { - return Vec16uc(Vec128b(a) & Vec128b(b)); -} -static inline Vec16uc operator && (Vec16uc const & a, Vec16uc const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec16uc operator | (Vec16uc const & a, Vec16uc const & b) { - return Vec16uc(Vec128b(a) | Vec128b(b)); -} -static inline Vec16uc operator || (Vec16uc const & a, Vec16uc const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec16uc operator ^ (Vec16uc const & a, Vec16uc const & b) { - return Vec16uc(Vec128b(a) ^ Vec128b(b)); -} - -// vector operator ~ : bitwise not -static inline Vec16uc operator ~ (Vec16uc const & a) { - return Vec16uc( ~ Vec128b(a)); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec16uc select (Vec16cb const & s, Vec16uc const & a, Vec16uc const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec16uc if_add (Vec16cb const & f, Vec16uc const & a, Vec16uc const & b) { - return a + (Vec16uc(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -// (Note: horizontal_add_x(Vec16uc) is slightly faster) -static inline uint32_t horizontal_add (Vec16uc const & a) { - __m128i sum1 = _mm_sad_epu8(a,_mm_setzero_si128()); - __m128i sum2 = _mm_shuffle_epi32(sum1,2); - __m128i sum3 = _mm_add_epi16(sum1,sum2); - uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3); // truncate to 16 bits - return sum4; -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Each element is zero-extended before addition to avoid overflow -static inline uint32_t horizontal_add_x (Vec16uc const & a) { - __m128i sum1 = _mm_sad_epu8(a,_mm_setzero_si128()); - __m128i sum2 = _mm_shuffle_epi32(sum1,2); - __m128i sum3 = _mm_add_epi16(sum1,sum2); - return _mm_cvtsi128_si32(sum3); -} - -// function add_saturated: add element by element, unsigned with saturation -static inline Vec16uc add_saturated(Vec16uc const & a, Vec16uc const & b) { - return _mm_adds_epu8(a, b); -} - -// function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec16uc sub_saturated(Vec16uc const & a, Vec16uc const & b) { - return _mm_subs_epu8(a, b); -} - -// function max: a > b ? a : b -static inline Vec16uc max(Vec16uc const & a, Vec16uc const & b) { - return _mm_max_epu8(a,b); -} - -// function min: a < b ? 
a : b -static inline Vec16uc min(Vec16uc const & a, Vec16uc const & b) { - return _mm_min_epu8(a,b); -} - -// function avg: (a + b + 1) >> 1 -static inline Vec16uc avg(Vec16uc const & a, Vec16uc const & b) { - return _mm_avg_epu8(a,b); -} - - - -/***************************************************************************** -* -* Vector of 8 16-bit signed integers -* -*****************************************************************************/ - -class Vec8s : public Vec128b { -public: - // Default constructor: - Vec8s() { - } - // Constructor to broadcast the same value into all elements: - Vec8s(int i) { - xmm = _mm_set1_epi16((int16_t)i); - } - // Constructor to build from all elements: - Vec8s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7) { - xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec8s(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec8s & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Type cast operator to convert to __m128i used in intrinsics - operator __m128i() const { - return xmm; - } - // Member function to load 64-bit integer from array - Vec8s & loadl(void const * p) { - xmm = _mm_loadl_epi64((__m128i const*)p); - return *this; - } - // Member function to load from array (unaligned) - Vec8s & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array (aligned) - Vec8s & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Member function to load 8 8-bit unsigned integers from array - Vec8s & load_8uc(void const * p) { -#if INSTRSET >= 5 // SSE4.1 - xmm = _mm_cvtepu8_epi16(Vec16uc().loadl(p)); -#else - xmm = _mm_unpacklo_epi8(Vec16uc().loadl(p),_mm_setzero_si128()); -#endif - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec8s & load_partial(int n, void const * p) { - if (n >= 8) load(p); - else if (n <= 0) *this = 0; - else if (((int)(intptr_t)p & 0xFFF) < 0xFF0) { - // p is at least 16 bytes from a page boundary. OK to read 16 bytes - load(p); - } - else { - // worst case. read 1 byte at a time and suffer store forwarding penalty - int16_t x[8]; - for (int i = 0; i < n; i++) x[i] = ((int16_t const *)p)[i]; - load(x); - } - cutoff(n); - return *this; - } - // Partial store. Store n elements - void store_partial(int n, void * p) const { - if (n >= 8) { - store(p); - return; - } - if (n <= 0) return; - // we are not using _mm_maskmoveu_si128 because it is too slow on many processors - union { - int8_t c[16]; - int16_t s[8]; - int32_t i[4]; - int64_t q[2]; - } u; - store(u.c); - int j = 0; - if (n & 4) { - *(int64_t*)p = u.q[0]; - j += 8; - } - if (n & 2) { - ((int32_t*)p)[j/4] = u.i[j/4]; - j += 4; - } - if (n & 1) { - ((int16_t*)p)[j/2] = u.s[j/2]; - } - } - // cut off vector to n elements. The last 8-n elements are set to zero - Vec8s & cutoff(int n) { - *this = Vec16c(xmm).cutoff(n * 2); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec8s const & insert(uint32_t index, int16_t value) { - switch(index) { - case 0: - xmm = _mm_insert_epi16(xmm,value,0); break; - case 1: - xmm = _mm_insert_epi16(xmm,value,1); break; - case 2: - xmm = _mm_insert_epi16(xmm,value,2); break; - case 3: - xmm = _mm_insert_epi16(xmm,value,3); break; - case 4: - xmm = _mm_insert_epi16(xmm,value,4); break; - case 5: - xmm = _mm_insert_epi16(xmm,value,5); break; - case 6: - xmm = _mm_insert_epi16(xmm,value,6); break; - case 7: - xmm = _mm_insert_epi16(xmm,value,7); break; - } - return *this; - } - // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - int16_t extract(uint32_t index) const { - switch(index) { - case 0: - return (int16_t)_mm_extract_epi16(xmm,0); - case 1: - return (int16_t)_mm_extract_epi16(xmm,1); - case 2: - return (int16_t)_mm_extract_epi16(xmm,2); - case 3: - return (int16_t)_mm_extract_epi16(xmm,3); - case 4: - return (int16_t)_mm_extract_epi16(xmm,4); - case 5: - return (int16_t)_mm_extract_epi16(xmm,5); - case 6: - return (int16_t)_mm_extract_epi16(xmm,6); - case 7: - return (int16_t)_mm_extract_epi16(xmm,7); - } - return 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - int16_t operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 8; - } -}; - -/***************************************************************************** -* -* Vec8sb: Vector of 8 Booleans for use with Vec8s and Vec8us -* -*****************************************************************************/ - -class Vec8sb : public Vec8s { -public: - // Constructor to build from all elements: - Vec8sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) { - xmm = Vec8s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7)); - } - // Default constructor: - Vec8sb() { - } - // Constructor to convert from type __m128i used in intrinsics: - Vec8sb(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec8sb & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec8sb(bool b) : Vec8s(-int16_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec8sb & operator = (bool b) { - *this = Vec8sb(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec8sb(int b); - Vec8sb & operator = (int x); -public: - Vec8sb & insert (int index, bool a) { - Vec8s::insert(index, -(int)a); - return *this; - } - // Member function extract a single element from vector - // Note: This function is inefficient. Use store function if extracting more than one element - bool extract(uint32_t index) const { - return Vec8s::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec8sb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec8sb operator & (Vec8sb const & a, Vec8sb const & b) { - return Vec8sb(Vec128b(a) & Vec128b(b)); -} -static inline Vec8sb operator && (Vec8sb const & a, Vec8sb const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec8sb & operator &= (Vec8sb & a, Vec8sb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec8sb operator | (Vec8sb const & a, Vec8sb const & b) { - return Vec8sb(Vec128b(a) | Vec128b(b)); -} -static inline Vec8sb operator || (Vec8sb const & a, Vec8sb const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec8sb & operator |= (Vec8sb & a, Vec8sb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec8sb operator ^ (Vec8sb const & a, Vec8sb const & b) { - return Vec8sb(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec8sb & operator ^= (Vec8sb & a, Vec8sb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec8sb operator ~ (Vec8sb const & a) { - return Vec8sb( ~ Vec128b(a)); -} - -// vector operator ! : element not -static inline Vec8sb operator ! (Vec8sb const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec8sb andnot (Vec8sb const & a, Vec8sb const & b) { - return Vec8sb(andnot(Vec128b(a), Vec128b(b))); -} - -// Horizontal Boolean functions for Vec8sb - -// horizontal_and. Returns true if all elements are true -static inline bool horizontal_and(Vec8sb const & a) { - return _mm_movemask_epi8(a) == 0xFFFF; -} - -// horizontal_or. Returns true if at least one element is true -static inline bool horizontal_or(Vec8sb const & a) { -#if INSTRSET >= 5 // SSE4.1 supported. 
Use PTEST - return !_mm_testz_si128(a, a); -#else - return _mm_movemask_epi8(a) != 0; -#endif -} - - -/***************************************************************************** -* -* operators for Vec8s -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec8s operator + (Vec8s const & a, Vec8s const & b) { - return _mm_add_epi16(a, b); -} - -// vector operator += : add -static inline Vec8s & operator += (Vec8s & a, Vec8s const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec8s operator ++ (Vec8s & a, int) { - Vec8s a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec8s & operator ++ (Vec8s & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec8s operator - (Vec8s const & a, Vec8s const & b) { - return _mm_sub_epi16(a, b); -} - -// vector operator - : unary minus -static inline Vec8s operator - (Vec8s const & a) { - return _mm_sub_epi16(_mm_setzero_si128(), a); -} - -// vector operator -= : subtract -static inline Vec8s & operator -= (Vec8s & a, Vec8s const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec8s operator -- (Vec8s & a, int) { - Vec8s a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec8s & operator -- (Vec8s & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec8s operator * (Vec8s const & a, Vec8s const & b) { - return _mm_mullo_epi16(a, b); -} - -// vector operator *= : multiply -static inline Vec8s & operator *= (Vec8s & a, Vec8s const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -// See bottom of file - - -// vector operator << : shift left -static inline Vec8s operator << (Vec8s const & a, int b) { - return _mm_sll_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator <<= : shift left -static inline Vec8s & operator <<= (Vec8s & a, int b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic -static inline Vec8s operator >> (Vec8s const & a, int b) { - return _mm_sra_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >>= : shift right arithmetic -static inline Vec8s & operator >>= (Vec8s & a, int b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec8sb operator == (Vec8s const & a, Vec8s const & b) { - return _mm_cmpeq_epi16(a, b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec8sb operator != (Vec8s const & a, Vec8s const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec8sb)_mm_comneq_epi16(a,b); -#else // SSE2 instruction set - return Vec8sb(Vec8s(~(a == b))); -#endif -} - -// vector operator > : returns true for elements for which a > b -static inline Vec8sb operator > (Vec8s const & a, Vec8s const & b) { - return _mm_cmpgt_epi16(a, b); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec8sb operator < (Vec8s const & a, Vec8s const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec8sb operator >= (Vec8s const & a, Vec8s const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec8sb)_mm_comge_epi16(a,b); -#else // SSE2 instruction set - return Vec8sb(Vec8s(~(b > a))); -#endif -} - -// vector operator <= : returns true for 
elements for which a <= b (signed) -static inline Vec8sb operator <= (Vec8s const & a, Vec8s const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec8s operator & (Vec8s const & a, Vec8s const & b) { - return Vec8s(Vec128b(a) & Vec128b(b)); -} -static inline Vec8s operator && (Vec8s const & a, Vec8s const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec8s & operator &= (Vec8s & a, Vec8s const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec8s operator | (Vec8s const & a, Vec8s const & b) { - return Vec8s(Vec128b(a) | Vec128b(b)); -} -static inline Vec8s operator || (Vec8s const & a, Vec8s const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec8s & operator |= (Vec8s & a, Vec8s const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec8s operator ^ (Vec8s const & a, Vec8s const & b) { - return Vec8s(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec8s & operator ^= (Vec8s & a, Vec8s const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec8s operator ~ (Vec8s const & a) { - return Vec8s( ~ Vec128b(a)); -} - -// vector operator ! : logical not, returns true for elements == 0 -static inline Vec8s operator ! (Vec8s const & a) { - return _mm_cmpeq_epi16(a,_mm_setzero_si128()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec8s select (Vec8sb const & s, Vec8s const & a, Vec8s const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8s if_add (Vec8sb const & f, Vec8s const & a, Vec8s const & b) { - return a + (Vec8s(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec8s const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epi16(a); - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element - __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum - int16_t sum4 = _mm_cvtsi128_si32(sum3); // truncate to 16 bits - return sum4; // sign extend to 32 bits -#elif INSTRSET >= 4 // SSSE3 - __m128i sum1 = _mm_hadd_epi16(a,a); // horizontally add 8 elements in 3 steps - __m128i sum2 = _mm_hadd_epi16(sum1,sum1); - __m128i sum3 = _mm_hadd_epi16(sum2,sum2); - int16_t sum4 = (int16_t)_mm_cvtsi128_si32(sum3); // 16 bit sum - return sum4; // sign extend to 32 bits -#else // SSE2 - __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // 4 high elements - __m128i sum2 = _mm_add_epi16(a,sum1); // 4 sums - __m128i sum3 = _mm_shuffle_epi32(sum2,0x01); // 2 high elements - __m128i sum4 = _mm_add_epi16(sum2,sum3); // 2 sums - __m128i sum5 = _mm_shufflelo_epi16(sum4,0x01); // 1 high element - __m128i sum6 = _mm_add_epi16(sum4,sum5); // 1 sum - int16_t sum7 = _mm_cvtsi128_si32(sum6); // 16 bit sum - return sum7; // sign extend to 32 bits -#endif -} - -// Horizontal add extended: Calculates the sum of all vector elements. 
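/* Editor's note -- illustrative sketch only, not part of this diff; the helper name is made up.
   A scalar reference for the horizontal_add(Vec8s) overloads above, assuming the documented
   behaviour: the eight 16-bit lanes are summed with wrap-around overflow and the 16-bit
   result is then sign-extended to 32 bits. */
#include <cstdint>

static int32_t horizontal_add_ref(const int16_t a[8]) {
    uint16_t sum = 0;                                        // unsigned, so wrap-around is well defined
    for (int i = 0; i < 8; i++)
        sum = static_cast<uint16_t>(sum + static_cast<uint16_t>(a[i]));
    return static_cast<int16_t>(sum);                        // reinterpret as signed, sign-extend to 32 bits
}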
-// Elements are sign extended before adding to avoid overflow -static inline int32_t horizontal_add_x (Vec8s const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epi16(a); - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element - __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum - return _mm_cvtsi128_si32(sum3); -#elif INSTRSET >= 4 // SSSE3 - __m128i aeven = _mm_slli_epi32(a,16); // even numbered elements of a. get sign bit in position - aeven = _mm_srai_epi32(aeven,16); // sign extend even numbered elements - __m128i aodd = _mm_srai_epi32(a,16); // sign extend odd numbered elements - __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements - __m128i sum2 = _mm_hadd_epi32(sum1,sum1); // horizontally add 4 elements in 2 steps - __m128i sum3 = _mm_hadd_epi32(sum2,sum2); - return _mm_cvtsi128_si32(sum3); -#else // SSE2 - __m128i aeven = _mm_slli_epi32(a,16); // even numbered elements of a. get sign bit in position - aeven = _mm_srai_epi32(aeven,16); // sign extend even numbered elements - __m128i aodd = _mm_srai_epi32(a,16); // sign extend odd numbered elements - __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // 2 high elements - __m128i sum3 = _mm_add_epi32(sum1,sum2); - __m128i sum4 = _mm_shuffle_epi32(sum3,0x01); // 1 high elements - __m128i sum5 = _mm_add_epi32(sum3,sum4); - return _mm_cvtsi128_si32(sum5); // 32 bit sum -#endif -} - -// function add_saturated: add element by element, signed with saturation -static inline Vec8s add_saturated(Vec8s const & a, Vec8s const & b) { - return _mm_adds_epi16(a, b); -} - -// function sub_saturated: subtract element by element, signed with saturation -static inline Vec8s sub_saturated(Vec8s const & a, Vec8s const & b) { - return _mm_subs_epi16(a, b); -} - -// function max: a > b ? a : b -static inline Vec8s max(Vec8s const & a, Vec8s const & b) { - return _mm_max_epi16(a,b); -} - -// function min: a < b ? a : b -static inline Vec8s min(Vec8s const & a, Vec8s const & b) { - return _mm_min_epi16(a,b); -} - -// function abs: a >= 0 ? 
a : -a -static inline Vec8s abs(Vec8s const & a) { -#if INSTRSET >= 4 // SSSE3 supported - return _mm_sign_epi16(a,a); -#else // SSE2 - __m128i nega = _mm_sub_epi16(_mm_setzero_si128(), a); - return _mm_max_epi16(a, nega); -#endif -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec8s abs_saturated(Vec8s const & a) { - __m128i absa = abs(a); // abs(a) - __m128i overfl = _mm_srai_epi16(absa,15); // sign - return _mm_add_epi16(absa,overfl); // subtract 1 if 0x8000 -} - -// function rotate_left all elements -// Use negative count to rotate right -static inline Vec8s rotate_left(Vec8s const & a, int b) { -#ifdef __XOP__ // AMD XOP instruction set - return _mm_rot_epi16(a,_mm_set1_epi16(b)); -#else // SSE2 instruction set - __m128i left = _mm_sll_epi16(a,_mm_cvtsi32_si128(b & 0x0F)); // a << b - __m128i right = _mm_srl_epi16(a,_mm_cvtsi32_si128((16-b) & 0x0F)); // a >> (16 - b) - __m128i rot = _mm_or_si128(left,right); // or - return rot; -#endif -} - - -/***************************************************************************** -* -* Vector of 8 16-bit unsigned integers -* -*****************************************************************************/ - -class Vec8us : public Vec8s { -public: - // Default constructor: - Vec8us() { - } - // Constructor to broadcast the same value into all elements: - Vec8us(uint32_t i) { - xmm = _mm_set1_epi16((int16_t)i); - } - // Constructor to build from all elements: - Vec8us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7) { - xmm = _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec8us(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec8us & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Member function to load 64-bit integer from array - Vec8us & loadl(void const * p) { - xmm = _mm_loadl_epi64((__m128i const*)p); - return *this; - } - // Member function to load from array (unaligned) - Vec8us & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array (aligned) - Vec8us & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8us const & insert(uint32_t index, uint16_t value) { - Vec8s::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint16_t extract(uint32_t index) const { - return Vec8s::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
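/* Editor's note -- illustrative sketch only, not part of this diff; the helper name is made up.
   A scalar model of the abs_saturated(Vec8s) trick defined above: plain abs() wraps -32768
   back to 0x8000, and adding the arithmetic-shifted sign of that result turns the single
   overflow case into 0x7FFF while leaving every other value unchanged. */
#include <cstdint>

static int16_t abs_saturated_model(int16_t a) {
    uint16_t ua     = static_cast<uint16_t>(a);
    uint16_t absa   = (a < 0) ? static_cast<uint16_t>(0u - ua) : ua;  // wraps to 0x8000 for -32768
    int      overfl = (absa >> 15) ? -1 : 0;     // plays the role of _mm_srai_epi16(absa, 15)
    return static_cast<int16_t>(absa + overfl);  // 0x8000 + (-1) == 0x7FFF
}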
- uint16_t operator [] (uint32_t index) const { - return extract(index); - } -}; - -// Define operators for this class - -// vector operator + : add -static inline Vec8us operator + (Vec8us const & a, Vec8us const & b) { - return Vec8us (Vec8s(a) + Vec8s(b)); -} - -// vector operator - : subtract -static inline Vec8us operator - (Vec8us const & a, Vec8us const & b) { - return Vec8us (Vec8s(a) - Vec8s(b)); -} - -// vector operator * : multiply -static inline Vec8us operator * (Vec8us const & a, Vec8us const & b) { - return Vec8us (Vec8s(a) * Vec8s(b)); -} - -// vector operator / : divide -// See bottom of file - -// vector operator >> : shift right logical all elements -static inline Vec8us operator >> (Vec8us const & a, uint32_t b) { - return _mm_srl_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec8us operator >> (Vec8us const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right logical -static inline Vec8us & operator >>= (Vec8us & a, int b) { - a = a >> b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec8us operator << (Vec8us const & a, uint32_t b) { - return _mm_sll_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator << : shift left all elements -static inline Vec8us operator << (Vec8us const & a, int32_t b) { - return a << (uint32_t)b; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec8sb operator >= (Vec8us const & a, Vec8us const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return _mm_comge_epu16(a,b); -#elif INSTRSET >= 5 // SSE4.1 - __m128i max_ab = _mm_max_epu16(a,b); // max(a,b), unsigned - return _mm_cmpeq_epi16(a,max_ab); // a == max(a,b) -#else // SSE2 instruction set - __m128i s = _mm_subs_epu16(b,a); // b-a, saturated - return _mm_cmpeq_epi16(s, _mm_setzero_si128()); // s == 0 -#endif -} - -// vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec8sb operator <= (Vec8us const & a, Vec8us const & b) { - return b >= a; -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec8sb operator > (Vec8us const & a, Vec8us const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec8sb)_mm_comgt_epu16(a,b); -#else // SSE2 instruction set - return Vec8sb(Vec8s(~(b >= a))); -#endif -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec8sb operator < (Vec8us const & a, Vec8us const & b) { - return b > a; -} - -// vector operator & : bitwise and -static inline Vec8us operator & (Vec8us const & a, Vec8us const & b) { - return Vec8us(Vec128b(a) & Vec128b(b)); -} -static inline Vec8us operator && (Vec8us const & a, Vec8us const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec8us operator | (Vec8us const & a, Vec8us const & b) { - return Vec8us(Vec128b(a) | Vec128b(b)); -} -static inline Vec8us operator || (Vec8us const & a, Vec8us const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec8us operator ^ (Vec8us const & a, Vec8us const & b) { - return Vec8us(Vec128b(a) ^ Vec128b(b)); -} - -// vector operator ~ : bitwise not -static inline Vec8us operator ~ (Vec8us const & a) { - return Vec8us( ~ Vec128b(a)); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? 
a[i] : b[i]; -// Each word in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec8us select (Vec8sb const & s, Vec8us const & a, Vec8us const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8us if_add (Vec8sb const & f, Vec8us const & a, Vec8us const & b) { - return a + (Vec8us(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint32_t horizontal_add (Vec8us const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epu16(a); - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element - __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum - uint16_t sum4 = _mm_cvtsi128_si32(sum3); // truncate to 16 bits - return sum4; // zero extend to 32 bits -#elif INSTRSET >= 4 // SSSE3 - __m128i sum1 = _mm_hadd_epi16(a,a); // horizontally add 8 elements in 3 steps - __m128i sum2 = _mm_hadd_epi16(sum1,sum1); - __m128i sum3 = _mm_hadd_epi16(sum2,sum2); - uint16_t sum4 = (uint16_t)_mm_cvtsi128_si32(sum3); // 16 bit sum - return sum4; // zero extend to 32 bits -#else // SSE2 - __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // 4 high elements - __m128i sum2 = _mm_add_epi16(a,sum1); // 4 sums - __m128i sum3 = _mm_shuffle_epi32(sum2,0x01); // 2 high elements - __m128i sum4 = _mm_add_epi16(sum2,sum3); // 2 sums - __m128i sum5 = _mm_shufflelo_epi16(sum4,0x01); // 1 high element - __m128i sum6 = _mm_add_epi16(sum4,sum5); // 1 sum - uint16_t sum7 = _mm_cvtsi128_si32(sum6); // 16 bit sum - return sum7; // zero extend to 32 bits -#endif -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Each element is zero-extended before addition to avoid overflow -static inline uint32_t horizontal_add_x (Vec8us const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epu16(a); - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element - __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum - return _mm_cvtsi128_si32(sum3); -#elif INSTRSET >= 4 // SSSE3 - __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for even positions - __m128i aeven = _mm_and_si128(a,mask); // even numbered elements of a - __m128i aodd = _mm_srli_epi32(a,16); // zero extend odd numbered elements - __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements - __m128i sum2 = _mm_hadd_epi32(sum1,sum1); // horizontally add 4 elements in 2 steps - __m128i sum3 = _mm_hadd_epi32(sum2,sum2); - return _mm_cvtsi128_si32(sum3); -#else // SSE2 - __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for even positions - __m128i aeven = _mm_and_si128(a,mask); // even numbered elements of a - __m128i aodd = _mm_srli_epi32(a,16); // zero extend odd numbered elements - __m128i sum1 = _mm_add_epi32(aeven,aodd); // add even and odd elements - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // 2 high elements - __m128i sum3 = _mm_add_epi32(sum1,sum2); - __m128i sum4 = _mm_shuffle_epi32(sum3,0x01); // 1 high elements - __m128i sum5 = _mm_add_epi32(sum3,sum4); - return _mm_cvtsi128_si32(sum5); // 16 bit sum -#endif -} - -// function add_saturated: add element by element, unsigned with saturation -static inline Vec8us add_saturated(Vec8us const & a, Vec8us const & b) { - return _mm_adds_epu16(a, b); -} - -// function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec8us sub_saturated(Vec8us const & a, Vec8us const & b) { - return _mm_subs_epu16(a, 
b); -} - -// function max: a > b ? a : b -static inline Vec8us max(Vec8us const & a, Vec8us const & b) { -#if INSTRSET >= 5 // SSE4.1 - return _mm_max_epu16(a,b); -#else // SSE2 - __m128i signbit = _mm_set1_epi32(0x80008000); - __m128i a1 = _mm_xor_si128(a,signbit); // add 0x8000 - __m128i b1 = _mm_xor_si128(b,signbit); // add 0x8000 - __m128i m1 = _mm_max_epi16(a1,b1); // signed max - return _mm_xor_si128(m1,signbit); // sub 0x8000 -#endif -} - -// function min: a < b ? a : b -static inline Vec8us min(Vec8us const & a, Vec8us const & b) { -#if INSTRSET >= 5 // SSE4.1 - return _mm_min_epu16(a,b); -#else // SSE2 - __m128i signbit = _mm_set1_epi32(0x80008000); - __m128i a1 = _mm_xor_si128(a,signbit); // add 0x8000 - __m128i b1 = _mm_xor_si128(b,signbit); // add 0x8000 - __m128i m1 = _mm_min_epi16(a1,b1); // signed min - return _mm_xor_si128(m1,signbit); // sub 0x8000 -#endif -} - -// function avg: (a + b + 1) >> 1 -static inline Vec8us avg(Vec8us const & a, Vec8us const & b) { - return _mm_avg_epu16(a,b); -} - - - -/***************************************************************************** -* -* Vector of 4 32-bit signed integers -* -*****************************************************************************/ - -class Vec4i : public Vec128b { -public: - // Default constructor: - Vec4i() { - } - // Constructor to broadcast the same value into all elements: - Vec4i(int i) { - xmm = _mm_set1_epi32(i); - } - // Constructor to build from all elements: - Vec4i(int32_t i0, int32_t i1, int32_t i2, int32_t i3) { - xmm = _mm_setr_epi32(i0, i1, i2, i3); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec4i(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec4i & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Type cast operator to convert to __m128i used in intrinsics - operator __m128i() const { - return xmm; - } - // Member function to load from array (unaligned) - Vec4i & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array (aligned) - Vec4i & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Member function to load 4 8-bit unsigned integers from array - Vec4i & load_4uc(void const * p) { -#if INSTRSET >= 5 // SSE4.1 - xmm = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t const*)p)); -#else - __m128i zero = _mm_setzero_si128(); - xmm = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int32_t const*)p),zero),zero); -#endif - return *this; - } - // Member function to load 4 16-bit unsigned integers from array - Vec4i & load_4us(void const * p) { -#if INSTRSET >= 5 // SSE4.1 - xmm = _mm_cvtepu16_epi32(Vec8us().loadl(p)); -#else - xmm = _mm_unpacklo_epi16(Vec8us().loadl(p),_mm_setzero_si128()); -#endif - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec4i & load_partial(int n, void const * p) { - switch (n) { - case 0: - *this = 0; break; - case 1: - xmm = _mm_cvtsi32_si128(*(int32_t const*)p); break; - case 2: - // intrinsic for movq is missing! - xmm = _mm_setr_epi32(((int32_t const*)p)[0], ((int32_t const*)p)[1], 0, 0); break; - case 3: - xmm = _mm_setr_epi32(((int32_t const*)p)[0], ((int32_t const*)p)[1], ((int32_t const*)p)[2], 0); break; - case 4: - load(p); break; - default: - break; - } - return *this; - } - // Partial store. 
Store n elements - void store_partial(int n, void * p) const { - union { - int32_t i[4]; - int64_t q[2]; - } u; - switch (n) { - case 1: - *(int32_t*)p = _mm_cvtsi128_si32(xmm); break; - case 2: - // intrinsic for movq is missing! - store(u.i); - *(int64_t*)p = u.q[0]; break; - case 3: - store(u.i); - *(int64_t*)p = u.q[0]; - ((int32_t*)p)[2] = u.i[2]; break; - case 4: - store(p); break; - default: - break; - } - } - // cut off vector to n elements. The last 4-n elements are set to zero - Vec4i & cutoff(int n) { - *this = Vec16c(xmm).cutoff(n * 4); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4i const & insert(uint32_t index, int32_t value) { - static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0}; - __m128i broad = _mm_set1_epi32(value); // broadcast value into all elements - __m128i mask = _mm_loadu_si128((__m128i const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position - xmm = selectb(mask,broad,xmm); - return *this; - } - // Member function extract a single element from vector - int32_t extract(uint32_t index) const { - int32_t x[4]; - store(x); - return x[index & 3]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - int32_t operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 4; - } -}; - - -/***************************************************************************** -* -* Vec4ib: Vector of 4 Booleans for use with Vec4i and Vec4ui -* -*****************************************************************************/ -class Vec4ib : public Vec4i { -public: - // Default constructor: - Vec4ib() { - } - // Constructor to build from all elements: - Vec4ib(bool x0, bool x1, bool x2, bool x3) { - xmm = Vec4i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3)); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec4ib(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec4ib & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec4ib(bool b) : Vec4i(-int32_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec4ib & operator = (bool b) { - *this = Vec4ib(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec4ib(int b); - Vec4ib & operator = (int x); -public: - Vec4ib & insert (int index, bool a) { - Vec4i::insert(index, -(int)a); - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec4i::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
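/* Editor's note -- illustrative sketch only, not part of this diff; the helper name is made up.
   A scalar model of the Vec4i::insert() masking trick above: broadcast the new value, slide a
   window over a small constant table so that exactly one lane of the mask is -1, then blend
   lane-wise (the job done by selectb() in the SIMD code). */
#include <cstdint>

static void insert_model(int32_t v[4], uint32_t index, int32_t value) {
    static const int32_t maskl[8] = {0, 0, 0, 0, -1, 0, 0, 0};
    const int32_t* mask = maskl + 4 - (index & 3);   // -1 lands at position 'index'
    for (int i = 0; i < 4; i++)
        v[i] = mask[i] ? value : v[i];               // keep the old lane where the mask is 0
}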
- bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec4ib -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec4ib operator & (Vec4ib const & a, Vec4ib const & b) { - return Vec4ib(Vec128b(a) & Vec128b(b)); -} -static inline Vec4ib operator && (Vec4ib const & a, Vec4ib const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec4ib & operator &= (Vec4ib & a, Vec4ib const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec4ib operator | (Vec4ib const & a, Vec4ib const & b) { - return Vec4ib(Vec128b(a) | Vec128b(b)); -} -static inline Vec4ib operator || (Vec4ib const & a, Vec4ib const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec4ib & operator |= (Vec4ib & a, Vec4ib const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4ib operator ^ (Vec4ib const & a, Vec4ib const & b) { - return Vec4ib(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec4ib & operator ^= (Vec4ib & a, Vec4ib const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec4ib operator ~ (Vec4ib const & a) { - return Vec4ib( ~ Vec128b(a)); -} - -// vector operator ! : element not -static inline Vec4ib operator ! (Vec4ib const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec4ib andnot (Vec4ib const & a, Vec4ib const & b) { - return Vec4ib(andnot(Vec128b(a), Vec128b(b))); -} - -// Horizontal Boolean functions for Vec4ib - -// horizontal_and. Returns true if all elements are true -static inline bool horizontal_and(Vec4ib const & a) { - return _mm_movemask_epi8(a) == 0xFFFF; -} - -// horizontal_or. Returns true if at least one element is true -static inline bool horizontal_or(Vec4ib const & a) { -#if INSTRSET >= 5 // SSE4.1 supported. 
Use PTEST - return !_mm_testz_si128(a, a); -#else - return _mm_movemask_epi8(a) != 0; -#endif -} - - -/***************************************************************************** -* -* Operators for Vec4i -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec4i operator + (Vec4i const & a, Vec4i const & b) { - return _mm_add_epi32(a, b); -} - -// vector operator += : add -static inline Vec4i & operator += (Vec4i & a, Vec4i const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec4i operator ++ (Vec4i & a, int) { - Vec4i a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec4i & operator ++ (Vec4i & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec4i operator - (Vec4i const & a, Vec4i const & b) { - return _mm_sub_epi32(a, b); -} - -// vector operator - : unary minus -static inline Vec4i operator - (Vec4i const & a) { - return _mm_sub_epi32(_mm_setzero_si128(), a); -} - -// vector operator -= : subtract -static inline Vec4i & operator -= (Vec4i & a, Vec4i const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec4i operator -- (Vec4i & a, int) { - Vec4i a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec4i & operator -- (Vec4i & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec4i operator * (Vec4i const & a, Vec4i const & b) { -#if INSTRSET >= 5 // SSE4.1 instruction set - return _mm_mullo_epi32(a, b); -#else - __m128i a13 = _mm_shuffle_epi32(a, 0xF5); // (-,a3,-,a1) - __m128i b13 = _mm_shuffle_epi32(b, 0xF5); // (-,b3,-,b1) - __m128i prod02 = _mm_mul_epu32(a, b); // (-,a2*b2,-,a0*b0) - __m128i prod13 = _mm_mul_epu32(a13, b13); // (-,a3*b3,-,a1*b1) - __m128i prod01 = _mm_unpacklo_epi32(prod02,prod13); // (-,-,a1*b1,a0*b0) - __m128i prod23 = _mm_unpackhi_epi32(prod02,prod13); // (-,-,a3*b3,a2*b2) - return _mm_unpacklo_epi64(prod01,prod23); // (ab3,ab2,ab1,ab0) -#endif -} - -// vector operator *= : multiply -static inline Vec4i & operator *= (Vec4i & a, Vec4i const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -// See bottom of file - - -// vector operator << : shift left -static inline Vec4i operator << (Vec4i const & a, int32_t b) { - return _mm_sll_epi32(a,_mm_cvtsi32_si128(b)); -} - -// vector operator <<= : shift left -static inline Vec4i & operator <<= (Vec4i & a, int32_t b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic -static inline Vec4i operator >> (Vec4i const & a, int32_t b) { - return _mm_sra_epi32(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >>= : shift right arithmetic -static inline Vec4i & operator >>= (Vec4i & a, int32_t b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec4ib operator == (Vec4i const & a, Vec4i const & b) { - return _mm_cmpeq_epi32(a, b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec4ib operator != (Vec4i const & a, Vec4i const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec4ib)_mm_comneq_epi32(a,b); -#else // SSE2 instruction set - return Vec4ib(Vec4i (~(a == b))); -#endif -} - -// vector operator > : returns true for elements for which a > b -static inline Vec4ib operator > (Vec4i const & a, Vec4i const & b) { - return 
_mm_cmpgt_epi32(a, b); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec4ib operator < (Vec4i const & a, Vec4i const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec4ib operator >= (Vec4i const & a, Vec4i const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec4ib)_mm_comge_epi32(a,b); -#else // SSE2 instruction set - return Vec4ib(Vec4i (~(b > a))); -#endif -} - -// vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec4ib operator <= (Vec4i const & a, Vec4i const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec4i operator & (Vec4i const & a, Vec4i const & b) { - return Vec4i(Vec128b(a) & Vec128b(b)); -} -static inline Vec4i operator && (Vec4i const & a, Vec4i const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec4i & operator &= (Vec4i & a, Vec4i const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec4i operator | (Vec4i const & a, Vec4i const & b) { - return Vec4i(Vec128b(a) | Vec128b(b)); -} -static inline Vec4i operator || (Vec4i const & a, Vec4i const & b) { - return a | b; -} -// vector operator |= : bitwise and -static inline Vec4i & operator |= (Vec4i & a, Vec4i const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4i operator ^ (Vec4i const & a, Vec4i const & b) { - return Vec4i(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise and -static inline Vec4i & operator ^= (Vec4i & a, Vec4i const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec4i operator ~ (Vec4i const & a) { - return Vec4i( ~ Vec128b(a)); -} - -// vector operator ! : returns true for elements == 0 -static inline Vec4ib operator ! (Vec4i const & a) { - return _mm_cmpeq_epi32(a,_mm_setzero_si128()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec4i select (Vec4ib const & s, Vec4i const & a, Vec4i const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4i if_add (Vec4ib const & f, Vec4i const & a, Vec4i const & b) { - return a + (Vec4i(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec4i const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epi32(a); - __m128i sum2 = _mm_shuffle_epi32(sum1,0x0E); // high element - __m128i sum3 = _mm_add_epi32(sum1,sum2); // sum - return _mm_cvtsi128_si32(sum3); // truncate to 32 bits -#elif INSTRSET >= 4 // SSSE3 - __m128i sum1 = _mm_hadd_epi32(a,a); // horizontally add 4 elements in 2 steps - __m128i sum2 = _mm_hadd_epi32(sum1,sum1); - return _mm_cvtsi128_si32(sum2); // 32 bit sum -#else // SSE2 - __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // 2 high elements - __m128i sum2 = _mm_add_epi32(a,sum1); // 2 sums - __m128i sum3 = _mm_shuffle_epi32(sum2,0x01); // 1 high element - __m128i sum4 = _mm_add_epi32(sum2,sum3); // 2 sums - return _mm_cvtsi128_si32(sum4); // 32 bit sum -#endif -} - -// Horizontal add extended: Calculates the sum of all vector elements. 
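/* Editor's note -- illustrative sketch only, not part of this diff; the helper names are made up.
   A scalar model of select(Vec4ib, Vec4i, Vec4i) and if_add() above. The mask lanes are assumed
   to be exactly 0 or -1, which is what the comparison operators produce; any other value would
   give a meaningless bitwise blend. */
#include <cstdint>

static void select_model(const int32_t s[4], const int32_t a[4],
                         const int32_t b[4], int32_t r[4]) {
    for (int i = 0; i < 4; i++)
        r[i] = (s[i] & a[i]) | (~s[i] & b[i]);       // bitwise blend, as selectb() does
}

static void if_add_model(const int32_t f[4], const int32_t a[4],
                         const int32_t b[4], int32_t r[4]) {
    for (int i = 0; i < 4; i++)
        r[i] = a[i] + (f[i] & b[i]);                 // adds b[i] only where f[i] == -1
}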
-// Elements are sign extended before adding to avoid overflow -static inline int64_t horizontal_add_x (Vec4i const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epi32(a); -#else // SSE2 - __m128i signs = _mm_srai_epi32(a,31); // sign of all elements - __m128i a01 = _mm_unpacklo_epi32(a,signs); // sign-extended a0, a1 - __m128i a23 = _mm_unpackhi_epi32(a,signs); // sign-extended a2, a3 - __m128i sum1 = _mm_add_epi64(a01,a23); // add -#endif - __m128i sum2 = _mm_unpackhi_epi64(sum1,sum1); // high qword - __m128i sum3 = _mm_add_epi64(sum1,sum2); // add -#if defined (__x86_64__) - return _mm_cvtsi128_si64(sum3); // 64 bit mode -#else - union { - __m128i x; // silly definition of _mm_storel_epi64 requires __m128i - int64_t i; - } u; - _mm_storel_epi64(&u.x,sum3); - return u.i; -#endif -} - -// function add_saturated: add element by element, signed with saturation -static inline Vec4i add_saturated(Vec4i const & a, Vec4i const & b) { - __m128i sum = _mm_add_epi32(a, b); // a + b - __m128i axb = _mm_xor_si128(a, b); // check if a and b have different sign - __m128i axs = _mm_xor_si128(a, sum); // check if a and sum have different sign - __m128i overf1 = _mm_andnot_si128(axb,axs); // check if sum has wrong sign - __m128i overf2 = _mm_srai_epi32(overf1,31); // -1 if overflow - __m128i asign = _mm_srli_epi32(a,31); // 1 if a < 0 - __m128i sat1 = _mm_srli_epi32(overf2,1); // 7FFFFFFF if overflow - __m128i sat2 = _mm_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow - return selectb(overf2,sat2,sum); // sum if not overflow, else sat2 -} - -// function sub_saturated: subtract element by element, signed with saturation -static inline Vec4i sub_saturated(Vec4i const & a, Vec4i const & b) { - __m128i diff = _mm_sub_epi32(a, b); // a + b - __m128i axb = _mm_xor_si128(a, b); // check if a and b have different sign - __m128i axs = _mm_xor_si128(a, diff); // check if a and sum have different sign - __m128i overf1 = _mm_and_si128(axb,axs); // check if sum has wrong sign - __m128i overf2 = _mm_srai_epi32(overf1,31); // -1 if overflow - __m128i asign = _mm_srli_epi32(a,31); // 1 if a < 0 - __m128i sat1 = _mm_srli_epi32(overf2,1); // 7FFFFFFF if overflow - __m128i sat2 = _mm_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow - return selectb(overf2,sat2,diff); // diff if not overflow, else sat2 -} - -// function max: a > b ? a : b -static inline Vec4i max(Vec4i const & a, Vec4i const & b) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_max_epi32(a,b); -#else - __m128i greater = _mm_cmpgt_epi32(a,b); - return selectb(greater,a,b); -#endif -} - -// function min: a < b ? a : b -static inline Vec4i min(Vec4i const & a, Vec4i const & b) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_min_epi32(a,b); -#else - __m128i greater = _mm_cmpgt_epi32(a,b); - return selectb(greater,b,a); -#endif -} - -// function abs: a >= 0 ? 
a : -a -static inline Vec4i abs(Vec4i const & a) { -#if INSTRSET >= 4 // SSSE3 supported - return _mm_sign_epi32(a,a); -#else // SSE2 - __m128i sign = _mm_srai_epi32(a,31); // sign of a - __m128i inv = _mm_xor_si128(a,sign); // invert bits if negative - return _mm_sub_epi32(inv,sign); // add 1 -#endif -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec4i abs_saturated(Vec4i const & a) { - __m128i absa = abs(a); // abs(a) - __m128i overfl = _mm_srai_epi32(absa,31); // sign - return _mm_add_epi32(absa,overfl); // subtract 1 if 0x80000000 -} - -// function rotate_left all elements -// Use negative count to rotate right -static inline Vec4i rotate_left(Vec4i const & a, int b) { -#ifdef __AVX512VL__ - return _mm_rolv_epi32(a, _mm_set1_epi32(b)); -#elif defined __XOP__ // AMD XOP instruction set - return _mm_rot_epi32(a,_mm_set1_epi32(b)); -#else // SSE2 instruction set - __m128i left = _mm_sll_epi32(a,_mm_cvtsi32_si128(b & 0x1F)); // a << b - __m128i right = _mm_srl_epi32(a,_mm_cvtsi32_si128((32-b) & 0x1F)); // a >> (32 - b) - __m128i rot = _mm_or_si128(left,right); // or - return rot; -#endif -} - - -/***************************************************************************** -* -* Vector of 4 32-bit unsigned integers -* -*****************************************************************************/ - -class Vec4ui : public Vec4i { -public: - // Default constructor: - Vec4ui() { - } - // Constructor to broadcast the same value into all elements: - Vec4ui(uint32_t i) { - xmm = _mm_set1_epi32(i); - } - // Constructor to build from all elements: - Vec4ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3) { - xmm = _mm_setr_epi32(i0, i1, i2, i3); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec4ui(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec4ui & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Member function to load from array (unaligned) - Vec4ui & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array (aligned) - Vec4ui & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4ui const & insert(uint32_t index, uint32_t value) { - Vec4i::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint32_t extract(uint32_t index) const { - return Vec4i::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
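/* Editor's note -- illustrative sketch only, not part of this diff; the helper name is made up.
   A scalar model of the SSE2 fallback in rotate_left(Vec4i, b) above: the rotate is composed
   from a left shift and a complementary right shift, with both counts masked to 0..31 so that
   b == 0 (and negative b, meaning rotate right) behave correctly. */
#include <cstdint>

static uint32_t rotate_left_model(uint32_t a, int b) {
    uint32_t left  = a << (b & 0x1F);                // a << b
    uint32_t right = a >> ((32 - b) & 0x1F);         // a >> (32 - b)
    return left | right;
}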
- uint32_t operator [] (uint32_t index) const { - return extract(index); - } -}; - -// Define operators for this class - -// vector operator + : add -static inline Vec4ui operator + (Vec4ui const & a, Vec4ui const & b) { - return Vec4ui (Vec4i(a) + Vec4i(b)); -} - -// vector operator - : subtract -static inline Vec4ui operator - (Vec4ui const & a, Vec4ui const & b) { - return Vec4ui (Vec4i(a) - Vec4i(b)); -} - -// vector operator * : multiply -static inline Vec4ui operator * (Vec4ui const & a, Vec4ui const & b) { - return Vec4ui (Vec4i(a) * Vec4i(b)); -} - -// vector operator / : divide -// See bottom of file - -// vector operator >> : shift right logical all elements -static inline Vec4ui operator >> (Vec4ui const & a, uint32_t b) { - return _mm_srl_epi32(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec4ui operator >> (Vec4ui const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right logical -static inline Vec4ui & operator >>= (Vec4ui & a, int b) { - a = a >> b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec4ui operator << (Vec4ui const & a, uint32_t b) { - return Vec4ui ((Vec4i)a << (int32_t)b); -} - -// vector operator << : shift left all elements -static inline Vec4ui operator << (Vec4ui const & a, int32_t b) { - return Vec4ui ((Vec4i)a << (int32_t)b); -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec4ib operator > (Vec4ui const & a, Vec4ui const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec4ib)_mm_comgt_epu32(a,b); -#else // SSE2 instruction set - __m128i signbit = _mm_set1_epi32(0x80000000); - __m128i a1 = _mm_xor_si128(a,signbit); - __m128i b1 = _mm_xor_si128(b,signbit); - return (Vec4ib)_mm_cmpgt_epi32(a1,b1); // signed compare -#endif -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec4ib operator < (Vec4ui const & a, Vec4ui const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec4ib operator >= (Vec4ui const & a, Vec4ui const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return (Vec4ib)_mm_comge_epu32(a,b); -#elif INSTRSET >= 5 // SSE4.1 - __m128i max_ab = _mm_max_epu32(a,b); // max(a,b), unsigned - return (Vec4ib)_mm_cmpeq_epi32(a,max_ab); // a == max(a,b) -#else // SSE2 instruction set - return Vec4ib(Vec4i (~(b > a))); -#endif -} - -// vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec4ib operator <= (Vec4ui const & a, Vec4ui const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec4ui operator & (Vec4ui const & a, Vec4ui const & b) { - return Vec4ui(Vec128b(a) & Vec128b(b)); -} -static inline Vec4ui operator && (Vec4ui const & a, Vec4ui const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec4ui operator | (Vec4ui const & a, Vec4ui const & b) { - return Vec4ui(Vec128b(a) | Vec128b(b)); -} -static inline Vec4ui operator || (Vec4ui const & a, Vec4ui const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec4ui operator ^ (Vec4ui const & a, Vec4ui const & b) { - return Vec4ui(Vec128b(a) ^ Vec128b(b)); -} - -// vector operator ~ : bitwise not -static inline Vec4ui operator ~ (Vec4ui const & a) { - return Vec4ui( ~ Vec128b(a)); -} - -// Functions for this class - -// Select between two operands. 
Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; -// Each word in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec4ui select (Vec4ib const & s, Vec4ui const & a, Vec4ui const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4ui if_add (Vec4ib const & f, Vec4ui const & a, Vec4ui const & b) { - return a + (Vec4ui(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint32_t horizontal_add (Vec4ui const & a) { - return horizontal_add((Vec4i)a); -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Elements are zero extended before adding to avoid overflow -static inline uint64_t horizontal_add_x (Vec4ui const & a) { -#ifdef __XOP__ // AMD XOP instruction set - __m128i sum1 = _mm_haddq_epu32(a); -#else // SSE2 - __m128i zero = _mm_setzero_si128(); // 0 - __m128i a01 = _mm_unpacklo_epi32(a,zero); // zero-extended a0, a1 - __m128i a23 = _mm_unpackhi_epi32(a,zero); // zero-extended a2, a3 - __m128i sum1 = _mm_add_epi64(a01,a23); // add -#endif - __m128i sum2 = _mm_unpackhi_epi64(sum1,sum1); // high qword - __m128i sum3 = _mm_add_epi64(sum1,sum2); // add -#if defined(_M_AMD64) || defined(_M_X64) || defined(__x86_64__) || defined(__amd64) - return _mm_cvtsi128_si64(sum3); // 64 bit mode -#else - union { - __m128i x; // silly definition of _mm_storel_epi64 requires __m128i - uint64_t i; - } u; - _mm_storel_epi64(&u.x,sum3); - return u.i; -#endif -} - -// function add_saturated: add element by element, unsigned with saturation -static inline Vec4ui add_saturated(Vec4ui const & a, Vec4ui const & b) { - Vec4ui sum = a + b; - Vec4ui aorb = Vec4ui(a | b); - Vec4ui overflow = Vec4ui(sum < aorb); // overflow if a + b < (a | b) - return Vec4ui (sum | overflow); // return 0xFFFFFFFF if overflow -} - -// function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec4ui sub_saturated(Vec4ui const & a, Vec4ui const & b) { - Vec4ui diff = a - b; - Vec4ui underflow = Vec4ui(diff > a); // underflow if a - b > a - return _mm_andnot_si128(underflow,diff); // return 0 if underflow -} - -// function max: a > b ? a : b -static inline Vec4ui max(Vec4ui const & a, Vec4ui const & b) { -#if INSTRSET >= 5 // SSE4.1 - return _mm_max_epu32(a,b); -#else // SSE2 - return select(a > b, a, b); -#endif -} - -// function min: a < b ? a : b -static inline Vec4ui min(Vec4ui const & a, Vec4ui const & b) { -#if INSTRSET >= 5 // SSE4.1 - return _mm_min_epu32(a,b); -#else // SSE2 - return select(a > b, b, a); -#endif -} - - -/***************************************************************************** -* -* Vector of 2 64-bit signed integers -* -*****************************************************************************/ - -class Vec2q : public Vec128b { -public: - // Default constructor: - Vec2q() { - } - // Constructor to broadcast the same value into all elements: - Vec2q(int64_t i) { -#if defined (_MSC_VER) && _MSC_VER < 1900 && ! 
defined(__INTEL_COMPILER) - // MS compiler has no _mm_set1_epi64x in 32 bit mode -#if defined(__x86_64__) // 64 bit mode -#if _MSC_VER < 1700 - __m128i x1 = _mm_cvtsi64_si128(i); // 64 bit load - xmm = _mm_unpacklo_epi64(x1,x1); // broadcast -#else - xmm = _mm_set1_epi64x(i); -#endif -#else - union { - int64_t q[2]; - int32_t r[4]; - } u; - u.q[0] = u.q[1] = i; - xmm = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]); - /* // this will use an mm register and produce store forwarding stall: - union { - __m64 m; - int64_t ii; - } u; - u.ii = i; - xmm = _mm_set1_epi64(u.m); - _m_empty(); */ - -#endif // __x86_64__ -#else // Other compilers - xmm = _mm_set1_epi64x(i); -#endif - } - // Constructor to build from all elements: - Vec2q(int64_t i0, int64_t i1) { -#if defined (_MSC_VER) && _MSC_VER < 1900 && ! defined(__INTEL_COMPILER) - // MS compiler has no _mm_set_epi64x in 32 bit mode -#if defined(__x86_64__) // 64 bit mode -#if _MSC_VER < 1700 - __m128i x0 = _mm_cvtsi64_si128(i0); // 64 bit load - __m128i x1 = _mm_cvtsi64_si128(i1); // 64 bit load - xmm = _mm_unpacklo_epi64(x0,x1); // combine -#else - xmm = _mm_set_epi64x(i1, i0); -#endif -#else // MS compiler in 32-bit mode - union { - int64_t q[2]; - int32_t r[4]; - } u; - u.q[0] = i0; u.q[1] = i1; - // this is inefficient, but other solutions are worse - xmm = _mm_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3]); -#endif // __x86_64__ -#else // Other compilers - xmm = _mm_set_epi64x(i1, i0); -#endif - } - // Constructor to convert from type __m128i used in intrinsics: - Vec2q(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec2q & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Type cast operator to convert to __m128i used in intrinsics - operator __m128i() const { - return xmm; - } - // Member function to load from array (unaligned) - Vec2q & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array (aligned) - Vec2q & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec2q & load_partial(int n, void const * p) { - switch (n) { - case 0: - *this = 0; break; - case 1: - // intrinsic for movq is missing! - *this = Vec2q(*(int64_t const*)p, 0); break; - case 2: - load(p); break; - default: - break; - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, void * p) const { - switch (n) { - case 1: - int64_t q[2]; - store(q); - *(int64_t*)p = q[0]; break; - case 2: - store(p); break; - default: - break; - } - } - // cut off vector to n elements. The last 2-n elements are set to zero - Vec2q & cutoff(int n) { - *this = Vec16c(xmm).cutoff(n * 8); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec2q const & insert(uint32_t index, int64_t value) { -#if INSTRSET >= 5 && defined(__x86_64__) // SSE4.1 supported, 64 bit mode - if (index == 0) { - xmm = _mm_insert_epi64(xmm,value,0); - } - else { - xmm = _mm_insert_epi64(xmm,value,1); - } - -#else // SSE2 -#if defined(__x86_64__) // 64 bit mode - __m128i v = _mm_cvtsi64_si128(value); // 64 bit load -#else - union { - __m128i m; - int64_t ii; - } u; - u.ii = value; - __m128i v = _mm_loadl_epi64(&u.m); -#endif - if (index == 0) { - v = _mm_unpacklo_epi64(v,v); - xmm = _mm_unpackhi_epi64(v,xmm); - } - else { // index = 1 - xmm = _mm_unpacklo_epi64(xmm,v); - } -#endif - return *this; - } - // Member function extract a single element from vector - int64_t extract(uint32_t index) const { - int64_t x[2]; - store(x); - return x[index & 1]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - int64_t operator [] (uint32_t index) const { - return extract(index); - } - static int size() { - return 2; - } -}; - -/***************************************************************************** -* -* Vec2qb: Vector of 2 Booleans for use with Vec2q and Vec2uq -* -*****************************************************************************/ -// Definition will be different for the AVX512 instruction set -class Vec2qb : public Vec2q { -public: - // Default constructor: - Vec2qb() { - } - // Constructor to build from all elements: - Vec2qb(bool x0, bool x1) { - xmm = Vec2q(-int64_t(x0), -int64_t(x1)); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec2qb(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec2qb & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec2qb(bool b) : Vec2q(-int64_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec2qb & operator = (bool b) { - *this = Vec2qb(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec2qb(int b); - Vec2qb & operator = (int x); -public: - Vec2qb & insert (int index, bool a) { - Vec2q::insert(index, -(int64_t)a); - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec2q::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec2qb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec2qb operator & (Vec2qb const & a, Vec2qb const & b) { - return Vec2qb(Vec128b(a) & Vec128b(b)); -} -static inline Vec2qb operator && (Vec2qb const & a, Vec2qb const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec2qb & operator &= (Vec2qb & a, Vec2qb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec2qb operator | (Vec2qb const & a, Vec2qb const & b) { - return Vec2qb(Vec128b(a) | Vec128b(b)); -} -static inline Vec2qb operator || (Vec2qb const & a, Vec2qb const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec2qb & operator |= (Vec2qb & a, Vec2qb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec2qb operator ^ (Vec2qb const & a, Vec2qb const & b) { - return Vec2qb(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec2qb & operator ^= (Vec2qb & a, Vec2qb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec2qb operator ~ (Vec2qb const & a) { - return Vec2qb( ~ Vec128b(a)); -} - -// vector operator ! : element not -static inline Vec2qb operator ! (Vec2qb const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec2qb andnot (Vec2qb const & a, Vec2qb const & b) { - return Vec2qb(andnot(Vec128b(a), Vec128b(b))); -} - -// Horizontal Boolean functions for Vec2qb - -// horizontal_and. Returns true if all elements are true -static inline bool horizontal_and(Vec2qb const & a) { - return _mm_movemask_epi8(a) == 0xFFFF; -} - -// horizontal_or. Returns true if at least one element is true -static inline bool horizontal_or(Vec2qb const & a) { -#if INSTRSET >= 5 // SSE4.1 supported. 
Use PTEST - return !_mm_testz_si128(a, a); -#else - return _mm_movemask_epi8(a) != 0; -#endif -} - - -/***************************************************************************** -* -* Operators for Vec2q -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec2q operator + (Vec2q const & a, Vec2q const & b) { - return _mm_add_epi64(a, b); -} - -// vector operator += : add -static inline Vec2q & operator += (Vec2q & a, Vec2q const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec2q operator ++ (Vec2q & a, int) { - Vec2q a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec2q & operator ++ (Vec2q & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec2q operator - (Vec2q const & a, Vec2q const & b) { - return _mm_sub_epi64(a, b); -} - -// vector operator - : unary minus -static inline Vec2q operator - (Vec2q const & a) { - return _mm_sub_epi64(_mm_setzero_si128(), a); -} - -// vector operator -= : subtract -static inline Vec2q & operator -= (Vec2q & a, Vec2q const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec2q operator -- (Vec2q & a, int) { - Vec2q a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec2q & operator -- (Vec2q & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec2q operator * (Vec2q const & a, Vec2q const & b) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return _mm_mullo_epi64(a, b); -#elif INSTRSET >= 5 // SSE4.1 supported - // instruction does not exist. Split into 32-bit multiplies - __m128i bswap = _mm_shuffle_epi32(b,0xB1); // b0H,b0L,b1H,b1L (swap H<->L) - __m128i prodlh = _mm_mullo_epi32(a,bswap); // a0Lb0H,a0Hb0L,a1Lb1H,a1Hb1L, 32 bit L*H products - __m128i zero = _mm_setzero_si128(); // 0 - __m128i prodlh2 = _mm_hadd_epi32(prodlh,zero); // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0 - __m128i prodlh3 = _mm_shuffle_epi32(prodlh2,0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L - __m128i prodll = _mm_mul_epu32(a,b); // a0Lb0L,a1Lb1L, 64 bit unsigned products - __m128i prod = _mm_add_epi64(prodll,prodlh3); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32 - return prod; -#else // SSE2 - int64_t aa[2], bb[2]; - a.store(aa); // split into elements - b.store(bb); - return Vec2q(aa[0]*bb[0], aa[1]*bb[1]); // multiply elements separetely -#endif -} - -// vector operator *= : multiply -static inline Vec2q & operator *= (Vec2q & a, Vec2q const & b) { - a = a * b; - return a; -} - -// vector operator << : shift left -static inline Vec2q operator << (Vec2q const & a, int32_t b) { - return _mm_sll_epi64(a,_mm_cvtsi32_si128(b)); -} - -// vector operator <<= : shift left -static inline Vec2q & operator <<= (Vec2q & a, int32_t b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic -static inline Vec2q operator >> (Vec2q const & a, int32_t b) { - // instruction does not exist. 
Split into 32-bit shifts - if (b <= 32) { - __m128i bb = _mm_cvtsi32_si128(b); // b - __m128i sra = _mm_sra_epi32(a,bb); // a >> b signed dwords - __m128i srl = _mm_srl_epi64(a,bb); // a >> b unsigned qwords - __m128i mask = _mm_setr_epi32(0,-1,0,-1); // mask for signed high part - return selectb(mask,sra,srl); - } - else { // b > 32 - __m128i bm32 = _mm_cvtsi32_si128(b-32); // b - 32 - __m128i sign = _mm_srai_epi32(a,31); // sign of a - __m128i sra2 = _mm_sra_epi32(a,bm32); // a >> (b-32) signed dwords - __m128i sra3 = _mm_srli_epi64(sra2,32); // a >> (b-32) >> 32 (second shift unsigned qword) - __m128i mask = _mm_setr_epi32(0,-1,0,-1); // mask for high part containing only sign - return selectb(mask,sign,sra3); - } -} - -// vector operator >>= : shift right arithmetic -static inline Vec2q & operator >>= (Vec2q & a, int32_t b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec2qb operator == (Vec2q const & a, Vec2q const & b) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_cmpeq_epi64(a, b); -#else // SSE2 - // no 64 compare instruction. Do two 32 bit compares - __m128i com32 = _mm_cmpeq_epi32(a,b); // 32 bit compares - __m128i com32s = _mm_shuffle_epi32(com32,0xB1); // swap low and high dwords - __m128i test = _mm_and_si128(com32,com32s); // low & high - __m128i teste = _mm_srai_epi32(test,31); // extend sign bit to 32 bits - __m128i testee = _mm_shuffle_epi32(teste,0xF5); // extend sign bit to 64 bits - return Vec2qb(Vec2q(testee)); -#endif -} - -// vector operator != : returns true for elements for which a != b -static inline Vec2qb operator != (Vec2q const & a, Vec2q const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return Vec2qb(_mm_comneq_epi64(a,b)); -#else // SSE2 instruction set - return Vec2qb(Vec2q(~(a == b))); -#endif -} - -// vector operator < : returns true for elements for which a < b -static inline Vec2qb operator < (Vec2q const & a, Vec2q const & b) { -#if INSTRSET >= 6 // SSE4.2 supported - return Vec2qb(Vec2q(_mm_cmpgt_epi64(b, a))); -#else // SSE2 - // no 64 compare instruction. 
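// [Editorial sketch - not part of the original vectorclass source; the helper name is assumed.]
// The SSE2 branch of operator < that follows has no 64-bit compare, so it reasons as follows:
// when a and b share a sign, a - b cannot overflow and its sign bit already answers a < b;
// when the signs differ, a < b exactly when a is negative. In scalar form:
static inline bool less_than_i64_sketch(int64_t a, int64_t b) {
    const uint64_t s   = (uint64_t)a - (uint64_t)b;    // a - b, wrapping instead of overflowing
    const uint64_t axb = (uint64_t)a ^ (uint64_t)b;    // sign bit set when the signs differ
    const uint64_t anb = (uint64_t)a & ~(uint64_t)b;   // sign bit set when a < 0 and b >= 0
    return ((anb | (s & ~axb)) >> 63) != 0;            // the same combination the vector code builds
}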
Subtract - __m128i s = _mm_sub_epi64(a,b); // a-b - // a < b if a and b have same sign and s < 0 or (a < 0 and b >= 0) - // The latter () corrects for overflow - __m128i axb = _mm_xor_si128(a,b); // a ^ b - __m128i anb = _mm_andnot_si128(b,a); // a & ~b - __m128i snaxb = _mm_andnot_si128(axb,s); // s & ~(a ^ b) - __m128i or1 = _mm_or_si128(anb,snaxb); // (a & ~b) | (s & ~(a ^ b)) - __m128i teste = _mm_srai_epi32(or1,31); // extend sign bit to 32 bits - __m128i testee = _mm_shuffle_epi32(teste,0xF5); // extend sign bit to 64 bits - return testee; -#endif -} - -// vector operator > : returns true for elements for which a > b -static inline Vec2qb operator > (Vec2q const & a, Vec2q const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec2qb operator >= (Vec2q const & a, Vec2q const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return Vec2qb(_mm_comge_epi64(a,b)); -#else // SSE2 instruction set - return Vec2qb(Vec2q(~(a < b))); -#endif -} - -// vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec2qb operator <= (Vec2q const & a, Vec2q const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec2q operator & (Vec2q const & a, Vec2q const & b) { - return Vec2q(Vec128b(a) & Vec128b(b)); -} -static inline Vec2q operator && (Vec2q const & a, Vec2q const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec2q & operator &= (Vec2q & a, Vec2q const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec2q operator | (Vec2q const & a, Vec2q const & b) { - return Vec2q(Vec128b(a) | Vec128b(b)); -} -static inline Vec2q operator || (Vec2q const & a, Vec2q const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec2q & operator |= (Vec2q & a, Vec2q const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec2q operator ^ (Vec2q const & a, Vec2q const & b) { - return Vec2q(Vec128b(a) ^ Vec128b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec2q & operator ^= (Vec2q & a, Vec2q const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec2q operator ~ (Vec2q const & a) { - return Vec2q( ~ Vec128b(a)); -} - -// vector operator ! : logical not, returns true for elements == 0 -static inline Vec2qb operator ! (Vec2q const & a) { - return a == Vec2q(_mm_setzero_si128()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec2q select (Vec2qb const & s, Vec2q const & a, Vec2q const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec2q if_add (Vec2qb const & f, Vec2q const & a, Vec2q const & b) { - return a + (Vec2q(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. 
-// Overflow will wrap around -static inline int64_t horizontal_add (Vec2q const & a) { - __m128i sum1 = _mm_shuffle_epi32(a,0x0E); // high element - __m128i sum2 = _mm_add_epi64(a,sum1); // sum -#if defined(__x86_64__) - return _mm_cvtsi128_si64(sum2); // 64 bit mode -#else - union { - __m128i x; // silly definition of _mm_storel_epi64 requires __m128i - int64_t i; - } u; - _mm_storel_epi64(&u.x,sum2); - return u.i; -#endif -} - -// function max: a > b ? a : b -static inline Vec2q max(Vec2q const & a, Vec2q const & b) { - return select(a > b, a, b); -} - -// function min: a < b ? a : b -static inline Vec2q min(Vec2q const & a, Vec2q const & b) { - return select(a < b, a, b); -} - -// function abs: a >= 0 ? a : -a -static inline Vec2q abs(Vec2q const & a) { -#if INSTRSET >= 6 // SSE4.2 supported - __m128i sign = _mm_cmpgt_epi64(_mm_setzero_si128(),a);// 0 > a -#else // SSE2 - __m128i signh = _mm_srai_epi32(a,31); // sign in high dword - __m128i sign = _mm_shuffle_epi32(signh,0xF5); // copy sign to low dword -#endif - __m128i inv = _mm_xor_si128(a,sign); // invert bits if negative - return _mm_sub_epi64(inv,sign); // add 1 -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec2q abs_saturated(Vec2q const & a) { - __m128i absa = abs(a); // abs(a) -#if INSTRSET >= 6 // SSE4.2 supported - __m128i overfl = _mm_cmpgt_epi64(_mm_setzero_si128(),absa);// 0 > a -#else // SSE2 - __m128i signh = _mm_srai_epi32(absa,31); // sign in high dword - __m128i overfl= _mm_shuffle_epi32(signh,0xF5); // copy sign to low dword -#endif - return _mm_add_epi64(absa,overfl); // subtract 1 if 0x8000000000000000 -} - -// function rotate_left all elements -// Use negative count to rotate right -static inline Vec2q rotate_left(Vec2q const & a, int b) { -#ifdef __AVX512VL__ - return _mm_rolv_epi64(a, _mm_set1_epi64x(int64_t(b))); -#elif defined __XOP__ // AMD XOP instruction set - return (Vec2q)_mm_rot_epi64(a,Vec2q(b)); -#else // SSE2 instruction set - __m128i left = _mm_sll_epi64(a,_mm_cvtsi32_si128(b & 0x3F)); // a << b - __m128i right = _mm_srl_epi64(a,_mm_cvtsi32_si128((64-b) & 0x3F)); // a >> (64 - b) - __m128i rot = _mm_or_si128(left,right); // or - return (Vec2q)rot; -#endif -} - - -/***************************************************************************** -* -* Vector of 2 64-bit unsigned integers -* -*****************************************************************************/ - -class Vec2uq : public Vec2q { -public: - // Default constructor: - Vec2uq() { - } - // Constructor to broadcast the same value into all elements: - Vec2uq(uint64_t i) { - xmm = Vec2q(i); - } - // Constructor to build from all elements: - Vec2uq(uint64_t i0, uint64_t i1) { - xmm = Vec2q(i0, i1); - } - // Constructor to convert from type __m128i used in intrinsics: - Vec2uq(__m128i const & x) { - xmm = x; - } - // Assignment operator to convert from type __m128i used in intrinsics: - Vec2uq & operator = (__m128i const & x) { - xmm = x; - return *this; - } - // Member function to load from array (unaligned) - Vec2uq & load(void const * p) { - xmm = _mm_loadu_si128((__m128i const*)p); - return *this; - } - // Member function to load from array (aligned) - Vec2uq & load_a(void const * p) { - xmm = _mm_load_si128((__m128i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. 
Use load function if changing more than one element - Vec2uq const & insert(uint32_t index, uint64_t value) { - Vec2q::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint64_t extract(uint32_t index) const { - return Vec2q::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - uint64_t operator [] (uint32_t index) const { - return extract(index); - } -}; - -// Define operators for this class - -// vector operator + : add -static inline Vec2uq operator + (Vec2uq const & a, Vec2uq const & b) { - return Vec2uq (Vec2q(a) + Vec2q(b)); -} - -// vector operator - : subtract -static inline Vec2uq operator - (Vec2uq const & a, Vec2uq const & b) { - return Vec2uq (Vec2q(a) - Vec2q(b)); -} - -// vector operator * : multiply element by element -static inline Vec2uq operator * (Vec2uq const & a, Vec2uq const & b) { - return Vec2uq (Vec2q(a) * Vec2q(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec2uq operator >> (Vec2uq const & a, uint32_t b) { - return _mm_srl_epi64(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec2uq operator >> (Vec2uq const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right logical -static inline Vec2uq & operator >>= (Vec2uq & a, int b) { - a = a >> b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec2uq operator << (Vec2uq const & a, uint32_t b) { - return Vec2uq ((Vec2q)a << (int32_t)b); -} - -// vector operator << : shift left all elements -static inline Vec2uq operator << (Vec2uq const & a, int32_t b) { - return Vec2uq ((Vec2q)a << b); -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec2qb operator > (Vec2uq const & a, Vec2uq const & b) { -#if defined ( __XOP__ ) // AMD XOP instruction set - return Vec2qb(_mm_comgt_epu64(a,b)); -#elif INSTRSET >= 6 // SSE4.2 - __m128i sign64 = constant4ui<0,0x80000000,0,0x80000000>(); - __m128i aflip = _mm_xor_si128(a, sign64); // flip sign bits to use signed compare - __m128i bflip = _mm_xor_si128(b, sign64); - Vec2q cmp = _mm_cmpgt_epi64(aflip,bflip); - return Vec2qb(cmp); -#else // SSE2 instruction set - __m128i sign32 = _mm_set1_epi32(0x80000000); // sign bit of each dword - __m128i aflip = _mm_xor_si128(a,sign32); // a with sign bits flipped to use signed compare - __m128i bflip = _mm_xor_si128(b,sign32); // b with sign bits flipped to use signed compare - __m128i equal = _mm_cmpeq_epi32(a,b); // a == b, dwords - __m128i bigger = _mm_cmpgt_epi32(aflip,bflip); // a > b, dwords - __m128i biggerl = _mm_shuffle_epi32(bigger,0xA0); // a > b, low dwords copied to high dwords - __m128i eqbig = _mm_and_si128(equal,biggerl); // high part equal and low part bigger - __m128i hibig = _mm_or_si128(bigger,eqbig); // high part bigger or high part equal and low part bigger - __m128i big = _mm_shuffle_epi32(hibig,0xF5); // result copied to low part - return Vec2qb(Vec2q(big)); -#endif -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec2qb operator < (Vec2uq const & a, Vec2uq const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec2qb operator >= (Vec2uq const & a, Vec2uq const & b) { -#ifdef __XOP__ // AMD XOP instruction set - return 
Vec2qb(_mm_comge_epu64(a,b)); -#else // SSE2 instruction set - return Vec2qb(Vec2q(~(b > a))); -#endif -} - -// vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec2qb operator <= (Vec2uq const & a, Vec2uq const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec2uq operator & (Vec2uq const & a, Vec2uq const & b) { - return Vec2uq(Vec128b(a) & Vec128b(b)); -} -static inline Vec2uq operator && (Vec2uq const & a, Vec2uq const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec2uq operator | (Vec2uq const & a, Vec2uq const & b) { - return Vec2uq(Vec128b(a) | Vec128b(b)); -} -static inline Vec2uq operator || (Vec2uq const & a, Vec2uq const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec2uq operator ^ (Vec2uq const & a, Vec2uq const & b) { - return Vec2uq(Vec128b(a) ^ Vec128b(b)); -} - -// vector operator ~ : bitwise not -static inline Vec2uq operator ~ (Vec2uq const & a) { - return Vec2uq( ~ Vec128b(a)); -} - - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i]; -// Each word in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec2uq select (Vec2qb const & s, Vec2uq const & a, Vec2uq const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec2uq if_add (Vec2qb const & f, Vec2uq const & a, Vec2uq const & b) { - return a + (Vec2uq(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint64_t horizontal_add (Vec2uq const & a) { - return horizontal_add((Vec2q)a); -} - -// function max: a > b ? a : b -static inline Vec2uq max(Vec2uq const & a, Vec2uq const & b) { - return select(a > b, a, b); -} - -// function min: a < b ? a : b -static inline Vec2uq min(Vec2uq const & a, Vec2uq const & b) { - return select(a > b, b, a); -} - - -/***************************************************************************** -* -* Vector permute functions -* -****************************************************************************** -* -* These permute functions can reorder the elements of a vector and optionally -* set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select. A negative index will generate zero. an index of -256 means don't care. -* -* Example: -* Vec4i a(10,11,12,13); // a is (10,11,12,13) -* Vec4i b, c; -* b = permute4i<0,0,2,2>(a); // b is (10,10,12,12) -* c = permute4i<3,2,-1,-1>(a); // c is (13,12, 0, 0) -* -* The permute functions for vectors of 8-bit integers are inefficient if -* the SSSE3 instruction set or later is not enabled. -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. 
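*
* A scalar reference model of these index semantics (editorial sketch with assumed names,
* not part of the original source); the template arguments are compile-time constants in
* the range 0..3, or -1, or -256:
*
*   void ref_permute4(const int32_t src[4], const int idx[4], int32_t out[4]) {
*       for (int j = 0; j < 4; j++)
*           out[j] = (idx[j] >= 0) ? src[idx[j]] : 0;  // -1 gives 0; -256 ("don't care")
*   }                                                  // may give anything, 0 included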
-*****************************************************************************/ - -template -static inline Vec2q permute2q(Vec2q const & a) { - if (i0 == 0) { - if (i1 == 0) { // 0,0 - return _mm_unpacklo_epi64(a, a); - } - else if (i1 == 1 || i1 == -0x100) { // 0,1 - return a; - } - else { // 0,-1 - // return _mm_mov_epi64(a); // doesn't work with MS VS 2008 - return _mm_and_si128(a, constant4i<-1,-1,0,0>()); - } - } - else if (i0 == 1) { - if (i1 == 0) { // 1,0 - return _mm_shuffle_epi32(a, 0x4E); - } - else if (i1 == 1) { // 1,1 - return _mm_unpackhi_epi64(a, a); - } - else { // 1,-1 - return _mm_srli_si128(a, 8); - } - } - else { // i0 < 0 - if (i1 == 0) { // -1,0 - return _mm_slli_si128(a, 8); - } - else if (i1 == 1) { // -1,1 - if (i0 == -0x100) return a; - return _mm_and_si128(a, constant4i<0,0,-1,-1>()); - } - else { // -1,-1 - return _mm_setzero_si128(); - } - } -} - -template -static inline Vec2uq permute2uq(Vec2uq const & a) { - return Vec2uq (permute2q ((__m128i)a)); -} - -// permute vector Vec4i -template -static inline Vec4i permute4i(Vec4i const & a) { - - // Combine all the indexes into a single bitfield, with 4 bits for each - const uint32_t m1 = (i0&3) | (i1&3)<<4 | (i2&3)<<8 | (i3&3)<<12; - - // Mask to zero out negative indexes - const uint32_t mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12; - - // Mask indicating required zeroing of all indexes, with 4 bits for each, 0 for index = -1, 0xF for index >= 0 or -256 - const uint32_t ssz = ((i0 & 0x80) ? 0 : 0xF) | ((i1 & 0x80) ? 0 : 0xF) << 4 | ((i2 & 0x80) ? 0 : 0xF) << 8 | ((i3 & 0x80) ? 0 : 0xF) << 12; - - // Mask indicating 0 for don't care, 0xF for non-negative value of required zeroing - const uint32_t md = mz | ~ ssz; - - // Test if permutation needed - const bool do_shuffle = ((m1 ^ 0x00003210) & mz) != 0; - - // is zeroing needed - const bool do_zero = (ssz != 0xFFFF); - - if (mz == 0) { - return _mm_setzero_si128(); // special case: all zero or don't care - } - // Test if we can do with 64-bit permute only - if ((m1 & 0x0101 & mz) == 0 // even indexes are even or negative - && (~m1 & 0x1010 & mz) == 0 // odd indexes are odd or negative - && ((m1 ^ ((m1 + 0x0101) << 4)) & 0xF0F0 & mz & (mz << 4)) == 0 // odd index == preceding even index +1 or at least one of them negative - && ((mz ^ (mz << 4)) & 0xF0F0 & md & md << 4) == 0) { // each pair of indexes are both negative or both positive or one of them don't care - const int j0 = i0 >= 0 ? i0 / 2 : (i0 & 0x80) ? i0 : i1 >= 0 ? i1/2 : i1; - const int j1 = i2 >= 0 ? i2 / 2 : (i2 & 0x80) ? i2 : i3 >= 0 ? i3/2 : i3; - return Vec4i(permute2q (Vec2q(a))); // 64 bit permute - } -#if INSTRSET >= 4 // SSSE3 - if (do_shuffle && do_zero) { - // With SSSE3 we can do both with the PSHUFB instruction - const int j0 = (i0 & 3) << 2; - const int j1 = (i1 & 3) << 2; - const int j2 = (i2 & 3) << 2; - const int j3 = (i3 & 3) << 2; - __m128i mask1 = constant4i < - i0 < 0 ? -1 : j0 | (j0+1)<<8 | (j0+2)<<16 | (j0+3) << 24, - i1 < 0 ? -1 : j1 | (j1+1)<<8 | (j1+2)<<16 | (j1+3) << 24, - i2 < 0 ? -1 : j2 | (j2+1)<<8 | (j2+2)<<16 | (j2+3) << 24, - i3 < 0 ? 
-1 : j3 | (j3+1)<<8 | (j3+2)<<16 | (j3+3) << 24 > (); - return _mm_shuffle_epi8(a,mask1); - } -#endif - __m128i t1; - - if (do_shuffle) { // permute - t1 = _mm_shuffle_epi32(a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6); - } - else { - t1 = a; - } - if (do_zero) { // set some elements to zero - __m128i mask2 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >(); - t1 = _mm_and_si128(t1,mask2); - } - return t1; -} - -template -static inline Vec4ui permute4ui(Vec4ui const & a) { - return Vec4ui (permute4i (a)); -} - -template -static inline Vec8s permute8s(Vec8s const & a) { - if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) { - return _mm_setzero_si128(); // special case: all zero - } -#if INSTRSET >= 4 // SSSE3 - - // special case: rotate - if (i0>=0 && i0 < 8 && i1==((i0+1)&7) && i2==((i0+2)&7) && i3==((i0+3)&7) && i4==((i0+4)&7) && i5==((i0+5)&7) && i6==((i0+6)&7) && i7==((i0+7)&7)) { - if (i0 == 0) return a; // do nothing - return _mm_alignr_epi8(a, a, (i0 & 7) * 2); - } - - // General case: Use PSHUFB - const int j0 = i0 < 0 ? 0xFFFF : ( (i0 & 7) * 2 | ((i0 & 7) * 2 + 1) << 8 ); - const int j1 = i1 < 0 ? 0xFFFF : ( (i1 & 7) * 2 | ((i1 & 7) * 2 + 1) << 8 ); - const int j2 = i2 < 0 ? 0xFFFF : ( (i2 & 7) * 2 | ((i2 & 7) * 2 + 1) << 8 ); - const int j3 = i3 < 0 ? 0xFFFF : ( (i3 & 7) * 2 | ((i3 & 7) * 2 + 1) << 8 ); - const int j4 = i4 < 0 ? 0xFFFF : ( (i4 & 7) * 2 | ((i4 & 7) * 2 + 1) << 8 ); - const int j5 = i5 < 0 ? 0xFFFF : ( (i5 & 7) * 2 | ((i5 & 7) * 2 + 1) << 8 ); - const int j6 = i6 < 0 ? 0xFFFF : ( (i6 & 7) * 2 | ((i6 & 7) * 2 + 1) << 8 ); - const int j7 = i7 < 0 ? 0xFFFF : ( (i7 & 7) * 2 | ((i7 & 7) * 2 + 1) << 8 ); - __m128i mask = constant4i < j0 | j1 << 16, j2 | j3 << 16, j4 | j5 << 16, j6 | j7 << 16 > (); - return _mm_shuffle_epi8(a,mask); - -#else // SSE2 has no simple solution. Find the optimal permute method. - // Without proper metaprogramming features, we have to use constant expressions - // and if-statements to make sure these calculations are resolved at compile time. - // All this should produce at most 8 instructions in the final code, depending - // on the template parameters. - - // Temporary vectors - __m128i t1, t2, t3, t4, t5, t6, t7; - - // Combine all the indexes into a single bitfield, with 4 bits for each - const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12 - | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28; - - // Mask to zero out negative indexes - const int m2 = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 - | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - // Test if we can do without permute - const bool case0 = ((m1 ^ 0x76543210) & m2) == 0; // all indexes point to their own place or negative - - // Test if we can do with 32-bit permute only - const bool case1 = - (m1 & 0x01010101 & m2) == 0 // even indexes are even or negative - && (~m1 & 0x10101010 & m2) == 0 // odd indexes are odd or negative - && ((m1 ^ ((m1 + 0x01010101) << 4)) & 0xF0F0F0F0 & m2 & (m2 << 4)) == 0; // odd index == preceding even index +1 or at least one of them negative - - // Test if we can do with 16-bit permute only - const bool case2 = - (((m1 & 0x44444444) ^ 0x44440000) & m2) == 0; // indexes 0-3 point to lower 64 bits, 1-7 to higher 64 bits, or negative - - if (case0) { - // no permute needed - t7 = a; - } - else if (case1) { - // 32 bit permute only - const int j0 = i0 >= 0 ? i0/2 : i1 >= 0 ? i1/2 : 0; - const int j1 = i2 >= 0 ? i2/2 : i3 >= 0 ? i3/2 : 0; - const int j2 = i4 >= 0 ? 
i4/2 : i5 >= 0 ? i5/2 : 0; - const int j3 = i6 >= 0 ? i6/2 : i7 >= 0 ? i7/2 : 0; - t7 = _mm_shuffle_epi32(a, (j0&3) | (j1&3)<<2 | (j2&3)<<4 | (j3&3)<<6 ); - } - else if (case2) { - // 16 bit permute only - const int j0 = i0 >= 0 ? i0&3 : 0; - const int j1 = i1 >= 0 ? i1&3 : 1; - const int j2 = i2 >= 0 ? i2&3 : 2; - const int j3 = i3 >= 0 ? i3&3 : 3; - const int j4 = i4 >= 0 ? i4&3 : 0; - const int j5 = i5 >= 0 ? i5&3 : 1; - const int j6 = i6 >= 0 ? i6&3 : 2; - const int j7 = i7 >= 0 ? i7&3 : 3; - if (j0!=0 || j1!=1 || j2!=2 || j3!=3) { - t1 = _mm_shufflelo_epi16(a, j0 | j1<<2 | j2<<4 | j3<<6); - } - else t1 = a; - if (j4!=0 || j5!=1 || j6!=2 || j7!=3) { - t7 = _mm_shufflehi_epi16(t1, j4 | j5<<2 | j6<<4 | j7<<6); - } - else t7 = t1; - } - else { - // Need at least two permute steps - - // Index to where each dword of a is needed - const int nn = (m1 & 0x66666666) | 0x88888888; // indicate which dwords are needed - const int n0 = ((((uint32_t)(nn ^ 0x00000000) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; - const int n1 = ((((uint32_t)(nn ^ 0x22222222) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; - const int n2 = ((((uint32_t)(nn ^ 0x44444444) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; - const int n3 = ((((uint32_t)(nn ^ 0x66666666) - 0x22222222) & 0x88888888) ^ 0x88888888) & m2; - // indicate which dwords are needed in low half - const int l0 = (n0 & 0xFFFF) != 0; - const int l1 = (n1 & 0xFFFF) != 0; - const int l2 = (n2 & 0xFFFF) != 0; - const int l3 = (n3 & 0xFFFF) != 0; - // indicate which dwords are needed in high half - const int h0 = (n0 & 0xFFFF0000) != 0; - const int h1 = (n1 & 0xFFFF0000) != 0; - const int h2 = (n2 & 0xFFFF0000) != 0; - const int h3 = (n3 & 0xFFFF0000) != 0; - - // Test if we can do with two permute steps - const bool case3 = l0 + l1 + l2 + l3 <= 2 && h0 + h1 + h2 + h3 <= 2; - - if (case3) { - // one 32-bit permute followed by one 16-bit permute in each half. - // Find permute indices for 32-bit permute - const int j0 = l0 ? 0 : l1 ? 1 : l2 ? 2 : 3; - const int j1 = l3 ? 3 : l2 ? 2 : l1 ? 1 : 0; - const int j2 = h0 ? 0 : h1 ? 1 : h2 ? 2 : 3; - const int j3 = h3 ? 3 : h2 ? 2 : h1 ? 1 : 0; - - // Find permute indices for low 16-bit permute - const int r0 = i0 < 0 ? 0 : (i0>>1 == j0 ? 0 : 2) + (i0 & 1); - const int r1 = i1 < 0 ? 1 : (i1>>1 == j0 ? 0 : 2) + (i1 & 1); - const int r2 = i2 < 0 ? 2 : (i2>>1 == j1 ? 2 : 0) + (i2 & 1); - const int r3 = i3 < 0 ? 3 : (i3>>1 == j1 ? 2 : 0) + (i3 & 1); - - // Find permute indices for high 16-bit permute - const int s0 = i4 < 0 ? 0 : (i4>>1 == j2 ? 0 : 2) + (i4 & 1); - const int s1 = i5 < 0 ? 1 : (i5>>1 == j2 ? 0 : 2) + (i5 & 1); - const int s2 = i6 < 0 ? 2 : (i6>>1 == j3 ? 2 : 0) + (i6 & 1); - const int s3 = i7 < 0 ? 3 : (i7>>1 == j3 ? 2 : 0) + (i7 & 1); - - // 32-bit permute - t1 = _mm_shuffle_epi32 (a, j0 | j1<<2 | j2<<4 | j3<<6); - // 16-bit permutes - if (r0!=0 || r1!=1 || r2!=2 || r3!=3) { // 16 bit permute of low half - t2 = _mm_shufflelo_epi16(t1, r0 | r1<<2 | r2<<4 | r3<<6); - } - else t2 = t1; - if (s0!=0 || s1!=1 || s2!=2 || s3!=3) { // 16 bit permute of high half - t7 = _mm_shufflehi_epi16(t2, s0 | s1<<2 | s2<<4 | s3<<6); - } - else t7 = t2; - } - else { - // Worst case. We need two sets of 16-bit permutes - t1 = _mm_shuffle_epi32(a, 0x4E); // swap low and high 64-bits - - // Find permute indices for low 16-bit permute from swapped t1 - const int r0 = i0 < 4 ? 0 : i0 & 3; - const int r1 = i1 < 4 ? 1 : i1 & 3; - const int r2 = i2 < 4 ? 2 : i2 & 3; - const int r3 = i3 < 4 ? 
3 : i3 & 3; - // Find permute indices for high 16-bit permute from swapped t1 - const int s0 = i4 < 0 || i4 >= 4 ? 0 : i4 & 3; - const int s1 = i5 < 0 || i5 >= 4 ? 1 : i5 & 3; - const int s2 = i6 < 0 || i6 >= 4 ? 2 : i6 & 3; - const int s3 = i7 < 0 || i7 >= 4 ? 3 : i7 & 3; - // Find permute indices for low 16-bit permute from direct a - const int u0 = i0 < 0 || i0 >= 4 ? 0 : i0 & 3; - const int u1 = i1 < 0 || i1 >= 4 ? 1 : i1 & 3; - const int u2 = i2 < 0 || i2 >= 4 ? 2 : i2 & 3; - const int u3 = i3 < 0 || i3 >= 4 ? 3 : i3 & 3; - // Find permute indices for high 16-bit permute from direct a - const int v0 = i4 < 4 ? 0 : i4 & 3; - const int v1 = i5 < 4 ? 1 : i5 & 3; - const int v2 = i6 < 4 ? 2 : i6 & 3; - const int v3 = i7 < 4 ? 3 : i7 & 3; - - // 16-bit permutes - if (r0!=0 || r1!=1 || r2!=2 || r3!=3) { // 16 bit permute of low half - t2 = _mm_shufflelo_epi16(t1, r0 | r1<<2 | r2<<4 | r3<<6); - } - else t2 = t1; - if (u0!=0 || u1!=1 || u2!=2 || u3!=3) { // 16 bit permute of low half - t3 = _mm_shufflelo_epi16(a, u0 | u1<<2 | u2<<4 | u3<<6); - } - else t3 = a; - if (s0!=0 || s1!=1 || s2!=2 || s3!=3) { // 16 bit permute of low half - t4 = _mm_shufflehi_epi16(t2, s0 | s1<<2 | s2<<4 | s3<<6); - } - else t4 = t2; - if (v0!=0 || v1!=1 || v2!=2 || v3!=3) { // 16 bit permute of low half - t5 = _mm_shufflehi_epi16(t3, v0 | v1<<2 | v2<<4 | v3<<6); - } - else t5 = t3; - // merge data from t4 and t5 - t6 = constant4i < - ((i0 & 4) ? 0xFFFF : 0) | ((i1 & 4) ? 0xFFFF0000 : 0), - ((i2 & 4) ? 0xFFFF : 0) | ((i3 & 4) ? 0xFFFF0000 : 0), - ((i4 & 4) ? 0 : 0xFFFF) | ((i5 & 4) ? 0 : 0xFFFF0000), - ((i6 & 4) ? 0 : 0xFFFF) | ((i7 & 4) ? 0 : 0xFFFF0000) > (); - t7 = selectb(t6,t4,t5); // select between permuted data t4 and t5 - } - } - // Set any elements to zero if required - if (m2 != -1 && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80)) { - // some elements need to be set to 0 - __m128i mask = constant4i < - (i0 < 0 ? 0xFFFF0000 : -1) & (i1 < 0 ? 0x0000FFFF : -1), - (i2 < 0 ? 0xFFFF0000 : -1) & (i3 < 0 ? 0x0000FFFF : -1), - (i4 < 0 ? 0xFFFF0000 : -1) & (i5 < 0 ? 0x0000FFFF : -1), - (i6 < 0 ? 0xFFFF0000 : -1) & (i7 < 0 ? 
0x0000FFFF : -1) > (); - return _mm_and_si128(t7,mask); - } - else { - return t7; - } -#endif -} - -template -static inline Vec8us permute8us(Vec8us const & a) { - return Vec8us (permute8s (a)); -} - - -template -static inline Vec16c permute16c(Vec16c const & a) { - - __m128i temp; - - // Combine all even indexes into a single bitfield, with 4 bits for each - const uint32_t me = (i0&15) | (i2&15)<<4 | (i4&15)<<8 | (i6&15)<<12 - | (i8&15)<<16 | (i10&15)<<20 | (i12&15)<<24 | (i14&15)<<28; - - // Combine all odd indexes into a single bitfield, with 4 bits for each - const uint32_t mo = (i1&15) | (i3&15)<<4 | (i5&15)<<8 | (i7&15)<<12 - | (i9&15)<<16 | (i11&15)<<20 | (i13&15)<<24 | (i15&15)<<28; - - // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative - const uint32_t se = (i0<0?0:0xF) | (i2<0?0:0xF)<<4 | (i4<0?0:0xF)<<8 | (i6<0?0:0xF)<<12 - | (i8<0?0:0xF)<<16 | (i10<0?0:0xF)<<20 | (i12<0?0:0xF)<<24 | (i14<0?0:0xF)<<28; - - // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative - const uint32_t so = (i1<0?0:0xF) | (i3<0?0:0xF)<<4 | (i5<0?0:0xF)<<8 | (i7<0?0:0xF)<<12 - | (i9<0?0:0xF)<<16 | (i11<0?0:0xF)<<20 | (i13<0?0:0xF)<<24 | (i15<0?0:0xF)<<28; - - // Mask indicating sign of all indexes, with 2 bits for each, 0 for negative (means set to zero or don't care), 0x3 for non-negative - const uint32_t ss = (se & 0x33333333) | (so & 0xCCCCCCCC); - - // Mask indicating required zeroing of all indexes, with 2 bits for each, 0 for index = -1, 3 for index >= 0 or -256 - const uint32_t ssz = ((i0&0x80)?0:3) | ((i1 &0x80)?0:3)<< 2 | ((i2 &0x80)?0:3)<< 4 | ((i3 &0x80)?0:3)<< 6 | - ((i4 &0x80)?0:3)<< 8 | ((i5 &0x80)?0:3)<<10 | ((i6 &0x80)?0:3)<<12 | ((i7 &0x80)?0:3)<<14 | - ((i8 &0x80)?0:3)<<16 | ((i9 &0x80)?0:3)<<18 | ((i10&0x80)?0:3)<<20 | ((i11&0x80)?0:3)<<22 | - ((i12&0x80)?0:3)<<24 | ((i13&0x80)?0:3)<<26 | ((i14&0x80)?0:3)<<28 | ((i15&0x80)?0:3)<<30 ; - - // These indexes are used only to avoid bogus compiler warnings in false branches - const int I0 = i0 > 0 ? (i0 & 0xF) : 0; - const int I15 = i15 > 0 ? (i15 & 0xF) : 0; - - // special case: all zero - if (ss == 0) { - return _mm_setzero_si128(); - } - - // remember if extra zeroing is needed - bool do_and_zero = (ssz != 0xFFFFFFFFu); - - // check for special shortcut cases - int shortcut = 0; - - // check if any permutation - if (((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0) { - shortcut = 1; - } - // check if we can use punpcklbw - else if (((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0) { - shortcut = 2; - } - // check if we can use punpckhbw - else if (((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0) { - shortcut = 3; - } - - #if defined (_MSC_VER) && ! 
defined(__INTEL_COMPILER) - #pragma warning(disable: 4307) // disable MS warning C4307: '+' : integral constant overflow - #endif - - // check if we can use byte shift right - else if (i0 > 0 && ((me ^ (uint32_t(I0)*0x11111111u + 0xECA86420u)) & se) == 0 && - ((mo ^ (uint32_t(I0)*0x11111111u + 0xFDB97531u)) & so) == 0) { - shortcut = 4; - do_and_zero = ((0xFFFFFFFFu >> 2*I0) & ~ ssz) != 0; - } - // check if we can use byte shift left - else if (i15 >= 0 && i15 < 15 && - ((mo ^ (uint32_t(I15*0x11111111u) - (0x02468ACEu & so))) & so) == 0 && - ((me ^ (uint32_t(I15*0x11111111u) - (0x13579BDFu & se))) & se) == 0) { - shortcut = 5; - do_and_zero = ((0xFFFFFFFFu << 2*(15-I15)) & ~ ssz) != 0; - } - -#if INSTRSET >= 4 // SSSE3 (PSHUFB available only under SSSE3) - - // special case: rotate - if (i0>0 && i0 < 16 && i1==((i0+1)&15) && i2 ==((i0+2 )&15) && i3 ==((i0+3 )&15) && i4 ==((i0+4 )&15) && i5 ==((i0+5 )&15) && i6 ==((i0+6 )&15) && i7 ==((i0+7 )&15) - && i8==((i0+8)&15) && i9==((i0+9)&15) && i10==((i0+10)&15) && i11==((i0+11)&15) && i12==((i0+12)&15) && i13==((i0+13)&15) && i14==((i0+14)&15) && i15==((i0+15)&15)) { - temp = _mm_alignr_epi8(a, a, i0 & 15); - shortcut = -1; - } - if (shortcut == 0 || do_and_zero) { - // general case: use PSHUFB - __m128i mask = constant4i< - (i0 & 0xFF) | (i1 & 0xFF) << 8 | (i2 & 0xFF) << 16 | (i3 & 0xFF) << 24 , - (i4 & 0xFF) | (i5 & 0xFF) << 8 | (i6 & 0xFF) << 16 | (i7 & 0xFF) << 24 , - (i8 & 0xFF) | (i9 & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24 , - (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24 > (); - temp = _mm_shuffle_epi8(a,mask); - shortcut = -1; - do_and_zero = false; - } - -#endif - - // Check if we can use 16-bit permute. Even numbered indexes must be even and odd numbered - // indexes must be equal to the preceding index + 1, except for negative indexes. - if (shortcut == 0 && (me & 0x11111111 & se) == 0 && ((mo ^ 0x11111111) & 0x11111111 & so) == 0 && ((me ^ mo) & 0xEEEEEEEE & se & so) == 0) { - temp = permute8s < - i0 >= 0 ? i0 /2 : i1 >= 0 ? i1 /2 : (i0 | i1 ), - i2 >= 0 ? i2 /2 : i3 >= 0 ? i3 /2 : (i2 | i3 ), - i4 >= 0 ? i4 /2 : i5 >= 0 ? i5 /2 : (i4 | i5 ), - i6 >= 0 ? i6 /2 : i7 >= 0 ? i7 /2 : (i6 | i7 ), - i8 >= 0 ? i8 /2 : i9 >= 0 ? i9 /2 : (i8 | i9 ), - i10 >= 0 ? i10/2 : i11 >= 0 ? i11/2 : (i10 | i11), - i12 >= 0 ? i12/2 : i13 >= 0 ? i13/2 : (i12 | i13), - i14 >= 0 ? i14/2 : i15 >= 0 ? i15/2 : (i14 | i15) > (Vec8s(a)); - shortcut = 100; - do_and_zero = (se != so && ssz != 0xFFFFFFFFu); - } - - // Check if we can use 16-bit permute with bytes swapped. Even numbered indexes must be odd and odd - // numbered indexes must be equal to the preceding index - 1, except for negative indexes. - // (this case occurs when reversing byte order) - if (shortcut == 0 && ((me ^ 0x11111111) & 0x11111111 & se) == 0 && (mo & 0x11111111 & so) == 0 && ((me ^ mo) & 0xEEEEEEEE & se & so) == 0) { - Vec16c swapped = Vec16c(rotate_left(Vec8s(a), 8)); // swap odd and even bytes - temp = permute8s < - i0 >= 0 ? i0 /2 : i1 >= 0 ? i1 /2 : (i0 | i1 ), - i2 >= 0 ? i2 /2 : i3 >= 0 ? i3 /2 : (i2 | i3 ), - i4 >= 0 ? i4 /2 : i5 >= 0 ? i5 /2 : (i4 | i5 ), - i6 >= 0 ? i6 /2 : i7 >= 0 ? i7 /2 : (i6 | i7 ), - i8 >= 0 ? i8 /2 : i9 >= 0 ? i9 /2 : (i8 | i9 ), - i10 >= 0 ? i10/2 : i11 >= 0 ? i11/2 : (i10 | i11), - i12 >= 0 ? i12/2 : i13 >= 0 ? i13/2 : (i12 | i13), - i14 >= 0 ? i14/2 : i15 >= 0 ? 
i15/2 : (i14 | i15) > (Vec8s(swapped)); - shortcut = 101; - do_and_zero = (se != so && ssz != 0xFFFFFFFFu); - } - - // all shortcuts end here - if (shortcut) { - switch (shortcut) { - case 1: - temp = a; break; - case 2: - temp = _mm_unpacklo_epi8(a,a); break; - case 3: - temp = _mm_unpackhi_epi8(a,a); break; - case 4: - temp = _mm_srli_si128(a, I0); break; - case 5: - temp = _mm_slli_si128(a, 15-I15); break; - default: - break; // result is already in temp - } - if (do_and_zero) { - // additional zeroing needed - __m128i maskz = constant4i < - (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , - (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , - (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , - (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); - temp = _mm_and_si128(temp, maskz); - } - return temp; - } - - // complicated cases: use 16-bit permute up to four times - const bool e2e = (~me & 0x11111111 & se) != 0; // even bytes of source to even bytes of destination - const bool e2o = (~mo & 0x11111111 & so) != 0; // even bytes of source to odd bytes of destination - const bool o2e = (me & 0x11111111 & se) != 0; // odd bytes of source to even bytes of destination - const bool o2o = (mo & 0x11111111 & so) != 0; // odd bytes of source to odd bytes of destination - - Vec16c swapped, te2e, te2o, to2e, to2o, combeven, combodd; - - if (e2o || o2e) swapped = rotate_left(Vec8s(a), 8); // swap odd and even bytes - - // even-to-even bytes - if (e2e) te2e = permute8s <(i0&1)?-1:i0/2, (i2&1)?-1:i2/2, (i4&1)?-1:i4/2, (i6&1)?-1:i6/2, - (i8&1)?-1:i8/2, (i10&1)?-1:i10/2, (i12&1)?-1:i12/2, (i14&1)?-1:i14/2> (Vec8s(a)); - // odd-to-even bytes - if (o2e) to2e = permute8s <(i0&1)?i0/2:-1, (i2&1)?i2/2:-1, (i4&1)?i4/2:-1, (i6&1)?i6/2:-1, - (i8&1)?i8/2:-1, (i10&1)?i10/2:-1, (i12&1)?i12/2:-1, (i14&1)?i14/2:-1> (Vec8s(swapped)); - // even-to-odd bytes - if (e2o) te2o = permute8s <(i1&1)?-1:i1/2, (i3&1)?-1:i3/2, (i5&1)?-1:i5/2, (i7&1)?-1:i7/2, - (i9&1)?-1:i9/2, (i11&1)?-1:i11/2, (i13&1)?-1:i13/2, (i15&1)?-1:i15/2> (Vec8s(swapped)); - // odd-to-odd bytes - if (o2o) to2o = permute8s <(i1&1)?i1/2:-1, (i3&1)?i3/2:-1, (i5&1)?i5/2:-1, (i7&1)?i7/2:-1, - (i9&1)?i9/2:-1, (i11&1)?i11/2:-1, (i13&1)?i13/2:-1, (i15&1)?i15/2:-1> (Vec8s(a)); - - if (e2e && o2e) combeven = te2e | to2e; - else if (e2e) combeven = te2e; - else if (o2e) combeven = to2e; - else combeven = _mm_setzero_si128(); - - if (e2o && o2o) combodd = te2o | to2o; - else if (e2o) combodd = te2o; - else if (o2o) combodd = to2o; - else combodd = _mm_setzero_si128(); - - __m128i maske = constant4i < // mask used even bytes - (i0 < 0 ? 0 : 0xFF) | (i2 < 0 ? 0 : 0xFF0000), - (i4 < 0 ? 0 : 0xFF) | (i6 < 0 ? 0 : 0xFF0000), - (i8 < 0 ? 0 : 0xFF) | (i10 < 0 ? 0 : 0xFF0000), - (i12 < 0 ? 0 : 0xFF) | (i14 < 0 ? 0 : 0xFF0000) > (); - __m128i masko = constant4i < // mask used odd bytes - (i1 < 0 ? 0 : 0xFF00) | (i3 < 0 ? 0 : 0xFF000000), - (i5 < 0 ? 0 : 0xFF00) | (i7 < 0 ? 0 : 0xFF000000), - (i9 < 0 ? 0 : 0xFF00) | (i11 < 0 ? 0 : 0xFF000000), - (i13 < 0 ? 0 : 0xFF00) | (i15 < 0 ? 
0 : 0xFF000000) > (); - - return _mm_or_si128( // combine even and odd bytes - _mm_and_si128(combeven, maske), - _mm_and_si128(combodd, masko)); -} - -template -static inline Vec16uc permute16uc(Vec16uc const & a) { - return Vec16uc (permute16c (a)); -} - - -/***************************************************************************** -* -* Vector blend functions -* -****************************************************************************** -* -* These blend functions can mix elements from two different vectors and -* optionally set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select, where higher indexes indicate an element from the second source -* vector. For example, if each vector has 4 elements, then indexes 0 - 3 -* will select an element from the first vector and indexes 4 - 7 will select -* an element from the second vector. A negative index will generate zero. -* -* The blend functions for vectors of 8-bit integers are inefficient if -* the SSSE3 instruction set or later is not enabled. -* -* Example: -* Vec4i a(100,101,102,103); // a is (100, 101, 102, 103) -* Vec4i b(200,201,202,203); // b is (200, 201, 202, 203) -* Vec4i c; -* c = blend4i<1,4,-1,7> (a,b); // c is (101, 200, 0, 203) -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. -*****************************************************************************/ - -template -static inline Vec16c blend16c(Vec16c const & a, Vec16c const & b) { - - // Combine bit 0-3 of all even indexes into a single bitfield, with 4 bits for each - const int me = (i0&15) | (i2&15)<<4 | (i4&15)<<8 | (i6&15)<<12 - | (i8&15)<<16 | (i10&15)<<20 | (i12&15)<<24 | (i14&15)<<28; - - // Combine bit 0-3 of all odd indexes into a single bitfield, with 4 bits for each - const int mo = (i1&15) | (i3&15)<<4 | (i5&15)<<8 | (i7&15)<<12 - | (i9&15)<<16 | (i11&15)<<20 | (i13&15)<<24 | (i15&15)<<28; - - // Mask indicating sign of all even indexes, with 4 bits for each, 0 for negative, 0xF for non-negative - const int se = (i0<0?0:0xF) | (i2<0?0:0xF)<<4 | (i4<0?0:0xF)<<8 | (i6<0?0:0xF)<<12 - | (i8<0?0:0xF)<<16 | (i10<0?0:0xF)<<20 | (i12<0?0:0xF)<<24 | (i14<0?0:0xF)<<28; - - // Mask indicating sign of all odd indexes, with 4 bits for each, 0 for negative, 0xF for non-negative - const int so = (i1<0?0:0xF) | (i3<0?0:0xF)<<4 | (i5<0?0:0xF)<<8 | (i7<0?0:0xF)<<12 - | (i9<0?0:0xF)<<16 | (i11<0?0:0xF)<<20 | (i13<0?0:0xF)<<24 | (i15<0?0:0xF)<<28; - - // Combine bit 4 of all even indexes into a single bitfield, with 4 bits for each - const int ne = (i0&16)>>4 | (i2&16) | (i4&16)<<4 | (i6&16)<<8 - | (i8&16)<<12 | (i10&16)<<16 | (i12&16)<<20 | (i14&16)<<24; - - // Combine bit 4 of all odd indexes into a single bitfield, with 4 bits for each - const int no = (i1&16)>>4 | (i3&16) | (i5&16)<<4 | (i7&16)<<8 - | (i9&16)<<12 | (i11&16)<<16 | (i13&16)<<20 | (i15&16)<<24; - - // Check if zeroing needed - const bool do_zero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0; // needs zeroing - - // no elements from b - if (((ne & se) | (no & so)) == 0) { - return permute16c (a); - } - - // no elements from a - if ((((ne^0x11111111) & se) | ((no^0x11111111) & so)) == 0) { - return permute16c (b); - } - __m128i t; - - // 
check if we can use punpcklbw - if (((me ^ 0x76543210) & se) == 0 && ((mo ^ 0x76543210) & so) == 0) { - if ((ne & se) == 0 && ((no ^ 0x11111111) & so) == 0) { - t = _mm_unpacklo_epi8(a,b); - } - if ((no & so) == 0 && ((ne ^ 0x11111111) & se) == 0) { - t = _mm_unpacklo_epi8(b,a); - } - if (do_zero) { - // additional zeroing needed - __m128i maskz = constant4i < - (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , - (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , - (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , - (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); - t = _mm_and_si128(t, maskz); - } - return t; - } - - // check if we can use punpckhbw - if (((me ^ 0xFEDCBA98) & se) == 0 && ((mo ^ 0xFEDCBA98) & so) == 0) { - if ((ne & se) == 0 && ((no ^ 0x11111111) & so) == 0) { - t = _mm_unpackhi_epi8(a,b); - } - if ((no & so) == 0 && ((ne ^ 0x11111111) & se) == 0) { - t = _mm_unpackhi_epi8(b,a); - } - if (do_zero) { - // additional zeroing needed - __m128i maskz = constant4i < - (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , - (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , - (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , - (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); - t = _mm_and_si128(t, maskz); - } - return t; - } - -#if INSTRSET >= 4 // SSSE3 - // special case: shift left - if (i0 > 0 && i0 < 16 && i1==i0+1 && i2==i0+2 && i3==i0+3 && i4==i0+4 && i5==i0+5 && i6==i0+6 && i7==i0+7 && - i8==i0+8 && i9==i0+9 && i10==i0+10 && i11==i0+11 && i12==i0+12 && i13==i0+13 && i14==i0+14 && i15==i0+15) { - return _mm_alignr_epi8(b, a, (i0 & 15)); - } - - // special case: shift right - if (i0 > 15 && i0 < 32 && i1==((i0+1)&31) && i2 ==((i0+2 )&31) && i3 ==((i0+3 )&31) && i4 ==((i0+4 )&31) && i5 ==((i0+5 )&31) && i6 ==((i0+6 )&31) && i7 ==((i0+7 )&31) && - i8==((i0+8 )&31) && i9==((i0+9)&31) && i10==((i0+10)&31) && i11==((i0+11)&31) && i12==((i0+12)&31) && i13==((i0+13)&31) && i14==((i0+14)&31) && i15==((i0+15)&31)) { - return _mm_alignr_epi8(a, b, (i0 & 15)); - } -#endif - -#if INSTRSET >= 5 // SSE4.1 supported - // special case: blend without permute - if (((me ^ 0xECA86420) & se) == 0 && ((mo ^ 0xFDB97531) & so) == 0) { - __m128i maskbl = constant4i< - ((i0 & 16) ? 0xFF : 0) | ((i1 & 16) ? 0xFF00 : 0) | ((i2 & 16) ? 0xFF0000 : 0) | ((i3 & 16) ? 0xFF000000 : 0) , - ((i4 & 16) ? 0xFF : 0) | ((i5 & 16) ? 0xFF00 : 0) | ((i6 & 16) ? 0xFF0000 : 0) | ((i7 & 16) ? 0xFF000000 : 0) , - ((i8 & 16) ? 0xFF : 0) | ((i9 & 16) ? 0xFF00 : 0) | ((i10& 16) ? 0xFF0000 : 0) | ((i11& 16) ? 0xFF000000 : 0) , - ((i12& 16) ? 0xFF : 0) | ((i13& 16) ? 0xFF00 : 0) | ((i14& 16) ? 0xFF0000 : 0) | ((i15& 16) ? 0xFF000000 : 0) > (); - t = _mm_blendv_epi8(a, b, maskbl); - if (do_zero) { - // additional zeroing needed - __m128i maskz = constant4i < - (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF00) | (i2 < 0 ? 0 : 0xFF0000) | (i3 < 0 ? 0 : 0xFF000000) , - (i4 < 0 ? 0 : 0xFF) | (i5 < 0 ? 0 : 0xFF00) | (i6 < 0 ? 0 : 0xFF0000) | (i7 < 0 ? 0 : 0xFF000000) , - (i8 < 0 ? 0 : 0xFF) | (i9 < 0 ? 0 : 0xFF00) | (i10 < 0 ? 0 : 0xFF0000) | (i11 < 0 ? 0 : 0xFF000000) , - (i12 < 0 ? 0 : 0xFF) | (i13 < 0 ? 
0 : 0xFF00) | (i14 < 0 ? 0 : 0xFF0000) | (i15 < 0 ? 0 : 0xFF000000) > (); - t = _mm_and_si128(t, maskz); - } - return t; - } -#endif // SSE4.1 - -#if defined ( __XOP__ ) // Use AMD XOP instruction VPPERM - __m128i mask = constant4i< - (i0 <0 ? 0x80 : (i0 &31)) | (i1 <0 ? 0x80 : (i1 &31)) << 8 | (i2 <0 ? 0x80 : (i2 &31)) << 16 | (i3 <0 ? 0x80 : (i3 &31)) << 24, - (i4 <0 ? 0x80 : (i4 &31)) | (i5 <0 ? 0x80 : (i5 &31)) << 8 | (i6 <0 ? 0x80 : (i6 &31)) << 16 | (i7 <0 ? 0x80 : (i7 &31)) << 24, - (i8 <0 ? 0x80 : (i8 &31)) | (i9 <0 ? 0x80 : (i9 &31)) << 8 | (i10<0 ? 0x80 : (i10&31)) << 16 | (i11<0 ? 0x80 : (i11&31)) << 24, - (i12<0 ? 0x80 : (i12&31)) | (i13<0 ? 0x80 : (i13&31)) << 8 | (i14<0 ? 0x80 : (i14&31)) << 16 | (i15<0 ? 0x80 : (i15&31)) << 24 > (); - return _mm_perm_epi8(a, b, mask); - -#elif INSTRSET >= 4 // SSSE3 - - // general case. Use PSHUFB - __m128i maska = constant4i< - ((i0 & 0x90) ? 0xFF : (i0 &15)) | ((i1 & 0x90) ? 0xFF : (i1 &15)) << 8 | ((i2 & 0x90) ? 0xFF : (i2 &15)) << 16 | ((i3 & 0x90) ? 0xFF : (i3 &15)) << 24, - ((i4 & 0x90) ? 0xFF : (i4 &15)) | ((i5 & 0x90) ? 0xFF : (i5 &15)) << 8 | ((i6 & 0x90) ? 0xFF : (i6 &15)) << 16 | ((i7 & 0x90) ? 0xFF : (i7 &15)) << 24, - ((i8 & 0x90) ? 0xFF : (i8 &15)) | ((i9 & 0x90) ? 0xFF : (i9 &15)) << 8 | ((i10& 0x90) ? 0xFF : (i10&15)) << 16 | ((i11& 0x90) ? 0xFF : (i11&15)) << 24, - ((i12& 0x90) ? 0xFF : (i12&15)) | ((i13& 0x90) ? 0xFF : (i13&15)) << 8 | ((i14& 0x90) ? 0xFF : (i14&15)) << 16 | ((i15& 0x90) ? 0xFF : (i15&15)) << 24 > (); - __m128i maskb = constant4i< - (((i0^0x10) & 0x90) ? 0xFF : (i0 &15)) | (((i1^0x10) & 0x90) ? 0xFF : (i1 &15)) << 8 | (((i2^0x10) & 0x90) ? 0xFF : (i2 &15)) << 16 | (((i3^0x10) & 0x90) ? 0xFF : (i3 &15)) << 24, - (((i4^0x10) & 0x90) ? 0xFF : (i4 &15)) | (((i5^0x10) & 0x90) ? 0xFF : (i5 &15)) << 8 | (((i6^0x10) & 0x90) ? 0xFF : (i6 &15)) << 16 | (((i7^0x10) & 0x90) ? 0xFF : (i7 &15)) << 24, - (((i8^0x10) & 0x90) ? 0xFF : (i8 &15)) | (((i9^0x10) & 0x90) ? 0xFF : (i9 &15)) << 8 | (((i10^0x10)& 0x90) ? 0xFF : (i10&15)) << 16 | (((i11^0x10)& 0x90) ? 0xFF : (i11&15)) << 24, - (((i12^0x10)& 0x90) ? 0xFF : (i12&15)) | (((i13^0x10)& 0x90) ? 0xFF : (i13&15)) << 8 | (((i14^0x10)& 0x90) ? 0xFF : (i14&15)) << 16 | (((i15^0x10)& 0x90) ? 0xFF : (i15&15)) << 24 > (); - __m128i a1 = _mm_shuffle_epi8(a,maska); - __m128i b1 = _mm_shuffle_epi8(b,maskb); - return _mm_or_si128(a1,b1); - -#else // SSE2 - // combine two permutes - __m128i a1 = permute16c < - (uint32_t)i0 < 16 ? i0 : -1, - (uint32_t)i1 < 16 ? i1 : -1, - (uint32_t)i2 < 16 ? i2 : -1, - (uint32_t)i3 < 16 ? i3 : -1, - (uint32_t)i4 < 16 ? i4 : -1, - (uint32_t)i5 < 16 ? i5 : -1, - (uint32_t)i6 < 16 ? i6 : -1, - (uint32_t)i7 < 16 ? i7 : -1, - (uint32_t)i8 < 16 ? i8 : -1, - (uint32_t)i9 < 16 ? i9 : -1, - (uint32_t)i10 < 16 ? i10 : -1, - (uint32_t)i11 < 16 ? i11 : -1, - (uint32_t)i12 < 16 ? i12 : -1, - (uint32_t)i13 < 16 ? i13 : -1, - (uint32_t)i14 < 16 ? i14 : -1, - (uint32_t)i15 < 16 ? i15 : -1 > (a); - __m128i b1 = permute16c < - (uint32_t)(i0 ^16) < 16 ? (i0 ^16) : -1, - (uint32_t)(i1 ^16) < 16 ? (i1 ^16) : -1, - (uint32_t)(i2 ^16) < 16 ? (i2 ^16) : -1, - (uint32_t)(i3 ^16) < 16 ? (i3 ^16) : -1, - (uint32_t)(i4 ^16) < 16 ? (i4 ^16) : -1, - (uint32_t)(i5 ^16) < 16 ? (i5 ^16) : -1, - (uint32_t)(i6 ^16) < 16 ? (i6 ^16) : -1, - (uint32_t)(i7 ^16) < 16 ? (i7 ^16) : -1, - (uint32_t)(i8 ^16) < 16 ? (i8 ^16) : -1, - (uint32_t)(i9 ^16) < 16 ? (i9 ^16) : -1, - (uint32_t)(i10^16) < 16 ? (i10^16) : -1, - (uint32_t)(i11^16) < 16 ? (i11^16) : -1, - (uint32_t)(i12^16) < 16 ? 
(i12^16) : -1, - (uint32_t)(i13^16) < 16 ? (i13^16) : -1, - (uint32_t)(i14^16) < 16 ? (i14^16) : -1, - (uint32_t)(i15^16) < 16 ? (i15^16) : -1 > (b); - return _mm_or_si128(a1,b1); - -#endif -} - -template -static inline Vec16uc blend16uc(Vec16uc const & a, Vec16uc const & b) { - return Vec16uc( blend16c (a,b)); -} - - -template -static inline Vec8s blend8s(Vec8s const & a, Vec8s const & b) { - - // Combine all the indexes into a single bitfield, with 4 bits for each - const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 - | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; - - // Mask to zero out negative indexes - const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 - | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - // Some elements must be set to zero - const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) & 0x80) != 0; - - // temp contains temporary result, some zeroing needs to be done - bool zeroing_pending = false; - - // partially finished result - __m128i temp; - - if ((m1 & 0x88888888 & mz) == 0) { - // no elements from b - return permute8s (a); - } - - if (((m1^0x88888888) & 0x88888888 & mz) == 0) { - // no elements from a - return permute8s (b); - } - - // special case: PUNPCKLWD - if (((m1 ^ 0xB3A29180) & mz) == 0) { - temp = _mm_unpacklo_epi16(a, b); - if (do_zero) zeroing_pending = true; else return temp; - } - if (((m1 ^ 0x3B2A1908) & mz) == 0) { - temp = _mm_unpacklo_epi16(b, a); - if (do_zero) zeroing_pending = true; else return temp; - } - // special case: PUNPCKHWD - if (((m1 ^ 0xF7E6D5C4) & mz) == 0) { - temp = _mm_unpackhi_epi16(a, b); - if (do_zero) zeroing_pending = true; else return temp; - } - if (((m1 ^ 0x7F6E5D4C) & mz) == 0) { - temp = _mm_unpackhi_epi16(b, a); - if (do_zero) zeroing_pending = true; else return temp; - } - -#if INSTRSET >= 4 // SSSE3 - // special case: shift left - if (i0 > 0 && i0 < 8 && ((m1 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0) { - temp = _mm_alignr_epi8(b, a, (i0 & 7) * 2); - if (do_zero) zeroing_pending = true; else return temp; - } - - // special case: shift right - if (i0 > 8 && i0 < 16 && ((m1 ^ 0x88888888 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) & mz) == 0) { - temp = _mm_alignr_epi8(a, b, (i0 & 7) * 2); - if (do_zero) zeroing_pending = true; else return temp; - } -#endif // SSSE3 - -#if INSTRSET >= 5 // SSE4.1 supported - // special case: blending without permuting - if ((((m1 & ~0x88888888) ^ 0x76543210) & mz) == 0) { - temp = _mm_blend_epi16(a, b, (i0>>3&1) | (i1>>3&1)<<1 | (i2>>3&1)<<2 | (i3>>3&1)<<3 - | (i4>>3&1)<<4 | (i5>>3&1)<<5 | (i6>>3&1)<<6 | (i7>>3&1)<<7); - if (do_zero) zeroing_pending = true; else return temp; - } -#endif // SSE4.1 - - if (zeroing_pending) { - // additional zeroing of temp needed - __m128i maskz = constant4i < - (i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000) , - (i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000) , - (i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000) , - (i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000) > (); - return _mm_and_si128(temp, maskz); - } - - // general case -#ifdef __XOP__ // Use AMD XOP instruction PPERM - __m128i mask = constant4i < - (i0 < 0 ? 0x8080 : (i0*2 & 31) | ((i0*2 & 31)+1)<<8) | (i1 < 0 ? 0x80800000 : ((i1*2 & 31)<<16) | ((i1*2 & 31)+1)<<24), - (i2 < 0 ? 0x8080 : (i2*2 & 31) | ((i2*2 & 31)+1)<<8) | (i3 < 0 ? 0x80800000 : ((i3*2 & 31)<<16) | ((i3*2 & 31)+1)<<24), - (i4 < 0 ? 0x8080 : (i4*2 & 31) | ((i4*2 & 31)+1)<<8) | (i5 < 0 ? 
0x80800000 : ((i5*2 & 31)<<16) | ((i5*2 & 31)+1)<<24), - (i6 < 0 ? 0x8080 : (i6*2 & 31) | ((i6*2 & 31)+1)<<8) | (i7 < 0 ? 0x80800000 : ((i7*2 & 31)<<16) | ((i7*2 & 31)+1)<<24) > (); - return _mm_perm_epi8(a, b, mask); -#else - // combine two permutes - __m128i a1 = permute8s < - (uint32_t)i0 < 8 ? i0 : -1, - (uint32_t)i1 < 8 ? i1 : -1, - (uint32_t)i2 < 8 ? i2 : -1, - (uint32_t)i3 < 8 ? i3 : -1, - (uint32_t)i4 < 8 ? i4 : -1, - (uint32_t)i5 < 8 ? i5 : -1, - (uint32_t)i6 < 8 ? i6 : -1, - (uint32_t)i7 < 8 ? i7 : -1 > (a); - __m128i b1 = permute8s < - (uint32_t)(i0^8) < 8 ? (i0^8) : -1, - (uint32_t)(i1^8) < 8 ? (i1^8) : -1, - (uint32_t)(i2^8) < 8 ? (i2^8) : -1, - (uint32_t)(i3^8) < 8 ? (i3^8) : -1, - (uint32_t)(i4^8) < 8 ? (i4^8) : -1, - (uint32_t)(i5^8) < 8 ? (i5^8) : -1, - (uint32_t)(i6^8) < 8 ? (i6^8) : -1, - (uint32_t)(i7^8) < 8 ? (i7^8) : -1 > (b); - return _mm_or_si128(a1,b1); - -#endif -} - -template -static inline Vec8us blend8us(Vec8us const & a, Vec8us const & b) { - return Vec8us(blend8s (a,b)); -} - -template -static inline Vec4i blend4i(Vec4i const & a, Vec4i const & b) { - - // Combine all the indexes into a single bitfield, with 8 bits for each - const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24; - - // Mask to zero out negative indexes - const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24; - - // Some elements must be set to zero - const bool do_zero = (mz != -1) && ((i0 | i1 | i2 | i3) & 0x80) != 0; - - // temp contains temporary result, some zeroing needs to be done - bool zeroing_pending = false; - - // partially finished result - __m128i temp; -#if defined (_MSC_VER) || defined (__clang__) - temp = a; // avoid spurious warning message for temp unused -#endif - - // special case: no elements from b - if ((m1 & 0x04040404 & mz) == 0) { - return permute4i(a); - } - - // special case: no elements from a - if (((m1^0x04040404) & 0x04040404 & mz) == 0) { - return permute4i(b); - } - - // special case: PUNPCKLDQ - if (((m1 ^ 0x05010400) & mz) == 0) { - temp = _mm_unpacklo_epi32(a, b); - if (do_zero) zeroing_pending = true; else return temp; - } - if (((m1 ^ 0x01050004) & mz) == 0) { - temp = _mm_unpacklo_epi32(b, a); - if (do_zero) zeroing_pending = true; else return temp; - } - - // special case: PUNPCKHDQ - if (((m1 ^ 0x07030602) & mz) == 0) { - temp = _mm_unpackhi_epi32(a, b); - if (do_zero) zeroing_pending = true; else return temp; - } - if (((m1 ^ 0x03070206) & mz) == 0) { - temp = _mm_unpackhi_epi32(b, a); - if (do_zero) zeroing_pending = true; else return temp; - } - -#if INSTRSET >= 4 // SSSE3 - // special case: shift left - if (i0 > 0 && i0 < 4 && ((m1 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0) { - temp = _mm_alignr_epi8(b, a, (i0 & 3) * 4); - if (do_zero) zeroing_pending = true; else return temp; - } - - // special case: shift right - if (i0 > 4 && i0 < 8 && ((m1 ^ 0x04040404 ^ ((i0 & 3) * 0x01010101u + 0x03020100u)) & mz) == 0) { - temp = _mm_alignr_epi8(a, b, (i0 & 3) * 4); - if (do_zero) zeroing_pending = true; else return temp; - } -#endif // SSSE3 - -#if INSTRSET >= 5 // SSE4.1 supported - if ((((m1 & ~0x04040404) ^ 0x03020100) & mz) == 0) { - // blending without permuting - temp = _mm_blend_epi16(a, b, ((i0>>2)&1)*3 | ((((i1>>2)&1)*3)<<2) | ((((i2>>2)&1)*3)<<4) | ((((i3>>2)&1)*3)<<6)); - if (do_zero) zeroing_pending = true; else return temp; - } -#endif // SSE4.1 - - if (zeroing_pending) { - // additional zeroing of temp needed - __m128i maskz = constant4i < (i0 < 0 
? 0 : -1), (i1 < 0 ? 0 : -1), (i2 < 0 ? 0 : -1), (i3 < 0 ? 0 : -1) > (); - return _mm_and_si128(temp, maskz); - } - - // general case -#ifdef __XOP__ // Use AMD XOP instruction PPERM - __m128i mask = constant4i < - i0 < 0 ? 0x80808080 : (i0*4 & 31) + (((i0*4 & 31) + 1) << 8) + (((i0*4 & 31) + 2) << 16) + (((i0*4 & 31) + 3) << 24), - i1 < 0 ? 0x80808080 : (i1*4 & 31) + (((i1*4 & 31) + 1) << 8) + (((i1*4 & 31) + 2) << 16) + (((i1*4 & 31) + 3) << 24), - i2 < 0 ? 0x80808080 : (i2*4 & 31) + (((i2*4 & 31) + 1) << 8) + (((i2*4 & 31) + 2) << 16) + (((i2*4 & 31) + 3) << 24), - i3 < 0 ? 0x80808080 : (i3*4 & 31) + (((i3*4 & 31) + 1) << 8) + (((i3*4 & 31) + 2) << 16) + (((i3*4 & 31) + 3) << 24) > (); - return _mm_perm_epi8(a, b, mask); - -#else // combine two permutes - __m128i a1 = permute4i < - (uint32_t)i0 < 4 ? i0 : -1, - (uint32_t)i1 < 4 ? i1 : -1, - (uint32_t)i2 < 4 ? i2 : -1, - (uint32_t)i3 < 4 ? i3 : -1 > (a); - __m128i b1 = permute4i < - (uint32_t)(i0^4) < 4 ? (i0^4) : -1, - (uint32_t)(i1^4) < 4 ? (i1^4) : -1, - (uint32_t)(i2^4) < 4 ? (i2^4) : -1, - (uint32_t)(i3^4) < 4 ? (i3^4) : -1 > (b); - return _mm_or_si128(a1,b1); -#endif -} - -template -static inline Vec4ui blend4ui(Vec4ui const & a, Vec4ui const & b) { - return Vec4ui (blend4i (a,b)); -} - -template -static inline Vec2q blend2q(Vec2q const & a, Vec2q const & b) { - - // Combine all the indexes into a single bitfield, with 8 bits for each - const int m1 = (i0&3) | (i1&3)<<8; - - // Mask to zero out negative indexes - const int mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8; - - // no elements from b - if ((m1 & 0x0202 & mz) == 0) { - return permute2q (a); - } - // no elements from a - if (((m1^0x0202) & 0x0202 & mz) == 0) { - return permute2q (b); - } - // (all cases where one index is -1 or -256 would go to the above cases) - - // special case: PUNPCKLQDQ - if (i0 == 0 && i1 == 2) { - return _mm_unpacklo_epi64(a, b); - } - if (i0 == 2 && i1 == 0) { - return _mm_unpacklo_epi64(b, a); - } - // special case: PUNPCKHQDQ - if (i0 == 1 && i1 == 3) { - return _mm_unpackhi_epi64(a, b); - } - if (i0 == 3 && i1 == 1) { - return _mm_unpackhi_epi64(b, a); - } - -#if INSTRSET >= 4 // SSSE3 - // special case: shift left - if (i0 == 1 && i1 == 2) { - return _mm_alignr_epi8(b, a, 8); - } - // special case: shift right - if (i0 == 3 && i1 == 0) { - return _mm_alignr_epi8(a, b, 8); - } -#endif // SSSE3 - -#if INSTRSET >= 5 // SSE4.1 supported - if (((m1 & ~0x0202) ^ 0x0100) == 0 && mz == 0xFFFF) { - // blending without permuting - return _mm_blend_epi16(a, b, (i0>>1 & 1) * 0xF | ((i1>>1 & 1) * 0xF) << 4 ); - } -#endif // SSE4.1 - - // general case. combine two permutes - // (all cases are caught by the above special cases if SSE4.1 or higher is supported) - __m128i a1, b1; - a1 = permute2q <(uint32_t)i0 < 2 ? i0 : -1, (uint32_t)i1 < 2 ? i1 : -1 > (a); - b1 = permute2q <(uint32_t)(i0^2) < 2 ? (i0^2) : -1, (uint32_t)(i1^2) < 2 ? (i1^2) : -1 > (b); - return _mm_or_si128(a1,b1); -} - -template -static inline Vec2uq blend2uq(Vec2uq const & a, Vec2uq const & b) { - return Vec2uq (blend2q ((__m128i)a, (__m128i)b)); -} - - - -/***************************************************************************** -* -* Vector lookup functions -* -****************************************************************************** -* -* These functions use vector elements as indexes into a table. -* The table is given as one or more vectors or as an array. 
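*
* In scalar terms each lookup is a gather (editorial sketch with assumed names, not part
* of the original source). Several of the non-SSSE3 fallbacks below do exactly this,
* masking the index so it always lands inside the table:
*
*   void ref_lookup4(const int32_t idx[4], const int32_t table[4], int32_t out[4]) {
*       for (int j = 0; j < 4; j++)
*           out[j] = table[idx[j] & 3];   // masking is one possible treatment of an
*   }                                     // out-of-range index (see the note below)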
-* -* This can be used for several purposes: -* - table lookup -* - permute or blend with variable indexes -* - blend from more than two sources -* - gather non-contiguous data -* -* An index out of range may produce any value - the actual value produced is -* implementation dependent and may be different for different instruction -* sets. An index out of range does not produce an error message or exception. -* -* Example: -* Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3) -* Vec4i b(100,101,102,103); // table b is (100, 101, 102, 103) -* Vec4i c; -* c = lookup4 (a,b); // c is (102, 100, 100, 103) -* -*****************************************************************************/ - -static inline Vec16c lookup16(Vec16c const & index, Vec16c const & table) { -#if INSTRSET >= 5 // SSSE3 - return _mm_shuffle_epi8(table, index); -#else - uint8_t ii[16]; - int8_t tt[16], rr[16]; - table.store(tt); index.store(ii); - for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x0F]; - return Vec16c().load(rr); -#endif -} - -static inline Vec16c lookup32(Vec16c const & index, Vec16c const & table0, Vec16c const & table1) { -#ifdef __XOP__ // AMD XOP instruction set. Use VPPERM - return _mm_perm_epi8(table0, table1, index); -#elif INSTRSET >= 5 // SSSE3 - Vec16c r0 = _mm_shuffle_epi8(table0, index + 0x70); // make negative index for values >= 16 - Vec16c r1 = _mm_shuffle_epi8(table1, (index ^ 0x10) + 0x70); // make negative index for values < 16 - return r0 | r1; -#else - uint8_t ii[16]; - int8_t tt[16], rr[16]; - table0.store(tt); table1.store(tt+16); index.store(ii); - for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x1F]; - return Vec16c().load(rr); -#endif -} - -template -static inline Vec16c lookup(Vec16c const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 16) return lookup16(index, Vec16c().load(table)); - if (n <= 32) return lookup32(index, Vec16c().load(table), Vec16c().load((int8_t*)table + 16)); - // n > 32. Limit index - Vec16uc index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec16uc(index) & uint8_t(n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec16uc(index), uint8_t(n-1)); - } - uint8_t ii[16]; index1.store(ii); - int8_t rr[16]; - for (int j = 0; j < 16; j++) { - rr[j] = ((int8_t*)table)[ii[j]]; - } - return Vec16c().load(rr); -} - -static inline Vec8s lookup8(Vec8s const & index, Vec8s const & table) { -#if INSTRSET >= 5 // SSSE3 - return _mm_shuffle_epi8(table, index * 0x202 + 0x100); -#else - int16_t ii[8], tt[8], rr[8]; - table.store(tt); index.store(ii); - for (int j = 0; j < 8; j++) rr[j] = tt[ii[j] & 0x07]; - return Vec8s().load(rr); -#endif -} - -static inline Vec8s lookup16(Vec8s const & index, Vec8s const & table0, Vec8s const & table1) { -#ifdef __XOP__ // AMD XOP instruction set. 
Use VPPERM - return _mm_perm_epi8(table0, table1, index * 0x202 + 0x100); -#elif INSTRSET >= 5 // SSSE3 - Vec8s r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x202) + Vec16c(Vec8s(0x7170))); - Vec8s r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x202 ^ 0x1010) + Vec16c(Vec8s(0x7170))); - return r0 | r1; -#else - int16_t ii[16], tt[32], rr[16]; - table0.store(tt); table1.store(tt+8); index.store(ii); - for (int j = 0; j < 16; j++) rr[j] = tt[ii[j] & 0x1F]; - return Vec8s().load(rr); -#endif -} - -template -static inline Vec8s lookup(Vec8s const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 8) return lookup8 (index, Vec8s().load(table)); - if (n <= 16) return lookup16(index, Vec8s().load(table), Vec8s().load((int16_t*)table + 8)); - // n > 16. Limit index - Vec8us index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec8us(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec8us(index), n-1); - } -#if INSTRSET >= 8 // AVX2. Use VPERMD - Vec8s t1 = _mm_i32gather_epi32((const int *)table, __m128i((Vec4i(index1)) & (Vec4i(0x0000FFFF))), 2); // even positions - Vec8s t2 = _mm_i32gather_epi32((const int *)table, _mm_srli_epi32(index1, 16) , 2); // odd positions - return blend8s<0,8,2,10,4,12,6,14>(t1, t2); -#else - uint16_t ii[8]; index1.store(ii); - return Vec8s(((int16_t*)table)[ii[0]], ((int16_t*)table)[ii[1]], ((int16_t*)table)[ii[2]], ((int16_t*)table)[ii[3]], - ((int16_t*)table)[ii[4]], ((int16_t*)table)[ii[5]], ((int16_t*)table)[ii[6]], ((int16_t*)table)[ii[7]]); -#endif -} - - -static inline Vec4i lookup4(Vec4i const & index, Vec4i const & table) { -#if INSTRSET >= 5 // SSSE3 - return _mm_shuffle_epi8(table, index * 0x04040404 + 0x03020100); -#else - return Vec4i(table[index[0]],table[index[1]],table[index[2]],table[index[3]]); -#endif -} - -static inline Vec4i lookup8(Vec4i const & index, Vec4i const & table0, Vec4i const & table1) { - // return Vec4i(lookup16(Vec8s(index * 0x20002 + 0x10000), Vec8s(table0), Vec8s(table1))); -#ifdef __XOP__ // AMD XOP instruction set. Use VPPERM - return _mm_perm_epi8(table0, table1, index * 0x04040404 + 0x03020100); -#elif INSTRSET >= 8 // AVX2. Use VPERMD - __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector - -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! 
defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order - return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01)); -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 also has operands in wrong order - return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index), table01)); -#else - return _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index))); -#endif // bug - -#elif INSTRSET >= 4 // SSSE3 - Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c(index * 0x04040404) + Vec16c(Vec4i(0x73727170))); - Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c(index * 0x04040404 ^ 0x10101010) + Vec16c(Vec4i(0x73727170))); - return r0 | r1; -#else // SSE2 - int32_t ii[4], tt[8], rr[4]; - table0.store(tt); table1.store(tt+4); index.store(ii); - for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x07]; - return Vec4i().load(rr); -#endif -} - -static inline Vec4i lookup16(Vec4i const & index, Vec4i const & table0, Vec4i const & table1, Vec4i const & table2, Vec4i const & table3) { -#if INSTRSET >= 8 // AVX2. Use VPERMD - __m256i table01 = _mm256_inserti128_si256(_mm256_castsi128_si256(table0), table1, 1); // join tables into 256 bit vector - __m256i table23 = _mm256_inserti128_si256(_mm256_castsi128_si256(table2), table3, 1); // join tables into 256 bit vector -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order - __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ), table01)); - __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23)); -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 also has operands in wrong order - __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ), table01)); - __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(_mm256_castsi128_si256(index ^ 8), table23)); -#else - __m128i r0 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table01, _mm256_castsi128_si256(index))); - __m128i r1 = _mm256_castsi256_si128(_mm256_permutevar8x32_epi32(table23, _mm256_castsi128_si256(index ^ 8))); -#endif // bug - return _mm_blendv_epi8(r0, r1, index > 8); - -#elif defined (__XOP__) // AMD XOP instruction set. 
Use VPPERM - Vec4i r0 = _mm_perm_epi8(table0, table1, ((index ) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu); - Vec4i r1 = _mm_perm_epi8(table2, table3, ((index ^ 8) * 0x04040404u + 0x63626160u) & 0X9F9F9F9Fu); - return r0 | r1; - -#elif INSTRSET >= 5 // SSSE3 - Vec16c aa = Vec16c(Vec4i(0x73727170)); - Vec4i r0 = _mm_shuffle_epi8(table0, Vec16c((index ) * 0x04040404) + aa); - Vec4i r1 = _mm_shuffle_epi8(table1, Vec16c((index ^ 4) * 0x04040404) + aa); - Vec4i r2 = _mm_shuffle_epi8(table2, Vec16c((index ^ 8) * 0x04040404) + aa); - Vec4i r3 = _mm_shuffle_epi8(table3, Vec16c((index ^ 12) * 0x04040404) + aa); - return (r0 | r1) | (r2 | r3); - -#else // SSE2 - int32_t ii[4], tt[16], rr[4]; - table0.store(tt); table1.store(tt+4); table2.store(tt+8); table3.store(tt+12); - index.store(ii); - for (int j = 0; j < 4; j++) rr[j] = tt[ii[j] & 0x0F]; - return Vec4i().load(rr); -#endif -} - -template -static inline Vec4i lookup(Vec4i const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 4) return lookup4(index, Vec4i().load(table)); - if (n <= 8) return lookup8(index, Vec4i().load(table), Vec4i().load((int32_t*)table + 4)); - // n > 8. Limit index - Vec4ui index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec4ui(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec4ui(index), n-1); - } -#if INSTRSET >= 8 // AVX2. Use VPERMD - return _mm_i32gather_epi32((const int *)table, index1, 4); -#else - uint32_t ii[4]; index1.store(ii); - return Vec4i(((int32_t*)table)[ii[0]], ((int32_t*)table)[ii[1]], ((int32_t*)table)[ii[2]], ((int32_t*)table)[ii[3]]); -#endif -} - - -static inline Vec2q lookup2(Vec2q const & index, Vec2q const & table) { -#if INSTRSET >= 5 // SSSE3 - return _mm_shuffle_epi8(table, index * 0x0808080808080808ll + 0x0706050403020100ll); -#else - int64_t ii[2], tt[2]; - table.store(tt); index.store(ii); - return Vec2q(tt[int(ii[0])], tt[int(ii[1])]); -#endif -} - -template -static inline Vec2q lookup(Vec2q const & index, void const * table) { - if (n <= 0) return 0; - // n > 0. Limit index - Vec2uq index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec2uq(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1. - // There is no 64-bit min instruction, but we can use the 32-bit unsigned min, - // since n is a 32-bit integer - index1 = Vec2uq(min(Vec2uq(index), constant4i())); - } - uint32_t ii[4]; index1.store(ii); // use only lower 32 bits of each index - int64_t const * tt = (int64_t const *)table; - return Vec2q(tt[ii[0]], tt[ii[2]]); -} - - -/***************************************************************************** -* -* Other permutations with variable indexes -* -*****************************************************************************/ - -// Function shift_bytes_up: shift whole vector left by b bytes. 
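For instance, a small sketch (assuming the 16-argument Vec16c constructor from earlier in this header; the byte values are arbitrary). Shifting by 3 moves every byte three positions toward the high end and zero-fills the vacated low bytes:

Vec16c bytes(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
Vec16c up = shift_bytes_up(bytes, 3);   // up = (0, 0, 0, 0, 1, 2, ..., 12)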
-// You may use a permute function instead if b is a compile-time constant -static inline Vec16c shift_bytes_up(Vec16c const & a, int b) { - if ((uint32_t)b > 15) return _mm_setzero_si128(); -#if INSTRSET >= 4 // SSSE3 - static const char mask[32] = {-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask+16-b))); -#else - Vec2uq a1 = Vec2uq(a); - if (b < 8) { - a1 = (a1 << (b*8)) | (permute2uq<-1,0>(a1) >> (64 - (b*8))); - } - else { - a1 = permute2uq<-1,0>(a1) << ((b-8)*8); - } - return Vec16c(a1); -#endif -} - -// Function shift_bytes_down: shift whole vector right by b bytes -// You may use a permute function instead if b is a compile-time constant -static inline Vec16c shift_bytes_down(Vec16c const & a, int b) { - if ((uint32_t)b > 15) return _mm_setzero_si128(); -#if INSTRSET >= 4 // SSSE3 - static const char mask[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}; - return Vec16c(_mm_shuffle_epi8(a, Vec16c().load(mask+b))); -#else - Vec2uq a1 = Vec2uq(a); - if (b < 8) { - a1 = (a1 >> (b*8)) | (permute2uq<1,-1>(a1) << (64 - (b*8))); - } - else { - a1 = permute2uq<1,-1>(a1) >> ((b-8)*8); - } - return Vec16c(a1); -#endif -} - -/***************************************************************************** -* -* Gather functions with fixed indexes -* -*****************************************************************************/ -// Load elements from array a with indices i0, i1, i2, i3 -template -static inline Vec4i gather4i(void const * a) { - Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index; // Error message if index is negative - const int i01min = i0 < i1 ? i0 : i1; - const int i23min = i2 < i3 ? i2 : i3; - const int imin = i01min < i23min ? i01min : i23min; - const int i01max = i0 > i1 ? i0 : i1; - const int i23max = i2 > i3 ? i2 : i3; - const int imax = i01max > i23max ? i01max : i23max; - if (imax - imin <= 3) { - // load one contiguous block and permute - if (imax > 3) { - // make sure we don't read past the end of the array - Vec4i b = Vec4i().load((int32_t const *)a + imax-3); - return permute4i(b); - } - else { - Vec4i b = Vec4i().load((int32_t const *)a + imin); - return permute4i(b); - } - } - if ((i0imax-4) && (i1imax-4) && (i2imax-4) && (i3imax-4)) { - // load two contiguous blocks and blend - Vec4i b = Vec4i().load((int32_t const *)a + imin); - Vec4i c = Vec4i().load((int32_t const *)a + imax-3); - const int j0 = i0(b, c); - } - // use AVX2 gather if available -#if INSTRSET >= 8 - return _mm_i32gather_epi32((const int *)a, Vec4i(i0,i1,i2,i3), 4); -#else - return lookup(Vec4i(i0,i1,i2,i3), a); -#endif -} - -// Load elements from array a with indices i0, i1 -template -static inline Vec2q gather2q(void const * a) { - Static_error_check<(i0|i1)>=0> Negative_array_index; // Error message if index is negative - const int imin = i0 < i1 ? i0 : i1; - const int imax = i0 > i1 ? 
i0 : i1; - if (imax - imin <= 1) { - // load one contiguous block and permute - if (imax > 1) { - // make sure we don't read past the end of the array - Vec2q b = Vec2q().load((int64_t const *)a + imax-1); - return permute2q(b); - } - else { - Vec2q b = Vec2q().load((int64_t const *)a + imin); - return permute2q(b); - } - } - return Vec2q(((int64_t*)a)[i0], ((int64_t*)a)[i1]); -} - - -/***************************************************************************** -* -* Vector scatter functions -* -****************************************************************************** -* -* These functions write the elements of a vector to arbitrary positions in an -* array in memory. Each vector element is written to an array position -* determined by an index. An element is not written if the corresponding -* index is out of range. -* The indexes can be specified as constant template parameters or as an -* integer vector. -* -* The scatter functions are useful if the data are distributed in a sparce -* manner into the array. If the array is dense then it is more efficient -* to permute the data into the right positions and then write the whole -* permuted vector into the array. -* -* Example: -* Vec8q a(10,11,12,13,14,15,16,17); -* int64_t b[16] = {0}; -* scatter<0,2,14,10,1,-1,5,9>(a,b); -* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0} -* -*****************************************************************************/ - -template -static inline void scatter(Vec4i const & data, void * array) { -#if defined (__AVX512VL__) - __m128i indx = constant4i(); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3); - _mm_mask_i32scatter_epi32((int*)array, mask, indx, data, 4); -#else - int32_t* arr = (int32_t*)array; - const int index[4] = {i0,i1,i2,i3}; - for (int i = 0; i < 4; i++) { - if (index[i] >= 0) arr[index[i]] = data[i]; - } -#endif -} - -template -static inline void scatter(Vec2q const & data, void * array) { - int64_t* arr = (int64_t*)array; - if (i0 >= 0) arr[i0] = data[0]; - if (i1 >= 0) arr[i1] = data[1]; -} - -static inline void scatter(Vec4i const & index, uint32_t limit, Vec4i const & data, void * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit)); - _mm_mask_i32scatter_epi32((int*)array, mask, index, data, 4); -#else - int32_t* arr = (int32_t*)array; - for (int i = 0; i < 4; i++) { - if (uint32_t(index[i]) < limit) arr[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec2q const & index, uint32_t limit, Vec2q const & data, void * array) { - int64_t* arr = (int64_t*)array; - if (uint64_t(index[0]) < uint64_t(limit)) arr[index[0]] = data[0]; - if (uint64_t(index[1]) < uint64_t(limit)) arr[index[1]] = data[1]; -} - -static inline void scatter(Vec4i const & index, uint32_t limit, Vec2q const & data, void * array) { - int64_t* arr = (int64_t*)array; - if (uint32_t(index[0]) < limit) arr[index[0]] = data[0]; - if (uint32_t(index[1]) < limit) arr[index[1]] = data[1]; -} - -/***************************************************************************** -* -* Functions for conversion between integer sizes -* -*****************************************************************************/ - -// Extend 8-bit integers to 16-bit integers, signed and unsigned - -// Function extend_low : extends the low 8 elements to 16 bits with sign extension -static inline Vec8s extend_low (Vec16c const & a) { - __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(),a); // 0 > a - return _mm_unpacklo_epi8(a,sign); // interleave with 
sign extensions -} - -// Function extend_high : extends the high 8 elements to 16 bits with sign extension -static inline Vec8s extend_high (Vec16c const & a) { - __m128i sign = _mm_cmpgt_epi8(_mm_setzero_si128(),a); // 0 > a - return _mm_unpackhi_epi8(a,sign); // interleave with sign extensions -} - -// Function extend_low : extends the low 8 elements to 16 bits with zero extension -static inline Vec8us extend_low (Vec16uc const & a) { - return _mm_unpacklo_epi8(a,_mm_setzero_si128()); // interleave with zero extensions -} - -// Function extend_high : extends the high 8 elements to 16 bits with zero extension -static inline Vec8us extend_high (Vec16uc const & a) { - return _mm_unpackhi_epi8(a,_mm_setzero_si128()); // interleave with zero extensions -} - -// Extend 16-bit integers to 32-bit integers, signed and unsigned - -// Function extend_low : extends the low 4 elements to 32 bits with sign extension -static inline Vec4i extend_low (Vec8s const & a) { - __m128i sign = _mm_srai_epi16(a,15); // sign bit - return _mm_unpacklo_epi16(a,sign); // interleave with sign extensions -} - -// Function extend_high : extends the high 4 elements to 32 bits with sign extension -static inline Vec4i extend_high (Vec8s const & a) { - __m128i sign = _mm_srai_epi16(a,15); // sign bit - return _mm_unpackhi_epi16(a,sign); // interleave with sign extensions -} - -// Function extend_low : extends the low 4 elements to 32 bits with zero extension -static inline Vec4ui extend_low (Vec8us const & a) { - return _mm_unpacklo_epi16(a,_mm_setzero_si128()); // interleave with zero extensions -} - -// Function extend_high : extends the high 4 elements to 32 bits with zero extension -static inline Vec4ui extend_high (Vec8us const & a) { - return _mm_unpackhi_epi16(a,_mm_setzero_si128()); // interleave with zero extensions -} - -// Extend 32-bit integers to 64-bit integers, signed and unsigned - -// Function extend_low : extends the low 2 elements to 64 bits with sign extension -static inline Vec2q extend_low (Vec4i const & a) { - __m128i sign = _mm_srai_epi32(a,31); // sign bit - return _mm_unpacklo_epi32(a,sign); // interleave with sign extensions -} - -// Function extend_high : extends the high 2 elements to 64 bits with sign extension -static inline Vec2q extend_high (Vec4i const & a) { - __m128i sign = _mm_srai_epi32(a,31); // sign bit - return _mm_unpackhi_epi32(a,sign); // interleave with sign extensions -} - -// Function extend_low : extends the low 2 elements to 64 bits with zero extension -static inline Vec2uq extend_low (Vec4ui const & a) { - return _mm_unpacklo_epi32(a,_mm_setzero_si128()); // interleave with zero extensions -} - -// Function extend_high : extends the high 2 elements to 64 bits with zero extension -static inline Vec2uq extend_high (Vec4ui const & a) { - return _mm_unpackhi_epi32(a,_mm_setzero_si128()); // interleave with zero extensions -} - -// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation - -// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers -// Overflow wraps around -static inline Vec16c compress (Vec8s const & low, Vec8s const & high) { - __m128i mask = _mm_set1_epi32(0x00FF00FF); // mask for low bytes - __m128i lowm = _mm_and_si128(low,mask); // bytes of low - __m128i highm = _mm_and_si128(high,mask); // bytes of high - return _mm_packus_epi16(lowm,highm); // unsigned pack -} - -// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers -// Signed, with saturation 
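A common pattern with these conversions is to widen 8-bit data with extend_low/extend_high, do the arithmetic at 16-bit precision, and narrow the two halves back with one of the compress variants; a minimal sketch using the saturating form defined next (the input values and the doubling step are illustrative only):

Vec16c in(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 125, 126, 127);
Vec8s  lo = extend_low(in);              // low 8 elements widened to 16 bits
Vec8s  hi = extend_high(in);             // high 8 elements widened to 16 bits
lo += lo;                                // 16-bit arithmetic that can exceed the 8-bit range
hi += hi;
Vec16c out = compress_saturated(lo, hi); // narrow back; results above 127 saturate to 127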
-static inline Vec16c compress_saturated (Vec8s const & low, Vec8s const & high) { - return _mm_packs_epi16(low,high); -} - -// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers -// Unsigned, overflow wraps around -static inline Vec16uc compress (Vec8us const & low, Vec8us const & high) { - return Vec16uc (compress((Vec8s)low, (Vec8s)high)); -} - -// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers -// Unsigned, with saturation -static inline Vec16uc compress_saturated (Vec8us const & low, Vec8us const & high) { -#if INSTRSET >= 5 // SSE4.1 supported - __m128i maxval = _mm_set1_epi32(0x00FF00FF); // maximum value - __m128i minval = _mm_setzero_si128(); // minimum value = 0 - __m128i low1 = _mm_min_epu16(low,maxval); // upper limit - __m128i high1 = _mm_min_epu16(high,maxval); // upper limit - __m128i low2 = _mm_max_epu16(low1,minval); // lower limit - __m128i high2 = _mm_max_epu16(high1,minval); // lower limit - return _mm_packus_epi16(low2,high2); // this instruction saturates from signed 32 bit to unsigned 16 bit -#else - __m128i zero = _mm_setzero_si128(); // 0 - __m128i signlow = _mm_cmpgt_epi16(zero,low); // sign bit of low - __m128i signhi = _mm_cmpgt_epi16(zero,high); // sign bit of high - __m128i slow2 = _mm_srli_epi16(signlow,8); // FF if low negative - __m128i shigh2 = _mm_srli_epi16(signhi,8); // FF if high negative - __m128i maskns = _mm_set1_epi32(0x7FFF7FFF); // mask for removing sign bit - __m128i lowns = _mm_and_si128(low,maskns); // low, with sign bit removed - __m128i highns = _mm_and_si128(high,maskns); // high, with sign bit removed - __m128i lowo = _mm_or_si128(lowns,slow2); // low, sign bit replaced by 00FF - __m128i higho = _mm_or_si128(highns,shigh2); // high, sign bit replaced by 00FF - return _mm_packus_epi16(lowo,higho); // this instruction saturates from signed 16 bit to unsigned 8 bit -#endif -} - -// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers -// Signed to unsigned, with saturation -static inline Vec16uc compress_saturated_s2u (Vec8s const & low, Vec8s const & high) { - return _mm_packus_epi16(low,high); // this instruction saturates from signed 16 bit to unsigned 8 bit -} - -// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Overflow wraps around -static inline Vec8s compress (Vec4i const & low, Vec4i const & high) { -#if INSTRSET >= 5 // SSE4.1 supported - __m128i mask = _mm_set1_epi32(0x0000FFFF); // mask for low words - __m128i lowm = _mm_and_si128(low,mask); // bytes of low - __m128i highm = _mm_and_si128(high,mask); // bytes of high - return _mm_packus_epi32(lowm,highm); // unsigned pack -#else - __m128i low1 = _mm_shufflelo_epi16(low,0xD8); // low words in place - __m128i high1 = _mm_shufflelo_epi16(high,0xD8); // low words in place - __m128i low2 = _mm_shufflehi_epi16(low1,0xD8); // low words in place - __m128i high2 = _mm_shufflehi_epi16(high1,0xD8); // low words in place - __m128i low3 = _mm_shuffle_epi32(low2,0xD8); // low dwords of low to pos. 0 and 32 - __m128i high3 = _mm_shuffle_epi32(high2,0xD8); // low dwords of high to pos. 
0 and 32 - return _mm_unpacklo_epi64(low3,high3); // interleave -#endif -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Signed with saturation -static inline Vec8s compress_saturated (Vec4i const & low, Vec4i const & high) { - return _mm_packs_epi32(low,high); // pack with signed saturation -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Overflow wraps around -static inline Vec8us compress (Vec4ui const & low, Vec4ui const & high) { - return Vec8us (compress((Vec4i)low, (Vec4i)high)); -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Unsigned, with saturation -static inline Vec8us compress_saturated (Vec4ui const & low, Vec4ui const & high) { -#if INSTRSET >= 5 // SSE4.1 supported - __m128i maxval = _mm_set1_epi32(0x0000FFFF); // maximum value - __m128i minval = _mm_setzero_si128(); // minimum value = 0 - __m128i low1 = _mm_min_epu32(low,maxval); // upper limit - __m128i high1 = _mm_min_epu32(high,maxval); // upper limit - __m128i low2 = _mm_max_epu32(low1,minval); // lower limit - __m128i high2 = _mm_max_epu32(high1,minval); // lower limit - return _mm_packus_epi32(low2,high2); // this instruction saturates from signed 32 bit to unsigned 16 bit -#else - __m128i zero = _mm_setzero_si128(); // 0 - __m128i lowzero = _mm_cmpeq_epi16(low,zero); // for each word is zero - __m128i highzero = _mm_cmpeq_epi16(high,zero); // for each word is zero - __m128i mone = _mm_set1_epi32(-1); // FFFFFFFF - __m128i lownz = _mm_xor_si128(lowzero,mone); // for each word is nonzero - __m128i highnz = _mm_xor_si128(highzero,mone); // for each word is nonzero - __m128i lownz2 = _mm_srli_epi32(lownz,16); // shift down to low dword - __m128i highnz2 = _mm_srli_epi32(highnz,16); // shift down to low dword - __m128i lowsatur = _mm_or_si128(low,lownz2); // low, saturated - __m128i hisatur = _mm_or_si128(high,highnz2); // high, saturated - return Vec8us (compress(Vec4i(lowsatur), Vec4i(hisatur))); -#endif -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Signed to unsigned, with saturation -static inline Vec8us compress_saturated_s2u (Vec4i const & low, Vec4i const & high) { -#if INSTRSET >= 5 // SSE4.1 supported - return _mm_packus_epi32(low,high); // this instruction saturates from signed 32 bit to unsigned 16 bit -#else - __m128i val_32 = _mm_set1_epi32(0x8000); - __m128i val_16 = _mm_set1_epi16(0x8000); - __m128i low1 = _mm_sub_epi32(low,val_32); - __m128i high1 = _mm_sub_epi32(high,val_32); - return _mm_add_epi16(_mm_packs_epi32(low1,high1),val_16); -#endif -} - -// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation - -// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers -// Overflow wraps around -static inline Vec4i compress (Vec2q const & low, Vec2q const & high) { - __m128i low2 = _mm_shuffle_epi32(low,0xD8); // low dwords of low to pos. 0 and 32 - __m128i high2 = _mm_shuffle_epi32(high,0xD8); // low dwords of high to pos. 
0 and 32 - return _mm_unpacklo_epi64(low2,high2); // interleave -} - -// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers -// Signed, with saturation -// This function is very inefficient unless the SSE4.2 instruction set is supported -static inline Vec4i compress_saturated (Vec2q const & low, Vec2q const & high) { - Vec2q maxval = _mm_set_epi32(0,0x7FFFFFFF,0,0x7FFFFFFF); - Vec2q minval = _mm_set_epi32(-1,0x80000000,-1,0x80000000); - Vec2q low1 = min(low,maxval); - Vec2q high1 = min(high,maxval); - Vec2q low2 = max(low1,minval); - Vec2q high2 = max(high1,minval); - return compress(low2,high2); -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Overflow wraps around -static inline Vec4ui compress (Vec2uq const & low, Vec2uq const & high) { - return Vec4ui (compress((Vec2q)low, (Vec2q)high)); -} - -// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers -// Unsigned, with saturation -static inline Vec4ui compress_saturated (Vec2uq const & low, Vec2uq const & high) { - __m128i zero = _mm_setzero_si128(); // 0 - __m128i lowzero = _mm_cmpeq_epi32(low,zero); // for each dword is zero - __m128i highzero = _mm_cmpeq_epi32(high,zero); // for each dword is zero - __m128i mone = _mm_set1_epi32(-1); // FFFFFFFF - __m128i lownz = _mm_xor_si128(lowzero,mone); // for each dword is nonzero - __m128i highnz = _mm_xor_si128(highzero,mone); // for each dword is nonzero - __m128i lownz2 = _mm_srli_epi64(lownz,32); // shift down to low dword - __m128i highnz2 = _mm_srli_epi64(highnz,32); // shift down to low dword - __m128i lowsatur = _mm_or_si128(low,lownz2); // low, saturated - __m128i hisatur = _mm_or_si128(high,highnz2); // high, saturated - return Vec4ui (compress(Vec2q(lowsatur), Vec2q(hisatur))); -} - -/***************************************************************************** -* -* Helper functions for division and bit scan -* -*****************************************************************************/ - -// Define popcount function. Gives sum of bits -#if INSTRSET >= 6 // SSE4.2 - // popcnt instruction is not officially part of the SSE4.2 instruction set, - // but available in all known processors with SSE4.2 -#if defined (__GNUC__) || defined(__clang__) -static inline uint32_t vml_popcnt (uint32_t a) __attribute__ ((pure)); -static inline uint32_t vml_popcnt (uint32_t a) { - uint32_t r; - __asm("popcnt %1, %0" : "=r"(r) : "r"(a) : ); - return r; -} -#else -static inline uint32_t vml_popcnt (uint32_t a) { - return _mm_popcnt_u32(a); // MS intrinsic -} -#endif // platform -#else // no SSE4.2 -static inline uint32_t vml_popcnt (uint32_t a) { - // popcnt instruction not available - uint32_t b = a - ((a >> 1) & 0x55555555); - uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333); - uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F; - uint32_t e = d * 0x01010101; - return e >> 24; -} -#endif - - -// Define bit-scan-forward function. Gives index to lowest set bit -#if defined (__GNUC__) || defined(__clang__) -static inline uint32_t bit_scan_forward (uint32_t a) __attribute__ ((pure)); -static inline uint32_t bit_scan_forward (uint32_t a) { - uint32_t r; - __asm("bsfl %1, %0" : "=r"(r) : "r"(a) : ); - return r; -} -#else -static inline uint32_t bit_scan_forward (uint32_t a) { - unsigned long r; - _BitScanForward(&r, a); // defined in intrin.h for MS and Intel compilers - return r; -} -#endif - -// Define bit-scan-reverse function. 
Gives index to highest set bit = floor(log2(a)) -#if defined (__GNUC__) || defined(__clang__) -static inline uint32_t bit_scan_reverse (uint32_t a) __attribute__ ((pure)); -static inline uint32_t bit_scan_reverse (uint32_t a) { - uint32_t r; - __asm("bsrl %1, %0" : "=r"(r) : "r"(a) : ); - return r; -} -#else -static inline uint32_t bit_scan_reverse (uint32_t a) { - unsigned long r; - _BitScanReverse(&r, a); // defined in intrin.h for MS and Intel compilers - return r; -} -#endif - -// Same function, for compile-time constants. -// We need template metaprogramming for calculating this function at compile time. -// This may take a long time to compile because of the template recursion. -// Todo: replace this with a constexpr function when C++14 becomes available -template -struct BitScanR { - enum {val = ( - n >= 0x10 ? 4 + (BitScanR<(n>>4)>::val) : - n < 2 ? 0 : - n < 4 ? 1 : - n < 8 ? 2 : 3 ) }; -}; -template <> struct BitScanR<0> {enum {val = 0};}; // Avoid infinite template recursion - -#define bit_scan_reverse_const(n) (BitScanR::val) // n must be a valid compile-time constant - - -/***************************************************************************** -* -* Integer division operators -* -****************************************************************************** -* -* The instruction set does not support integer vector division. Instead, we -* are using a method for fast integer division based on multiplication and -* shift operations. This method is faster than simple integer division if the -* same divisor is used multiple times. -* -* All elements in a vector are divided by the same divisor. It is not possible -* to divide different elements of the same vector by different divisors. -* -* The parameters used for fast division are stored in an object of a -* Divisor class. This object can be created implicitly, for example in: -* Vec4i a, b; int c; -* a = b / c; -* or explicitly as: -* a = b / Divisor_i(c); -* -* It takes more time to compute the parameters used for fast division than to -* do the division. Therefore, it is advantageous to use the same divisor object -* multiple times. For example, to divide 80 unsigned short integers by 10: -* -* uint16_t dividends[80], quotients[80]; // numbers to work with -* Divisor_us div10(10); // make divisor object for dividing by 10 -* Vec8us temp; // temporary vector -* for (int i = 0; i < 80; i += 8) { // loop for 4 elements per iteration -* temp.load(dividends+i); // load 4 elements -* temp /= div10; // divide each element by 10 -* temp.store(quotients+i); // store 4 elements -* } -* -* The parameters for fast division can also be computed at compile time. This is -* an advantage if the divisor is known at compile time. Use the const_int or const_uint -* macro to do this. For example, for signed integers: -* Vec8s a, b; -* a = b / const_int(10); -* Or, for unsigned integers: -* Vec8us a, b; -* a = b / const_uint(10); -* -* The division of a vector of 16-bit integers is faster than division of a vector -* of other integer sizes. -* -* -* Mathematical formula, used for signed division with fixed or variable divisor: -* (From T. Granlund and P. L. Montgomery: Division by Invariant Integers Using Multiplication, -* Proceedings of the SIGPLAN 1994 Conference on Programming Language Design and Implementation. 
-* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556 ) -* x = dividend -* d = abs(divisor) -* w = integer word size, bits -* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1 -* L = max(L,1) -* m = 1 + 2^(w+L-1)/d - 2^w [division should overflow to 0 if d = 1] -* sh1 = L-1 -* q = x + (m*x >> w) [high part of signed multiplication with 2w bits] -* q = (q >> sh1) - (x<0 ? -1 : 0) -* if (divisor < 0) q = -q -* result trunc(x/d) = q -* -* Mathematical formula, used for unsigned division with variable divisor: -* (Also from T. Granlund and P. L. Montgomery) -* x = dividend -* d = divisor -* w = integer word size, bits -* L = ceil(log2(d)) = bit_scan_reverse(d-1)+1 -* m = 1 + 2^w * (2^L-d) / d [2^L should overflow to 0 if L = w] -* sh1 = min(L,1) -* sh2 = max(L-1,0) -* t = m*x >> w [high part of unsigned multiplication with 2w bits] -* result floor(x/d) = (((x-t) >> sh1) + t) >> sh2 -* -* Mathematical formula, used for unsigned division with fixed divisor: -* (From Terje Mathisen, unpublished) -* x = dividend -* d = divisor -* w = integer word size, bits -* b = floor(log2(d)) = bit_scan_reverse(d) -* f = 2^(w+b) / d [exact division] -* If f is an integer then d is a power of 2 then go to case A -* If the fractional part of f is < 0.5 then go to case B -* If the fractional part of f is > 0.5 then go to case C -* Case A: [shift only] -* result = x >> b -* Case B: [round down f and compensate by adding one to x] -* result = ((x+1)*floor(f)) >> (w+b) [high part of unsigned multiplication with 2w bits] -* Case C: [round up f, no compensation for rounding error] -* result = (x*ceil(f)) >> (w+b) [high part of unsigned multiplication with 2w bits] -* -* -*****************************************************************************/ - -// encapsulate parameters for fast division on vector of 4 32-bit signed integers -class Divisor_i { -protected: - __m128i multiplier; // multiplier used in fast division - __m128i shift1; // shift count used in fast division - __m128i sign; // sign of divisor -public: - Divisor_i() {}; // Default constructor - Divisor_i(int32_t d) { // Constructor with divisor - set(d); - } - Divisor_i(int m, int s1, int sgn) { // Constructor with precalculated multiplier, shift and sign - multiplier = _mm_set1_epi32(m); - shift1 = _mm_cvtsi32_si128(s1); - sign = _mm_set1_epi32(sgn); - } - void set(int32_t d) { // Set or change divisor, calculate parameters - const int32_t d1 = ::abs(d); - int32_t sh, m; - if (d1 > 1) { - sh = bit_scan_reverse(d1-1); // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1 - m = int32_t((int64_t(1) << (32+sh)) / d1 - ((int64_t(1) << 32) - 1)); // calculate multiplier - } - else { - m = 1; // for d1 = 1 - sh = 0; - if (d == 0) m /= d; // provoke error here if d = 0 - if (uint32_t(d) == 0x80000000u) { // fix overflow for this special case - m = 0x80000001; - sh = 30; - } - } - multiplier = _mm_set1_epi32(m); // broadcast multiplier - shift1 = _mm_setr_epi32(sh, 0, 0, 0); // shift count - sign = _mm_set1_epi32(d < 0 ? 
-1 : 0); // sign of divisor - } - __m128i getm() const { // get multiplier - return multiplier; - } - __m128i gets1() const { // get shift count - return shift1; - } - __m128i getsign() const { // get sign of divisor - return sign; - } -}; - -// encapsulate parameters for fast division on vector of 4 32-bit unsigned integers -class Divisor_ui { -protected: - __m128i multiplier; // multiplier used in fast division - __m128i shift1; // shift count 1 used in fast division - __m128i shift2; // shift count 2 used in fast division -public: - Divisor_ui() {}; // Default constructor - Divisor_ui(uint32_t d) { // Constructor with divisor - set(d); - } - Divisor_ui(uint32_t m, int s1, int s2) { // Constructor with precalculated multiplier and shifts - multiplier = _mm_set1_epi32(m); - shift1 = _mm_setr_epi32(s1, 0, 0, 0); - shift2 = _mm_setr_epi32(s2, 0, 0, 0); - } - void set(uint32_t d) { // Set or change divisor, calculate parameters - uint32_t L, L2, sh1, sh2, m; - switch (d) { - case 0: - m = sh1 = sh2 = 1 / d; // provoke error for d = 0 - break; - case 1: - m = 1; sh1 = sh2 = 0; // parameters for d = 1 - break; - case 2: - m = 1; sh1 = 1; sh2 = 0; // parameters for d = 2 - break; - default: // general case for d > 2 - L = bit_scan_reverse(d-1)+1; // ceil(log2(d)) - L2 = L < 32 ? 1 << L : 0; // 2^L, overflow to 0 if L = 32 - m = 1 + uint32_t((uint64_t(L2 - d) << 32) / d); // multiplier - sh1 = 1; sh2 = L - 1; // shift counts - } - multiplier = _mm_set1_epi32(m); - shift1 = _mm_setr_epi32(sh1, 0, 0, 0); - shift2 = _mm_setr_epi32(sh2, 0, 0, 0); - } - __m128i getm() const { // get multiplier - return multiplier; - } - __m128i gets1() const { // get shift count 1 - return shift1; - } - __m128i gets2() const { // get shift count 2 - return shift2; - } -}; - - -// encapsulate parameters for fast division on vector of 8 16-bit signed integers -class Divisor_s { -protected: - __m128i multiplier; // multiplier used in fast division - __m128i shift1; // shift count used in fast division - __m128i sign; // sign of divisor -public: - Divisor_s() {}; // Default constructor - Divisor_s(int16_t d) { // Constructor with divisor - set(d); - } - Divisor_s(int16_t m, int s1, int sgn) { // Constructor with precalculated multiplier, shift and sign - multiplier = _mm_set1_epi16(m); - shift1 = _mm_setr_epi32(s1, 0, 0, 0); - sign = _mm_set1_epi32(sgn); - } - void set(int16_t d) { // Set or change divisor, calculate parameters - const int32_t d1 = ::abs(d); - int32_t sh, m; - if (d1 > 1) { - sh = bit_scan_reverse(d1-1); // shift count = ceil(log2(d1))-1 = (bit_scan_reverse(d1-1)+1)-1 - m = ((int32_t(1) << (16+sh)) / d1 - ((int32_t(1) << 16) - 1)); // calculate multiplier - } - else { - m = 1; // for d1 = 1 - sh = 0; - if (d == 0) m /= d; // provoke error here if d = 0 - if (uint16_t(d) == 0x8000u) { // fix overflow for this special case - m = 0x8001; - sh = 14; - } - } - multiplier = _mm_set1_epi16(int16_t(m)); // broadcast multiplier - shift1 = _mm_setr_epi32(sh, 0, 0, 0); // shift count - sign = _mm_set1_epi32(d < 0 ? 
-1 : 0); // sign of divisor - } - __m128i getm() const { // get multiplier - return multiplier; - } - __m128i gets1() const { // get shift count - return shift1; - } - __m128i getsign() const { // get sign of divisor - return sign; - } -}; - - -// encapsulate parameters for fast division on vector of 8 16-bit unsigned integers -class Divisor_us { -protected: - __m128i multiplier; // multiplier used in fast division - __m128i shift1; // shift count 1 used in fast division - __m128i shift2; // shift count 2 used in fast division -public: - Divisor_us() {}; // Default constructor - Divisor_us(uint16_t d) { // Constructor with divisor - set(d); - } - Divisor_us(uint16_t m, int s1, int s2) { // Constructor with precalculated multiplier and shifts - multiplier = _mm_set1_epi16(m); - shift1 = _mm_setr_epi32(s1, 0, 0, 0); - shift2 = _mm_setr_epi32(s2, 0, 0, 0); - } - void set(uint16_t d) { // Set or change divisor, calculate parameters - uint16_t L, L2, sh1, sh2, m; - switch (d) { - case 0: - m = sh1 = sh2 = 1 / d; // provoke error for d = 0 - break; - case 1: - m = 1; sh1 = sh2 = 0; // parameters for d = 1 - break; - case 2: - m = 1; sh1 = 1; sh2 = 0; // parameters for d = 2 - break; - default: // general case for d > 2 - L = (uint16_t)bit_scan_reverse(d-1)+1; // ceil(log2(d)) - L2 = uint16_t(1 << L); // 2^L, overflow to 0 if L = 16 - m = 1 + uint16_t((uint32_t(L2 - d) << 16) / d); // multiplier - sh1 = 1; sh2 = L - 1; // shift counts - } - multiplier = _mm_set1_epi16(m); - shift1 = _mm_setr_epi32(sh1, 0, 0, 0); - shift2 = _mm_setr_epi32(sh2, 0, 0, 0); - } - __m128i getm() const { // get multiplier - return multiplier; - } - __m128i gets1() const { // get shift count 1 - return shift1; - } - __m128i gets2() const { // get shift count 2 - return shift2; - } -}; - - -// vector operator / : divide each element by divisor - -// vector of 4 32-bit signed integers -static inline Vec4i operator / (Vec4i const & a, Divisor_i const & d) { -#if defined (__XOP__) && defined (GCC_VERSION) && GCC_VERSION <= 40702/*??*/ && !defined(__INTEL_COMPILER) && !defined(__clang__) -#define XOP_MUL_BUG // GCC has bug in XOP multiply -// Bug found in GCC version 4.7.0 and 4.7.1 -#endif -// todo: test this when GCC bug is fixed -#if defined (__XOP__) && !defined (XOP_MUL_BUG) - __m128i t1 = _mm_mul_epi32(a,d.getm()); // 32x32->64 bit signed multiplication of a[0] and a[2] - __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 - __m128i t3 = _mm_macchi_epi32(a,d.getm(),_mm_setzero_si128());// 32x32->64 bit signed multiplication of a[1] and a[3] - __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 - __m128i t7 = _mm_blendv_epi8(t2,t3,t5); // blend two results - __m128i t8 = _mm_add_epi32(t7,a); // add - __m128i t9 = _mm_sra_epi32(t8,d.gets1()); // shift right arithmetic - __m128i t10 = _mm_srai_epi32(a,31); // sign of a - __m128i t11 = _mm_sub_epi32(t10,d.getsign()); // sign of a - sign of d - __m128i t12 = _mm_sub_epi32(t9,t11); // + 1 if a < 0, -1 if d < 0 - return _mm_xor_si128(t12,d.getsign()); // change sign if divisor negative - -#elif INSTRSET >= 5 && !defined (XOP_MUL_BUG) // SSE4.1 supported - __m128i t1 = _mm_mul_epi32(a,d.getm()); // 32x32->64 bit signed multiplication of a[0] and a[2] - __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 - __m128i t3 = _mm_srli_epi64(a,32); // get a[1] and a[3] into position for multiplication - __m128i t4 = _mm_mul_epi32(t3,d.getm()); // 32x32->64 bit signed multiplication of a[1] and a[3] - __m128i t5 = _mm_set_epi32(-1,0,-1,0); // 
mask of dword 1 and 3 - __m128i t7 = _mm_blendv_epi8(t2,t4,t5); // blend two results - __m128i t8 = _mm_add_epi32(t7,a); // add - __m128i t9 = _mm_sra_epi32(t8,d.gets1()); // shift right arithmetic - __m128i t10 = _mm_srai_epi32(a,31); // sign of a - __m128i t11 = _mm_sub_epi32(t10,d.getsign()); // sign of a - sign of d - __m128i t12 = _mm_sub_epi32(t9,t11); // + 1 if a < 0, -1 if d < 0 - return _mm_xor_si128(t12,d.getsign()); // change sign if divisor negative -#else // not SSE4.1 - __m128i t1 = _mm_mul_epu32(a,d.getm()); // 32x32->64 bit unsigned multiplication of a[0] and a[2] - __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 - __m128i t3 = _mm_srli_epi64(a,32); // get a[1] and a[3] into position for multiplication - __m128i t4 = _mm_mul_epu32(t3,d.getm()); // 32x32->64 bit unsigned multiplication of a[1] and a[3] - __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 - __m128i t6 = _mm_and_si128(t4,t5); // high dword of result 1 and 3 - __m128i t7 = _mm_or_si128(t2,t6); // combine all four results of unsigned high mul into one vector - // convert unsigned to signed high multiplication (from: H S Warren: Hacker's delight, 2003, p. 132) - __m128i u1 = _mm_srai_epi32(a,31); // sign of a - __m128i u2 = _mm_srai_epi32(d.getm(),31); // sign of m [ m is always negative, except for abs(d) = 1 ] - __m128i u3 = _mm_and_si128 (d.getm(),u1); // m * sign of a - __m128i u4 = _mm_and_si128 (a,u2); // a * sign of m - __m128i u5 = _mm_add_epi32 (u3,u4); // sum of sign corrections - __m128i u6 = _mm_sub_epi32 (t7,u5); // high multiplication result converted to signed - __m128i t8 = _mm_add_epi32(u6,a); // add a - __m128i t9 = _mm_sra_epi32(t8,d.gets1()); // shift right arithmetic - __m128i t10 = _mm_sub_epi32(u1,d.getsign()); // sign of a - sign of d - __m128i t11 = _mm_sub_epi32(t9,t10); // + 1 if a < 0, -1 if d < 0 - return _mm_xor_si128(t11,d.getsign()); // change sign if divisor negative -#endif -} - -// vector of 4 32-bit unsigned integers -static inline Vec4ui operator / (Vec4ui const & a, Divisor_ui const & d) { - __m128i t1 = _mm_mul_epu32(a,d.getm()); // 32x32->64 bit unsigned multiplication of a[0] and a[2] - __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 - __m128i t3 = _mm_srli_epi64(a,32); // get a[1] and a[3] into position for multiplication - __m128i t4 = _mm_mul_epu32(t3,d.getm()); // 32x32->64 bit unsigned multiplication of a[1] and a[3] - __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 -#if INSTRSET >= 5 // SSE4.1 supported - __m128i t7 = _mm_blendv_epi8(t2,t4,t5); // blend two results -#else - __m128i t6 = _mm_and_si128(t4,t5); // high dword of result 1 and 3 - __m128i t7 = _mm_or_si128(t2,t6); // combine all four results into one vector -#endif - __m128i t8 = _mm_sub_epi32(a,t7); // subtract - __m128i t9 = _mm_srl_epi32(t8,d.gets1()); // shift right logical - __m128i t10 = _mm_add_epi32(t7,t9); // add - return _mm_srl_epi32(t10,d.gets2()); // shift right logical -} - -// vector of 8 16-bit signed integers -static inline Vec8s operator / (Vec8s const & a, Divisor_s const & d) { - __m128i t1 = _mm_mulhi_epi16(a, d.getm()); // multiply high signed words - __m128i t2 = _mm_add_epi16(t1,a); // + a - __m128i t3 = _mm_sra_epi16(t2,d.gets1()); // shift right arithmetic - __m128i t4 = _mm_srai_epi16(a,15); // sign of a - __m128i t5 = _mm_sub_epi16(t4,d.getsign()); // sign of a - sign of d - __m128i t6 = _mm_sub_epi16(t3,t5); // + 1 if a < 0, -1 if d < 0 - return _mm_xor_si128(t6,d.getsign()); // change sign if divisor negative 
-} - -// vector of 8 16-bit unsigned integers -static inline Vec8us operator / (Vec8us const & a, Divisor_us const & d) { - __m128i t1 = _mm_mulhi_epu16(a, d.getm()); // multiply high unsigned words - __m128i t2 = _mm_sub_epi16(a,t1); // subtract - __m128i t3 = _mm_srl_epi16(t2,d.gets1()); // shift right logical - __m128i t4 = _mm_add_epi16(t1,t3); // add - return _mm_srl_epi16(t4,d.gets2()); // shift right logical -} - - -// vector of 16 8-bit signed integers -static inline Vec16c operator / (Vec16c const & a, Divisor_s const & d) { - // expand into two Vec8s - Vec8s low = extend_low(a) / d; - Vec8s high = extend_high(a) / d; - return compress(low,high); -} - -// vector of 16 8-bit unsigned integers -static inline Vec16uc operator / (Vec16uc const & a, Divisor_us const & d) { - // expand into two Vec8s - Vec8us low = extend_low(a) / d; - Vec8us high = extend_high(a) / d; - return compress(low,high); -} - -// vector operator /= : divide -static inline Vec8s & operator /= (Vec8s & a, Divisor_s const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec8us & operator /= (Vec8us & a, Divisor_us const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec4i & operator /= (Vec4i & a, Divisor_i const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec4ui & operator /= (Vec4ui & a, Divisor_ui const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec16c & operator /= (Vec16c & a, Divisor_s const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec16uc & operator /= (Vec16uc & a, Divisor_us const & d) { - a = a / d; - return a; -} - -/***************************************************************************** -* -* Integer division 2: divisor is a compile-time constant -* -*****************************************************************************/ - -// Divide Vec4i by compile-time constant -template -static inline Vec4i divide_by_i(Vec4i const & x) { - Static_error_check<(d!=0)> Dividing_by_zero; // Error message if dividing by zero - if (d == 1) return x; - if (d == -1) return -x; - if (uint32_t(d) == 0x80000000u) return Vec4i(x == Vec4i(0x80000000)) & 1; // prevent overflow when changing sign - const uint32_t d1 = d > 0 ? uint32_t(d) : uint32_t(-d); // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits) - if ((d1 & (d1-1)) == 0) { - // d1 is a power of 2. use shift - const int k = bit_scan_reverse_const(d1); - __m128i sign; - if (k > 1) sign = _mm_srai_epi32(x, k-1); else sign = x; // k copies of sign bit - __m128i bias = _mm_srli_epi32(sign, 32-k); // bias = x >= 0 ? 0 : k-1 - __m128i xpbias = _mm_add_epi32 (x, bias); // x + bias - __m128i q = _mm_srai_epi32(xpbias, k); // (x + bias) >> k - if (d > 0) return q; // d > 0: return q - return _mm_sub_epi32(_mm_setzero_si128(), q); // d < 0: return -q - } - // general case - const int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1); // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case) - const int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32)); // multiplier - const Divisor_i div(mult, sh, d < 0 ? 
-1 : 0); - return x / div; -} - -// define Vec4i a / const_int(d) -template -static inline Vec4i operator / (Vec4i const & a, Const_int_t) { - return divide_by_i(a); -} - -// define Vec4i a / const_uint(d) -template -static inline Vec4i operator / (Vec4i const & a, Const_uint_t) { - Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned - return divide_by_i(a); // signed divide -} - -// vector operator /= : divide -template -static inline Vec4i & operator /= (Vec4i & a, Const_int_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec4i & operator /= (Vec4i & a, Const_uint_t b) { - a = a / b; - return a; -} - - -// Divide Vec4ui by compile-time constant -template -static inline Vec4ui divide_by_ui(Vec4ui const & x) { - Static_error_check<(d!=0)> Dividing_by_zero; // Error message if dividing by zero - if (d == 1) return x; // divide by 1 - const int b = bit_scan_reverse_const(d); // floor(log2(d)) - if ((uint32_t(d) & (uint32_t(d)-1)) == 0) { - // d is a power of 2. use shift - return _mm_srli_epi32(x, b); // x >> b - } - // general case (d > 2) - uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d); // multiplier = 2^(32+b) / d - const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d - const bool round_down = (2*rem < d); // check if fraction is less than 0.5 - if (!round_down) { - mult = mult + 1; // round up mult - } - // do 32*32->64 bit unsigned multiplication and get high part of result - const __m128i multv = _mm_set_epi32(0,mult,0,mult); // zero-extend mult and broadcast - __m128i t1 = _mm_mul_epu32(x,multv); // 32x32->64 bit unsigned multiplication of x[0] and x[2] - if (round_down) { - t1 = _mm_add_epi64(t1,multv); // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow - } - __m128i t2 = _mm_srli_epi64(t1,32); // high dword of result 0 and 2 - __m128i t3 = _mm_srli_epi64(x,32); // get x[1] and x[3] into position for multiplication - __m128i t4 = _mm_mul_epu32(t3,multv); // 32x32->64 bit unsigned multiplication of x[1] and x[3] - if (round_down) { - t4 = _mm_add_epi64(t4,multv); // compensate for rounding error. 
(x+1)*m replaced by x*m+m to avoid overflow - } - __m128i t5 = _mm_set_epi32(-1,0,-1,0); // mask of dword 1 and 3 -#if INSTRSET >= 5 // SSE4.1 supported - __m128i t7 = _mm_blendv_epi8(t2,t4,t5); // blend two results -#else - __m128i t6 = _mm_and_si128(t4,t5); // high dword of result 1 and 3 - __m128i t7 = _mm_or_si128(t2,t6); // combine all four results into one vector -#endif - Vec4ui q = _mm_srli_epi32(t7, b); // shift right by b - return q; // no overflow possible -} - -// define Vec4ui a / const_uint(d) -template -static inline Vec4ui operator / (Vec4ui const & a, Const_uint_t) { - return divide_by_ui(a); -} - -// define Vec4ui a / const_int(d) -template -static inline Vec4ui operator / (Vec4ui const & a, Const_int_t) { - Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous - return divide_by_ui(a); // unsigned divide -} - -// vector operator /= : divide -template -static inline Vec4ui & operator /= (Vec4ui & a, Const_uint_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec4ui & operator /= (Vec4ui & a, Const_int_t b) { - a = a / b; - return a; -} - - -// Divide Vec8s by compile-time constant -template -static inline Vec8s divide_by_i(Vec8s const & x) { - const int16_t d0 = int16_t(d); // truncate d to 16 bits - Static_error_check<(d0 != 0)> Dividing_by_zero; // Error message if dividing by zero - if (d0 == 1) return x; // divide by 1 - if (d0 == -1) return -x; // divide by -1 - if (uint16_t(d0) == 0x8000u) return Vec8s(x == Vec8s(0x8000)) & 1;// prevent overflow when changing sign - // if (d > 0x7FFF || d < -0x8000) return 0; // not relevant when d truncated to 16 bits - const uint16_t d1 = d0 > 0 ? d0 : -d0; // compile-time abs(d0) - if ((d1 & (d1-1)) == 0) { - // d is a power of 2. use shift - const int k = bit_scan_reverse_const(uint32_t(d1)); - __m128i sign; - if (k > 1) sign = _mm_srai_epi16(x, k-1); else sign = x; // k copies of sign bit - __m128i bias = _mm_srli_epi16(sign, 16-k); // bias = x >= 0 ? 0 : k-1 - __m128i xpbias = _mm_add_epi16 (x, bias); // x + bias - __m128i q = _mm_srai_epi16(xpbias, k); // (x + bias) >> k - if (d0 > 0) return q; // d0 > 0: return q - return _mm_sub_epi16(_mm_setzero_si128(), q); // d0 < 0: return -q - } - // general case - const int L = bit_scan_reverse_const(uint16_t(d1-1)) + 1; // ceil(log2(d)). (d < 2 handled above) - const int16_t mult = int16_t(1 + (1u << (15+L)) / uint32_t(d1) - 0x10000);// multiplier - const int shift1 = L - 1; - const Divisor_s div(mult, shift1, d0 > 0 ? 
0 : -1); - return x / div; -} - -// define Vec8s a / const_int(d) -template -static inline Vec8s operator / (Vec8s const & a, Const_int_t) { - return divide_by_i(a); -} - -// define Vec8s a / const_uint(d) -template -static inline Vec8s operator / (Vec8s const & a, Const_uint_t) { - Static_error_check< (d<0x8000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned - return divide_by_i(a); // signed divide -} - -// vector operator /= : divide -template -static inline Vec8s & operator /= (Vec8s & a, Const_int_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec8s & operator /= (Vec8s & a, Const_uint_t b) { - a = a / b; - return a; -} - - -// Divide Vec8us by compile-time constant -template -static inline Vec8us divide_by_ui(Vec8us const & x) { - const uint16_t d0 = uint16_t(d); // truncate d to 16 bits - Static_error_check<(d0 != 0)> Dividing_by_zero; // Error message if dividing by zero - if (d0 == 1) return x; // divide by 1 - const int b = bit_scan_reverse_const(d0); // floor(log2(d)) - if ((d0 & (d0-1)) == 0) { - // d is a power of 2. use shift - return _mm_srli_epi16(x, b); // x >> b - } - // general case (d > 2) - uint16_t mult = uint16_t((uint32_t(1) << (b+16)) / d0); // multiplier = 2^(32+b) / d - const uint32_t rem = (uint32_t(1) << (b+16)) - uint32_t(d0)*mult;// remainder 2^(32+b) % d - const bool round_down = (2*rem < d0); // check if fraction is less than 0.5 - Vec8us x1 = x; - if (round_down) { - x1 = x1 + 1; // round down mult and compensate by adding 1 to x - } - else { - mult = mult + 1; // round up mult. no compensation needed - } - const __m128i multv = _mm_set1_epi16(mult); // broadcast mult - __m128i xm = _mm_mulhi_epu16(x1, multv); // high part of 16x16->32 bit unsigned multiplication - Vec8us q = _mm_srli_epi16(xm, b); // shift right by b - if (round_down) { - Vec8sb overfl = (x1 == (Vec8us)_mm_setzero_si128()); // check for overflow of x+1 - return select(overfl, Vec8us(mult >> b), q); // deal with overflow (rarely needed) - } - else { - return q; // no overflow possible - } -} - -// define Vec8us a / const_uint(d) -template -static inline Vec8us operator / (Vec8us const & a, Const_uint_t) { - return divide_by_ui(a); -} - -// define Vec8us a / const_int(d) -template -static inline Vec8us operator / (Vec8us const & a, Const_int_t) { - Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous - return divide_by_ui(a); // unsigned divide -} - -// vector operator /= : divide -template -static inline Vec8us & operator /= (Vec8us & a, Const_uint_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec8us & operator /= (Vec8us & a, Const_int_t b) { - a = a / b; - return a; -} - - -// define Vec16c a / const_int(d) -template -static inline Vec16c operator / (Vec16c const & a, Const_int_t) { - // expand into two Vec8s - Vec8s low = extend_low(a) / Const_int_t(); - Vec8s high = extend_high(a) / Const_int_t(); - return compress(low,high); -} - -// define Vec16c a / const_uint(d) -template -static inline Vec16c operator / (Vec16c const & a, Const_uint_t) { - Static_error_check< (uint8_t(d)<0x80u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned - return a / Const_int_t(); // signed divide -} - -// vector operator /= : divide -template -static inline Vec16c & operator /= (Vec16c & a, Const_int_t b) { - a = a / b; - return a; -} -// vector operator 
/= : divide -template -static inline Vec16c & operator /= (Vec16c & a, Const_uint_t b) { - a = a / b; - return a; -} - -// define Vec16uc a / const_uint(d) -template -static inline Vec16uc operator / (Vec16uc const & a, Const_uint_t) { - // expand into two Vec8usc - Vec8us low = extend_low(a) / Const_uint_t(); - Vec8us high = extend_high(a) / Const_uint_t(); - return compress(low,high); -} - -// define Vec16uc a / const_int(d) -template -static inline Vec16uc operator / (Vec16uc const & a, Const_int_t) { - Static_error_check< (int8_t(d)>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous - return a / Const_uint_t(); // unsigned divide -} - -// vector operator /= : divide -template -static inline Vec16uc & operator /= (Vec16uc & a, Const_uint_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec16uc & operator /= (Vec16uc & a, Const_int_t b) { - a = a / b; - return a; -} - -/***************************************************************************** -* -* Horizontal scan functions -* -*****************************************************************************/ - -// Get index to the first element that is true. Return -1 if all are false -static inline int horizontal_find_first(Vec16cb const & x) { - uint32_t a = _mm_movemask_epi8(x); - if (a == 0) return -1; - int32_t b = bit_scan_forward(a); - return b; -} - -static inline int horizontal_find_first(Vec8sb const & x) { - return horizontal_find_first(Vec16cb(x)) >> 1; // must use signed shift -} - -static inline int horizontal_find_first(Vec4ib const & x) { - return horizontal_find_first(Vec16cb(x)) >> 2; // must use signed shift -} - -static inline int horizontal_find_first(Vec2qb const & x) { - return horizontal_find_first(Vec16cb(x)) >> 3; // must use signed shift -} - -// Count the number of elements that are true -static inline uint32_t horizontal_count(Vec16cb const & x) { - uint32_t a = _mm_movemask_epi8(x); - return vml_popcnt(a); -} - -static inline uint32_t horizontal_count(Vec8sb const & x) { - return horizontal_count(Vec16cb(x)) >> 1; -} - -static inline uint32_t horizontal_count(Vec4ib const & x) { - return horizontal_count(Vec16cb(x)) >> 2; -} - -static inline uint32_t horizontal_count(Vec2qb const & x) { - return horizontal_count(Vec16cb(x)) >> 3; -} - - -/***************************************************************************** -* -* Boolean <-> bitfield conversion functions -* -*****************************************************************************/ - -// to_bits: convert boolean vector to integer bitfield -static inline uint16_t to_bits(Vec16cb const & x) { - return (uint16_t)_mm_movemask_epi8(x); -} - -// to_Vec16bc: convert integer bitfield to boolean vector -static inline Vec16cb to_Vec16cb(uint16_t x) { - static const uint32_t table[16] = { // lookup-table - 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, - 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, - 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, - 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF}; - uint32_t a0 = table[x & 0xF]; - uint32_t a1 = table[(x>>4) & 0xF]; - uint32_t a2 = table[(x>>8) & 0xF]; - uint32_t a3 = table[(x>>12) & 0xF]; - return Vec16cb(Vec16c(Vec4ui(a0, a1, a2, a3))); -} - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec8sb const & x) { - __m128i a = _mm_packs_epi16(x, x); // 16-bit words to bytes - return (uint8_t)_mm_movemask_epi8(a); -} - -// to_Vec8sb: convert integer bitfield to boolean vector 
-static inline Vec8sb to_Vec8sb(uint8_t x) { - static const uint32_t table[16] = { // lookup-table - 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, - 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, - 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, - 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF}; - uint32_t a0 = table[x & 0xF]; - uint32_t a1 = table[(x>>4) & 0xF]; - Vec4ui b = Vec4ui(a0, a1, a0, a1); - return _mm_unpacklo_epi8(b, b); // duplicate bytes to 16-bit words -} - -#if INSTRSET < 9 || MAX_VECTOR_SIZE < 512 -// These functions are defined in Vectori512.h if AVX512 instruction set is used - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4ib const & x) { - __m128i a = _mm_packs_epi32(x, x); // 32-bit dwords to 16-bit words - __m128i b = _mm_packs_epi16(a, a); // 16-bit words to bytes - return _mm_movemask_epi8(b) & 0xF; -} - -// to_Vec4ib: convert integer bitfield to boolean vector -static inline Vec4ib to_Vec4ib(uint8_t x) { - static const uint32_t table[16] = { // lookup-table - 0x00000000, 0x000000FF, 0x0000FF00, 0x0000FFFF, - 0x00FF0000, 0x00FF00FF, 0x00FFFF00, 0x00FFFFFF, - 0xFF000000, 0xFF0000FF, 0xFF00FF00, 0xFF00FFFF, - 0xFFFF0000, 0xFFFF00FF, 0xFFFFFF00, 0xFFFFFFFF}; - uint32_t a = table[x & 0xF]; // 4 bytes - __m128i b = _mm_cvtsi32_si128(a); // transfer to vector register - __m128i c = _mm_unpacklo_epi8(b, b); // duplicate bytes to 16-bit words - __m128i d = _mm_unpacklo_epi16(c, c); // duplicate 16-bit words to 32-bit dwords - return d; -} - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec2qb const & x) { - uint32_t a = _mm_movemask_epi8(x); - return (a & 1) | ((a >> 7) & 2); -} - -// to_Vec2qb: convert integer bitfield to boolean vector -static inline Vec2qb to_Vec2qb(uint8_t x) { - return Vec2qb(Vec2q(-(x&1), -((x>>1)&1))); -} - -#else // function prototypes here only - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4ib x); - -// to_Vec4ib: convert integer bitfield to boolean vector -static inline Vec4ib to_Vec4ib(uint8_t x); - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec2qb x); - -// to_Vec2qb: convert integer bitfield to boolean vector -static inline Vec2qb to_Vec2qb(uint8_t x); - -#endif // INSTRSET < 9 || MAX_VECTOR_SIZE < 512 - -#ifdef VCL_NAMESPACE -} -#endif - -#endif // VECTORI128_H diff --git a/DFTTest/vectorclass/vectori256.h b/DFTTest/vectorclass/vectori256.h deleted file mode 100644 index 61ac5da..0000000 --- a/DFTTest/vectorclass/vectori256.h +++ /dev/null @@ -1,5693 +0,0 @@ -/**************************** vectori256.h ******************************* -* Author: Agner Fog -* Date created: 2012-05-30 -* Last modified: 2017-02-19 -* Version: 1.27 -* Project: vector classes -* Description: -* Header file defining integer vector classes as interface to intrinsic -* functions in x86 microprocessors with AVX2 and later instruction sets. -* -* Instructions: -* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired -* instruction set, which must be at least AVX2. 
-* -* The following vector classes are defined here: -* Vec256b Vector of 256 1-bit unsigned integers or Booleans -* Vec32c Vector of 32 8-bit signed integers -* Vec32uc Vector of 32 8-bit unsigned integers -* Vec32cb Vector of 32 Booleans for use with Vec32c and Vec32uc -* Vec16s Vector of 16 16-bit signed integers -* Vec16us Vector of 16 16-bit unsigned integers -* Vec16sb Vector of 16 Booleans for use with Vec16s and Vec16us -* Vec8i Vector of 8 32-bit signed integers -* Vec8ui Vector of 8 32-bit unsigned integers -* Vec8ib Vector of 8 Booleans for use with Vec8i and Vec8ui -* Vec4q Vector of 4 64-bit signed integers -* Vec4uq Vector of 4 64-bit unsigned integers -* Vec4qb Vector of 4 Booleans for use with Vec4q and Vec4uq -* -* Each vector object is represented internally in the CPU as a 256-bit register. -* This header file defines operators and functions for these vectors. -* -* For example: -* Vec8i a(1,2,3,4,5,6,7,8), b(9,10,11,12,13,14,15,16), c; -* c = a + b; // now c contains (10,12,14,16,18,20,22,24) -* -* For detailed instructions, see VectorClass.pdf -* -* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses -*****************************************************************************/ - -// check combination of header files -#if defined (VECTORI256_H) -#if VECTORI256_H != 2 -#error Two different versions of vectori256.h included -#endif -#else -#define VECTORI256_H 2 - -#ifdef VECTORF256_H -#error Please put header file vectori256.h before vectorf256.h -#endif - - -#if INSTRSET < 8 // AVX2 required -#error Wrong instruction set for vectori256.h, AVX2 required or use vectori256e.h -#endif - -#include "vectori128.h" - -#ifdef VCL_NAMESPACE -namespace VCL_NAMESPACE { -#endif - -/***************************************************************************** -* -* Join two 128-bit vectors -* -*****************************************************************************/ -#define set_m128ir(lo,hi) _mm256_inserti128_si256(_mm256_castsi128_si256(lo),(hi),1) - - -/***************************************************************************** -* -* Vector of 256 1-bit unsigned integers or Booleans -* -*****************************************************************************/ -class Vec256b { -protected: - __m256i ymm; // Integer vector -public: - // Default constructor: - Vec256b() { - } - // Constructor to broadcast the same value into all elements - // Removed because of undesired implicit conversions - //Vec256b(int i) { - // ymm = _mm256_set1_epi32(-(i & 1));} - - // Constructor to build from two Vec128b: - Vec256b(Vec128b const & a0, Vec128b const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec256b(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec256b & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Type cast operator to convert to __m256i used in intrinsics - operator __m256i() const { - return ymm; - } - // Member function to load from array (unaligned) - Vec256b & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - // You may use load_a instead of load if you are certain that p points to an address - // divisible by 32, but there is hardly any speed advantage of load_a on modern processors - Vec256b & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Member 
function to store into array (unaligned) - void store(void * p) const { - _mm256_storeu_si256((__m256i*)p, ymm); - } - // Member function to store into array, aligned by 32 - // You may use store_a instead of store if you are certain that p points to an address - // divisible by 32, but there is hardly any speed advantage of load_a on modern processors - void store_a(void * p) const { - _mm256_store_si256((__m256i*)p, ymm); - } - // Member function to store into array using a non-temporal memory hint, aligned by 32 - void stream(void * p) const { - _mm256_stream_si256((__m256i*)p, ymm); - } - // Member function to change a single bit - // Note: This function is inefficient. Use load function if changing more than one bit - Vec256b const & set_bit(uint32_t index, int value) { - static uint64_t m[8] = {0,0,0,0,1,0,0,0}; - int wi = (index >> 6) & 3; // qword index - int bi = index & 0x3F; // bit index within qword w - - __m256i mask = Vec256b().load(m+4-wi); // 1 in qword number wi - mask = _mm256_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number b set - if (value & 1) { - ymm = _mm256_or_si256(mask,ymm); - } - else { - ymm = _mm256_andnot_si256(mask,ymm); - } - return *this; - } - // Member function to get a single bit - // Note: This function is inefficient. Use store function if reading more than one bit - int get_bit(uint32_t index) const { - union { - __m256i x; - uint8_t i[32]; - } u; - u.x = ymm; - int wi = (index >> 3) & 0x1F; // byte index - int bi = index & 7; // bit index within byte w - return (u.i[wi] >> bi) & 1; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { - return get_bit(index) != 0; - } - // Member functions to split into two Vec128b: - Vec128b get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec128b get_high() const { - return _mm256_extracti128_si256(ymm,1); - } - static int size() { - return 256; - } -}; - - -// Define operators for this class - -// vector operator & : bitwise and -static inline Vec256b operator & (Vec256b const & a, Vec256b const & b) { - return _mm256_and_si256(a, b); -} -static inline Vec256b operator && (Vec256b const & a, Vec256b const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec256b operator | (Vec256b const & a, Vec256b const & b) { - return _mm256_or_si256(a, b); -} -static inline Vec256b operator || (Vec256b const & a, Vec256b const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec256b operator ^ (Vec256b const & a, Vec256b const & b) { - return _mm256_xor_si256(a, b); -} - -// vector operator ~ : bitwise not -static inline Vec256b operator ~ (Vec256b const & a) { - return _mm256_xor_si256(a, _mm256_set1_epi32(-1)); -} - -// vector operator &= : bitwise and -static inline Vec256b & operator &= (Vec256b & a, Vec256b const & b) { - a = a & b; - return a; -} - -// vector operator |= : bitwise or -static inline Vec256b & operator |= (Vec256b & a, Vec256b const & b) { - a = a | b; - return a; -} - -// vector operator ^= : bitwise xor -static inline Vec256b & operator ^= (Vec256b & a, Vec256b const & b) { - a = a ^ b; - return a; -} - -// Define functions for this class - -static inline __m256i zero_256b() { - return _mm256_setzero_si256(); -} - -// function andnot: a & ~ b -static inline Vec256b andnot (Vec256b const & a, Vec256b const & b) { - return _mm256_andnot_si256(b, a); -} - - 
-/***************************************************************************** -* -* Generate compile-time constant vector -* -*****************************************************************************/ -// Generate a constant vector of 8 integers stored in memory. -// Can be converted to any integer vector type -template -static inline __m256i constant8i() { - static const union { - int32_t i[8]; - __m256i ymm; - } u = {{i0,i1,i2,i3,i4,i5,i6,i7}}; - return u.ymm; -} - -template -static inline __m256i constant8ui() { - return constant8i(); -} - -/***************************************************************************** -* -* selectb function -* -*****************************************************************************/ -// Select between two sources, byte by byte. Used in various functions and operators -// Corresponds to this pseudocode: -// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or 0xFF (true). No other values are allowed. -// Only bit 7 in each byte of s is checked, -static inline __m256i selectb (__m256i const & s, __m256i const & a, __m256i const & b) { - return _mm256_blendv_epi8 (b, a, s); -} - - - -/***************************************************************************** -* -* Horizontal Boolean functions -* -*****************************************************************************/ - -// horizontal_and. Returns true if all bits are 1 -static inline bool horizontal_and (Vec256b const & a) { - return _mm256_testc_si256(a,constant8i<-1,-1,-1,-1,-1,-1,-1,-1>()) != 0; -} - -// horizontal_or. Returns true if at least one bit is 1 -static inline bool horizontal_or (Vec256b const & a) { - return ! _mm256_testz_si256(a,a); -} - - - -/***************************************************************************** -* -* Vector of 32 8-bit signed integers -* -*****************************************************************************/ - -class Vec32c : public Vec256b { -public: - // Default constructor: - Vec32c(){ - } - // Constructor to broadcast the same value into all elements: - Vec32c(int i) { - ymm = _mm256_set1_epi8((char)i); - } - // Constructor to build from all elements: - Vec32c(int8_t i0, int8_t i1, int8_t i2, int8_t i3, int8_t i4, int8_t i5, int8_t i6, int8_t i7, - int8_t i8, int8_t i9, int8_t i10, int8_t i11, int8_t i12, int8_t i13, int8_t i14, int8_t i15, - int8_t i16, int8_t i17, int8_t i18, int8_t i19, int8_t i20, int8_t i21, int8_t i22, int8_t i23, - int8_t i24, int8_t i25, int8_t i26, int8_t i27, int8_t i28, int8_t i29, int8_t i30, int8_t i31) { - ymm = _mm256_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, - i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31); - } - // Constructor to build from two Vec16c: - Vec32c(Vec16c const & a0, Vec16c const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec32c(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec32c & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Type cast operator to convert to __m256i used in intrinsics - operator __m256i() const { - return ymm; - } - // Member function to load from array (unaligned) - Vec32c & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - Vec32c & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i 
const*)p); - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec32c & load_partial(int n, void const * p) { - if (n <= 0) { - *this = 0; - } - else if (n <= 16) { - *this = Vec32c(Vec16c().load_partial(n, p), 0); - } - else if (n < 32) { - *this = Vec32c(Vec16c().load(p), Vec16c().load_partial(n-16, (char const*)p+16)); - } - else { - load(p); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, void * p) const { - if (n <= 0) { - return; - } - else if (n <= 16) { - get_low().store_partial(n, p); - } - else if (n < 32) { - get_low().store(p); - get_high().store_partial(n-16, (char*)p+16); - } - else { - store(p); - } - } - // cut off vector to n elements. The last 32-n elements are set to zero - Vec32c & cutoff(int n) { - if (uint32_t(n) >= 32) return *this; - static const union { - int32_t i[16]; - char c[64]; - } mask = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}}; - *this &= Vec32c().load(mask.c+32-n); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec32c const & insert(uint32_t index, int8_t value) { - static const int8_t maskl[64] = {0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, - -1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0 ,0,0,0,0, 0,0,0,0, 0,0,0,0}; - __m256i broad = _mm256_set1_epi8(value); // broadcast value into all elements - __m256i mask = _mm256_loadu_si256((__m256i const*)(maskl+32-(index & 0x1F))); // mask with FF at index position - ymm = selectb(mask,broad,ymm); - return *this; - } - // Member function extract a single element from vector - int8_t extract(uint32_t index) const { - int8_t x[32]; - store(x); - return x[index & 0x1F]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - int8_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec16c: - Vec16c get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec16c get_high() const { -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! 
defined(__INTEL_COMPILER) - return _mm256_extractf128_si256(ymm,1); // workaround bug in MS compiler VS 11 -#else - return _mm256_extracti128_si256(ymm,1); -#endif - } - static int size() { - return 32; - } -}; - - -/***************************************************************************** -* -* Vec32cb: Vector of 32 Booleans for use with Vec32c and Vec32uc -* -*****************************************************************************/ - -class Vec32cb : public Vec32c { -public: - // Default constructor: - Vec32cb(){ - } - // Constructor to build from all elements: - Vec32cb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, - bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15, - bool x16, bool x17, bool x18, bool x19, bool x20, bool x21, bool x22, bool x23, - bool x24, bool x25, bool x26, bool x27, bool x28, bool x29, bool x30, bool x31) : - Vec32c(-int8_t(x0), -int8_t(x1), -int8_t(x2), -int8_t(x3), -int8_t(x4), -int8_t(x5), -int8_t(x6), -int8_t(x7), - -int8_t(x8), -int8_t(x9), -int8_t(x10), -int8_t(x11), -int8_t(x12), -int8_t(x13), -int8_t(x14), -int8_t(x15), - -int8_t(x16), -int8_t(x17), -int8_t(x18), -int8_t(x19), -int8_t(x20), -int8_t(x21), -int8_t(x22), -int8_t(x23), - -int8_t(x24), -int8_t(x25), -int8_t(x26), -int8_t(x27), -int8_t(x28), -int8_t(x29), -int8_t(x30), -int8_t(x31)) - {} - // Constructor to convert from type __m256i used in intrinsics: - Vec32cb(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec32cb & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec32cb(bool b) : Vec32c(-int8_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec32cb & operator = (bool b) { - *this = Vec32cb(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec32cb(int b); - Vec32cb & operator = (int x); -public: - // Member functions to split into two Vec16c: - Vec16cb get_low() const { - return Vec16cb(Vec32c::get_low()); - } - Vec16cb get_high() const { - return Vec16cb(Vec32c::get_high()); - } - Vec32cb & insert (int index, bool a) { - Vec32c::insert(index, -(int)a); - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec32c::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec32cb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec32cb operator & (Vec32cb const & a, Vec32cb const & b) { - return Vec32cb(Vec256b(a) & Vec256b(b)); -} -static inline Vec32cb operator && (Vec32cb const & a, Vec32cb const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec32cb & operator &= (Vec32cb & a, Vec32cb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec32cb operator | (Vec32cb const & a, Vec32cb const & b) { - return Vec32cb(Vec256b(a) | Vec256b(b)); -} -static inline Vec32cb operator || (Vec32cb const & a, Vec32cb const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec32cb & operator |= (Vec32cb & a, Vec32cb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec32cb operator ^ (Vec32cb const & a, Vec32cb const & b) { - return Vec32cb(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec32cb & operator ^= (Vec32cb & a, Vec32cb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec32cb operator ~ (Vec32cb const & a) { - return Vec32cb( ~ Vec256b(a)); -} - -// vector operator ! : element not -static inline Vec32cb operator ! (Vec32cb const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec32cb andnot (Vec32cb const & a, Vec32cb const & b) { - return Vec32cb(andnot(Vec256b(a), Vec256b(b))); -} - - -/***************************************************************************** -* -* Operators for Vec32c -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec32c operator + (Vec32c const & a, Vec32c const & b) { - return _mm256_add_epi8(a, b); -} - -// vector operator += : add -static inline Vec32c & operator += (Vec32c & a, Vec32c const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec32c operator ++ (Vec32c & a, int) { - Vec32c a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec32c & operator ++ (Vec32c & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec32c operator - (Vec32c const & a, Vec32c const & b) { - return _mm256_sub_epi8(a, b); -} - -// vector operator - : unary minus -static inline Vec32c operator - (Vec32c const & a) { - return _mm256_sub_epi8(_mm256_setzero_si256(), a); -} - -// vector operator -= : add -static inline Vec32c & operator -= (Vec32c & a, Vec32c const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec32c operator -- (Vec32c & a, int) { - Vec32c a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec32c & operator -- (Vec32c & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec32c operator * (Vec32c const & a, Vec32c const & b) { - // There is no 8-bit multiply in SSE2. 
Split into two 16-bit multiplies - __m256i aodd = _mm256_srli_epi16(a,8); // odd numbered elements of a - __m256i bodd = _mm256_srli_epi16(b,8); // odd numbered elements of b - __m256i muleven = _mm256_mullo_epi16(a,b); // product of even numbered elements - __m256i mulodd = _mm256_mullo_epi16(aodd,bodd); // product of odd numbered elements - mulodd = _mm256_slli_epi16(mulodd,8); // put odd numbered elements back in place - __m256i mask = _mm256_set1_epi32(0x00FF00FF); // mask for even positions - __m256i product = selectb(mask,muleven,mulodd); // interleave even and odd - return product; -} - -// vector operator *= : multiply -static inline Vec32c & operator *= (Vec32c & a, Vec32c const & b) { - a = a * b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec32c operator << (Vec32c const & a, int b) { - uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out - __m256i am = _mm256_and_si256(a,_mm256_set1_epi8((char)mask));// remove bits that will overflow - __m256i res = _mm256_sll_epi16(am,_mm_cvtsi32_si128(b)); // 16-bit shifts - return res; -} - -// vector operator <<= : shift left -static inline Vec32c & operator <<= (Vec32c & a, int b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic all elements -static inline Vec32c operator >> (Vec32c const & a, int b) { - __m256i aeven = _mm256_slli_epi16(a,8); // even numbered elements of a. get sign bit in position - aeven = _mm256_sra_epi16(aeven,_mm_cvtsi32_si128(b+8)); // shift arithmetic, back to position - __m256i aodd = _mm256_sra_epi16(a,_mm_cvtsi32_si128(b)); // shift odd numbered elements arithmetic - __m256i mask = _mm256_set1_epi32(0x00FF00FF); // mask for even positions - __m256i res = selectb(mask,aeven,aodd); // interleave even and odd - return res; -} - -// vector operator >>= : shift right artihmetic -static inline Vec32c & operator >>= (Vec32c & a, int b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec32cb operator == (Vec32c const & a, Vec32c const & b) { - return _mm256_cmpeq_epi8(a,b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec32cb operator != (Vec32c const & a, Vec32c const & b) { - return Vec32cb(Vec32c(~(a == b))); -} - -// vector operator > : returns true for elements for which a > b (signed) -static inline Vec32cb operator > (Vec32c const & a, Vec32c const & b) { - return _mm256_cmpgt_epi8(a,b); -} - -// vector operator < : returns true for elements for which a < b (signed) -static inline Vec32cb operator < (Vec32c const & a, Vec32c const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec32cb operator >= (Vec32c const & a, Vec32c const & b) { - return Vec32cb(Vec32c(~(b > a))); -} - -// vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec32cb operator <= (Vec32c const & a, Vec32c const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec32c operator & (Vec32c const & a, Vec32c const & b) { - return Vec32c(Vec256b(a) & Vec256b(b)); -} -static inline Vec32c operator && (Vec32c const & a, Vec32c const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec32c & operator &= (Vec32c & a, Vec32c const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec32c operator | (Vec32c const & a, Vec32c const & b) { - 
return Vec32c(Vec256b(a) | Vec256b(b)); -} -static inline Vec32c operator || (Vec32c const & a, Vec32c const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec32c & operator |= (Vec32c & a, Vec32c const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec32c operator ^ (Vec32c const & a, Vec32c const & b) { - return Vec32c(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec32c & operator ^= (Vec32c & a, Vec32c const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec32c operator ~ (Vec32c const & a) { - return Vec32c( ~ Vec256b(a)); -} - -// vector operator ! : logical not, returns true for elements == 0 -static inline Vec32cb operator ! (Vec32c const & a) { - return _mm256_cmpeq_epi8(a,_mm256_setzero_si256()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -static inline Vec32c select (Vec32cb const & s, Vec32c const & a, Vec32c const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec32c if_add (Vec32cb const & f, Vec32c const & a, Vec32c const & b) { - return a + (Vec32c(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec32c const & a) { - __m256i sum1 = _mm256_sad_epu8(a,_mm256_setzero_si256()); - __m256i sum2 = _mm256_shuffle_epi32(sum1,2); - __m256i sum3 = _mm256_add_epi16(sum1,sum2); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum4 = _mm256_extractf128_si256(sum3,1); // bug in MS VS 11 -#else - __m128i sum4 = _mm256_extracti128_si256(sum3,1); -#endif - __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); - int8_t sum6 = (int8_t)_mm_cvtsi128_si32(sum5); // truncate to 8 bits - return sum6; // sign extend to 32 bits -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Each element is sign-extended before addition to avoid overflow -static inline int32_t horizontal_add_x (Vec32c const & a) { - __m256i aeven = _mm256_slli_epi16(a,8); // even numbered elements of a. get sign bit in position - aeven = _mm256_srai_epi16(aeven,8); // sign extend even numbered elements - __m256i aodd = _mm256_srai_epi16(a,8); // sign extend odd numbered elements - __m256i sum1 = _mm256_add_epi16(aeven,aodd); // add even and odd elements - __m256i sum2 = _mm256_hadd_epi16(sum1,sum1); // horizontally add 2x8 elements in 3 steps - __m256i sum3 = _mm256_hadd_epi16(sum2,sum2); - __m256i sum4 = _mm256_hadd_epi16(sum3,sum3); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! 
defined(__INTEL_COMPILER) - __m128i sum5 = _mm256_extractf128_si256(sum4,1); // bug in MS VS 11 -#else - __m128i sum5 = _mm256_extracti128_si256(sum4,1); // get high sum -#endif - __m128i sum6 = _mm_add_epi16(_mm256_castsi256_si128(sum4),sum5);// add high and low sum - int16_t sum7 = (int16_t)_mm_cvtsi128_si32(sum6); // 16 bit sum - return sum7; // sign extend to 32 bits -} - -// function add_saturated: add element by element, signed with saturation -static inline Vec32c add_saturated(Vec32c const & a, Vec32c const & b) { - return _mm256_adds_epi8(a, b); -} - -// function sub_saturated: subtract element by element, signed with saturation -static inline Vec32c sub_saturated(Vec32c const & a, Vec32c const & b) { - return _mm256_subs_epi8(a, b); -} - -// function max: a > b ? a : b -static inline Vec32c max(Vec32c const & a, Vec32c const & b) { - return _mm256_max_epi8(a,b); -} - -// function min: a < b ? a : b -static inline Vec32c min(Vec32c const & a, Vec32c const & b) { - return _mm256_min_epi8(a,b); -} - -// function abs: a >= 0 ? a : -a -static inline Vec32c abs(Vec32c const & a) { - return _mm256_sign_epi8(a,a); -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec32c abs_saturated(Vec32c const & a) { - __m256i absa = abs(a); // abs(a) - __m256i overfl = _mm256_cmpgt_epi8(_mm256_setzero_si256(),absa); // 0 > a - return _mm256_add_epi8(absa,overfl); // subtract 1 if 0x80 -} - -// function rotate_left all elements -// Use negative count to rotate right -static inline Vec32c rotate_left(Vec32c const & a, int b) { - __m128i bb = _mm_cvtsi32_si128(b & 7); // b modulo 8 - __m128i mbb = _mm_cvtsi32_si128((8-b) & 7); // 8-b modulo 8 - __m256i maskeven = _mm256_set1_epi32(0x00FF00FF); // mask for even numbered bytes - __m256i even = _mm256_and_si256(a,maskeven); // even numbered bytes of a - __m256i odd = _mm256_andnot_si256(maskeven,a); // odd numbered bytes of a - __m256i evenleft = _mm256_sll_epi16(even,bb); // even bytes of a << b - __m256i oddleft = _mm256_sll_epi16(odd,bb); // odd bytes of a << b - __m256i evenright = _mm256_srl_epi16(even,mbb); // even bytes of a >> 8-b - __m256i oddright = _mm256_srl_epi16(odd,mbb); // odd bytes of a >> 8-b - __m256i evenrot = _mm256_or_si256(evenleft,evenright); // even bytes of a rotated - __m256i oddrot = _mm256_or_si256(oddleft,oddright); // odd bytes of a rotated - __m256i allrot = selectb(maskeven,evenrot,oddrot); // all bytes rotated - return allrot; -} - - - -/***************************************************************************** -* -* Vector of 16 8-bit unsigned integers -* -*****************************************************************************/ - -class Vec32uc : public Vec32c { -public: - // Default constructor: - Vec32uc(){ - } - // Constructor to broadcast the same value into all elements: - Vec32uc(uint32_t i) { - ymm = _mm256_set1_epi8((char)i); - } - // Constructor to build from all elements: - Vec32uc(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3, uint8_t i4, uint8_t i5, uint8_t i6, uint8_t i7, - uint8_t i8, uint8_t i9, uint8_t i10, uint8_t i11, uint8_t i12, uint8_t i13, uint8_t i14, uint8_t i15, - uint8_t i16, uint8_t i17, uint8_t i18, uint8_t i19, uint8_t i20, uint8_t i21, uint8_t i22, uint8_t i23, - uint8_t i24, uint8_t i25, uint8_t i26, uint8_t i27, uint8_t i28, uint8_t i29, uint8_t i30, uint8_t i31) { - ymm = _mm256_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, - i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31); - } - // 
Constructor to build from two Vec16uc: - Vec32uc(Vec16uc const & a0, Vec16uc const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec32uc(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec32uc & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Member function to load from array (unaligned) - Vec32uc & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - Vec32uc & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec32uc const & insert(uint32_t index, uint8_t value) { - Vec32c::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint8_t extract(uint32_t index) const { - return Vec32c::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - uint8_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec16uc: - Vec16uc get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec16uc get_high() const { - return _mm256_extracti128_si256(ymm,1); - } -}; - -// Define operators for this class - -// vector operator + : add -static inline Vec32uc operator + (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc (Vec32c(a) + Vec32c(b)); -} - -// vector operator - : subtract -static inline Vec32uc operator - (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc (Vec32c(a) - Vec32c(b)); -} - -// vector operator * : multiply -static inline Vec32uc operator * (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc (Vec32c(a) * Vec32c(b)); -} - -// vector operator << : shift left all elements -static inline Vec32uc operator << (Vec32uc const & a, uint32_t b) { - uint32_t mask = (uint32_t)0xFF >> (uint32_t)b; // mask to remove bits that are shifted out - __m256i am = _mm256_and_si256(a,_mm256_set1_epi8((char)mask));// remove bits that will overflow - __m256i res = _mm256_sll_epi16(am,_mm_cvtsi32_si128(b)); // 16-bit shifts - return res; -} - -// vector operator << : shift left all elements -static inline Vec32uc operator << (Vec32uc const & a, int32_t b) { - return a << (uint32_t)b; -} - -// vector operator >> : shift right logical all elements -static inline Vec32uc operator >> (Vec32uc const & a, uint32_t b) { - uint32_t mask = (uint32_t)0xFF << (uint32_t)b; // mask to remove bits that are shifted out - __m256i am = _mm256_and_si256(a,_mm256_set1_epi8((char)mask));// remove bits that will overflow - __m256i res = _mm256_srl_epi16(am,_mm_cvtsi32_si128(b)); // 16-bit shifts - return res; -} - -// vector operator >> : shift right logical all elements -static inline Vec32uc operator >> (Vec32uc const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right artihmetic -static inline Vec32uc & operator >>= (Vec32uc & a, uint32_t b) { - a = a >> b; - return a; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec32cb operator >= (Vec32uc const & a, Vec32uc const & b) { - return _mm256_cmpeq_epi8(_mm256_max_epu8(a,b), a); // a == max(a,b) -} - -// vector operator <= : returns true for 
elements for which a <= b (unsigned) -static inline Vec32cb operator <= (Vec32uc const & a, Vec32uc const & b) { - return b >= a; -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec32cb operator > (Vec32uc const & a, Vec32uc const & b) { - return Vec32cb(Vec32c(~(b >= a))); -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec32cb operator < (Vec32uc const & a, Vec32uc const & b) { - return b > a; -} - -// vector operator & : bitwise and -static inline Vec32uc operator & (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(Vec256b(a) & Vec256b(b)); -} -static inline Vec32uc operator && (Vec32uc const & a, Vec32uc const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec32uc operator | (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(Vec256b(a) | Vec256b(b)); -} -static inline Vec32uc operator || (Vec32uc const & a, Vec32uc const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec32uc operator ^ (Vec32uc const & a, Vec32uc const & b) { - return Vec32uc(Vec256b(a) ^ Vec256b(b)); -} - -// vector operator ~ : bitwise not -static inline Vec32uc operator ~ (Vec32uc const & a) { - return Vec32uc( ~ Vec256b(a)); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 32; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec32uc select (Vec32cb const & s, Vec32uc const & a, Vec32uc const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec32uc if_add (Vec32cb const & f, Vec32uc const & a, Vec32uc const & b) { - return a + (Vec32uc(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -// (Note: horizontal_add_x(Vec32uc) is slightly faster) -static inline uint32_t horizontal_add (Vec32uc const & a) { - __m256i sum1 = _mm256_sad_epu8(a,_mm256_setzero_si256()); - __m256i sum2 = _mm256_shuffle_epi32(sum1,2); - __m256i sum3 = _mm256_add_epi16(sum1,sum2); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum4 = _mm256_extractf128_si256(sum3,1); // bug in MS compiler VS 11 -#else - __m128i sum4 = _mm256_extracti128_si256(sum3,1); -#endif - __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); - uint8_t sum6 = (uint8_t)_mm_cvtsi128_si32(sum5); // truncate to 8 bits - return sum6; // zero extend to 32 bits -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Each element is zero-extended before addition to avoid overflow -static inline uint32_t horizontal_add_x (Vec32uc const & a) { - __m256i sum1 = _mm256_sad_epu8(a,_mm256_setzero_si256()); - __m256i sum2 = _mm256_shuffle_epi32(sum1,2); - __m256i sum3 = _mm256_add_epi16(sum1,sum2); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! 
defined(__INTEL_COMPILER) - __m128i sum4 = _mm256_extractf128_si256(sum3,1); // bug in MS compiler VS 11 -#else - __m128i sum4 = _mm256_extracti128_si256(sum3,1); -#endif - __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); - return _mm_cvtsi128_si32(sum5); -} - -// function add_saturated: add element by element, unsigned with saturation -static inline Vec32uc add_saturated(Vec32uc const & a, Vec32uc const & b) { - return _mm256_adds_epu8(a, b); -} - -// function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec32uc sub_saturated(Vec32uc const & a, Vec32uc const & b) { - return _mm256_subs_epu8(a, b); -} - -// function max: a > b ? a : b -static inline Vec32uc max(Vec32uc const & a, Vec32uc const & b) { - return _mm256_max_epu8(a,b); -} - -// function min: a < b ? a : b -static inline Vec32uc min(Vec32uc const & a, Vec32uc const & b) { - return _mm256_min_epu8(a,b); -} - -// function avg: (a + b + 1) >> 1 -static inline Vec32uc avg(Vec32uc const & a, Vec32uc const & b) { - return _mm256_avg_epu8(a,b); -} - - - -/***************************************************************************** -* -* Vector of 16 16-bit signed integers -* -*****************************************************************************/ - -class Vec16s : public Vec256b { -public: - // Default constructor: - Vec16s() { - } - // Constructor to broadcast the same value into all elements: - Vec16s(int i) { - ymm = _mm256_set1_epi16((int16_t)i); - } - // Constructor to build from all elements: - Vec16s(int16_t i0, int16_t i1, int16_t i2, int16_t i3, int16_t i4, int16_t i5, int16_t i6, int16_t i7, - int16_t i8, int16_t i9, int16_t i10, int16_t i11, int16_t i12, int16_t i13, int16_t i14, int16_t i15) { - ymm = _mm256_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 ); - } - // Constructor to build from two Vec8s: - Vec16s(Vec8s const & a0, Vec8s const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec16s(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec16s & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Type cast operator to convert to __m256i used in intrinsics - operator __m256i() const { - return ymm; - } - // Member function to load from array (unaligned) - Vec16s & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - Vec16s & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Member function to load 16 8-bit unsigned integers from array - Vec16s & load_16uc(void const * p) { - ymm = _mm256_cvtepu8_epi16(Vec16uc().load(p)); - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec16s & load_partial(int n, void const * p) { - if (n <= 0) { - *this = 0; - } - else if (n <= 8) { - *this = Vec16s(Vec8s().load_partial(n, p), 0); - } - else if (n < 16) { - *this = Vec16s(Vec8s().load(p), Vec8s().load_partial(n-8, (int16_t const*)p+8)); - } - else { - load(p); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, void * p) const { - if (n <= 0) { - return; - } - else if (n <= 8) { - get_low().store_partial(n, p); - } - else if (n < 16) { - get_low().store(p); - get_high().store_partial(n-8, (int16_t*)p+8); - } - else { - store(p); - } - } - // cut off vector to n elements. 
The last 16-n elements are set to zero - Vec16s & cutoff(int n) { - *this = Vec32c(*this).cutoff(n * 2); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec16s const & insert(uint32_t index, int16_t value) { - static const int16_t m[32] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, -1,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; - __m256i mask = Vec256b().load(m + 16 - (index & 0x0F)); - __m256i broad = _mm256_set1_epi16(value); - ymm = selectb(mask, broad, ymm); - return *this; - } - // Member function extract a single element from vector - int16_t extract(uint32_t index) const { - int16_t x[16]; - store(x); - return x[index & 0x0F]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - int16_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec8s: - Vec8s get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec8s get_high() const { - return _mm256_extracti128_si256(ymm,1); - } - static int size() { - return 16; - } -}; - - -/***************************************************************************** -* -* Vec16sb: Vector of 16 Booleans for use with Vec16s and Vec16us -* -*****************************************************************************/ -class Vec16sb : public Vec16s { -public: - // Default constructor: - Vec16sb() { - } - // Constructor to build from all elements: - Vec16sb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7, - bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) : - Vec16s(-int16_t(x0), -int16_t(x1), -int16_t(x2), -int16_t(x3), -int16_t(x4), -int16_t(x5), -int16_t(x6), -int16_t(x7), - -int16_t(x8), -int16_t(x9), -int16_t(x10), -int16_t(x11), -int16_t(x12), -int16_t(x13), -int16_t(x14), -int16_t(x15)) - {} - // Constructor to convert from type __m256i used in intrinsics: - Vec16sb(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec16sb & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec16sb(bool b) : Vec16s(-int16_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec16sb & operator = (bool b) { - *this = Vec16sb(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec16sb(int b); - Vec16sb & operator = (int x); -public: - Vec8sb get_low() const { - return Vec8sb(Vec16s::get_low()); - } - Vec8sb get_high() const { - return Vec8sb(Vec16s::get_high()); - } - Vec16sb & insert (int index, bool a) { - Vec16s::insert(index, -(int)a); - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec16s::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec16sb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec16sb operator & (Vec16sb const & a, Vec16sb const & b) { - return Vec16sb(Vec256b(a) & Vec256b(b)); -} -static inline Vec16sb operator && (Vec16sb const & a, Vec16sb const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec16sb & operator &= (Vec16sb & a, Vec16sb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec16sb operator | (Vec16sb const & a, Vec16sb const & b) { - return Vec16sb(Vec256b(a) | Vec256b(b)); -} -static inline Vec16sb operator || (Vec16sb const & a, Vec16sb const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec16sb & operator |= (Vec16sb & a, Vec16sb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec16sb operator ^ (Vec16sb const & a, Vec16sb const & b) { - return Vec16sb(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec16sb & operator ^= (Vec16sb & a, Vec16sb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec16sb operator ~ (Vec16sb const & a) { - return Vec16sb( ~ Vec256b(a)); -} - -// vector operator ! : element not -static inline Vec16sb operator ! (Vec16sb const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec16sb andnot (Vec16sb const & a, Vec16sb const & b) { - return Vec16sb(andnot(Vec256b(a), Vec256b(b))); -} - - -/***************************************************************************** -* -* Operators for Vec16s -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec16s operator + (Vec16s const & a, Vec16s const & b) { - return _mm256_add_epi16(a, b); -} - -// vector operator += : add -static inline Vec16s & operator += (Vec16s & a, Vec16s const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec16s operator ++ (Vec16s & a, int) { - Vec16s a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec16s & operator ++ (Vec16s & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec16s operator - (Vec16s const & a, Vec16s const & b) { - return _mm256_sub_epi16(a, b); -} - -// vector operator - : unary minus -static inline Vec16s operator - (Vec16s const & a) { - return _mm256_sub_epi16(_mm256_setzero_si256(), a); -} - -// vector operator -= : subtract -static inline Vec16s & operator -= (Vec16s & a, Vec16s const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec16s operator -- (Vec16s & a, int) { - Vec16s a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec16s & operator -- (Vec16s & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec16s operator * (Vec16s const & a, Vec16s const & b) { - return _mm256_mullo_epi16(a, b); -} - -// vector operator *= : multiply -static inline Vec16s & operator *= (Vec16s & a, Vec16s const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -// See bottom of file - - -// vector operator << : shift left 
-static inline Vec16s operator << (Vec16s const & a, int b) { - return _mm256_sll_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator <<= : shift left -static inline Vec16s & operator <<= (Vec16s & a, int b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic -static inline Vec16s operator >> (Vec16s const & a, int b) { - return _mm256_sra_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >>= : shift right arithmetic -static inline Vec16s & operator >>= (Vec16s & a, int b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec16sb operator == (Vec16s const & a, Vec16s const & b) { - return _mm256_cmpeq_epi16(a, b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec16sb operator != (Vec16s const & a, Vec16s const & b) { - return Vec16sb(Vec16s(~(a == b))); -} - -// vector operator > : returns true for elements for which a > b -static inline Vec16sb operator > (Vec16s const & a, Vec16s const & b) { - return _mm256_cmpgt_epi16(a, b); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec16sb operator < (Vec16s const & a, Vec16s const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec16sb operator >= (Vec16s const & a, Vec16s const & b) { - return Vec16sb(Vec16s(~(b > a))); -} - -// vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec16sb operator <= (Vec16s const & a, Vec16s const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec16s operator & (Vec16s const & a, Vec16s const & b) { - return Vec16s(Vec256b(a) & Vec256b(b)); -} -static inline Vec16s operator && (Vec16s const & a, Vec16s const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec16s & operator &= (Vec16s & a, Vec16s const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec16s operator | (Vec16s const & a, Vec16s const & b) { - return Vec16s(Vec256b(a) | Vec256b(b)); -} -static inline Vec16s operator || (Vec16s const & a, Vec16s const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec16s & operator |= (Vec16s & a, Vec16s const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec16s operator ^ (Vec16s const & a, Vec16s const & b) { - return Vec16s(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec16s & operator ^= (Vec16s & a, Vec16s const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec16s operator ~ (Vec16s const & a) { - return Vec16s( ~ Vec256b(a)); -} - -// vector operator ! : logical not, returns true for elements == 0 -static inline Vec16sb operator ! (Vec16s const & a) { - return _mm256_cmpeq_epi16(a,_mm256_setzero_si256()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec16s select (Vec16sb const & s, Vec16s const & a, Vec16s const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? 
(a[i] + b[i]) : a[i] -static inline Vec16s if_add (Vec16sb const & f, Vec16s const & a, Vec16s const & b) { - return a + (Vec16s(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec16s const & a) { - __m256i sum1 = _mm256_hadd_epi16(a,a); // horizontally add 2x8 elements in 3 steps - __m256i sum2 = _mm256_hadd_epi16(sum1,sum1); - __m256i sum3 = _mm256_hadd_epi16(sum2,sum2); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum4 = _mm256_extractf128_si256(sum3,1); // bug in MS compiler VS 11 -#else - __m128i sum4 = _mm256_extracti128_si256(sum3,1); // get high part -#endif - __m128i sum5 = _mm_add_epi16(_mm256_castsi256_si128(sum3),sum4); // add low and high parts - int16_t sum6 = (int16_t)_mm_cvtsi128_si32(sum5); // truncate to 16 bits - return sum6; // sign extend to 32 bits -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Elements are sign extended before adding to avoid overflow -static inline int32_t horizontal_add_x (Vec16s const & a) { - __m256i aeven = _mm256_slli_epi32(a,16); // even numbered elements of a. get sign bit in position - aeven = _mm256_srai_epi32(aeven,16); // sign extend even numbered elements - __m256i aodd = _mm256_srai_epi32(a,16); // sign extend odd numbered elements - __m256i sum1 = _mm256_add_epi32(aeven,aodd); // add even and odd elements - __m256i sum2 = _mm256_hadd_epi32(sum1,sum1); // horizontally add 2x4 elements in 2 steps - __m256i sum3 = _mm256_hadd_epi32(sum2,sum2); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum4 = _mm256_extractf128_si256(sum3,1); // bug in MS compiler VS 11 -#else - __m128i sum4 = _mm256_extracti128_si256(sum3,1); -#endif - __m128i sum5 = _mm_add_epi32(_mm256_castsi256_si128(sum3),sum4); - return _mm_cvtsi128_si32(sum5); -} - -// function add_saturated: add element by element, signed with saturation -static inline Vec16s add_saturated(Vec16s const & a, Vec16s const & b) { - return _mm256_adds_epi16(a, b); -} - -// function sub_saturated: subtract element by element, signed with saturation -static inline Vec16s sub_saturated(Vec16s const & a, Vec16s const & b) { - return _mm256_subs_epi16(a, b); -} - -// function max: a > b ? a : b -static inline Vec16s max(Vec16s const & a, Vec16s const & b) { - return _mm256_max_epi16(a,b); -} - -// function min: a < b ? a : b -static inline Vec16s min(Vec16s const & a, Vec16s const & b) { - return _mm256_min_epi16(a,b); -} - -// function abs: a >= 0 ? 
a : -a -static inline Vec16s abs(Vec16s const & a) { - return _mm256_sign_epi16(a,a); -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec16s abs_saturated(Vec16s const & a) { - __m256i absa = abs(a); // abs(a) - __m256i overfl = _mm256_srai_epi16(absa,15); // sign - return _mm256_add_epi16(absa,overfl); // subtract 1 if 0x8000 -} - -// function rotate_left all elements -// Use negative count to rotate right -static inline Vec16s rotate_left(Vec16s const & a, int b) { - __m256i left = _mm256_sll_epi16(a,_mm_cvtsi32_si128(b & 0x0F)); // a << b - __m256i right = _mm256_srl_epi16(a,_mm_cvtsi32_si128((16-b) & 0x0F)); // a >> (16 - b) - __m256i rot = _mm256_or_si256(left,right); // or - return rot; -} - - -/***************************************************************************** -* -* Vector of 16 16-bit unsigned integers -* -*****************************************************************************/ - -class Vec16us : public Vec16s { -public: - // Default constructor: - Vec16us(){ - } - // Constructor to broadcast the same value into all elements: - Vec16us(uint32_t i) { - ymm = _mm256_set1_epi16((int16_t)i); - } - // Constructor to build from all elements: - Vec16us(uint16_t i0, uint16_t i1, uint16_t i2, uint16_t i3, uint16_t i4, uint16_t i5, uint16_t i6, uint16_t i7, - uint16_t i8, uint16_t i9, uint16_t i10, uint16_t i11, uint16_t i12, uint16_t i13, uint16_t i14, uint16_t i15) { - ymm = _mm256_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 ); - } - // Constructor to build from two Vec8us: - Vec16us(Vec8us const & a0, Vec8us const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec16us(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec16us & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Member function to load from array (unaligned) - Vec16us & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - Vec16us & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec16us const & insert(uint32_t index, uint16_t value) { - Vec16s::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint16_t extract(uint32_t index) const { - return Vec16s::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- uint16_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec8us: - Vec8us get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec8us get_high() const { - return _mm256_extracti128_si256(ymm,1); - } -}; - -// Define operators for this class - -// vector operator + : add -static inline Vec16us operator + (Vec16us const & a, Vec16us const & b) { - return Vec16us (Vec16s(a) + Vec16s(b)); -} - -// vector operator - : subtract -static inline Vec16us operator - (Vec16us const & a, Vec16us const & b) { - return Vec16us (Vec16s(a) - Vec16s(b)); -} - -// vector operator * : multiply -static inline Vec16us operator * (Vec16us const & a, Vec16us const & b) { - return Vec16us (Vec16s(a) * Vec16s(b)); -} - -// vector operator / : divide -// See bottom of file - -// vector operator >> : shift right logical all elements -static inline Vec16us operator >> (Vec16us const & a, uint32_t b) { - return _mm256_srl_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec16us operator >> (Vec16us const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right artihmetic -static inline Vec16us & operator >>= (Vec16us & a, uint32_t b) { - a = a >> b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec16us operator << (Vec16us const & a, uint32_t b) { - return _mm256_sll_epi16(a,_mm_cvtsi32_si128(b)); -} - -// vector operator << : shift left all elements -static inline Vec16us operator << (Vec16us const & a, int32_t b) { - return a << (uint32_t)b; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec16sb operator >= (Vec16us const & a, Vec16us const & b) { - __m256i max_ab = _mm256_max_epu16(a,b); // max(a,b), unsigned - return _mm256_cmpeq_epi16(a,max_ab); // a == max(a,b) -} - -// vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec16sb operator <= (Vec16us const & a, Vec16us const & b) { - return b >= a; -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec16sb operator > (Vec16us const & a, Vec16us const & b) { - return Vec16sb(Vec16s(~(b >= a))); -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec16sb operator < (Vec16us const & a, Vec16us const & b) { - return b > a; -} - -// vector operator & : bitwise and -static inline Vec16us operator & (Vec16us const & a, Vec16us const & b) { - return Vec16us(Vec256b(a) & Vec256b(b)); -} -static inline Vec16us operator && (Vec16us const & a, Vec16us const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec16us operator | (Vec16us const & a, Vec16us const & b) { - return Vec16us(Vec256b(a) | Vec256b(b)); -} -static inline Vec16us operator || (Vec16us const & a, Vec16us const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec16us operator ^ (Vec16us const & a, Vec16us const & b) { - return Vec16us(Vec256b(a) ^ Vec256b(b)); -} - -// vector operator ~ : bitwise not -static inline Vec16us operator ~ (Vec16us const & a) { - return Vec16us( ~ Vec256b(a)); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; -// Each word in s must be either 0 (false) or -1 (true). No other values are allowed. 
-// (s is signed) -static inline Vec16us select (Vec16sb const & s, Vec16us const & a, Vec16us const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec16us if_add (Vec16sb const & f, Vec16us const & a, Vec16us const & b) { - return a + (Vec16us(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint32_t horizontal_add (Vec16us const & a) { - __m256i sum1 = _mm256_hadd_epi16(a,a); // horizontally add 2x8 elements in 3 steps - __m256i sum2 = _mm256_hadd_epi16(sum1,sum1); - __m256i sum3 = _mm256_hadd_epi16(sum2,sum2); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum4 = _mm256_extractf128_si256(sum3,1); // bug in MS compiler VS 11 -#else - __m128i sum4 = _mm256_extracti128_si256(sum3,1); // get high part -#endif - __m128i sum5 = _mm_add_epi32(_mm256_castsi256_si128(sum3),sum4); // add low and high parts - return _mm_cvtsi128_si32(sum5); -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Each element is zero-extended before addition to avoid overflow -static inline uint32_t horizontal_add_x (Vec16us const & a) { - __m256i mask = _mm256_set1_epi32(0x0000FFFF); // mask for even positions - __m256i aeven = _mm256_and_si256(a,mask); // even numbered elements of a - __m256i aodd = _mm256_srli_epi32(a,16); // zero extend odd numbered elements - __m256i sum1 = _mm256_add_epi32(aeven,aodd); // add even and odd elements - __m256i sum2 = _mm256_hadd_epi32(sum1,sum1); // horizontally add 2x4 elements in 2 steps - __m256i sum3 = _mm256_hadd_epi32(sum2,sum2); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum4 = _mm256_extractf128_si256(sum3,1); // bug in MS compiler VS 11 -#else - __m128i sum4 = _mm256_extracti128_si256(sum3,1); // get high part -#endif - __m128i sum5 = _mm_add_epi32(_mm256_castsi256_si128(sum3),sum4); // add low and high parts - return _mm_cvtsi128_si32(sum5); -} - -// function add_saturated: add element by element, unsigned with saturation -static inline Vec16us add_saturated(Vec16us const & a, Vec16us const & b) { - return _mm256_adds_epu16(a, b); -} - -// function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec16us sub_saturated(Vec16us const & a, Vec16us const & b) { - return _mm256_subs_epu16(a, b); -} - -// function max: a > b ? a : b -static inline Vec16us max(Vec16us const & a, Vec16us const & b) { - return _mm256_max_epu16(a,b); -} - -// function min: a < b ? 
a : b -static inline Vec16us min(Vec16us const & a, Vec16us const & b) { - return _mm256_min_epu16(a,b); -} - -// function avg: (a + b + 1) >> 1 -static inline Vec16us avg(Vec16us const & a, Vec16us const & b) { - return _mm256_avg_epu16(a,b); -} - - -/***************************************************************************** -* -* Vector of 8 32-bit signed integers -* -*****************************************************************************/ - -class Vec8i : public Vec256b { -public: - // Default constructor: - Vec8i() { - } - // Constructor to broadcast the same value into all elements: - Vec8i(int i) { - ymm = _mm256_set1_epi32(i); - } - // Constructor to build from all elements: - Vec8i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7) { - ymm = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7); - } - // Constructor to build from two Vec4i: - Vec8i(Vec4i const & a0, Vec4i const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec8i(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec8i & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Type cast operator to convert to __m256i used in intrinsics - operator __m256i() const { - return ymm; - } - // Member function to load from array (unaligned) - Vec8i & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - Vec8i & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Member function to load 8 8-bit unsigned integers from array - Vec8i & load_8uc(void const * p) { - ymm = _mm256_cvtepu8_epi32(Vec16uc().loadl(p)); - return *this; - } - // Member function to load 8 16-bit unsigned integers from array - Vec8i & load_8us(void const * p) { - ymm = _mm256_cvtepu16_epi32(Vec8us().load(p)); - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec8i & load_partial(int n, void const * p) { - if (n <= 0) { - *this = 0; - } - else if (n <= 4) { - *this = Vec8i(Vec4i().load_partial(n, p), 0); - } - else if (n < 8) { - *this = Vec8i(Vec4i().load(p), Vec4i().load_partial(n-4, (int32_t const*)p+4)); - } - else { - load(p); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, void * p) const { - if (n <= 0) { - return; - } - else if (n <= 4) { - get_low().store_partial(n, p); - } - else if (n < 8) { - get_low().store(p); - get_high().store_partial(n-4, (int32_t*)p+4); - } - else { - store(p); - } - } - // cut off vector to n elements. The last 8-n elements are set to zero - Vec8i & cutoff(int n) { - *this = Vec32c(*this).cutoff(n * 4); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8i const & insert(uint32_t index, int32_t value) { - static const int32_t maskl[16] = {0,0,0,0,0,0,0,0, -1,0,0,0,0,0,0,0}; - __m256i broad = _mm256_set1_epi32(value); // broadcast value into all elements - __m256i mask = Vec256b().load(maskl + 8 - (index & 7)); // mask with FFFFFFFF at index position - ymm = selectb (mask, broad, ymm); - return *this; - } - // Member function extract a single element from vector - int32_t extract(uint32_t index) const { - int32_t x[8]; - store(x); - return x[index & 7]; - } - // Extract a single element. 
Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - int32_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec4i: - Vec4i get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec4i get_high() const { - return _mm256_extracti128_si256(ymm,1); - } - static int size() { - return 8; - } -}; - - -/***************************************************************************** -* -* Vec8ib: Vector of 8 Booleans for use with Vec8i and Vec8ui -* -*****************************************************************************/ - -class Vec8ib : public Vec8i { -public: - // Default constructor: - Vec8ib() { - } - // Constructor to build from all elements: - Vec8ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) : - Vec8i(-int32_t(x0), -int32_t(x1), -int32_t(x2), -int32_t(x3), -int32_t(x4), -int32_t(x5), -int32_t(x6), -int32_t(x7)) - {} - // Constructor to convert from type __m256i used in intrinsics: - Vec8ib(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec8ib & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec8ib(bool b) : Vec8i(-int32_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec8ib & operator = (bool b) { - *this = Vec8ib(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec8ib(int b); - Vec8ib & operator = (int x); -public: - Vec4ib get_low() const { - return Vec4ib(Vec8i::get_low()); - } - Vec4ib get_high() const { - return Vec4ib(Vec8i::get_high()); - } - Vec8ib & insert (int index, bool a) { - Vec8i::insert(index, -(int)a); - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec8i::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec8ib -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec8ib operator & (Vec8ib const & a, Vec8ib const & b) { - return Vec8ib(Vec256b(a) & Vec256b(b)); -} -static inline Vec8ib operator && (Vec8ib const & a, Vec8ib const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec8ib & operator &= (Vec8ib & a, Vec8ib const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec8ib operator | (Vec8ib const & a, Vec8ib const & b) { - return Vec8ib(Vec256b(a) | Vec256b(b)); -} -static inline Vec8ib operator || (Vec8ib const & a, Vec8ib const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec8ib & operator |= (Vec8ib & a, Vec8ib const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec8ib operator ^ (Vec8ib const & a, Vec8ib const & b) { - return Vec8ib(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec8ib & operator ^= (Vec8ib & a, Vec8ib const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec8ib operator ~ (Vec8ib const & a) { - return Vec8ib( ~ Vec256b(a)); -} - -// vector operator ! 
: element not -static inline Vec8ib operator ! (Vec8ib const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec8ib andnot (Vec8ib const & a, Vec8ib const & b) { - return Vec8ib(andnot(Vec256b(a), Vec256b(b))); -} - - -/***************************************************************************** -* -* Operators for Vec8i -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec8i operator + (Vec8i const & a, Vec8i const & b) { - return _mm256_add_epi32(a, b); -} - -// vector operator += : add -static inline Vec8i & operator += (Vec8i & a, Vec8i const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec8i operator ++ (Vec8i & a, int) { - Vec8i a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec8i & operator ++ (Vec8i & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec8i operator - (Vec8i const & a, Vec8i const & b) { - return _mm256_sub_epi32(a, b); -} - -// vector operator - : unary minus -static inline Vec8i operator - (Vec8i const & a) { - return _mm256_sub_epi32(_mm256_setzero_si256(), a); -} - -// vector operator -= : subtract -static inline Vec8i & operator -= (Vec8i & a, Vec8i const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec8i operator -- (Vec8i & a, int) { - Vec8i a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec8i & operator -- (Vec8i & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec8i operator * (Vec8i const & a, Vec8i const & b) { - return _mm256_mullo_epi32(a, b); -} - -// vector operator *= : multiply -static inline Vec8i & operator *= (Vec8i & a, Vec8i const & b) { - a = a * b; - return a; -} - -// vector operator / : divide all elements by same integer -// See bottom of file - - -// vector operator << : shift left -static inline Vec8i operator << (Vec8i const & a, int32_t b) { - return _mm256_sll_epi32(a, _mm_cvtsi32_si128(b)); -} - -// vector operator <<= : shift left -static inline Vec8i & operator <<= (Vec8i & a, int32_t b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic -static inline Vec8i operator >> (Vec8i const & a, int32_t b) { - return _mm256_sra_epi32(a, _mm_cvtsi32_si128(b)); -} - -// vector operator >>= : shift right arithmetic -static inline Vec8i & operator >>= (Vec8i & a, int32_t b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec8ib operator == (Vec8i const & a, Vec8i const & b) { - return _mm256_cmpeq_epi32(a, b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec8ib operator != (Vec8i const & a, Vec8i const & b) { - return Vec8ib(Vec8i(~(a == b))); -} - -// vector operator > : returns true for elements for which a > b -static inline Vec8ib operator > (Vec8i const & a, Vec8i const & b) { - return _mm256_cmpgt_epi32(a, b); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec8ib operator < (Vec8i const & a, Vec8i const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec8ib operator >= (Vec8i const & a, Vec8i const & b) { - return Vec8ib(Vec8i(~(b > a))); -} - -// vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec8ib operator 
<= (Vec8i const & a, Vec8i const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec8i operator & (Vec8i const & a, Vec8i const & b) { - return Vec8i(Vec256b(a) & Vec256b(b)); -} -static inline Vec8i operator && (Vec8i const & a, Vec8i const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec8i & operator &= (Vec8i & a, Vec8i const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec8i operator | (Vec8i const & a, Vec8i const & b) { - return Vec8i(Vec256b(a) | Vec256b(b)); -} -static inline Vec8i operator || (Vec8i const & a, Vec8i const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec8i & operator |= (Vec8i & a, Vec8i const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec8i operator ^ (Vec8i const & a, Vec8i const & b) { - return Vec8i(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec8i & operator ^= (Vec8i & a, Vec8i const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec8i operator ~ (Vec8i const & a) { - return Vec8i( ~ Vec256b(a)); -} - -// vector operator ! : returns true for elements == 0 -static inline Vec8ib operator ! (Vec8i const & a) { - return _mm256_cmpeq_epi32(a, _mm256_setzero_si256()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec8i select (Vec8ib const & s, Vec8i const & a, Vec8i const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8i if_add (Vec8ib const & f, Vec8i const & a, Vec8i const & b) { - return a + (Vec8i(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int32_t horizontal_add (Vec8i const & a) { - __m256i sum1 = _mm256_hadd_epi32(a,a); // horizontally add 2x4 elements in 2 steps - __m256i sum2 = _mm256_hadd_epi32(sum1,sum1); -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum3 = _mm256_extractf128_si256(sum2,1); // bug in MS VS 11 -#else - __m128i sum3 = _mm256_extracti128_si256(sum2,1); // get high part -#endif - __m128i sum4 = _mm_add_epi32(_mm256_castsi256_si128(sum2),sum3); // add low and high parts - return _mm_cvtsi128_si32(sum4); -} - -// Horizontal add extended: Calculates the sum of all vector elements. 
-// Elements are sign extended before adding to avoid overflow -// static inline int64_t horizontal_add_x (Vec8i const & a); // defined below - -// function add_saturated: add element by element, signed with saturation -static inline Vec8i add_saturated(Vec8i const & a, Vec8i const & b) { - __m256i sum = _mm256_add_epi32(a, b); // a + b - __m256i axb = _mm256_xor_si256(a, b); // check if a and b have different sign - __m256i axs = _mm256_xor_si256(a, sum); // check if a and sum have different sign - __m256i overf1 = _mm256_andnot_si256(axb,axs); // check if sum has wrong sign - __m256i overf2 = _mm256_srai_epi32(overf1,31); // -1 if overflow - __m256i asign = _mm256_srli_epi32(a,31); // 1 if a < 0 - __m256i sat1 = _mm256_srli_epi32(overf2,1); // 7FFFFFFF if overflow - __m256i sat2 = _mm256_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow - return selectb(overf2,sat2,sum); // sum if not overflow, else sat2 -} - -// function sub_saturated: subtract element by element, signed with saturation -static inline Vec8i sub_saturated(Vec8i const & a, Vec8i const & b) { - __m256i diff = _mm256_sub_epi32(a, b); // a + b - __m256i axb = _mm256_xor_si256(a, b); // check if a and b have different sign - __m256i axs = _mm256_xor_si256(a, diff); // check if a and sum have different sign - __m256i overf1 = _mm256_and_si256(axb,axs); // check if sum has wrong sign - __m256i overf2 = _mm256_srai_epi32(overf1,31); // -1 if overflow - __m256i asign = _mm256_srli_epi32(a,31); // 1 if a < 0 - __m256i sat1 = _mm256_srli_epi32(overf2,1); // 7FFFFFFF if overflow - __m256i sat2 = _mm256_add_epi32(sat1,asign); // 7FFFFFFF if positive overflow 80000000 if negative overflow - return selectb(overf2,sat2,diff); // diff if not overflow, else sat2 -} - -// function max: a > b ? a : b -static inline Vec8i max(Vec8i const & a, Vec8i const & b) { - return _mm256_max_epi32(a,b); -} - -// function min: a < b ? a : b -static inline Vec8i min(Vec8i const & a, Vec8i const & b) { - return _mm256_min_epi32(a,b); -} - -// function abs: a >= 0 ? 
a : -a -static inline Vec8i abs(Vec8i const & a) { - return _mm256_sign_epi32(a,a); -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec8i abs_saturated(Vec8i const & a) { - __m256i absa = abs(a); // abs(a) - __m256i overfl = _mm256_srai_epi32(absa,31); // sign - return _mm256_add_epi32(absa,overfl); // subtract 1 if 0x80000000 -} - -// function rotate_left all elements -// Use negative count to rotate right -static inline Vec8i rotate_left(Vec8i const & a, int b) { -#ifdef __AVX512VL__ - return _mm256_rolv_epi32(a, _mm256_set1_epi32(b)); -#else - __m256i left = _mm256_sll_epi32(a,_mm_cvtsi32_si128(b & 0x1F)); // a << b - __m256i right = _mm256_srl_epi32(a,_mm_cvtsi32_si128((32-b) & 0x1F)); // a >> (32 - b) - __m256i rot = _mm256_or_si256(left,right); // or - return rot; -#endif -} - - -/***************************************************************************** -* -* Vector of 8 32-bit unsigned integers -* -*****************************************************************************/ - -class Vec8ui : public Vec8i { -public: - // Default constructor: - Vec8ui() { - } - // Constructor to broadcast the same value into all elements: - Vec8ui(uint32_t i) { - ymm = _mm256_set1_epi32(i); - } - // Constructor to build from all elements: - Vec8ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7) { - ymm = _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7); - } - // Constructor to build from two Vec4ui: - Vec8ui(Vec4ui const & a0, Vec4ui const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec8ui(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec8ui & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Member function to load from array (unaligned) - Vec8ui & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - Vec8ui & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec8ui const & insert(uint32_t index, uint32_t value) { - Vec8i::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint32_t extract(uint32_t index) const { - return Vec8i::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- uint32_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec4ui: - Vec4ui get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec4ui get_high() const { - return _mm256_extracti128_si256(ymm,1); - } -}; - -// Define operators for this class - -// vector operator + : add -static inline Vec8ui operator + (Vec8ui const & a, Vec8ui const & b) { - return Vec8ui (Vec8i(a) + Vec8i(b)); -} - -// vector operator - : subtract -static inline Vec8ui operator - (Vec8ui const & a, Vec8ui const & b) { - return Vec8ui (Vec8i(a) - Vec8i(b)); -} - -// vector operator * : multiply -static inline Vec8ui operator * (Vec8ui const & a, Vec8ui const & b) { - return Vec8ui (Vec8i(a) * Vec8i(b)); -} - -// vector operator / : divide -// See bottom of file - -// vector operator >> : shift right logical all elements -static inline Vec8ui operator >> (Vec8ui const & a, uint32_t b) { - return _mm256_srl_epi32(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec8ui operator >> (Vec8ui const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right logical -static inline Vec8ui & operator >>= (Vec8ui & a, uint32_t b) { - a = a >> b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec8ui operator << (Vec8ui const & a, uint32_t b) { - return Vec8ui ((Vec8i)a << (int32_t)b); -} - -// vector operator << : shift left all elements -static inline Vec8ui operator << (Vec8ui const & a, int32_t b) { - return Vec8ui ((Vec8i)a << (int32_t)b); -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec8ib operator > (Vec8ui const & a, Vec8ui const & b) { - __m256i signbit = _mm256_set1_epi32(0x80000000); - __m256i a1 = _mm256_xor_si256(a,signbit); - __m256i b1 = _mm256_xor_si256(b,signbit); - return _mm256_cmpgt_epi32(a1,b1); // signed compare -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec8ib operator < (Vec8ui const & a, Vec8ui const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec8ib operator >= (Vec8ui const & a, Vec8ui const & b) { - __m256i max_ab = _mm256_max_epu32(a,b); // max(a,b), unsigned - return _mm256_cmpeq_epi32(a,max_ab); // a == max(a,b) -} - -// vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec8ib operator <= (Vec8ui const & a, Vec8ui const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec8ui operator & (Vec8ui const & a, Vec8ui const & b) { - return Vec8ui(Vec256b(a) & Vec256b(b)); -} -static inline Vec8ui operator && (Vec8ui const & a, Vec8ui const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec8ui operator | (Vec8ui const & a, Vec8ui const & b) { - return Vec8ui(Vec256b(a) | Vec256b(b)); -} -static inline Vec8ui operator || (Vec8ui const & a, Vec8ui const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec8ui operator ^ (Vec8ui const & a, Vec8ui const & b) { - return Vec8ui(Vec256b(a) ^ Vec256b(b)); -} - -// vector operator ~ : bitwise not -static inline Vec8ui operator ~ (Vec8ui const & a) { - return Vec8ui( ~ Vec256b(a)); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 16; i++) result[i] = s[i] ? 
a[i] : b[i]; -// Each word in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec8ui select (Vec8ib const & s, Vec8ui const & a, Vec8ui const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec8ui if_add (Vec8ib const & f, Vec8ui const & a, Vec8ui const & b) { - return a + (Vec8ui(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint32_t horizontal_add (Vec8ui const & a) { - return horizontal_add((Vec8i)a); -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Elements are zero extended before adding to avoid overflow -// static inline uint64_t horizontal_add_x (Vec8ui const & a); // defined later - -// function add_saturated: add element by element, unsigned with saturation -static inline Vec8ui add_saturated(Vec8ui const & a, Vec8ui const & b) { - Vec8ui sum = a + b; - Vec8ui aorb = Vec8ui(a | b); - Vec8ui overflow = Vec8ui(sum < aorb); // overflow if a + b < (a | b) - return Vec8ui (sum | overflow); // return 0xFFFFFFFF if overflow -} - -// function sub_saturated: subtract element by element, unsigned with saturation -static inline Vec8ui sub_saturated(Vec8ui const & a, Vec8ui const & b) { - Vec8ui diff = a - b; - Vec8ui underflow = Vec8ui(diff > a); // underflow if a - b > a - return _mm256_andnot_si256(underflow,diff); // return 0 if underflow -} - -// function max: a > b ? a : b -static inline Vec8ui max(Vec8ui const & a, Vec8ui const & b) { - return _mm256_max_epu32(a,b); -} - -// function min: a < b ? a : b -static inline Vec8ui min(Vec8ui const & a, Vec8ui const & b) { - return _mm256_min_epu32(a,b); -} - - -/***************************************************************************** -* -* Vector of 4 64-bit signed integers -* -*****************************************************************************/ - -class Vec4q : public Vec256b { -public: - // Default constructor: - Vec4q() { - } - // Constructor to broadcast the same value into all elements: - Vec4q(int64_t i) { -#if defined (_MSC_VER) && _MSC_VER < 1900 && ! defined (__x86_64__) && ! defined(__INTEL_COMPILER) - // MS compiler cannot use _mm256_set1_epi64x in 32 bit mode, and - // cannot put 64-bit values into xmm register without using - // mmx registers, and it makes no emms - union { - int64_t q[4]; - int32_t r[8]; - } u; - u.q[0] = u.q[1] = u.q[2] = u.q[3] = i; - ymm = _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]); -#else - ymm = _mm256_set1_epi64x(i); -#endif - } - // Constructor to build from all elements: - Vec4q(int64_t i0, int64_t i1, int64_t i2, int64_t i3) { -#if defined (_MSC_VER) && _MSC_VER < 1900 && ! defined (__x86_64__) && ! 
defined(__INTEL_COMPILER) - // MS compiler cannot put 64-bit values into xmm register without using - // mmx registers, and it makes no emms - union { - int64_t q[4]; - int32_t r[8]; - } u; - u.q[0] = i0; u.q[1] = i1; u.q[2] = i2; u.q[3] = i3; - ymm = _mm256_setr_epi32(u.r[0], u.r[1], u.r[2], u.r[3], u.r[4], u.r[5], u.r[6], u.r[7]); -#else - ymm = _mm256_setr_epi64x(i0, i1, i2, i3); -#endif - } - // Constructor to build from two Vec2q: - Vec4q(Vec2q const & a0, Vec2q const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec4q(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec4q & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Type cast operator to convert to __m256i used in intrinsics - operator __m256i() const { - return ymm; - } - // Member function to load from array (unaligned) - Vec4q & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - return *this; - } - // Member function to load from array, aligned by 32 - Vec4q & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Partial load. Load n elements and set the rest to 0 - Vec4q & load_partial(int n, void const * p) { - if (n <= 0) { - *this = 0; - } - else if (n <= 2) { - *this = Vec4q(Vec2q().load_partial(n, p), 0); - } - else if (n < 4) { - *this = Vec4q(Vec2q().load(p), Vec2q().load_partial(n-2, (int64_t const*)p+2)); - } - else { - load(p); - } - return *this; - } - // Partial store. Store n elements - void store_partial(int n, void * p) const { - if (n <= 0) { - return; - } - else if (n <= 2) { - get_low().store_partial(n, p); - } - else if (n < 4) { - get_low().store(p); - get_high().store_partial(n-2, (int64_t*)p+2); - } - else { - store(p); - } - } - // cut off vector to n elements. The last 8-n elements are set to zero - Vec4q & cutoff(int n) { - *this = Vec32c(*this).cutoff(n * 8); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4q const & insert(uint32_t index, int64_t value) { - Vec4q x(value); - switch (index) { - case 0: - ymm = _mm256_blend_epi32(ymm,x,0x03); break; - case 1: - ymm = _mm256_blend_epi32(ymm,x,0x0C); break; - case 2: - ymm = _mm256_blend_epi32(ymm,x,0x30); break; - case 3: - ymm = _mm256_blend_epi32(ymm,x,0xC0); break; - } - return *this; - } - // Member function extract a single element from vector - int64_t extract(uint32_t index) const { - int64_t x[4]; - store(x); - return x[index & 3]; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. 
- int64_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec2q: - Vec2q get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec2q get_high() const { - return _mm256_extracti128_si256(ymm,1); - } - static int size() { - return 4; - } -}; - -/***************************************************************************** -* -* Vec4qb: Vector of 4 Booleans for use with Vec4q and Vec4uq -* -*****************************************************************************/ - -class Vec4qb : public Vec4q { -public: - // Default constructor: - Vec4qb() { - } - // Constructor to build from all elements: - Vec4qb(bool x0, bool x1, bool x2, bool x3) : - Vec4q(-int64_t(x0), -int64_t(x1), -int64_t(x2), -int64_t(x3)) { - } - // Constructor to convert from type __m256i used in intrinsics: - Vec4qb(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec4qb & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Constructor to broadcast scalar value: - Vec4qb(bool b) : Vec4q(-int64_t(b)) { - } - // Assignment operator to broadcast scalar value: - Vec4qb & operator = (bool b) { - *this = Vec4qb(b); - return *this; - } -private: // Prevent constructing from int, etc. - Vec4qb(int b); - Vec4qb & operator = (int x); -public: - // Member functions to split into two Vec2qb: - Vec2qb get_low() const { - return Vec2qb(Vec4q::get_low()); - } - Vec2qb get_high() const { - return Vec2qb(Vec4q::get_high()); - } - Vec4qb & insert (int index, bool a) { - Vec4q::insert(index, -(int64_t)a); - return *this; - } - // Member function extract a single element from vector - bool extract(uint32_t index) const { - return Vec4q::extract(index) != 0; - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - bool operator [] (uint32_t index) const { - return extract(index); - } -}; - - -/***************************************************************************** -* -* Define operators for Vec4qb -* -*****************************************************************************/ - -// vector operator & : bitwise and -static inline Vec4qb operator & (Vec4qb const & a, Vec4qb const & b) { - return Vec4qb(Vec256b(a) & Vec256b(b)); -} -static inline Vec4qb operator && (Vec4qb const & a, Vec4qb const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec4qb & operator &= (Vec4qb & a, Vec4qb const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec4qb operator | (Vec4qb const & a, Vec4qb const & b) { - return Vec4qb(Vec256b(a) | Vec256b(b)); -} -static inline Vec4qb operator || (Vec4qb const & a, Vec4qb const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec4qb & operator |= (Vec4qb & a, Vec4qb const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4qb operator ^ (Vec4qb const & a, Vec4qb const & b) { - return Vec4qb(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec4qb & operator ^= (Vec4qb & a, Vec4qb const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec4qb operator ~ (Vec4qb const & a) { - return Vec4qb( ~ Vec256b(a)); -} - -// vector operator ! : element not -static inline Vec4qb operator ! 
(Vec4qb const & a) { - return ~ a; -} - -// vector function andnot -static inline Vec4qb andnot (Vec4qb const & a, Vec4qb const & b) { - return Vec4qb(andnot(Vec256b(a), Vec256b(b))); -} - - - - -/***************************************************************************** -* -* Operators for Vec4q -* -*****************************************************************************/ - -// vector operator + : add element by element -static inline Vec4q operator + (Vec4q const & a, Vec4q const & b) { - return _mm256_add_epi64(a, b); -} - -// vector operator += : add -static inline Vec4q & operator += (Vec4q & a, Vec4q const & b) { - a = a + b; - return a; -} - -// postfix operator ++ -static inline Vec4q operator ++ (Vec4q & a, int) { - Vec4q a0 = a; - a = a + 1; - return a0; -} - -// prefix operator ++ -static inline Vec4q & operator ++ (Vec4q & a) { - a = a + 1; - return a; -} - -// vector operator - : subtract element by element -static inline Vec4q operator - (Vec4q const & a, Vec4q const & b) { - return _mm256_sub_epi64(a, b); -} - -// vector operator - : unary minus -static inline Vec4q operator - (Vec4q const & a) { - return _mm256_sub_epi64(_mm256_setzero_si256(), a); -} - -// vector operator -= : subtract -static inline Vec4q & operator -= (Vec4q & a, Vec4q const & b) { - a = a - b; - return a; -} - -// postfix operator -- -static inline Vec4q operator -- (Vec4q & a, int) { - Vec4q a0 = a; - a = a - 1; - return a0; -} - -// prefix operator -- -static inline Vec4q & operator -- (Vec4q & a) { - a = a - 1; - return a; -} - -// vector operator * : multiply element by element -static inline Vec4q operator * (Vec4q const & a, Vec4q const & b) { -#if defined (__AVX512DQ__) && defined (__AVX512VL__) - return _mm256_mullo_epi64(a, b); -#else - // instruction does not exist. Split into 32-bit multiplies - __m256i bswap = _mm256_shuffle_epi32(b,0xB1); // swap H<->L - __m256i prodlh = _mm256_mullo_epi32(a,bswap); // 32 bit L*H products - __m256i zero = _mm256_setzero_si256(); // 0 - __m256i prodlh2 = _mm256_hadd_epi32(prodlh,zero); // a0Lb0H+a0Hb0L,a1Lb1H+a1Hb1L,0,0 - __m256i prodlh3 = _mm256_shuffle_epi32(prodlh2,0x73); // 0, a0Lb0H+a0Hb0L, 0, a1Lb1H+a1Hb1L - __m256i prodll = _mm256_mul_epu32(a,b); // a0Lb0L,a1Lb1L, 64 bit unsigned products - __m256i prod = _mm256_add_epi64(prodll,prodlh3); // a0Lb0L+(a0Lb0H+a0Hb0L)<<32, a1Lb1L+(a1Lb1H+a1Hb1L)<<32 - return prod; -#endif -} - -// vector operator *= : multiply -static inline Vec4q & operator *= (Vec4q & a, Vec4q const & b) { - a = a * b; - return a; -} - -// vector operator << : shift left -static inline Vec4q operator << (Vec4q const & a, int32_t b) { - return _mm256_sll_epi64(a, _mm_cvtsi32_si128(b)); -} - -// vector operator <<= : shift left -static inline Vec4q & operator <<= (Vec4q & a, int32_t b) { - a = a << b; - return a; -} - -// vector operator >> : shift right arithmetic -static inline Vec4q operator >> (Vec4q const & a, int32_t b) { - // instruction does not exist. 
Split into 32-bit shifts - if (b <= 32) { - __m128i bb = _mm_cvtsi32_si128(b); // b - __m256i sra = _mm256_sra_epi32(a,bb); // a >> b signed dwords - __m256i srl = _mm256_srl_epi64(a,bb); // a >> b unsigned qwords - __m256i mask = constant8i<0,-1,0,-1,0,-1,0,-1>(); // mask for signed high part - return selectb(mask, sra, srl); - } - else { // b > 32 - __m128i bm32 = _mm_cvtsi32_si128(b-32); // b - 32 - __m256i sign = _mm256_srai_epi32(a,31); // sign of a - __m256i sra2 = _mm256_sra_epi32(a,bm32); // a >> (b-32) signed dwords - __m256i sra3 = _mm256_srli_epi64(sra2,32); // a >> (b-32) >> 32 (second shift unsigned qword) - __m256i mask = constant8i<0,-1,0,-1,0,-1,0,-1>(); // mask for high part containing only sign - return selectb(mask, sign ,sra3); - } -} - -// vector operator >>= : shift right arithmetic -static inline Vec4q & operator >>= (Vec4q & a, int32_t b) { - a = a >> b; - return a; -} - -// vector operator == : returns true for elements for which a == b -static inline Vec4qb operator == (Vec4q const & a, Vec4q const & b) { - return _mm256_cmpeq_epi64(a, b); -} - -// vector operator != : returns true for elements for which a != b -static inline Vec4qb operator != (Vec4q const & a, Vec4q const & b) { - return Vec4qb(Vec4q(~(a == b))); -} - -// vector operator < : returns true for elements for which a < b -static inline Vec4qb operator < (Vec4q const & a, Vec4q const & b) { - return _mm256_cmpgt_epi64(b, a); -} - -// vector operator > : returns true for elements for which a > b -static inline Vec4qb operator > (Vec4q const & a, Vec4q const & b) { - return b < a; -} - -// vector operator >= : returns true for elements for which a >= b (signed) -static inline Vec4qb operator >= (Vec4q const & a, Vec4q const & b) { - return Vec4qb(Vec4q(~(a < b))); -} - -// vector operator <= : returns true for elements for which a <= b (signed) -static inline Vec4qb operator <= (Vec4q const & a, Vec4q const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec4q operator & (Vec4q const & a, Vec4q const & b) { - return Vec4q(Vec256b(a) & Vec256b(b)); -} -static inline Vec4q operator && (Vec4q const & a, Vec4q const & b) { - return a & b; -} -// vector operator &= : bitwise and -static inline Vec4q & operator &= (Vec4q & a, Vec4q const & b) { - a = a & b; - return a; -} - -// vector operator | : bitwise or -static inline Vec4q operator | (Vec4q const & a, Vec4q const & b) { - return Vec4q(Vec256b(a) | Vec256b(b)); -} -static inline Vec4q operator || (Vec4q const & a, Vec4q const & b) { - return a | b; -} -// vector operator |= : bitwise or -static inline Vec4q & operator |= (Vec4q & a, Vec4q const & b) { - a = a | b; - return a; -} - -// vector operator ^ : bitwise xor -static inline Vec4q operator ^ (Vec4q const & a, Vec4q const & b) { - return Vec4q(Vec256b(a) ^ Vec256b(b)); -} -// vector operator ^= : bitwise xor -static inline Vec4q & operator ^= (Vec4q & a, Vec4q const & b) { - a = a ^ b; - return a; -} - -// vector operator ~ : bitwise not -static inline Vec4q operator ~ (Vec4q const & a) { - return Vec4q( ~ Vec256b(a)); -} - -// vector operator ! : logical not, returns true for elements == 0 -static inline Vec4qb operator ! (Vec4q const & a) { - return a == Vec4q(_mm256_setzero_si256()); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; -// Each byte in s must be either 0 (false) or -1 (true). No other values are allowed. 
-// (s is signed) -static inline Vec4q select (Vec4qb const & s, Vec4q const & a, Vec4q const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4q if_add (Vec4qb const & f, Vec4q const & a, Vec4q const & b) { - return a + (Vec4q(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline int64_t horizontal_add (Vec4q const & a) { - __m256i sum1 = _mm256_shuffle_epi32(a,0x0E); // high element - __m256i sum2 = _mm256_add_epi64(a,sum1); // sum -#if defined (_MSC_VER) && _MSC_VER <= 1700 && ! defined(__INTEL_COMPILER) - __m128i sum3 = _mm256_extractf128_si256(sum2, 1); // bug in MS compiler VS 11 -#else - __m128i sum3 = _mm256_extracti128_si256(sum2, 1); // get high part -#endif - __m128i sum4 = _mm_add_epi64(_mm256_castsi256_si128(sum2),sum3); // add low and high parts -#if defined(__x86_64__) - return _mm_cvtsi128_si64(sum4); // 64 bit mode -#else - union { - __m128i x; // silly definition of _mm256_storel_epi64 requires __m256i - uint64_t i; - } u; - _mm_storel_epi64(&u.x,sum4); - return u.i; -#endif -} - -// function max: a > b ? a : b -static inline Vec4q max(Vec4q const & a, Vec4q const & b) { - return select(a > b, a, b); -} - -// function min: a < b ? a : b -static inline Vec4q min(Vec4q const & a, Vec4q const & b) { - return select(a < b, a, b); -} - -// function abs: a >= 0 ? a : -a -static inline Vec4q abs(Vec4q const & a) { - __m256i sign = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);// 0 > a - __m256i inv = _mm256_xor_si256(a, sign); // invert bits if negative - return _mm256_sub_epi64(inv, sign); // add 1 -} - -// function abs_saturated: same as abs, saturate if overflow -static inline Vec4q abs_saturated(Vec4q const & a) { - __m256i absa = abs(a); // abs(a) - __m256i overfl = _mm256_cmpgt_epi64(_mm256_setzero_si256(), absa); // 0 > a - return _mm256_add_epi64(absa, overfl); // subtract 1 if 0x8000000000000000 -} - -// function rotate_left all elements -// Use negative count to rotate right -static inline Vec4q rotate_left(Vec4q const & a, int b) { -#ifdef __AVX512VL__ - return _mm256_rolv_epi64(a, _mm256_set1_epi64x(int64_t(b))); -#else - __m256i left = _mm256_sll_epi64(a,_mm_cvtsi32_si128(b & 0x3F)); // a << b - __m256i right = _mm256_srl_epi64(a,_mm_cvtsi32_si128((64-b) & 0x3F)); // a >> (64 - b) - __m256i rot = _mm256_or_si256(left, right); // or - return rot; -#endif -} - - -/***************************************************************************** -* -* Vector of 4 64-bit unsigned integers -* -*****************************************************************************/ - -class Vec4uq : public Vec4q { -public: - // Default constructor: - Vec4uq() { - } - // Constructor to broadcast the same value into all elements: - Vec4uq(uint64_t i) { - ymm = Vec4q(i); - } - // Constructor to build from all elements: - Vec4uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3) { - ymm = Vec4q(i0, i1, i2, i3); - } - // Constructor to build from two Vec2uq: - Vec4uq(Vec2uq const & a0, Vec2uq const & a1) { - ymm = set_m128ir(a0, a1); - } - // Constructor to convert from type __m256i used in intrinsics: - Vec4uq(__m256i const & x) { - ymm = x; - } - // Assignment operator to convert from type __m256i used in intrinsics: - Vec4uq & operator = (__m256i const & x) { - ymm = x; - return *this; - } - // Member function to load from array (unaligned) - Vec4uq & load(void const * p) { - ymm = _mm256_loadu_si256((__m256i const*)p); - 
return *this; - } - // Member function to load from array, aligned by 32 - Vec4uq & load_a(void const * p) { - ymm = _mm256_load_si256((__m256i const*)p); - return *this; - } - // Member function to change a single element in vector - // Note: This function is inefficient. Use load function if changing more than one element - Vec4uq const & insert(uint32_t index, uint64_t value) { - Vec4q::insert(index, value); - return *this; - } - // Member function extract a single element from vector - uint64_t extract(uint32_t index) const { - return Vec4q::extract(index); - } - // Extract a single element. Use store function if extracting more than one element. - // Operator [] can only read an element, not write. - uint64_t operator [] (uint32_t index) const { - return extract(index); - } - // Member functions to split into two Vec2uq: - Vec2uq get_low() const { - return _mm256_castsi256_si128(ymm); - } - Vec2uq get_high() const { - return _mm256_extracti128_si256(ymm,1); - } -}; - -// Define operators for this class - -// vector operator + : add -static inline Vec4uq operator + (Vec4uq const & a, Vec4uq const & b) { - return Vec4uq (Vec4q(a) + Vec4q(b)); -} - -// vector operator - : subtract -static inline Vec4uq operator - (Vec4uq const & a, Vec4uq const & b) { - return Vec4uq (Vec4q(a) - Vec4q(b)); -} - -// vector operator * : multiply element by element -static inline Vec4uq operator * (Vec4uq const & a, Vec4uq const & b) { - return Vec4uq (Vec4q(a) * Vec4q(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec4uq operator >> (Vec4uq const & a, uint32_t b) { - return _mm256_srl_epi64(a,_mm_cvtsi32_si128(b)); -} - -// vector operator >> : shift right logical all elements -static inline Vec4uq operator >> (Vec4uq const & a, int32_t b) { - return a >> (uint32_t)b; -} - -// vector operator >>= : shift right artihmetic -static inline Vec4uq & operator >>= (Vec4uq & a, uint32_t b) { - a = a >> b; - return a; -} - -// vector operator << : shift left all elements -static inline Vec4uq operator << (Vec4uq const & a, uint32_t b) { - return Vec4uq ((Vec4q)a << (int32_t)b); -} - -// vector operator << : shift left all elements -static inline Vec4uq operator << (Vec4uq const & a, int32_t b) { - return Vec4uq ((Vec4q)a << b); -} - -// vector operator > : returns true for elements for which a > b (unsigned) -static inline Vec4qb operator > (Vec4uq const & a, Vec4uq const & b) { -//#if defined ( __XOP__ ) // AMD XOP instruction set - __m256i sign64 = Vec4uq(0x8000000000000000); - __m256i aflip = _mm256_xor_si256(a, sign64); - __m256i bflip = _mm256_xor_si256(b, sign64); - Vec4q cmp = _mm256_cmpgt_epi64(aflip,bflip); - return Vec4qb(cmp); -} - -// vector operator < : returns true for elements for which a < b (unsigned) -static inline Vec4qb operator < (Vec4uq const & a, Vec4uq const & b) { - return b > a; -} - -// vector operator >= : returns true for elements for which a >= b (unsigned) -static inline Vec4qb operator >= (Vec4uq const & a, Vec4uq const & b) { - return Vec4qb(Vec4q(~(b > a))); -} - -// vector operator <= : returns true for elements for which a <= b (unsigned) -static inline Vec4qb operator <= (Vec4uq const & a, Vec4uq const & b) { - return b >= a; -} - -// vector operator & : bitwise and -static inline Vec4uq operator & (Vec4uq const & a, Vec4uq const & b) { - return Vec4uq(Vec256b(a) & Vec256b(b)); -} -static inline Vec4uq operator && (Vec4uq const & a, Vec4uq const & b) { - return a & b; -} - -// vector operator | : bitwise or -static inline Vec4uq operator | 
(Vec4uq const & a, Vec4uq const & b) { - return Vec4uq(Vec256b(a) | Vec256b(b)); -} -static inline Vec4uq operator || (Vec4uq const & a, Vec4uq const & b) { - return a | b; -} - -// vector operator ^ : bitwise xor -static inline Vec4uq operator ^ (Vec4uq const & a, Vec4uq const & b) { - return Vec4uq(Vec256b(a) ^ Vec256b(b)); -} - -// Functions for this class - -// Select between two operands. Corresponds to this pseudocode: -// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i]; -// Each word in s must be either 0 (false) or -1 (true). No other values are allowed. -// (s is signed) -static inline Vec4uq select (Vec4qb const & s, Vec4uq const & a, Vec4uq const & b) { - return selectb(s,a,b); -} - -// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i] -static inline Vec4uq if_add (Vec4qb const & f, Vec4uq const & a, Vec4uq const & b) { - return a + (Vec4uq(f) & b); -} - -// Horizontal add: Calculates the sum of all vector elements. -// Overflow will wrap around -static inline uint64_t horizontal_add (Vec4uq const & a) { - return horizontal_add((Vec4q)a); -} - -// Horizontal add extended: Calculates the sum of all vector elements. -// Elements are sing/zero extended before adding to avoid overflow -static inline int64_t horizontal_add_x (Vec8i const & a) { - __m256i signs = _mm256_srai_epi32(a,31); // sign of all elements - Vec4q a01 = _mm256_unpacklo_epi32(a,signs); // sign-extended a0, a1, a4, a5 - Vec4q a23 = _mm256_unpackhi_epi32(a,signs); // sign-extended a2, a3, a6, a7 - return horizontal_add(a01 + a23); -} - -static inline uint64_t horizontal_add_x (Vec8ui const & a) { - __m256i zero = _mm256_setzero_si256(); // 0 - __m256i a01 = _mm256_unpacklo_epi32(a,zero); // zero-extended a0, a1 - __m256i a23 = _mm256_unpackhi_epi32(a,zero); // zero-extended a2, a3 - return horizontal_add(Vec4q(a01) + Vec4q(a23)); -} - -// function max: a > b ? a : b -static inline Vec4uq max(Vec4uq const & a, Vec4uq const & b) { - return Vec4uq(select(a > b, a, b)); -} - -// function min: a < b ? a : b -static inline Vec4uq min(Vec4uq const & a, Vec4uq const & b) { - return Vec4uq(select(a > b, b, a)); -} - - -/***************************************************************************** -* -* Vector permute functions -* -****************************************************************************** -* -* These permute functions can reorder the elements of a vector and optionally -* set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to select. -* An index of -1 will generate zero. An index of -256 means don't care. -* -* Example: -* Vec8i a(10,11,12,13,14,15,16,17); // a is (10,11,12,13,14,15,16,17) -* Vec8i b; -* b = permute8i<0,2,7,7,-1,-1,1,1>(a); // b is (10,12,17,17, 0, 0,11,11) -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. -*****************************************************************************/ - -// Permute vector of 4 64-bit integers. -// Index -1 gives 0, index -256 means don't care. 
-template <int i0, int i1, int i2, int i3>
-static inline Vec4q permute4q(Vec4q const & a) {
-
-    // Combine indexes into a single bitfield, with 8 bits for each
-    const int m1 = (i0 & 3) | (i1 & 3) << 8 | (i2 & 3) << 16 | (i3 & 3) << 24;
-
-    // Mask to zero out negative indexes
-    const int mz = (i0<0 ? 0 : 0xFF) | (i1<0 ? 0 : 0xFF) << 8 | (i2<0 ? 0 : 0xFF) << 16 | (i3<0 ? 0 : 0xFF) << 24;
-
-    // zeroing needed
-    const bool dozero = ((i0|i1|i2|i3) & 0x80) != 0;
-
-    if (((m1 ^ 0x03020100) & mz) == 0) {
-        // no shuffling
-        if (dozero) {
-            // zero some elements
-            const __m256i maskz = constant8i <
-                i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1,
-                i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();
-            return _mm256_and_si256(a, maskz);
-        }
-        return a;                                 // do nothing
-    }
-
-    if (((m1 ^ 0x02020000) & 0x02020202 & mz) == 0) {
-        // no exchange of data between low and high half
-
-        if (((m1 ^ (m1 >> 16)) & 0x0101 & mz & (mz >> 16)) == 0 && !dozero) {
-            // same pattern in low and high half. use VPSHUFD
-            const int sd = (((i0>=0)?(i0&1):(i2&1)) * 10 + 4) | (((i1>=0)?(i1&1):(i3&1)) * 10 + 4) << 4;
-            return _mm256_shuffle_epi32(a, sd);
-        }
-
-        // use VPSHUFB
-        const __m256i mm = constant8i <
-            i0 < 0 ? -1 : (i0 & 1) * 0x08080808 + 0x03020100,
-            i0 < 0 ? -1 : (i0 & 1) * 0x08080808 + 0x07060504,
-            i1 < 0 ? -1 : (i1 & 1) * 0x08080808 + 0x03020100,
-            i1 < 0 ? -1 : (i1 & 1) * 0x08080808 + 0x07060504,
-            i2 < 0 ? -1 : (i2 & 1) * 0x08080808 + 0x03020100,
-            i2 < 0 ? -1 : (i2 & 1) * 0x08080808 + 0x07060504,
-            i3 < 0 ? -1 : (i3 & 1) * 0x08080808 + 0x03020100,
-            i3 < 0 ? -1 : (i3 & 1) * 0x08080808 + 0x07060504 > ();
-        return _mm256_shuffle_epi8(a, mm);
-    }
-
-    // general case. Use VPERMQ
-    const int ms = (i0 & 3) | (i1 & 3) << 2 | (i2 & 3) << 4 | (i3 & 3) << 6;
-    __m256i t1 = _mm256_permute4x64_epi64(a, ms);
-
-    if (dozero) {
-        // zero some elements
-        const __m256i maskz = constant8i <
-            i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1,
-            i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ();
-        return _mm256_and_si256(t1, maskz);
-    }
-    return t1;
-}
-
-template <int i0, int i1, int i2, int i3>
-static inline Vec4uq permute4uq(Vec4uq const & a) {
-    return Vec4uq (permute4q<i0,i1,i2,i3> (a));
-}
-
-// Permute vector of 8 32-bit integers.
-// Index -1 gives 0, index -256 means don't care.
-template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
-static inline Vec8i permute8i(Vec8i const & a) {
-
-    // Combine indexes into a single bitfield, with 4 bits for each
-    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12
-        | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;
-
-    // Mask to zero out negative indexes
-    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12
-        | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
-
-    // zeroing needed
-    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;
-
-    __m256i t1, mask;
-
-    if (((m1 ^ 0x76543210) & mz) == 0) {
-        // no shuffling
-        if (dozero) {
-            // zero some elements
-            mask = constant8i <
-                i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1,
-                i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
-            return _mm256_and_si256(a, mask);
-        }
-        return a;                                 // do nothing
-    }
-
-    // Check if we can use 64-bit permute. Even numbered indexes must be even and odd numbered
-    // indexes must be equal to the preceding index + 1, except for negative indexes.
- if (((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ m1 >> 4) & 0x0E0E0E0E & mz & mz >> 4) == 0) { - - const bool partialzero = int((i0^i1)|(i2^i3)|(i4^i5)|(i6^i7)) < 0; // part of a 64-bit block is zeroed - const int blank1 = partialzero ? -0x100 : -1; // ignore or zero - const int n0 = i0 > 0 ? i0 /2 : i1 > 0 ? i1 /2 : blank1; // indexes for 64 bit blend - const int n1 = i2 > 0 ? i2 /2 : i3 > 0 ? i3 /2 : blank1; - const int n2 = i4 > 0 ? i4 /2 : i5 > 0 ? i5 /2 : blank1; - const int n3 = i6 > 0 ? i6 /2 : i7 > 0 ? i7 /2 : blank1; - // do 64-bit permute - t1 = permute4q (Vec4q(a)); - if (blank1 == -1 || !dozero) { - return t1; - } - // need more zeroing - mask = constant8i < - i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, - i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > (); - return _mm256_and_si256(t1, mask); - } - - if (((m1 ^ 0x44440000) & 0x44444444 & mz) == 0) { - // no exchange of data between low and high half - - if (((m1 ^ (m1 >> 16)) & 0x3333 & mz & (mz >> 16)) == 0 && !dozero) { - // same pattern in low and high half. use VPSHUFD - const int sd = ((i0>=0)?(i0&3):(i4&3)) | ((i1>=0)?(i1&3):(i5&3)) << 2 | - ((i2>=0)?(i2&3):(i6&3)) << 4 | ((i3>=0)?(i3&3):(i7&3)) << 6; - return _mm256_shuffle_epi32(a, sd); - } - - // use VPSHUFB - mask = constant8i < - i0 < 0 ? -1 : (i0 & 3) * 0x04040404 + 0x03020100, - i1 < 0 ? -1 : (i1 & 3) * 0x04040404 + 0x03020100, - i2 < 0 ? -1 : (i2 & 3) * 0x04040404 + 0x03020100, - i3 < 0 ? -1 : (i3 & 3) * 0x04040404 + 0x03020100, - i4 < 0 ? -1 : (i4 & 3) * 0x04040404 + 0x03020100, - i5 < 0 ? -1 : (i5 & 3) * 0x04040404 + 0x03020100, - i6 < 0 ? -1 : (i6 & 3) * 0x04040404 + 0x03020100, - i7 < 0 ? -1 : (i7 & 3) * 0x04040404 + 0x03020100 > (); - return _mm256_shuffle_epi8(a, mask); - } - - // general case. Use VPERMD - mask = constant8i < - i0 < 0 ? -1 : (i0 & 7), i1 < 0 ? -1 : (i1 & 7), - i2 < 0 ? -1 : (i2 & 7), i3 < 0 ? -1 : (i3 & 7), - i4 < 0 ? -1 : (i4 & 7), i5 < 0 ? -1 : (i5 & 7), - i6 < 0 ? -1 : (i6 & 7), i7 < 0 ? -1 : (i7 & 7) > (); -#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) - // bug in MS VS 11 beta: operands in wrong order. fixed in v. 11.0 - t1 = _mm256_permutevar8x32_epi32(mask, a); // ms -#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__) - // Gcc 4.7.0 also has operands in wrong order. fixed in version 4.7.1 - t1 = _mm256_permutevar8x32_epi32(mask, a); // GCC -#else - t1 = _mm256_permutevar8x32_epi32(a, mask); // no-bug version -#endif - - if (dozero) { - // zero some elements - mask = constant8i < - i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, - i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > (); - return _mm256_and_si256(t1, mask); - } - return t1; -} - -template -static inline Vec8ui permute8ui(Vec8ui const & a) { - return Vec8ui (permute8i (a)); -} - -// Permute vector of 16 16-bit integers. -// Index -1 gives 0, index -256 means don't care. 
-template -static inline Vec16s permute16s(Vec16s const & a) { - - // Combine indexes 0 - 7 into a single bitfield, with 4 bits for each - const int mlo = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 - | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; - - // Combine indexes 8 - 15 into a single bitfield, with 4 bits for each - const int mhi = (i8&0xF) | (i9&0xF)<<4 | (i10&0xF)<<8 | (i11&0xF)<<12 - | (i12&0xF)<<16 | (i13&0xF)<<20 | (i14&0xF)<<24 | (i15&0xF)<<28; - - // Mask to zero out negative indexes 0 - 7 - const int zlo = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 - | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - // Mask to zero out negative indexes 8 - 15 - const int zhi = (i8<0?0:0xF) | (i9<0?0:0xF)<<4 | (i10<0?0:0xF)<<8 | (i11<0?0:0xF)<<12 - | (i12<0?0:0xF)<<16 | (i13<0?0:0xF)<<20 | (i14<0?0:0xF)<<24 | (i15<0?0:0xF)<<28; - - // zeroing needed - const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0; - - __m256i t1, mask; - - // special case: all zero - if (zlo == 0 && zhi == 0) { - return _mm256_setzero_si256(); - } - - // special case: rotate 128 bits - if (i0>=0 && i0 < 16 && i1 ==((i0+1)&7) && i2 ==((i0+2)&7) && i3 ==((i0+3)&7) && i4 ==((i0+4)&7) && i5 ==((i0+5)&7) && i6 ==((i0+6)&7) && i7 ==((i0+7)&7) - && i8 ==i0 +8 && i9 ==i1 +8 && i10==i2 +8 && i11==i3 +8 && i12==i4 +8 && i13==i5 +8 && i14==i6 +8 && i15==i7 +8 ) { - return _mm256_alignr_epi8(a, a, (i0 & 7) * 2); - } - - // special case: rotate 256 bits - if (i0>=0 && i0 < 16 && i1 ==((i0+1 )&15) && i2 ==((i0+2 )&15) && i3 ==((i0+3 )&15) && i4 ==((i0+4 )&15) && i5 ==((i0+5 )&15) && i6 ==((i0+6 )&15) && i7 ==((i0+7 )&15) - && i8 ==((i0+8 )&15) && i9 ==((i0+9 )&15) && i10==((i0+10)&15) && i11==((i0+11)&15) && i12==((i0+12)&15) && i13==((i0+13)&15) && i14==((i0+14)&15) && i15==((i0+15)&15)) { - t1 = _mm256_permute4x64_epi64(a, 0x4E); - return _mm256_alignr_epi8(a, t1, (i0 & 7) * 2); - } - - // special case: no exchange of data between 64-bit sections, and same pattern in low and high 128 bits: - // can use VPSHUFLW or VPSHUFHW - if (((mlo ^ 0x44440000) & 0xCCCCCCCC & zlo) == 0 && ((mhi ^ 0xCCCC8888) & 0xCCCCCCCC & zhi) == 0 - && ((mlo ^ mhi) & 0x33333333 & zlo & zhi) == 0) { - - const int slo = (i0 >= 0 ? (i0&3) : i8 >= 0 ? (i8&3) : 0) | (i1 >= 0 ? (i1&3) : i9 >= 0 ? (i9&3) : 1) << 2 - | (i2 >= 0 ? (i2&3) : i10 >= 0 ? (i10&3) : 2) << 4 | (i3 >= 0 ? (i3&3) : i11 >= 0 ? (i11&3) : 3) << 6; - - const int shi = (i4 >= 0 ? (i4&3) : i12 >= 0 ? (i12&3) : 0) | (i5 >= 0 ? (i5&3) : i13 >= 0 ? (i13&3) : 1) << 2 - | (i6 >= 0 ? (i6&3) : i14 >= 0 ? (i14&3) : 2) << 4 | (i7 >= 0 ? (i7&3) : i15 >= 0 ? (i15&3) : 3) << 6; - - if (shi == 0xE4 && slo == 0xE4) { // no permute - if (dozero) { - // zero some elements - const __m256i maskz = constant8i< - int((i0 <0?0:0xFFFF) | (i1 <0?0:0xFFFF0000)), - int((i2 <0?0:0xFFFF) | (i3 <0?0:0xFFFF0000)), - int((i4 <0?0:0xFFFF) | (i5 <0?0:0xFFFF0000)), - int((i6 <0?0:0xFFFF) | (i7 <0?0:0xFFFF0000)), - int((i8 <0?0:0xFFFF) | (i9 <0?0:0xFFFF0000)), - int((i10<0?0:0xFFFF) | (i11<0?0:0xFFFF0000)), - int((i12<0?0:0xFFFF) | (i13<0?0:0xFFFF0000)), - int((i14<0?0:0xFFFF) | (i15<0?0:0xFFFF0000)) > (); - return _mm256_and_si256(a, maskz); - } - return a; // do nothing - } - if (shi == 0xE4 && !dozero) { - return _mm256_shufflelo_epi16(a, slo); // low permute only - } - if (slo == 0xE4 && !dozero) { - return _mm256_shufflehi_epi16(a, shi); // high permute only - } - } - - // Check if we can use 32-bit permute. 
Even numbered indexes must be even and odd numbered - // indexes must be equal to the preceding index + 1, except for negative indexes. - if (((mlo ^ 0x10101010) & 0x11111111 & zlo) == 0 && ((mlo ^ mlo >> 4) & 0x0E0E0E0E & zlo & zlo >> 4) == 0 && - ((mhi ^ 0x10101010) & 0x11111111 & zhi) == 0 && ((mhi ^ mhi >> 4) & 0x0E0E0E0E & zhi & zhi >> 4) == 0 ) { - - const bool partialzero = int((i0^i1)|(i2^i3)|(i4^i5)|(i6^i7)|(i8^i9)|(i10^i11)|(i12^i13)|(i14^i15)) < 0; // part of a 32-bit block is zeroed - const int blank1 = partialzero ? -0x100 : -1; // ignore or zero - const int n0 = i0 > 0 ? i0 /2 : i1 > 0 ? i1 /2 : blank1; // indexes for 64 bit blend - const int n1 = i2 > 0 ? i2 /2 : i3 > 0 ? i3 /2 : blank1; - const int n2 = i4 > 0 ? i4 /2 : i5 > 0 ? i5 /2 : blank1; - const int n3 = i6 > 0 ? i6 /2 : i7 > 0 ? i7 /2 : blank1; - const int n4 = i8 > 0 ? i8 /2 : i9 > 0 ? i9 /2 : blank1; - const int n5 = i10> 0 ? i10/2 : i11> 0 ? i11/2 : blank1; - const int n6 = i12> 0 ? i12/2 : i13> 0 ? i13/2 : blank1; - const int n7 = i14> 0 ? i14/2 : i15> 0 ? i15/2 : blank1; - // do 32-bit permute - t1 = permute8i (Vec8i(a)); - if (blank1 == -1 || !dozero) { - return t1; - } - // need more zeroing - mask = constant8i< - int((i0 <0?0:0xFFFF) | (i1 <0?0:0xFFFF0000)), - int((i2 <0?0:0xFFFF) | (i3 <0?0:0xFFFF0000)), - int((i4 <0?0:0xFFFF) | (i5 <0?0:0xFFFF0000)), - int((i6 <0?0:0xFFFF) | (i7 <0?0:0xFFFF0000)), - int((i8 <0?0:0xFFFF) | (i9 <0?0:0xFFFF0000)), - int((i10<0?0:0xFFFF) | (i11<0?0:0xFFFF0000)), - int((i12<0?0:0xFFFF) | (i13<0?0:0xFFFF0000)), - int((i14<0?0:0xFFFF) | (i15<0?0:0xFFFF0000)) > (); - return _mm256_and_si256(t1, mask); - } - - // special case: all elements from same half - if ((mlo & 0x88888888 & zlo) == 0 && ((mhi ^ 0x88888888) & 0x88888888 & zhi) == 0) { - mask = constant8i< - (i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16, - (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16, - (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16, - (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16, - (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16, - (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16, - (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16, - (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > (); - return _mm256_shuffle_epi8(a, mask); - } - - // special case: all elements from low half - if ((mlo & 0x88888888 & zlo) == 0 && (mhi & 0x88888888 & zhi) == 0) { - mask = constant8i< - (i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16, - (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16, - (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16, - (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16, - (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16, - (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16, - (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16, - (i14 < 0 ? 
0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > (); - t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1); // low, low - return _mm256_shuffle_epi8(t1, mask); - } - - // special case: all elements from high half - if (((mlo ^ 0x88888888) & 0x88888888 & zlo) == 0 && ((mhi ^ 0x88888888) & 0x88888888 & zhi) == 0) { - mask = constant8i< - (i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16, - (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16, - (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16, - (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16, - (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16, - (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16, - (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16, - (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > (); - t1 = _mm256_permute4x64_epi64(a, 0xEE); // high, high - return _mm256_shuffle_epi8(t1, mask); - } - - // special case: all elements from opposite half - if (((mlo ^ 0x88888888) & 0x88888888 & zlo) == 0 && (mhi & 0x88888888 & zhi) == 0) { - mask = constant8i< - (i0 < 0 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 0 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16, - (i2 < 0 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 0 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16, - (i4 < 0 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 0 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16, - (i6 < 0 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 0 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16, - (i8 < 0 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 0 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16, - (i10 < 0 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 0 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16, - (i12 < 0 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 0 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16, - (i14 < 0 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 0 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > (); - t1 = _mm256_permute4x64_epi64(a, 0x4E); // high, low - return _mm256_shuffle_epi8(t1, mask); - } - - // general case: elements from both halves - const __m256i mmsame = constant8i< - ((i0 ^8) < 8 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | ((i1 ^8) < 8 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16, - ((i2 ^8) < 8 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | ((i3 ^8) < 8 ? 0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16, - ((i4 ^8) < 8 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | ((i5 ^8) < 8 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16, - ((i6 ^8) < 8 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | ((i7 ^8) < 8 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16, - (i8 < 8 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | (i9 < 8 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16, - (i10 < 8 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | (i11 < 8 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16, - (i12 < 8 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | (i13 < 8 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16, - (i14 < 8 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | (i15 < 8 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > (); - - const __m256i mmopposite = constant8i< - (i0 < 8 ? 0xFFFF : (i0 & 7) * 0x202 + 0x100) | (i1 < 8 ? 0xFFFF : (i1 & 7) * 0x202 + 0x100) << 16, - (i2 < 8 ? 0xFFFF : (i2 & 7) * 0x202 + 0x100) | (i3 < 8 ? 
0xFFFF : (i3 & 7) * 0x202 + 0x100) << 16, - (i4 < 8 ? 0xFFFF : (i4 & 7) * 0x202 + 0x100) | (i5 < 8 ? 0xFFFF : (i5 & 7) * 0x202 + 0x100) << 16, - (i6 < 8 ? 0xFFFF : (i6 & 7) * 0x202 + 0x100) | (i7 < 8 ? 0xFFFF : (i7 & 7) * 0x202 + 0x100) << 16, - ((i8 ^8) < 8 ? 0xFFFF : (i8 & 7) * 0x202 + 0x100) | ((i9 ^8) < 8 ? 0xFFFF : (i9 & 7) * 0x202 + 0x100) << 16, - ((i10^8) < 8 ? 0xFFFF : (i10 & 7) * 0x202 + 0x100) | ((i11^8) < 8 ? 0xFFFF : (i11 & 7) * 0x202 + 0x100) << 16, - ((i12^8) < 8 ? 0xFFFF : (i12 & 7) * 0x202 + 0x100) | ((i13^8) < 8 ? 0xFFFF : (i13 & 7) * 0x202 + 0x100) << 16, - ((i14^8) < 8 ? 0xFFFF : (i14 & 7) * 0x202 + 0x100) | ((i15^8) < 8 ? 0xFFFF : (i15 & 7) * 0x202 + 0x100) << 16 > (); - - __m256i topp = _mm256_permute4x64_epi64(a, 0x4E); // high, low - __m256i r1 = _mm256_shuffle_epi8(topp, mmopposite); - __m256i r2 = _mm256_shuffle_epi8(a, mmsame); - return _mm256_or_si256(r1, r2); -} - -template -static inline Vec16us permute16us(Vec16us const & a) { - return Vec16us (permute16s (a)); -} - -template -static inline Vec32c permute32c(Vec32c const & a) { - - // collect bit 4 of each index - const int m1 = - (i0 &16)>>4 | (i1 &16)>>3 | (i2 &16)>>2 | (i3 &16)>>1 | (i4 &16) | (i5 &16)<<1 | (i6 &16)<<2 | (i7 &16)<<3 | - (i8 &16)<<4 | (i9 &16)<<5 | (i10&16)<<6 | (i11&16)<<7 | (i12&16)<<8 | (i13&16)<<9 | (i14&16)<<10 | (i15&16)<<11 | - (i16&16)<<12 | (i17&16)<<13 | (i18&16)<<14 | (i19&16)<<15 | (i20&16)<<16 | (i21&16)<<17 | (i22&16)<<18 | (i23&16)<<19 | - (i24&16)<<20 | (i25&16)<<21 | (i26&16)<<22 | (i27&16)<<23 | (i28&16)<<24 | (i29&16)<<25 | (i30&16)<<26 | (i31&16)<<27 ; - - // check which elements to set to zero - const int mz = ~ ( - (i0 <0) | (i1 <0)<<1 | (i2 <0)<<2 | (i3 <0)<<3 | (i4 <0)<<4 | (i5 <0)<<5 | (i6 <0)<<6 | (i7 <0)<<7 | - (i8 <0)<<8 | (i9 <0)<<9 | (i10<0)<<10 | (i11<0)<<11 | (i12<0)<<12 | (i13<0)<<13 | (i14<0)<<14 | (i15<0)<<15 | - (i16<0)<<16 | (i17<0)<<17 | (i18<0)<<18 | (i19<0)<<19 | (i20<0)<<20 | (i21<0)<<21 | (i22<0)<<22 | (i23<0)<<23 | - (i24<0)<<24 | (i25<0)<<25 | (i26<0)<<26 | (i27<0)<<27 | (i28<0)<<28 | (i29<0)<<29 | (i30<0)<<30 | (i31<0)<<31 ); - - // Combine indexes 0-7, 8-15, 16-23, 24-31 into a bitfields, with 8 bits for each - const uint64_t g0 = (i0 &0x1F)|(i1 &0x1F)<<8|(i2 &0x1F)<<16|(i3 &0x1F)<<24|(i4 &0x1FLL)<<32|(i5 &0x1FLL)<<40|(i6 &0x1FLL)<<48|(i7 &0x1FLL)<<56; - const uint64_t g1 = (i8 &0x1F)|(i9 &0x1F)<<8|(i10&0x1F)<<16|(i11&0x1F)<<24|(i12&0x1FLL)<<32|(i13&0x1FLL)<<40|(i14&0x1FLL)<<48|(i15&0x1FLL)<<56; - const uint64_t g2 = (i16&0x1F)|(i17&0x1F)<<8|(i18&0x1F)<<16|(i19&0x1F)<<24|(i20&0x1FLL)<<32|(i21&0x1FLL)<<40|(i22&0x1FLL)<<48|(i23&0x1FLL)<<56; - const uint64_t g3 = (i24&0x1F)|(i25&0x1F)<<8|(i26&0x1F)<<16|(i27&0x1F)<<24|(i28&0x1FLL)<<32|(i29&0x1FLL)<<40|(i30&0x1FLL)<<48|(i31&0x1FLL)<<56; - - // Masks to zero out negative indexes - const uint64_t z0 = (i0 <0?0:0xFF)|(i1 <0?0:0xFF)<<8|(i2 <0?0:0xFF)<<16|(i3 <0?0:0xFF)<<24|(i4 <0?0:0xFFLL)<<32|(i5 <0?0:0xFFLL)<<40|(i6 <0?0:0xFFLL)<<48|(i7 <0?0:0xFFLL)<<56; - const uint64_t z1 = (i8 <0?0:0xFF)|(i9 <0?0:0xFF)<<8|(i10<0?0:0xFF)<<16|(i11<0?0:0xFF)<<24|(i12<0?0:0xFFLL)<<32|(i13<0?0:0xFFLL)<<40|(i14<0?0:0xFFLL)<<48|(i15<0?0:0xFFLL)<<56; - const uint64_t z2 = (i16<0?0:0xFF)|(i17<0?0:0xFF)<<8|(i18<0?0:0xFF)<<16|(i19<0?0:0xFF)<<24|(i20<0?0:0xFFLL)<<32|(i21<0?0:0xFFLL)<<40|(i22<0?0:0xFFLL)<<48|(i23<0?0:0xFFLL)<<56; - const uint64_t z3 = (i24<0?0:0xFF)|(i25<0?0:0xFF)<<8|(i26<0?0:0xFF)<<16|(i27<0?0:0xFF)<<24|(i28<0?0:0xFFLL)<<32|(i29<0?0:0xFFLL)<<40|(i30<0?0:0xFFLL)<<48|(i31<0?0:0xFFLL)<<56; - - // zeroing needed - 
const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15|i16|i17|i18|i19|i20|i21|i22|i23|i24|i25|i26|i27|i28|i29|i30|i31) & 0x80) != 0; - - __m256i t1, mask; - - // special case: all zero - if (mz == 0) return _mm256_setzero_si256(); - - // special case: no permute - if ((i0 <0||i0 == 0) && (i1 <0||i1 == 1) && (i2 <0||i2 == 2) && (i3 <0||i3 == 3) && (i4 <0||i4 == 4) && (i5 <0||i5 == 5) && (i6 <0||i6 == 6) && (i7 <0||i7 == 7) && - (i8 <0||i8 == 8) && (i9 <0||i9 == 9) && (i10<0||i10==10) && (i11<0||i11==11) && (i12<0||i12==12) && (i13<0||i13==13) && (i14<0||i14==14) && (i15<0||i15==15) && - (i16<0||i16==16) && (i17<0||i17==17) && (i18<0||i18==18) && (i19<0||i19==19) && (i20<0||i20==20) && (i21<0||i21==21) && (i22<0||i22==22) && (i23<0||i23==23) && - (i24<0||i24==24) && (i25<0||i25==25) && (i26<0||i26==26) && (i27<0||i27==27) && (i28<0||i28==28) && (i29<0||i29==29) && (i30<0||i30==30) && (i31<0||i31==31)) { - if (dozero) { - // zero some elements - mask = constant8i < - int((i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000)), - int((i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000)), - int((i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000)), - int((i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000)), - int((i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000)), - int((i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000)), - int((i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000)), - int((i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000)) > (); - return _mm256_and_si256(a, mask); - } - return a; // do nothing - } - - // special case: rotate 128 bits - if (i0>=0 && i0 < 32 && i1 ==((i0+1 )&15) && i2 ==((i0+2 )&15) && i3 ==((i0+3 )&15) && i4 ==((i0+4 )&15) && i5 ==((i0+5 )&15) && i6 ==((i0+6 )&15) && i7 ==((i0+7 )&15) - && i8 ==((i0+8 )&15) && i9 ==((i0+9 )&15) && i10==((i0+10)&15) && i11==((i0+11)&15) && i12==((i0+12)&15) && i13==((i0+13)&15) && i14==((i0+14)&15) && i15==((i0+15)&15) - && i16==i0 +16 && i17==i1 +16 && i18==i2 +16 && i19==i3 +16 && i20==i4 +16 && i21==i5 +16 && i22==i6 +16 && i23==i7 +16 - && i24==i8 +16 && i25==i9 +16 && i26==i10+16 && i27==i11+16 && i28==i12+16 && i29==i13+16 && i30==i14+16 && i31==i15+16 ) { - return _mm256_alignr_epi8(a, a, i0 & 15); - } - - // special case: rotate 256 bits - if (i0>=0 && i0 < 32 && i1 ==((i0+1 )&31) && i2 ==((i0+2 )&31) && i3 ==((i0+3 )&31) && i4 ==((i0+4 )&31) && i5 ==((i0+5 )&31) && i6 ==((i0+6 )&31) && i7 ==((i0+7 )&31) - && i8 ==((i0+8 )&31) && i9 ==((i0+9 )&31) && i10==((i0+10)&31) && i11==((i0+11)&31) && i12==((i0+12)&31) && i13==((i0+13)&31) && i14==((i0+14)&31) && i15==((i0+15)&31) - && i16==((i0+16)&31) && i17==((i0+17)&31) && i18==((i0+18)&31) && i19==((i0+19)&31) && i20==((i0+20)&31) && i21==((i0+21)&31) && i22==((i0+22)&31) && i23==((i0+23)&31) - && i24==((i0+24)&31) && i25==((i0+25)&31) && i26==((i0+26)&31) && i27==((i0+27)&31) && i28==((i0+28)&31) && i29==((i0+29)&31) && i30==((i0+30)&31) && i31==((i0+31)&31)) { - t1 = _mm256_permute4x64_epi64(a, 0x4E); - return _mm256_alignr_epi8(a, t1, i0 & 15); - } - - // Check if we can use 16-bit permute. Even numbered indexes must be even and odd numbered - // indexes must be equal to the preceding index + 1, except for negative indexes. 
- if (((g0 ^ 0x0100010001000100) & 0x0101010101010101 & z0) == 0 && ((g0 ^ g0 >> 8) & 0x00FE00FE00FE00FE & z0 & z0 >> 8) == 0 && - ((g1 ^ 0x0100010001000100) & 0x0101010101010101 & z1) == 0 && ((g1 ^ g1 >> 8) & 0x00FE00FE00FE00FE & z1 & z1 >> 8) == 0 && - ((g2 ^ 0x0100010001000100) & 0x0101010101010101 & z2) == 0 && ((g2 ^ g2 >> 8) & 0x00FE00FE00FE00FE & z2 & z2 >> 8) == 0 && - ((g3 ^ 0x0100010001000100) & 0x0101010101010101 & z3) == 0 && ((g3 ^ g3 >> 8) & 0x00FE00FE00FE00FE & z3 & z3 >> 8) == 0 ) { - - const bool partialzero = int((i0^i1)|(i2^i3)|(i4^i5)|(i6^i7)|(i8^i9)|(i10^i11)|(i12^i13)|(i14^i15) - |(i16^i17)|(i18^i19)|(i20^i21)|(i22^i23)|(i24^i25)|(i26^i27)|(i28^i29)|(i30^i31)) < 0; // part of a 16-bit block is zeroed - const int blank1 = partialzero ? -0x100 : -1; // ignore or zero - const int n0 = i0 > 0 ? i0 /2 : i1 > 0 ? i1 /2 : blank1; // indexes for 64 bit blend - const int n1 = i2 > 0 ? i2 /2 : i3 > 0 ? i3 /2 : blank1; - const int n2 = i4 > 0 ? i4 /2 : i5 > 0 ? i5 /2 : blank1; - const int n3 = i6 > 0 ? i6 /2 : i7 > 0 ? i7 /2 : blank1; - const int n4 = i8 > 0 ? i8 /2 : i9 > 0 ? i9 /2 : blank1; - const int n5 = i10> 0 ? i10/2 : i11> 0 ? i11/2 : blank1; - const int n6 = i12> 0 ? i12/2 : i13> 0 ? i13/2 : blank1; - const int n7 = i14> 0 ? i14/2 : i15> 0 ? i15/2 : blank1; - const int n8 = i16> 0 ? i16/2 : i17> 0 ? i17/2 : blank1; - const int n9 = i18> 0 ? i18/2 : i19> 0 ? i19/2 : blank1; - const int n10= i20> 0 ? i20/2 : i21> 0 ? i21/2 : blank1; - const int n11= i22> 0 ? i22/2 : i23> 0 ? i23/2 : blank1; - const int n12= i24> 0 ? i24/2 : i25> 0 ? i25/2 : blank1; - const int n13= i26> 0 ? i26/2 : i27> 0 ? i27/2 : blank1; - const int n14= i28> 0 ? i28/2 : i29> 0 ? i29/2 : blank1; - const int n15= i30> 0 ? i30/2 : i31> 0 ? i31/2 : blank1; - // do 16-bit permute - t1 = permute16s (Vec16s(a)); - if (blank1 == -1 || !dozero) { - return t1; - } - // need more zeroing - mask = constant8i < - int((i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000)), - int((i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000)), - int((i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000)), - int((i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000)), - int((i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000)), - int((i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000)), - int((i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000)), - int((i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000)) > (); - return _mm256_and_si256(a, mask); - } - - // special case: all elements from same half - if (((m1 ^ 0xFFFF0000) & mz) == 0) { - mask = constant8i < - (i0 & 0xFF) | (i1 & 0xFF) << 8 | (i2 & 0xFF) << 16 | (i3 & 0xFF) << 24, - (i4 & 0xFF) | (i5 & 0xFF) << 8 | (i6 & 0xFF) << 16 | (i7 & 0xFF) << 24, - (i8 & 0xFF) | (i9 & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24, - (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24, - (i16 & 0xEF) | (i17 & 0xEF) << 8 | (i18 & 0xEF) << 16 | (i19 & 0xEF) << 24, - (i20 & 0xEF) | (i21 & 0xEF) << 8 | (i22 & 0xEF) << 16 | (i23 & 0xEF) << 24, - (i24 & 0xEF) | (i25 & 0xEF) << 8 | (i26 & 0xEF) << 16 | (i27 & 0xEF) << 24, - (i28 & 0xEF) | (i29 & 0xEF) << 8 | (i30 & 0xEF) << 16 | (i31 & 0xEF) << 24 > (); - return _mm256_shuffle_epi8(a, mask); - } - - // special case: all elements from low half - if ((m1 & mz) == 0) { - mask = constant8i < - (i0 & 0xFF) | (i1 & 0xFF) << 8 | 
(i2 & 0xFF) << 16 | (i3 & 0xFF) << 24, - (i4 & 0xFF) | (i5 & 0xFF) << 8 | (i6 & 0xFF) << 16 | (i7 & 0xFF) << 24, - (i8 & 0xFF) | (i9 & 0xFF) << 8 | (i10 & 0xFF) << 16 | (i11 & 0xFF) << 24, - (i12 & 0xFF) | (i13 & 0xFF) << 8 | (i14 & 0xFF) << 16 | (i15 & 0xFF) << 24, - (i16 & 0xFF) | (i17 & 0xFF) << 8 | (i18 & 0xFF) << 16 | (i19 & 0xFF) << 24, - (i20 & 0xFF) | (i21 & 0xFF) << 8 | (i22 & 0xFF) << 16 | (i23 & 0xFF) << 24, - (i24 & 0xFF) | (i25 & 0xFF) << 8 | (i26 & 0xFF) << 16 | (i27 & 0xFF) << 24, - (i28 & 0xFF) | (i29 & 0xFF) << 8 | (i30 & 0xFF) << 16 | (i31 & 0xFF) << 24 > (); - t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1); // low, low - return _mm256_shuffle_epi8(t1, mask); - } - - // special case: all elements from high half - if (((m1 ^ 0xFFFFFFFF) & mz) == 0) { - mask = constant8i < - (i0 & 0xEF) | (i1 & 0xEF) << 8 | (i2 & 0xEF) << 16 | (i3 & 0xEF) << 24, - (i4 & 0xEF) | (i5 & 0xEF) << 8 | (i6 & 0xEF) << 16 | (i7 & 0xEF) << 24, - (i8 & 0xEF) | (i9 & 0xEF) << 8 | (i10 & 0xEF) << 16 | (i11 & 0xEF) << 24, - (i12 & 0xEF) | (i13 & 0xEF) << 8 | (i14 & 0xEF) << 16 | (i15 & 0xEF) << 24, - (i16 & 0xEF) | (i17 & 0xEF) << 8 | (i18 & 0xEF) << 16 | (i19 & 0xEF) << 24, - (i20 & 0xEF) | (i21 & 0xEF) << 8 | (i22 & 0xEF) << 16 | (i23 & 0xEF) << 24, - (i24 & 0xEF) | (i25 & 0xEF) << 8 | (i26 & 0xEF) << 16 | (i27 & 0xEF) << 24, - (i28 & 0xEF) | (i29 & 0xEF) << 8 | (i30 & 0xEF) << 16 | (i31 & 0xEF) << 24 > (); - t1 = _mm256_permute4x64_epi64(a, 0xEE); // high, high - return _mm256_shuffle_epi8(t1, mask); - } - - // special case: all elements from opposite half - if (((m1 ^ 0x0000FFFF) & mz) == 0) { - mask = constant8i< - (i0 & 0xEF) | (i1 & 0xEF) << 8 | (i2 & 0xEF) << 16 | (i3 & 0xEF) << 24, - (i4 & 0xEF) | (i5 & 0xEF) << 8 | (i6 & 0xEF) << 16 | (i7 & 0xEF) << 24, - (i8 & 0xEF) | (i9 & 0xEF) << 8 | (i10 & 0xEF) << 16 | (i11 & 0xEF) << 24, - (i12 & 0xEF) | (i13 & 0xEF) << 8 | (i14 & 0xEF) << 16 | (i15 & 0xEF) << 24, - (i16 & 0xFF) | (i17 & 0xFF) << 8 | (i18 & 0xFF) << 16 | (i19 & 0xFF) << 24, - (i20 & 0xFF) | (i21 & 0xFF) << 8 | (i22 & 0xFF) << 16 | (i23 & 0xFF) << 24, - (i24 & 0xFF) | (i25 & 0xFF) << 8 | (i26 & 0xFF) << 16 | (i27 & 0xFF) << 24, - (i28 & 0xFF) | (i29 & 0xFF) << 8 | (i30 & 0xFF) << 16 | (i31 & 0xFF) << 24 > (); - - t1 = _mm256_permute4x64_epi64(a, 0x4E); // high, low - return _mm256_shuffle_epi8(t1, mask); - } - - // general case: elements from both halves - const __m256i mmsame = constant8i < - ((i0 &0xF0)?0xFF:(i0 &15)) | ((i1 &0xF0)?0xFF:(i1 &15)) << 8 | ((i2 &0xF0)?0xFF:(i2 &15)) << 16 | ((i3 &0xF0)?0xFF:(i3 &15)) << 24, - ((i4 &0xF0)?0xFF:(i4 &15)) | ((i5 &0xF0)?0xFF:(i5 &15)) << 8 | ((i6 &0xF0)?0xFF:(i6 &15)) << 16 | ((i7 &0xF0)?0xFF:(i7 &15)) << 24, - ((i8 &0xF0)?0xFF:(i8 &15)) | ((i9 &0xF0)?0xFF:(i9 &15)) << 8 | ((i10&0xF0)?0xFF:(i10&15)) << 16 | ((i11&0xF0)?0xFF:(i11&15)) << 24, - ((i12&0xF0)?0xFF:(i12&15)) | ((i13&0xF0)?0xFF:(i13&15)) << 8 | ((i14&0xF0)?0xFF:(i14&15)) << 16 | ((i15&0xF0)?0xFF:(i15&15)) << 24, - ((i16&0xF0)!=0x10?0xFF:(i16&15)) | ((i17&0xF0)!=0x10?0xFF:(i17&15)) << 8 | ((i18&0xF0)!=0x10?0xFF:(i18&15)) << 16 | ((i19&0xF0)!=0x10?0xFF:(i19&15)) << 24, - ((i20&0xF0)!=0x10?0xFF:(i20&15)) | ((i21&0xF0)!=0x10?0xFF:(i21&15)) << 8 | ((i22&0xF0)!=0x10?0xFF:(i22&15)) << 16 | ((i23&0xF0)!=0x10?0xFF:(i23&15)) << 24, - ((i24&0xF0)!=0x10?0xFF:(i24&15)) | ((i25&0xF0)!=0x10?0xFF:(i25&15)) << 8 | ((i26&0xF0)!=0x10?0xFF:(i26&15)) << 16 | ((i27&0xF0)!=0x10?0xFF:(i27&15)) << 24, - ((i28&0xF0)!=0x10?0xFF:(i28&15)) | ((i29&0xF0)!=0x10?0xFF:(i29&15)) << 8 | 
((i30&0xF0)!=0x10?0xFF:(i30&15)) << 16 | ((i31&0xF0)!=0x10?0xFF:(i31&15)) << 24 > (); - - const __m256i mmopposite = constant8i < - ((i0 &0xF0)!=0x10?0xFF:(i0 &15)) | ((i1 &0xF0)!=0x10?0xFF:(i1 &15)) << 8 | ((i2 &0xF0)!=0x10?0xFF:(i2 &15)) << 16 | ((i3 &0xF0)!=0x10?0xFF:(i3 &15)) << 24, - ((i4 &0xF0)!=0x10?0xFF:(i4 &15)) | ((i5 &0xF0)!=0x10?0xFF:(i5 &15)) << 8 | ((i6 &0xF0)!=0x10?0xFF:(i6 &15)) << 16 | ((i7 &0xF0)!=0x10?0xFF:(i7 &15)) << 24, - ((i8 &0xF0)!=0x10?0xFF:(i8 &15)) | ((i9 &0xF0)!=0x10?0xFF:(i9 &15)) << 8 | ((i10&0xF0)!=0x10?0xFF:(i10&15)) << 16 | ((i11&0xF0)!=0x10?0xFF:(i11&15)) << 24, - ((i12&0xF0)!=0x10?0xFF:(i12&15)) | ((i13&0xF0)!=0x10?0xFF:(i13&15)) << 8 | ((i14&0xF0)!=0x10?0xFF:(i14&15)) << 16 | ((i15&0xF0)!=0x10?0xFF:(i15&15)) << 24, - ((i16&0xF0)?0xFF:(i16&15)) | ((i17&0xF0)?0xFF:(i17&15)) << 8 | ((i18&0xF0)?0xFF:(i18&15)) << 16 | ((i19&0xF0)?0xFF:(i19&15)) << 24, - ((i20&0xF0)?0xFF:(i20&15)) | ((i21&0xF0)?0xFF:(i21&15)) << 8 | ((i22&0xF0)?0xFF:(i22&15)) << 16 | ((i23&0xF0)?0xFF:(i23&15)) << 24, - ((i24&0xF0)?0xFF:(i24&15)) | ((i25&0xF0)?0xFF:(i25&15)) << 8 | ((i26&0xF0)?0xFF:(i26&15)) << 16 | ((i27&0xF0)?0xFF:(i27&15)) << 24, - ((i28&0xF0)?0xFF:(i28&15)) | ((i29&0xF0)?0xFF:(i29&15)) << 8 | ((i30&0xF0)?0xFF:(i30&15)) << 16 | ((i31&0xF0)?0xFF:(i31&15)) << 24 > (); - - __m256i topp = _mm256_permute4x64_epi64(a, 0x4E); // high, low - __m256i r1 = _mm256_shuffle_epi8(topp, mmopposite); - __m256i r2 = _mm256_shuffle_epi8(a, mmsame); - return _mm256_or_si256(r1, r2); -} - -template < - int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, - int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15, - int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, - int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 > - static inline Vec32uc permute32uc(Vec32uc const & a) { - return Vec32uc (permute32c (a)); -} - - -/***************************************************************************** -* -* Vector blend functions -* -****************************************************************************** -* -* These blend functions can mix elements from two different vectors and -* optionally set some elements to zero. -* -* The indexes are inserted as template parameters in <>. These indexes must be -* constants. Each template parameter is an index to the element you want to -* select, where higher indexes indicate an element from the second source -* vector. For example, if each vector has 8 elements, then indexes 0 - 7 -* will select an element from the first vector and indexes 8 - 15 will select -* an element from the second vector. A negative index will generate zero. -* -* Example: -* Vec8i a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107) -* Vec8i b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207) -* Vec8i c; -* c = blend8i<1,0,9,8,7,-1,15,15> (a,b); // c is (101, 100, 201, 200, 107, 0, 207, 207) -* -* A lot of the code here is metaprogramming aiming to find the instructions -* that best fit the template parameters and instruction set. The metacode -* will be reduced out to leave only a few vector instructions in release -* mode with optimization on. 
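*
* An added illustrative example for the four-element variant (it uses the
* blend4q template defined below; the values are made up for illustration):
* Vec4q a(10,11,12,13);               // a is (10, 11, 12, 13)
* Vec4q b(20,21,22,23);               // b is (20, 21, 22, 23)
* Vec4q c;
* c = blend4q<4,1,-1,7> (a,b);        // c is (20, 11, 0, 23)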
-*****************************************************************************/ - -template -static inline Vec4q blend4q(Vec4q const & a, Vec4q const & b) { - - // Combine indexes into a single bitfield, with 8 bits for each - const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24; - - // Mask to zero out negative indexes - const int mz = (i0<0 ? 0 : 0xFF) | (i1<0 ? 0 : 0xFF) << 8 | (i2<0 ? 0 : 0xFF) << 16 | (i3<0 ? 0 : 0xFF) << 24; - - // zeroing needed. An index of -0x100 means don't care - const bool dozero = ((i0|i1|i2|i3) & 0x80) != 0; - - __m256i t1, mask; - - // special case: 128 bit blend/permute - if (((m1 ^ 0x01000100) & 0x01010101 & mz) == 0 && (((m1 + 0x00010001) ^ (m1 >> 8)) & 0x00FF00FF & mz & mz >> 8) == 0) { - { - const int j0 = i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : 4; // index for low 128 bits - const int j1 = i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : 4; // index for high 128 bits - const bool partialzero = int((i0 ^ i1) | (i2 ^ i3)) < 0; // part of a 128-bit block is zeroed - - switch (j0 | j1 << 4) { - case 0x00: - t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1); break; - case 0x02: - t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(a), 1); break; - case 0x04: - if (dozero && !partialzero) return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(a), 1); - t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(a), 1); break; - case 0x12: - t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(b), 0); break; - case 0x14: - if (dozero && !partialzero) return _mm256_inserti128_si256(a,_mm_setzero_si128(), 0); - t1 = a; break; - case 0x01: case 0x10: case 0x11: // all from a - return permute4q (a); - case 0x20: - t1 = _mm256_inserti128_si256(a, _mm256_castsi256_si128(b), 1); break; - case 0x22: - t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(b), 1); break; - case 0x24: - if (dozero && !partialzero) return _mm256_inserti128_si256(_mm256_setzero_si256(), _mm256_castsi256_si128(b), 1); - t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(b), 1); break; - case 0x30: - t1 = _mm256_inserti128_si256(b, _mm256_castsi256_si128(a), 0); break; - case 0x34: - if (dozero && !partialzero) return _mm256_inserti128_si256(b,_mm_setzero_si128(), 0); - t1 = b; break; - case 0x23: case 0x32: case 0x33: // all from b - return permute4q (b); - case 0x40: - if (dozero && !partialzero) return _mm256_castsi128_si256(_mm_and_si128(_mm256_castsi256_si128(a),_mm256_castsi256_si128(a))); - t1 = a; break; - case 0x42: - if (dozero && !partialzero) return _mm256_castsi128_si256(_mm_and_si128(_mm256_castsi256_si128(b),_mm256_castsi256_si128(b))); - t1 = b; break; - case 0x44: - return _mm256_setzero_si256(); - default: - t1 = _mm256_permute2x128_si256(a, b, (j0&0x0F) | (j1&0x0F) << 4); - } - } - RETURNORZERO: - if (dozero) { - // zero some elements - const __m256i maskz = constant8i < - i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, - i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > (); - return _mm256_and_si256(t1, maskz); - } - return t1; - } - - // special case: all from a - if ((m1 & 0x04040404 & mz) == 0) { - return permute4q (a); - } - - // special case: all from b - if ((~m1 & 0x04040404 & mz) == 0) { - return permute4q (b); - } - - // special case: blend without permute - if (((m1 ^ 0x03020100) & 0xFBFBFBFB & mz) == 0) { - mask = constant8i < - (i0 & 4) ? -1 : 0, (i0 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, - (i2 & 4) ? -1 : 0, (i2 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0, (i3 & 4) ? 
-1 : 0 > (); - t1 = _mm256_blendv_epi8(a, b, mask); // blend - goto RETURNORZERO; - } - - // special case: shift left - if (i0 > 0 && i0 < 4 && mz == -1 && (m1 ^ ((i0 & 3) * 0x01010101 + 0x03020100)) == 0) { - t1 = _mm256_permute2x128_si256(a, b, 0x21); - if (i0 < 2) return _mm256_alignr_epi8(t1, a, (i0 & 1) * 8); - else return _mm256_alignr_epi8(b, t1, (i0 & 1) * 8); - } - // special case: shift right - if (i0 > 4 && i0 < 8 && mz == -1 && (m1 ^ 0x04040404 ^ ((i0 & 3) * 0x01010101 + 0x03020100)) == 0) { - t1 = _mm256_permute2x128_si256(b, a, 0x21); - if (i0 < 6) return _mm256_alignr_epi8(t1, b, (i0 & 1) * 8); - else return _mm256_alignr_epi8(a, t1, (i0 & 1) * 8); - } - // special case: unpack low - if (((m1 ^ 0x06020400) & mz) == 0) { - t1 = _mm256_unpacklo_epi64(a, b); - goto RETURNORZERO; - } - // special case: unpack low - if (((m1 ^ 0x02060004) & mz) == 0) { - t1 = _mm256_unpacklo_epi64(b, a); - goto RETURNORZERO; - } - // special case: unpack high - if (((m1 ^ 0x07030501) & mz) == 0) { - t1 = _mm256_unpackhi_epi64(a, b); - goto RETURNORZERO; - } - // special case: unpack high - if (((m1 ^ 0x03070105) & mz) == 0) { - t1 = _mm256_unpackhi_epi64(b, a); - goto RETURNORZERO; - } - - // general case: permute and blend and possibly zero - const int blank = dozero ? -1 : -0x100; // ignore or zero - - // permute and blend - __m256i ta = permute4q < - (i0 & 4) ? blank : i0, (i1 & 4) ? blank : i1, (i2 & 4) ? blank : i2, (i3 & 4) ? blank : i3 > (a); - - __m256i tb = permute4q < - ((i0^4) & 4) ? blank : i0^4, ((i1^4) & 4) ? blank : i1^4, ((i2^4) & 4) ? blank : i2^4, ((i3^4) & 4) ? blank : i3^4 > (b); - - if (blank == -1) { - // we have zeroed, need only to OR - return _mm256_or_si256(ta, tb); - } - // no zeroing, need to blend - mask = constant8i < - (i0 & 4) ? -1 : 0, (i0 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, (i1 & 4) ? -1 : 0, - (i2 & 4) ? -1 : 0, (i2 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0, (i3 & 4) ? -1 : 0 > (); - - return _mm256_blendv_epi8(ta, tb, mask); // blend -} - -template -static inline Vec4uq blend4uq(Vec4uq const & a, Vec4uq const & b) { - return Vec4uq( blend4q (a,b)); -} - - -template -static inline Vec8i blend8i(Vec8i const & a, Vec8i const & b) { - - const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7; // OR indexes - - // is zeroing needed - const bool do_zero = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100 - - // Combine all the indexes into a single bitfield, with 4 bits for each - const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28; - - // Mask to zero out negative indexes - const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28; - - __m256i t1, mask; - - if (mz == 0) return _mm256_setzero_si256(); // all zero - - // special case: 64 bit blend/permute - if (((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ (m1 >> 4)) & 0x0E0E0E0E & mz & mz >> 4) == 0) { - // check if part of a 64-bit block is zeroed - const bool partialzero = int((i0^i1) | (i2^i3) | (i4^i5) | (i6^i7)) < 0; - const int blank1 = partialzero ? -0x100 : -1; // ignore if zeroing later anyway - // indexes for 64 bit blend - const int j0 = i0 >= 0 ? i0 / 2 : i1 >= 0 ? i1 / 2 : blank1; - const int j1 = i2 >= 0 ? i2 / 2 : i3 >= 0 ? i3 / 2 : blank1; - const int j2 = i4 >= 0 ? i4 / 2 : i5 >= 0 ? i5 / 2 : blank1; - const int j3 = i6 >= 0 ? i6 / 2 : i7 >= 0 ? 
i7 / 2 : blank1; - // 64-bit blend and permute - t1 = blend4q(Vec4q(a), Vec4q(b)); - if (partialzero && do_zero) { - // zero some elements - mask = constant8i< i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, - i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > (); - return _mm256_and_si256(t1, mask); - } - return t1; - } - - if ((m1 & 0x88888888 & mz) == 0) { - // all from a - return permute8i (a); - } - - if (((m1 ^ 0x88888888) & 0x88888888 & mz) == 0) { - // all from b - return permute8i (b); - } - - if ((((m1 & 0x77777777) ^ 0x76543210) & mz) == 0) { - // blend and zero, no permute - mask = constant8i<(i0&8)?0:-1, (i1&8)?0:-1, (i2&8)?0:-1, (i3&8)?0:-1, (i4&8)?0:-1, (i5&8)?0:-1, (i6&8)?0:-1, (i7&8)?0:-1> (); - t1 = select(mask, a, b); - if (!do_zero) return t1; - // zero some elements - mask = constant8i< (i0<0&&(i0&8)) ? 0 : -1, (i1<0&&(i1&8)) ? 0 : -1, (i2<0&&(i2&8)) ? 0 : -1, (i3<0&&(i3&8)) ? 0 : -1, - (i4<0&&(i4&8)) ? 0 : -1, (i5<0&&(i5&8)) ? 0 : -1, (i6<0&&(i6&8)) ? 0 : -1, (i7<0&&(i7&8)) ? 0 : -1 > (); - return _mm256_and_si256(t1, mask); - } - - // special case: shift left - if (i0 > 0 && i0 < 8 && mz == -1 && (m1 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) == 0) { - t1 = _mm256_permute2x128_si256(a, b, 0x21); - if (i0 < 4) return _mm256_alignr_epi8(t1, a, (i0 & 3) * 4); - else return _mm256_alignr_epi8(b, t1, (i0 & 3) * 4); - } - // special case: shift right - if (i0 > 8 && i0 < 16 && mz == -1 && (m1 ^ 0x88888888 ^ ((i0 & 7) * 0x11111111u + 0x76543210u)) == 0) { - t1 = _mm256_permute2x128_si256(b, a, 0x21); - if (i0 < 12) return _mm256_alignr_epi8(t1, b, (i0 & 3) * 4); - else return _mm256_alignr_epi8(a, t1, (i0 & 3) * 4); - } - - // general case: permute and blend and possible zero - const int blank = do_zero ? -1 : -0x100; // ignore or zero - - Vec8i ta = permute8i < - (uint32_t)i0 < 8 ? i0 : blank, - (uint32_t)i1 < 8 ? i1 : blank, - (uint32_t)i2 < 8 ? i2 : blank, - (uint32_t)i3 < 8 ? i3 : blank, - (uint32_t)i4 < 8 ? i4 : blank, - (uint32_t)i5 < 8 ? i5 : blank, - (uint32_t)i6 < 8 ? i6 : blank, - (uint32_t)i7 < 8 ? i7 : blank > (a); - Vec8i tb = permute8i < - (uint32_t)(i0^8) < 8 ? (i0^8) : blank, - (uint32_t)(i1^8) < 8 ? (i1^8) : blank, - (uint32_t)(i2^8) < 8 ? (i2^8) : blank, - (uint32_t)(i3^8) < 8 ? (i3^8) : blank, - (uint32_t)(i4^8) < 8 ? (i4^8) : blank, - (uint32_t)(i5^8) < 8 ? (i5^8) : blank, - (uint32_t)(i6^8) < 8 ? (i6^8) : blank, - (uint32_t)(i7^8) < 8 ? 
(i7^8) : blank > (b); - if (blank == -1) { - return _mm256_or_si256(ta, tb); - } - // no zeroing, need to blend - const int maskb = ((i0 >> 3) & 1) | ((i1 >> 2) & 2) | ((i2 >> 1) & 4) | (i3 & 8) | - ((i4 << 1) & 0x10) | ((i5 << 2) & 0x20) | ((i6 << 3) & 0x40) | ((i7 << 4) & 0x80); - return _mm256_blend_epi32(ta, tb, maskb); // blend -} - -template -static inline Vec8ui blend8ui(Vec8ui const & a, Vec8ui const & b) { - return Vec8ui( blend8i (a,b)); -} - - -template -static inline Vec16s blend16s(Vec16s const & a, Vec16s const & b) { - // collect bit 4 of each index - const int m1 = - (i0 &16)>>4 | (i1 &16)>>3 | (i2 &16)>>2 | (i3 &16)>>1 | (i4 &16) | (i5 &16)<<1 | (i6 &16)<<2 | (i7 &16)<<3 | - (i8 &16)<<4 | (i9 &16)<<5 | (i10&16)<<6 | (i11&16)<<7 | (i12&16)<<8 | (i13&16)<<9 | (i14&16)<<10 | (i15&16)<<11 ; - - // check which elements to set to zero - const int mz = 0x0000FFFF ^ ( - (i0 <0) | (i1 <0)<<1 | (i2 <0)<<2 | (i3 <0)<<3 | (i4 <0)<<4 | (i5 <0)<<5 | (i6 <0)<<6 | (i7 <0)<<7 | - (i8 <0)<<8 | (i9 <0)<<9 | (i10<0)<<10 | (i11<0)<<11 | (i12<0)<<12 | (i13<0)<<13 | (i14<0)<<14 | (i15<0)<<15 ); - - __m256i t1, mask; - - // special case: all zero - if (mz == 0) return _mm256_setzero_si256(); - - // special case: all from a - if ((m1 & mz) == 0) { - return permute16s (a); - } - - // special case: all from b - if (((m1 ^ 0xFFFF) & mz) == 0) { - return permute16s (b); - } - - // special case: blend without permute - if ((i0 <0||(i0 &15)== 0) && (i1 <0||(i1 &15)== 1) && (i2 <0||(i2 &15)== 2) && (i3 <0||(i3 &15)== 3) && - (i4 <0||(i4 &15)== 4) && (i5 <0||(i5 &15)== 5) && (i6 <0||(i6 &15)== 6) && (i7 <0||(i7 &15)== 7) && - (i8 <0||(i8 &15)== 8) && (i9 <0||(i9 &15)== 9) && (i10<0||(i10&15)==10) && (i11<0||(i11&15)==11) && - (i12<0||(i12&15)==12) && (i13<0||(i13&15)==13) && (i14<0||(i14&15)==14) && (i15<0||(i15&15)==15)) { - - mask = constant8i < - int(((i0 & 16) ? 0xFFFF : 0) | ((i1 & 16) ? 0xFFFF0000 : 0)), - int(((i2 & 16) ? 0xFFFF : 0) | ((i3 & 16) ? 0xFFFF0000 : 0)), - int(((i4 & 16) ? 0xFFFF : 0) | ((i5 & 16) ? 0xFFFF0000 : 0)), - int(((i6 & 16) ? 0xFFFF : 0) | ((i7 & 16) ? 0xFFFF0000 : 0)), - int(((i8 & 16) ? 0xFFFF : 0) | ((i9 & 16) ? 0xFFFF0000 : 0)), - int(((i10& 16) ? 0xFFFF : 0) | ((i11& 16) ? 0xFFFF0000 : 0)), - int(((i12& 16) ? 0xFFFF : 0) | ((i13& 16) ? 0xFFFF0000 : 0)), - int(((i14& 16) ? 0xFFFF : 0) | ((i15& 16) ? 0xFFFF0000 : 0)) > (); - - t1 = _mm256_blendv_epi8(a, b, mask); // blend - - if (mz != 0xFFFF) { - // zero some elements - mask = constant8i < - int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), - int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)), - int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), - int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)), - int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), - int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)), - int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)), - int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000)) > (); - return _mm256_and_si256(t1, mask); - } - return t1; - } - - // special case: shift left - const int slb = i0 > 0 ? 
i0 : i15 - 15; - if (slb > 0 && slb < 16 - && (i0==slb+ 0||i0<0) && (i1==slb+ 1||i1<0) && (i2 ==slb+ 2||i2 <0) && (i3 ==slb+ 3||i3 <0) && (i4 ==slb+ 4||i4 <0) && (i5 ==slb+ 5||i5 <0) && (i6 ==slb+ 6||i6 <0) && (i7 ==slb+ 7||i7 <0) - && (i8==slb+ 8||i8<0) && (i9==slb+ 9||i9<0) && (i10==slb+10||i10<0) && (i11==slb+11||i11<0) && (i12==slb+12||i12<0) && (i13==slb+13||i13<0) && (i14==slb+14||i14<0) && (i15==slb+15||i15<0)) { - t1 = _mm256_permute2x128_si256(a, b, 0x21); - if (slb < 8) t1 = _mm256_alignr_epi8(t1, a, (slb & 7) * 2); - else t1 = _mm256_alignr_epi8(b, t1, (slb & 7) * 2); - if (mz != 0xFFFF) { - // zero some elements - mask = constant8i < - int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), - int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)), - int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), - int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)), - int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), - int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)), - int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)), - int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000)) > (); - return _mm256_and_si256(t1, mask); - } - return t1; - } - // special case: shift right - const int srb = i0 > 0 ? (i0^16) : (i15^16) - 15; - if (srb > 0 && srb < 16 - && ((i0 ^16)==srb+ 0||i0 <0) && ((i1 ^16)==srb+ 1||i1 <0) && ((i2 ^16)==srb+ 2||i2 <0) && ((i3 ^16)==srb+ 3||i3 <0) && ((i4 ^16)==srb+ 4||i4 <0) && ((i5 ^16)==srb+ 5||i5 <0) && ((i6 ^16)==srb+ 6||i6 <0) && ((i7 ^16)==srb+ 7||i7 <0) - && ((i8 ^16)==srb+ 8||i8 <0) && ((i9 ^16)==srb+ 9||i9 <0) && ((i10^16)==srb+10||i10<0) && ((i11^16)==srb+11||i11<0) && ((i12^16)==srb+12||i12<0) && ((i13^16)==srb+13||i13<0) && ((i14^16)==srb+14||i14<0) && ((i15^16)==srb+15||i15<0)) { - t1 = _mm256_permute2x128_si256(b, a, 0x21); - if (srb < 8) t1 = _mm256_alignr_epi8(t1, b, (srb & 7) * 2); - else t1 = _mm256_alignr_epi8(a, t1, (srb & 7) * 2); - if (mz != 0xFFFF) { - // zero some elements - mask = constant8i < - int((i0 < 0 ? 0 : 0xFFFF) | (i1 < 0 ? 0 : 0xFFFF0000)), - int((i2 < 0 ? 0 : 0xFFFF) | (i3 < 0 ? 0 : 0xFFFF0000)), - int((i4 < 0 ? 0 : 0xFFFF) | (i5 < 0 ? 0 : 0xFFFF0000)), - int((i6 < 0 ? 0 : 0xFFFF) | (i7 < 0 ? 0 : 0xFFFF0000)), - int((i8 < 0 ? 0 : 0xFFFF) | (i9 < 0 ? 0 : 0xFFFF0000)), - int((i10 < 0 ? 0 : 0xFFFF) | (i11 < 0 ? 0 : 0xFFFF0000)), - int((i12 < 0 ? 0 : 0xFFFF) | (i13 < 0 ? 0 : 0xFFFF0000)), - int((i14 < 0 ? 0 : 0xFFFF) | (i15 < 0 ? 0 : 0xFFFF0000)) > (); - return _mm256_and_si256(t1, mask); - } - return t1; - } - - // general case: permute and blend and possibly zero - const int blank = (mz == 0xFFFF) ? 
-0x100 : -1; // ignore or zero - - // permute and blend - __m256i ta = permute16s < - (i0 &16)?blank:i0 , (i1 &16)?blank:i1 , (i2 &16)?blank:i2 , (i3 &16)?blank:i3 , - (i4 &16)?blank:i4 , (i5 &16)?blank:i5 , (i6 &16)?blank:i6 , (i7 &16)?blank:i7 , - (i8 &16)?blank:i8 , (i9 &16)?blank:i9 , (i10&16)?blank:i10, (i11&16)?blank:i11, - (i12&16)?blank:i12, (i13&16)?blank:i13, (i14&16)?blank:i14, (i15&16)?blank:i15 > (a); - - __m256i tb = permute16s < - ((i0 ^16)&16)?blank:i0 ^16, ((i1 ^16)&16)?blank:i1 ^16, ((i2 ^16)&16)?blank:i2 ^16, ((i3 ^16)&16)?blank:i3 ^16, - ((i4 ^16)&16)?blank:i4 ^16, ((i5 ^16)&16)?blank:i5 ^16, ((i6 ^16)&16)?blank:i6 ^16, ((i7 ^16)&16)?blank:i7 ^16, - ((i8 ^16)&16)?blank:i8 ^16, ((i9 ^16)&16)?blank:i9 ^16, ((i10^16)&16)?blank:i10^16, ((i11^16)&16)?blank:i11^16, - ((i12^16)&16)?blank:i12^16, ((i13^16)&16)?blank:i13^16, ((i14^16)&16)?blank:i14^16, ((i15^16)&16)?blank:i15^16 > (b); - - if (blank == -1) { - // we have zeroed, need only to OR - return _mm256_or_si256(ta, tb); - } - // no zeroing, need to blend - mask = constant8i < - int(((i0 & 16) ? 0xFFFF : 0) | ((i1 & 16) ? 0xFFFF0000 : 0)), - int(((i2 & 16) ? 0xFFFF : 0) | ((i3 & 16) ? 0xFFFF0000 : 0)), - int(((i4 & 16) ? 0xFFFF : 0) | ((i5 & 16) ? 0xFFFF0000 : 0)), - int(((i6 & 16) ? 0xFFFF : 0) | ((i7 & 16) ? 0xFFFF0000 : 0)), - int(((i8 & 16) ? 0xFFFF : 0) | ((i9 & 16) ? 0xFFFF0000 : 0)), - int(((i10& 16) ? 0xFFFF : 0) | ((i11& 16) ? 0xFFFF0000 : 0)), - int(((i12& 16) ? 0xFFFF : 0) | ((i13& 16) ? 0xFFFF0000 : 0)), - int(((i14& 16) ? 0xFFFF : 0) | ((i15& 16) ? 0xFFFF0000 : 0)) > (); - - return _mm256_blendv_epi8(ta, tb, mask); // blend -} - -template -static inline Vec16us blend16us(Vec16us const & a, Vec16us const & b) { - return Vec16us( blend16s (a,b)); -} - -template -static inline Vec32c blend32c(Vec32c const & a, Vec32c const & b) { - // collect bit 5 of each index - const int m1 = - (i0 &32)>>5 | (i1 &32)>>4 | (i2 &32)>>3 | (i3 &32)>>2 | (i4 &32)>>1 | (i5 &32) | (i6 &32)<<1 | (i7 &32)<<2 | - (i8 &32)<<3 | (i9 &32)<<4 | (i10&32)<<5 | (i11&32)<<6 | (i12&32)<<7 | (i13&32)<<8 | (i14&32)<<9 | (i15&32)<<10 | - (i16&32)<<11 | (i17&32)<<12 | (i18&32)<<13 | (i19&32)<<14 | (i20&32)<<15 | (i21&32)<<16 | (i22&32)<<17 | (i23&32)<<18 | - (i24&32)<<19 | (i25&32)<<20 | (i26&32)<<21 | (i27&32)<<22 | (i28&32)<<23 | (i29&32)<<24 | (i30&32)<<25 | (i31&32)<<26 ; - - // check which elements to set to zero - const int mz = ~ ( - (i0 <0) | (i1 <0)<<1 | (i2 <0)<<2 | (i3 <0)<<3 | (i4 <0)<<4 | (i5 <0)<<5 | (i6 <0)<<6 | (i7 <0)<<7 | - (i8 <0)<<8 | (i9 <0)<<9 | (i10<0)<<10 | (i11<0)<<11 | (i12<0)<<12 | (i13<0)<<13 | (i14<0)<<14 | (i15<0)<<15 | - (i16<0)<<16 | (i17<0)<<17 | (i18<0)<<18 | (i19<0)<<19 | (i20<0)<<20 | (i21<0)<<21 | (i22<0)<<22 | (i23<0)<<23 | - (i24<0)<<24 | (i25<0)<<25 | (i26<0)<<26 | (i27<0)<<27 | (i28<0)<<28 | (i29<0)<<29 | (i30<0)<<30 | (i31<0)<<31 ); - - __m256i t1, mask; - - // special case: all zero - if (mz == 0) return _mm256_setzero_si256(); - - // special case: all from a - if ((m1 & mz) == 0) { - return permute32c (a); - } - - // special case: all from b - if ((~m1 & mz) == 0) { - return permute32c (b); - } - - // special case: blend without permute - if ((i0 <0||(i0 &31)== 0) && (i1 <0||(i1 &31)== 1) && (i2 <0||(i2 &31)== 2) && (i3 <0||(i3 &31)== 3) && - (i4 <0||(i4 &31)== 4) && (i5 <0||(i5 &31)== 5) && (i6 <0||(i6 &31)== 6) && (i7 <0||(i7 &31)== 7) && - (i8 <0||(i8 &31)== 8) && (i9 <0||(i9 &31)== 9) && (i10<0||(i10&31)==10) && (i11<0||(i11&31)==11) && - (i12<0||(i12&31)==12) && (i13<0||(i13&31)==13) && 
(i14<0||(i14&31)==14) && (i15<0||(i15&31)==15) && - (i16<0||(i16&31)==16) && (i17<0||(i17&31)==17) && (i18<0||(i18&31)==18) && (i19<0||(i19&31)==19) && - (i20<0||(i20&31)==20) && (i21<0||(i21&31)==21) && (i22<0||(i22&31)==22) && (i23<0||(i23&31)==23) && - (i24<0||(i24&31)==24) && (i25<0||(i25&31)==25) && (i26<0||(i26&31)==26) && (i27<0||(i27&31)==27) && - (i28<0||(i28&31)==28) && (i29<0||(i29&31)==29) && (i30<0||(i30&31)==30) && (i31<0||(i31&31)==31) ) { - - mask = constant8i < - int(((i0 <<2)&0x80) | ((i1 <<10)&0x8000) | ((i2 <<18)&0x800000) | (uint32_t(i3 <<26)&0x80000000)) , - int(((i4 <<2)&0x80) | ((i5 <<10)&0x8000) | ((i6 <<18)&0x800000) | (uint32_t(i7 <<26)&0x80000000)) , - int(((i8 <<2)&0x80) | ((i9 <<10)&0x8000) | ((i10<<18)&0x800000) | (uint32_t(i11<<26)&0x80000000)) , - int(((i12<<2)&0x80) | ((i13<<10)&0x8000) | ((i14<<18)&0x800000) | (uint32_t(i15<<26)&0x80000000)) , - int(((i16<<2)&0x80) | ((i17<<10)&0x8000) | ((i18<<18)&0x800000) | (uint32_t(i19<<26)&0x80000000)) , - int(((i20<<2)&0x80) | ((i21<<10)&0x8000) | ((i22<<18)&0x800000) | (uint32_t(i23<<26)&0x80000000)) , - int(((i24<<2)&0x80) | ((i25<<10)&0x8000) | ((i26<<18)&0x800000) | (uint32_t(i27<<26)&0x80000000)) , - int(((i28<<2)&0x80) | ((i29<<10)&0x8000) | ((i30<<18)&0x800000) | (uint32_t(i31<<26)&0x80000000)) > (); - - t1 = _mm256_blendv_epi8(a, b, mask); // blend - - if (mz != -1) { - // zero some elements - const __m256i maskz = constant8i < - int((i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000)), - int((i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000)), - int((i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000)), - int((i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000)), - int((i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000)), - int((i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000)), - int((i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000)), - int((i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000)) > (); - return _mm256_and_si256(t1, maskz); - } - return t1; - } - - // special case: shift left - const int slb = i0 > 0 ? 
i0 : i31 - 31; - if (slb > 0 && slb < 32 - && (i0 ==slb+ 0||i0 <0) && (i1 ==slb+ 1||i1 <0) && (i2 ==slb+ 2||i2 <0) && (i3 ==slb+ 3||i3 <0) - && (i4 ==slb+ 4||i4 <0) && (i5 ==slb+ 5||i5 <0) && (i6 ==slb+ 6||i6 <0) && (i7 ==slb+ 7||i7 <0) - && (i8 ==slb+ 8||i8 <0) && (i9 ==slb+ 9||i9 <0) && (i10==slb+10||i10<0) && (i11==slb+11||i11<0) - && (i12==slb+12||i12<0) && (i13==slb+13||i13<0) && (i14==slb+14||i14<0) && (i15==slb+15||i15<0) - && (i16==slb+16||i16<0) && (i17==slb+17||i17<0) && (i18==slb+18||i18<0) && (i19==slb+19||i19<0) - && (i20==slb+20||i20<0) && (i21==slb+21||i21<0) && (i22==slb+22||i22<0) && (i23==slb+23||i23<0) - && (i24==slb+24||i24<0) && (i25==slb+25||i25<0) && (i26==slb+26||i26<0) && (i27==slb+27||i27<0) - && (i28==slb+28||i28<0) && (i29==slb+29||i29<0) && (i30==slb+30||i30<0) && (i31==slb+31||i31<0)) { - t1 = _mm256_permute2x128_si256(a, b, 0x21); - if (slb < 16) t1 = _mm256_alignr_epi8(t1, a, slb & 15); - else t1 = _mm256_alignr_epi8(b, t1, slb & 15); - if (mz != -1) { - // zero some elements - const __m256i maskz = constant8i < - int((i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000)), - int((i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000)), - int((i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000)), - int((i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000)), - int((i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000)), - int((i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000)), - int((i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000)), - int((i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000)) > (); - return _mm256_and_si256(t1, maskz); - } - return t1; - } - // special case: shift right - const int srb = i0 > 0 ? 
(i0^32) : (i31^32) - 31; - if (srb > 0 && srb < 32 - && ((i0 ^32)==srb+ 0||i0 <0) && ((i1 ^32)==srb+ 1||i1 <0) && ((i2 ^32)==srb+ 2||i2 <0) && ((i3 ^32)==srb+ 3||i3 <0) - && ((i4 ^32)==srb+ 4||i4 <0) && ((i5 ^32)==srb+ 5||i5 <0) && ((i6 ^32)==srb+ 6||i6 <0) && ((i7 ^32)==srb+ 7||i7 <0) - && ((i8 ^32)==srb+ 8||i8 <0) && ((i9 ^32)==srb+ 9||i9 <0) && ((i10^32)==srb+10||i10<0) && ((i11^32)==srb+11||i11<0) - && ((i12^32)==srb+12||i12<0) && ((i13^32)==srb+13||i13<0) && ((i14^32)==srb+14||i14<0) && ((i15^32)==srb+15||i15<0) - && ((i16^32)==srb+16||i16<0) && ((i17^32)==srb+17||i17<0) && ((i18^32)==srb+18||i18<0) && ((i19^32)==srb+19||i19<0) - && ((i20^32)==srb+20||i20<0) && ((i21^32)==srb+21||i21<0) && ((i22^32)==srb+22||i22<0) && ((i23^32)==srb+23||i23<0) - && ((i24^32)==srb+24||i24<0) && ((i25^32)==srb+25||i25<0) && ((i26^32)==srb+26||i26<0) && ((i27^32)==srb+27||i27<0) - && ((i28^32)==srb+28||i28<0) && ((i29^32)==srb+29||i29<0) && ((i30^32)==srb+30||i30<0) && ((i31^32)==srb+31||i31<0)) { - t1 = _mm256_permute2x128_si256(b, a, 0x21); - if (srb < 16) t1 = _mm256_alignr_epi8(t1, b, srb & 15); - else t1 = _mm256_alignr_epi8(a, t1, srb & 15); - if (mz != -1) { - // zero some elements - const __m256i maskz = constant8i < - int((i0 <0?0:0xFF) | (i1 <0?0:0xFF00) | (i2 <0?0:0xFF0000) | (i3 <0?0:0xFF000000)), - int((i4 <0?0:0xFF) | (i5 <0?0:0xFF00) | (i6 <0?0:0xFF0000) | (i7 <0?0:0xFF000000)), - int((i8 <0?0:0xFF) | (i9 <0?0:0xFF00) | (i10<0?0:0xFF0000) | (i11<0?0:0xFF000000)), - int((i12<0?0:0xFF) | (i13<0?0:0xFF00) | (i14<0?0:0xFF0000) | (i15<0?0:0xFF000000)), - int((i16<0?0:0xFF) | (i17<0?0:0xFF00) | (i18<0?0:0xFF0000) | (i19<0?0:0xFF000000)), - int((i20<0?0:0xFF) | (i21<0?0:0xFF00) | (i22<0?0:0xFF0000) | (i23<0?0:0xFF000000)), - int((i24<0?0:0xFF) | (i25<0?0:0xFF00) | (i26<0?0:0xFF0000) | (i27<0?0:0xFF000000)), - int((i28<0?0:0xFF) | (i29<0?0:0xFF00) | (i30<0?0:0xFF0000) | (i31<0?0:0xFF000000)) > (); - return _mm256_and_si256(t1, maskz); - } - return t1; - } - - // general case: permute and blend and possible zero - const int blank = (mz == -1) ? 
-0x100 : -1; // ignore or zero - - // permute and blend - __m256i ta = permute32c < - (i0 &32)?blank:i0 , (i1 &32)?blank:i1 , (i2 &32)?blank:i2 , (i3 &32)?blank:i3 , - (i4 &32)?blank:i4 , (i5 &32)?blank:i5 , (i6 &32)?blank:i6 , (i7 &32)?blank:i7 , - (i8 &32)?blank:i8 , (i9 &32)?blank:i9 , (i10&32)?blank:i10, (i11&32)?blank:i11, - (i12&32)?blank:i12, (i13&32)?blank:i13, (i14&32)?blank:i14, (i15&32)?blank:i15, - (i16&32)?blank:i16, (i17&32)?blank:i17, (i18&32)?blank:i18, (i19&32)?blank:i19, - (i20&32)?blank:i20, (i21&32)?blank:i21, (i22&32)?blank:i22, (i23&32)?blank:i23, - (i24&32)?blank:i24, (i25&32)?blank:i25, (i26&32)?blank:i26, (i27&32)?blank:i27, - (i28&32)?blank:i28, (i29&32)?blank:i29, (i30&32)?blank:i30, (i31&32)?blank:i31 > (a); - - __m256i tb = permute32c < - ((i0 ^32)&32)?blank:i0 ^32, ((i1 ^32)&32)?blank:i1 ^32, ((i2 ^32)&32)?blank:i2 ^32, ((i3 ^32)&32)?blank:i3 ^32, - ((i4 ^32)&32)?blank:i4 ^32, ((i5 ^32)&32)?blank:i5 ^32, ((i6 ^32)&32)?blank:i6 ^32, ((i7 ^32)&32)?blank:i7 ^32, - ((i8 ^32)&32)?blank:i8 ^32, ((i9 ^32)&32)?blank:i9 ^32, ((i10^32)&32)?blank:i10^32, ((i11^32)&32)?blank:i11^32, - ((i12^32)&32)?blank:i12^32, ((i13^32)&32)?blank:i13^32, ((i14^32)&32)?blank:i14^32, ((i15^32)&32)?blank:i15^32, - ((i16^32)&32)?blank:i16^32, ((i17^32)&32)?blank:i17^32, ((i18^32)&32)?blank:i18^32, ((i19^32)&32)?blank:i19^32, - ((i20^32)&32)?blank:i20^32, ((i21^32)&32)?blank:i21^32, ((i22^32)&32)?blank:i22^32, ((i23^32)&32)?blank:i23^32, - ((i24^32)&32)?blank:i24^32, ((i25^32)&32)?blank:i25^32, ((i26^32)&32)?blank:i26^32, ((i27^32)&32)?blank:i27^32, - ((i28^32)&32)?blank:i28^32, ((i29^32)&32)?blank:i29^32, ((i30^32)&32)?blank:i30^32, ((i31^32)&32)?blank:i31^32 > (b); - - if (blank == -1) { - // we have zeroed, need only to OR - return _mm256_or_si256(ta, tb); - } - // no zeroing, need to blend - mask = constant8i < - int(((i0 <<2)&0x80) | ((i1 <<10)&0x8000) | ((i2 <<18)&0x800000) | (uint32_t(i3 <<26)&0x80000000)) , - int(((i4 <<2)&0x80) | ((i5 <<10)&0x8000) | ((i6 <<18)&0x800000) | (uint32_t(i7 <<26)&0x80000000)) , - int(((i8 <<2)&0x80) | ((i9 <<10)&0x8000) | ((i10<<18)&0x800000) | (uint32_t(i11<<26)&0x80000000)) , - int(((i12<<2)&0x80) | ((i13<<10)&0x8000) | ((i14<<18)&0x800000) | (uint32_t(i15<<26)&0x80000000)) , - int(((i16<<2)&0x80) | ((i17<<10)&0x8000) | ((i18<<18)&0x800000) | (uint32_t(i19<<26)&0x80000000)) , - int(((i20<<2)&0x80) | ((i21<<10)&0x8000) | ((i22<<18)&0x800000) | (uint32_t(i23<<26)&0x80000000)) , - int(((i24<<2)&0x80) | ((i25<<10)&0x8000) | ((i26<<18)&0x800000) | (uint32_t(i27<<26)&0x80000000)) , - int(((i28<<2)&0x80) | ((i29<<10)&0x8000) | ((i30<<18)&0x800000) | (uint32_t(i31<<26)&0x80000000)) > (); - - return _mm256_blendv_epi8(ta, tb, mask); // blend -} - -template < - int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, - int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15, - int i16, int i17, int i18, int i19, int i20, int i21, int i22, int i23, - int i24, int i25, int i26, int i27, int i28, int i29, int i30, int i31 > - static inline Vec32uc blend32uc(Vec32uc const & a, Vec32uc const & b) { - return Vec32uc (blend32c (a, b)); -} - - -/***************************************************************************** -* -* Vector lookup functions -* -****************************************************************************** -* -* These functions use vector elements as indexes into a table. -* The table is given as one or more vectors or as an array. 
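*
* An added illustrative example of the array form (it uses the lookup<n>
* templates defined below, where n is a compile-time bound on the table size):
* int32_t tbl[8] = {100,101,102,103,104,105,106,107};
* Vec8i idx(2,0,0,6,4,3,5,0);        // index idx is ( 2, 0, 0, 6, 4, 3, 5, 0)
* Vec8i d = lookup<8>(idx, tbl);     // d is (102, 100, 100, 106, 104, 103, 105, 100)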
-* -* This can be used for several purposes: -* - table lookup -* - permute or blend with variable indexes -* - blend from more than two sources -* - gather non-contiguous data -* -* An index out of range may produce any value - the actual value produced is -* implementation dependent and may be different for different instruction -* sets. An index out of range does not produce an error message or exception. -* -* Example: -* Vec8i a(2,0,0,6,4,3,5,0); // index a is ( 2, 0, 0, 6, 4, 3, 5, 0) -* Vec8i b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107) -* Vec8i c; -* c = lookup8 (a,b); // c is (102, 100, 100, 106, 104, 103, 105, 100) -* -*****************************************************************************/ - -static inline Vec32c lookup32(Vec32c const & index, Vec32c const & table) { -#ifdef __XOP__ // AMD XOP instruction set. Use VPPERM - Vec16c t0 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_low()); - Vec16c t1 = _mm_perm_epi8(table.get_low(), table.get_high(), index.get_high()); - return Vec32c(t0, t1); -#else - Vec32c f0 = constant8i<0,0,0,0,0x10101010,0x10101010,0x10101010,0x10101010>(); - Vec32c f1 = constant8i<0x10101010,0x10101010,0x10101010,0x10101010,0,0,0,0>(); - Vec32c tablef = _mm256_permute4x64_epi64(table, 0x4E); // low and high parts swapped - Vec32c r0 = _mm256_shuffle_epi8(table, (index ^ f0) + 0x70); - Vec32c r1 = _mm256_shuffle_epi8(tablef, (index ^ f1) + 0x70); - return r0 | r1; -#endif -} - -template <int n> -static inline Vec32c lookup(Vec32uc const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 16) { - Vec16c tt = Vec16c().load(table); - Vec16c r0 = lookup16(index.get_low(), tt); - Vec16c r1 = lookup16(index.get_high(), tt); - return Vec32c(r0, r1); - } - if (n <= 32) return lookup32(index, Vec32c().load(table)); - // n > 32. Limit index - Vec32uc index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec32uc(index) & uint8_t(n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec32uc(index), uint8_t(n-1)); - } - Vec8ui mask0 = Vec8ui(0x000000FF); // mask 8 bits - Vec32c t0 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & Vec8ui(index1)), 1); // positions 0, 4, 8, ... - Vec32c t1 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1, 8)), 1); // positions 1, 5, 9, ... - Vec32c t2 = _mm256_i32gather_epi32((const int *)table, __m256i(mask0 & _mm256_srli_epi32(index1,16)), 1); // positions 2, 6, 10, ... - Vec32c t3 = _mm256_i32gather_epi32((const int *)table, _mm256_srli_epi32(index1,24), 1); // positions 3, 7, 11, ... - t0 = t0 & mask0; - t1 = _mm256_slli_epi32(t1 & mask0, 8); - t2 = _mm256_slli_epi32(t2 & mask0, 16); - t3 = _mm256_slli_epi32(t3, 24); - return (t0 | t3) | (t1 | t2); -} - -template <int n> -static inline Vec32c lookup(Vec32c const & index, void const * table) { - return lookup<n>(Vec32uc(index), table); -} - - -static inline Vec16s lookup16(Vec16s const & index, Vec16s const & table) { - return Vec16s(lookup32(Vec32c(index * 0x202 + 0x100), Vec32c(table))); -} - -template <int n> -static inline Vec16s lookup(Vec16s const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 8) { - Vec8s table1 = Vec8s().load(table); - return Vec16s( - lookup8 (index.get_low(), table1), - lookup8 (index.get_high(), table1)); - } - if (n <= 16) return lookup16(index, Vec16s().load(table)); - // n > 16.
Limit index - Vec16us index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec16us(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec16us(index), n-1); - } - Vec16s t1 = _mm256_i32gather_epi32((const int *)table, __m256i(Vec8ui(index1) & 0x0000FFFF), 2); // even positions - Vec16s t2 = _mm256_i32gather_epi32((const int *)table, _mm256_srli_epi32(index1, 16) , 2); // odd positions - return blend16s<0,16,2,18,4,20,6,22,8,24,10,26,12,28,14,30>(t1, t2); -} - -static inline Vec8i lookup8(Vec8i const & index, Vec8i const & table) { - return _mm256_permutevar8x32_epi32(table, index); -} - -template <int n> -static inline Vec8i lookup(Vec8i const & index, void const * table) { - if (n <= 0) return 0; - if (n <= 8) { - Vec8i table1 = Vec8i().load(table); - return lookup8(index, table1); - } - if (n <= 16) { - Vec8i table1 = Vec8i().load(table); - Vec8i table2 = Vec8i().load((int32_t const*)table + 8); - Vec8i y1 = lookup8(index, table1); - Vec8i y2 = lookup8(index, table2); - Vec8ib s = index > 7; - return select(s, y2, y1); - } - // n > 16. Limit index - Vec8ui index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec8ui(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1 - index1 = min(Vec8ui(index), n-1); - } - return _mm256_i32gather_epi32((const int *)table, index1, 4); -} - -static inline Vec4q lookup4(Vec4q const & index, Vec4q const & table) { - return Vec4q(lookup8(Vec8i(index * 0x200000002ll + 0x100000000ll), Vec8i(table))); -} - -template <int n> -static inline Vec4q lookup(Vec4q const & index, int64_t const * table) { - if (n <= 0) return 0; - // n > 0. Limit index - Vec4uq index1; - if ((n & (n-1)) == 0) { - // n is a power of 2, make index modulo n - index1 = Vec4uq(index) & (n-1); - } - else { - // n is not a power of 2, limit to n-1. - // There is no 64-bit min instruction, but we can use the 32-bit unsigned min, - // since n is a 32-bit integer - index1 = Vec4uq(min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>())); - } -// old compilers can't agree how to define a 64 bit integer. Intel and MS use __int64, gcc use long long -#if defined (__clang__) && CLANG_VERSION < 30400 -// clang 3.3 uses const int * in accordance with official Intel doc., which is wrong. will be fixed - return _mm256_i64gather_epi64((const int *)table, index1, 8); -#elif defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) -// Old MS and Intel use non-standard type __int64 - return _mm256_i64gather_epi64((const int64_t *)table, index1, 8); -#else -// Gnu, Clang 3.4, MS 11.0 - return _mm256_i64gather_epi64((const long long *)table, index1, 8); -#endif -} - - -/***************************************************************************** -* -* Other permutations with variable indexes -* -*****************************************************************************/ - -// Function shift_bytes_up: shift whole vector left by b bytes.
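As a usage illustration of the lookup functions above, the sketch below builds an index vector and reads from both an in-register table and an array in memory. This is an editorial example, not part of the patch: the include name and the sample values are assumptions, and the expected results simply restate the example given in the comment block above.

    #include "vectorclass.h"   // assumed include name for the VCL headers used by this project
    #include <cstdint>
    #include <cstdio>

    int main() {
        // in-register lookup: each index selects one element of the table vector
        Vec8i idx(2, 0, 0, 6, 4, 3, 5, 0);
        Vec8i tbl(100, 101, 102, 103, 104, 105, 106, 107);
        Vec8i r1 = lookup8(idx, tbl);          // expected (102, 100, 100, 106, 104, 103, 105, 100)

        // lookup<n>: the table lives in memory and n bounds the usable index range
        int32_t table[16] = {0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150};
        Vec8i r2 = lookup<16>(idx, table);     // per-element table[idx[i]]

        for (int i = 0; i < 8; i++)
            std::printf("%d %d\n", r1.extract(i), r2.extract(i));
        return 0;
    }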
-// You may use a permute function instead if b is a compile-time constant -static inline Vec32c shift_bytes_up(Vec32c const & a, int b) { - if (b < 16) { - return Vec32c(shift_bytes_up(a.get_low(),b), shift_bytes_up(a.get_high(),b) | shift_bytes_down(a.get_low(),16-b)); - } - else { - return Vec32c(Vec16c(0), shift_bytes_up(a.get_high(),b-16)); - } -} - -// Function shift_bytes_down: shift whole vector right by b bytes -// You may use a permute function instead if b is a compile-time constant -static inline Vec32c shift_bytes_down(Vec32c const & a, int b) { - if (b < 16) { - return Vec32c(shift_bytes_down(a.get_low(),b) | shift_bytes_up(a.get_high(),16-b), shift_bytes_down(a.get_high(),b)); - } - else { - return Vec32c(shift_bytes_down(a.get_high(),b-16), Vec16c(0)); - } -} - -/***************************************************************************** -* -* Gather functions with fixed indexes -* -*****************************************************************************/ -// Load elements from array a with indices i0, i1, i2, i3, i4, i5, i6, i7 -template -static inline Vec8i gather8i(void const * a) { - Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index; // Error message if index is negative - const int i01min = i0 < i1 ? i0 : i1; - const int i23min = i2 < i3 ? i2 : i3; - const int i45min = i4 < i5 ? i4 : i5; - const int i67min = i6 < i7 ? i6 : i7; - const int i0123min = i01min < i23min ? i01min : i23min; - const int i4567min = i45min < i67min ? i45min : i67min; - const int imin = i0123min < i4567min ? i0123min : i4567min; - const int i01max = i0 > i1 ? i0 : i1; - const int i23max = i2 > i3 ? i2 : i3; - const int i45max = i4 > i5 ? i4 : i5; - const int i67max = i6 > i7 ? i6 : i7; - const int i0123max = i01max > i23max ? i01max : i23max; - const int i4567max = i45max > i67max ? i45max : i67max; - const int imax = i0123max > i4567max ? i0123max : i4567max; - - if (imax - imin <= 7) { - // load one contiguous block and permute - if (imax > 7) { - // make sure we don't read past the end of the array - Vec8i b = Vec8i().load((int32_t const *)a + imax-7); - return permute8i(b); - } - else { - Vec8i b = Vec8i().load((int32_t const *)a + imin); - return permute8i(b); - } - } - if ((i0imax-8) && (i1imax-8) && (i2imax-8) && (i3imax-8) - && (i4imax-8) && (i5imax-8) && (i6imax-8) && (i7imax-8)) { - // load two contiguous blocks and blend - Vec8i b = Vec8i().load((int32_t const *)a + imin); - Vec8i c = Vec8i().load((int32_t const *)a + imax-7); - const int j0 = i0(b, c); - } - // use AVX2 gather - return _mm256_i32gather_epi32((const int *)a, Vec8i(i0,i1,i2,i3,i4,i5,i6,i7), 4); -} - -template -static inline Vec4q gather4q(void const * a) { - Static_error_check<(i0|i1|i2|i3)>=0> Negative_array_index; // Error message if index is negative - const int i01min = i0 < i1 ? i0 : i1; - const int i23min = i2 < i3 ? i2 : i3; - const int imin = i01min < i23min ? i01min : i23min; - const int i01max = i0 > i1 ? i0 : i1; - const int i23max = i2 > i3 ? i2 : i3; - const int imax = i01max > i23max ? 
i01max : i23max; - if (imax - imin <= 3) { - // load one contiguous block and permute - if (imax > 3) { - // make sure we don't read past the end of the array - Vec4q b = Vec4q().load((int64_t const *)a + imax-3); - return permute4q(b); - } - else { - Vec4q b = Vec4q().load((int64_t const *)a + imin); - return permute4q(b); - } - } - if ((i0imax-4) && (i1imax-4) && (i2imax-4) && (i3imax-4)) { - // load two contiguous blocks and blend - Vec4q b = Vec4q().load((int64_t const *)a + imin); - Vec4q c = Vec4q().load((int64_t const *)a + imax-3); - const int j0 = i0(b, c); - } - // use AVX2 gather - // old compilers can't agree how to define a 64 bit integer. Intel and MS use __int64, gcc use long long -#if defined (__clang__) && CLANG_VERSION < 30400 - // clang 3.3 uses const int * in accordance with official Intel doc., which is wrong. will be fixed - return _mm256_i32gather_epi64((const int *)a, Vec4i(i0,i1,i2,i3), 8); -#elif defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER) - // Old MS and Intel use non-standard type __int64 - return _mm256_i32gather_epi64((const int64_t *)a, Vec4i(i0,i1,i2,i3), 8); -#else - // Gnu, Clang 3.4, MS 11.0 - return _mm256_i32gather_epi64((const long long *)a, Vec4i(i0,i1,i2,i3), 8); -#endif -} - - -/***************************************************************************** -* -* Vector scatter functions -* -****************************************************************************** -* -* These functions write the elements of a vector to arbitrary positions in an -* array in memory. Each vector element is written to an array position -* determined by an index. An element is not written if the corresponding -* index is out of range. -* The indexes can be specified as constant template parameters or as an -* integer vector. -* -* The scatter functions are useful if the data are distributed in a sparce -* manner into the array. If the array is dense then it is more efficient -* to permute the data into the right positions and then write the whole -* permuted vector into the array. 
-* -* Example: -* Vec8q a(10,11,12,13,14,15,16,17); -* int64_t b[16] = {0}; -* scatter<0,2,14,10,1,-1,5,9>(a,b); -* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0} -* -*****************************************************************************/ - -template -static inline void scatter(Vec8i const & data, void * array) { -#if defined (__AVX512VL__) - __m256i indx = constant8i(); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3| (i4>=0)<<4| (i5>=0)<<5| (i6>=0)<<6| (i7>=0)<<7); - _mm256_mask_i32scatter_epi32((int*)array, mask, indx, data, 4); -#elif defined (__AVX512F__) - __m512i indx = _mm512_castsi256_si512(constant8i()); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3| (i4>=0)<<4| (i5>=0)<<5| (i6>=0)<<6| (i7>=0)<<7); - _mm512_mask_i32scatter_epi32((int*)array, mask, indx, _mm512_castsi256_si512(data), 4); -#else - int32_t* arr = (int32_t*)array; - const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7}; - for (int i = 0; i < 8; i++) { - if (index[i] >= 0) arr[index[i]] = data[i]; - } -#endif -} - -template -static inline void scatter(Vec4q const & data, void * array) { -#if defined (__AVX512VL__) - __m128i indx = constant4i(); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3); - _mm256_mask_i32scatter_epi64((long long *)array, mask, indx, data, 8); -#elif defined (__AVX512F__) - __m256i indx = _mm256_castsi128_si256(constant4i()); - __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3); - _mm512_mask_i32scatter_epi64((long long*)array, mask, indx, _mm512_castsi256_si512(data), 8); -#else - int64_t* arr = (int64_t*)array; - const int index[4] = {i0,i1,i2,i3}; - for (int i = 0; i < 4; i++) { - if (index[i] >= 0) arr[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec8i const & index, uint32_t limit, Vec8i const & data, void * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit)); - _mm256_mask_i32scatter_epi32((int*)array, mask, index, data, 4); -#elif defined (__AVX512F__) - // 16 bit mask. upper 8 bits are (0<0) = false - __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit))); - _mm512_mask_i32scatter_epi32((int*)array, mask, _mm512_castsi256_si512(index), _mm512_castsi256_si512(data), 4); -#else - int32_t* arr = (int32_t*)array; - for (int i = 0; i < 8; i++) { - if (uint32_t(index[i]) < limit) arr[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec4q const & index, uint32_t limit, Vec4q const & data, void * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm256_cmplt_epu64_mask(index, Vec4uq(uint64_t(limit))); - _mm256_mask_i64scatter_epi64((long long*)array, mask, index, data, 8); -#elif defined (__AVX512F__) - // 16 bit mask. 
upper 8 bits are (0<0) = false - __mmask16 mask = _mm512_cmplt_epu64_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec4uq(uint64_t(limit)))); - _mm512_mask_i64scatter_epi64((long long*)array, mask, _mm512_castsi256_si512(index), _mm512_castsi256_si512(data), 8); -#else - int64_t* arr = (int64_t*)array; - for (int i = 0; i < 4; i++) { - if (uint64_t(index[i]) < uint64_t(limit)) arr[index[i]] = data[i]; - } -#endif -} - -static inline void scatter(Vec4i const & index, uint32_t limit, Vec4q const & data, void * array) { -#if defined (__AVX512VL__) - __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit)); - _mm256_mask_i32scatter_epi64((long long*)array, mask, index, data, 8); -#elif defined (__AVX512F__) - // 16 bit mask. upper 8 bits are (0<0) = false - __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit))); - _mm512_mask_i32scatter_epi64((long long*)array, mask, _mm256_castsi128_si256(index), _mm512_castsi256_si512(data), 8); -#else - int64_t* arr = (int64_t*)array; - for (int i = 0; i < 4; i++) { - if (uint32_t(index[i]) < limit) arr[index[i]] = data[i]; - } -#endif -} - -/***************************************************************************** -* -* Functions for conversion between integer sizes -* -*****************************************************************************/ - -// Extend 8-bit integers to 16-bit integers, signed and unsigned - -// Function extend_low : extends the low 16 elements to 16 bits with sign extension -static inline Vec16s extend_low (Vec32c const & a) { - __m256i a2 = permute4q<0,-256,1,-256>(Vec4q(a)); // get bits 64-127 to position 128-191 - __m256i sign = _mm256_cmpgt_epi8(_mm256_setzero_si256(),a2); // 0 > a2 - return _mm256_unpacklo_epi8(a2, sign); // interleave with sign extensions -} - -// Function extend_high : extends the high 16 elements to 16 bits with sign extension -static inline Vec16s extend_high (Vec32c const & a) { - __m256i a2 = permute4q<-256,2,-256,3>(Vec4q(a)); // get bits 128-191 to position 64-127 - __m256i sign = _mm256_cmpgt_epi8(_mm256_setzero_si256(),a2); // 0 > a2 - return _mm256_unpackhi_epi8(a2, sign); // interleave with sign extensions -} - -// Function extend_low : extends the low 16 elements to 16 bits with zero extension -static inline Vec16us extend_low (Vec32uc const & a) { - __m256i a2 = permute4q<0,-256,1,-256>(Vec4q(a)); // get bits 64-127 to position 128-191 - return _mm256_unpacklo_epi8(a2, _mm256_setzero_si256()); // interleave with zero extensions -} - -// Function extend_high : extends the high 19 elements to 16 bits with zero extension -static inline Vec16us extend_high (Vec32uc const & a) { - __m256i a2 = permute4q<-256,2,-256,3>(Vec4q(a)); // get bits 128-191 to position 64-127 - return _mm256_unpackhi_epi8(a2, _mm256_setzero_si256()); // interleave with zero extensions -} - -// Extend 16-bit integers to 32-bit integers, signed and unsigned - -// Function extend_low : extends the low 8 elements to 32 bits with sign extension -static inline Vec8i extend_low (Vec16s const & a) { - __m256i a2 = permute4q<0,-256,1,-256>(Vec4q(a)); // get bits 64-127 to position 128-191 - __m256i sign = _mm256_srai_epi16(a2, 15); // sign bit - return _mm256_unpacklo_epi16(a2 ,sign); // interleave with sign extensions -} - -// Function extend_high : extends the high 8 elements to 32 bits with sign extension -static inline Vec8i extend_high (Vec16s const & a) { - __m256i a2 = permute4q<-256,2,-256,3>(Vec4q(a)); // get bits 128-191 to position 64-127 - __m256i 
sign = _mm256_srai_epi16(a2, 15); // sign bit - return _mm256_unpackhi_epi16(a2, sign); // interleave with sign extensions -} - -// Function extend_low : extends the low 8 elements to 32 bits with zero extension -static inline Vec8ui extend_low (Vec16us const & a) { - __m256i a2 = permute4q<0,-256,1,-256>(Vec4q(a)); // get bits 64-127 to position 128-191 - return _mm256_unpacklo_epi16(a2, _mm256_setzero_si256()); // interleave with zero extensions -} - -// Function extend_high : extends the high 8 elements to 32 bits with zero extension -static inline Vec8ui extend_high (Vec16us const & a) { - __m256i a2 = permute4q<-256,2,-256,3>(Vec4q(a)); // get bits 128-191 to position 64-127 - return _mm256_unpackhi_epi16(a2, _mm256_setzero_si256()); // interleave with zero extensions -} - -// Extend 32-bit integers to 64-bit integers, signed and unsigned - -// Function extend_low : extends the low 4 elements to 64 bits with sign extension -static inline Vec4q extend_low (Vec8i const & a) { - __m256i a2 = permute4q<0,-256,1,-256>(Vec4q(a)); // get bits 64-127 to position 128-191 - __m256i sign = _mm256_srai_epi32(a2, 31); // sign bit - return _mm256_unpacklo_epi32(a2, sign); // interleave with sign extensions -} - -// Function extend_high : extends the high 4 elements to 64 bits with sign extension -static inline Vec4q extend_high (Vec8i const & a) { - __m256i a2 = permute4q<-256,2,-256,3>(Vec4q(a)); // get bits 128-191 to position 64-127 - __m256i sign = _mm256_srai_epi32(a2, 31); // sign bit - return _mm256_unpackhi_epi32(a2, sign); // interleave with sign extensions -} - -// Function extend_low : extends the low 4 elements to 64 bits with zero extension -static inline Vec4uq extend_low (Vec8ui const & a) { - __m256i a2 = permute4q<0,-256,1,-256>(Vec4q(a)); // get bits 64-127 to position 128-191 - return _mm256_unpacklo_epi32(a2, _mm256_setzero_si256()); // interleave with zero extensions -} - -// Function extend_high : extends the high 4 elements to 64 bits with zero extension -static inline Vec4uq extend_high (Vec8ui const & a) { - __m256i a2 = permute4q<-256,2,-256,3>(Vec4q(a)); // get bits 128-191 to position 64-127 - return _mm256_unpackhi_epi32(a2, _mm256_setzero_si256()); // interleave with zero extensions -} - -// Compress 16-bit integers to 8-bit integers, signed and unsigned, with and without saturation - -// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers -// Overflow wraps around -static inline Vec32c compress (Vec16s const & low, Vec16s const & high) { - __m256i mask = _mm256_set1_epi32(0x00FF00FF); // mask for low bytes - __m256i lowm = _mm256_and_si256(low, mask); // bytes of low - __m256i highm = _mm256_and_si256(high, mask); // bytes of high - __m256i pk = _mm256_packus_epi16(lowm, highm); // unsigned pack - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers -// Signed, with saturation -static inline Vec32c compress_saturated (Vec16s const & low, Vec16s const & high) { - __m256i pk = _mm256_packs_epi16(low,high); // packed with signed saturation - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Function compress : packs two vectors of 16-bit integers to one vector of 8-bit integers -// Unsigned, overflow wraps around -static inline Vec32uc compress (Vec16us const & low, Vec16us const & high) { - return Vec32uc (compress((Vec16s)low, (Vec16s)high)); -} - -// Function compress : packs two vectors of 
16-bit integers into one vector of 8-bit integers -// Unsigned, with saturation -static inline Vec32uc compress_saturated (Vec16us const & low, Vec16us const & high) { - __m256i maxval = _mm256_set1_epi32(0x00FF00FF); // maximum value - __m256i minval = _mm256_setzero_si256(); // minimum value = 0 - __m256i low1 = _mm256_min_epu16(low,maxval); // upper limit - __m256i high1 = _mm256_min_epu16(high,maxval); // upper limit - __m256i low2 = _mm256_max_epu16(low1,minval); // lower limit - __m256i high2 = _mm256_max_epu16(high1,minval); // lower limit - __m256i pk = _mm256_packus_epi16(low2,high2); // this instruction saturates from signed 32 bit to unsigned 16 bit - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers -// Signed to unsigned, with saturation -static inline Vec32uc compress_saturated_s2u (Vec16s const & low, Vec16s const & high) { - __m256i pk = _mm256_packus_epi16(low,high); // this instruction saturates from signed 16 bit to unsigned 8 bit - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Compress 32-bit integers to 16-bit integers, signed and unsigned, with and without saturation - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Overflow wraps around -static inline Vec16s compress (Vec8i const & low, Vec8i const & high) { - __m256i mask = _mm256_set1_epi32(0x0000FFFF); // mask for low words - __m256i lowm = _mm256_and_si256(low,mask); // bytes of low - __m256i highm = _mm256_and_si256(high,mask); // bytes of high - __m256i pk = _mm256_packus_epi32(lowm,highm); // unsigned pack - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Signed with saturation -static inline Vec16s compress_saturated (Vec8i const & low, Vec8i const & high) { - __m256i pk = _mm256_packs_epi32(low,high); // pack with signed saturation - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Overflow wraps around -static inline Vec16us compress (Vec8ui const & low, Vec8ui const & high) { - return Vec16us (compress((Vec8i)low, (Vec8i)high)); -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Unsigned, with saturation -static inline Vec16us compress_saturated (Vec8ui const & low, Vec8ui const & high) { - __m256i maxval = _mm256_set1_epi32(0x0000FFFF); // maximum value - __m256i minval = _mm256_setzero_si256(); // minimum value = 0 - __m256i low1 = _mm256_min_epu32(low,maxval); // upper limit - __m256i high1 = _mm256_min_epu32(high,maxval); // upper limit - __m256i low2 = _mm256_max_epu32(low1,minval); // lower limit - __m256i high2 = _mm256_max_epu32(high1,minval); // lower limit - __m256i pk = _mm256_packus_epi32(low2,high2); // this instruction saturates from signed 32 bit to unsigned 16 bit - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Signed to unsigned, with saturation -static inline Vec16us compress_saturated_s2u (Vec8i const & low, Vec8i const & high) { - __m256i pk = _mm256_packus_epi32(low,high); // this instruction saturates from signed 32 bit to unsigned 16 bit - return _mm256_permute4x64_epi64(pk, 
0xD8); // put in right place -} - -// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation - -// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers -// Overflow wraps around -static inline Vec8i compress (Vec4q const & low, Vec4q const & high) { - __m256i low2 = _mm256_shuffle_epi32(low,0xD8); // low dwords of low to pos. 0 and 32 - __m256i high2 = _mm256_shuffle_epi32(high,0xD8); // low dwords of high to pos. 0 and 32 - __m256i pk = _mm256_unpacklo_epi64(low2,high2); // interleave - return _mm256_permute4x64_epi64(pk, 0xD8); // put in right place -} - -// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers -// Signed, with saturation -static inline Vec8i compress_saturated (Vec4q const & a, Vec4q const & b) { - Vec4q maxval = constant8ui<0x7FFFFFFF,0,0x7FFFFFFF,0,0x7FFFFFFF,0,0x7FFFFFFF,0>(); - Vec4q minval = constant8ui<0x80000000,0xFFFFFFFF,0x80000000,0xFFFFFFFF,0x80000000,0xFFFFFFFF,0x80000000,0xFFFFFFFF>(); - Vec4q a1 = min(a,maxval); - Vec4q b1 = min(b,maxval); - Vec4q a2 = max(a1,minval); - Vec4q b2 = max(b1,minval); - return compress(a2,b2); -} - -// Function compress : packs two vectors of 32-bit integers into one vector of 16-bit integers -// Overflow wraps around -static inline Vec8ui compress (Vec4uq const & low, Vec4uq const & high) { - return Vec8ui (compress((Vec4q)low, (Vec4q)high)); -} - -// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers -// Unsigned, with saturation -static inline Vec8ui compress_saturated (Vec4uq const & low, Vec4uq const & high) { - __m256i zero = _mm256_setzero_si256(); // 0 - __m256i lowzero = _mm256_cmpeq_epi32(low,zero); // for each dword is zero - __m256i highzero = _mm256_cmpeq_epi32(high,zero); // for each dword is zero - __m256i mone = _mm256_set1_epi32(-1); // FFFFFFFF - __m256i lownz = _mm256_xor_si256(lowzero,mone); // for each dword is nonzero - __m256i highnz = _mm256_xor_si256(highzero,mone); // for each dword is nonzero - __m256i lownz2 = _mm256_srli_epi64(lownz,32); // shift down to low dword - __m256i highnz2 = _mm256_srli_epi64(highnz,32); // shift down to low dword - __m256i lowsatur = _mm256_or_si256(low,lownz2); // low, saturated - __m256i hisatur = _mm256_or_si256(high,highnz2); // high, saturated - return Vec8ui (compress(Vec4q(lowsatur), Vec4q(hisatur))); -} - - -/***************************************************************************** -* -* Integer division operators -* -* Please see the file vectori128.h for explanation. 
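In short, the idea is to replace a division by a loop-invariant divisor with a multiplication by a precomputed fixed-point reciprocal followed by shifts. The scalar sketch below demonstrates the unsigned round-up variant on 16-bit values and verifies it exhaustively; it is an editorial illustration of the standard Granlund-Montgomery technique, not code taken from this patch.

    #include <cstdint>
    #include <cassert>
    #include <cstdio>

    int main() {
        const uint16_t d = 7;                                   // invariant divisor (any non-zero value works)
        int L = 0;
        while ((1u << L) < d) L++;                              // L = ceil(log2(d))
        const uint32_t m = uint32_t((uint64_t(1) << (16 + L)) / d) + 1;   // rounded-up reciprocal

        for (uint32_t x = 0; x <= 0xFFFF; x++) {
            uint32_t q = uint32_t((uint64_t(x) * m) >> (16 + L));         // multiply high + shift
            assert(q == x / d);                                 // exact for every 16-bit numerator
        }
        std::printf("d = %u  multiplier = %u  shift = %d\n", unsigned(d), m, 16 + L);
        return 0;
    }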
-* -*****************************************************************************/ - -// vector operator / : divide each element by divisor - -// vector of 8 32-bit signed integers -static inline Vec8i operator / (Vec8i const & a, Divisor_i const & d) { - __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier - __m256i sgn = _mm256_broadcastq_epi64(d.getsign()); // broadcast sign of d - __m256i t1 = _mm256_mul_epi32(a,m); // 32x32->64 bit signed multiplication of even elements of a - __m256i t2 = _mm256_srli_epi64(t1,32); // high dword of even numbered results - __m256i t3 = _mm256_srli_epi64(a,32); // get odd elements of a into position for multiplication - __m256i t4 = _mm256_mul_epi32(t3,m); // 32x32->64 bit signed multiplication of odd elements - __m256i t5 = constant8i<0,-1,0,-1,0,-1,0,-1> (); // mask for odd elements - __m256i t7 = _mm256_blendv_epi8(t2,t4,t5); // blend two results - __m256i t8 = _mm256_add_epi32(t7,a); // add - __m256i t9 = _mm256_sra_epi32(t8,d.gets1()); // shift right artihmetic - __m256i t10 = _mm256_srai_epi32(a,31); // sign of a - __m256i t11 = _mm256_sub_epi32(t10,sgn); // sign of a - sign of d - __m256i t12 = _mm256_sub_epi32(t9,t11); // + 1 if a < 0, -1 if d < 0 - return _mm256_xor_si256(t12,sgn); // change sign if divisor negative -} - -// vector of 8 32-bit unsigned integers -static inline Vec8ui operator / (Vec8ui const & a, Divisor_ui const & d) { - __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier - __m256i t1 = _mm256_mul_epu32(a,m); // 32x32->64 bit unsigned multiplication of even elements of a - __m256i t2 = _mm256_srli_epi64(t1,32); // high dword of even numbered results - __m256i t3 = _mm256_srli_epi64(a,32); // get odd elements of a into position for multiplication - __m256i t4 = _mm256_mul_epu32(t3,m); // 32x32->64 bit unsigned multiplication of odd elements - __m256i t5 = constant8i<0,-1,0,-1,0,-1,0,-1> (); // mask for odd elements - __m256i t7 = _mm256_blendv_epi8(t2,t4,t5); // blend two results - __m256i t8 = _mm256_sub_epi32(a,t7); // subtract - __m256i t9 = _mm256_srl_epi32(t8,d.gets1()); // shift right logical - __m256i t10 = _mm256_add_epi32(t7,t9); // add - return _mm256_srl_epi32(t10,d.gets2()); // shift right logical -} - -// vector of 16 16-bit signed integers -static inline Vec16s operator / (Vec16s const & a, Divisor_s const & d) { - __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier - __m256i sgn = _mm256_broadcastq_epi64(d.getsign()); // broadcast sign of d - __m256i t1 = _mm256_mulhi_epi16(a, m); // multiply high signed words - __m256i t2 = _mm256_add_epi16(t1,a); // + a - __m256i t3 = _mm256_sra_epi16(t2,d.gets1()); // shift right artihmetic - __m256i t4 = _mm256_srai_epi16(a,15); // sign of a - __m256i t5 = _mm256_sub_epi16(t4,sgn); // sign of a - sign of d - __m256i t6 = _mm256_sub_epi16(t3,t5); // + 1 if a < 0, -1 if d < 0 - return _mm256_xor_si256(t6,sgn); // change sign if divisor negative -} - -// vector of 16 16-bit unsigned integers -static inline Vec16us operator / (Vec16us const & a, Divisor_us const & d) { - __m256i m = _mm256_broadcastq_epi64(d.getm()); // broadcast multiplier - __m256i t1 = _mm256_mulhi_epu16(a, m); // multiply high signed words - __m256i t2 = _mm256_sub_epi16(a,t1); // subtract - __m256i t3 = _mm256_srl_epi16(t2,d.gets1()); // shift right logical - __m256i t4 = _mm256_add_epi16(t1,t3); // add - return _mm256_srl_epi16(t4,d.gets2()); // shift right logical -} - -// vector of 32 8-bit signed integers -static inline Vec32c operator / (Vec32c 
const & a, Divisor_s const & d) { - // expand into two Vec16s - Vec16s low = extend_low(a) / d; - Vec16s high = extend_high(a) / d; - return compress(low,high); -} - -// vector of 32 8-bit unsigned integers -static inline Vec32uc operator / (Vec32uc const & a, Divisor_us const & d) { - // expand into two Vec16s - Vec16us low = extend_low(a) / d; - Vec16us high = extend_high(a) / d; - return compress(low,high); -} - -// vector operator /= : divide -static inline Vec8i & operator /= (Vec8i & a, Divisor_i const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec8ui & operator /= (Vec8ui & a, Divisor_ui const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec16s & operator /= (Vec16s & a, Divisor_s const & d) { - a = a / d; - return a; -} - - -// vector operator /= : divide -static inline Vec16us & operator /= (Vec16us & a, Divisor_us const & d) { - a = a / d; - return a; - -} - -// vector operator /= : divide -static inline Vec32c & operator /= (Vec32c & a, Divisor_s const & d) { - a = a / d; - return a; -} - -// vector operator /= : divide -static inline Vec32uc & operator /= (Vec32uc & a, Divisor_us const & d) { - a = a / d; - return a; -} - - -/***************************************************************************** -* -* Integer division 2: divisor is a compile-time constant -* -*****************************************************************************/ - -// Divide Vec8i by compile-time constant -template -static inline Vec8i divide_by_i(Vec8i const & x) { - Static_error_check<(d!=0)> Dividing_by_zero; // Error message if dividing by zero - if (d == 1) return x; - if (d == -1) return -x; - if (uint32_t(d) == 0x80000000u) return Vec8i(x == Vec8i(0x80000000)) & 1; // prevent overflow when changing sign - const uint32_t d1 = d > 0 ? uint32_t(d) : -uint32_t(d); // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits) - if ((d1 & (d1-1)) == 0) { - // d1 is a power of 2. use shift - const int k = bit_scan_reverse_const(d1); - __m256i sign; - if (k > 1) sign = _mm256_srai_epi32(x, k-1); else sign = x; // k copies of sign bit - __m256i bias = _mm256_srli_epi32(sign, 32-k); // bias = x >= 0 ? 0 : k-1 - __m256i xpbias = _mm256_add_epi32 (x, bias); // x + bias - __m256i q = _mm256_srai_epi32(xpbias, k); // (x + bias) >> k - if (d > 0) return q; // d > 0: return q - return _mm256_sub_epi32(_mm256_setzero_si256(), q); // d < 0: return -q - } - // general case - const int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1); // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case) - const int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32)); // multiplier - const Divisor_i div(mult, sh, d < 0 ? 
-1 : 0); - return x / div; -} - -// define Vec8i a / const_int(d) -template -static inline Vec8i operator / (Vec8i const & a, Const_int_t) { - return divide_by_i(a); -} - -// define Vec8i a / const_uint(d) -template -static inline Vec8i operator / (Vec8i const & a, Const_uint_t) { - Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned - return divide_by_i(a); // signed divide -} - -// vector operator /= : divide -template -static inline Vec8i & operator /= (Vec8i & a, Const_int_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec8i & operator /= (Vec8i & a, Const_uint_t b) { - a = a / b; - return a; -} - - -// Divide Vec8ui by compile-time constant -template -static inline Vec8ui divide_by_ui(Vec8ui const & x) { - Static_error_check<(d!=0)> Dividing_by_zero; // Error message if dividing by zero - if (d == 1) return x; // divide by 1 - const int b = bit_scan_reverse_const(d); // floor(log2(d)) - if ((uint32_t(d) & (uint32_t(d)-1)) == 0) { - // d is a power of 2. use shift - return _mm256_srli_epi32(x, b); // x >> b - } - // general case (d > 2) - uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d); // multiplier = 2^(32+b) / d - const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d - const bool round_down = (2*rem < d); // check if fraction is less than 0.5 - if (!round_down) { - mult = mult + 1; // round up mult - } - // do 32*32->64 bit unsigned multiplication and get high part of result - const __m256i multv = _mm256_set_epi32(0,mult,0,mult,0,mult,0,mult);// zero-extend mult and broadcast - __m256i t1 = _mm256_mul_epu32(x,multv); // 32x32->64 bit unsigned multiplication of x[0] and x[2] - if (round_down) { - t1 = _mm256_add_epi64(t1,multv); // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow - } - __m256i t2 = _mm256_srli_epi64(t1,32); // high dword of result 0 and 2 - __m256i t3 = _mm256_srli_epi64(x,32); // get x[1] and x[3] into position for multiplication - __m256i t4 = _mm256_mul_epu32(t3,multv); // 32x32->64 bit unsigned multiplication of x[1] and x[3] - if (round_down) { - t4 = _mm256_add_epi64(t4,multv); // compensate for rounding error. 
(x+1)*m replaced by x*m+m to avoid overflow - } - __m256i t5 = _mm256_set_epi32(-1,0,-1,0,-1,0,-1,0); // mask of dword 1 and 3 - __m256i t7 = _mm256_blendv_epi8(t2,t4,t5); // blend two results - Vec8ui q = _mm256_srli_epi32(t7, b); // shift right by b - return q; // no overflow possible -} - -// define Vec8ui a / const_uint(d) -template -static inline Vec8ui operator / (Vec8ui const & a, Const_uint_t) { - return divide_by_ui(a); -} - -// define Vec8ui a / const_int(d) -template -static inline Vec8ui operator / (Vec8ui const & a, Const_int_t) { - Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous - return divide_by_ui(a); // unsigned divide -} - -// vector operator /= : divide -template -static inline Vec8ui & operator /= (Vec8ui & a, Const_uint_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec8ui & operator /= (Vec8ui & a, Const_int_t b) { - a = a / b; - return a; -} - - -// Divide Vec16s by compile-time constant -template -static inline Vec16s divide_by_i(Vec16s const & x) { - const int16_t d0 = int16_t(d); // truncate d to 16 bits - Static_error_check<(d0 != 0)> Dividing_by_zero; // Error message if dividing by zero - if (d0 == 1) return x; // divide by 1 - if (d0 == -1) return -x; // divide by -1 - if (uint16_t(d0) == 0x8000u) return Vec16s(x == Vec16s(0x8000)) & 1;// prevent overflow when changing sign - const uint16_t d1 = d0 > 0 ? d0 : -d0; // compile-time abs(d0) - if ((d1 & (d1-1)) == 0) { - // d is a power of 2. use shift - const int k = bit_scan_reverse_const(uint32_t(d1)); - __m256i sign; - if (k > 1) sign = _mm256_srai_epi16(x, k-1); else sign = x; // k copies of sign bit - __m256i bias = _mm256_srli_epi16(sign, 16-k); // bias = x >= 0 ? 0 : k-1 - __m256i xpbias = _mm256_add_epi16 (x, bias); // x + bias - __m256i q = _mm256_srai_epi16(xpbias, k); // (x + bias) >> k - if (d0 > 0) return q; // d0 > 0: return q - return _mm256_sub_epi16(_mm256_setzero_si256(), q); // d0 < 0: return -q - } - // general case - const int L = bit_scan_reverse_const(uint16_t(d1-1)) + 1; // ceil(log2(d)). (d < 2 handled above) - const int16_t mult = int16_t(1 + (1u << (15+L)) / uint32_t(d1) - 0x10000);// multiplier - const int shift1 = L - 1; - const Divisor_s div(mult, shift1, d0 > 0 ? 0 : -1); - return x / div; -} - -// define Vec16s a / const_int(d) -template -static inline Vec16s operator / (Vec16s const & a, Const_int_t) { - return divide_by_i(a); -} - -// define Vec16s a / const_uint(d) -template -static inline Vec16s operator / (Vec16s const & a, Const_uint_t) { - Static_error_check< (d<0x8000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned - return divide_by_i(a); // signed divide -} - -// vector operator /= : divide -template -static inline Vec16s & operator /= (Vec16s & a, Const_int_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec16s & operator /= (Vec16s & a, Const_uint_t b) { - a = a / b; - return a; -} - - -// Divide Vec16us by compile-time constant -template -static inline Vec16us divide_by_ui(Vec16us const & x) { - const uint16_t d0 = uint16_t(d); // truncate d to 16 bits - Static_error_check<(d0 != 0)> Dividing_by_zero; // Error message if dividing by zero - if (d0 == 1) return x; // divide by 1 - const int b = bit_scan_reverse_const(d0); // floor(log2(d)) - if ((d0 & (d0-1)) == 0) { - // d is a power of 2. 
use shift - return _mm256_srli_epi16(x, b); // x >> b - } - // general case (d > 2) - uint16_t mult = uint16_t((uint32_t(1) << (b+16)) / d0); // multiplier = 2^(32+b) / d - const uint32_t rem = (uint32_t(1) << (b+16)) - uint32_t(d0)*mult;// remainder 2^(32+b) % d - const bool round_down = (2*rem < d0); // check if fraction is less than 0.5 - Vec16us x1 = x; - if (round_down) { - x1 = x1 + 1; // round down mult and compensate by adding 1 to x - } - else { - mult = mult + 1; // round up mult. no compensation needed - } - const __m256i multv = _mm256_set1_epi16(mult); // broadcast mult - __m256i xm = _mm256_mulhi_epu16(x1, multv); // high part of 16x16->32 bit unsigned multiplication - Vec16us q = _mm256_srli_epi16(xm, b); // shift right by b - if (round_down) { - Vec16sb overfl = (x1 == Vec16us(_mm256_setzero_si256())); // check for overflow of x+1 - return select(overfl, Vec16us(mult >> b), q); // deal with overflow (rarely needed) - } - else { - return q; // no overflow possible - } -} - -// define Vec16us a / const_uint(d) -template -static inline Vec16us operator / (Vec16us const & a, Const_uint_t) { - return divide_by_ui(a); -} - -// define Vec16us a / const_int(d) -template -static inline Vec16us operator / (Vec16us const & a, Const_int_t) { - Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous - return divide_by_ui(a); // unsigned divide -} - -// vector operator /= : divide -template -static inline Vec16us & operator /= (Vec16us & a, Const_uint_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec16us & operator /= (Vec16us & a, Const_int_t b) { - a = a / b; - return a; -} - - -// define Vec32c a / const_int(d) -template -static inline Vec32c operator / (Vec32c const & a, Const_int_t) { - // expand into two Vec16s - Vec16s low = extend_low(a) / Const_int_t(); - Vec16s high = extend_high(a) / Const_int_t(); - return compress(low,high); -} - -// define Vec32c a / const_uint(d) -template -static inline Vec32c operator / (Vec32c const & a, Const_uint_t) { - Static_error_check< (uint8_t(d)<0x80u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned - return a / Const_int_t(); // signed divide -} - -// vector operator /= : divide -template -static inline Vec32c & operator /= (Vec32c & a, Const_int_t b) { - a = a / b; - return a; -} -// vector operator /= : divide -template -static inline Vec32c & operator /= (Vec32c & a, Const_uint_t b) { - a = a / b; - return a; -} - -// define Vec32uc a / const_uint(d) -template -static inline Vec32uc operator / (Vec32uc const & a, Const_uint_t) { - // expand into two Vec16us - Vec16us low = extend_low(a) / Const_uint_t(); - Vec16us high = extend_high(a) / Const_uint_t(); - return compress(low,high); -} - -// define Vec32uc a / const_int(d) -template -static inline Vec32uc operator / (Vec32uc const & a, Const_int_t) { - Static_error_check< (int8_t(d)>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous - return a / Const_uint_t(); // unsigned divide -} - -// vector operator /= : divide -template -static inline Vec32uc & operator /= (Vec32uc & a, Const_uint_t b) { - a = a / b; - return a; -} - -// vector operator /= : divide -template -static inline Vec32uc & operator /= (Vec32uc & a, Const_int_t b) { - a = a / b; - return a; -} - -/***************************************************************************** -* -* Horizontal scan functions -* 
-*****************************************************************************/ - -// Get index to the first element that is true. Return -1 if all are false -static inline int horizontal_find_first(Vec32cb const & x) { - uint32_t a = _mm256_movemask_epi8(x); - if (a == 0) return -1; - int32_t b = bit_scan_forward(a); - return b; -} - -static inline int horizontal_find_first(Vec16sb const & x) { - return horizontal_find_first(Vec32cb(x)) >> 1; -} - -static inline int horizontal_find_first(Vec8ib const & x) { - return horizontal_find_first(Vec32cb(x)) >> 2; -} - -static inline int horizontal_find_first(Vec4qb const & x) { - return horizontal_find_first(Vec32cb(x)) >> 3; -} - -// Count the number of elements that are true -static inline uint32_t horizontal_count(Vec32cb const & x) { - uint32_t a = _mm256_movemask_epi8(x); - return vml_popcnt(a); -} - -static inline uint32_t horizontal_count(Vec16sb const & x) { - return horizontal_count(Vec32cb(x)) >> 1; -} - -static inline uint32_t horizontal_count(Vec8ib const & x) { - return horizontal_count(Vec32cb(x)) >> 2; -} - -static inline uint32_t horizontal_count(Vec4qb const & x) { - return horizontal_count(Vec32cb(x)) >> 3; -} - -/***************************************************************************** -* -* Boolean <-> bitfield conversion functions -* -*****************************************************************************/ - -// to_bits: convert boolean vector to integer bitfield -static inline uint32_t to_bits(Vec32cb const & x) { - return (uint32_t)_mm256_movemask_epi8(x); -} - -// to_Vec16c: convert integer bitfield to boolean vector -static inline Vec32cb to_Vec32cb(uint32_t x) { - return Vec32cb(Vec32c(to_Vec16cb(uint16_t(x)), to_Vec16cb(uint16_t(x>>16)))); -} - -// to_bits: convert boolean vector to integer bitfield -static inline uint16_t to_bits(Vec16sb const & x) { - __m128i a = _mm_packs_epi16(x.get_low(), x.get_high()); // 16-bit words to bytes - return (uint16_t)_mm_movemask_epi8(a); -} - -// to_Vec16sb: convert integer bitfield to boolean vector -static inline Vec16sb to_Vec16sb(uint16_t x) { - return Vec16sb(Vec16s(to_Vec8sb(uint8_t(x)), to_Vec8sb(uint8_t(x>>8)))); -} - -#if INSTRSET < 9 || MAX_VECTOR_SIZE < 512 -// These functions are defined in Vectori512.h if AVX512 instruction set is used - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec8ib const & x) { - __m128i a = _mm_packs_epi32(x.get_low(), x.get_high()); // 32-bit dwords to 16-bit words - __m128i b = _mm_packs_epi16(a, a); // 16-bit words to bytes - return (uint8_t)_mm_movemask_epi8(b); -} - -// to_Vec8ib: convert integer bitfield to boolean vector -static inline Vec8ib to_Vec8ib(uint8_t x) { - return Vec8ib(Vec8i(to_Vec4ib(x), to_Vec4ib(x>>4))); -} - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4qb const & x) { - uint32_t a = _mm256_movemask_epi8(x); - return ((a & 1) | ((a >> 7) & 2)) | (((a >> 14) & 4) | ((a >> 21) & 8)); -} - -// to_Vec4qb: convert integer bitfield to boolean vector -static inline Vec4qb to_Vec4qb(uint8_t x) { - return Vec4qb(Vec4q(-(x&1), -((x>>1)&1), -((x>>2)&1), -((x>>3)&1))); -} - -#else // function prototypes here only - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec8ib x); - -// to_Vec8ib: convert integer bitfield to boolean vector -static inline Vec8ib to_Vec8ib(uint8_t x); - -// to_bits: convert boolean vector to integer bitfield -static inline uint8_t to_bits(Vec4qb x); - -// to_Vec4qb: convert 
integer bitfield to boolean vector -static inline Vec4qb to_Vec4qb(uint8_t x); - -#endif // INSTRSET < 9 || MAX_VECTOR_SIZE < 512 - -#ifdef VCL_NAMESPACE -} -#endif - -#endif // VECTORI256_H diff --git a/LICENSE b/LICENSE index d159169..f288702 100644 --- a/LICENSE +++ b/LICENSE @@ -1,281 +1,622 @@ GNU GENERAL PUBLIC LICENSE - Version 2, June 1991 + Version 3, 29 June 2007 - Copyright (C) 1989, 1991 Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Copyright (C) 2007 Free Software Foundation, Inc. Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed. Preamble - The licenses for most software are designed to take away your -freedom to share and change it. By contrast, the GNU General Public -License is intended to guarantee your freedom to share and change free -software--to make sure the software is free for all its users. This -General Public License applies to most of the Free Software -Foundation's software and to any other program whose authors commit to -using it. (Some other Free Software Foundation software is covered by -the GNU Lesser General Public License instead.) You can apply it to + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to your programs, too. When we speak of free software, we are referring to freedom, not price. Our General Public Licenses are designed to make sure that you have the freedom to distribute copies of free software (and charge for -this service if you wish), that you receive source code or can get it -if you want it, that you can change the software or use pieces of it -in new free programs; and that you know you can do these things. +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. - To protect your rights, we need to make restrictions that forbid -anyone to deny you these rights or to ask you to surrender the rights. -These restrictions translate to certain responsibilities for you if you -distribute copies of the software, or if you modify it. + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. For example, if you distribute copies of such a program, whether -gratis or for a fee, you must give the recipients all the rights that -you have. You must make sure that they, too, receive or can get the -source code. And you must show them these terms so they know their -rights. - - We protect your rights with two steps: (1) copyright the software, and -(2) offer you this license which gives you legal permission to copy, -distribute and/or modify the software. 
- - Also, for each author's protection and ours, we want to make certain -that everyone understands that there is no warranty for this free -software. If the software is modified by someone else and passed on, we -want its recipients to know that what they have is not the original, so -that any problems introduced by others will not reflect on the original -authors' reputations. - - Finally, any free program is threatened constantly by software -patents. We wish to avoid the danger that redistributors of a free -program will individually obtain patent licenses, in effect making the -program proprietary. To prevent this, we have made it clear that any -patent must be licensed for everyone's free use or not licensed at all. +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. The precise terms and conditions for copying, distribution and modification follow. - GNU GENERAL PUBLIC LICENSE - TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION - - 0. This License applies to any program or other work which contains -a notice placed by the copyright holder saying it may be distributed -under the terms of this General Public License. The "Program", below, -refers to any such program or work, and a "work based on the Program" -means either the Program or any derivative work under copyright law: -that is to say, a work containing the Program or a portion of it, -either verbatim or with modifications and/or translated into another -language. (Hereinafter, translation is included without limitation in -the term "modification".) Each licensee is addressed as "you". - -Activities other than copying, distribution and modification are not -covered by this License; they are outside its scope. 
The act of -running the Program is not restricted, and the output from the Program -is covered only if its contents constitute a work based on the -Program (independent of having been made by running the Program). -Whether that is true depends on what the Program does. - - 1. You may copy and distribute verbatim copies of the Program's -source code as you receive it, in any medium, provided that you -conspicuously and appropriately publish on each copy an appropriate -copyright notice and disclaimer of warranty; keep intact all the -notices that refer to this License and to the absence of any warranty; -and give any other recipients of the Program a copy of this License -along with the Program. - -You may charge a fee for the physical act of transferring a copy, and -you may at your option offer warranty protection in exchange for a fee. - - 2. You may modify your copy or copies of the Program or any portion -of it, thus forming a work based on the Program, and copy and -distribute such modifications or work under the terms of Section 1 -above, provided that you also meet all of these conditions: - - a) You must cause the modified files to carry prominent notices - stating that you changed the files and the date of any change. - - b) You must cause any work that you distribute or publish, that in - whole or in part contains or is derived from the Program or any - part thereof, to be licensed as a whole at no charge to all third - parties under the terms of this License. - - c) If the modified program normally reads commands interactively - when run, you must cause it, when started running for such - interactive use in the most ordinary way, to print or display an - announcement including an appropriate copyright notice and a - notice that there is no warranty (or else, saying that you provide - a warranty) and that users may redistribute the program under - these conditions, and telling the user how to view a copy of this - License. (Exception: if the Program itself is interactive but - does not normally print such an announcement, your work based on - the Program is not required to print an announcement.) - -These requirements apply to the modified work as a whole. If -identifiable sections of that work are not derived from the Program, -and can be reasonably considered independent and separate works in -themselves, then this License, and its terms, do not apply to those -sections when you distribute them as separate works. But when you -distribute the same sections as part of a whole which is a work based -on the Program, the distribution of the whole must be on the terms of -this License, whose permissions for other licensees extend to the -entire whole, and thus to each and every part regardless of who wrote it. - -Thus, it is not the intent of this section to claim rights or contest -your rights to work written entirely by you; rather, the intent is to -exercise the right to control the distribution of derivative or -collective works based on the Program. - -In addition, mere aggregation of another work not based on the Program -with the Program (or with a work based on the Program) on a volume of -a storage or distribution medium does not bring the other work under -the scope of this License. - - 3. 
You may copy and distribute the Program (or a work based on it, -under Section 2) in object code or executable form under the terms of -Sections 1 and 2 above provided that you also do one of the following: - - a) Accompany it with the complete corresponding machine-readable - source code, which must be distributed under the terms of Sections - 1 and 2 above on a medium customarily used for software interchange; or, - - b) Accompany it with a written offer, valid for at least three - years, to give any third party, for a charge no more than your - cost of physically performing source distribution, a complete - machine-readable copy of the corresponding source code, to be - distributed under the terms of Sections 1 and 2 above on a medium - customarily used for software interchange; or, - - c) Accompany it with the information you received as to the offer - to distribute corresponding source code. (This alternative is - allowed only for noncommercial distribution and only if you - received the program in object code or executable form with such - an offer, in accord with Subsection b above.) - -The source code for a work means the preferred form of the work for -making modifications to it. For an executable work, complete source -code means all the source code for all modules it contains, plus any -associated interface definition files, plus the scripts used to -control compilation and installation of the executable. However, as a -special exception, the source code distributed need not include -anything that is normally distributed (in either source or binary -form) with the major components (compiler, kernel, and so on) of the -operating system on which the executable runs, unless that component -itself accompanies the executable. - -If distribution of executable or object code is made by offering -access to copy from a designated place, then offering equivalent -access to copy the source code from the same place counts as -distribution of the source code, even though third parties are not -compelled to copy the source along with the object code. - - 4. You may not copy, modify, sublicense, or distribute the Program -except as expressly provided under this License. Any attempt -otherwise to copy, modify, sublicense or distribute the Program is -void, and will automatically terminate your rights under this License. -However, parties who have received copies, or rights, from you under -this License will not have their licenses terminated so long as such -parties remain in full compliance. - - 5. You are not required to accept this License, since you have not -signed it. However, nothing else grants you permission to modify or -distribute the Program or its derivative works. These actions are -prohibited by law if you do not accept this License. Therefore, by -modifying or distributing the Program (or any work based on the -Program), you indicate your acceptance of this License to do so, and -all its terms and conditions for copying, distributing or modifying -the Program or works based on it. - - 6. Each time you redistribute the Program (or any work based on the -Program), the recipient automatically receives a license from the -original licensor to copy, distribute or modify the Program subject to -these terms and conditions. You may not impose any further -restrictions on the recipients' exercise of the rights granted herein. -You are not responsible for enforcing compliance by third parties to + TERMS AND CONDITIONS + + 0. Definitions. 
+ + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. 
For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. 
+ + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. 
You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. 
+ + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. 
+ + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of this License. - 7. If, as a consequence of a court judgment or allegation of patent -infringement or for any other reason (not limited to patent issues), -conditions are imposed on you (whether by court order, agreement or + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot -distribute so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you -may not distribute the Program at all. For example, if a patent -license would not permit royalty-free redistribution of the Program by -all those who receive copies directly or indirectly through you, then -the only way you could satisfy both it and this License would be to -refrain entirely from distribution of the Program. - -If any portion of this section is held invalid or unenforceable under -any particular circumstance, the balance of the section is intended to -apply and the section as a whole is intended to apply in other -circumstances. - -It is not the purpose of this section to induce you to infringe any -patents or other property right claims or to contest validity of any -such claims; this section has the sole purpose of protecting the -integrity of the free software distribution system, which is -implemented by public license practices. Many people have made -generous contributions to the wide range of software distributed -through that system in reliance on consistent application of that -system; it is up to the author/donor to decide if he or she is willing -to distribute software through any other system and a licensee cannot -impose that choice. - -This section is intended to make thoroughly clear what is believed to -be a consequence of the rest of this License. - - 8. If the distribution and/or use of the Program is restricted in -certain countries either by patents or by copyrighted interfaces, the -original copyright holder who places the Program under this License -may add an explicit geographical distribution limitation excluding -those countries, so that distribution is permitted only in or among -countries not thus excluded. In such case, this License incorporates -the limitation as if written in the body of this License. - - 9. The Free Software Foundation may publish revised and/or new versions -of the General Public License from time to time. Such new versions will +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. 
For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will be similar in spirit to the present version, but may differ in detail to address new problems or concerns. -Each version is given a distinguishing version number. If the Program -specifies a version number of this License which applies to it and "any -later version", you have the option of following the terms and conditions -either of that version or of any later version published by the Free -Software Foundation. If the Program does not specify a version number of -this License, you may choose any version ever published by the Free Software -Foundation. - - 10. If you wish to incorporate parts of the Program into other free -programs whose distribution conditions are different, write to the author -to ask for permission. For software which is copyrighted by the Free -Software Foundation, write to the Free Software Foundation; we sometimes -make exceptions for this. Our decision will be guided by the two goals -of preserving the free status of all derivatives of our free software and -of promoting the sharing and reuse of software generally. - - NO WARRANTY - - 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY -FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN -OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES -PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED -OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS -TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE -PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, -REPAIR OR CORRECTION. - - 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR -REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, -INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING -OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED -TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY -YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER -PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE -POSSIBILITY OF SUCH DAMAGES. + Each version is given a distinguishing version number. 
If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. END OF TERMS AND CONDITIONS @@ -287,15 +628,15 @@ free software which everyone can redistribute and change under these terms. To do so, attach the following notices to the program. It is safest to attach them to the start of each source file to most effectively -convey the exclusion of warranty; and each file should have at least +state the exclusion of warranty; and each file should have at least the "copyright" line and a pointer to where the full notice is found. Copyright (C) - This program is free software; you can redistribute it and/or modify + This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2 of the License, or + the Free Software Foundation, either version 3 of the License, or (at your option) any later version. 
This program is distributed in the hope that it will be useful, @@ -303,37 +644,31 @@ the "copyright" line and a pointer to where the full notice is found. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. - You should have received a copy of the GNU General Public License along - with this program; if not, write to the Free Software Foundation, Inc., - 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + You should have received a copy of the GNU General Public License + along with this program. If not, see . Also add information on how to contact you by electronic and paper mail. -If the program is interactive, make it output a short notice like this -when it starts in an interactive mode: + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: - Gnomovision version 69, Copyright (C) year name of author - Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. This is free software, and you are welcome to redistribute it under certain conditions; type `show c' for details. The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, the commands you use may -be called something other than `show w' and `show c'; they could even be -mouse-clicks or menu items--whatever suits your program. - -You should also get your employer (if you work as a programmer) or your -school, if any, to sign a "copyright disclaimer" for the program, if -necessary. Here is a sample; alter the names: - - Yoyodyne, Inc., hereby disclaims all copyright interest in the program - `Gnomovision' (which makes passes at compilers) written by James Hacker. - - , 1 April 1989 - Ty Coon, President of Vice - -This General Public License does not permit incorporating your program into -proprietary programs. If your program is a subroutine library, you may -consider it more useful to permit linking proprietary applications with the -library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. 
diff --git a/README.md b/README.md
index 81758c6..62080cd 100644
--- a/README.md
+++ b/README.md
@@ -303,6 +303,7 @@ opt -
     1 - use c
     2 - use sse2
     3 - use avx2
+    4 - use avx512
 ```
diff --git a/meson.build b/meson.build
index aa2b782..3cb0aa4 100644
--- a/meson.build
+++ b/meson.build
@@ -1,51 +1,65 @@
 project('DFTTest', 'cpp',
-    default_options : ['buildtype=release', 'b_ndebug=if-release', 'cpp_std=c++14'],
-    meson_version : '>=0.48.0',
-    version : '6'
+    default_options: ['buildtype=release', 'b_ndebug=if-release', 'cpp_std=c++17'],
+    meson_version: '>=0.48.0',
+    version: '7'
 )
 
-add_project_arguments('-ffast-math', language : 'cpp')
-
 sources = [
     'DFTTest/DFTTest.cpp',
-    'DFTTest/DFTTest.hpp',
-    'DFTTest/vectorclass/instrset.h',
-    'DFTTest/vectorclass/instrset_detect.cpp'
+    'DFTTest/DFTTest.h'
 ]
 
-vapoursynth_dep = dependency('vapoursynth').partial_dependency(compile_args : true, includes : true)
+vapoursynth_dep = dependency('vapoursynth').partial_dependency(compile_args: true, includes: true)
 fftw3f_dep = dependency('fftw3f')
 
 libs = []
 
 if host_machine.cpu_family().startswith('x86')
-    add_project_arguments('-DVS_TARGET_CPU_X86', '-mfpmath=sse', '-msse2', language : 'cpp')
+    add_project_arguments('-fno-math-errno', '-fno-trapping-math', '-DDFTTEST_X86', '-mfpmath=sse', '-msse2', language: 'cpp')
 
     sources += [
         'DFTTest/DFTTest_SSE2.cpp',
-        'DFTTest/vectorclass/vectorclass.h',
-        'DFTTest/vectorclass/vectorf128.h',
-        'DFTTest/vectorclass/vectorf256.h',
-        'DFTTest/vectorclass/vectorf256e.h',
-        'DFTTest/vectorclass/vectori128.h',
-        'DFTTest/vectorclass/vectori256.h',
-        'DFTTest/vectorclass/vectori256e.h',
-        'DFTTest/vectorclass/vectormath_common.h',
-        'DFTTest/vectorclass/vectormath_exp.h'
+        'DFTTest/VCL2/instrset.h',
+        'DFTTest/VCL2/instrset_detect.cpp',
+        'DFTTest/VCL2/vector_convert.h',
+        'DFTTest/VCL2/vectorclass.h',
+        'DFTTest/VCL2/vectorf128.h',
+        'DFTTest/VCL2/vectorf256.h',
+        'DFTTest/VCL2/vectorf256e.h',
+        'DFTTest/VCL2/vectorf512.h',
+        'DFTTest/VCL2/vectorf512e.h',
+        'DFTTest/VCL2/vectori128.h',
+        'DFTTest/VCL2/vectori256.h',
+        'DFTTest/VCL2/vectori256e.h',
+        'DFTTest/VCL2/vectori512.h',
+        'DFTTest/VCL2/vectori512e.h',
+        'DFTTest/VCL2/vectori512s.h',
+        'DFTTest/VCL2/vectori512se.h',
+        'DFTTest/VCL2/vectormath_common.h',
+        'DFTTest/VCL2/vectormath_exp.h',
+        'DFTTest/VCL2/vectormath_hyp.h',
+        'DFTTest/VCL2/vectormath_lib.h',
+        'DFTTest/VCL2/vectormath_trig.h'
     ]
 
     libs += static_library('avx2', 'DFTTest/DFTTest_AVX2.cpp',
-        dependencies : [vapoursynth_dep, fftw3f_dep],
-        cpp_args : ['-mavx2', '-mfma'],
-        gnu_symbol_visibility : 'hidden'
+        dependencies: [vapoursynth_dep, fftw3f_dep],
+        cpp_args: ['-mavx2', '-mfma'],
+        gnu_symbol_visibility: 'hidden'
+    )
+
+    libs += static_library('avx512', 'DFTTest/DFTTest_AVX512.cpp',
+        dependencies: [vapoursynth_dep, fftw3f_dep],
+        cpp_args: ['-mavx512f', '-mavx512vl', '-mavx512bw', '-mavx512dq', '-mfma'],
+        gnu_symbol_visibility: 'hidden'
     )
 endif
 
 shared_module('dfttest', sources,
-    dependencies : [vapoursynth_dep, fftw3f_dep],
-    link_with : libs,
-    install : true,
-    install_dir : join_paths(vapoursynth_dep.get_pkgconfig_variable('libdir'), 'vapoursynth'),
-    gnu_symbol_visibility : 'hidden'
+    dependencies: [vapoursynth_dep, fftw3f_dep],
+    link_with: libs,
+    install: true,
+    install_dir: join_paths(vapoursynth_dep.get_pkgconfig_variable('libdir'), 'vapoursynth'),
+    gnu_symbol_visibility: 'hidden'
 )
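
The README hunk above documents the new opt value for the AVX-512 code path, and the meson.build hunk builds DFTTest_AVX512.cpp as its own static library with the -mavx512* flags so the rest of the plugin keeps its SSE2 baseline. As a rough illustration only (not part of the patch; the clip, sigma and format below are placeholder values), a VapourSynth script could request that path explicitly:

    import vapoursynth as vs

    core = vs.core
    # Placeholder input clip; any format supported by the plugin works here.
    clip = core.std.BlankClip(format=vs.YUV420P8, width=1280, height=720, length=10)
    # opt=4 requests the AVX-512 path (per the README hunk above);
    # omitting opt lets the plugin auto-detect the best available path.
    denoised = core.dfttest.DFTTest(clip, sigma=8.0, opt=4)
    denoised.set_output()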