diff --git a/DFTTest/DFTTest.cpp b/DFTTest/DFTTest.cpp
index 51d7bc2..c9e9b98 100644
--- a/DFTTest/DFTTest.cpp
+++ b/DFTTest/DFTTest.cpp
@@ -7,9 +7,9 @@
**
** Copyright (C) 2007-2010 Kevin Stone
**
-** This program is free software; you can redistribute it and/or modify
+** This program is free software: you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
-** the Free Software Foundation; either version 2 of the License, or
+** the Free Software Foundation, either version 3 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
@@ -18,8 +18,7 @@
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
-** along with this program; if not, write to the Free Software
-** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+** along with this program. If not, see .
*/
#define _USE_MATH_DEFINES
@@ -27,249 +26,73 @@
#include
#include
-#include
#include
-#include "DFTTest.hpp"
+#include "DFTTest.h"
-#ifdef VS_TARGET_CPU_X86
-#include "vectorclass/vectorclass.h"
+using namespace std::literals;
-template extern void filter_sse2(float *, const float *, const int, const float *, const float *, const float *) noexcept;
-template extern void filter_avx2(float *, const float *, const int, const float *, const float *, const float *) noexcept;
+#ifdef DFTTEST_X86
+#include "VCL2/vectorclass.h"
-template extern void func_0_sse2(VSFrameRef * [3], VSFrameRef *, const DFTTestData *, const VSAPI *) noexcept;
-template extern void func_0_avx2(VSFrameRef * [3], VSFrameRef *, const DFTTestData *, const VSAPI *) noexcept;
+template extern void filter_sse2(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
+template extern void filter_avx2(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
+template extern void filter_avx512(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
-template extern void func_1_sse2(VSFrameRef * [15][3], VSFrameRef *, const int, const DFTTestData *, const VSAPI *) noexcept;
-template extern void func_1_avx2(VSFrameRef * [15][3], VSFrameRef *, const int, const DFTTestData *, const VSAPI *) noexcept;
+template extern void func_0_sse2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template extern void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template extern void func_0_avx512(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+
+template extern void func_1_sse2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template extern void func_1_avx2(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+template extern void func_1_avx512(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
#endif
#define EXTRA(a,b) (((a) % (b)) ? ((b) - ((a) % (b))) : 0)
-struct NPInfo {
- int fn, b, y, x;
-};
-
-static double besselI0(double p) noexcept {
- p /= 2.;
- double n = 1., t = 1., d = 1.;
- int k = 1;
- double v;
-
- do {
- n *= p;
- d *= k;
- v = n / d;
- t += v * v;
- } while (++k < 15 && v > 1e-8);
-
- return t;
-}
-
-static double getWinValue(const double n, const double size, const int win, const double beta) noexcept {
- switch (win) {
- case 0: // hanning
- return 0.5 - 0.5 * std::cos(2. * M_PI * n / size);
- case 1: // hamming
- return 0.53836 - 0.46164 * std::cos(2. * M_PI * n / size);
- case 2: // blackman
- return 0.42 - 0.5 * std::cos(2. * M_PI * n / size) + 0.08 * std::cos(4. * M_PI * n / size);
- case 3: // 4 term blackman-harris
- return 0.35875 - 0.48829 * std::cos(2. * M_PI * n / size) + 0.14128 * std::cos(4. * M_PI * n / size) - 0.01168 * std::cos(6. * M_PI * n / size);
- case 4: // kaiser-bessel
- {
- const double v = 2. * n / size - 1.;
- return besselI0(M_PI * beta * std::sqrt(1. - v * v)) / besselI0(M_PI * beta);
- }
- case 5: // 7 term blackman-harris
- return 0.27105140069342415 -
- 0.433297939234486060 * std::cos(2. * M_PI * n / size) +
- 0.218122999543110620 * std::cos(4. * M_PI * n / size) -
- 0.065925446388030898 * std::cos(6. * M_PI * n / size) +
- 0.010811742098372268 * std::cos(8. * M_PI * n / size) -
- 7.7658482522509342E-4 * std::cos(10. * M_PI * n / size) +
- 1.3887217350903198E-5 * std::cos(12. * M_PI * n / size);
- case 6: // flat top
- return 0.2810639 - 0.5208972 * std::cos(2. * M_PI * n / size) + 0.1980399 * std::cos(4. * M_PI * n / size);
- case 7: // rectangular
- return 1.;
- case 8: // Bartlett
- return 2. / size * (size / 2. - std::abs(n - size / 2.));
- case 9: // Bartlett-Hann
- return 0.62 - 0.48 * (n / size - 0.5) - 0.38 * std::cos(2. * M_PI * n / size);
- case 10: // Nuttall
- return 0.355768 - 0.487396 * std::cos(2. * M_PI * n / size) + 0.144232 * std::cos(4. * M_PI * n / size) - 0.012604 * std::cos(6. * M_PI * n / size);
- case 11: // Blackman-Nuttall
- return 0.3635819 - 0.4891775 * std::cos(2. * M_PI * n / size) + 0.1365995 * std::cos(4. * M_PI * n / size) - 0.0106411 * std::cos(6. * M_PI * n / size);
- default:
- return 0.;
- }
-}
-
-static void normalizeForOverlapAdd(double * VS_RESTRICT hw, const int bsize, const int osize) noexcept {
- double * VS_RESTRICT nw = new double[bsize]();
- const int inc = bsize - osize;
-
- for (int q = 0; q < bsize; q++) {
- for (int h = q; h >= 0; h -= inc)
- nw[q] += hw[h] * hw[h];
- for (int h = q + inc; h < bsize; h += inc)
- nw[q] += hw[h] * hw[h];
- }
-
- for (int q = 0; q < bsize; q++)
- hw[q] /= std::sqrt(nw[q]);
-
- delete[] nw;
-}
-
-static void createWindow(float * VS_RESTRICT hw, const int tmode, const int smode, const DFTTestData * d) noexcept {
- double * VS_RESTRICT tw = new double[d->tbsize];
- for (int j = 0; j < d->tbsize; j++)
- tw[j] = getWinValue(j + 0.5, d->tbsize, d->twin, d->tbeta);
- if (tmode == 1)
- normalizeForOverlapAdd(tw, d->tbsize, d->tosize);
-
- double * VS_RESTRICT sw = new double[d->sbsize];
- for (int j = 0; j < d->sbsize; j++)
- sw[j] = getWinValue(j + 0.5, d->sbsize, d->swin, d->sbeta);
- if (smode == 1)
- normalizeForOverlapAdd(sw, d->sbsize, d->sosize);
-
- const double nscale = 1. / std::sqrt(d->bvolume);
- for (int j = 0; j < d->tbsize; j++)
- for (int k = 0; k < d->sbsize; k++)
- for (int q = 0; q < d->sbsize; q++)
- hw[(j * d->sbsize + k) * d->sbsize + q] = static_cast(tw[j] * sw[k] * sw[q] * nscale);
-
- delete[] tw;
- delete[] sw;
+template
+static auto getArg(const VSAPI * vsapi, const VSMap * map, const char * key, const arg_t defaultValue) noexcept {
+ arg_t arg{};
+ int err{};
+
+ if constexpr (std::is_same_v)
+ arg = !!vsapi->propGetInt(map, key, 0, &err);
+ else if constexpr (std::is_same_v)
+ arg = int64ToIntS(vsapi->propGetInt(map, key, 0, &err));
+ else if constexpr (std::is_same_v)
+ arg = vsapi->propGetInt(map, key, 0, &err);
+ else if constexpr (std::is_same_v)
+ arg = static_cast(vsapi->propGetFloat(map, key, 0, &err));
+ else if constexpr (std::is_same_v)
+ arg = vsapi->propGetFloat(map, key, 0, &err);
+
+ if (err)
+ arg = defaultValue;
+
+ return arg;
}
-static float * parseSigmaLocation(const double * s, const int num, int & poscnt, const float sigma, const float pfact) {
- float * parray = nullptr;
-
- if (!s) {
- parray = new float[4];
- parray[0] = 0.0f;
- parray[2] = 1.0f;
- parray[1] = parray[3] = std::pow(sigma, pfact);
- poscnt = 2;
- } else {
- const double * sT = s;
- bool found[2] = { false, false };
- poscnt = 0;
-
- for (int i = 0; i < num; i += 2) {
- const float pos = static_cast(sT[i]);
-
- if (pos < 0.0f || pos > 1.0f)
- throw std::string{ "sigma location - invalid pos (" } + std::to_string(pos) + ")";
-
- if (pos == 0.0f)
- found[0] = true;
- else if (pos == 1.0f)
- found[1] = true;
-
- poscnt++;
- }
-
- if (!found[0] || !found[1])
- throw std::string{ "sigma location - one or more end points not provided" };
-
- parray = new float[poscnt * 2];
- sT = s;
- poscnt = 0;
-
- for (int i = 0; i < num; i += 2) {
- parray[poscnt * 2 + 0] = static_cast(sT[i + 0]);
- parray[poscnt * 2 + 1] = std::pow(static_cast(sT[i + 1]), pfact);
-
- poscnt++;
- }
-
- for (int i = 1; i < poscnt; i++) {
- int j = i;
- const float t0 = parray[j * 2 + 0];
- const float t1 = parray[j * 2 + 1];
-
- while (j > 0 && parray[(j - 1) * 2] > t0) {
- parray[j * 2 + 0] = parray[(j - 1) * 2 + 0];
- parray[j * 2 + 1] = parray[(j - 1) * 2 + 1];
- j--;
- }
-
- parray[j * 2 + 0] = t0;
- parray[j * 2 + 1] = t1;
- }
- }
-
- return parray;
-}
-
-static float interp(const float pf, const float * pv, const int cnt) noexcept {
- int lidx = 0;
- for (int i = cnt - 1; i >= 0; i--) {
- if (pv[i * 2] <= pf) {
- lidx = i;
- break;
- }
- }
-
- int hidx = cnt - 1;
- for (int i = 0; i < cnt; i++) {
- if (pv[i * 2] >= pf) {
- hidx = i;
- break;
- }
- }
-
- const float d0 = pf - pv[lidx * 2];
- const float d1 = pv[hidx * 2] - pf;
-
- if (hidx == lidx || d0 <= 0.0f)
- return pv[lidx * 2 + 1];
- if (d1 <= 0.0f)
- return pv[hidx * 2 + 1];
-
- const float tf = d0 / (d0 + d1);
- return pv[lidx * 2 + 1] * (1.0f - tf) + pv[hidx * 2 + 1] * tf;
-}
-
-static float getSVal(const int pos, const int len, const float * pv, const int cnt, float & pf) noexcept {
- if (len == 1) {
- pf = 0.0f;
- return 1.0f;
- }
-
- const int ld2 = len / 2;
- if (pos > ld2)
- pf = (len - pos) / static_cast(ld2);
- else
- pf = pos / static_cast(ld2);
-
- return interp(pf, pv, cnt);
-}
-
-template
-static void copyPad(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestData * d, const VSAPI * vsapi) noexcept {
+template
+static auto copyPad(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->process[plane]) {
const int srcWidth = vsapi->getFrameWidth(src, plane);
const int dstWidth = vsapi->getFrameWidth(dst[plane], 0);
const int srcHeight = vsapi->getFrameHeight(src, plane);
const int dstHeight = vsapi->getFrameHeight(dst[plane], 0);
- const int dstStride = vsapi->getStride(dst[plane], 0) / sizeof(T);
+ const int dstStride = vsapi->getStride(dst[plane], 0) / sizeof(pixel_t);
const int offy = (dstHeight - srcHeight) / 2;
const int offx = (dstWidth - srcWidth) / 2;
- vs_bitblt(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * offy + offx * sizeof(T), vsapi->getStride(dst[plane], 0),
- vsapi->getReadPtr(src, plane), vsapi->getStride(src, plane),
- srcWidth * sizeof(T), srcHeight);
+ vs_bitblt(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * offy + offx * sizeof(pixel_t),
+ vsapi->getStride(dst[plane], 0),
+ vsapi->getReadPtr(src, plane),
+ vsapi->getStride(src, plane),
+ srcWidth * sizeof(pixel_t),
+ srcHeight);
- T * VS_RESTRICT dstp = reinterpret_cast(vsapi->getWritePtr(dst[plane], 0)) + dstStride * offy;
+ pixel_t * VS_RESTRICT dstp = reinterpret_cast(vsapi->getWritePtr(dst[plane], 0)) + dstStride * offy;
for (int y = offy; y < srcHeight + offy; y++) {
int w = offx * 2;
@@ -287,25 +110,22 @@ static void copyPad(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestDa
for (int y = 0; y < offy; y++, w--)
memcpy(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * y,
vsapi->getReadPtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * w,
- dstWidth * sizeof(T));
+ dstWidth * sizeof(pixel_t));
w = offy + srcHeight - 2;
for (int y = offy + srcHeight; y < dstHeight; y++, w--)
memcpy(vsapi->getWritePtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * y,
vsapi->getReadPtr(dst[plane], 0) + vsapi->getStride(dst[plane], 0) * w,
- dstWidth * sizeof(T));
+ dstWidth * sizeof(pixel_t));
}
}
}
-template
-static inline void proc0(const T * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept;
-
-template<>
-inline void proc0(const uint8_t * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept {
+template
+static inline auto proc0(const pixel_t * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float srcScale) noexcept {
for (int u = 0; u < p1; u++) {
for (int v = 0; v < p1; v++)
- d[v] = s0[v] * s1[v];
+ d[v] = s0[v] * srcScale * s1[v];
s0 += p0;
s1 += p1;
@@ -313,31 +133,7 @@ inline void proc0(const uint8_t * s0, const float * s1, float * VS_RESTRICT d, c
}
}
-template<>
-inline void proc0(const uint16_t * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept {
- for (int u = 0; u < p1; u++) {
- for (int v = 0; v < p1; v++)
- d[v] = s0[v] * divisor * s1[v];
-
- s0 += p0;
- s1 += p1;
- d += p1;
- }
-}
-
-template<>
-inline void proc0(const float * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1, const float divisor) noexcept {
- for (int u = 0; u < p1; u++) {
- for (int v = 0; v < p1; v++)
- d[v] = s0[v] * 255.0f * s1[v];
-
- s0 += p0;
- s1 += p1;
- d += p1;
- }
-}
-
-static inline void proc1(const float * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1) noexcept {
+static inline auto proc1(const float * s0, const float * s1, float * VS_RESTRICT d, const int p0, const int p1) noexcept {
for (int u = 0; u < p0; u++) {
for (int v = 0; v < p0; v++)
d[v] += s0[v] * s1[v];
@@ -348,179 +144,128 @@ static inline void proc1(const float * s0, const float * s1, float * VS_RESTRICT
}
}
-static inline void removeMean(float * VS_RESTRICT dftc, const float * dftgc, const int ccnt, float * VS_RESTRICT dftc2) noexcept {
+static inline auto removeMean(float * VS_RESTRICT dftc, const float * dftgc, const int ccnt, float * VS_RESTRICT dftc2) noexcept {
const float gf = dftc[0] / dftgc[0];
for (int h = 0; h < ccnt; h += 2) {
- dftc2[h] = gf * dftgc[h];
+ dftc2[h + 0] = gf * dftgc[h + 0];
dftc2[h + 1] = gf * dftgc[h + 1];
- dftc[h] -= dftc2[h];
+ dftc[h + 0] -= dftc2[h + 0];
dftc[h + 1] -= dftc2[h + 1];
}
}
-static inline void addMean(float * VS_RESTRICT dftc, const int ccnt, const float * dftc2) noexcept {
+static inline auto addMean(float * VS_RESTRICT dftc, const int ccnt, const float * dftc2) noexcept {
for (int h = 0; h < ccnt; h += 2) {
- dftc[h] += dftc2[h];
+ dftc[h + 0] += dftc2[h + 0];
dftc[h + 1] += dftc2[h + 1];
}
}
template
-static inline void filter_c(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
-
-template<>
-inline void filter_c<0>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 2) {
- const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1];
- const float mult = std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f);
- dftc[h] *= mult;
- dftc[h + 1] *= mult;
- }
-}
-
-template<>
-inline void filter_c<1>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 2) {
- const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1];
- if (psd < sigmas[h])
- dftc[h] = dftc[h + 1] = 0.0f;
- }
-}
-
-template<>
-inline void filter_c<2>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 2) {
- dftc[h] *= sigmas[h];
- dftc[h + 1] *= sigmas[h];
- }
-}
+static inline void filter_c(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
+ const float beta = pmin[0];
-template<>
-inline void filter_c<3>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
for (int h = 0; h < ccnt; h += 2) {
- const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1];
-
- if (psd >= pmin[h] && psd <= pmax[h]) {
- dftc[h] *= sigmas[h];
+ float psd, mult;
+
+ if constexpr (type != 2)
+ psd = dftc[h + 0] * dftc[h + 0] + dftc[h + 1] * dftc[h + 1];
+
+ if constexpr (type == 0) {
+ mult = std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f);
+ } else if constexpr (type == 1) {
+ if (psd < sigmas[h])
+ dftc[h + 0] = dftc[h + 1] = 0.0f;
+ } else if constexpr (type == 2) {
+ dftc[h + 0] *= sigmas[h];
dftc[h + 1] *= sigmas[h];
+ } else if constexpr (type == 3) {
+ if (psd >= pmin[h] && psd <= pmax[h]) {
+ dftc[h + 0] *= sigmas[h];
+ dftc[h + 1] *= sigmas[h];
+ } else {
+ dftc[h + 0] *= sigmas2[h];
+ dftc[h + 1] *= sigmas2[h];
+ }
+ } else if constexpr (type == 4) {
+ mult = sigmas[h] * std::sqrt(psd * pmax[h] / ((psd + pmin[h]) * (psd + pmax[h]) + 1e-15f));
+ } else if constexpr (type == 5) {
+ mult = std::pow(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f), beta);
} else {
- dftc[h] *= sigmas2[h];
- dftc[h + 1] *= sigmas2[h];
+ mult = std::sqrt(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f));
}
- }
-}
-template<>
-inline void filter_c<4>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 2) {
- const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1] + 1e-15f;
- const float mult = sigmas[h] * std::sqrt(psd * pmax[h] / ((psd + pmin[h]) * (psd + pmax[h])));
- dftc[h] *= mult;
- dftc[h + 1] *= mult;
- }
-}
-
-template<>
-inline void filter_c<5>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- const float beta = pmin[0];
-
- for (int h = 0; h < ccnt; h += 2) {
- const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1];
- const float mult = std::pow(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f), beta);
- dftc[h] *= mult;
- dftc[h + 1] *= mult;
- }
-}
-
-template<>
-inline void filter_c<6>(float * VS_RESTRICT dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 2) {
- const float psd = dftc[h] * dftc[h] + dftc[h + 1] * dftc[h + 1];
- const float mult = std::sqrt(std::max((psd - sigmas[h]) / (psd + 1e-15f), 0.0f));
- dftc[h] *= mult;
- dftc[h + 1] *= mult;
- }
-}
-
-template
-static void cast(const float * ebp, T * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride,
- const float multiplier, const int peak) noexcept;
-
-template<>
-void cast(const float * ebp, uint8_t * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride,
- const float multiplier, const int peak) noexcept {
- for (int y = 0; y < dstHeight; y++) {
- for (int x = 0; x < dstWidth; x++)
- dstp[x] = std::min(std::max(static_cast(ebp[x] + 0.5f), 0), 255);
-
- ebp += ebpStride;
- dstp += dstStride;
+ if constexpr (type == 0 || type > 3) {
+ dftc[h + 0] *= mult;
+ dftc[h + 1] *= mult;
+ }
}
}
-template<>
-void cast(const float * ebp, uint16_t * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride,
- const float multiplier, const int peak) noexcept {
+template
+static auto cast(const float * ebp, pixel_t * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride,
+ const float dstScale, const int peak) noexcept {
for (int y = 0; y < dstHeight; y++) {
- for (int x = 0; x < dstWidth; x++)
- dstp[x] = std::min(std::max(static_cast(ebp[x] * multiplier + 0.5f), 0), peak);
+ for (int x = 0; x < dstWidth; x++) {
+ if constexpr (std::is_integral_v)
+ dstp[x] = std::clamp(static_cast(ebp[x] * dstScale + 0.5f), 0, peak);
+ else
+ dstp[x] = ebp[x] * dstScale;
+ }
ebp += ebpStride;
dstp += dstStride;
}
}
-template<>
-void cast(const float * ebp, float * VS_RESTRICT dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride,
- const float multiplier, const int peak) noexcept {
- for (int y = 0; y < dstHeight; y++) {
- for (int x = 0; x < dstWidth; x++)
- dstp[x] = ebp[x] * (1.0f / 255.0f);
+template
+static void func_0_c(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+ const float * hw = d->hw.get();
+ const float * sigmas = d->sigmas.get();
+ const float * sigmas2 = d->sigmas2.get();
+ const float * pmins = d->pmins.get();
+ const float * pmaxs = d->pmaxs.get();
+ const fftwf_complex * dftgc = d->dftgc.get();
+ fftwf_plan ft = d->ft.get();
+ fftwf_plan fti = d->fti.get();
- ebp += ebpStride;
- dstp += dstStride;
- }
-}
-
-template
-static void func_0_c(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept {
const auto threadId = std::this_thread::get_id();
- float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0));
- float * dftr = d->dftr.at(threadId);
- fftwf_complex * dftc = d->dftc.at(threadId);
- fftwf_complex * dftc2 = d->dftc2.at(threadId);
+ float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0));
+ float * dftr = d->dftr.at(threadId).get();
+ fftwf_complex * dftc = d->dftc.at(threadId).get();
+ fftwf_complex * dftc2 = d->dftc2.at(threadId).get();
for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->process[plane]) {
const int width = d->padWidth[plane];
const int height = d->padHeight[plane];
const int eheight = d->eheight[plane];
- const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(T);
- const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float);
- const T * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0));
+ const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(pixel_t);
+ const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float);
+ const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0));
float * ebpSaved = ebuff;
memset(ebuff, 0, ebpStride * height * sizeof(float));
for (int y = 0; y < eheight; y += d->inc) {
for (int x = 0; x <= width - d->sbsize; x += d->inc) {
- proc0(srcp + x, d->hw, dftr, srcStride, d->sbsize, d->divisor);
+ proc0(srcp + x, hw, dftr, srcStride, d->sbsize, d->srcScale);
- fftwf_execute_dft_r2c(d->ft, dftr, dftc);
+ fftwf_execute_dft_r2c(ft, dftr, dftc);
if (d->zmean)
- removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2));
+ removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2));
- d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2);
+ d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? &d->f0beta : pmins, pmaxs, sigmas2);
if (d->zmean)
addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2));
- fftwf_execute_dft_c2r(d->fti, dftc, dftr);
+ fftwf_execute_dft_c2r(fti, dftc, dftr);
if (d->type & 1) // spatial overlapping
- proc1(dftr, d->hw, ebpSaved + x, d->sbsize, ebpStride);
+ proc1(dftr, hw, ebpSaved + x, d->sbsize, ebpStride);
else
- ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * d->hw[d->sbd1 * d->sbsize + d->sbd1];
+ ebpSaved[x + d->sbd1 * ebpStride + d->sbd1] = dftr[d->sbd1 * d->sbsize + d->sbd1] * hw[d->sbd1 * d->sbsize + d->sbd1];
}
srcp += srcStride * d->inc;
@@ -529,54 +274,64 @@ static void func_0_c(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData *
const int dstWidth = vsapi->getFrameWidth(dst, plane);
const int dstHeight = vsapi->getFrameHeight(dst, plane);
- const int dstStride = vsapi->getStride(dst, plane) / sizeof(T);
- T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane));
+ const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t);
+ pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane));
const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2;
- cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak);
+ cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak);
}
}
}
-template
-static void func_1_c(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * d, const VSAPI * vsapi) noexcept {
+template
+static void func_1_c(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+ const float * hw = d->hw.get();
+ const float * sigmas = d->sigmas.get();
+ const float * sigmas2 = d->sigmas2.get();
+ const float * pmins = d->pmins.get();
+ const float * pmaxs = d->pmaxs.get();
+ const fftwf_complex * dftgc = d->dftgc.get();
+ fftwf_plan ft = d->ft.get();
+ fftwf_plan fti = d->fti.get();
+
const auto threadId = std::this_thread::get_id();
- float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0));
- float * dftr = d->dftr.at(threadId);
- fftwf_complex * dftc = d->dftc.at(threadId);
- fftwf_complex * dftc2 = d->dftc2.at(threadId);
+ float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0));
+ float * dftr = d->dftr.at(threadId).get();
+ fftwf_complex * dftc = d->dftc.at(threadId).get();
+ fftwf_complex * dftc2 = d->dftc2.at(threadId).get();
for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->process[plane]) {
const int width = d->padWidth[plane];
const int height = d->padHeight[plane];
const int eheight = d->eheight[plane];
- const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(T);
- const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float);
- const T * srcp[15] = {};
+ const int srcStride = vsapi->getStride(src[0][plane], 0) / sizeof(pixel_t);
+ const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float);
+
+ const pixel_t * srcp[15] = {};
for (int i = 0; i < d->tbsize; i++)
- srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0));
+ srcp[i] = reinterpret_cast(vsapi->getReadPtr(src[i][plane], 0));
memset(ebuff, 0, ebpStride * height * sizeof(float));
for (int y = 0; y < eheight; y += d->inc) {
for (int x = 0; x <= width - d->sbsize; x += d->inc) {
for (int z = 0; z < d->tbsize; z++)
- proc0(srcp[z] + x, d->hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->divisor);
+ proc0(srcp[z] + x, hw + d->barea * z, dftr + d->barea * z, srcStride, d->sbsize, d->srcScale);
- fftwf_execute_dft_r2c(d->ft, dftr, dftc);
+ fftwf_execute_dft_r2c(ft, dftr, dftc);
if (d->zmean)
- removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2));
+ removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc), d->ccnt2, reinterpret_cast(dftc2));
- d->filterCoeffs(reinterpret_cast(dftc), d->sigmas, d->ccnt2, d->uf0b ? &d->f0beta : d->pmins, d->pmaxs, d->sigmas2);
+ d->filterCoeffs(reinterpret_cast(dftc), sigmas, d->ccnt2, d->uf0b ? &d->f0beta : pmins, pmaxs, sigmas2);
if (d->zmean)
addMean(reinterpret_cast(dftc), d->ccnt2, reinterpret_cast(dftc2));
- fftwf_execute_dft_c2r(d->fti, dftc, dftr);
+ fftwf_execute_dft_c2r(fti, dftc, dftr);
if (d->type & 1) // spatial overlapping
- proc1(dftr + pos * d->barea, d->hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride);
+ proc1(dftr + pos * d->barea, hw + pos * d->barea, ebuff + y * ebpStride + x, d->sbsize, ebpStride);
else
- ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * d->hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1];
+ ebuff[(y + d->sbd1) * ebpStride + x + d->sbd1] = dftr[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1] * hw[pos * d->barea + d->sbd1 * d->sbsize + d->sbd1];
}
for (int q = 0; q < d->tbsize; q++)
@@ -585,109 +340,14 @@ static void func_1_c(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, c
const int dstWidth = vsapi->getFrameWidth(dst, plane);
const int dstHeight = vsapi->getFrameHeight(dst, plane);
- const int dstStride = vsapi->getStride(dst, plane) / sizeof(T);
- T * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane));
+ const int dstStride = vsapi->getStride(dst, plane) / sizeof(pixel_t);
+ pixel_t * dstp = reinterpret_cast(vsapi->getWritePtr(dst, plane));
const float * ebp = ebuff + ebpStride * ((height - dstHeight) / 2) + (width - dstWidth) / 2;
- cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->multiplier, d->peak);
+ cast(ebp, dstp, dstWidth, dstHeight, dstStride, ebpStride, d->dstScale, d->peak);
}
}
}
-static void selectFunctions(const unsigned ftype, const unsigned opt, DFTTestData * d) noexcept {
- if (ftype == 0) {
- if (std::abs(d->f0beta - 1.0f) < 0.00005f)
- d->filterCoeffs = filter_c<0>;
- else if (std::abs(d->f0beta - 0.5f) < 0.00005f)
- d->filterCoeffs = filter_c<6>;
- else
- d->filterCoeffs = filter_c<5>;
- } else if (ftype == 1) {
- d->filterCoeffs = filter_c<1>;
- } else if (ftype == 2) {
- d->filterCoeffs = filter_c<2>;
- } else if (ftype == 3) {
- d->filterCoeffs = filter_c<3>;
- } else {
- d->filterCoeffs = filter_c<4>;
- }
-
- if (d->vi->format->bytesPerSample == 1) {
- d->copyPad = copyPad;
- d->func_0 = func_0_c;
- d->func_1 = func_1_c;
- } else if (d->vi->format->bytesPerSample == 2) {
- d->copyPad = copyPad;
- d->func_0 = func_0_c;
- d->func_1 = func_1_c;
- } else {
- d->copyPad = copyPad;
- d->func_0 = func_0_c;
- d->func_1 = func_1_c;
- }
-
-#ifdef VS_TARGET_CPU_X86
- const int iset = instrset_detect();
-
- if ((opt == 0 && iset >= 8) || opt == 3) {
- if (ftype == 0) {
- if (std::abs(d->f0beta - 1.0f) < 0.00005f)
- d->filterCoeffs = filter_avx2<0>;
- else if (std::abs(d->f0beta - 0.5f) < 0.00005f)
- d->filterCoeffs = filter_avx2<6>;
- else
- d->filterCoeffs = filter_avx2<5>;
- } else if (ftype == 1) {
- d->filterCoeffs = filter_avx2<1>;
- } else if (ftype == 2) {
- d->filterCoeffs = filter_avx2<2>;
- } else if (ftype == 3) {
- d->filterCoeffs = filter_avx2<3>;
- } else {
- d->filterCoeffs = filter_avx2<4>;
- }
-
- if (d->vi->format->bytesPerSample == 1) {
- d->func_0 = func_0_avx2;
- d->func_1 = func_1_avx2;
- } else if (d->vi->format->bytesPerSample == 2) {
- d->func_0 = func_0_avx2;
- d->func_1 = func_1_avx2;
- } else {
- d->func_0 = func_0_avx2;
- d->func_1 = func_1_avx2;
- }
- } else if ((opt == 0 && iset >= 2) || opt == 2) {
- if (ftype == 0) {
- if (std::abs(d->f0beta - 1.0f) < 0.00005f)
- d->filterCoeffs = filter_sse2<0>;
- else if (std::abs(d->f0beta - 0.5f) < 0.00005f)
- d->filterCoeffs = filter_sse2<6>;
- else
- d->filterCoeffs = filter_sse2<5>;
- } else if (ftype == 1) {
- d->filterCoeffs = filter_sse2<1>;
- } else if (ftype == 2) {
- d->filterCoeffs = filter_sse2<2>;
- } else if (ftype == 3) {
- d->filterCoeffs = filter_sse2<3>;
- } else {
- d->filterCoeffs = filter_sse2<4>;
- }
-
- if (d->vi->format->bytesPerSample == 1) {
- d->func_0 = func_0_sse2;
- d->func_1 = func_1_sse2;
- } else if (d->vi->format->bytesPerSample == 2) {
- d->func_0 = func_0_sse2;
- d->func_1 = func_1_sse2;
- } else {
- d->func_0 = func_0_sse2;
- d->func_1 = func_1_sse2;
- }
- }
-#endif
-}
-
static void VS_CC dfttestInit(VSMap * in, VSMap * out, void ** instanceData, VSNode * node, VSCore * core, const VSAPI * vsapi) {
DFTTestData * d = static_cast(*instanceData);
vsapi->setVideoInfo(d->vi, 1, node);
@@ -710,25 +370,27 @@ static const VSFrameRef * VS_CC dfttestGetFrame(int n, int activationReason, voi
auto threadId = std::this_thread::get_id();
if (!d->ebuff.count(threadId)) {
- d->ebuff.emplace(threadId, vsapi->newVideoFrame(vsapi->registerFormat(cmGray, stFloat, 32, 0, 0, core), d->padWidth[0], d->padHeight[0], nullptr, core));
+ d->ebuff.emplace(threadId,
+ unique_VSFrameRef{ vsapi->newVideoFrame(vsapi->registerFormat(cmGray, stFloat, 32, 0, 0, core), d->padWidth[0], d->padHeight[0], nullptr, core),
+ vsapi->freeFrame });
- float * dftr = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32);
+ float * dftr = vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64);
if (!dftr)
- throw std::string{ "malloc failure (dftr)" };
- d->dftr.emplace(threadId, dftr);
+ throw "malloc failure (dftr)";
+ d->dftr.emplace(threadId, unique_float{ dftr, vs_aligned_free });
- fftwf_complex * dftc = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32);
+ fftwf_complex * dftc = vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64);
if (!dftc)
- throw std::string{ "malloc failure (dftc)" };
- d->dftc.emplace(threadId, dftc);
+ throw "malloc failure (dftc)";
+ d->dftc.emplace(threadId, unique_fftwf_complex{ dftc, vs_aligned_free });
- fftwf_complex * dftc2 = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32);
+ fftwf_complex * dftc2 = vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64);
if (!dftc2)
- throw std::string{ "malloc failure (dftc2)" };
- d->dftc2.emplace(threadId, dftc2);
+ throw "malloc failure (dftc2)";
+ d->dftc2.emplace(threadId, unique_fftwf_complex{ dftc2, vs_aligned_free });
}
- } catch (const std::string & error) {
- vsapi->setFilterError(("DFTTest: " + error).c_str(), frameCtx);
+ } catch (const char * error) {
+ vsapi->setFilterError(("DFTTest: "s + error).c_str(), frameCtx);
return nullptr;
}
@@ -760,7 +422,7 @@ static const VSFrameRef * VS_CC dfttestGetFrame(int n, int activationReason, voi
const int pos = d->tbsize / 2;
for (int i = n - pos; i <= n + pos; i++) {
- src[i - n + pos] = vsapi->getFrameFilter(std::min(std::max(i, 0), d->vi->numFrames - 1), d->node, frameCtx);
+ src[i - n + pos] = vsapi->getFrameFilter(std::clamp(i, 0, d->vi->numFrames - 1), d->node, frameCtx);
for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->process[plane])
@@ -788,31 +450,7 @@ static const VSFrameRef * VS_CC dfttestGetFrame(int n, int activationReason, voi
static void VS_CC dfttestFree(void * instanceData, VSCore * core, const VSAPI * vsapi) {
DFTTestData * d = static_cast(instanceData);
-
vsapi->freeNode(d->node);
-
- vs_aligned_free(d->hw);
- vs_aligned_free(d->dftgc);
- vs_aligned_free(d->sigmas);
- vs_aligned_free(d->sigmas2);
- vs_aligned_free(d->pmins);
- vs_aligned_free(d->pmaxs);
-
- fftwf_destroy_plan(d->ft);
- fftwf_destroy_plan(d->fti);
-
- for (auto & iter : d->ebuff)
- vsapi->freeFrame(iter.second);
-
- for (auto & iter : d->dftr)
- vs_aligned_free(iter.second);
-
- for (auto & iter : d->dftc)
- vs_aligned_free(iter.second);
-
- for (auto & iter : d->dftc2)
- vs_aligned_free(iter.second);
-
delete d;
}
@@ -820,192 +458,399 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
std::unique_ptr d = std::make_unique();
int err;
- d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
- d->vi = vsapi->getVideoInfo(d->node);
-
- try {
- if (!isConstantFormat(d->vi) ||
- (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
- (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32))
- throw std::string{ "only constant format 8-16 bit integer and 32 bit float input supported" };
-
- const int ftype = int64ToIntS(vsapi->propGetInt(in, "ftype", 0, &err));
-
- float sigma = static_cast(vsapi->propGetFloat(in, "sigma", 0, &err));
- if (err)
- sigma = 8.0f;
-
- float sigma2 = static_cast(vsapi->propGetFloat(in, "sigma2", 0, &err));
- if (err)
- sigma2 = 8.0f;
-
- const float pmin = static_cast(vsapi->propGetFloat(in, "pmin", 0, &err));
-
- float pmax = static_cast(vsapi->propGetFloat(in, "pmax", 0, &err));
- if (err)
- pmax = 500.0f;
-
- d->sbsize = int64ToIntS(vsapi->propGetInt(in, "sbsize", 0, &err));
- if (err)
- d->sbsize = 16;
+ auto createWindow = [&](unique_float & hw, const int tmode, const int smode) noexcept {
+ auto getWinValue = [](const double n, const double size, const int win, const double beta) noexcept {
+ auto besselI0 = [](double p) noexcept {
+ p /= 2.0;
+ double n = 1.0, t = 1.0, d = 1.0;
+ int k = 1;
+ double v;
+
+ do {
+ n *= p;
+ d *= k;
+ v = n / d;
+ t += v * v;
+ } while (++k < 15 && v > 1e-8);
+
+ return t;
+ };
+
+ switch (win) {
+ case 0: // hanning
+ return 0.5 - 0.5 * std::cos(2.0 * M_PI * n / size);
+ case 1: // hamming
+ return 0.53836 - 0.46164 * std::cos(2.0 * M_PI * n / size);
+ case 2: // blackman
+ return 0.42 - 0.5 * std::cos(2.0 * M_PI * n / size) + 0.08 * std::cos(4.0 * M_PI * n / size);
+ case 3: // 4 term blackman-harris
+ return 0.35875 - 0.48829 * std::cos(2.0 * M_PI * n / size) + 0.14128 * std::cos(4.0 * M_PI * n / size) - 0.01168 * std::cos(6.0 * M_PI * n / size);
+ case 4: // kaiser-bessel
+ {
+ const double v = 2.0 * n / size - 1.0;
+ return besselI0(M_PI * beta * std::sqrt(1.0 - v * v)) / besselI0(M_PI * beta);
+ }
+ case 5: // 7 term blackman-harris
+ return 0.27105140069342415 -
+ 0.433297939234486060 * std::cos(2.0 * M_PI * n / size) +
+ 0.218122999543110620 * std::cos(4.0 * M_PI * n / size) -
+ 0.065925446388030898 * std::cos(6.0 * M_PI * n / size) +
+ 0.010811742098372268 * std::cos(8.0 * M_PI * n / size) -
+ 7.7658482522509342e-4 * std::cos(10.0 * M_PI * n / size) +
+ 1.3887217350903198e-5 * std::cos(12.0 * M_PI * n / size);
+ case 6: // flat top
+ return 0.2810639 - 0.5208972 * std::cos(2.0 * M_PI * n / size) + 0.1980399 * std::cos(4.0 * M_PI * n / size);
+ case 7: // rectangular
+ return 1.0;
+ case 8: // Bartlett
+ return 2.0 / size * (size / 2.0 - std::abs(n - size / 2.0));
+ case 9: // Bartlett-Hann
+ return 0.62 - 0.48 * (n / size - 0.5) - 0.38 * std::cos(2.0 * M_PI * n / size);
+ case 10: // Nuttall
+ return 0.355768 - 0.487396 * std::cos(2.0 * M_PI * n / size) + 0.144232 * std::cos(4.0 * M_PI * n / size) - 0.012604 * std::cos(6.0 * M_PI * n / size);
+ case 11: // Blackman-Nuttall
+ return 0.3635819 - 0.4891775 * std::cos(2.0 * M_PI * n / size) + 0.1365995 * std::cos(4.0 * M_PI * n / size) - 0.0106411 * std::cos(6.0 * M_PI * n / size);
+ default:
+ return 0.0;
+ }
+ };
- int smode = int64ToIntS(vsapi->propGetInt(in, "smode", 0, &err));
- if (err)
- smode = 1;
+ auto normalizeForOverlapAdd = [](std::unique_ptr & hw, const int bsize, const int osize) noexcept {
+ std::unique_ptr nw = std::make_unique(bsize);
+ const int inc = bsize - osize;
- d->sosize = int64ToIntS(vsapi->propGetInt(in, "sosize", 0, &err));
- if (err)
- d->sosize = 12;
+ for (int q = 0; q < bsize; q++) {
+ for (int h = q; h >= 0; h -= inc)
+ nw[q] += hw[h] * hw[h];
+ for (int h = q + inc; h < bsize; h += inc)
+ nw[q] += hw[h] * hw[h];
+ }
- d->tbsize = int64ToIntS(vsapi->propGetInt(in, "tbsize", 0, &err));
- if (err)
- d->tbsize = 3;
+ for (int q = 0; q < bsize; q++)
+ hw[q] /= std::sqrt(nw[q]);
+ };
+
+ std::unique_ptr tw = std::make_unique(d->tbsize);
+ for (int j = 0; j < d->tbsize; j++)
+ tw[j] = getWinValue(j + 0.5, d->tbsize, d->twin, d->tbeta);
+ if (tmode == 1)
+ normalizeForOverlapAdd(tw, d->tbsize, d->tosize);
+
+ std::unique_ptr sw = std::make_unique(d->sbsize);
+ for (int j = 0; j < d->sbsize; j++)
+ sw[j] = getWinValue(j + 0.5, d->sbsize, d->swin, d->sbeta);
+ if (smode == 1)
+ normalizeForOverlapAdd(sw, d->sbsize, d->sosize);
+
+ const double nscale = 1.0 / std::sqrt(d->bvolume);
+ for (int j = 0; j < d->tbsize; j++)
+ for (int k = 0; k < d->sbsize; k++)
+ for (int q = 0; q < d->sbsize; q++)
+ hw[(j * d->sbsize + k) * d->sbsize + q] = static_cast(tw[j] * sw[k] * sw[q] * nscale);
+ };
+
+ auto interp = [](const float pf, const std::unique_ptr & pv, const int cnt) noexcept {
+ int lidx = 0;
+ for (int i = cnt - 1; i >= 0; i--) {
+ if (pv[i * 2] <= pf) {
+ lidx = i;
+ break;
+ }
+ }
- const int tmode = int64ToIntS(vsapi->propGetInt(in, "tmode", 0, &err));
+ int hidx = cnt - 1;
+ for (int i = 0; i < cnt; i++) {
+ if (pv[i * 2] >= pf) {
+ hidx = i;
+ break;
+ }
+ }
- d->tosize = int64ToIntS(vsapi->propGetInt(in, "tosize", 0, &err));
+ const float d0 = pf - pv[lidx * 2];
+ const float d1 = pv[hidx * 2] - pf;
- d->swin = int64ToIntS(vsapi->propGetInt(in, "swin", 0, &err));
+ if (hidx == lidx || d0 <= 0.0f)
+ return pv[lidx * 2 + 1];
+ if (d1 <= 0.0f)
+ return pv[hidx * 2 + 1];
- d->twin = int64ToIntS(vsapi->propGetInt(in, "twin", 0, &err));
- if (err)
- d->twin = 7;
+ const float tf = d0 / (d0 + d1);
+ return pv[lidx * 2 + 1] * (1.0f - tf) + pv[hidx * 2 + 1] * tf;
+ };
- d->sbeta = static_cast(vsapi->propGetFloat(in, "sbeta", 0, &err));
- if (err)
- d->sbeta = 2.5f;
+ auto getSVal = [&](const int pos, const int len, const std::unique_ptr & pv, const int cnt, float & pf) noexcept {
+ if (len == 1) {
+ pf = 0.0f;
+ return 1.0f;
+ }
- d->tbeta = static_cast(vsapi->propGetFloat(in, "tbeta", 0, &err));
- if (err)
- d->tbeta = 2.5f;
+ const int ld2 = len / 2;
+ pf = (pos > ld2 ? len - pos : pos) / static_cast(ld2);
+ return interp(pf, pv, cnt);
+ };
- d->zmean = !!vsapi->propGetInt(in, "zmean", 0, &err);
- if (err)
- d->zmean = true;
+ try {
+ d->node = vsapi->propGetNode(in, "clip", 0, nullptr);
+ d->vi = vsapi->getVideoInfo(d->node);
- d->f0beta = static_cast(vsapi->propGetFloat(in, "f0beta", 0, &err));
- if (err)
- d->f0beta = 1.0f;
+ if (!isConstantFormat(d->vi) ||
+ (d->vi->format->sampleType == stInteger && d->vi->format->bitsPerSample > 16) ||
+ (d->vi->format->sampleType == stFloat && d->vi->format->bitsPerSample != 32))
+ throw "only constant format 8-16 bit integer and 32 bit float input supported";
+
+ const int ftype = getArg(vsapi, in, "ftype", 0);
+ const float sigma = getArg(vsapi, in, "sigma", 8.0f);
+ const float sigma2 = getArg(vsapi, in, "sigma2", 8.0f);
+ const float pmin = getArg(vsapi, in, "pmin", 0.0f);
+ const float pmax = getArg(vsapi, in, "pmax", 500.0f);
+ d->sbsize = getArg(vsapi, in, "sbsize", 16);
+ const int smode = getArg(vsapi, in, "smode", 1);
+ d->sosize = getArg(vsapi, in, "sosize", 12);
+ d->tbsize = getArg(vsapi, in, "tbsize", 3);
+ const int tmode = getArg(vsapi, in, "tmode", 0);
+ d->tosize = getArg(vsapi, in, "tosize", 0);
+ d->swin = getArg(vsapi, in, "swin", 0);
+ d->twin = getArg(vsapi, in, "twin", 7);
+ d->sbeta = getArg(vsapi, in, "sbeta", 2.5);
+ d->tbeta = getArg(vsapi, in, "tbeta", 2.5);
+ d->zmean = getArg(vsapi, in, "zmean", true);
+ d->f0beta = getArg(vsapi, in, "f0beta", 1.0f);
+ const float alpha = getArg(vsapi, in, "alpha", ftype == 0 ? 5.0f : 7.0f);
+ const int ssystem = getArg(vsapi, in, "ssystem", 0);
+ const int opt = getArg(vsapi, in, "opt", 0);
const int64_t * nlocation = vsapi->propGetIntArray(in, "nlocation", &err);
-
- float alpha = static_cast(vsapi->propGetFloat(in, "alpha", 0, &err));
- if (err)
- alpha = (ftype == 0) ? 5.0f : 7.0f;
-
const double * slocation = vsapi->propGetFloatArray(in, "slocation", &err);
-
const double * ssx = vsapi->propGetFloatArray(in, "ssx", &err);
-
const double * ssy = vsapi->propGetFloatArray(in, "ssy", &err);
-
const double * sst = vsapi->propGetFloatArray(in, "sst", &err);
- const int ssystem = int64ToIntS(vsapi->propGetInt(in, "ssystem", 0, &err));
+ const int numNlocation = vsapi->propNumElements(in, "nlocation");
+ const int numSlocation = vsapi->propNumElements(in, "slocation");
+ const int numSsx = vsapi->propNumElements(in, "ssx");
+ const int numSsy = vsapi->propNumElements(in, "ssy");
+ const int numSst = vsapi->propNumElements(in, "sst");
- const int m = vsapi->propNumElements(in, "planes");
+ {
+ const int m = vsapi->propNumElements(in, "planes");
- for (int i = 0; i < 3; i++)
- d->process[i] = (m <= 0);
+ for (int i = 0; i < 3; i++)
+ d->process[i] = (m <= 0);
- for (int i = 0; i < m; i++) {
- const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr));
+ for (int i = 0; i < m; i++) {
+ const int n = int64ToIntS(vsapi->propGetInt(in, "planes", i, nullptr));
- if (n < 0 || n >= d->vi->format->numPlanes)
- throw std::string{ "plane index out of range" };
+ if (n < 0 || n >= d->vi->format->numPlanes)
+ throw "plane index out of range";
- if (d->process[n])
- throw std::string{ "plane specified twice" };
+ if (d->process[n])
+ throw "plane specified twice";
- d->process[n] = true;
+ d->process[n] = true;
+ }
}
- const int opt = int64ToIntS(vsapi->propGetInt(in, "opt", 0, &err));
-
if (ftype < 0 || ftype > 4)
- throw std::string{ "ftype must be 0, 1, 2, 3, or 4" };
+ throw "ftype must be 0, 1, 2, 3, or 4";
if (d->sbsize < 1)
- throw std::string{ "sbsize must be greater than or equal to 1" };
+ throw "sbsize must be greater than or equal to 1";
if (smode < 0 || smode > 1)
- throw std::string{ "smode must be 0 or 1" };
+ throw "smode must be 0 or 1";
if (smode == 0 && !(d->sbsize & 1))
- throw std::string{ "sbsize must be odd when using smode=0" };
+ throw "sbsize must be odd when using smode=0";
if (smode == 0)
d->sosize = 0;
if (d->sosize < 0 || d->sosize >= d->sbsize)
- throw std::string{ "sosize must be between 0 and sbsize-1 (inclusive)" };
+ throw "sosize must be between 0 and sbsize-1 (inclusive)";
if (d->sosize > d->sbsize / 2 && d->sbsize % (d->sbsize - d->sosize) != 0)
- throw std::string{ "spatial overlap greater than 50% requires that sbsize-sosize is a divisor of sbsize" };
+ throw "spatial overlap greater than 50% requires that sbsize-sosize is a divisor of sbsize";
if (d->tbsize < 1 || d->tbsize > 15)
- throw std::string{ "tbsize must be between 1 and 15 (inclusive)" };
+ throw "tbsize must be between 1 and 15 (inclusive)";
if (tmode != 0)
- throw std::string{ "tmode must be 0. tmode=1 is not implemented" };
+ throw "tmode must be 0. tmode=1 is not implemented";
if (tmode == 0 && !(d->tbsize & 1))
- throw std::string{ "tbsize must be odd when using tmode=0" };
+ throw "tbsize must be odd when using tmode=0";
if (tmode == 0)
d->tosize = 0;
if (d->tosize < 0 || d->tosize >= d->tbsize)
- throw std::string{ "tosize must be between 0 and tbsize-1 (inclusive)" };
+ throw "tosize must be between 0 and tbsize-1 (inclusive)";
if (d->tosize > d->tbsize / 2 && d->tbsize % (d->tbsize - d->tosize) != 0)
- throw std::string{ "temporal overlap greater than 50% requires that tbsize-tosize is a divisor of tbsize" };
+ throw "temporal overlap greater than 50% requires that tbsize-tosize is a divisor of tbsize";
if (d->tbsize > d->vi->numFrames)
- throw std::string{ "tbsize must be less than or equal to the number of frames in the clip" };
+ throw "tbsize must be less than or equal to the number of frames in the clip";
if (d->swin < 0 || d->swin > 11)
- throw std::string{ "swin must be between 0 and 11 (inclusive)" };
+ throw "swin must be between 0 and 11 (inclusive)";
if (d->twin < 0 || d->twin > 11)
- throw std::string{ "twin must be between 0 and 11 (inclusive)" };
+ throw "twin must be between 0 and 11 (inclusive)";
- if (nlocation && (vsapi->propNumElements(in, "nlocation") & 3))
- throw std::string{ "the number of elements in nlocation must be a multiple of 4" };
+ if (nlocation && (numNlocation & 3))
+ throw "number of elements in nlocation must be a multiple of 4";
if (alpha <= 0.0f)
- throw std::string{ "alpha must be greater than 0.0" };
+ throw "alpha must be greater than 0.0";
- if (slocation && (vsapi->propNumElements(in, "slocation") & 1))
- throw std::string{ "the number of elements in slocation must be a multiple of 2" };
+ if (slocation && (numSlocation & 1))
+ throw "number of elements in slocation must be a multiple of 2";
- if (ssx && (vsapi->propNumElements(in, "ssx") & 1))
- throw std::string{ "the number of elements in ssx must be a multiple of 2" };
+ if (ssx && (numSsx & 1))
+ throw "number of elements in ssx must be a multiple of 2";
- if (ssy && (vsapi->propNumElements(in, "ssy") & 1))
- throw std::string{ "the number of elements in ssy must be a multiple of 2" };
+ if (ssy && (numSsy & 1))
+ throw "number of elements in ssy must be a multiple of 2";
- if (sst && (vsapi->propNumElements(in, "sst") & 1))
- throw std::string{ "the number of elements in sst must be a multiple of 2" };
+ if (sst && (numSst & 1))
+ throw "number of elements in sst must be a multiple of 2";
if (ssystem < 0 || ssystem > 1)
- throw std::string{ "ssystem must be 0 or 1" };
+ throw "ssystem must be 0 or 1";
+
+ if (opt < 0 || opt > 4)
+ throw "opt must be 0, 1, 2, 3, or 4";
+
+ {
+ if (ftype == 0) {
+ if (std::abs(d->f0beta - 1.0f) < 0.00005f)
+ d->filterCoeffs = filter_c<0>;
+ else if (std::abs(d->f0beta - 0.5f) < 0.00005f)
+ d->filterCoeffs = filter_c<6>;
+ else
+ d->filterCoeffs = filter_c<5>;
+ } else if (ftype == 1) {
+ d->filterCoeffs = filter_c<1>;
+ } else if (ftype == 2) {
+ d->filterCoeffs = filter_c<2>;
+ } else if (ftype == 3) {
+ d->filterCoeffs = filter_c<3>;
+ } else {
+ d->filterCoeffs = filter_c<4>;
+ }
- if (opt < 0 || opt > 3)
- throw std::string{ "opt must be 0, 1, 2, or 3" };
+ if (d->vi->format->bytesPerSample == 1) {
+ d->copyPad = copyPad;
+ d->func_0 = func_0_c;
+ d->func_1 = func_1_c;
+ } else if (d->vi->format->bytesPerSample == 2) {
+ d->copyPad = copyPad;
+ d->func_0 = func_0_c;
+ d->func_1 = func_1_c;
+ } else {
+ d->copyPad = copyPad;
+ d->func_0 = func_0_c;
+ d->func_1 = func_1_c;
+ }
- const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
- d->ebuff.reserve(numThreads);
- d->dftr.reserve(numThreads);
- d->dftc.reserve(numThreads);
- d->dftc2.reserve(numThreads);
+#ifdef DFTTEST_X86
+ const int iset = instrset_detect();
+ if ((opt == 0 && iset >= 10) || opt == 4) {
+ if (ftype == 0) {
+ if (std::abs(d->f0beta - 1.0f) < 0.00005f)
+ d->filterCoeffs = filter_avx512<0>;
+ else if (std::abs(d->f0beta - 0.5f) < 0.00005f)
+ d->filterCoeffs = filter_avx512<6>;
+ else
+ d->filterCoeffs = filter_avx512<5>;
+ } else if (ftype == 1) {
+ d->filterCoeffs = filter_avx512<1>;
+ } else if (ftype == 2) {
+ d->filterCoeffs = filter_avx512<2>;
+ } else if (ftype == 3) {
+ d->filterCoeffs = filter_avx512<3>;
+ } else {
+ d->filterCoeffs = filter_avx512<4>;
+ }
- selectFunctions(ftype, opt, d.get());
+ if (d->vi->format->bytesPerSample == 1) {
+ d->func_0 = func_0_avx512;
+ d->func_1 = func_1_avx512;
+ } else if (d->vi->format->bytesPerSample == 2) {
+ d->func_0 = func_0_avx512;
+ d->func_1 = func_1_avx512;
+ } else {
+ d->func_0 = func_0_avx512;
+ d->func_1 = func_1_avx512;
+ }
+ } else if ((opt == 0 && iset >= 8) || opt == 3) {
+ if (ftype == 0) {
+ if (std::abs(d->f0beta - 1.0f) < 0.00005f)
+ d->filterCoeffs = filter_avx2<0>;
+ else if (std::abs(d->f0beta - 0.5f) < 0.00005f)
+ d->filterCoeffs = filter_avx2<6>;
+ else
+ d->filterCoeffs = filter_avx2<5>;
+ } else if (ftype == 1) {
+ d->filterCoeffs = filter_avx2<1>;
+ } else if (ftype == 2) {
+ d->filterCoeffs = filter_avx2<2>;
+ } else if (ftype == 3) {
+ d->filterCoeffs = filter_avx2<3>;
+ } else {
+ d->filterCoeffs = filter_avx2<4>;
+ }
+
+ if (d->vi->format->bytesPerSample == 1) {
+ d->func_0 = func_0_avx2;
+ d->func_1 = func_1_avx2;
+ } else if (d->vi->format->bytesPerSample == 2) {
+ d->func_0 = func_0_avx2;
+ d->func_1 = func_1_avx2;
+ } else {
+ d->func_0 = func_0_avx2;
+ d->func_1 = func_1_avx2;
+ }
+ } else if ((opt == 0 && iset >= 2) || opt == 2) {
+ if (ftype == 0) {
+ if (std::abs(d->f0beta - 1.0f) < 0.00005f)
+ d->filterCoeffs = filter_sse2<0>;
+ else if (std::abs(d->f0beta - 0.5f) < 0.00005f)
+ d->filterCoeffs = filter_sse2<6>;
+ else
+ d->filterCoeffs = filter_sse2<5>;
+ } else if (ftype == 1) {
+ d->filterCoeffs = filter_sse2<1>;
+ } else if (ftype == 2) {
+ d->filterCoeffs = filter_sse2<2>;
+ } else if (ftype == 3) {
+ d->filterCoeffs = filter_sse2<3>;
+ } else {
+ d->filterCoeffs = filter_sse2<4>;
+ }
+
+ if (d->vi->format->bytesPerSample == 1) {
+ d->func_0 = func_0_sse2;
+ d->func_1 = func_1_sse2;
+ } else if (d->vi->format->bytesPerSample == 2) {
+ d->func_0 = func_0_sse2;
+ d->func_1 = func_1_sse2;
+ } else {
+ d->func_0 = func_0_sse2;
+ d->func_1 = func_1_sse2;
+ }
+ }
+#endif
+ }
if (d->vi->format->sampleType == stInteger) {
- d->multiplier = static_cast(1 << (d->vi->format->bitsPerSample - 8));
- d->divisor = 1.0f / d->multiplier;
+ d->dstScale = static_cast(1 << (d->vi->format->bitsPerSample - 8));
+ d->srcScale = 1.0f / d->dstScale;
d->peak = (1 << d->vi->format->bitsPerSample) - 1;
+ } else {
+ d->srcScale = 255.0f;
+ d->dstScale = 1.0f / 255.0f;
}
if (ftype != 0)
@@ -1038,28 +883,31 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
}
}
- d->hw = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32);
+ d->hw = { vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free };
if (!d->hw)
- throw std::string{ "malloc failure (hw)" };
- createWindow(d->hw, tmode, smode, d.get());
+ throw "malloc failure (hw)";
+
+ createWindow(d->hw, tmode, smode);
- float * dftgr = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32);
- d->dftgc = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32);
+ unique_float dftgr{ vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free };
+ d->dftgc = { vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free };
if (!dftgr || !d->dftgc)
- throw std::string{ "malloc failure (dftgr/dftgc)" };
+ throw "malloc failure (dftgr/dftgc)";
+
+ fftwf_make_planner_thread_safe();
if (d->tbsize > 1) {
- d->ft = fftwf_plan_dft_r2c_3d(d->tbsize, d->sbsize, d->sbsize, dftgr, d->dftgc, FFTW_PATIENT | FFTW_DESTROY_INPUT);
- d->fti = fftwf_plan_dft_c2r_3d(d->tbsize, d->sbsize, d->sbsize, d->dftgc, dftgr, FFTW_PATIENT | FFTW_DESTROY_INPUT);
+ d->ft = { fftwf_plan_dft_r2c_3d(d->tbsize, d->sbsize, d->sbsize, dftgr.get(), d->dftgc.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan };
+ d->fti = { fftwf_plan_dft_c2r_3d(d->tbsize, d->sbsize, d->sbsize, d->dftgc.get(), dftgr.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan };
} else {
- d->ft = fftwf_plan_dft_r2c_2d(d->sbsize, d->sbsize, dftgr, d->dftgc, FFTW_PATIENT | FFTW_DESTROY_INPUT);
- d->fti = fftwf_plan_dft_c2r_2d(d->sbsize, d->sbsize, d->dftgc, dftgr, FFTW_PATIENT | FFTW_DESTROY_INPUT);
+ d->ft = { fftwf_plan_dft_r2c_2d(d->sbsize, d->sbsize, dftgr.get(), d->dftgc.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan };
+ d->fti = { fftwf_plan_dft_c2r_2d(d->sbsize, d->sbsize, d->dftgc.get(), dftgr.get(), FFTW_PATIENT | FFTW_DESTROY_INPUT), fftwf_destroy_plan };
}
float wscale = 0.0f;
- const float * hwT = d->hw;
- float * VS_RESTRICT dftgrT = dftgr;
+ const float * hwT = d->hw.get();
+ float * VS_RESTRICT dftgrT = dftgr.get();
for (int s = 0; s < d->tbsize; s++) {
for (int i = 0; i < d->sbsize; i++) {
for (int k = 0; k < d->sbsize; k++) {
@@ -1070,20 +918,81 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
dftgrT += d->sbsize;
}
}
- fftwf_execute_dft_r2c(d->ft, dftgr, d->dftgc);
- vs_aligned_free(dftgr);
wscale = 1.0f / wscale;
const float wscalef = (ftype < 2) ? wscale : 1.0f;
- d->sigmas = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32);
- d->sigmas2 = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32);
- d->pmins = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32);
- d->pmaxs = vs_aligned_malloc((d->ccnt2 + 7) * sizeof(float), 32);
+ fftwf_execute_dft_r2c(d->ft.get(), dftgr.get(), d->dftgc.get());
+
+ d->sigmas = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free };
+ d->sigmas2 = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free };
+ d->pmins = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free };
+ d->pmaxs = { vs_aligned_malloc((d->ccnt2 + 15) * sizeof(float), 64), vs_aligned_free };
if (!d->sigmas || !d->sigmas2 || !d->pmins || !d->pmaxs)
- throw std::string{ "malloc failure (sigmas/sigmas2/pmins/pmaxs)" };
+ throw "malloc failure (sigmas/sigmas2/pmins/pmaxs)";
if (slocation || ssx || ssy || sst) {
+ auto parseSigmaLocation = [&](const double * s, const int num, int & poscnt, const float pfact) {
+ float * parray = nullptr;
+
+ if (!s) {
+ parray = new float[4];
+ parray[0] = 0.0f;
+ parray[2] = 1.0f;
+ parray[1] = parray[3] = std::pow(sigma, pfact);
+ poscnt = 2;
+ } else {
+ const double * sT = s;
+ bool found[] = { false, false };
+ poscnt = 0;
+
+ for (int i = 0; i < num; i += 2) {
+ const float pos = static_cast(sT[i]);
+
+ if (pos < 0.0f || pos > 1.0f)
+ throw "sigma location - invalid pos (" + std::to_string(pos) + ")";
+
+ if (pos == 0.0f)
+ found[0] = true;
+ else if (pos == 1.0f)
+ found[1] = true;
+
+ poscnt++;
+ }
+
+ if (!found[0] || !found[1])
+ throw "sigma location - one or more end points not provided";
+
+ parray = new float[poscnt * 2];
+ sT = s;
+ poscnt = 0;
+
+ for (int i = 0; i < num; i += 2) {
+ parray[poscnt * 2 + 0] = static_cast(sT[i + 0]);
+ parray[poscnt * 2 + 1] = std::pow(static_cast(sT[i + 1]), pfact);
+
+ poscnt++;
+ }
+
+ for (int i = 1; i < poscnt; i++) {
+ int j = i;
+ const float t0 = parray[j * 2 + 0];
+ const float t1 = parray[j * 2 + 1];
+
+ while (j > 0 && parray[(j - 1) * 2] > t0) {
+ parray[j * 2 + 0] = parray[(j - 1) * 2 + 0];
+ parray[j * 2 + 1] = parray[(j - 1) * 2 + 1];
+ j--;
+ }
+
+ parray[j * 2 + 0] = t0;
+ parray[j * 2 + 1] = t1;
+ }
+ }
+
+ return parray;
+ };
+
int ndim = 3;
if (d->tbsize == 1)
ndim -= 1;
@@ -1092,16 +1001,16 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
const float ndiv = 1.0f / ndim;
int tcnt = 0, sycnt = 0, sxcnt = 0;
- float * tdata, * sydata, * sxdata;
+ std::unique_ptr tdata, sydata, sxdata;
if (slocation) {
- tdata = parseSigmaLocation(slocation, vsapi->propNumElements(in, "slocation"), tcnt, sigma, ssystem ? 1.0f : ndiv);
- sydata = parseSigmaLocation(slocation, vsapi->propNumElements(in, "slocation"), sycnt, sigma, ssystem ? 1.0f : ndiv);
- sxdata = parseSigmaLocation(slocation, vsapi->propNumElements(in, "slocation"), sxcnt, sigma, ssystem ? 1.0f : ndiv);
+ tdata = std::unique_ptr{ parseSigmaLocation(slocation, numSlocation, tcnt, ssystem ? 1.0f : ndiv) };
+ sydata = std::unique_ptr{ parseSigmaLocation(slocation, numSlocation, sycnt, ssystem ? 1.0f : ndiv) };
+ sxdata = std::unique_ptr{ parseSigmaLocation(slocation, numSlocation, sxcnt, ssystem ? 1.0f : ndiv) };
} else {
- tdata = parseSigmaLocation(sst, vsapi->propNumElements(in, "sst"), tcnt, sigma, ndiv);
- sydata = parseSigmaLocation(ssy, vsapi->propNumElements(in, "ssy"), sycnt, sigma, ndiv);
- sxdata = parseSigmaLocation(ssx, vsapi->propNumElements(in, "ssx"), sxcnt, sigma, ndiv);
+ tdata = std::unique_ptr{ parseSigmaLocation(sst, numSst, tcnt, ndiv) };
+ sydata = std::unique_ptr{ parseSigmaLocation(ssy, numSsy, sycnt, ndiv) };
+ sxdata = std::unique_ptr{ parseSigmaLocation(ssx, numSsx, sxcnt, ndiv) };
}
const int cpx = d->sbsize / 2 + 1;
@@ -1125,14 +1034,10 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
}
const int pos = ((z * d->sbsize + y) * cpx + x) * 2;
- d->sigmas[pos] = d->sigmas[pos + 1] = val / wscalef;
+ d->sigmas[pos + 0] = d->sigmas[pos + 1] = val / wscalef;
}
}
}
-
- delete[] tdata;
- delete[] sydata;
- delete[] sxdata;
} else {
for (int i = 0; i < d->ccnt2; i++)
d->sigmas[i] = sigma / wscalef;
@@ -1145,17 +1050,22 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
}
if (nlocation && ftype < 2) {
- memset(d->sigmas, 0, d->ccnt2 * sizeof(float));
+ struct NPInfo final {
+ int fn, b, y, x;
+ };
+
+ memset(d->sigmas.get(), 0, d->ccnt2 * sizeof(float));
- float * VS_RESTRICT hw2 = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32);
+ unique_float hw2{ vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free };
if (!hw2)
- throw std::string{ "malloc failure (hw2)" };
- createWindow(hw2, 0, 0, d.get());
+ throw "malloc failure (hw2)";
- float * VS_RESTRICT dftr = vs_aligned_malloc((d->bvolume + 7) * sizeof(float), 32);
- fftwf_complex * dftgc2 = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32);
+ createWindow(hw2, 0, 0);
+
+ unique_float dftr{ vs_aligned_malloc((d->bvolume + 15) * sizeof(float), 64), vs_aligned_free };
+ unique_fftwf_complex dftgc2{ vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free };
if (!dftr || !dftgc2)
- throw std::string{ "malloc failure (dftr/dftgc2)" };
+ throw "malloc failure (dftr/dftgc2)";
float wscale2 = 0.0f;
int w = 0;
@@ -1168,33 +1078,33 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
}
}
wscale2 = 1.0f / wscale2;
- fftwf_execute_dft_r2c(d->ft, dftr, dftgc2);
+ fftwf_execute_dft_r2c(d->ft.get(), dftr.get(), dftgc2.get());
int nnpoints = 0;
- NPInfo * npts = new NPInfo[500];
+ std::unique_ptr npts = std::make_unique(500);
- for (int i = 0; i < vsapi->propNumElements(in, "nlocation"); i += 4) {
+ for (int i = 0; i < numNlocation; i += 4) {
const int fn = int64ToIntS(nlocation[i + 0]);
const int b = int64ToIntS(nlocation[i + 1]);
const int y = int64ToIntS(nlocation[i + 2]);
const int x = int64ToIntS(nlocation[i + 3]);
if (fn < 0 || fn > d->vi->numFrames - d->tbsize)
- throw std::string{ "invalid frame number in nlocation (" } + std::to_string(fn) + ")";
+ throw "invalid frame number in nlocation (" + std::to_string(fn) + ")";
if (b < 0 || b >= d->vi->format->numPlanes)
- throw std::string{ "invalid plane number in nlocation (" } + std::to_string(b) + ")";
+ throw "invalid plane number in nlocation (" + std::to_string(b) + ")";
const int height = d->vi->height >> (b ? d->vi->format->subSamplingH : 0);
if (y < 0 || y > height - d->sbsize)
- throw std::string{ "invalid y pos in nlocation (" } + std::to_string(y) + ")";
+ throw "invalid y pos in nlocation (" + std::to_string(y) + ")";
const int width = d->vi->width >> (b ? d->vi->format->subSamplingW : 0);
if (x < 0 || x > width - d->sbsize)
- throw std::string{ "invalid x pos in nlocation (" } + std::to_string(x) + ")";
+ throw "invalid x pos in nlocation (" + std::to_string(x) + ")";
if (nnpoints >= 500)
- throw std::string{ "maximum number of entries in nlocation is 500" };
+ throw "maximum number of entries in nlocation is 500";
npts[nnpoints].fn = fn;
npts[nnpoints].b = b;
@@ -1204,10 +1114,12 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
}
for (int ct = 0; ct < nnpoints; ct++) {
- fftwf_complex * dftc = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32);
- fftwf_complex * dftc2 = vs_aligned_malloc((d->ccnt + 7) * sizeof(fftwf_complex), 32);
- if (!dftc || !dftc2)
- throw std::string{ "malloc failure (dftc/dftc2)" };
+ unique_fftwf_complex _dftc{ vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free };
+ unique_fftwf_complex dftc2{ vs_aligned_malloc((d->ccnt + 15) * sizeof(fftwf_complex), 64), vs_aligned_free };
+ if (!_dftc || !dftc2)
+ throw "malloc failure (dftc/dftc2)";
+
+ float * dftc = reinterpret_cast(_dftc.get());
for (int z = 0; z < d->tbsize; z++) {
const VSFrameRef * src = vsapi->getFrame(npts[ct].fn + z, d->node, nullptr, 0);
@@ -1215,42 +1127,44 @@ static void VS_CC dfttestCreate(const VSMap * in, VSMap * out, void * userData,
if (d->vi->format->bytesPerSample == 1) {
const uint8_t * srcp = vsapi->getReadPtr(src, npts[ct].b) + stride * npts[ct].y + npts[ct].x;
- proc0(srcp, hw2 + d->barea * z, dftr + d->barea * z, stride, d->sbsize, d->divisor);
+ proc0(srcp, hw2.get() + d->barea * z, dftr.get() + d->barea * z, stride, d->sbsize, d->srcScale);
} else if (d->vi->format->bytesPerSample == 2) {
const uint16_t * srcp = reinterpret_cast(vsapi->getReadPtr(src, npts[ct].b)) + stride * npts[ct].y + npts[ct].x;
- proc0(srcp, hw2 + d->barea * z, dftr + d->barea * z, stride, d->sbsize, d->divisor);
+ proc0(srcp, hw2.get() + d->barea * z, dftr.get() + d->barea * z, stride, d->sbsize, d->srcScale);
} else {
const float * srcp = reinterpret_cast(vsapi->getReadPtr(src, npts[ct].b)) + stride * npts[ct].y + npts[ct].x;
- proc0(srcp, hw2 + d->barea * z, dftr + d->barea * z, stride, d->sbsize, d->divisor);
+ proc0(srcp, hw2.get() + d->barea * z, dftr.get() + d->barea * z, stride, d->sbsize, d->srcScale);
}
vsapi->freeFrame(src);
}
- fftwf_execute_dft_r2c(d->ft, dftr, dftc);
+ fftwf_execute_dft_r2c(d->ft.get(), dftr.get(), reinterpret_cast(dftc));
if (d->zmean)
- removeMean(reinterpret_cast(dftc), reinterpret_cast(dftgc2), d->ccnt2, reinterpret_cast(dftc2));
+ removeMean(dftc, reinterpret_cast(dftgc2.get()), d->ccnt2, reinterpret_cast(dftc2.get()));
for (int h = 0; h < d->ccnt2; h += 2) {
- const float psd = reinterpret_cast(dftc)[h] * reinterpret_cast(dftc)[h] + reinterpret_cast(dftc)[h + 1] * reinterpret_cast(dftc)[h + 1];
- d->sigmas[h] += psd;
+ const float psd = dftc[h + 0] * dftc[h + 0] + dftc[h + 1] * dftc[h + 1];
+ d->sigmas[h + 0] += psd;
d->sigmas[h + 1] += psd;
}
-
- vs_aligned_free(dftc);
- vs_aligned_free(dftc2);
}
- vs_aligned_free(hw2);
- vs_aligned_free(dftr);
- vs_aligned_free(dftgc2);
- delete[] npts;
-
const float scale = 1.0f / nnpoints;
for (int h = 0; h < d->ccnt2; h++)
- d->sigmas[h] *= scale * (wscale2 / wscale) * alpha;
+ d->sigmas[h] = d->sigmas[h] * scale * (wscale2 / wscale) * alpha;
}
+
+ const unsigned numThreads = vsapi->getCoreInfo(core)->numThreads;
+ d->ebuff.reserve(numThreads);
+ d->dftr.reserve(numThreads);
+ d->dftc.reserve(numThreads);
+ d->dftc2.reserve(numThreads);
+ } catch (const char * error) {
+ vsapi->setError(out, ("DFTTest: "s + error).c_str());
+ vsapi->freeNode(d->node);
+ return;
} catch (const std::string & error) {
vsapi->setError(out, ("DFTTest: " + error).c_str());
vsapi->freeNode(d->node);
diff --git a/DFTTest/DFTTest.h b/DFTTest/DFTTest.h
new file mode 100644
index 0000000..8cc99aa
--- /dev/null
+++ b/DFTTest/DFTTest.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+
+using unique_float = std::unique_ptr;
+using unique_fftwf_complex = std::unique_ptr;
+using unique_VSFrameRef = std::unique_ptr;
+
+struct DFTTestData final {
+ VSNodeRef * node;
+ const VSVideoInfo * vi;
+ int sbsize, sosize, tbsize, tosize, swin, twin;
+ double sbeta, tbeta;
+ float f0beta;
+ bool zmean, process[3];
+ float srcScale, dstScale;
+ int barea, bvolume, ccnt, ccnt2, type, sbd1, inc, peak;
+ bool uf0b;
+ const VSFormat * padFormat;
+ int padWidth[3], padHeight[3], eheight[3];
+ unique_float hw{ nullptr, nullptr }, sigmas{ nullptr, nullptr }, sigmas2{ nullptr, nullptr }, pmins{ nullptr, nullptr }, pmaxs{ nullptr, nullptr };
+ unique_fftwf_complex dftgc{ nullptr, nullptr };
+ std::unique_ptr ft{ nullptr, nullptr }, fti{ nullptr, nullptr };
+ std::unordered_map ebuff;
+ std::unordered_map dftr;
+ std::unordered_map dftc, dftc2;
+ void (*copyPad)(const VSFrameRef * src, VSFrameRef * dst[3], const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+ void (*filterCoeffs)(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
+ void (*func_0)(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+ void (*func_1)(VSFrameRef * src[15][3], VSFrameRef * dst, const int pos, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept;
+};
diff --git a/DFTTest/DFTTest.hpp b/DFTTest/DFTTest.hpp
deleted file mode 100644
index 6682fa1..0000000
--- a/DFTTest/DFTTest.hpp
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#include
-#include
-
-#include
-#include
-
-#include
-
-struct DFTTestData {
- VSNodeRef * node;
- const VSVideoInfo * vi;
- int sbsize, sosize, tbsize, tosize, swin, twin;
- float sbeta, tbeta, f0beta;
- bool zmean, process[3];
- float divisor, multiplier;
- int peak, barea, bvolume, ccnt, type, sbd1, ccnt2, inc;
- bool uf0b;
- const VSFormat * padFormat;
- int padWidth[3], padHeight[3], eheight[3];
- float * hw, * sigmas, * sigmas2, * pmins, * pmaxs;
- fftwf_complex * dftgc;
- fftwf_plan ft, fti;
- std::unordered_map ebuff;
- std::unordered_map dftr;
- std::unordered_map dftc, dftc2;
- void (*copyPad)(const VSFrameRef *, VSFrameRef * [3], const DFTTestData *, const VSAPI *) noexcept;
- void (*filterCoeffs)(float *, const float *, const int, const float *, const float *, const float *) noexcept;
- void (*func_0)(VSFrameRef * [3], VSFrameRef *, const DFTTestData *, const VSAPI *) noexcept;
- void (*func_1)(VSFrameRef * [15][3], VSFrameRef *, const int, const DFTTestData *, const VSAPI *) noexcept;
-};
diff --git a/DFTTest/DFTTest.vcxproj b/DFTTest/DFTTest.vcxproj
index dd8aee8..20e849a 100644
--- a/DFTTest/DFTTest.vcxproj
+++ b/DFTTest/DFTTest.vcxproj
@@ -7,7 +7,6 @@
- true
16.0
{23B9F54B-763D-491C-A201-657918CD1377}
Win32Proj
@@ -32,16 +31,17 @@
- C:\Program Files\VapourSynth\sdk\include\vapoursynth;C:\fftw-3.3.8-dll64;$(IncludePath)
- C:\fftw-3.3.8-dll64;$(LibraryPath)
+ C:\Program Files\VapourSynth\sdk\include\vapoursynth;$(IncludePath)
- VS_TARGET_CPU_X86;_CRT_SECURE_NO_WARNINGS;NDEBUG;%(PreprocessorDefinitions)
+ DFTTEST_X86;_CRT_SECURE_NO_WARNINGS;NDEBUG;%(PreprocessorDefinitions)
Level3
true
false
+ false
true
+ stdcpp17
Windows
@@ -55,11 +55,14 @@
AdvancedVectorExtensions2
+
+ AdvancedVectorExtensions512
+
-
+
-
+
diff --git a/DFTTest/DFTTest.vcxproj.filters b/DFTTest/DFTTest.vcxproj.filters
index b8ccc71..011a3d5 100644
--- a/DFTTest/DFTTest.vcxproj.filters
+++ b/DFTTest/DFTTest.vcxproj.filters
@@ -24,12 +24,15 @@
Source Files
-
+
+ Source Files
+
+
Source Files
-
+
Header Files
diff --git a/DFTTest/DFTTest_AVX2.cpp b/DFTTest/DFTTest_AVX2.cpp
index b9b1211..ddd859d 100644
--- a/DFTTest/DFTTest_AVX2.cpp
+++ b/DFTTest/DFTTest_AVX2.cpp
@@ -1,52 +1,27 @@
-#ifdef VS_TARGET_CPU_X86
-#ifndef __AVX2__
-#define __AVX2__
-#endif
-
-#include "DFTTest.hpp"
+#ifdef DFTTEST_X86
+#include "DFTTest.h"
-#include "vectorclass/vectormath_exp.h"
+#include "VCL2/vectormath_exp.h"
-template
-static inline void proc0(const T * s0, const float * s1, float * d, const int p0, const int p1, const float divisor) noexcept;
-
-template<>
-inline void proc0(const uint8_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept {
+template
+static inline auto proc0(const pixel_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float srcScale) noexcept {
for (int u = 0; u < p1; u++) {
- for (int v = 0; v < p1; v += 8) {
- const Vec8f s0 = to_float(Vec8i().load_8uc(_s0 + v));
- const Vec8f s1 = Vec8f().load(_s1 + v);
- (s0 * s1).store(d + v);
- }
+ for (int v = 0; v < p1; v += Vec8f().size()) {
+ Vec8f s0;
- _s0 += p0;
- _s1 += p1;
- d += p1;
- }
-}
+ if constexpr (std::is_same_v)
+ s0 = to_float(Vec8i().load_8uc(_s0 + v));
+ else if constexpr (std::is_same_v)
+ s0 = to_float(Vec8i().load_8us(_s0 + v));
+ else
+ s0 = Vec8f().load(_s0 + v);
-template<>
-inline void proc0(const uint16_t * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept {
- for (int u = 0; u < p1; u++) {
- for (int v = 0; v < p1; v += 8) {
- const Vec8f s0 = to_float(Vec8i().load_8us(_s0 + v));
const Vec8f s1 = Vec8f().load(_s1 + v);
- (s0 * divisor * s1).store(d + v);
- }
- _s0 += p0;
- _s1 += p1;
- d += p1;
- }
-}
-
-template<>
-inline void proc0(const float * _s0, const float * _s1, float * d, const int p0, const int p1, const float divisor) noexcept {
- for (int u = 0; u < p1; u++) {
- for (int v = 0; v < p1; v += 8) {
- const Vec8f s0 = Vec8f().load(_s0 + v);
- const Vec8f s1 = Vec8f().load(_s1 + v);
- (s0 * 255.0f * s1).store(d + v);
+ if constexpr (std::is_same_v)
+ (s0 * s1).store(d + v);
+ else
+ (s0 * srcScale * s1).store(d + v);
}
_s0 += p0;
@@ -55,9 +30,9 @@ inline void proc0(const float * _s0, const float * _s1, float * d, const int p0,
}
}
-static inline void proc1(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept {
+static inline auto proc1(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept {
for (int u = 0; u < p0; u++) {
- for (int v = 0; v < p0; v += 8) {
+ for (int v = 0; v < p0; v += Vec8f().size()) {
const Vec8f s0 = Vec8f().load(_s0 + v);
const Vec8f s1 = Vec8f().load(_s1 + v);
const Vec8f d = Vec8f().load(_d + v);
@@ -70,13 +45,13 @@ static inline void proc1(const float * _s0, const float * _s1, float * _d, const
}
}
-static inline void proc1Partial(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept {
- const int regularPart = p0 & -8;
+static inline auto proc1Partial(const float * _s0, const float * _s1, float * _d, const int p0, const int p1) noexcept {
+ const int regularPart = p0 & ~(Vec8f().size() - 1);
for (int u = 0; u < p0; u++) {
int v;
- for (v = 0; v < regularPart; v += 8) {
+ for (v = 0; v < regularPart; v += Vec8f().size()) {
const Vec8f s0 = Vec8f().load(_s0 + v);
const Vec8f s1 = Vec8f().load(_s1 + v);
const Vec8f d = Vec8f().load(_d + v);
@@ -94,10 +69,10 @@ static inline void proc1Partial(const float * _s0, const float * _s1, float * _d
}
}
-static inline void removeMean(float * _dftc, const float * _dftgc, const int ccnt, float * _dftc2) noexcept {
+static inline auto removeMean(float * _dftc, const float * _dftgc, const int ccnt, float * _dftc2) noexcept {
const Vec8f gf = _dftc[0] / _dftgc[0];
- for (int h = 0; h < ccnt; h += 8) {
+ for (int h = 0; h < ccnt; h += Vec8f().size()) {
const Vec8f dftgc = Vec8f().load_a(_dftgc + h);
const Vec8f dftc = Vec8f().load_a(_dftc + h);
const Vec8f dftc2 = gf * dftgc;
@@ -106,8 +81,8 @@ static inline void removeMean(float * _dftc, const float * _dftgc, const int ccn
}
}
-static inline void addMean(float * _dftc, const int ccnt, const float * _dftc2) noexcept {
- for (int h = 0; h < ccnt; h += 8) {
+static inline auto addMean(float * _dftc, const int ccnt, const float * _dftc2) noexcept {
+ for (int h = 0; h < ccnt; h += Vec8f().size()) {
const Vec8f dftc = Vec8f().load_a(_dftc + h);
const Vec8f dftc2 = Vec8f().load_a(_dftc2 + h);
(dftc + dftc2).store_a(_dftc + h);
@@ -115,133 +90,65 @@ static inline void addMean(float * _dftc, const int ccnt, const float * _dftc2)
}
template
-void filter_avx2(float * dftc, const float * sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept;
-
-template<>
-void filter_avx2<0>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 8) {
- const Vec8f dftc = Vec8f().load_a(_dftc + h);
- const Vec8f sigmas = Vec8f().load_a(_sigmas + h);
- const Vec8f dftcSquare = dftc * dftc;
- const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare);
- const Vec8f mult = max((psd - sigmas) * approx_recipr(psd + 1e-15f), zero_8f());
- (dftc * mult).store_a(_dftc + h);
- }
-}
-
-template<>
-void filter_avx2<1>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 8) {
- const Vec8f dftc = Vec8f().load_a(_dftc + h);
- const Vec8f sigmas = Vec8f().load_a(_sigmas + h);
- const Vec8f dftcSquare = dftc * dftc;
- const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare);
- select(psd < sigmas, zero_8f(), dftc).store_a(_dftc + h);
- }
-}
-
-template<>
-void filter_avx2<2>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 8) {
- const Vec8f dftc = Vec8f().load_a(_dftc + h);
- const Vec8f sigmas = Vec8f().load_a(_sigmas + h);
- (dftc * sigmas).store_a(_dftc + h);
- }
-}
-
-template<>
-void filter_avx2<3>(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * _sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 8) {
- const Vec8f dftc = Vec8f().load_a(_dftc + h);
- const Vec8f sigmas = Vec8f().load_a(_sigmas + h);
- const Vec8f sigmas2 = Vec8f().load_a(_sigmas2 + h);
- const Vec8f pmin = Vec8f().load_a(_pmin + h);
- const Vec8f pmax = Vec8f().load_a(_pmax + h);
- const Vec8f dftcSquare = dftc * dftc;
- const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare);
- select(psd >= pmin && psd <= pmax, dftc * sigmas, dftc * sigmas2).store_a(_dftc + h);
- }
-}
-
-template<>
-void filter_avx2<4>(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 8) {
- const Vec8f dftc = Vec8f().load_a(_dftc + h);
- const Vec8f sigmas = Vec8f().load_a(_sigmas + h);
- const Vec8f pmin = Vec8f().load_a(_pmin + h);
- const Vec8f pmax = Vec8f().load_a(_pmax + h);
- const Vec8f dftcSquare = dftc * dftc;
- const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare) + 1e-15f;
- const Vec8f mult = sigmas * sqrt(psd * pmax * approx_recipr((psd + pmin) * (psd + pmax)));
- (dftc * mult).store_a(_dftc + h);
- }
-}
+inline void filter_avx2(float * _dftc, const float * _sigmas, const int ccnt, const float * _pmin, const float * _pmax, const float * _sigmas2) noexcept {
+ const Vec8f beta = _pmin[0];
-template<>
-void filter_avx2<5>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- const Vec8f beta = pmin[0];
+ for (int h = 0; h < ccnt; h += Vec8f().size()) {
+ Vec8f dftc, psd, sigmas, pmin, pmax, mult;
- for (int h = 0; h < ccnt; h += 8) {
- const Vec8f dftc = Vec8f().load_a(_dftc + h);
- const Vec8f sigmas = Vec8f().load_a(_sigmas + h);
- const Vec8f dftcSquare = dftc * dftc;
- const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare);
- const Vec8f mult = pow(max((psd - sigmas) * approx_recipr(psd + 1e-15f), zero_8f()), beta);
- (dftc * mult).store_a(_dftc + h);
- }
-}
+ dftc = Vec8f().load_a(_dftc + h);
+ sigmas = Vec8f().load_a(_sigmas + h);
-template<>
-void filter_avx2<6>(float * _dftc, const float * _sigmas, const int ccnt, const float * pmin, const float * pmax, const float * sigmas2) noexcept {
- for (int h = 0; h < ccnt; h += 8) {
- const Vec8f dftc = Vec8f().load_a(_dftc + h);
- const Vec8f sigmas = Vec8f().load_a(_sigmas + h);
- const Vec8f dftcSquare = dftc * dftc;
- const Vec8f psd = dftcSquare + permute8f<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare);
- const Vec8f mult = sqrt(max((psd - sigmas) * approx_recipr(psd + 1e-15f), zero_8f()));
- (dftc * mult).store_a(_dftc + h);
- }
-}
-
-template
-static void cast(const float * ebp, T * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept;
-
-template<>
-void cast(const float * ebp, uint8_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept {
- for (int y = 0; y < dstHeight; y++) {
- for (int x = 0; x < dstWidth; x += 16) {
- const Vec8i srcp_8i_1 = truncate_to_int(Vec8f().load(ebp + x) + 0.5f);
- const Vec8i srcp_8i_2 = truncate_to_int(Vec8f().load(ebp + x + 8) + 0.5f);
- const Vec16s srcp_16s = compress_saturated(srcp_8i_1, srcp_8i_2);
- const Vec16uc srcp = compress_saturated_s2u(srcp_16s.get_low(), srcp_16s.get_high());
- srcp.stream(dstp + x);
+ if constexpr (type != 2) {
+ const Vec8f dftcSquare = dftc * dftc;
+ psd = dftcSquare + permute8<1, 0, 3, 2, 5, 4, 7, 6>(dftcSquare);
}
- ebp += ebpStride;
- dstp += dstStride;
- }
-}
+ if constexpr (type == 3 || type == 4) {
+ pmin = Vec8f().load_a(_pmin + h);
+ pmax = Vec8f().load_a(_pmax + h);
+ }
-template<>
-void cast(const float * ebp, uint16_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept {
- for (int y = 0; y < dstHeight; y++) {
- for (int x = 0; x < dstWidth; x += 8) {
- const Vec8i srcp_8i = truncate_to_int(mul_add(Vec8f().load(ebp + x), multiplier, 0.5f));
- const Vec8us srcp = compress_saturated_s2u(srcp_8i.get_low(), srcp_8i.get_high());
- min(srcp, peak).stream(dstp + x);
+ if constexpr (type == 0) {
+ mult = max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_8f());
+ } else if constexpr (type == 1) {
+ dftc = select(psd < sigmas, zero_8f(), dftc);
+ } else if constexpr (type == 2) {
+ dftc *= sigmas;
+ } else if constexpr (type == 3) {
+ const Vec8f sigmas2 = Vec8f().load_a(_sigmas2 + h);
+ dftc = select(psd >= pmin && psd <= pmax, dftc * sigmas, dftc * sigmas2);
+ } else if constexpr (type == 4) {
+ mult = sigmas * sqrt(psd * pmax * rcp_nr(mul_add(psd + pmin, psd + pmax, 1e-15f)));
+ } else if constexpr (type == 5) {
+ mult = pow(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_8f()), beta);
+ } else {
+ mult = sqrt(max((psd - sigmas) * rcp_nr(psd + 1e-15f), zero_8f()));
}
- ebp += ebpStride;
- dstp += dstStride;
+ if constexpr (type == 0 || type > 3)
+ dftc *= mult;
+
+ dftc.store_a(_dftc + h);
}
}
-template<>
-void cast(const float * ebp, float * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float multiplier, const int peak) noexcept {
+template
+static auto cast(const float * ebp, pixel_t * dstp, const int dstWidth, const int dstHeight, const int dstStride, const int ebpStride, const float dstScale, const int peak) noexcept {
for (int y = 0; y < dstHeight; y++) {
- for (int x = 0; x < dstWidth; x += 8) {
- const Vec8f srcp = Vec8f().load(ebp + x);
- (srcp * (1.0f / 255.0f)).stream(dstp + x);
+ for (int x = 0; x < dstWidth; x += Vec8f().size()) {
+ if constexpr (std::is_same_v) {
+ const Vec8i srcp = truncatei(Vec8f().load(ebp + x) + 0.5f);
+ const auto result = compress_saturated_s2u(compress_saturated(srcp, zero_si256()), zero_si256()).get_low();
+ result.storel(dstp + x);
+ } else if constexpr (std::is_same_v) {
+ const Vec8i srcp = truncatei(mul_add(Vec8f().load(ebp + x), dstScale, 0.5f));
+ const auto result = compress_saturated_s2u(srcp, zero_si256()).get_low();
+ min(result, peak).store_nt(dstp + x);
+ } else {
+ const Vec8f srcp = Vec8f().load(ebp + x) * dstScale;
+ srcp.store_nt(dstp + x);
+ }
}
ebp += ebpStride;
@@ -249,47 +156,56 @@ void cast(const float * ebp, float * dstp, const int dstWidth, const int dstHeig
}
}
-template
-void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * d, const VSAPI * vsapi) noexcept {
+template
+void func_0_avx2(VSFrameRef * src[3], VSFrameRef * dst, const DFTTestData * const VS_RESTRICT d, const VSAPI * vsapi) noexcept {
+ const float * hw = d->hw.get();
+ const float * sigmas = d->sigmas.get();
+ const float * sigmas2 = d->sigmas2.get();
+ const float * pmins = d->pmins.get();
+ const float * pmaxs = d->pmaxs.get();
+ const fftwf_complex * dftgc = d->dftgc.get();
+ fftwf_plan ft = d->ft.get();
+ fftwf_plan fti = d->fti.get();
+
const auto threadId = std::this_thread::get_id();
- float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId), 0));
- float * dftr = d->dftr.at(threadId);
- fftwf_complex * dftc = d->dftc.at(threadId);
- fftwf_complex * dftc2 = d->dftc2.at(threadId);
+ float * ebuff = reinterpret_cast(vsapi->getWritePtr(d->ebuff.at(threadId).get(), 0));
+ float * dftr = d->dftr.at(threadId).get();
+ fftwf_complex * dftc = d->dftc.at(threadId).get();
+ fftwf_complex * dftc2 = d->dftc2.at(threadId).get();
for (int plane = 0; plane < d->vi->format->numPlanes; plane++) {
if (d->process[plane]) {
const int width = d->padWidth[plane];
const int height = d->padHeight[plane];
const int eheight = d->eheight[plane];
- const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(T);
- const int ebpStride = vsapi->getStride(d->ebuff.at(threadId), 0) / sizeof(float);
- const T * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0));
+ const int srcStride = vsapi->getStride(src[plane], 0) / sizeof(pixel_t);
+ const int ebpStride = vsapi->getStride(d->ebuff.at(threadId).get(), 0) / sizeof(float);
+ const pixel_t * srcp = reinterpret_cast(vsapi->getReadPtr(src[plane], 0));
float * ebpSaved = ebuff;
memset(ebuff, 0, ebpStride * height * sizeof(float));
for (int y = 0; y < eheight; y += d->inc) {
for (int x = 0; x <= width - d->sbsize; x += d->inc) {
- proc0(srcp + x, d->hw, dftr, srcStride, d->sbsize, d->divisor);
+ proc0(srcp + x, hw, dftr, srcStride, d->sbsize, d->srcScale);
- fftwf_execute_dft_r2c(d->ft, dftr, dftc);
+ fftwf_execute_dft_r2c(ft, dftr, dftc);
if (d->zmean)
- removeMean(reinterpret_cast(dftc), reinterpret_cast(d->dftgc), d->ccnt2, reinterpret_cast(dftc2));
+ removeMean(reinterpret_cast