diff --git a/bestla/bestla/bestla_parallel.h b/bestla/bestla/bestla_parallel.h index 7e104a3ae..9b424f5c3 100644 --- a/bestla/bestla/bestla_parallel.h +++ b/bestla/bestla/bestla_parallel.h @@ -599,7 +599,10 @@ class IThreading { #if BTLA_OPENMP class OMPThreading : public IThreading { public: - explicit OMPThreading(int nthreads) : IThreading(nthreads) { omp_set_num_threads(nthreads); } + explicit OMPThreading(int nthreads) : IThreading(nthreads) { + printf("Using OMP\n"); + omp_set_num_threads(nthreads); + } void parallel_for(const thread_func& func) override { if (mThreadNum > 1) { #pragma omp parallel @@ -624,7 +627,10 @@ class OMPThreading : public IThreading { class StdThreading : public IThreading { public: - explicit StdThreading(int nthreads) : IThreading(nthreads) { create_threads(); } + explicit StdThreading(int nthreads) : IThreading(nthreads) { + printf("Using Std\n"); + create_threads(); + } void parallel_for(const thread_func& func) override { if (mThreadNum > 1) { running.store(mThreadNum - 1); diff --git a/bestla/bestla/ut/bestla_parallel.cpp b/bestla/bestla/ut/bestla_parallel.cpp index f3b8b1669..13a863946 100644 --- a/bestla/bestla/ut/bestla_parallel.cpp +++ b/bestla/bestla/ut/bestla_parallel.cpp @@ -27,7 +27,7 @@ class UT_OMPThreading { kernel::wrapper::Transpose2D::template forward(src.data(), ref.data(), row, col, col, row); parallel::Scheduler2D _para({threads, row, col, 1, 1}); - DefaultThreading.parallel_for([&](int tidx) { + UT_Threading::get()->parallel_for([&](int tidx) { parallel::ThreadProblem2D thdp{tidx}; _para.getIndex(thdp); if (thdp.valid) { @@ -61,7 +61,7 @@ class UT_StdThreading { kernel::wrapper::Transpose2D::template forward(src.data(), ref.data(), row, col, col, row); parallel::Scheduler2D _para({threads, row, col, 1, 1}); - DefaultThreading.parallel_for([&](int tidx) { + UT_Threading::get()->parallel_for([&](int tidx) { parallel::ThreadProblem2D thdp{tidx}; _para.getIndex(thdp); if (thdp.valid) { diff --git a/bestla/bestla/ut/bestla_prologue_a.cpp b/bestla/bestla/ut/bestla_prologue_a.cpp index c0ae19c4c..5f7795479 100644 --- a/bestla/bestla/ut/bestla_prologue_a.cpp +++ b/bestla/bestla/ut/bestla_prologue_a.cpp @@ -131,7 +131,7 @@ class UT_ActivationU8KBlockQuantize { auto quanAct = actA.createStorage(m, k, kblock, hasreduce); avector bufA(quanAct.mSize); quanAct.assign(bufA.data()); - actA.quantize({raw.data(), lda, &quanAct}, m, k, &DefaultThreading); + actA.quantize({raw.data(), lda, &quanAct}, m, k, UT_Threading::get()); ut::buffer_error(q.data(), quanAct.template APtr(), q.size(), uint8_t(1)); ut::buffer_error(zp.data(), quanAct.template ZPtr(), zp.size(), uint8_t(1)); @@ -186,7 +186,7 @@ class UT_ActivationS8KBlockQuantize { auto quanAct = actA.createStorage(m, k, kblock, hasreduce); avector bufA(quanAct.mSize); quanAct.assign(bufA.data()); - actA.quantize({raw.data(), k, &quanAct}, m, k, &DefaultThreading); + actA.quantize({raw.data(), k, &quanAct}, m, k, UT_Threading::get()); ut::buffer_error(q.data(), quanAct.template APtr(), q.size(), int8_t(1)); if (hasreduce) { avector redref(reduce.size(), 0.f), redqref(reduce.size(), 0.f); @@ -235,7 +235,7 @@ class UT_ShuffleActivationKblock { auto reordA = kernel.createReorderStorage(m, k, 32); avector bufA(reordA.mSize); reordA.assign(bufA.data()); - kernel.preprocess({src.data(), k, nullptr, indices.data(), &reordA}, m, k, 32, &DefaultThreading); + kernel.preprocess({src.data(), k, nullptr, indices.data(), &reordA}, m, k, 32, UT_Threading::get()); kernel.getActivation(&dstptr, &dststride, {src.data(), k, nullptr, indices.data(), &reordA}, m, kpad, 0, 0, cache, CacheSize); @@ -272,7 +272,7 @@ class UT_ShuffleActivationKblock { avector bufA(quanAct.mSize + reordAct.mSize); quanAct.assign(bufA.data()); reordAct.assign(bufA.data() + quanAct.mSize); - actA.quantize({raw_cp.data(), k, &quanAct, indices.data(), &reordAct}, m, k, &DefaultThreading); + actA.quantize({raw_cp.data(), k, &quanAct, indices.data(), &reordAct}, m, k, UT_Threading::get()); ut::buffer_error(quanAct.template APtr(), q.data(), q.size(), int8_t(1)); if (hasreduce) { avector redref(reduce.size(), 0.f), redqref(reduce.size(), 0.f); diff --git a/bestla/bestla/ut/bestla_prologue_b.cpp b/bestla/bestla/ut/bestla_prologue_b.cpp index 34915a44c..9091c6205 100644 --- a/bestla/bestla/ut/bestla_prologue_b.cpp +++ b/bestla/bestla/ut/bestla_prologue_b.cpp @@ -69,11 +69,11 @@ class UT_BlockQunatize_INT8 { auto ptr = kernel.createStorage(n, k, blocksize, BTLA_DTYPE::S8, bestla_dtype, bestla_dtype, asym); avector buffer(ptr.mSize); ptr.assign(buffer.data()); - kernel.packWeight(n, k, dequanRef.data(), ldb, &ptr, &DefaultThreading); + kernel.packWeight(n, k, dequanRef.data(), ldb, &ptr, UT_Threading::get()); avector dequant(n * k); - kernel.unpackWeight(n, k, &ptr, dequant.data(), n, &DefaultThreading); + kernel.unpackWeight(n, k, &ptr, dequant.data(), n, UT_Threading::get()); avector ws8(n * k); - kernel.unpackWeight(n, k, &ptr, ws8.data(), n, &DefaultThreading); + kernel.unpackWeight(n, k, &ptr, ws8.data(), n, UT_Threading::get()); ut::buffer_error(quanW.data(), ws8.data(), ws8.size(), (int8_t)1); ut::buffer_error(dequanRef.data(), dequant.data(), dequanRef.size(), 0.01f); } @@ -119,13 +119,13 @@ class UT_BlockQunatize_INT8 { auto ptr = kernel.createStorage(n, k, blocksize, BTLA_DTYPE::S8, bestla_dtype, bestla_dtype, asym); avector buffer(ptr.mSize); ptr.assign(buffer.data()); - kernel.packTransposeWeight(n, k, dequanT.data(), k, &ptr, &DefaultThreading); + kernel.packTransposeWeight(n, k, dequanT.data(), k, &ptr, UT_Threading::get()); avector dequant(n * k), tardequanT(k * n); - kernel.unpackWeight(n, k, &ptr, dequant.data(), n, &DefaultThreading); - kernel.unpackTransposeWeight(n, k, &ptr, tardequanT.data(), k, &DefaultThreading); + kernel.unpackWeight(n, k, &ptr, dequant.data(), n, UT_Threading::get()); + kernel.unpackTransposeWeight(n, k, &ptr, tardequanT.data(), k, UT_Threading::get()); ut::buffer_error(dequanT.data(), tardequanT.data(), tardequanT.size(), 0.01f); avector ws8(n * k); - kernel.unpackWeight(n, k, &ptr, ws8.data(), n, &DefaultThreading); + kernel.unpackWeight(n, k, &ptr, ws8.data(), n, UT_Threading::get()); ut::buffer_error(quanW.data(), ws8.data(), ws8.size(), (int8_t)1); ut::buffer_error(dequanRef.data(), dequant.data(), dequanRef.size(), 0.01f); } @@ -160,12 +160,12 @@ class UT_BlockQunatize_F8 { avector ref_buffer(ptr.mSize); ptr.assign(buffer.data()); ref_ptr.assign(ref_buffer.data()); - kernel.packWeight(n, k, raw.data(), ldb, &ptr, &DefaultThreading); - ref_ker.packWeight(n, k, raw.data(), ldb, &ref_ptr, &DefaultThreading); + kernel.packWeight(n, k, raw.data(), ldb, &ptr, UT_Threading::get()); + ref_ker.packWeight(n, k, raw.data(), ldb, &ref_ptr, UT_Threading::get()); avector dequant(n * k, 0); avector ref_dequant(n * k, 0); - kernel.unpackWeight(n, k, &ptr, dequant.data(), n, &DefaultThreading); - ref_ker.unpackWeight(n, k, &ref_ptr, ref_dequant.data(), n, &DefaultThreading); + kernel.unpackWeight(n, k, &ptr, dequant.data(), n, UT_Threading::get()); + ref_ker.unpackWeight(n, k, &ref_ptr, ref_dequant.data(), n, UT_Threading::get()); ut::buffer_error(ref_dequant.data(), dequant.data(), dequant.size(), 0.01f); } }; @@ -351,10 +351,10 @@ class UT_TransposeBlockQuantize_F4 { avector buf(packedW.mSize), buf1(packedW1.mSize); packedW.assign(buf.data()); packedW1.assign(buf1.data()); - kernel.packTransposeWeight(n, k, dequanRef.data(), k, &packedW, &DefaultThreading); - kernel.packQWeight(n, k, quanW.data(), ldb, scales.data(), nullptr, &packedW1, &DefaultThreading); + kernel.packTransposeWeight(n, k, dequanRef.data(), k, &packedW, UT_Threading::get()); + kernel.packQWeight(n, k, quanW.data(), ldb, scales.data(), nullptr, &packedW1, UT_Threading::get()); avector dequant(n * k); - kernel.unpackTransposeWeight(n, k, &packedW1, dequant.data(), k, &DefaultThreading); + kernel.unpackTransposeWeight(n, k, &packedW1, dequant.data(), k, UT_Threading::get()); if (SCA_T != BTLA_DTYPE::DQ8_BNB) { ut::buffer_error(packedW.SPtr(), packedW1.SPtr(), packedW1.CSize()); ut::buffer_error(dequanRef.data(), dequant.data(), dequant.size()); @@ -417,11 +417,11 @@ class UT_BlockQuantize_INT4 { auto packedW = kernel.createStorage(n, k, blocksize, qtype, bestla_dtype, bestla_dtype, asym); avector buffer(packedW.mSize); packedW.assign(buffer.data()); - kernel.packWeight(n, k, dequant.data(), ldb, &packedW, &DefaultThreading); + kernel.packWeight(n, k, dequant.data(), ldb, &packedW, UT_Threading::get()); avector unpackf32(dequant.size()); avector unpack512f32(dequant.size()); - kernel.unpackWeight(n, k, &packedW, unpackf32.data(), n, &DefaultThreading); - kernel512.unpackWeight(n, k, &packedW, unpack512f32.data(), n, &DefaultThreading); + kernel.unpackWeight(n, k, &packedW, unpackf32.data(), n, UT_Threading::get()); + kernel512.unpackWeight(n, k, &packedW, unpack512f32.data(), n, UT_Threading::get()); ut::buffer_error(unpackf32.data(), unpack512f32.data(), unpackf32.size(), 0.01f); } void ut_512vnni(int n, int k, int blocksize, BTLA_DTYPE qtype, bool asym = false) { @@ -459,9 +459,9 @@ class UT_BlockQuantize_INT4 { auto packedW = kernel.createStorage(n, k, blocksize, qtype, bestla_dtype, bestla_dtype, asym); avector buffer(packedW.mSize); packedW.assign(buffer.data()); - kernel.packWeight(n, k, dequant.data(), ldb, &packedW, &DefaultThreading); + kernel.packWeight(n, k, dequant.data(), ldb, &packedW, UT_Threading::get()); avector unpackf32(dequant.size()); - kernel.unpackWeight(n, k, &packedW, unpackf32.data(), n, &DefaultThreading); + kernel.unpackWeight(n, k, &packedW, unpackf32.data(), n, UT_Threading::get()); int lsb = 16; float err_thres = lsb * 0.01f; // lsb*max_scale ut::buffer_error(dequant.data(), unpackf32.data(), dequant.size(), err_thres); @@ -549,7 +549,7 @@ class UT_ShuffleIndices { } avector buf0(packedW.mSize), buf1(packedW.mSize); packedW.assign(buf0.data()); - ProWei.setShuffleIndices(groupindices.data(), &packedW, &DefaultThreading); + ProWei.setShuffleIndices(groupindices.data(), &packedW, UT_Threading::get()); buffer_error(reflut.data(), packedW.ShfIndice(), reflut.size()); storage::gemm::StorageWeightKBlockNInteger tmp(GemmCore::ID); @@ -585,7 +585,7 @@ class UT_ShuffleIndices { rordA.assign(bufA.data()); typename Launcher::Param args{ gp, {aarray.data(), k, nullptr, wptr_->ShfIndice(), &rordA}, {wptr_}, {output.data(), n}}; - parallel::GemmRunWithA>(kernel, args, &DefaultThreading); + parallel::GemmRunWithA>(kernel, args, UT_Threading::get()); } else { using Launcher = @@ -603,7 +603,7 @@ class UT_ShuffleIndices { redA.template RPtr(), redA.lda}; typename Launcher::Param args{ gp, {aarray.data(), k, &redA, wptr_->ShfIndice(), &rordA}, {wptr_}, blkargs, {output.data(), n}}; - parallel::GemmRunWithA>(kernel, args, &DefaultThreading); + parallel::GemmRunWithA>(kernel, args, UT_Threading::get()); } ut::buffer_error(output.data(), oarray.data(), output.size()); @@ -741,9 +741,9 @@ class UT_CompFp32 { avector matBf32(k * n), matAf32(m * k), matC(m * n), refC(m * n), refCupk(m * n); fill_buffer_randn(matBf32.data(), matBf32.size(), -0.5f, 0.5f); fill_buffer_randn(matAf32.data(), matAf32.size(), -0.5f, 0.5f); - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refCupk.data(), k, n, n); utils::GemmProblem gp(1, m, n, k, blocksize); typename Launcher::Param args{gp, @@ -751,7 +751,7 @@ class UT_CompFp32 { {&packedw}, {packedw.template SPtr(), packedw.SDtype(), packedw.CStep()}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); auto err = INT8_ERR; auto dbits = bestla_dtype_bits(qtype); auto type = bestla_dtype_type(qtype); @@ -788,9 +788,9 @@ class UT_CompFp32 { avector matBf32(k * n), matAf32(m * k), matC(m * n), refC(m * n), refCupk(m * n); fill_buffer_randn(matBf32.data(), matBf32.size(), -0.5f, 0.5f); fill_buffer_randn(matAf32.data(), matAf32.size(), -0.5f, 0.5f); - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refCupk.data(), k, n, n); GemmProblem gp(1, m, n, k, blocksize); typename Launcher::Param args{gp, @@ -798,7 +798,7 @@ class UT_CompFp32 { {&packedw}, {packedw.template SPtr(), packedw.SDtype(), packedw.CStep()}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); auto err = FP4_ERR; if (qtype == BTLA_DTYPE::F8_E5M2 || qtype == BTLA_DTYPE::F8_E4M3) err = F8_ERR; @@ -864,7 +864,7 @@ class UTBenchmark_CompFp32 { using Launcher = wrapper::gemm::LauncherBase; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; using WType = typename Wei::StorageWeight; @@ -883,7 +883,7 @@ class UTBenchmark_CompFp32 { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); } - kernel.mProB.packWeight(n, k, B, n, &packBs[0], &DefaultThreading); + kernel.mProB.packWeight(n, k, B, n, &packBs[0], UT_Threading::get()); for (size_t i = 1; i < batch; i++) { memcpy(packBs[i].template WPtr(), packBs[0].template WPtr(), packBs[0].template WSize()); memcpy(packBs[i].template SPtr(), packBs[0].template SPtr(), packBs[0].CSize() * sizeof(Scale_T)); @@ -896,7 +896,7 @@ class UTBenchmark_CompFp32 { log.start(); GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {A + i * m * k, k}, {&packBs[i]}, {C + i * m * n, n}}; - parallel::GemmRun(kernel, args, &DefaultThreading); + parallel::GemmRun(kernel, args, UT_Threading::get()); if (log.stop()) { double flops = double(psize) / log.avg_val / 1e6; double band = double(memsize) / log.avg_val / 1e6; @@ -921,7 +921,7 @@ class UTBenchmark_CompFp32 { prologue_b::gemm::WeightKBlockNInteger, epilogue::gemm::AccumulatorWriteBackFp32>; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; using WType = typename Wei::StorageWeight; @@ -943,7 +943,7 @@ class UTBenchmark_CompFp32 { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); } - kernel.mProB.packWeight(n, k, B, n, &packBs[0], &DefaultThreading); + kernel.mProB.packWeight(n, k, B, n, &packBs[0], UT_Threading::get()); for (size_t i = 1; i < batch; i++) { memcpy(packBs[i].template WPtr(), packBs[0].template WPtr(), packBs[0].template WSize()); memcpy(packBs[i].template SPtr(), packBs[0].template SPtr(), packBs[0].CSize() * sizeof(Scale_T)); @@ -960,8 +960,8 @@ class UTBenchmark_CompFp32 { {&packBs[i]}, // {packBs[i].template SPtr(), packBs[i].SDtype(), packBs[i].CStep()}, {C + i * m * n, n}}; - // parallel::GemmRun(kernel, args, &DefaultThreading); - parallel::GemmRunWithA(kernel, args, &DefaultThreading); + // parallel::GemmRun(kernel, args, UT_Threading::get()); + parallel::GemmRunWithA(kernel, args, UT_Threading::get()); } if (log.stop()) { double t = log.avg_val / batch; @@ -1139,16 +1139,16 @@ class UT_CompInt8 { reduceAf32[i * kblks + j / blocksize] += matAf32[i * k + j]; } } - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refCupk.data(), k, n, n); auto quanA = launcher.mProA.createStorage(m, k, blocksize, isAsym); utils::avector bufferA(quanA.mSize); quanA.assign(bufferA.data()); GemmProblem gp(1, m, n, k, blocksize); typename Launcher::Param args{gp, {matAf32.data(), k, &quanA}, {&packedw}, {matC.data(), n}}; - parallel::GemmRunWithA(launcher, args, &DefaultThreading); + parallel::GemmRunWithA(launcher, args, UT_Threading::get()); auto err = INT8_ERR; auto dbits = bestla_dtype_bits(qtype); auto type = bestla_dtype_type(qtype); @@ -1204,9 +1204,9 @@ class UT_CompInt8 { reduceAf32[i * kblks + j / blocksize] += matAf32[i * k + j]; } } - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refCupk.data(), k, n, n); GemmProblem gp(1, m, n, k, blocksize); typename Launcher::Param args{ @@ -1217,7 +1217,7 @@ class UT_CompInt8 { packedw.template RPtr(), packedw.RDtype(), isAsym ? packedw.template ZPtr() : nullptr, isAsym ? reduceAf32.data() : nullptr, blocksize}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); auto err = INT8_ERR; auto dbits = bestla_dtype_bits(qtype); auto type = bestla_dtype_type(qtype); @@ -1272,9 +1272,9 @@ class UT_CompInt8 { reduceAf32[i * kblks + j / blocksize] += matAf32[i * k + j]; } } - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refCupk.data(), k, n, n); GemmProblem gp(1, m, n, k, blocksize); typename Launcher::Param args{ @@ -1285,7 +1285,7 @@ class UT_CompInt8 { quanA.CStep(), quanA.template ZPtr(), packedw.template RPtr(), packedw.RDtype(), packedw.template ZPtr(), quanA.template RPtr(), blocksize}, {matC.data(), n}}; - parallel::GemmRunWithA(launcher, args, &DefaultThreading); + parallel::GemmRunWithA(launcher, args, UT_Threading::get()); auto err = INT8_ERR; auto dbits = bestla_dtype_bits(qtype); auto type = bestla_dtype_type(qtype); @@ -1334,9 +1334,9 @@ class UT_CompInt8 { reduceAf32[i * kblks + j / blocksize] += matAf32[i * k + j]; } } - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refCupk.data(), k, n, n); GemmProblem gp(1, m, n, k, blocksize); typename Launcher::Param args{ @@ -1346,7 +1346,7 @@ class UT_CompInt8 { {packedw.template SPtr(), packedw.SDtype(), packedw.CStep(), scaleAf32.data(), kblks, nullptr, nullptr, bestla_dtype, packedw.template ZPtr(), reduceAf32.data(), blocksize}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); auto err = INT8_ERR; auto dbits = bestla_dtype_bits(qtype); auto type = bestla_dtype_type(qtype); @@ -1446,9 +1446,9 @@ class UT_CompBf16 { for (size_t i = 0; i < matBf32.size(); i++) { matBf32[i] = matBbf16[i]; } - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_bf16bf16fp32(m, n, k, matAbf16.data(), matBbf16.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); for (size_t i = 0; i < matBf32.size(); i++) { matBbf16[i] = static_cast(matBf32[i]); } @@ -1459,7 +1459,7 @@ class UT_CompBf16 { {&packedw}, {packedw.template SPtr(), packedw.SDtype(), packedw.CStep()}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); auto err = get_ut_err(qtype); buffer_error(refC.data(), matC.data(), refC.size(), err); buffer_error(refCupk.data(), matC.data(), refCupk.size(), 0.05f); @@ -1520,7 +1520,7 @@ class UTBenchmark_CompBf16 { using Launcher = wrapper::gemm::LauncherBase; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; using WType = typename Wei::StorageWeight; @@ -1537,7 +1537,7 @@ class UTBenchmark_CompBf16 { for (size_t i = 0; i < batch; i++) { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); - kernel.mProB.packWeight(n, k, B + i * n * k, n, &packBs[i], &DefaultThreading); + kernel.mProB.packWeight(n, k, B + i * n * k, n, &packBs[i], UT_Threading::get()); } auto psize = (size_t)m * n * k * 2; auto memsize = (size_t)packBs[0].mSize + (m * k + m * n) * sizeof(float); @@ -1547,7 +1547,7 @@ class UTBenchmark_CompBf16 { log.start(); GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {A + i * m * k, k}, {&packBs[i]}, {C + i * m * n, n}}; - parallel::GemmRun(kernel, args, &DefaultThreading); + parallel::GemmRun(kernel, args, UT_Threading::get()); if (log.stop()) { double flops = double(psize) / log.avg_val / 1e6; double band = double(memsize) / log.avg_val / 1e6; @@ -1657,7 +1657,7 @@ class UT_ORT_NBits { } } rA.assign(tmpA.data()); - launcher.mProA.reduce({matAf32.data(), k, &rA}, m, k, blocksize, &DefaultThreading); // for reduce UT + launcher.mProA.reduce({matAf32.data(), k, &rA}, m, k, blocksize, UT_Threading::get()); // for reduce UT buffer_error(reduceA.data(), rA.template RPtr(), reduceA.size(), FP32_ERR); memset(tmpA.data(), 0, tmpA.size()); // clear } @@ -1675,11 +1675,11 @@ class UT_ORT_NBits { } } launcher.mProB.packNbitsWeightQ4(n, k, isasym, (uint8_t*)matBs4.data(), k, scalesB.data(), (uint8_t*)zpBs4.data(), - &packedw, &DefaultThreading); - launcher.mProB.reduceWeight(&packedw, &DefaultThreading); + &packedw, UT_Threading::get()); + launcher.mProB.reduceWeight(&packedw, UT_Threading::get()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); avector revB(matBf32.size()); - launcher.mProB.unpackWeight(n, k, &packedw, revB.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, revB.data(), n, UT_Threading::get()); buffer_error(matBf32.data(), revB.data(), revB.size(), FP32_ERR); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), revB.data(), refCupk.data(), k, n, n); GemmProblem gp(1, m, n, k, blocksize); @@ -1691,9 +1691,9 @@ class UT_ORT_NBits { isasym ? packedw.template ZPtr() : nullptr, rA.template RPtr(), rA.lda}, {matC.data(), n}}; if (isasym) { - parallel::GemmRunWithA(launcher, args, &DefaultThreading); + parallel::GemmRunWithA(launcher, args, UT_Threading::get()); } else { - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); } auto err = INT4_ERR; buffer_error(refC.data(), matC.data(), refC.size(), err); @@ -1738,7 +1738,7 @@ class UT_ORT_NBits { } } rA.assign(tmpA.data()); - launcher.mProA.reduce({matAf32.data(), k, &rA}, m, k, blocksize, &DefaultThreading); // for reduce UT + launcher.mProA.reduce({matAf32.data(), k, &rA}, m, k, blocksize, UT_Threading::get()); // for reduce UT buffer_error(reduceA.data(), rA.template RPtr(), reduceA.size(), FP32_ERR); memset(tmpA.data(), 0, tmpA.size()); // clear } @@ -1748,7 +1748,7 @@ class UT_ORT_NBits { } } - launcher.mProB.packQWeight(n, k, qdata.data(), n, sdata.data(), zdata.data(), &packedw, &DefaultThreading); + launcher.mProB.packQWeight(n, k, qdata.data(), n, sdata.data(), zdata.data(), &packedw, UT_Threading::get()); auto bfile = readFile2Buffer("bestla_w3.weight.bin"); WType packedfile(0); @@ -1758,7 +1758,7 @@ class UT_ORT_NBits { buffer_error(packedw.ZPtr(), packedfile.ZPtr(), packedw.CSize()); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), matBf32.data(), refC.data(), k, n, n); avector revB(matBf32.size()); - launcher.mProB.unpackWeight(n, k, &packedw, revB.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, revB.data(), n, UT_Threading::get()); buffer_error(matBf32.data(), revB.data(), revB.size(), FP32_ERR); gemmref_fp32fp32fp32(m, n, k, matAf32.data(), revB.data(), refCupk.data(), k, n, n); GemmProblem gp(1, m, n, k, blocksize); @@ -1770,9 +1770,9 @@ class UT_ORT_NBits { isasym ? packedw.template ZPtr() : nullptr, rA.template RPtr(), rA.lda}, {matC.data(), n}}; if (isasym) { - parallel::GemmRunWithA(launcher, args, &DefaultThreading); + parallel::GemmRunWithA(launcher, args, UT_Threading::get()); } else { - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); } auto err = INT4_ERR; buffer_error(refC.data(), matC.data(), refC.size(), err); @@ -1856,9 +1856,9 @@ class UT_CompFp16 { for (size_t i = 0; i < matBf32.size(); i++) { matBf32[i] = matBbf16[i]; } - launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, &DefaultThreading); + launcher.mProB.packWeight(n, k, matBf32.data(), n, &packedw, UT_Threading::get()); gemmref_bf16bf16fp32(m, n, k, matAbf16.data(), matBbf16.data(), refC.data(), k, n, n); - launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, &DefaultThreading); + launcher.mProB.unpackWeight(n, k, &packedw, matBf32.data(), n, UT_Threading::get()); for (size_t i = 0; i < matBf32.size(); i++) { matBbf16[i] = static_cast(matBf32[i]); } diff --git a/bestla/bestla/ut/bestla_ut.cpp b/bestla/bestla/ut/bestla_ut.cpp index d55ac5b56..a2ac93714 100644 --- a/bestla/bestla/ut/bestla_ut.cpp +++ b/bestla/bestla/ut/bestla_ut.cpp @@ -1,15 +1,6 @@ #include #include -namespace bestla { -namespace ut { -#ifdef _OPENMP -parallel::OMPThreading DefaultThreading(4); -#else -parallel::StdThreading DefaultThreading(4); -#endif // _OPNEMP -} // namespace ut -} // namespace bestla int main() { printf("BesTLA UT done\n"); return 0; diff --git a/bestla/bestla/ut/bestla_ut.h b/bestla/bestla/ut/bestla_ut.h index b570253b1..cd787548d 100644 --- a/bestla/bestla/ut/bestla_ut.h +++ b/bestla/bestla/ut/bestla_ut.h @@ -26,11 +26,20 @@ using sAVX512_VNNI = gemm::ICoreRowNAvx512vnni<48, 8>; using sAMX_INT8_US = gemm::ICoreRowNAmxint8<64, 16>; using sAMX_INT8_SS = gemm::ICoreRowNAmxint8SS<64, 16>; using sAVX2 = gemm::SCoreRowNAvx2<24, 4>; -#ifdef _OPENMP -extern parallel::OMPThreading DefaultThreading; + +class UT_Threading { + public: + static bestla::parallel::IThreading* get() { +#if BTLA_UT_OPENMP + static bestla::parallel::OMPThreading DefaultThreading(4); #else -extern parallel::StdThreading DefaultThreading; + static bestla::parallel::StdThreading DefaultThreading(4); #endif // _OPNEMP + return &DefaultThreading; + } + + static void set_threads(int n_thread) { get()->set_threads(n_thread); } +}; constexpr size_t CacheSize = size_t(100) << 10; static int8_t cache[CacheSize]; @@ -129,11 +138,11 @@ utils::aligned_vector<_T> readFile2Buffer(const char* filepath) { return buf; } -#define UT_START() \ - { \ - GetCPUDevice(); \ - ut::DefaultThreading.set_threads(_cd->getThreads()); \ - printf("Test Class: %s\n", __FUNCTION__); \ +#define UT_START() \ + { \ + GetCPUDevice(); \ + ut::UT_Threading::set_threads(_cd->getThreads()); \ + printf("Test Class: %s\n", __FUNCTION__); \ } template static double buffer_error(_T* ref, _T* tar, size_t size, _T thres = _T(0)) { diff --git a/bestla/bestla/ut/bestla_wrapper.cpp b/bestla/bestla/ut/bestla_wrapper.cpp index 8b9e9d415..2676d3e59 100644 --- a/bestla/bestla/ut/bestla_wrapper.cpp +++ b/bestla/bestla/ut/bestla_wrapper.cpp @@ -47,10 +47,10 @@ class UT_Fp32Fp32 { auto packw = launcher.mProB.createStorage(n, k); avector buffer(packw.mSize); packw.assign(buffer.data()); - launcher.mProB.packWeight(n, k, {matB.data(), n, &packw}, &DefaultThreading); + launcher.mProB.packWeight(n, k, {matB.data(), n, &packw}, UT_Threading::get()); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {matA.data(), k}, {matB.data(), n, &packw}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); ut::buffer_error(ref.data(), matC.data(), ref.size(), 0.001f); } @@ -65,7 +65,7 @@ class UT_Fp32Fp32 { wrapper::gemm::LauncherBase; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; auto tmpB = kernel.mProB.createStorage(n, k); @@ -74,7 +74,7 @@ class UT_Fp32Fp32 { for (size_t i = 0; i < batch; i++) { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); - kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, &DefaultThreading); + kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, UT_Threading::get()); } auto psize = (size_t)m * n * k * 2; tm.start(); @@ -83,7 +83,7 @@ class UT_Fp32Fp32 { log.start(); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {A + i * m * k, k}, {0, 0, &packBs[i]}, {C + i * m * n, n}}; - parallel::GemmRun(kernel, args, &DefaultThreading); + parallel::GemmRun(kernel, args, UT_Threading::get()); if (log.stop()) { double flops = double(psize) / log.avg_val / 1e6; printf("%s %s Flops:%.3f PerCoreFlops:%.3f\n ", corestr, log.get_log_str(), flops, flops / threads); @@ -190,14 +190,14 @@ class UT_U8S8S32 { auto packw = launcher.mProB.createStorage(n, k); avector buffer(packw.mSize); packw.assign(buffer.data()); - launcher.mProB.packWeight(n, k, {matBs8.data(), n, &packw}, &DefaultThreading); + launcher.mProB.packWeight(n, k, {matBs8.data(), n, &packw}, UT_Threading::get()); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{ gp, {matAu8.data(), k}, {matBs8.data(), n, &packw}, {matC.data(), n, 1, scaleAf32.data(), scaleBf32.data(), zpAu8.data(), reduceB.data()}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); ut::buffer_error(refC.data(), matC.data(), refC.size(), 0.001f); } @@ -212,7 +212,7 @@ class UT_U8S8S32 { wrapper::gemm::LauncherBase; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; auto tmpB = kernel.mProB.createStorage(n, k); @@ -221,7 +221,7 @@ class UT_U8S8S32 { for (size_t i = 0; i < batch; i++) { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); - kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, &DefaultThreading); + kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, UT_Threading::get()); } auto psize = (size_t)m * n * k * 2; tm.start(); @@ -230,7 +230,7 @@ class UT_U8S8S32 { log.start(); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {A + i * m * k, k}, {0, 0, &packBs[i]}, {C + i * m * n, n}}; - parallel::GemmRun(kernel, args, &DefaultThreading); + parallel::GemmRun(kernel, args, UT_Threading::get()); if (log.stop()) { double flops = double(psize) / log.avg_val / 1e6; printf("Threads %d %s %s Flops:%.3f PerCoreFlops:%.3f\n", threads, corestr, log.get_log_str(), flops, @@ -324,11 +324,11 @@ class UT_S8S8S32 { auto packw = launcher.mProB.createStorage(n, k); avector buffer(packw.mSize); packw.assign(buffer.data()); - launcher.mProB.packWeight(n, k, {matBs8.data(), n, &packw}, &DefaultThreading); + launcher.mProB.packWeight(n, k, {matBs8.data(), n, &packw}, UT_Threading::get()); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{ gp, {matAu8.data(), k}, {matBs8.data(), n, &packw}, {matC.data(), n, 1, scaleAf32.data(), scaleBf32.data()}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); ut::buffer_error(refC.data(), matC.data(), refC.size(), 0.001f); } @@ -343,7 +343,7 @@ class UT_S8S8S32 { wrapper::gemm::LauncherBase; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; auto tmpB = kernel.mProB.createStorage(n, k); @@ -352,7 +352,7 @@ class UT_S8S8S32 { for (size_t i = 0; i < batch; i++) { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); - kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, &DefaultThreading); + kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, UT_Threading::get()); } auto psize = (size_t)m * n * k * 2; tm.start(); @@ -361,7 +361,7 @@ class UT_S8S8S32 { log.start(); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {A + i * m * k, k}, {0, 0, &packBs[i]}, {C + i * m * n, n}}; - parallel::GemmRun(kernel, args, &DefaultThreading); + parallel::GemmRun(kernel, args, UT_Threading::get()); if (log.stop()) { double flops = double(psize) / log.avg_val / 1e6; printf("Threads %d %s %s Flops:%.3f PerCoreFlops:%.3f\n", threads, corestr, log.get_log_str(), flops, @@ -430,11 +430,11 @@ class UT_Bf16Bf16Fp32 { fill_buffer_randn(matAbf16.data(), matAbf16.size(), utils::bf16(-0.5f), utils::bf16(0.5f)); fill_buffer_randn(matBbf16.data(), matBbf16.size(), utils::bf16(-0.5f), utils::bf16(0.5f)); avector matC(m * n), refC(m * n); - launcher.mProB.packWeight(n, k, {matBbf16.data(), n, &packw}, &DefaultThreading); + launcher.mProB.packWeight(n, k, {matBbf16.data(), n, &packw}, UT_Threading::get()); gemmref_bf16bf16fp32(m, n, k, matAbf16.data(), matBbf16.data(), refC.data(), k, n, n); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {matAbf16.data(), k}, {matBbf16.data(), n, &packw}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); buffer_error(refC.data(), matC.data(), refC.size(), 0.05f); } @@ -449,7 +449,7 @@ class UT_Bf16Bf16Fp32 { wrapper::gemm::LauncherBase; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; auto tmpB = kernel.mProB.createStorage(n, k); @@ -458,7 +458,7 @@ class UT_Bf16Bf16Fp32 { for (size_t i = 0; i < batch; i++) { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); - kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, &DefaultThreading); + kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, UT_Threading::get()); } auto psize = (size_t)m * n * k * 2; tm.start(); @@ -467,7 +467,7 @@ class UT_Bf16Bf16Fp32 { log.start(); utils::GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {A + i * m * k, k}, {0, 0, &packBs[i]}, {C + i * m * n, n}}; - parallel::GemmRun(kernel, args, &DefaultThreading); + parallel::GemmRun(kernel, args, UT_Threading::get()); if (log.stop()) { double flops = double(psize) / log.avg_val / 1e6; printf("Threads %d %s %s Flops:%.3f PerCoreFlops:%.3f\n", threads, corestr, log.get_log_str(), flops, @@ -534,11 +534,11 @@ class UT_Fp16Fp16Fp16 { avector matAbf16(m * k), matBbf16(k * n), matC(m * n), refC(m * n); fill_buffer_randn(matAbf16.data(), matAbf16.size(), utils::fp16(-0.5f), utils::fp16(0.5f)); fill_buffer_randn(matBbf16.data(), matBbf16.size(), utils::fp16(-0.5f), utils::fp16(0.5f)); - launcher.mProB.packWeight(n, k, {matBbf16.data(), n, &packw}, &DefaultThreading); + launcher.mProB.packWeight(n, k, {matBbf16.data(), n, &packw}, UT_Threading::get()); gemmref_fp16fp16fp16(m, n, k, matAbf16.data(), matBbf16.data(), refC.data(), k, n, n); GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {matAbf16.data(), k}, {matBbf16.data(), n, &packw}, {matC.data(), n}}; - parallel::GemmRun(launcher, args, &DefaultThreading); + parallel::GemmRun(launcher, args, UT_Threading::get()); buffer_error(refC.data(), matC.data(), refC.size(), utils::fp16(0.0002f * k)); } @@ -553,7 +553,7 @@ class UT_Fp16Fp16Fp16 { wrapper::gemm::LauncherBase; Launcher kernel; - DefaultThreading.set_threads(threads); + UT_Threading::set_threads(threads); auto corestr = gemm::CoreAttr::to_str(Core_T::ID); utils::timer tm; auto tmpB = kernel.mProB.createStorage(n, k); @@ -562,7 +562,7 @@ class UT_Fp16Fp16Fp16 { for (size_t i = 0; i < batch; i++) { packBs[i] = tmpB; packBs[i].assign(bufB.data() + i * tmpB.mSize); - kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, &DefaultThreading); + kernel.mProB.packWeight(n, k, {B + i * n * k, n, &packBs[i]}, UT_Threading::get()); } auto psize = (size_t)m * n * k * 2; tm.start(); @@ -571,7 +571,7 @@ class UT_Fp16Fp16Fp16 { log.start(); GemmProblem gp(1, m, n, k); typename Launcher::Param args{gp, {A + i * m * k, k}, {0, 0, &packBs[i]}, {C + i * m * n, n}}; - parallel::GemmRun(kernel, args, &DefaultThreading); + parallel::GemmRun(kernel, args, UT_Threading::get()); if (log.stop()) { double flops = double(psize) / log.avg_val / 1e6; printf("Threads %d %s %s Flops:%.3f PerCoreFlops:%.3f\n", threads, corestr, log.get_log_str(), flops, diff --git a/bestla/bestla/ut/kernel_jit.cpp b/bestla/bestla/ut/kernel_jit.cpp index ce1198c99..7d91297ef 100644 --- a/bestla/bestla/ut/kernel_jit.cpp +++ b/bestla/bestla/ut/kernel_jit.cpp @@ -27,9 +27,9 @@ class UT_Memcpy2D_AVX512F { kernel::jit::JitMemcpy2DAvx512f::forward(src.data(), dst.data(), row, col, srcstep, dststep); } tm.start(); - parallel::Scheduler2D para({DefaultThreading.num_threads(), row, col, 4, 64}); + parallel::Scheduler2D para({UT_Threading::get()->num_threads(), row, col, 4, 64}); for (size_t i = 0; i < TestLoop; i++) { - DefaultThreading.parallel_for([&](int tidx) { + UT_Threading::get()->parallel_for([&](int tidx) { parallel::ThreadProblem2D thdp{tidx}; para.getIndex(thdp); if (thdp.valid) { @@ -47,7 +47,7 @@ class UT_Memcpy2D_AVX512F { tm.start(); for (size_t i = 0; i < TestLoop; i++) { - DefaultThreading.parallel_for([&](int tidx) { + UT_Threading::get()->parallel_for([&](int tidx) { parallel::ThreadProblem2D thdp{tidx}; para.getIndex(thdp); if (thdp.valid) {