diff --git a/CMakePresets.json b/CMakePresets.json
index 0d470f1ed..76d74df0f 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -39,6 +39,18 @@
         "NS_USE_OMP": "OFF"
       }
     },
+    {
+      "name": "linux-release-ut-thread",
+      "displayName": "Linux Release Thread Pool for UTs",
+      "description": "Release",
+      "inherits": "linux-debug",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "NS_USE_OMP": "OFF",
+        "BTLA_UT_ALL": "ON",
+        "BTLA_UT_BENCHMARK": "ON"
+      }
+    },
     {
       "name": "windows-base",
       "description": "Target Windows with the Visual Studio development environment.",
diff --git a/bestla/bestla/bestla_device.h b/bestla/bestla/bestla_device.h
index d7c1f2fbb..68535eae1 100644
--- a/bestla/bestla/bestla_device.h
+++ b/bestla/bestla/bestla_device.h
@@ -233,9 +233,9 @@ class CpuDevice {
   inline bool AVX512_BF16() { return mHasAVX512_BF16; }
   inline bool AVX512_FP16() { return mHasAVX512_FP16; }
   inline float* const getPE() { return PE; }
-  inline size_t getPcoreNum() { return P_core.size(); }
-  inline size_t getEcoreNum() { return E_core.size(); }
-  inline size_t getSMTcoreNum() { return SMT_core.size(); }
+  inline int getPcoreNum() { return static_cast<int>(P_core.size()); }
+  inline int getEcoreNum() { return static_cast<int>(E_core.size()); }
+  inline int getSMTcoreNum() { return static_cast<int>(SMT_core.size()); }
   inline int* getPCores() { return P_core.data(); }
   inline int* getECores() { return E_core.data(); }
   inline int* getSMTCores() { return SMT_core.data(); }
@@ -467,15 +467,16 @@ class CpuDevice {
   bool isClient() { return mClient; }
 
  protected:
-  uint32_t L2Cache, L1Cache, L3Cache;
+  uint32_t L2Cache = 0, L1Cache = 0, L3Cache = 0;
   bool mHybrid = false, mClient = false;
-  bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512BW,
-      mHasAVX512_BF16, mHasAVX512_FP16;
-  int numcores;
-  int numthreads;
+  bool mHasAVX2 = false, mHasAVX_VNNI = false, mHasAVX = false, mHasAVX512_VNNI = false, mHasAMX_INT8 = false,
+       mHasAMX_BF16 = false, mHasAVX512F = false, mHasAVX512BW = false, mHasAVX512_BF16 = false,
+       mHasAVX512_FP16 = false;
+  int numcores = 0;
+  int numthreads = 0;
   std::vector<int> P_core, E_core, SMT_core;
-  uint32_t E_L2Cache, E_L1Cache;
-  float PE[int(BTLA_ISA::ISA_COUNT)];
+  uint32_t E_L2Cache = 0, E_L1Cache = 0;
+  float PE[int(BTLA_ISA::ISA_COUNT)] = {1.f};  // NOTE(review): only PE[0] is 1.f, the rest are 0.f - confirm intent
 };
 
 #define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance();
diff --git a/bestla/bestla/bestla_gemm.h b/bestla/bestla/bestla_gemm.h
index fe521c4ab..88356ab25 100644
--- a/bestla/bestla/bestla_gemm.h
+++ b/bestla/bestla/bestla_gemm.h
@@ -2510,7 +2510,7 @@ class Avx512vnniN16P4 : protected bestla::xbyak::JitAvx512vnni {
   static_assert(_NTILE % RegLen == 0);
   static int constexpr NRegs = _NTILE / RegLen;
   static int constexpr MRegs = _MTILE == 0 ? (RegCount - 1 - NRegs) / (NRegs * 2) : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 1);
+  static_assert(NRegs * MRegs * 2 <= RegCount - 1);
   static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4;
   static int constexpr KUNROLL = 2;
   static auto constexpr ISA = BTLA_ISA::AVX512_VNNI;
@@ -3397,7 +3397,7 @@ class AvxvnniN8P4 : protected bestla::xbyak::JitAvxvnni {
   static_assert(_NTILE % RegLen == 0);
   static int constexpr NRegs = _NTILE / RegLen;
   static int constexpr MRegs = _MTILE == 0 ? (RegCount - 3) / (NRegs * 2) : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - 3);
+  static_assert(NRegs * MRegs * 2 <= RegCount - 3);
   static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4;
   static int constexpr KUNROLL = 2;
   static auto constexpr ISA = BTLA_ISA::AVX_VNNI;
@@ -4073,7 +4073,7 @@ class Avx2vnniN8P4 : protected bestla::xbyak::JitAvx2 {
   static int constexpr NRegs = _NTILE / RegLen;
   static int constexpr TmpReserve = std::is_same_v ? 2 : 4;
   static int constexpr MRegs = _MTILE == 0 ? (RegCount - (TmpReserve + 1)) / (NRegs * 2) : _MTILE;
-  static_assert(NRegs * MRegs <= RegCount - (TmpReserve + 1));
+  static_assert(NRegs * MRegs * 2 <= RegCount - (TmpReserve + 1));
   static int constexpr NTILE = RegLen * NRegs, MTILE = MRegs, KTILE = 4;
   static int constexpr KUNROLL = 2;
   static auto constexpr ISA = BTLA_ISA::AVX2;
@@ -4819,7 +4819,14 @@ class CoreCodeBase {
   static int constexpr PREFERRED_N = NTILE * 4;
   static auto constexpr ISA = Code::ISA;
   static auto constexpr ID = CoreAttr::make_core_id(NTILE, PACK_ROW, COMP, ISA);
-  void configure(int _M, int _N, int _K) { (void)(0); }
+  static void configure(int _M, int _N, int _K) { (void)(0); }
+
+  static CoreCodeBase* getInstance() {
+    static CoreCodeBase instance;
+    return &instance;
+  }
+
+  std::array mCodes;
 
  protected:
   CoreCodeBase() {
@@ -4827,7 +4834,6 @@ class CoreCodeBase {
       mCodes[i].generate_code(i + 1);
     }
   }
-  std::array mCodes;
 };
 
 template