Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
[BesTLA] New thread pool and hybrid dispatcher (#118)
Browse files Browse the repository at this point in the history
---------

Co-authored-by: ZheWang <[email protected]>
Co-authored-by: Luo, Yu <[email protected]>
  • Loading branch information
3 people authored Mar 8, 2024
1 parent ad3d19e commit fd19a44
Show file tree
Hide file tree
Showing 29 changed files with 1,714 additions and 1,262 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/scripts/formatScan/clangtidy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ log_path=${log_dir}/clangtidy.log
cd ${REPO_DIR}
mkdir build
cd build
cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_USE_OPENMP=OFF
cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_ENABLE_OPENMP=OFF -DNS_USE_OMP=OFF
ninja 2>&1 | tee ${log_path}

if [[ ! -f ${log_path} ]] || [[ $(grep -c "warning:" ${log_path}) != 0 ]] || [[ $(grep -c "error" ${log_path}) != 0 ]]; then
Expand Down
11 changes: 6 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ option(NS_AVX512_VBMI "neural_speed: enable AVX512-VBMI"
option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI" OFF)
option(NS_FMA "neural_speed: enable FMA" ON)
option(NS_AMX "neural_speed: enable AMX" OFF)
option(NS_USE_OMP "neural_speed: use OpenMP thread pool." ON)

option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
option(NS_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
option(NS_USE_CLANG_TIDY "neural_speed: clang-tidy check" OFF)

Expand Down Expand Up @@ -135,12 +135,13 @@ if (NS_PYTHON_API)
add_subdirectory(third_party/pybind11)
endif()

if (NS_BTLA_UT)
set(BTLA_UT_ALL ON)
# OpenMP thread pool support, enabled through the NS_USE_OMP option.
if(NS_USE_OMP)
  # Go through find_package() rather than including the Find module directly, so the
  # usual package-lookup controls apply. Not REQUIRED: the BesTLA option below is
  # documented as "compile OpenMP thread pool if OMP can be found".
  find_package(OpenMP)
  # Compile BesTLA's OMPThreading class so that it can be used in ne_layers.
  set(BTLA_ENABLE_OPENMP ON CACHE BOOL "BesTLA enable compiling OpenMP threading")
  # Directory-scoped definition: it applies to the subdirectories added after this point.
  add_compile_definitions(NS_USE_OMP)
endif()
include(FindOpenMP)

set(BTLA_USE_OPENMP ON CACHE BOOL "BesTLA use OpenMP")
add_subdirectory(bestla)

add_subdirectory(neural_speed)
48 changes: 43 additions & 5 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,16 @@
"inherits": "linux-debug",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
},
{
"name": "linux-release-thread",
"displayName": "Linux Release Thread Pool",
"description": "Release",
"inherits": "linux-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"NS_USE_OMP": "OFF"
}
},
{
"name": "windows-base",
"description": "Target Windows with the Visual Studio development environment.",
Expand All @@ -49,23 +59,51 @@
"value": "x64",
"strategy": "external"
},
"cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" }
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"NS_PROFILING": "ON",
"NS_USE_OMP": "ON",
"BTLA_UT_DEBUG": "ON"
}
},
{
"name": "x64-release",
"displayName": "x64 Release",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (Release)",
"inherits": "x64-debug",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"BTLA_UT_DEBUG": "OFF"
}
},
{
"name": "x64-release-thread",
"displayName": "x64 Release without OpenMP",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (Release, without OpenMP)",
"inherits": "x64-release",
"cacheVariables": {
"NS_USE_OMP": "OFF"
}
},
{
"name": "x64-bestla-UT",
"displayName": "x64 BesTLA unit test",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-debug",
"inherits": "x64-release",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"NS_BTLA_UT": "ON"
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
"BTLA_UT_ALL": "ON",
"BTLA_UT_BENCHMARK": "ON",
"BTLA_UT_OPENMP": "ON"
}
},
{
"name": "x64-ut-thread",
"displayName": "x64 BesTLA UT without OpenMP",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-bestla-UT",
"cacheVariables": {
"BTLA_UT_OPENMP": "OFF"
}
}
]
Expand Down
31 changes: 23 additions & 8 deletions bestla/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ project(bestla LANGUAGES CXX VERSION 0.1.0)
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(BTLA_USE_OPENMP "Enable OpenMP thread pool" OFF)
option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)

option(BTLA_UT_ALL "Enable all unit tests" OFF)
option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
Expand All @@ -19,7 +19,7 @@ option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
option(BTLA_UT_NOASAN "Disable sanitize" OFF)
option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
option(BTLA_UT_OPENMP "Use OpenMP" ON)
option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)

add_library(${PROJECT_NAME} INTERFACE)
add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
Expand All @@ -30,10 +30,10 @@ target_include_directories(
)


if(BTLA_USE_OPENMP)
message(STATUS "BesTLA using OpenMP")
if(BTLA_ENABLE_OPENMP)
message(STATUS "BesTLA enable OpenMP ThreadPool")
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
endif(BTLA_USE_OPENMP)
endif(BTLA_ENABLE_OPENMP)

if(WIN32)
target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX)
Expand Down Expand Up @@ -64,12 +64,14 @@ endif()

function(add_ut_flag UT_OPTION)
if(${${UT_OPTION}})
target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
# target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
add_compile_definitions(${UT_OPTION})
endif()
endfunction()

if(UT_BUILD)
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs
list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
Expand All @@ -96,8 +98,21 @@ if(UT_BUILD)
add_ut_flag(BTLA_UT_KERNEL_INTRIN)
add_ut_flag(BTLA_UT_KERNEL_JIT)
add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
add_ut_flag(BTLA_UT_BENCHMARK)

target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
endif(UT_BUILD)

# Standalone benchmark executable, built only when BTLA_UT_BENCHMARK is ON.
if(BTLA_UT_BENCHMARK)
  # Only the benchmark driver is compiled into this target, so it is named
  # explicitly instead of being globbed.
  file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
  add_executable(
    ${PROJECT_NAME}_benchmark
    ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp
    ${headers}
    ${ut_headers}
  )
  # Target-scoped include path; does not leak to other targets in this directory.
  target_include_directories(
    ${PROJECT_NAME}_benchmark
    PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}
  )

  if(BTLA_UT_OPENMP)
    # OpenMP was explicitly requested for the benchmark: fail at configure time
    # with a clear message if it is missing, instead of failing later on the
    # unresolved OpenMP::OpenMP_CXX target.
    find_package(OpenMP REQUIRED)
    # Set on the interface library, so every consumer of ${PROJECT_NAME} sees
    # BTLA_USE_OPENMP (kept as in the original behaviour).
    target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
    target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE OpenMP::OpenMP_CXX)
  endif()

  if(NOT WIN32)
    # Link the platform thread library through its imported target rather than
    # passing a raw -lpthread link option.
    find_package(Threads REQUIRED)
    target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE Threads::Threads)
  endif()

  target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME})
endif()
1 change: 1 addition & 0 deletions bestla/bestla/bestla.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ enum class BTLA_ISA : uint8_t {
AMX_INT8,
AVX512_FP16,
AVX512_BF16,
ISA_COUNT,
};
enum class BTLA_DTYPE : uint32_t {
EleBitsMask = 0xff,
Expand Down
93 changes: 81 additions & 12 deletions bestla/bestla/bestla_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ class CpuDevice {
public:
inline int getThreads() { return numthreads; }
inline int getCores() { return numcores; }
inline uint32_t getL3CacheSize() { return L3Cache; }
inline uint32_t getL2CacheSize() { return L2Cache; }
inline uint32_t getL1CacheSize() { return L1Cache; }
inline uint32_t getL2CacheSize_E() { return E_L2Cache; }
Expand All @@ -228,7 +229,7 @@ class CpuDevice {
inline bool AMX_BF16() { return mHasAMX_BF16; }
inline bool AVX512_BF16() { return mHasAVX512_BF16; }
inline bool AVX512_FP16() { return mHasAVX512_FP16; }
inline float getPE() { return (P_core.size() * P_power) / (E_core.size() * E_power); }
inline float* const getPE() { return PE; }
inline size_t getPcoreNum() { return P_core.size(); }
inline size_t getEcoreNum() { return E_core.size(); }
inline size_t getSMTcoreNum() { return SMT_core.size(); }
Expand Down Expand Up @@ -328,12 +329,40 @@ class CpuDevice {
}
}
numcores = P_core.size() + E_core.size();
numthreads = P_core.size() * 2 + E_core.size();
numthreads = P_core.size() + E_core.size() + SMT_core.size();

{
// set PE
uint32_t tmp[4];
_cpu.getCpuid(1, tmp);
if (p) printf("!!!\t%x\t%x\t%x\t%x!!!\n", tmp[0], tmp[1], tmp[2], tmp[3]);
const int famliy = (tmp[0] >> 8) & ((1u << 4) - 1); // cpu.extractBit(a[0], 8, 11);
const int extendedModel = (tmp[0] >> 16) & ((1u << 4) - 1); // cpu.extractBit(a[0], 16, 24);
{
for (int i = 0; i < int(BTLA_ISA::ISA_COUNT); i++) PE[i] = 1.0f;
// CPU identification refer to: https://en.wikichip.org/wiki/intel/cpuid
if (famliy == 6) switch (extendedModel) {
case 9: // ALD
PE[int(BTLA_ISA::AVX2)] = 3.0f;
PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
break;
case 10: // MTL
PE[int(BTLA_ISA::AVX2)] = 2.2f;
PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
break;
case 11: // RPL
PE[int(BTLA_ISA::AVX2)] = 1.8f;
PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
break;
}
}
}
} else {
L1Cache = _cpu.getDataCacheSize(0);
L2Cache = _cpu.getDataCacheSize(1);
numthreads = numcores;
}
L3Cache = _cpu.getDataCacheSize(2);
#if FIXED_CACHE
L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache;
E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache;
Expand All @@ -357,7 +386,7 @@ class CpuDevice {
Xbyak::util::Cpu cpu;
uint32_t tmp[4];
cpu.getCpuid(0x1A, tmp);
int core_type = (tmp[0] >> 24) & ((1u << 7) - 1); // cpu.extractBit(a[0], 24, 31);
int core_type = (tmp[0] >> 24) & ((1u << 8) - 1); // cpu.extractBit(a[0], 24, 31);
switch (core_type) {
case 32:
// printf("Atom\n");
Expand Down Expand Up @@ -407,7 +436,7 @@ class CpuDevice {
}
static void core_bond(int core) {
#ifdef _WIN32
SetThreadAffinityMask(GetCurrentThread(), 1 << core);
SetThreadAffinityMask(GetCurrentThread(), 1LL << core);
#else
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
Expand All @@ -420,7 +449,7 @@ class CpuDevice {
static void core_bond(std::thread& thread, int core) {
#ifdef _WIN32
HANDLE handle = thread.native_handle();
SetThreadAffinityMask(handle, 1 << core);
SetThreadAffinityMask(handle, 1LL << core);
#else
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
Expand All @@ -434,29 +463,69 @@ class CpuDevice {
bool isHybrid() { return mHybrid; }

protected:
uint32_t L2Cache, L1Cache;
uint32_t L2Cache, L1Cache, L3Cache;
bool mHybrid = false;
bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
mHasAVX512_FP16;
int numcores;
int numthreads;
std::vector<int> P_core, E_core, SMT_core;
uint32_t E_L2Cache, E_L1Cache;
float P_power = 4.8, E_power = 2.3;
float PE[int(BTLA_ISA::ISA_COUNT)];
};

#define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance();

class CpuBase {
class CpuRuntime {
public:
CpuBase() {
CpuRuntime() = default;
// Returns the CpuRuntime cached for the requested thread count, constructing it on first use.
// Instances are kept in a function-local static map keyed by `thread`; no locking is done here.
static CpuRuntime& getInstance(int thread) {
  static std::map<int, CpuRuntime> instances;
  auto found = instances.find(thread);
  if (found == instances.end()) {
    // The private constructor is reachable here because we are inside the class.
    found = instances.emplace(thread, CpuRuntime(thread)).first;
  }
  return found->second;
}

// Returns the PE entry stored for `isa`, multiplied by P_core_num and divided by E_core_num.
// NOTE(review): E_core_num defaults to 0 and is only assigned in the hybrid branch of the
// constructor, so on a non-hybrid instance this is a float division by zero. Confirm that
// callers check mHybrid first.
inline float getPE(const BTLA_ISA isa) {
  const float isa_ratio = PE[static_cast<int>(isa)];
  return isa_ratio * P_core_num / E_core_num;
}

// Scales the stored PE entry for `isa` in place by the factor PE_.
inline void adjustPE(const BTLA_ISA isa, const float PE_) {
  float& entry = PE[static_cast<int>(isa)];
  entry = entry * PE_;
}

size_t mL2Cache, mL1Cache, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
int P_core_num = 0, E_core_num = 0;
bool mHybrid = false;

private:
CpuRuntime(int thread) {
GetCPUDevice();
mL2Cache = _cd->getL2CacheSize();
mL1Cache = _cd->getL1CacheSize();
mNumThreads = _cd->getThreads();
maxThreads = _cd->getThreads();
mHybrid = false;
if (_cd->isHybrid() && thread > _cd->getPcoreNum()) {
if (thread > _cd->getPcoreNum() + _cd->getEcoreNum()) {
mL1Cache_P = mL1Cache / 2;
mL2Cache_P = mL2Cache / 2;
P_core_num = _cd->getPcoreNum();
E_core_num = _cd->getEcoreNum();
} else {
mL1Cache_P = mL1Cache;
mL2Cache_P = mL2Cache;
P_core_num = _cd->getPcoreNum();
E_core_num = thread - P_core_num;
}
mL1Cache_E = _cd->getL1CacheSize_E();
mL2Cache_E = _cd->getL2CacheSize_E();
mHybrid = true;
memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
}
}
size_t mL2Cache, mL1Cache;
int mNumThreads;
float PE[int(BTLA_ISA::ISA_COUNT)];
int maxThreads;
};
} // namespace device
} // namespace bestla
Loading

0 comments on commit fd19a44

Please sign in to comment.