[BesTLA] New thread pool and hybrid dispatcher (#118)

--------- Co-authored-by: ZheWang <[email protected]> Co-authored-by: Luo, Yu <[email protected]>
intel · Mar 8, 2024 · fd19a44 · fd19a44
1 parent ad3d19e
commit fd19a44
Show file tree

Hide file tree

Showing 29 changed files with 1,714 additions and 1,262 deletions.
diff --git a/.github/workflows/scripts/formatScan/clangtidy.sh b/.github/workflows/scripts/formatScan/clangtidy.sh
@@ -11,7 +11,7 @@ log_path=${log_dir}/clangtidy.log
 cd ${REPO_DIR}
 mkdir build
 cd build
-cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_USE_OPENMP=OFF
+cmake .. -G Ninja -DNS_USE_CLANG_TIDY=CHECK -DBTLA_ENABLE_OPENMP=OFF -DNS_USE_OMP=OFF
 ninja 2>&1 | tee ${log_path}
 
 if [[ ! -f ${log_path} ]] || [[ $(grep -c "warning:" ${log_path}) != 0 ]] || [[ $(grep -c "error" ${log_path}) != 0 ]]; then

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -60,9 +60,9 @@ option(NS_AVX512_VBMI            "neural_speed: enable AVX512-VBMI"
 option(NS_AVX512_VNNI            "neural_speed: enable AVX512-VNNI"                             OFF)
 option(NS_FMA                    "neural_speed: enable FMA"                                     ON)
 option(NS_AMX                    "neural_speed: enable AMX"                                     OFF)
+option(NS_USE_OMP                "neural_speed: use OpenMP thread pool."                        ON)
 
 option(NS_BUILD_TESTS            "neural_speed: build tests"                       ${NS_STANDALONE})
-option(NS_BTLA_UT                "enable BesTLA's unit tests"                                   OFF)
 option(NS_BUILD_EXAMPLES         "neural_speed: build examples"                    ${NS_STANDALONE})
 option(NS_USE_CLANG_TIDY         "neural_speed: clang-tidy check"                               OFF)
 
@@ -135,12 +135,13 @@ if (NS_PYTHON_API)
   add_subdirectory(third_party/pybind11)
 endif()
 
-if (NS_BTLA_UT)
-  set(BTLA_UT_ALL ON)
+if(NS_USE_OMP)
+  # Find modules are meant to be driven by find_package(); a bare include() skips its
+  # bookkeeping. OpenMP stays optional here (no REQUIRED), as before.
+  find_package(OpenMP)
+  # Compile BesTLA's OMPThreading class so that it can be used in ne_layers.
+  # Not FORCEd: an explicit -DBTLA_ENABLE_OPENMP=OFF on the command line still wins.
+  set(BTLA_ENABLE_OPENMP ON CACHE BOOL "BesTLA enable compiling OpenMP threading")
+  # Directory-scoped: every target defined from this directory downwards sees NS_USE_OMP.
+  add_compile_definitions(NS_USE_OMP)
 endif()
-include(FindOpenMP)
 
-set(BTLA_USE_OPENMP ON CACHE BOOL "BesTLA use OpenMP")
 add_subdirectory(bestla)
 
 add_subdirectory(neural_speed)
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -23,6 +23,16 @@
       "inherits": "linux-debug",
       "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
     },
+    {
+      "name": "linux-release-thread",
+      "displayName": "Linux Release Thread Pool",
+      "description": "Release",
+      "inherits": "linux-debug",
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "NS_USE_OMP": "OFF"
+      }
+    },
     {
       "name": "windows-base",
       "description": "Target Windows with the Visual Studio development environment.",
@@ -49,23 +59,51 @@
         "value": "x64",
         "strategy": "external"
       },
-      "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" }
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Debug",
+        "NS_PROFILING": "ON",
+        "NS_USE_OMP": "ON",
+        "BTLA_UT_DEBUG": "ON"
+      }
     },
     {
       "name": "x64-release",
       "displayName": "x64 Release",
       "description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
       "inherits": "x64-debug",
-      "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
+      "cacheVariables": {
+        "CMAKE_BUILD_TYPE": "Release",
+        "BTLA_UT_DEBUG": "OFF"
+      }
+    },
+    {
+      "name": "x64-release-thread",
+      "displayName": "x64 Release without OpenMP",
+      "description": "Target Windows (64-bit) with the Visual Studio development environment. (Release, without OpenMP)",
+      "inherits": "x64-release",
+      "cacheVariables": {
+        "NS_USE_OMP": "OFF"
+      }
     },
     {
       "name": "x64-bestla-UT",
       "displayName": "x64 BesTLA unit test",
       "description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
-      "inherits": "x64-debug",
+      "inherits": "x64-release",
       "cacheVariables": {
-        "CMAKE_BUILD_TYPE": "Release",
-        "NS_BTLA_UT": "ON"
+        "CMAKE_BUILD_TYPE": "RelWithDebInfo",
+        "BTLA_UT_ALL": "ON",
+        "BTLA_UT_BENCHMARK": "ON",
+        "BTLA_UT_OPENMP": "ON"
+      }
+    },
+    {
+      "name": "x64-ut-thread",
+      "displayName": "x64 BesTLA UT without OpenMP",
+      "description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
+      "inherits": "x64-bestla-UT",
+      "cacheVariables": {
+        "BTLA_UT_OPENMP": "OFF"
       }
     }
   ]

diff --git a/bestla/CMakeLists.txt b/bestla/CMakeLists.txt
@@ -4,7 +4,7 @@ project(bestla LANGUAGES CXX VERSION 0.1.0)
 file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
 file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)
 
-option(BTLA_USE_OPENMP "Enable OpenMP thread pool" OFF)
+option(BTLA_ENABLE_OPENMP "Compile OpenMP thread pool if OMP can be found" OFF)
 
 option(BTLA_UT_ALL "Enable all unit tests" OFF)
 option(BTLA_UT_DEBUG "Enable debug unit tests" OFF)
@@ -19,7 +19,7 @@ option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
 option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
 option(BTLA_UT_NOASAN "Disable sanitize" OFF)
 option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
-option(BTLA_UT_OPENMP "Use OpenMP" ON)
+option(BTLA_UT_OPENMP "Use OpenMP for UT tests" OFF)
 
 add_library(${PROJECT_NAME} INTERFACE)
 add_library(neural_speed::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
@@ -30,10 +30,10 @@ target_include_directories(
 )
 
 
-if(BTLA_USE_OPENMP)
-  message(STATUS "BesTLA using OpenMP")
+if(BTLA_ENABLE_OPENMP)
+  message(STATUS "BesTLA enable OpenMP ThreadPool")
   target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_USE_OPENMP)
-endif(BTLA_USE_OPENMP)
+endif(BTLA_ENABLE_OPENMP)
 
 if(WIN32)
 	target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX)
@@ -64,12 +64,14 @@ endif()
 
 function(add_ut_flag UT_OPTION)
+	# add_ut_flag(<UT_OPTION>): if the variable whose name is passed in UT_OPTION is
+	# truthy, define a preprocessor macro with that same name.
 	if(${${UT_OPTION}})
-	  target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
+	  # Directory-scoped rather than a PRIVATE definition on ${PROJECT_NAME}_ut: the macro
+	  # is applied to every target created in this directory, not only the UT executable.
+	  # NOTE(review): presumably so ${PROJECT_NAME}_benchmark sees the flags too -- confirm.
+    add_compile_definitions(${UT_OPTION})
 	endif()
 endfunction()
 
 if(UT_BUILD)
 	file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everything even run parts of UTs
+  list(REMOVE_ITEM srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
 	file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
   include_directories(${PROJECT_NAME})
 	add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
@@ -96,8 +98,21 @@ if(UT_BUILD)
 	add_ut_flag(BTLA_UT_KERNEL_INTRIN)
 	add_ut_flag(BTLA_UT_KERNEL_JIT)
 	add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
-	add_ut_flag(BTLA_UT_BENCHMARK)
-
 	target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
 endif(UT_BUILD)
 
+if(BTLA_UT_BENCHMARK)
+  # Standalone benchmark executable built from ut/bestla_benchmark.cpp, which is removed
+  # from the ${PROJECT_NAME}_ut source list above.
+  # The source is named explicitly: globbing a single literal path would silently yield an
+  # empty list if the file were missing and fail later with a less obvious error.
+  # Headers are not compiled; presumably they are listed so IDE generators show them.
+  file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
+  add_executable(${PROJECT_NAME}_benchmark
+    ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp
+    ${headers}
+    ${ut_headers})
+  # Target-scoped so the include path does not leak to other targets in this directory.
+  target_include_directories(${PROJECT_NAME}_benchmark PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME})
+  if(BTLA_UT_OPENMP)
+    # OpenMP was requested explicitly for this target, so fail at configure time with a
+    # clear message instead of at generate time on a missing OpenMP::OpenMP_CXX target.
+    find_package(OpenMP REQUIRED COMPONENTS CXX)
+    # Kept PRIVATE to the benchmark: putting BTLA_USE_OPENMP on the shared INTERFACE
+    # library would switch it on for every consumer of ${PROJECT_NAME}, including builds
+    # configured without OpenMP.
+    target_compile_definitions(${PROJECT_NAME}_benchmark PRIVATE BTLA_USE_OPENMP)
+    target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE OpenMP::OpenMP_CXX)
+  endif()
+  if(NOT WIN32)
+    # Threads::Threads carries the right compile/link flags and lands in the library part
+    # of the link line, unlike a raw -lpthread passed as a link option.
+    find_package(Threads REQUIRED)
+    target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE Threads::Threads)
+  endif()
+  target_link_libraries(${PROJECT_NAME}_benchmark PRIVATE ${PROJECT_NAME})
+endif(BTLA_UT_BENCHMARK)
diff --git a/bestla/bestla/bestla.h b/bestla/bestla/bestla.h
@@ -31,6 +31,7 @@ enum class BTLA_ISA : uint8_t {
   AMX_INT8,
   AVX512_FP16,
   AVX512_BF16,
+  ISA_COUNT,
 };
 enum class BTLA_DTYPE : uint32_t {
   EleBitsMask = 0xff,

diff --git a/bestla/bestla/bestla_device.h b/bestla/bestla/bestla_device.h
@@ -215,6 +215,7 @@ class CpuDevice {
  public:
   inline int getThreads() { return numthreads; }
   inline int getCores() { return numcores; }
+  inline uint32_t getL3CacheSize() { return L3Cache; }
   inline uint32_t getL2CacheSize() { return L2Cache; }
   inline uint32_t getL1CacheSize() { return L1Cache; }
   inline uint32_t getL2CacheSize_E() { return E_L2Cache; }
@@ -228,7 +229,7 @@ class CpuDevice {
   inline bool AMX_BF16() { return mHasAMX_BF16; }
   inline bool AVX512_BF16() { return mHasAVX512_BF16; }
   inline bool AVX512_FP16() { return mHasAVX512_FP16; }
-  inline float getPE() { return (P_core.size() * P_power) / (E_core.size() * E_power); }
+  // Returns the per-ISA ratio table: BTLA_ISA::ISA_COUNT floats, indexed by int(BTLA_ISA).
+  // Plain float*: a top-level const on a by-value return type has no effect on callers and
+  // only triggers an ignored-qualifier diagnostic.
+  inline float* getPE() { return PE; }
   inline size_t getPcoreNum() { return P_core.size(); }
   inline size_t getEcoreNum() { return E_core.size(); }
   inline size_t getSMTcoreNum() { return SMT_core.size(); }
@@ -328,12 +329,40 @@ class CpuDevice {
         }
       }
       numcores = P_core.size() + E_core.size();
-      numthreads = P_core.size() * 2 + E_core.size();
+      numthreads = P_core.size() + E_core.size() + SMT_core.size();
+
+      {
+        // Seed the per-ISA PE table: 1.0 everywhere, then model-specific overrides.
+        uint32_t regs[4];
+        _cpu.getCpuid(1, regs);
+        if (p) printf("!!!\t%x\t%x\t%x\t%x!!!\n", regs[0], regs[1], regs[2], regs[3]);
+        const uint32_t nibble = (1u << 4) - 1;
+        const int family = (regs[0] >> 8) & nibble;     // bits 8..11 of the first output word
+        const int extModel = (regs[0] >> 16) & nibble;  // bits 16..19 of the first output word
+        for (int isa = 0; isa < int(BTLA_ISA::ISA_COUNT); ++isa) {
+          PE[isa] = 1.0f;
+        }
+        // CPU identification refer to: https://en.wikichip.org/wiki/intel/cpuid
+        if (family == 6) {
+          if (extModel == 9) {  // ALD
+            PE[int(BTLA_ISA::AVX2)] = 3.0f;
+            PE[int(BTLA_ISA::AVX_VNNI)] = 5.0f;
+          } else if (extModel == 10) {  // MTL
+            PE[int(BTLA_ISA::AVX2)] = 2.2f;
+            PE[int(BTLA_ISA::AVX_VNNI)] = 3.0f;
+          } else if (extModel == 11) {  // RPL
+            PE[int(BTLA_ISA::AVX2)] = 1.8f;
+            PE[int(BTLA_ISA::AVX_VNNI)] = 2.6f;
+          }
+        }
+      }
     } else {
       L1Cache = _cpu.getDataCacheSize(0);
       L2Cache = _cpu.getDataCacheSize(1);
       numthreads = numcores;
     }
+    L3Cache = _cpu.getDataCacheSize(2);
 #if FIXED_CACHE
     L2Cache = L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : L2Cache;
     E_L2Cache = E_L2Cache >= FIXED_CACHE_SIZE ? FIXED_CACHE_SIZE : E_L2Cache;
@@ -357,7 +386,7 @@ class CpuDevice {
     Xbyak::util::Cpu cpu;
     uint32_t tmp[4];
     cpu.getCpuid(0x1A, tmp);
-    int core_type = (tmp[0] >> 24) & ((1u << 7) - 1);  // cpu.extractBit(a[0], 24, 31);
+    int core_type = (tmp[0] >> 24) & ((1u << 8) - 1);  // cpu.extractBit(a[0], 24, 31);
     switch (core_type) {
       case 32:
         // printf("Atom\n");
@@ -407,7 +436,7 @@ class CpuDevice {
   }
   static void core_bond(int core) {
 #ifdef _WIN32
-    SetThreadAffinityMask(GetCurrentThread(), 1 << core);
+    SetThreadAffinityMask(GetCurrentThread(), 1LL << core);
 #else
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
@@ -420,7 +449,7 @@ class CpuDevice {
   static void core_bond(std::thread& thread, int core) {
 #ifdef _WIN32
     HANDLE handle = thread.native_handle();
-    SetThreadAffinityMask(handle, 1 << core);
+    SetThreadAffinityMask(handle, 1LL << core);
 #else
     cpu_set_t cpuset;
     CPU_ZERO(&cpuset);
@@ -434,29 +463,69 @@ class CpuDevice {
   bool isHybrid() { return mHybrid; }
 
  protected:
-  uint32_t L2Cache, L1Cache;
+  uint32_t L2Cache, L1Cache, L3Cache;
   bool mHybrid = false;
   bool mHasAVX2, mHasAVX_VNNI, mHasAVX, mHasAVX512_VNNI, mHasAMX_INT8, mHasAMX_BF16, mHasAVX512F, mHasAVX512_BF16,
       mHasAVX512_FP16;
   int numcores;
   int numthreads;
   std::vector<int> P_core, E_core, SMT_core;
   uint32_t E_L2Cache, E_L1Cache;
-  float P_power = 4.8, E_power = 2.3;
+  float PE[int(BTLA_ISA::ISA_COUNT)];
 };
 
 #define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance();
 
-class CpuBase {
+// Cache sizes and P-core/E-core split to use when running with a given number of threads.
+// Instances are cached per thread count; obtain one through getInstance().
+class CpuRuntime {
  public:
-  CpuBase() {
+  CpuRuntime() = default;
+  // Returns the instance for `thread` worker threads, creating it on first use.
+  // NOTE(review): the cache is a plain function-local static std::map with no locking, so
+  // concurrent first-time calls would race -- confirm callers serialize this.
+  static CpuRuntime& getInstance(int thread) {
+    static std::map<int, CpuRuntime> instances;
+    auto it = instances.find(thread);
+    if (it == instances.end()) it = instances.emplace(thread, CpuRuntime(thread)).first;
+    return it->second;
+  }
+
+  // Returns PE[isa] scaled by P_core_num / E_core_num.
+  // Only meaningful when mHybrid is true: otherwise both core counts are 0 and the
+  // expression divides by zero.
+  inline float getPE(const BTLA_ISA isa) {
+    // printf("GET:%d\t%f\n",int(isa), *cur_PE);
+    return PE[int(isa)] * P_core_num / E_core_num;
+  }
+
+  // Multiplies the stored ratio for `isa` by PE_.
+  inline void adjustPE(const BTLA_ISA isa, const float PE_) {
+    // printf("Adjust:%d,%f\n",int(isa),PE_);
+    PE[int(isa)] *= PE_;
+  }
+
+  // The *_P / *_E cache sizes and the core counts are filled in only on the hybrid path.
+  size_t mL2Cache = 0, mL1Cache = 0, mL2Cache_P = 0, mL1Cache_P = 0, mL2Cache_E = 0, mL1Cache_E = 0;
+  int P_core_num = 0, E_core_num = 0;
+  bool mHybrid = false;
+
+ private:
+  // Builds the configuration for `thread` threads from the CpuDevice singleton.
+  CpuRuntime(int thread) {
     GetCPUDevice();
     mL2Cache = _cd->getL2CacheSize();
     mL1Cache = _cd->getL1CacheSize();
-    mNumThreads = _cd->getThreads();
+    maxThreads = _cd->getThreads();
+    mHybrid = false;
+    // Start from neutral ratios so getPE()/adjustPE() never read indeterminate values on
+    // the non-hybrid path, where the device table is not copied.
+    for (int i = 0; i < int(BTLA_ISA::ISA_COUNT); i++) PE[i] = 1.0f;
+    // Compare as int: against the size_t getters a negative `thread` would be converted to
+    // a huge unsigned value and be treated as "more threads than cores".
+    const int p_cores = static_cast<int>(_cd->getPcoreNum());
+    const int e_cores = static_cast<int>(_cd->getEcoreNum());
+    if (_cd->isHybrid() && thread > p_cores) {
+      if (thread > p_cores + e_cores) {
+        // More threads than P+E cores.
+        // NOTE(review): the halving presumably models SMT siblings sharing a P-core's
+        // L1/L2 -- confirm.
+        mL1Cache_P = mL1Cache / 2;
+        mL2Cache_P = mL2Cache / 2;
+        P_core_num = p_cores;
+        E_core_num = e_cores;
+      } else {
+        // All P-cores are used; the remaining threads go to E-cores.
+        mL1Cache_P = mL1Cache;
+        mL2Cache_P = mL2Cache;
+        P_core_num = p_cores;
+        E_core_num = thread - P_core_num;
+      }
+      mL1Cache_E = _cd->getL1CacheSize_E();
+      mL2Cache_E = _cd->getL2CacheSize_E();
+      mHybrid = true;
+      memcpy(PE, _cd->getPE(), int(BTLA_ISA::ISA_COUNT) * sizeof(float));
+    }
   }
-  size_t mL2Cache, mL1Cache;
-  int mNumThreads;
+  // Per-ISA ratio table, indexed by int(BTLA_ISA); copied from CpuDevice on hybrid CPUs.
+  float PE[int(BTLA_ISA::ISA_COUNT)] = {};
+  int maxThreads = 0;
 };
 }  // namespace device
 }  // namespace bestla