intel · luoyu-intel · Jun 21, 2024 · May 10, 2024 · May 17, 2024 · May 19, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -61,6 +61,7 @@ option(NS_AVX512_VNNI            "neural_speed: enable AVX512-VNNI"
 option(NS_FMA                    "neural_speed: enable FMA"                                     ON)
 option(NS_AMX                    "neural_speed: enable AMX"                                     OFF)
 option(NS_USE_OMP                "neural_speed: use OpenMP thread pool."                        ON)
+option(NS_SYCL                   "neural_speed: enable SYCL for GPUs."                          OFF)
 
 option(NS_BUILD_TESTS            "neural_speed: build tests"                       ${NS_STANDALONE})
 option(NS_BUILD_EXAMPLES         "neural_speed: build examples"                    ${NS_STANDALONE})
@@ -143,6 +144,11 @@ if(NS_USE_OMP)
   add_compile_definitions(NS_USE_OMP)
 endif()
 
+if(NS_SYCL)
+  set(BTLA_SYCL ON CACHE BOOL "BesTLA with SYCL")
+  add_compile_definitions(NS_SYCL)
+endif()
+
 add_subdirectory(bestla)
 
 add_subdirectory(neural_speed)
diff --git a/CMakePresets.json b/CMakePresets.json
@@ -138,8 +138,10 @@
         "CMAKE_BUILD_TYPE": "Debug",
         "BTLA_UT_DEBUG": "ON",
         "BTLA_UT_ALL": "OFF",
-        "BTLA_SYCL": "ON",
+        "NS_SYCL": "ON",
         "BTLA_UT_BENCHMARK": "ON",
+        "BTLA_UT_OPENMP": "ON",
+        "BTLA_ENABLE_OPENMP": "ON",
         "CMAKE_CXX_COMPILER": "icx",
         "CMAKE_C_COMPILER": "icx"
       }

diff --git a/bestla/CMakeLists.txt b/bestla/CMakeLists.txt
@@ -1,6 +1,9 @@
 cmake_minimum_required(VERSION 3.12)
 project(bestla LANGUAGES CXX VERSION 0.1.0)
 
+if(BTLA_SYCL)
+  include(cmake/sycl.cmake)
+endif()
 include(cmake/FindSIMD.cmake)
 
 file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
@@ -55,11 +58,11 @@ endforeach()
 set(sycl_headers)
 set(sycl_libs)
 if(BTLA_SYCL)
-  include(cmake/sycl.cmake)
   file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
   target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_SYCL)
   list(APPEND sycl_libs IntelSYCL::SYCL_CXX)
-  add_compile_options(-march=native)
+  target_compile_options(${PROJECT_NAME} INTERFACE -march=native)
+  target_link_libraries(${PROJECT_NAME} INTERFACE ${sycl_libs})
   #add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
 endif(BTLA_SYCL)
 
@@ -103,7 +106,7 @@ function(add_ut_flag UT_OPTION)
 endfunction()
 
 set(benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
-# list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
+list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
 
 
 if(UT_BUILD)
@@ -150,6 +153,9 @@ endif(UT_BUILD)
 if(BTLA_UT_BENCHMARK)
   file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
   include_directories(${PROJECT_NAME})
+  if(NOT BTLA_SYCL)
+    list(REMOVE_ITEM benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
+  endif()
 	add_executable(${PROJECT_NAME}_benchmark ${benchmark_srcs} ${headers} ${ut_headers})
   if(BTLA_UT_OPENMP)
     include(FindOpenMP)

diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h
@@ -126,6 +126,28 @@ class WeightKBlockNInteger {
     return tmp;
   }
 
+  AUTOCALL void convertTransStorage(StorageWeight& srcstor, StorageWeight& dststor, parallel::IThreading* threading) {
+    auto s8buf = utils::amalloc<int8_t>((size_t)srcstor.mK * srcstor.mN);
+    auto s8transbuf = utils::amalloc<int8_t>((size_t)srcstor.mKPad * srcstor.mNPad);
+    unpackWeight(srcstor.mN, srcstor.mK, &srcstor, s8buf, srcstor.mN, threading);
+    transposeWeight<int8_t>(srcstor.mK, srcstor.mN, s8buf, srcstor.mN, s8transbuf, srcstor.mKPad, threading);
+    compressWeight(srcstor.mKPad, srcstor.mNPad, s8transbuf, srcstor.mKPad, dststor.WPtr<int8_t>(), srcstor.mDType,
+                   threading);
+    if (s8buf) {
+      utils::afree(s8buf);
+    }
+    if (s8transbuf) {
+      utils::afree(s8transbuf);
+    }
+    int nk_scale = utils::updiv(srcstor.mKPad, srcstor.mBlockSize);
+    if (srcstor.mCorrection.mScaEleSize == 4) {
+      transposeWeight<float>(nk_scale, srcstor.mNPad, srcstor.template SPtr<float>(), srcstor.mNPad,
+                             dststor.template SPtr<float>(), dststor.CStep(), threading);
+    } else if (srcstor.mCorrection.mScaEleSize == 2) {
+      transposeWeight<uint16_t>(nk_scale, srcstor.mNPad, srcstor.template SPtr<uint16_t>(), srcstor.mNPad,
+                                dststor.template SPtr<uint16_t>(), dststor.CStep(), threading);
+    }
+  }
   AUTOCALL void doubleQuantScale(float* scale, size_t scale_size, int dq_blocksize, BTLA_DTYPE qtype,
                                  utils::aligned_vector<float>* dq_buf) {
     if (qtype == BTLA_DTYPE::DQ8_BNB) {

diff --git a/bestla/bestla/bestla_storage.h b/bestla/bestla/bestla_storage.h
@@ -706,6 +706,22 @@ class StorageWeightKBlockNInteger : public IWeightKBlockBase {
     mPrologueID = BTLA_PROLOGUEB_IDS::WeightKBlockNInteger;
   }
 
+  StorageWeightKBlockNInteger toTrans() {
+    StorageWeightKBlockNInteger trans(-1);
+    trans.mK = mK;
+    trans.mN = mN;
+    trans.mNPad = mNPad;
+    trans.mKPad = mKPad;
+    trans.mBlockSize = mBlockSize;
+    trans.mDType = mDType;
+    trans.mQBuf.resize(mQBuf.size<int8_t>());
+    int nk_scale = utils::updiv(mKPad, mBlockSize);
+    trans.mCorrection.resize(mNPad, nk_scale, mCorrection.mScaT, mCorrection.mZpT, mCorrection.mRedT,
+                             mCorrection.mZpBuf.size<int>() > 0, mCorrection.mRedBuf.size<int>() > 0);
+    trans.update_size();
+    return trans;
+  }
+
   size_t resize(int NPad, int KPad, int Block, int N, int K, BTLA_DTYPE qtype, BTLA_DTYPE scalet, BTLA_DTYPE redt,
                 bool IsAsym) {
     BTLA_DTYPE zpt = BTLA_DTYPE::S8;

diff --git a/bestla/bestla/bestla_utils.h b/bestla/bestla/bestla_utils.h
@@ -62,6 +62,17 @@
 
 // As long as the compiler supports the ISA, we will enable it.
 // Only the ISA you use in your project will be compiled.
+#if defined(_MSC_VER) && defined(__INTEL_LLVM_COMPILER)
+#define CompileAVX512F() defined(__AVX512F__)
+#define CompileAVX512VNNI() defined(__AVX512VNNI__)
+#define CompileAVX2() defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
+#define CompileAVXVNNI() defined(__AVXVNNI__)
+#define CompileAMX() defined(__AMX_TILE__)
+#define CompileBF16() defined(__AVX512BF16__)
+#define CompileFP16() defined(__AVX512FP16__)
+#define CompileAMXBF16() (CompileAMX())
+#define CompileAMXINT8() (CompileAMX())
+#else
 #define CompileAVX512F() BTLA_AVX512_FOUND
 #define CompileAVX512VNNI() BTLA_AVX512_VNNI_FOUND
 #define CompileAVX2() BTLA_AVX2_FOUND
@@ -72,6 +83,7 @@
 #define CompileAMXFP16() BTLA_AMX_FP16_FOUND
 #define CompileAMXINT8() BTLA_AMX_INT8_FOUND
 #define CompileAMX() BTLA_AMX_BF16_FOUND
+#endif
 
 // called by launcher, time critical functions
 #define TLACALL             \
@@ -475,6 +487,8 @@ class isa_base {
 
 static inline int padto_le(int src, int padding) { return src / padding * padding; }
 
+static inline int64_t padto_le(int64_t src, int64_t padding) { return src / padding * padding; }
+
 static inline size_t padto_le(size_t src, int padding) { return src / size_t(padding) * size_t(padding); }
 
 static inline int updiv(int a, int b) { return (a + b - 1) / b; }

diff --git a/bestla/bestla/kernel_avx2.h b/bestla/bestla/kernel_avx2.h
@@ -25,7 +25,7 @@ namespace avx2 {
 #pragma GCC push_options
 #pragma GCC target("avx2", "fma", "f16c")
 #elif defined(ICX)
-#pragma clang attribute push(__attribute__((target("avx,avx2,fma"))), apply_to = function)
+//#pragma clang attribute push(__attribute__((target("avx2,fma,f16c"))), apply_to = function)
 #endif
 
 static inline void zero_reg() { _mm256_zeroupper(); }

diff --git a/bestla/bestla/sycl/sycl_device.h b/bestla/bestla/sycl/sycl_device.h
@@ -23,7 +23,7 @@ namespace sycl_device {
 
 class SyclDevice {
  public:
-  SyclDevice(bool profile) {
+  SyclDevice(int gpu_idx, bool profile) {
     // Create an exception handler for asynchronous SYCL exceptions
     static auto exception_handler = [](sycl::exception_list e_list) {
       for (std::exception_ptr const& e : e_list) {
@@ -37,12 +37,38 @@ class SyclDevice {
         }
       }
     };
+    auto devices = sycl::device::get_devices(sycl::info::device_type::gpu);
+    assert(gpu_idx < devices.size());
+
+    if (profile) {
+      sycl::property_list prop = {sycl::property::queue::enable_profiling(), sycl::property::queue::in_order()};
+      mQueue = sycl::queue(devices[gpu_idx], exception_handler, prop);
+    } else {
+      sycl::property_list prop = {sycl::property::queue::in_order()};
+      mQueue = sycl::queue(devices[gpu_idx], exception_handler);
+    }
+  }
 
+  SyclDevice(bool profile) {
+    // Create an exception handler for asynchronous SYCL exceptions
+    static auto exception_handler = [](sycl::exception_list e_list) {
+      for (std::exception_ptr const& e : e_list) {
+        try {
+          std::rethrow_exception(e);
+        } catch (std::exception const& e) {
+#if _DEBUG
+          std::cout << "Failure" << std::endl;
+#endif
+          std::terminate();
+        }
+      }
+    };
     auto d_selector{sycl::default_selector_v};
     if (profile) {
-      sycl::property_list prop = {sycl::property::queue::enable_profiling()};
+      sycl::property_list prop = {sycl::property::queue::enable_profiling(), sycl::property::queue::in_order()};
       mQueue = sycl::queue(d_selector, exception_handler, prop);
     } else {
+      sycl::property_list prop = {sycl::property::queue::in_order()};
       mQueue = sycl::queue(d_selector, exception_handler);
     }
   }
@@ -51,20 +77,44 @@ class SyclDevice {
 
   inline std::string getName() { return mQueue.get_device().get_info<sycl::info::device::name>(); };
 
+  size_t getGlobalMemSize() { return mQueue.get_device().get_info<sycl::info::device::global_mem_size>(); }
+  size_t getMaxMemAllocSize() { return mQueue.get_device().get_info<sycl::info::device::max_mem_alloc_size>(); }
+
+  double getGlobalMemSizeGB() { return double(getGlobalMemSize()) / 1e9; }
+  double getMaxMemAllocSizeMB() { return double(getGlobalMemSize()) / 1e6; }
+
+  static inline bool is_cpu(const sycl::device& dev) {
+    return dev.get_info<sycl::info::device::device_type>() == sycl::info::device_type::cpu;
+  }
+
+  static inline bool is_gpu(const sycl::device& dev) {
+    return dev.get_info<sycl::info::device::device_type>() == sycl::info::device_type::gpu;
+  }
+
+  static inline bool is_cpu(sycl::queue* q) {
+    return q->get_device().get_info<sycl::info::device::device_type>() == sycl::info::device_type::cpu;
+  }
+
+  static inline bool is_gpu(sycl::queue* q) {
+    return q->get_device().get_info<sycl::info::device::device_type>() == sycl::info::device_type::gpu;
+  }
+
   void print() {
     std::cout << "Running on device: " << mQueue.get_device().get_info<sycl::info::device::name>() << "\n";
-    std::cout << "EU count:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_count>()
-              << "\n";  // 448
-    std::cout << "EU count per subslice:"
-              << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>() << "\n";  // 8
-    std::cout << "EU SIMD width:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>()
-              << "\n";  // 8
-    std::cout << "HW threads per EU:"
-              << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>() << "\n";  // 8
-    std::cout << "GPU slices:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_slices>()
-              << "\n";  // 7
-    std::cout << "Subslice per slice:"
-              << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_subslices_per_slice>() << "\n";  // 8
+    if (is_gpu(mQueue.get_device())) {
+      std::cout << "EU count:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count>() << "\n";
+      std::cout << "EU count per subslice:"
+                << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count_per_subslice>() << "\n";
+      std::cout << "EU SIMD width:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>()
+                << "\n";
+      std::cout << "HW threads per EU:"
+                << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>() << "\n";
+      std::cout << "GPU slices:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_slices>() << "\n";
+      std::cout << "Subslice per slice:"
+                << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>() << "\n";
+    }
+    std::cout << "Global Memory size: " << getGlobalMemSizeGB() << "\n";
+    std::cout << "Global Memory size: " << getMaxMemAllocSize() << "\n";
   }
   sycl::queue mQueue;
 };

diff --git a/bestla/bestla/sycl/sycl_gemm.h b/bestla/bestla/sycl/sycl_gemm.h
@@ -16,7 +16,7 @@
 #ifdef BTLA_SYCL
 #include <array>
 
-#include "bestla_utils.h"
+#include "bestla/bestla_utils.h"
 #include <sycl/sycl.hpp>
 
 namespace bestla {
@@ -64,6 +64,17 @@ class SGemmCoreSharedB {
 
   using SLM_B_Acc = sycl::local_accessor<TB, 1>;
 
+  using AType = TA;
+  using BType = TB;
+  using CType = TC;
+  static auto constexpr NTILE = WgNEle;
+  static auto constexpr MTILE = WgMEle;
+  static auto constexpr KTILE = TileK;
+  static auto constexpr PACK_ROW = 1;
+  static int constexpr PREFERRED_N = NTILE;
+  static auto constexpr ISA = BTLA_ISA::ISA_COUNT;
+  static auto constexpr ID = 0;
+
   static inline void compute(const TA* aptr, int lda, const SLM_B_Acc& bacc, TACC* accptr,
                              const sycl_utils::nd_item_helper<SGemmCoreSharedB<ConfigT>>& helper) {
 #pragma unroll(1)

diff --git a/bestla/bestla/sycl/sycl_prologue_a.h b/bestla/bestla/sycl/sycl_prologue_a.h
@@ -16,7 +16,7 @@
 #ifdef BTLA_SYCL
 #include <array>
 
-#include "bestla_utils.h"
+#include "bestla/bestla_utils.h"
 #include <sycl/sycl.hpp>
 
 namespace bestla {