Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

[SYCL] Support SYCL layer for LLaMA2 model #272

Merged
merged 77 commits into from
Jun 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
23aad22
fixed all UTs
ThanatosShinji May 10, 2024
83592d1
move sycl benchmark to benchmark project
ThanatosShinji May 17, 2024
1d6914e
add q4 UT for sycl prologue_b
ThanatosShinji May 19, 2024
c29095a
sycl gemv case
ThanatosShinji May 19, 2024
365697c
add UT case
ThanatosShinji May 21, 2024
0c1b0fe
add to trans storage conversion
ThanatosShinji May 21, 2024
5ede223
add sycl context to model context. compile ne_layers with dpcpp
ThanatosShinji May 22, 2024
ac79c6c
add context sycl memory allocation
ThanatosShinji May 26, 2024
d533fc8
inference with data exchange
ThanatosShinji May 26, 2024
804f4fc
use backend instead of ne type
ThanatosShinji May 26, 2024
9dfd12a
new api to assign sycl buffer
ThanatosShinji May 28, 2024
cf3e0ee
add backend parameter for new tensor
ThanatosShinji May 28, 2024
a20a5d3
add sycl int4 to graph compute
ThanatosShinji May 29, 2024
1666286
fix sync
ThanatosShinji May 29, 2024
09c97ee
compile without sycl
ThanatosShinji Jun 1, 2024
f835c11
sync with main
ThanatosShinji Jun 1, 2024
793cbea
fix tensor size bug
ThanatosShinji Jun 1, 2024
d685cfa
refactor layer config
ThanatosShinji Jun 2, 2024
db905fd
sync ut
ThanatosShinji Jun 2, 2024
a5c9c7f
support 2 layers of sycl
ThanatosShinji Jun 2, 2024
c400b9c
sync main
luoyu-intel Jun 4, 2024
b0fe7ea
revert ISA detect for dpcpp
luoyu-intel Jun 4, 2024
c0dcbdf
compile without dpcpp
luoyu-intel Jun 4, 2024
40e6e21
fix avxvnni intrin code
luoyu-intel Jun 4, 2024
1789f3f
protect crash if it's a CPU SYCL device
luoyu-intel Jun 4, 2024
7e64369
add device mul function
luoyu-intel Jun 4, 2024
0ae743e
fix the sync issue
ThanatosShinji Jun 4, 2024
4e26650
run model with all FFN layers on SYCL
luoyu-intel Jun 5, 2024
479bc87
fix compile
luoyu-intel Jun 5, 2024
da3c676
clang-format
luoyu-intel Jun 5, 2024
c9b46ed
revert model config
luoyu-intel Jun 5, 2024
da787cb
fix fun ret
ThanatosShinji Jun 5, 2024
b433333
fix the kernel bug
ThanatosShinji Jun 5, 2024
01737dc
remove all grad tensors.
luoyu-intel Jun 6, 2024
ecc1cbe
fix some bugs.
luoyu-intel Jun 6, 2024
a7aab9c
support llama shapes, add new UT case, update new api of dpcpp
luoyu-intel Jun 6, 2024
cd60d31
support all ffn layers
luoyu-intel Jun 6, 2024
45e56f5
add sync for CPU Device
luoyu-intel Jun 6, 2024
0ed2228
clang-format
luoyu-intel Jun 6, 2024
f73ee66
fix warning
luoyu-intel Jun 6, 2024
c43f2ef
clang-format
luoyu-intel Jun 6, 2024
ef9e365
add back f32 model support
luoyu-intel Jun 7, 2024
99c41c0
fix typo, remove unused code
luoyu-intel Jun 7, 2024
c30c23f
bring more layers to SYCL
luoyu-intel Jun 9, 2024
8890f50
add embedding support and use omp in sycl
luoyu-intel Jun 9, 2024
a189d90
optimize gemv k iteration
ThanatosShinji Jun 10, 2024
b9e7aa2
optimize rms_norm, add debug macro for no-mha forward.
ThanatosShinji Jun 10, 2024
d908ac8
add mha ut
ThanatosShinji Jun 13, 2024
fd1fbef
prepare for SYCL MHA
ThanatosShinji Jun 15, 2024
52285a9
add SYCL rope
ThanatosShinji Jun 15, 2024
03da9a8
all device f32 mha
ThanatosShinji Jun 17, 2024
89da98b
remove unused code
luoyu-intel Jun 18, 2024
0323edd
fixed
luoyu-intel Jun 18, 2024
360f099
fixed
luoyu-intel Jun 18, 2024
ebfa1d5
refactor sycl context for multiple allocation
ThanatosShinji Jun 18, 2024
fb8c397
support n_gpu_layer
ThanatosShinji Jun 19, 2024
13a8813
reuse scratch
ThanatosShinji Jun 20, 2024
ab3d76c
add new mha version
ThanatosShinji Jun 20, 2024
4d254eb
new version of MHA
ThanatosShinji Jun 20, 2024
023d78c
lower malloc size
luoyu-intel Jun 20, 2024
72f71c5
compile without sycl
ThanatosShinji Jun 20, 2024
e881934
run llama without sycl build
ThanatosShinji Jun 20, 2024
243d744
clang-format
luoyu-intel Jun 20, 2024
e4e64e0
fix clang-tidy
luoyu-intel Jun 21, 2024
2aae1ba
fix py build
luoyu-intel Jun 21, 2024
3d36da0
fix warning
luoyu-intel Jun 21, 2024
5028cf2
use std header
luoyu-intel Jun 21, 2024
f3e6eb4
update math
luoyu-intel Jun 21, 2024
234b157
update math
luoyu-intel Jun 21, 2024
a580f56
revert scratch without SYCL
luoyu-intel Jun 21, 2024
116073a
use cl for c_compiler
luoyu-intel Jun 21, 2024
481dd4f
compile on linux
luoyu-intel Jun 21, 2024
f37e04d
Revert "compile on linux"
luoyu-intel Jun 21, 2024
145dc98
Revert "use cl for c_compiler"
luoyu-intel Jun 21, 2024
42fa774
fix memory leak, set lower extra memory size.
luoyu-intel Jun 21, 2024
5d02676
revert embedding size on CPU
luoyu-intel Jun 21, 2024
5bee840
clang-format
luoyu-intel Jun 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ option(NS_AVX512_VNNI "neural_speed: enable AVX512-VNNI"
option(NS_FMA "neural_speed: enable FMA" ON)
option(NS_AMX "neural_speed: enable AMX" OFF)
option(NS_USE_OMP "neural_speed: use OpenMP thread pool." ON)
option(NS_SYCL "neural_speed: enable SYCL for GPUs." OFF)

option(NS_BUILD_TESTS "neural_speed: build tests" ${NS_STANDALONE})
option(NS_BUILD_EXAMPLES "neural_speed: build examples" ${NS_STANDALONE})
Expand Down Expand Up @@ -143,6 +144,11 @@ if(NS_USE_OMP)
add_compile_definitions(NS_USE_OMP)
endif()

if(NS_SYCL)
set(BTLA_SYCL ON CACHE BOOL "BesTLA with SYCL")
add_compile_definitions(NS_SYCL)
endif()

add_subdirectory(bestla)

add_subdirectory(neural_speed)
4 changes: 3 additions & 1 deletion CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,10 @@
"CMAKE_BUILD_TYPE": "Debug",
"BTLA_UT_DEBUG": "ON",
"BTLA_UT_ALL": "OFF",
"BTLA_SYCL": "ON",
"NS_SYCL": "ON",
"BTLA_UT_BENCHMARK": "ON",
"BTLA_UT_OPENMP": "ON",
"BTLA_ENABLE_OPENMP": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "icx"
}
Expand Down
12 changes: 9 additions & 3 deletions bestla/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
cmake_minimum_required(VERSION 3.12)
project(bestla LANGUAGES CXX VERSION 0.1.0)

if(BTLA_SYCL)
include(cmake/sycl.cmake)
endif()
include(cmake/FindSIMD.cmake)

file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
Expand Down Expand Up @@ -55,11 +58,11 @@ endforeach()
set(sycl_headers)
set(sycl_libs)
if(BTLA_SYCL)
include(cmake/sycl.cmake)
file(GLOB sycl_headers ${PROJECT_NAME}/sycl/*.h ${PROJECT_NAME}/sycl/*.hpp)
target_compile_definitions(${PROJECT_NAME} INTERFACE BTLA_SYCL)
list(APPEND sycl_libs IntelSYCL::SYCL_CXX)
add_compile_options(-march=native)
target_compile_options(${PROJECT_NAME} INTERFACE -march=native)
target_link_libraries(${PROJECT_NAME} INTERFACE ${sycl_libs})
#add_link_options(-fsycl-targets=spir64 -Xsycl-target-backend "-options -ze-opt-large-register-file")
endif(BTLA_SYCL)

Expand Down Expand Up @@ -103,7 +106,7 @@ function(add_ut_flag UT_OPTION)
endfunction()

set(benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/bestla_benchmark.cpp)
# list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
list(APPEND benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)


if(UT_BUILD)
Expand Down Expand Up @@ -150,6 +153,9 @@ endif(UT_BUILD)
if(BTLA_UT_BENCHMARK)
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
if(NOT BTLA_SYCL)
list(REMOVE_ITEM benchmark_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}/ut/sycl_benchmark.cpp)
endif()
add_executable(${PROJECT_NAME}_benchmark ${benchmark_srcs} ${headers} ${ut_headers})
if(BTLA_UT_OPENMP)
include(FindOpenMP)
Expand Down
22 changes: 22 additions & 0 deletions bestla/bestla/bestla_prologue_b.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,28 @@ class WeightKBlockNInteger {
return tmp;
}

AUTOCALL void convertTransStorage(StorageWeight& srcstor, StorageWeight& dststor, parallel::IThreading* threading) {
auto s8buf = utils::amalloc<int8_t>((size_t)srcstor.mK * srcstor.mN);
auto s8transbuf = utils::amalloc<int8_t>((size_t)srcstor.mKPad * srcstor.mNPad);
unpackWeight(srcstor.mN, srcstor.mK, &srcstor, s8buf, srcstor.mN, threading);
transposeWeight<int8_t>(srcstor.mK, srcstor.mN, s8buf, srcstor.mN, s8transbuf, srcstor.mKPad, threading);
compressWeight(srcstor.mKPad, srcstor.mNPad, s8transbuf, srcstor.mKPad, dststor.WPtr<int8_t>(), srcstor.mDType,
threading);
if (s8buf) {
utils::afree(s8buf);
}
if (s8transbuf) {
utils::afree(s8transbuf);
}
int nk_scale = utils::updiv(srcstor.mKPad, srcstor.mBlockSize);
if (srcstor.mCorrection.mScaEleSize == 4) {
transposeWeight<float>(nk_scale, srcstor.mNPad, srcstor.template SPtr<float>(), srcstor.mNPad,
dststor.template SPtr<float>(), dststor.CStep(), threading);
} else if (srcstor.mCorrection.mScaEleSize == 2) {
transposeWeight<uint16_t>(nk_scale, srcstor.mNPad, srcstor.template SPtr<uint16_t>(), srcstor.mNPad,
dststor.template SPtr<uint16_t>(), dststor.CStep(), threading);
}
}
AUTOCALL void doubleQuantScale(float* scale, size_t scale_size, int dq_blocksize, BTLA_DTYPE qtype,
utils::aligned_vector<float>* dq_buf) {
if (qtype == BTLA_DTYPE::DQ8_BNB) {
Expand Down
16 changes: 16 additions & 0 deletions bestla/bestla/bestla_storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,22 @@ class StorageWeightKBlockNInteger : public IWeightKBlockBase {
mPrologueID = BTLA_PROLOGUEB_IDS::WeightKBlockNInteger;
}

StorageWeightKBlockNInteger toTrans() {
StorageWeightKBlockNInteger trans(-1);
trans.mK = mK;
trans.mN = mN;
trans.mNPad = mNPad;
trans.mKPad = mKPad;
trans.mBlockSize = mBlockSize;
trans.mDType = mDType;
trans.mQBuf.resize(mQBuf.size<int8_t>());
int nk_scale = utils::updiv(mKPad, mBlockSize);
trans.mCorrection.resize(mNPad, nk_scale, mCorrection.mScaT, mCorrection.mZpT, mCorrection.mRedT,
mCorrection.mZpBuf.size<int>() > 0, mCorrection.mRedBuf.size<int>() > 0);
trans.update_size();
return trans;
}

size_t resize(int NPad, int KPad, int Block, int N, int K, BTLA_DTYPE qtype, BTLA_DTYPE scalet, BTLA_DTYPE redt,
bool IsAsym) {
BTLA_DTYPE zpt = BTLA_DTYPE::S8;
Expand Down
14 changes: 14 additions & 0 deletions bestla/bestla/bestla_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,17 @@

// As long as the compiler supports the ISA, we will enable it.
// Only the ISA you use in your project will be compiled.
#if defined(_MSC_VER) && defined(__INTEL_LLVM_COMPILER)
#define CompileAVX512F() defined(__AVX512F__)
#define CompileAVX512VNNI() defined(__AVX512VNNI__)
#define CompileAVX2() defined(__AVX2__) && defined(__F16C__) && defined(__FMA__)
#define CompileAVXVNNI() defined(__AVXVNNI__)
#define CompileAMX() defined(__AMX_TILE__)
#define CompileBF16() defined(__AVX512BF16__)
#define CompileFP16() defined(__AVX512FP16__)
#define CompileAMXBF16() (CompileAMX())
#define CompileAMXINT8() (CompileAMX())
#else
#define CompileAVX512F() BTLA_AVX512_FOUND
#define CompileAVX512VNNI() BTLA_AVX512_VNNI_FOUND
#define CompileAVX2() BTLA_AVX2_FOUND
Expand All @@ -72,6 +83,7 @@
#define CompileAMXFP16() BTLA_AMX_FP16_FOUND
#define CompileAMXINT8() BTLA_AMX_INT8_FOUND
#define CompileAMX() BTLA_AMX_BF16_FOUND
#endif

// called by launcher, time critical functions
#define TLACALL \
Expand Down Expand Up @@ -475,6 +487,8 @@ class isa_base {

static inline int padto_le(int src, int padding) { return src / padding * padding; }

static inline int64_t padto_le(int64_t src, int64_t padding) { return src / padding * padding; }

static inline size_t padto_le(size_t src, int padding) { return src / size_t(padding) * size_t(padding); }

static inline int updiv(int a, int b) { return (a + b - 1) / b; }
Expand Down
2 changes: 1 addition & 1 deletion bestla/bestla/kernel_avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace avx2 {
#pragma GCC push_options
#pragma GCC target("avx2", "fma", "f16c")
#elif defined(ICX)
#pragma clang attribute push(__attribute__((target("avx,avx2,fma"))), apply_to = function)
//#pragma clang attribute push(__attribute__((target("avx2,fma,f16c"))), apply_to = function)
#endif

static inline void zero_reg() { _mm256_zeroupper(); }
Expand Down
78 changes: 64 additions & 14 deletions bestla/bestla/sycl/sycl_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace sycl_device {

class SyclDevice {
public:
SyclDevice(bool profile) {
SyclDevice(int gpu_idx, bool profile) {
// Create an exception handler for asynchronous SYCL exceptions
static auto exception_handler = [](sycl::exception_list e_list) {
for (std::exception_ptr const& e : e_list) {
Expand All @@ -37,12 +37,38 @@ class SyclDevice {
}
}
};
auto devices = sycl::device::get_devices(sycl::info::device_type::gpu);
assert(gpu_idx < devices.size());

if (profile) {
sycl::property_list prop = {sycl::property::queue::enable_profiling(), sycl::property::queue::in_order()};
mQueue = sycl::queue(devices[gpu_idx], exception_handler, prop);
} else {
sycl::property_list prop = {sycl::property::queue::in_order()};
mQueue = sycl::queue(devices[gpu_idx], exception_handler);
}
}

SyclDevice(bool profile) {
// Create an exception handler for asynchronous SYCL exceptions
static auto exception_handler = [](sycl::exception_list e_list) {
for (std::exception_ptr const& e : e_list) {
try {
std::rethrow_exception(e);
} catch (std::exception const& e) {
#if _DEBUG
std::cout << "Failure" << std::endl;
#endif
std::terminate();
}
}
};
auto d_selector{sycl::default_selector_v};
if (profile) {
sycl::property_list prop = {sycl::property::queue::enable_profiling()};
sycl::property_list prop = {sycl::property::queue::enable_profiling(), sycl::property::queue::in_order()};
mQueue = sycl::queue(d_selector, exception_handler, prop);
} else {
sycl::property_list prop = {sycl::property::queue::in_order()};
mQueue = sycl::queue(d_selector, exception_handler);
}
}
Expand All @@ -51,20 +77,44 @@ class SyclDevice {

inline std::string getName() { return mQueue.get_device().get_info<sycl::info::device::name>(); };

size_t getGlobalMemSize() { return mQueue.get_device().get_info<sycl::info::device::global_mem_size>(); }
size_t getMaxMemAllocSize() { return mQueue.get_device().get_info<sycl::info::device::max_mem_alloc_size>(); }

double getGlobalMemSizeGB() { return double(getGlobalMemSize()) / 1e9; }
double getMaxMemAllocSizeMB() { return double(getGlobalMemSize()) / 1e6; }

static inline bool is_cpu(const sycl::device& dev) {
return dev.get_info<sycl::info::device::device_type>() == sycl::info::device_type::cpu;
}

static inline bool is_gpu(const sycl::device& dev) {
return dev.get_info<sycl::info::device::device_type>() == sycl::info::device_type::gpu;
}

static inline bool is_cpu(sycl::queue* q) {
return q->get_device().get_info<sycl::info::device::device_type>() == sycl::info::device_type::cpu;
}

static inline bool is_gpu(sycl::queue* q) {
return q->get_device().get_info<sycl::info::device::device_type>() == sycl::info::device_type::gpu;
}

void print() {
std::cout << "Running on device: " << mQueue.get_device().get_info<sycl::info::device::name>() << "\n";
std::cout << "EU count:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_count>()
<< "\n"; // 448
std::cout << "EU count per subslice:"
<< mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>() << "\n"; // 8
std::cout << "EU SIMD width:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_eu_simd_width>()
<< "\n"; // 8
std::cout << "HW threads per EU:"
<< mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_hw_threads_per_eu>() << "\n"; // 8
std::cout << "GPU slices:" << mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_slices>()
<< "\n"; // 7
std::cout << "Subslice per slice:"
<< mQueue.get_device().get_info<sycl::info::device::ext_intel_gpu_subslices_per_slice>() << "\n"; // 8
if (is_gpu(mQueue.get_device())) {
std::cout << "EU count:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count>() << "\n";
std::cout << "EU count per subslice:"
<< mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_count_per_subslice>() << "\n";
std::cout << "EU SIMD width:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_eu_simd_width>()
<< "\n";
std::cout << "HW threads per EU:"
<< mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>() << "\n";
std::cout << "GPU slices:" << mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_slices>() << "\n";
std::cout << "Subslice per slice:"
<< mQueue.get_device().get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>() << "\n";
}
std::cout << "Global Memory size: " << getGlobalMemSizeGB() << "\n";
std::cout << "Global Memory size: " << getMaxMemAllocSize() << "\n";
}
sycl::queue mQueue;
};
Expand Down
13 changes: 12 additions & 1 deletion bestla/bestla/sycl/sycl_gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#ifdef BTLA_SYCL
#include <array>

#include "bestla_utils.h"
#include "bestla/bestla_utils.h"
#include <sycl/sycl.hpp>

namespace bestla {
Expand Down Expand Up @@ -64,6 +64,17 @@ class SGemmCoreSharedB {

using SLM_B_Acc = sycl::local_accessor<TB, 1>;

using AType = TA;
using BType = TB;
using CType = TC;
static auto constexpr NTILE = WgNEle;
static auto constexpr MTILE = WgMEle;
static auto constexpr KTILE = TileK;
static auto constexpr PACK_ROW = 1;
static int constexpr PREFERRED_N = NTILE;
static auto constexpr ISA = BTLA_ISA::ISA_COUNT;
static auto constexpr ID = 0;

static inline void compute(const TA* aptr, int lda, const SLM_B_Acc& bacc, TACC* accptr,
const sycl_utils::nd_item_helper<SGemmCoreSharedB<ConfigT>>& helper) {
#pragma unroll(1)
Expand Down
2 changes: 1 addition & 1 deletion bestla/bestla/sycl/sycl_prologue_a.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#ifdef BTLA_SYCL
#include <array>

#include "bestla_utils.h"
#include "bestla/bestla_utils.h"
#include <sycl/sycl.hpp>

namespace bestla {
Expand Down
Loading
Loading