Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
Rename all jblas files and their usage (#7)
Browse files Browse the repository at this point in the history
* pass compilation

* rename common files

* BesTLA ut passed

* rename variables and namespaces
  • Loading branch information
luoyu-intel authored Jan 1, 2024
1 parent b330746 commit d5c26d4
Show file tree
Hide file tree
Showing 121 changed files with 9,113 additions and 6,501 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
################################################################################
# This .gitignore file was automatically created by Microsoft(R) Visual Studio.
################################################################################

/.vs
/out
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ if (NE_GPU)
endif()

option(NE_BUILD_TESTS "neural_engine: build tests" ${NE_STANDALONE})
option(NE_BTLA_UT "enable BesTLA's unit tests" OFF)
option(NE_BUILD_EXAMPLES "neural_engine: build examples" ${NE_STANDALONE})
if(NE_BUILD_TESTS)
add_compile_definitions(NE_BUILD_TESTS)
Expand Down Expand Up @@ -139,6 +140,11 @@ if (NE_PYTHON_API)
add_subdirectory(third_party/pybind11)
endif()

add_subdirectory(bestla jblas)
# Forward the top-level NE_BTLA_UT switch to BesTLA: BTLA_UT_ALL is the option
# that bestla/CMakeLists.txt uses to switch on every unit-test group.
# NOTE(review): this sets a normal (non-cache) variable. bestla/CMakeLists.txt
# only requires CMake 3.12, so policy CMP0077 (introduced in CMake 3.13) is
# presumably not NEW there and option(BTLA_UT_ALL ...) may discard this value
# on the first configure -- confirm.
if (NE_BTLA_UT)
set(BTLA_UT_ALL ON)
endif()
# Look up OpenMP here, before the bestla and neural_speed subdirectories are added.
include(FindOpenMP)

add_subdirectory(bestla)

add_subdirectory(neural_speed)
10 changes: 10 additions & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,16 @@
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-debug",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
},
{
"name": "x64-bestla-UT",
"displayName": "x64 BesTLA unit test",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (Release, BesTLA unit tests enabled)",
"inherits": "x64-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"NE_BTLA_UT": "ON"
}
}
]
}
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ Argument description of inference.py:
| --keep | Number of tokens to keep from the initial prompt: Int (default: 0, -1 = all) |
| --shift-roped-k | Use [ring-buffer](./docs/infinite_inference.md#shift-rope-k-and-ring-buffer) and thus avoid re-computing after reaching ctx_size (default: False) |
| --glm_tokenizer | The path of the chatglm tokenizer: String (default: THUDM/chatglm-6b) |
| --memory-f32 <br> --memory-f16 <br> --memory-auto | Data type of kv memory (default to auto);<br>If set to auto, the runtime will try with jblas flash attn managed format (currently requires GCC11+ & AMX) and fall back to fp16 if failed |
| --memory-f32 <br> --memory-f16 <br> --memory-auto | Data type of kv memory (defaults to auto);<br>If set to auto, the runtime will first try the bestla flash attn managed format (currently requires GCC11+ & AMX) and fall back to fp16 if that fails |


### 3. Tensor Parallelism cross nodes/sockets
Expand Down
101 changes: 51 additions & 50 deletions bestla/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,40 +1,40 @@
cmake_minimum_required(VERSION 3.5)
cmake_minimum_required(VERSION 3.12)

project(jblas LANGUAGES CXX VERSION 0.1.0)
project(bestla LANGUAGES CXX VERSION 0.1.0)
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)

option(JBLAS_UT_ALL "Enable all unit tests" OFF)
option(JBLAS_UT_DEBUG "Enable debug unit tests" ON)
option(JBLAS_UT_EPILOGUE "Enable unit test for epilogue" OFF)
option(JBLAS_UT_PROLOGUE_A "Enable unit test for activation prologue" OFF)
option(JBLAS_UT_PROLOGUE_B "Enable unit test for weight prologue" OFF)
option(JBLAS_UT_GEMM "Enable unit test for micro gemm kernels" OFF)
option(JBLAS_UT_WRAPPER "Enable unit test for parallel gemms" OFF)
option(JBLAS_UT_PARALLEL "Enable unit test for parallel set" OFF)
option(JBLAS_UT_KERNEL_JIT "Enable unit test for jit kernels" OFF)
option(JBLAS_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
option(JBLAS_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
option(JBLAS_UT_NOASAN "Disable sanitize" OFF)
option(JBLAS_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
option(JBLAS_UT_OPENMP "Use OpenMP" ON)

if(JBLAS_UT_ALL)
set(JBLAS_UT_EPILOGUE ON)
set(JBLAS_UT_PROLOGUE_A ON)
set(JBLAS_UT_PROLOGUE_B ON)
set(JBLAS_UT_GEMM ON)
set(JBLAS_UT_WRAPPER ON)
set(JBLAS_UT_PARALLEL ON)
set(JBLAS_UT_KERNEL_JIT ON)
set(JBLAS_UT_KERNEL_INTRIN ON)
set(JBLAS_UT_KERNEL_WRAPPER ON)
endif(JBLAS_UT_ALL)
option(BTLA_UT_ALL "Enable all unit tests" OFF)
option(BTLA_UT_DEBUG "Enable debug unit tests" ON)
option(BTLA_UT_EPILOGUE "Enable unit test for epilogue" OFF)
option(BTLA_UT_PROLOGUE_A "Enable unit test for activation prologue" OFF)
option(BTLA_UT_PROLOGUE_B "Enable unit test for weight prologue" OFF)
option(BTLA_UT_GEMM "Enable unit test for micro gemm kernels" OFF)
option(BTLA_UT_WRAPPER "Enable unit test for parallel gemms" OFF)
option(BTLA_UT_PARALLEL "Enable unit test for parallel set" OFF)
option(BTLA_UT_KERNEL_JIT "Enable unit test for jit kernels" OFF)
option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
option(BTLA_UT_NOASAN "Disable sanitize" OFF)
option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
option(BTLA_UT_OPENMP "Use OpenMP" ON)

# BTLA_UT_ALL is a convenience switch: when it is ON, each of the individual
# unit-test groups listed in this block is switched ON as well.
# BTLA_UT_DEBUG, BTLA_UT_BENCHMARK, BTLA_UT_NOASAN and BTLA_UT_OPENMP are not
# modified here and keep whatever value they already have.
if(BTLA_UT_ALL)
set(BTLA_UT_EPILOGUE ON)
set(BTLA_UT_PROLOGUE_A ON)
set(BTLA_UT_PROLOGUE_B ON)
set(BTLA_UT_GEMM ON)
set(BTLA_UT_WRAPPER ON)
set(BTLA_UT_PARALLEL ON)
set(BTLA_UT_KERNEL_JIT ON)
set(BTLA_UT_KERNEL_INTRIN ON)
set(BTLA_UT_KERNEL_WRAPPER ON)
endif(BTLA_UT_ALL)

set(UT_BUILD FALSE)
if(JBLAS_UT_DEBUG OR JBLAS_UT_PROLOGUE_A OR JBLAS_UT_PROLOGUE_B OR JBLAS_UT_EPILOGUE OR JBLAS_UT_GEMM
OR JBLAS_UT_WRAPPER OR JBLAS_UT_PARALLEL OR JBLAS_UT_KERNEL_JIT OR JBLAS_UT_KERNEL_INTRIN
OR JBLAS_UT_KERNEL_WRAPPER)
if(BTLA_UT_DEBUG OR BTLA_UT_PROLOGUE_A OR BTLA_UT_PROLOGUE_B OR BTLA_UT_EPILOGUE OR BTLA_UT_GEMM
OR BTLA_UT_WRAPPER OR BTLA_UT_PARALLEL OR BTLA_UT_KERNEL_JIT OR BTLA_UT_KERNEL_INTRIN
OR BTLA_UT_KERNEL_WRAPPER)
set(UT_BUILD TRUE)
endif()

Expand Down Expand Up @@ -91,10 +91,7 @@ if(WIN32)
target_link_options(${PROJECT_NAME} INTERFACE /STACK:5242880) #Stack requires up to L2 cache size
endif(WIN32)

if(JBLAS_UT_OPENMP)
include(FindOpenMP)
target_link_libraries(${PROJECT_NAME} INTERFACE OpenMP::OpenMP_CXX OpenMP::OpenMP_C)
endif()


set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
Expand All @@ -109,27 +106,31 @@ target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)
if(UT_BUILD)
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) # compile everything, even when only some of the UTs are run
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
include_directories(${PROJECT_NAME})
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})

# When BTLA_UT_OPENMP is ON, link the OpenMP imported targets into the
# unit-test executable only (PRIVATE on ${PROJECT_NAME}_ut); this block does
# not add OpenMP to the ${PROJECT_NAME} library target.
if(BTLA_UT_OPENMP)
include(FindOpenMP)
target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX OpenMP::OpenMP_C)
endif()
if(NOT WIN32)
if(NOT JBLAS_UT_NOASAN)
target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
target_link_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
if(NOT BTLA_UT_NOASAN)
target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
target_link_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
endif()
target_link_options(${PROJECT_NAME}_ut PRIVATE -lpthread)
endif()

add_ut_flag(JBLAS_UT_DEBUG)
add_ut_flag(JBLAS_UT_EPILOGUE)
add_ut_flag(JBLAS_UT_PROLOGUE_A)
add_ut_flag(JBLAS_UT_PROLOGUE_B)
add_ut_flag(JBLAS_UT_GEMM)
add_ut_flag(JBLAS_UT_PARALLEL)
add_ut_flag(JBLAS_UT_WRAPPER)
add_ut_flag(JBLAS_UT_KERNEL_INTRIN)
add_ut_flag(JBLAS_UT_KERNEL_JIT)
add_ut_flag(JBLAS_UT_KERNEL_WRAPPER)
add_ut_flag(JBLAS_UT_BENCHMARK)
add_ut_flag(BTLA_UT_DEBUG)
add_ut_flag(BTLA_UT_EPILOGUE)
add_ut_flag(BTLA_UT_PROLOGUE_A)
add_ut_flag(BTLA_UT_PROLOGUE_B)
add_ut_flag(BTLA_UT_GEMM)
add_ut_flag(BTLA_UT_PARALLEL)
add_ut_flag(BTLA_UT_WRAPPER)
add_ut_flag(BTLA_UT_KERNEL_INTRIN)
add_ut_flag(BTLA_UT_KERNEL_JIT)
add_ut_flag(BTLA_UT_KERNEL_WRAPPER)
add_ut_flag(BTLA_UT_BENCHMARK)

target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
endif(UT_BUILD)
Expand Down
4 changes: 2 additions & 2 deletions bestla/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,6 @@ Compile:

Usage:
```cmake
add_subdirectory(jblas)
target_link_libraries("${YOUR_PROJECT}" jblas::jblas)
add_subdirectory(bestla)
target_link_libraries("${YOUR_PROJECT}" bestla::bestla)
```
46 changes: 20 additions & 26 deletions bestla/jblas/jit_blas.h → bestla/bestla/bestla.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,26 @@
// limitations under the License.
#pragma once
#include <stdint.h>
enum JBLAS_CODE {
JblasSuccess = 0,
JblasInvalidParam = 1,
JblasInvalidISA = 2,
JblasRuntimeError = 4,
JblasNotSupport = 8,
enum class BTLA_CODE {
Success = 0,
InvalidParam = 1,
InvalidISA = 2,
RuntimeError = 4,
NotSupport = 8,
};
enum JBLAS_ISA : uint8_t {
JblasNoSIMD = 0,
JblasAVX,
JblasAVX2,
JblasAVX_VNNI,
JblasAVX512F,
JblasAVX512_VNNI,
JblasAMX_BF16,
JblasAMX_INT8,
JblasAVX512_FP16,
JblasAVX512_BF16,
enum class BTLA_ISA : uint8_t {
NoSIMD = 0,
AVX,
AVX2,
AVX_VNNI,
AVX512F,
AVX512_VNNI,
AMX_BF16,
AMX_INT8,
AVX512_FP16,
AVX512_BF16,
};
enum class JBLAS_DTYPE : uint32_t {
enum class BTLA_DTYPE : uint32_t {
EleBitsMask = 0xff,
EleBitsShift = 0,
EleBitsUndef = 0,
Expand Down Expand Up @@ -70,15 +70,9 @@ enum class JBLAS_DTYPE : uint32_t {
U32 = EleBits32 | TypeInt | SubType1,
};

enum JBLAS_LAYOUT { JblasRowMajor = 101, JblasColMajor = 102 };
enum JBLAS_TRANSPOSE {
JblasNoTrans = 111,
JblasTrans = 112,
JblasConjTrans = 113,
};
enum JBLAS_ELTWISEOP { GELU, SWISH, TANH, EXP, LOW_PRECISION_EXP, RELU, LINEAR };
enum class BTLA_ELTWISEOP { GELU, SWISH, TANH, EXP, LOW_PRECISION_EXP, RELU, LINEAR };

enum class JBLAS_PROLOGUEB_IDS : uint32_t {
enum class BTLA_PROLOGUEB_IDS : uint32_t {
Undef = (uint32_t)-1,
Begin = 0,
NormalBegin = Begin,
Expand Down
24 changes: 12 additions & 12 deletions bestla/jblas/jit_blas_device.h → bestla/bestla/bestla_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@
#include <map>
#include <thread>
#include <vector>
#include "jit_blas.h"
#include "bestla.h"
#include "xbyak/xbyak_util.h"
#ifdef _WIN32
#include <windows.h>
#else
#include <sched.h>
#endif

namespace jblas {
namespace bestla {

namespace device {

Expand Down Expand Up @@ -195,16 +195,16 @@ class SapphireRapids {
static constexpr bool AMX_COMPLEX = 0;
};

template <JBLAS_ISA ISA_T>
template <BTLA_ISA ISA_T>
class isa_base {
public:
static bool constexpr avx = ISA_T >= JblasAVX;
static bool constexpr avx2 = ISA_T >= JblasAVX2;
static bool constexpr avx512f = ISA_T >= JblasAVX512F;
static bool constexpr avx512_vnni = ISA_T >= JblasAVX512_VNNI;
static bool constexpr avx512_fp16 = ISA_T >= JblasAVX512_FP16;
static bool constexpr amx_bf16 = ISA_T >= JblasAMX_BF16;
static bool constexpr amx_int8 = ISA_T >= JblasAMX_INT8;
static bool constexpr avx = ISA_T >= BTLA_ISA::AVX;
static bool constexpr avx2 = ISA_T >= BTLA_ISA::AVX2;
static bool constexpr avx512f = ISA_T >= BTLA_ISA::AVX512F;
static bool constexpr avx512_vnni = ISA_T >= BTLA_ISA::AVX512_VNNI;
static bool constexpr avx512_fp16 = ISA_T >= BTLA_ISA::AVX512_FP16;
static bool constexpr amx_bf16 = ISA_T >= BTLA_ISA::AMX_BF16;
static bool constexpr amx_int8 = ISA_T >= BTLA_ISA::AMX_INT8;
};

class CpuDevice {
Expand Down Expand Up @@ -422,7 +422,7 @@ class CpuDevice {
float P_power = 4.8, E_power = 2.3;
};

#define GetCPUDevice() auto _cd = jblas::device::CpuDevice::getInstance();
#define GetCPUDevice() auto _cd = bestla::device::CpuDevice::getInstance();

class CpuBase {
public:
Expand All @@ -436,4 +436,4 @@ class CpuBase {
int mNumThreads;
};
} // namespace device
} // namespace jblas
} // namespace bestla
Loading

0 comments on commit d5c26d4

Please sign in to comment.