diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..7d2913bd3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+################################################################################
+# This .gitignore file was automatically created by Microsoft(R) Visual Studio.
+################################################################################
+
+/.vs
+/out
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 463bb2117..69366f3a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -70,6 +70,7 @@ if (NE_GPU)
endif()
option(NE_BUILD_TESTS "neural_engine: build tests" ${NE_STANDALONE})
+option(NE_BTLA_UT "enable BesTLA's unit tests" OFF) # when ON, BTLA_UT_ALL is set to ON before the bestla subdirectory is added
option(NE_BUILD_EXAMPLES "neural_engine: build examples" ${NE_STANDALONE})
if(NE_BUILD_TESTS)
add_compile_definitions(NE_BUILD_TESTS)
@@ -139,6 +140,11 @@ if (NE_PYTHON_API)
add_subdirectory(third_party/pybind11)
endif()
-add_subdirectory(bestla jblas)
+if (NE_BTLA_UT)
+ set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) # bestla/ requires only CMake 3.12, so without this its option() would discard the normal variable set on the next line
+ set(BTLA_UT_ALL ON) # NE_BTLA_UT=ON switches on the BTLA_UT_ALL shortcut of the bestla subproject
+endif()
+find_package(OpenMP) # optional here; defines the OpenMP::OpenMP_<LANG> imported targets for the enabled languages when found
+add_subdirectory(bestla)
add_subdirectory(neural_speed)
diff --git a/CMakePresets.json b/CMakePresets.json
index a3c8cdf25..6cca625b1 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -57,6 +57,16 @@
"description": "Target Windows (64-bit) with the Visual Studio development environment. (RelWithDebInfo)",
"inherits": "x64-debug",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
+ },
+ {
+ "name": "x64-bestla-UT",
+ "displayName": "x64 BesTLA unit test",
+ "description": "Target Windows (64-bit) with the Visual Studio development environment. (Release, BesTLA unit tests enabled)",
+ "inherits": "x64-debug",
+ "cacheVariables": {
+ "CMAKE_BUILD_TYPE": "Release",
+ "NE_BTLA_UT": "ON"
+ }
}
]
}
diff --git a/README.md b/README.md
index 4683601e5..1be563b17 100644
--- a/README.md
+++ b/README.md
@@ -385,7 +385,7 @@ Argument description of inference.py:
| --keep | Number of tokens to keep from the initial prompt: Int (default: 0, -1 = all) |
| --shift-roped-k | Use [ring-buffer](./docs/infinite_inference.md#shift-rope-k-and-ring-buffer) and thus do not re-computing after reaching ctx_size (default: False) |
| --glm_tokenizer | The path of the chatglm tokenizer: String (default: THUDM/chatglm-6b) |
-| --memory-f32
--memory-f16
--memory-auto | Data type of kv memory (default to auto);
If set to auto, the runtime will try with jblas flash attn managed format (currently requires GCC11+ & AMX) and fall back to fp16 if failed |
+| --memory-f32
--memory-f16
--memory-auto | Data type of kv memory (default to auto);
If set to auto, the runtime first tries the bestla flash attn managed format (currently requires GCC11+ & AMX) and falls back to fp16 if that fails |
### 3. Tensor Parallelism cross nodes/sockets
diff --git a/bestla/CMakeLists.txt b/bestla/CMakeLists.txt
index d05d1b299..c07ac66dc 100644
--- a/bestla/CMakeLists.txt
+++ b/bestla/CMakeLists.txt
@@ -1,40 +1,40 @@
-cmake_minimum_required(VERSION 3.5)
+cmake_minimum_required(VERSION 3.12)
-project(jblas LANGUAGES CXX VERSION 0.1.0)
+project(bestla LANGUAGES CXX VERSION 0.1.0)
file(GLOB headers ${PROJECT_NAME}/*.h ${PROJECT_NAME}/*.hpp)
file(GLOB xbyak_headers ${PROJECT_NAME}/xbyak/*.h ${PROJECT_NAME}/xbyak/*.hpp)
-option(JBLAS_UT_ALL "Enable all unit tests" OFF)
-option(JBLAS_UT_DEBUG "Enable debug unit tests" ON)
-option(JBLAS_UT_EPILOGUE "Enable unit test for epilogue" OFF)
-option(JBLAS_UT_PROLOGUE_A "Enable unit test for activation prologue" OFF)
-option(JBLAS_UT_PROLOGUE_B "Enable unit test for weight prologue" OFF)
-option(JBLAS_UT_GEMM "Enable unit test for micro gemm kernels" OFF)
-option(JBLAS_UT_WRAPPER "Enable unit test for parallel gemms" OFF)
-option(JBLAS_UT_PARALLEL "Enable unit test for parallel set" OFF)
-option(JBLAS_UT_KERNEL_JIT "Enable unit test for jit kernels" OFF)
-option(JBLAS_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
-option(JBLAS_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
-option(JBLAS_UT_NOASAN "Disable sanitize" OFF)
-option(JBLAS_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
-option(JBLAS_UT_OPENMP "Use OpenMP" ON)
-
-if(JBLAS_UT_ALL)
-set(JBLAS_UT_EPILOGUE ON)
-set(JBLAS_UT_PROLOGUE_A ON)
-set(JBLAS_UT_PROLOGUE_B ON)
-set(JBLAS_UT_GEMM ON)
-set(JBLAS_UT_WRAPPER ON)
-set(JBLAS_UT_PARALLEL ON)
-set(JBLAS_UT_KERNEL_JIT ON)
-set(JBLAS_UT_KERNEL_INTRIN ON)
-set(JBLAS_UT_KERNEL_WRAPPER ON)
-endif(JBLAS_UT_ALL)
+# Unit-test build switches; each one can be overridden on the command line with -D<NAME>=ON|OFF.
+option(BTLA_UT_ALL "Enable all unit tests" OFF)
+option(BTLA_UT_DEBUG "Enable debug unit tests" ON)
+option(BTLA_UT_EPILOGUE "Enable unit test for epilogue" OFF)
+option(BTLA_UT_PROLOGUE_A "Enable unit test for activation prologue" OFF)
+option(BTLA_UT_PROLOGUE_B "Enable unit test for weight prologue" OFF)
+option(BTLA_UT_GEMM "Enable unit test for micro gemm kernels" OFF)
+option(BTLA_UT_WRAPPER "Enable unit test for parallel gemms" OFF)
+option(BTLA_UT_PARALLEL "Enable unit test for parallel set" OFF)
+option(BTLA_UT_KERNEL_JIT "Enable unit test for jit kernels" OFF)
+option(BTLA_UT_KERNEL_INTRIN "Enable unit test for intrinsic kernels" OFF)
+option(BTLA_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels" OFF)
+option(BTLA_UT_NOASAN "Disable sanitize" OFF)
+option(BTLA_UT_BENCHMARK "Benchmark ON may take a long time to finish all tests" OFF)
+option(BTLA_UT_OPENMP "Use OpenMP" ON)
+
+# BTLA_UT_ALL is a shortcut: it sets every per-category switch listed below to ON
+# as a normal variable (the cache entries themselves are not rewritten).
+# BTLA_UT_DEBUG, BTLA_UT_NOASAN, BTLA_UT_BENCHMARK and BTLA_UT_OPENMP are not touched by it.
+if(BTLA_UT_ALL)
+  foreach(ut_category
+      EPILOGUE PROLOGUE_A PROLOGUE_B GEMM WRAPPER
+      PARALLEL KERNEL_JIT KERNEL_INTRIN KERNEL_WRAPPER)
+    set(BTLA_UT_${ut_category} ON)
+  endforeach()
+endif()
set(UT_BUILD FALSE)
-if(JBLAS_UT_DEBUG OR JBLAS_UT_PROLOGUE_A OR JBLAS_UT_PROLOGUE_B OR JBLAS_UT_EPILOGUE OR JBLAS_UT_GEMM
-OR JBLAS_UT_WRAPPER OR JBLAS_UT_PARALLEL OR JBLAS_UT_KERNEL_JIT OR JBLAS_UT_KERNEL_INTRIN
-OR JBLAS_UT_KERNEL_WRAPPER)
+if(BTLA_UT_DEBUG OR BTLA_UT_PROLOGUE_A OR BTLA_UT_PROLOGUE_B OR BTLA_UT_EPILOGUE OR BTLA_UT_GEMM
+OR BTLA_UT_WRAPPER OR BTLA_UT_PARALLEL OR BTLA_UT_KERNEL_JIT OR BTLA_UT_KERNEL_INTRIN
+OR BTLA_UT_KERNEL_WRAPPER)
set(UT_BUILD TRUE)
endif()
@@ -91,10 +91,7 @@ if(WIN32)
target_link_options(${PROJECT_NAME} INTERFACE /STACK:5242880) #Stack requires up to L2 cache size
endif(WIN32)
-if(JBLAS_UT_OPENMP)
-include(FindOpenMP)
-target_link_libraries(${PROJECT_NAME} INTERFACE OpenMP::OpenMP_CXX OpenMP::OpenMP_C)
-endif()
+
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -109,27 +106,31 @@ target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)
if(UT_BUILD)
file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp) #compile everthing even run parts of UTs
file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})
-
+ target_include_directories(${PROJECT_NAME}_ut PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}) # header path for the UT sources, scoped to the UT target instead of the whole directory
+ if(BTLA_UT_OPENMP)
+ find_package(OpenMP REQUIRED COMPONENTS CXX) # fail at configure time with a clear message if OpenMP for C++ is missing
+ target_link_libraries(${PROJECT_NAME}_ut PRIVATE OpenMP::OpenMP_CXX) # project() enables CXX only, so OpenMP::OpenMP_C may not exist in a standalone build
+ endif()
if(NOT WIN32)
- if(NOT JBLAS_UT_NOASAN)
- target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
- target_link_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
+ if(NOT BTLA_UT_NOASAN)
+ target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
+ target_link_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
endif()
target_link_options(${PROJECT_NAME}_ut PRIVATE -lpthread)
endif()
- add_ut_flag(JBLAS_UT_DEBUG)
- add_ut_flag(JBLAS_UT_EPILOGUE)
- add_ut_flag(JBLAS_UT_PROLOGUE_A)
- add_ut_flag(JBLAS_UT_PROLOGUE_B)
- add_ut_flag(JBLAS_UT_GEMM)
- add_ut_flag(JBLAS_UT_PARALLEL)
- add_ut_flag(JBLAS_UT_WRAPPER)
- add_ut_flag(JBLAS_UT_KERNEL_INTRIN)
- add_ut_flag(JBLAS_UT_KERNEL_JIT)
- add_ut_flag(JBLAS_UT_KERNEL_WRAPPER)
- add_ut_flag(JBLAS_UT_BENCHMARK)
+ # Pass every unit-test switch to add_ut_flag() (not part of this hunk; presumably defined earlier in this file).
+ foreach(ut_switch IN ITEMS
+     BTLA_UT_DEBUG
+     BTLA_UT_EPILOGUE
+     BTLA_UT_PROLOGUE_A BTLA_UT_PROLOGUE_B
+     BTLA_UT_GEMM
+     BTLA_UT_PARALLEL BTLA_UT_WRAPPER
+     BTLA_UT_KERNEL_INTRIN BTLA_UT_KERNEL_JIT BTLA_UT_KERNEL_WRAPPER
+     BTLA_UT_BENCHMARK)
+   add_ut_flag(${ut_switch})
+ endforeach()
target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
endif(UT_BUILD)
diff --git a/bestla/README.md b/bestla/README.md
index 08a29a9c9..8b46f5a9b 100644
--- a/bestla/README.md
+++ b/bestla/README.md
@@ -52,6 +52,6 @@ Compile:
Usage:
```cmake
-add_subdirectory(jblas)
-target_link_libraries("${YOUR_PROJECT}" jblas::jblas)
+add_subdirectory(bestla)
+target_link_libraries("${YOUR_PROJECT}" PRIVATE bestla::bestla) # use PUBLIC instead if your own public headers include BesTLA headers
```
diff --git a/bestla/jblas/jit_blas.h b/bestla/bestla/bestla.h
similarity index 74%
rename from bestla/jblas/jit_blas.h
rename to bestla/bestla/bestla.h
index 8446698e3..890704814 100644
--- a/bestla/jblas/jit_blas.h
+++ b/bestla/bestla/bestla.h
@@ -13,26 +13,26 @@
// limitations under the License.
#pragma once
#include
-enum JBLAS_CODE {
- JblasSuccess = 0,
- JblasInvalidParam = 1,
- JblasInvalidISA = 2,
- JblasRuntimeError = 4,
- JblasNotSupport = 8,
+enum class BTLA_CODE {  // scoped status-code enum replacing the unscoped JBLAS_CODE; values are now written BTLA_CODE::Success etc.
+ Success = 0,
+ InvalidParam = 1,
+ InvalidISA = 2,
+ RuntimeError = 4,
+ NotSupport = 8,  // the non-zero codes are distinct powers of two (1, 2, 4, 8)
};
-enum JBLAS_ISA : uint8_t {
- JblasNoSIMD = 0,
- JblasAVX,
- JblasAVX2,
- JblasAVX_VNNI,
- JblasAVX512F,
- JblasAVX512_VNNI,
- JblasAMX_BF16,
- JblasAMX_INT8,
- JblasAVX512_FP16,
- JblasAVX512_BF16,
+enum class BTLA_ISA : uint8_t {  // scoped enum with a one-byte underlying type; replaces the unscoped JBLAS_ISA
+ NoSIMD = 0,
+ AVX,  // 1 -- enumerators without an initializer continue from the previous value
+ AVX2,  // 2
+ AVX_VNNI,  // 3
+ AVX512F,  // 4
+ AVX512_VNNI,  // 5
+ AMX_BF16,  // 6
+ AMX_INT8,  // 7
+ AVX512_FP16,  // 8
+ AVX512_BF16,  // 9
};
-enum class JBLAS_DTYPE : uint32_t {
+enum class BTLA_DTYPE : uint32_t {
EleBitsMask = 0xff,
EleBitsShift = 0,
EleBitsUndef = 0,
@@ -70,15 +70,9 @@ enum class JBLAS_DTYPE : uint32_t {
U32 = EleBits32 | TypeInt | SubType1,
};
-enum JBLAS_LAYOUT { JblasRowMajor = 101, JblasColMajor = 102 };
-enum JBLAS_TRANSPOSE {
- JblasNoTrans = 111,
- JblasTrans = 112,
- JblasConjTrans = 113,
-};
-enum JBLAS_ELTWISEOP { GELU, SWISH, TANH, EXP, LOW_PRECISION_EXP, RELU, LINEAR };
+enum class BTLA_ELTWISEOP { GELU, SWISH, TANH, EXP, LOW_PRECISION_EXP, RELU, LINEAR };  // scoped: enumerators need the BTLA_ELTWISEOP:: qualifier; values are 0..6 in declaration order
-enum class JBLAS_PROLOGUEB_IDS : uint32_t {
+enum class BTLA_PROLOGUEB_IDS : uint32_t {
Undef = (uint32_t)-1,
Begin = 0,
NormalBegin = Begin,
diff --git a/bestla/jblas/jit_blas_device.h b/bestla/bestla/bestla_device.h
similarity index 95%
rename from bestla/jblas/jit_blas_device.h
rename to bestla/bestla/bestla_device.h
index 4e54b63d3..ebfc7d8de 100644
--- a/bestla/jblas/jit_blas_device.h
+++ b/bestla/bestla/bestla_device.h
@@ -15,7 +15,7 @@
#include