Skip to content
This repository has been archived by the owner on Aug 30, 2024. It is now read-only.

Commit

Permalink
rename jblas tp bestla
Browse files Browse the repository at this point in the history
directory reorg

Signed-off-by: Hengyu Meng <[email protected]>
  • Loading branch information
airMeng committed Nov 27, 2023
1 parent 356b8ac commit 3718cbc
Show file tree
Hide file tree
Showing 47 changed files with 27,524 additions and 6,081 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
30 changes: 4 additions & 26 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,36 +130,14 @@ if (NOT MSVC)
endif()
endif()

include(cmake/ISA.cmake)
include(cmake/Common.cmake)

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)

set(COMMON_HEADER_DIRS
${PROJECT_SOURCE_DIR}
)

if(NE_GPU)
list(APPEND COMMON_HEADER_DIRS ${GPU_ROOT}/include)
list(APPEND COMMON_LIB_DIRS ${GPU_ROOT})
if (NE_PYTHON_API)
add_subdirectory(third_party/pybind11)
endif()

include_directories(
${COMMON_HEADER_DIRS}
)

link_directories(
${COMMON_LIB_DIRS}
)
add_subdirectory(bestla jblas)

add_subdirectory(../../library/jblas jblas)

add_subdirectory(core)
add_subdirectory(vectors)
add_subdirectory(models)

if (NE_BUILD_APPLICATIONS)
add_subdirectory(application)
endif()
add_subdirectory(neural_speed)
144 changes: 144 additions & 0 deletions bestla/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
# Build script for the header-only jblas library: an INTERFACE target,
# its install/export rules and an optional unit-test executable.
cmake_minimum_required(VERSION 3.5)

project(
  jblas
  VERSION 0.1.0
  LANGUAGES CXX
)

# Header lists, resolved relative to this directory. `headers` is attached to
# the unit-test executable and installed; `xbyak_headers` is only installed.
file(
  GLOB headers
  ${PROJECT_NAME}/*.h
  ${PROJECT_NAME}/*.hpp
)
file(
  GLOB xbyak_headers
  ${PROJECT_NAME}/xbyak/*.h
  ${PROJECT_NAME}/xbyak/*.hpp
)

# --- Unit-test switches -------------------------------------------------------
# Master switch and the always-available debug suite.
option(JBLAS_UT_ALL            "Enable all unit tests"                                  OFF)
option(JBLAS_UT_DEBUG          "Enable debug unit tests"                                ON)

# Individual suites; each one can be enabled on its own.
option(JBLAS_UT_EPILOGUE       "Enable unit test for epilogue"                          OFF)
option(JBLAS_UT_PROLOGUE_A     "Enable unit test for activation prologue"               OFF)
option(JBLAS_UT_PROLOGUE_B     "Enable unit test for weight prologue"                   OFF)
option(JBLAS_UT_GEMM           "Enable unit test for micro gemm kernels"                OFF)
option(JBLAS_UT_WRAPPER        "Enable unit test for parallel gemms"                    OFF)
option(JBLAS_UT_PARALLEL       "Enable unit test for parallel set"                      OFF)
option(JBLAS_UT_KERNEL_JIT     "Enable unit test for jit kernels"                       OFF)
option(JBLAS_UT_KERNEL_INTRIN  "Enable unit test for intrinsic kernels"                 OFF)
option(JBLAS_UT_KERNEL_WRAPPER "Enable unit test for runtime ISA kernels"               OFF)

# Build-behaviour switches for the unit-test executable and the library.
option(JBLAS_UT_NOASAN         "Disable sanitize"                                       OFF)
option(JBLAS_UT_BENCHMARK      "Benchmark ON may take a long time to finish all tests"  OFF)
option(JBLAS_UT_OPENMP         "Use OpenMP"                                             ON)

# JBLAS_UT_ALL is a convenience switch: it force-enables every individual
# unit-test suite (the debug and benchmark switches are left untouched).
if(JBLAS_UT_ALL)
  foreach(ut_suite
      EPILOGUE
      PROLOGUE_A
      PROLOGUE_B
      GEMM
      WRAPPER
      PARALLEL
      KERNEL_JIT
      KERNEL_INTRIN
      KERNEL_WRAPPER)
    set(JBLAS_UT_${ut_suite} ON)
  endforeach()
endif()

# UT_BUILD records whether at least one unit-test suite is enabled; it gates
# the creation of the unit-test executable further down.
if(JBLAS_UT_DEBUG
   OR JBLAS_UT_PROLOGUE_A
   OR JBLAS_UT_PROLOGUE_B
   OR JBLAS_UT_EPILOGUE
   OR JBLAS_UT_GEMM
   OR JBLAS_UT_WRAPPER
   OR JBLAS_UT_PARALLEL
   OR JBLAS_UT_KERNEL_JIT
   OR JBLAS_UT_KERNEL_INTRIN
   OR JBLAS_UT_KERNEL_WRAPPER)
  set(UT_BUILD TRUE)
else()
  set(UT_BUILD FALSE)
endif()

# Provides CMAKE_INSTALL_INCLUDEDIR / CMAKE_INSTALL_LIBDIR used below.
include(GNUInstallDirs)

# Header-only library: an INTERFACE target plus a namespaced alias so that
# in-tree consumers can link the same name as installed-package consumers.
add_library(${PROJECT_NAME} INTERFACE)
add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME})

# Include root differs between the build tree and an installed package.
target_include_directories(
  ${PROJECT_NAME}
  INTERFACE
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
    "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
)

# Register the target in the export set that is installed at the end of this
# section.
install(
  TARGETS ${PROJECT_NAME}
  EXPORT ${PROJECT_NAME}-targets
  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
)

# Generate <name>-config.cmake from the template and a matching version file
# (the version defaults to the one given to project()).
include(CMakePackageConfigHelpers)
configure_package_config_file(
  cmake/config.cmake.in
  "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
  INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)
write_basic_package_version_file(
  "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
  COMPATIBILITY SameMajorVersion
)

# Install the generated package files next to the exported targets file.
install(
  FILES
    "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
    "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config-version.cmake"
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)

# Writes <name>-targets.cmake defining the imported target <name>::<name>.
install(
  EXPORT ${PROJECT_NAME}-targets
  NAMESPACE ${PROJECT_NAME}::
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}
)

# Windows usage requirements, propagated to every consumer of the library.
# The preprocessor definitions are valid for any compiler targeting Windows.
if(WIN32)
  target_compile_definitions(${PROJECT_NAME} INTERFACE _CRT_SECURE_NO_WARNINGS NOMINMAX)
endif()

# The /wd and /STACK switches use cl.exe / link.exe syntax, so they are only
# emitted for MSVC or MSVC-compatible front ends. A GNU-style toolchain on
# Windows (e.g. MinGW) would otherwise receive flags it cannot parse.
if(MSVC)
  target_compile_options(
    ${PROJECT_NAME}
    INTERFACE
      /wd4068 # ignore unroll and GCC flags
      /wd4849 # ignore collapse
      /wd6262 # ignore stack too large
      /wd4702 # unreachable code (false warning on constexpr condition)
      /wd4100 # unreferenced formal parameter
  )

  # Stack requires up to L2 cache size.
  target_link_options(${PROJECT_NAME} INTERFACE /STACK:5242880)
endif()

# Optional OpenMP support, exposed to consumers through the INTERFACE target.
# find_package() is the supported way to run a Find module; REQUIRED makes a
# missing OpenMP fail here with a clear message instead of later as an
# unresolved OpenMP::OpenMP_CXX target.
if(JBLAS_UT_OPENMP)
  find_package(OpenMP REQUIRED)
  target_link_libraries(${PROJECT_NAME} INTERFACE OpenMP::OpenMP_CXX)
endif()

# Default language standard for targets created after this point in this
# directory (i.e. the unit-test executable).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# add_ut_flag(<OPTION_NAME>)
#
# If the variable named <OPTION_NAME> is true, adds <OPTION_NAME> as a private
# preprocessor definition on the ${PROJECT_NAME}_ut target. That target must
# already exist when this function is called.
#
# Example: add_ut_flag(JBLAS_UT_GEMM)
function(add_ut_flag UT_OPTION)
  # ${UT_OPTION} expands to the option's *name*; if() then evaluates that
  # variable itself. This avoids a second expansion of the option's value.
  if(${UT_OPTION})
    target_compile_definitions(${PROJECT_NAME}_ut PRIVATE ${UT_OPTION})
  endif()
endfunction()

# Consumers of the header-only library must compile as C++17 or newer.
target_compile_features(${PROJECT_NAME} INTERFACE cxx_std_17)

if(UT_BUILD)
  # Compile every unit-test source even when only some suites are enabled.
  file(GLOB srcs ${PROJECT_NAME}/ut/*.cc ${PROJECT_NAME}/ut/*.cpp)
  file(GLOB ut_headers ${PROJECT_NAME}/ut/*.h)
  add_executable(${PROJECT_NAME}_ut ${srcs} ${headers} ${ut_headers})

  if(NOT WIN32)
    # AddressSanitizer is on by default; JBLAS_UT_NOASAN opts out.
    if(NOT JBLAS_UT_NOASAN)
      target_compile_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
      target_link_options(${PROJECT_NAME}_ut PRIVATE -fsanitize=address)
    endif()

    # Link the thread library as a library rather than as a raw linker flag,
    # so it is placed after the object files on the link line.
    find_package(Threads REQUIRED)
    target_link_libraries(${PROJECT_NAME}_ut PRIVATE Threads::Threads)
  endif()

  # Forward each enabled switch to the test sources as a preprocessor
  # definition of the same name.
  add_ut_flag(JBLAS_UT_DEBUG)
  add_ut_flag(JBLAS_UT_EPILOGUE)
  add_ut_flag(JBLAS_UT_PROLOGUE_A)
  add_ut_flag(JBLAS_UT_PROLOGUE_B)
  add_ut_flag(JBLAS_UT_GEMM)
  add_ut_flag(JBLAS_UT_PARALLEL)
  add_ut_flag(JBLAS_UT_WRAPPER)
  add_ut_flag(JBLAS_UT_KERNEL_INTRIN)
  add_ut_flag(JBLAS_UT_KERNEL_JIT)
  add_ut_flag(JBLAS_UT_KERNEL_WRAPPER)
  add_ut_flag(JBLAS_UT_BENCHMARK)

  target_link_libraries(${PROJECT_NAME}_ut PRIVATE ${PROJECT_NAME})
endif()

# Install the public headers collected at the top of this file; the xbyak
# headers keep their own sub-directory under the library's include folder.
install(
  FILES ${headers}
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
)
install(
  FILES ${xbyak_headers}
  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}/xbyak/
)
88 changes: 88 additions & 0 deletions bestla/CMakePresets.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"version": 3,
"configurePresets": [
{
"name": "linux-debug",
"displayName": "Linux Debug",
"description": "Target the Windows Subsystem for Linux (WSL) or a remote Linux system.",
"generator": "Ninja",
"binaryDir": "${sourceDir}/out/build/${presetName}",
"installDir": "${sourceDir}/out/install/${presetName}",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug"
},
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
"rhs": "Linux"
},
"vendor": { "microsoft.com/VisualStudioRemoteSettings/CMake/1.0": { "sourceDir": "$env{HOME}/.vs/$ms{projectDirName}" } }
},
{
"name": "linux-release-UT",
"displayName": "linux Release for UT",
"description": "Run all UT",
"inherits": "linux-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"JBLAS_UT_ALL": "ON"
}
},
{
"name": "linux-release",
"displayName": "linux Release",
"description": "Release",
"inherits": "linux-debug",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release",
"JBLAS_UT_ALL": "OFF"
}
},
{
"name": "windows-base",
"description": "Target Windows with the Visual Studio development environment.",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/out/build/${presetName}",
"installDir": "${sourceDir}/out/install/${presetName}",
"cacheVariables": {
"CMAKE_C_COMPILER": "cl.exe",
"CMAKE_CXX_COMPILER": "cl.exe",
"JBLAS_UT_ALL": "OFF"
},
"condition": {
"type": "equals",
"lhs": "${hostSystemName}",
"rhs": "Windows"
}
},
{
"name": "x64-debug",
"displayName": "x64 Debug",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (Debug)",
"inherits": "windows-base",
"architecture": {
"value": "x64",
"strategy": "external"
},
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug",
"JBLAS_UT_DEBUG": "ON"
}
},
{
"name": "x64-release",
"displayName": "x64 Release",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (Release)",
"inherits": "x64-debug",
"cacheVariables": { "CMAKE_BUILD_TYPE": "Release" }
},
{
"name": "x64-release-UT",
"displayName": "x64 Release for UT",
"description": "Target Windows (64-bit) with the Visual Studio development environment. (Release, all unit tests enabled)",
"inherits": "x64-release",
"cacheVariables": { "JBLAS_UT_ALL": "ON" }
}
]
}
47 changes: 47 additions & 0 deletions bestla/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Jblas
Jblas is a lightweight, header-only acceleration library for high-performance GEMM and related computations on Intel platforms. Inspired by Cutlass, it provides high-level template class abstractions for various elements required for computation, and allows flexible kernel construction through template combinations to meet specific needs, maximizing the reuse of existing template classes. Users can also develop custom template classes to expand Jblas’s computational capabilities. Jblas includes several different types of template classes, specifically:

- `Interface`: Exposes gemm computation interface to users, while allowing users to specify their own Launcher template classes and Parallel template classes.
- `Launcher`: Schedules computation-related template classes, allowing users to specify their own computation-related template classes, including GemmCore, Prologue, and Epilogue.
- `Parallel`: Specifies data splitting strategy for task distribution among different cores. Jblas’s default Parallel template class adopts an L2-cache-fusion concept, i.e., each core tries to temporarily store the data it processes in its L2-cache during each round of gemm-tile computation.
- `GemmCore`: A computation-related template class that provides a micro-kernel for performing a tile gemm computation with a specific ISA. It is the most important template class in Jblas. Currently, GemmCore supports the following ISAs:
- AVX2
- AVX_VNNI
- AVX512F
- AVX512_VNNI
- AMX_BF16
- AMX_INT8
- AVX512_FP16
- `Prologue`: A computation-related template class that preprocesses (such as data type conversion/padding) input data to meet GemmCore’s input data requirements.
- `Epilogue`: A computation-related template class that post-processes (such as eltwiseop-fusion) the results of gemm-core computations to expand Jblas’s application scenarios.

The interaction logic between different template classes and the calculation process of gemm are shown in the following figure.
![bit4_emulation](docs/workflow.png)
# Highlights
## Weight-only
Jblas provides weight-only linear computational capabilities for LLM inference. We provide a series of Prologues for quantizing, compressing, serializing, and deserializing fp32 weights in different ways. Specifically, we support compressed weights of the following data types:

- S8
- S4_CLIP
- S4_FULLRANGE
- FP4
- NF4
## Postop-fusion
Jblas provides assembly-level postop-fusion through epilogue to minimize the overhead caused by data movement. Specifically, we support the following postop-fusions:

- GELU
- SWISH
- RELU
- EXP
- TANH
## Compilation Requirements and Usage
Compile:

- GCC version >=8.5.0
- CMake version >=3.5

Usage:
```cmake
add_subdirectory(jblas)
target_link_libraries("${YOUR_PROJECT}" jblas::jblas)
```
3 changes: 3 additions & 0 deletions bestla/cmake/config.cmake.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
@PACKAGE_INIT@

# Load the exported targets file installed alongside this config file.
include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@-targets.cmake")
8 changes: 8 additions & 0 deletions bestla/cmake/meson-config.cmake.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@PACKAGE_INIT@

# Define the header-only imported target once; repeated inclusion of this
# file is a no-op.
if(NOT TARGET @TARGET_NAME@)
  add_library(@TARGET_NAME@ INTERFACE IMPORTED)
  set_property(
    TARGET @TARGET_NAME@
    PROPERTY INTERFACE_INCLUDE_DIRECTORIES "@ABSOLUTE_INCLUDE_DIR@"
  )
endif()
Loading

0 comments on commit 3718cbc

Please sign in to comment.