Skip to content

Commit

Permalink
add amx kernel for gemm
Browse files Browse the repository at this point in the history
add intel amx isa detection

add vnni kernel for gemv cases

add vnni and amx kernel support for block_q8_0

code cleanup

fix packing B issue

enable openmp

fine tune amx kernel

switch to aten parallel pattern

add error message for nested parallelism

code cleanup

add f16 support in ggml-amx

add amx kernels for QK_K quant formats: Q4_K, Q5_K, Q6_K and IQ4_XS

update CMakeList

update README

fix some compilation warning

fix compiler warning when amx is not enabled

minor change

ggml-ci

move ggml_amx_init from ggml.c to ggml-amx/mmq.cpp

ggml-ci

update CMakeLists with -mamx-tile, -mamx-int8 and -mamx-bf16

ggml-ci

add amx as an ggml-backend

update header file, the old path for immintrin.h has changed to ggml-cpu-impl.h

minor change

update CMakeLists.txt

minor change

apply weight prepacking in set_tensor method in ggml-backend

fix compile error

ggml-ci

minor change

ggml-ci

update CMakeLists.txt

ggml-ci

add march dependency

minor change

ggml-ci

change ggml_backend_buffer_is_host to return false for amx backend

ggml-ci

fix supports_op

use device reg for AMX backend

ggml-ci

minor change

ggml-ci

minor change

fix rebase

set .buffer_from_host_ptr to be false for AMX backend
  • Loading branch information
mingfeima committed Oct 18, 2024
1 parent 2194200 commit 2d3fc54
Show file tree
Hide file tree
Showing 14 changed files with 3,204 additions and 7 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
endif()

if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()

if (NOT DEFINED GGML_CUDA_GRAPHS)
set(GGML_CUDA_GRAPHS_DEFAULT ON)
endif()
Expand Down
24 changes: 19 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,11 +93,6 @@ GGML_METAL := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_OPENMP
GGML_OPENMP := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_RPC
GGML_RPC := 1
DEPRECATE_WARNING := 1
Expand Down Expand Up @@ -584,6 +579,11 @@ ifndef GGML_NO_LLAMAFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o
endif

ifndef GGML_NO_AMX
MK_CPPFLAGS += -DGGML_USE_AMX
OBJ_GGML += ggml/src/ggml-amx.o ggml/src/ggml-amx/mmq.o
endif

ifdef GGML_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJ_GGML += ggml/src/ggml-rpc.o
Expand Down Expand Up @@ -1087,6 +1087,19 @@ ggml/src/llamafile/sgemm.o: \
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE

ifndef GGML_NO_AMX
ggml/src/ggml-amx.o: \
ggml/src/ggml-amx.cpp \
ggml/include/ggml-amx.h
$(CXX) $(CXXFLAGS) -c $< -o $@

ggml/src/ggml-amx/mmq.o: \
ggml/src/ggml-amx/mmq.cpp \
ggml/src/ggml-amx/mmq.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif

ifdef GGML_RPC
ggml/src/ggml-rpc.o: \
ggml/src/ggml-rpc.cpp \
Expand Down Expand Up @@ -1238,6 +1251,7 @@ clean:
rm -vrf ggml/src/ggml-metal-embed.metal
rm -vrf ggml/src/ggml-cuda/*.o
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -vrf ggml/src/ggml-amx/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -f vulkan-shaders-gen ggml/src/ggml-vulkan-shaders.hpp ggml/src/ggml-vulkan-shaders.cpp
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ variety of hardware - locally and in the cloud.

- Plain C/C++ implementation without any dependencies
- Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
- AVX, AVX2 and AVX512 support for x86 architectures
- AVX, AVX2, AVX512 and AMX support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
- Vulkan and SYCL backend support
Expand Down
4 changes: 4 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
Expand Down Expand Up @@ -158,6 +161,7 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
Expand Down
25 changes: 25 additions & 0 deletions ggml/include/ggml-amx.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"


#ifdef __cplusplus
extern "C" {
#endif

// buffer_type API
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);

GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);

// backend API
GGML_API ggml_backend_t ggml_backend_amx_init(void);

GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);

GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);

#ifdef __cplusplus
}
#endif
1 change: 1 addition & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -2488,6 +2488,7 @@ extern "C" {
GGML_API int ggml_cpu_has_avx512_vbmi(void);
GGML_API int ggml_cpu_has_avx512_vnni(void);
GGML_API int ggml_cpu_has_avx512_bf16(void);
GGML_API int ggml_cpu_has_amx_int8 (void);
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_sve (void);
Expand Down
42 changes: 42 additions & 0 deletions ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,26 @@ if (GGML_LLAMAFILE)
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
endif()

if (GGML_AMX)
if (CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
else()
set(GGML_AMX OFF)
message(WARNING "AMX requires gcc version > 11.0. Turning off GGML_AMX.")
endif()

if (GGML_AMX)
message(STATUS "Using AMX")

list(APPEND GGML_CDEF_PUBLIC GGML_USE_AMX)

file(GLOB GGML_HEADERS_AMX "ggml-amx/*.h")
list(APPEND GGML_HEADERS_AMX "../include/ggml-amx.h")

file(GLOB GGML_SOURCES_AMX "ggml-amx/*.cpp")
list(APPEND GGML_SOURCES_AMX "ggml-amx.cpp")
endif()
endif()

if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES

Expand Down Expand Up @@ -1180,6 +1200,18 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
if (GGML_AMX_TILE)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
endif()
if (GGML_AMX_INT8)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
endif()
if (GGML_AMX_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
endif()
elseif (GGML_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (GGML_AVX)
Expand Down Expand Up @@ -1215,6 +1247,15 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (GGML_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
if (GGML_AMX_TILE)
list(APPEND ARCH_FLAGS -mamx-tile)
endif()
if (GGML_AMX_INT8)
list(APPEND ARCH_FLAGS -mamx-int8)
endif()
if (GGML_AMX_BF16)
list(APPEND ARCH_FLAGS -mamx-bf16)
endif()
endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected")
Expand Down Expand Up @@ -1340,6 +1381,7 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_AMX} ${GGML_HEADERS_AMX}
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
ggml-aarch64.c ggml-aarch64.h
)
Expand Down
Loading

0 comments on commit 2d3fc54

Please sign in to comment.