From 5931c1f233c616083d64e41a228249d58e039aa5 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 25 Nov 2024 15:13:39 +0100 Subject: [PATCH] ggml : add support for dynamic loading of backends (#10469) * ggml : add support for dynamic loading of backends --------- Co-authored-by: Georgi Gerganov --- Makefile | 3 +- Package.swift | 3 +- common/common.cpp | 3 + examples/CMakeLists.txt | 27 +-- examples/eval-callback/CMakeLists.txt | 3 +- examples/llama-bench/llama-bench.cpp | 15 +- examples/main/main.cpp | 12 +- examples/simple-chat/simple-chat.cpp | 3 + examples/simple/simple.cpp | 4 + ggml/CMakeLists.txt | 1 + ggml/include/ggml-backend.h | 15 ++ ggml/include/ggml-cpu.h | 38 +--- ggml/include/ggml.h | 31 +++ ggml/src/CMakeLists.txt | 41 +++- ggml/src/ggml-amx/CMakeLists.txt | 10 +- ggml/src/ggml-amx/ggml-amx.cpp | 7 +- ggml/src/ggml-backend-impl.h | 44 +++-- ggml/src/ggml-backend-reg.cpp | 252 +++++++++++++++++++++++-- ggml/src/ggml-blas/CMakeLists.txt | 9 +- ggml/src/ggml-blas/ggml-blas.cpp | 7 +- ggml/src/ggml-cann/CMakeLists.txt | 6 +- ggml/src/ggml-cann/ggml-cann.cpp | 13 +- ggml/src/ggml-cpu/CMakeLists.txt | 19 +- ggml/src/ggml-cpu/ggml-cpu.c | 23 --- ggml/src/ggml-cpu/ggml-cpu.cpp | 50 +++-- ggml/src/ggml-cuda/CMakeLists.txt | 11 +- ggml/src/ggml-cuda/ggml-cuda.cu | 71 ++++++- ggml/src/ggml-hip/CMakeLists.txt | 10 +- ggml/src/ggml-kompute/CMakeLists.txt | 10 +- ggml/src/ggml-kompute/ggml-kompute.cpp | 7 +- ggml/src/ggml-metal/CMakeLists.txt | 9 +- ggml/src/ggml-metal/ggml-metal.m | 34 +++- ggml/src/ggml-musa/CMakeLists.txt | 10 +- ggml/src/ggml-rpc/CMakeLists.txt | 8 +- ggml/src/ggml-rpc/ggml-rpc.cpp | 7 +- ggml/src/ggml-sycl/CMakeLists.txt | 10 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 12 +- ggml/src/ggml-vulkan/CMakeLists.txt | 12 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 +- ggml/src/ggml.c | 23 +++ pocs/CMakeLists.txt | 4 +- src/llama.cpp | 77 ++++---- tests/CMakeLists.txt | 13 +- tests/test-backend-ops.cpp | 26 +-- 44 files changed, 728 insertions(+), 272 deletions(-) diff --git a/Makefile b/Makefile index dd6d864ad513a..14c05e93e7535 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 @@ -290,6 +290,7 @@ endif # some memory allocation are available on Linux through GNU extensions in libc ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GNU_SOURCE + MK_LDFLAGS += -ldl endif # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, diff --git a/Package.swift b/Package.swift index 6b68aecdebec1..d9e8a4e2d21d7 100644 --- a/Package.swift +++ b/Package.swift @@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL") + .define("GGML_USE_METAL"), + .define("GGML_USE_CPU") ] ) #endif diff --git a/common/common.cpp b/common/common.cpp index c398329d05bf5..98524f7467ab4 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -377,6 +377,9 @@ void common_init() { #endif LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); + + // load dynamic backends + ggml_backend_load_all(); } std::string common_params_get_system_info(const common_params & params) { diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9bd099d4ef8a5..632409d5591b9 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(cvector-generator) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(export-lora) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -27,24 +24,16 @@ else() add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) - add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(main) add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(perplexity) - add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (GGML_RPC) - add_subdirectory(rpc) - endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - if (GGML_SYCL) - add_subdirectory(sycl) + add_subdirectory(server) endif() add_subdirectory(save-load-state) add_subdirectory(simple) @@ -52,4 +41,18 @@ else() add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + add_subdirectory(quantize-stats) + add_subdirectory(llava) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (GGML_SYCL) + add_subdirectory(sycl) + endif() + endif() endif() diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38e16e..5d1048aad74b6 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -5,5 +5,6 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} + COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3dc84a75cbec7..bac606f471639 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); + // initialize backends + ggml_backend_load_all(); + auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + // initialize llama.cpp if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); @@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_threadpool_free(threadpool); + ggml_threadpool_free_fn(threadpool); } llama_free_model(lmodel); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 957451af7ce0a..d0c28f317b8c5 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -165,6 +165,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -174,7 +178,7 @@ int main(int argc, char ** argv) { struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new(&tpp_batch); + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); if (!threadpool_batch) { LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); return 1; @@ -184,7 +188,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); return 1; @@ -890,8 +894,8 @@ int main(int argc, char ** argv) { llama_backend_free(); - ggml_threadpool_free(threadpool); - ggml_threadpool_free(threadpool_batch); + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 5f9973163732d..7f4da666b08ec 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -62,6 +62,9 @@ int main(int argc, char ** argv) { } }, nullptr); + // load dynamic backends + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 59760fe95db22..3288c0250a001 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -74,6 +74,10 @@ int main(int argc, char ** argv) { } } + // load dynamic backends + + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 2d32da1b6d879..70b5cfdf7fbb4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -33,6 +33,7 @@ else() endif() option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT}) +option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF) # # option list diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index cef164764bb1a..19881a5059f17 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -190,6 +190,14 @@ extern "C" { typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); // Get additional buffer types provided by the device (returns a NULL-terminated array) typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); + // Set the abort callback for the backend + typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); + // Get a list of feature flags supported by the backend (returns a NULL-terminated array) + struct ggml_backend_feature { + const char * name; + const char * value; + }; + typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); // // Backend registry @@ -214,6 +222,13 @@ extern "C" { // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) GGML_API ggml_backend_t ggml_backend_init_best(void); + // Load a backend from a dynamic library and register it + GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); + // Unload a backend if loaded dynamically and unregister it + GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); + // Load all known backends from dynamic libraries + GGML_API void ggml_backend_load_all(void); + // // Backend scheduler // diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 7571ef9798364..a5358d047a08e 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -7,29 +7,6 @@ extern "C" { #endif - // Scheduling priorities - enum ggml_sched_priority { - GGML_SCHED_PRIO_NORMAL, - GGML_SCHED_PRIO_MEDIUM, - GGML_SCHED_PRIO_HIGH, - GGML_SCHED_PRIO_REALTIME - }; - - // Threadpool params - // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults - struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - enum ggml_sched_priority prio; // thread priority - uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state - }; - - struct ggml_threadpool; // forward declaration, see ggml.c - - typedef struct ggml_threadpool * ggml_threadpool_t; - // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -75,14 +52,11 @@ extern "C" { GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); - GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); - GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); - GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data @@ -104,10 +78,10 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_sse3 (void); GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); GGML_BACKEND_API int ggml_cpu_has_avx (void); + GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_fma (void); - GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void); GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 69e6a24344b97..9843b09fbe83e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2215,6 +2215,37 @@ extern "C" { GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); + // ggml threadpool + // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend + // the goal should be to create an API that other backends can use move everything to the ggml base + + // scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + + // threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state + }; + + struct ggml_threadpool; // forward declaration, see ggml.c + + typedef struct ggml_threadpool * ggml_threadpool_t; + + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8df0e85c0d092..071508ddae021 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -202,6 +202,10 @@ endif() # ggml +if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS) + message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS") +endif() + add_library(ggml-base ../include/ggml.h ../include/ggml-alloc.h @@ -226,6 +230,31 @@ add_library(ggml target_link_libraries(ggml PUBLIC ggml-base) +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_libraries(ggml PRIVATE dl) +endif() + +function(ggml_add_backend_library backend) + if (GGML_BACKEND_DL) + add_library(${backend} MODULE ${ARGN}) + # write the shared library to the output directory + set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) + else() + add_library(${backend} ${ARGN}) + target_link_libraries(ggml PUBLIC ${backend}) + install(TARGETS ${backend} LIBRARY) + endif() + + target_link_libraries(${backend} PRIVATE ggml-base) + target_include_directories(${backend} PRIVATE ..) + + if (${BUILD_SHARED_LIBS}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD) + target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) + endif() +endfunction() + function(ggml_add_backend backend) string(TOUPPER "GGML_${backend}" backend_id) if (${backend_id}) @@ -236,14 +265,10 @@ function(ggml_add_backend backend) # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp if (${backend_id}) message(STATUS "Including ${backend} backend") - if (${BUILD_SHARED_LIBS}) - target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD) - target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED) + if (NOT GGML_BACKEND_DL) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) endif() - install(TARGETS ${backend_target} LIBRARY) - target_link_libraries(ggml PUBLIC ${backend_target}) - string(TOUPPER "GGML_USE_${backend}" backend_use) - target_compile_definitions(ggml PUBLIC ${backend_use}) endif() endif() endfunction() @@ -256,10 +281,10 @@ ggml_add_backend(CUDA) ggml_add_backend(HIP) ggml_add_backend(Kompute) ggml_add_backend(METAL) +ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) -ggml_add_backend(MUSA) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt index d6676f3f67b20..cf3ade6f020ed 100644 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ b/ggml/src/ggml-amx/CMakeLists.txt @@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA file(GLOB GGML_SOURCES_AMX "*.cpp") - add_library(ggml-amx - ${GGML_HEADERS_AMX} - ${GGML_SOURCES_AMX}) - - target_link_libraries(ggml-amx PRIVATE ggml-base) - target_include_directories(ggml-amx PRIVATE . ..) + ggml_add_backend_library(ggml-amx + ${GGML_HEADERS_AMX} + ${GGML_SOURCES_AMX} + ) # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags # TODO: integrate AMX backend into the CPU backend diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp index 8568e7965fd2e..6bfb3da274c39 100644 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ b/ggml/src/ggml-amx/ggml-amx.cpp @@ -409,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { ggml_backend_reg_t ggml_backend_amx_reg(void) { static struct ggml_backend_reg ggml_backend_amx_reg = { - /* .iface = */ ggml_backend_amx_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_amx_reg_i, + /* .context = */ NULL, }; return &ggml_backend_amx_reg; @@ -444,3 +445,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) { } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index fa8d5b7fb68c9..dff7749b416dc 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -8,6 +8,8 @@ extern "C" { #endif + #define GGML_BACKEND_API_VERSION 1 + // // Backend buffer type // @@ -63,20 +65,20 @@ extern "C" { enum ggml_backend_buffer_usage usage; }; - ggml_backend_buffer_t ggml_backend_buffer_init( + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, void * context, size_t size); // do not use directly, use ggml_backend_tensor_copy instead - bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); + GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); // multi-buffer // buffer that contains a collection of buffers - ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); + GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); // // Backend (stream) @@ -199,17 +201,37 @@ extern "C" { }; struct ggml_backend_reg { - // int api_version; // TODO: for dynamic loading + int api_version; // initialize to GGML_BACKEND_API_VERSION struct ggml_backend_reg_i iface; void * context; }; - // Internal backend registry API - void ggml_backend_register(ggml_backend_reg_t reg); - void ggml_backend_device_register(ggml_backend_dev_t device); - // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function - // typedef ggml_backend_register_t * (*ggml_backend_init)(void); + GGML_API void ggml_backend_register(ggml_backend_reg_t reg); + GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); + + // Add backend dynamic loading support to the backend + typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); + + #ifdef GGML_BACKEND_DL + #ifdef __cplusplus + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + extern "C" { \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + } \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ + } + #endif + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) + #endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 63e9d82017457..43d03d7fa7385 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,11 +1,29 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml-impl.h" +#include #include +#include #include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#elif defined(__APPLE__) +# include +# include +#else +# include +# include +#endif + // Backend registry +#ifdef GGML_USE_CPU +#include "ggml-cpu.h" +#endif #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -43,8 +61,13 @@ #include "ggml-kompute.h" #endif +struct ggml_backend_reg_entry { + ggml_backend_reg_t reg; + void * handle; +}; + struct ggml_backend_registry { - std::vector backends; + std::vector backends; std::vector devices; ggml_backend_registry() { @@ -75,11 +98,19 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif - +#ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); +#endif } - void register_backend(ggml_backend_reg_t reg) { + ~ggml_backend_registry() { + while (!backends.empty()) { + // use silent since the log system may have been destroyed at this point + unload_backend(backends.back().reg, true); + } + } + + void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { if (!reg) { return; } @@ -88,7 +119,7 @@ struct ggml_backend_registry { GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif - backends.push_back(reg); + backends.push_back({ reg, handle }); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { register_device(ggml_backend_reg_dev_get(reg, i)); } @@ -100,6 +131,111 @@ struct ggml_backend_registry { #endif devices.push_back(device); } + + ggml_backend_reg_t load_backend(const char * path, bool silent) { +#ifdef _WIN32 + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + + HMODULE handle = LoadLibraryA(path); + + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); + } + SetErrorMode(old_mode); + return nullptr; + } + + ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); + + SetErrorMode(old_mode); + + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); + } + FreeLibrary(handle); + return nullptr; + } +#else + void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); + } + return nullptr; + } + + auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); + + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); + } + dlclose(handle); + return nullptr; + } +#endif + ggml_backend_reg_t reg = backend_init(); + + if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { + if (!silent) { + if (!reg) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); + } else { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", + __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); + } + } +#ifdef _WIN32 + FreeLibrary(handle); +#else + dlclose(handle); +#endif + return nullptr; + } + + GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + register_backend(reg, handle); + return reg; + } + + void unload_backend(ggml_backend_reg_t reg, bool silent) { + auto it = std::find_if(backends.begin(), backends.end(), + [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); + + if (it == backends.end()) { + if (!silent) { + GGML_LOG_ERROR("%s: backend not found\n", __func__); + } + return; + } + + if (!silent) { + GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); + } + + // remove devices + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), + devices.end()); + + // unload library + if (it->handle) { +#ifdef _WIN32 + FreeLibrary((HMODULE) it->handle); +#else + dlclose(it->handle); +#endif + } + + // remove backend + backends.erase(it); + } }; static ggml_backend_registry & get_reg() { @@ -123,7 +259,7 @@ size_t ggml_backend_reg_count() { ggml_backend_reg_t ggml_backend_reg_get(size_t index) { GGML_ASSERT(index < ggml_backend_reg_count()); - return get_reg().backends[index]; + return get_reg().backends[index].reg; } ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { @@ -133,7 +269,7 @@ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { return reg; } } - return NULL; + return nullptr; } // Device enumeration @@ -153,7 +289,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { return dev; } } - return NULL; + return nullptr; } ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { @@ -163,14 +299,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { return dev; } } - return NULL; + return nullptr; } // Convenience functions ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -178,7 +314,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -189,7 +325,97 @@ ggml_backend_t ggml_backend_init_best(void) { dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } if (!dev) { - return NULL; + return nullptr; } - return ggml_backend_dev_init(dev, NULL); + return ggml_backend_dev_init(dev, nullptr); +} + +// Dynamic loading +ggml_backend_reg_t ggml_backend_load(const char * path) { + return get_reg().load_backend(path, false); +} + +void ggml_backend_unload(ggml_backend_reg_t reg) { + get_reg().unload_backend(reg, true); +} + +void ggml_backend_load_all() { + std::vector search_prefix; + + // add the executable directory to the search path + // FIXME: this is convenient for development, but it should probably be disabled in production + +#if defined(__APPLE__) + // get executable path + std::vector path; + uint32_t size; + while (true) { + size = path.size(); + if (_NSGetExecutablePath(path.data(), &size) == 0) { + break; + } + path.resize(size); + } + std::string base_path(path.data(), size); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + search_prefix.push_back(base_path + "/"); +#elif defined(__linux__) + std::string base_path = "."; + std::vector path(1024); + while (true) { + // get executable path + ssize_t len = readlink("/proc/self/exe", path.data(), path.size()); + if (len == -1) { + break; + } + if (len < (ssize_t) path.size()) { + base_path = std::string(path.data(), len); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + break; + } + path.resize(path.size() * 2); + } + + search_prefix.push_back(base_path + "/"); +#endif + + auto & reg = get_reg(); + + auto try_load = [&](const std::string & name) { + std::string os_name; +#ifdef _WIN32 + os_name = "ggml-" + name + ".dll"; +#else + os_name = "libggml-" + name + ".so"; +#endif + if (reg.load_backend(os_name.c_str(), true)) { + return; + } + for (const auto & prefix : search_prefix) { + if (reg.load_backend((prefix + os_name).c_str(), true)) { + return; + } + } + }; + + try_load("amx"); + try_load("blas"); + try_load("cann"); + try_load("cuda"); + try_load("hip"); + try_load("kompute"); + try_load("metal"); + try_load("rpc"); + try_load("sycl"); + try_load("vulkan"); + try_load("musa"); + try_load("cpu"); } diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt index e2cbabf0dae74..0bf3c05d93a89 100644 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ b/ggml/src/ggml-blas/CMakeLists.txt @@ -11,12 +11,9 @@ find_package(BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - add_library(ggml-blas - ggml-blas.cpp - ) - - target_link_libraries(ggml-blas PRIVATE ggml-base) - target_include_directories(ggml-blas PRIVATE . ..) + ggml_add_backend_library(ggml-blas + ggml-blas.cpp + ) if (${GGML_BLAS_VENDOR} MATCHES "Apple") add_compile_definitions(ACCELERATE_NEW_LAPACK) diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 648c9d875e346..ec158dfac6e3e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -506,9 +506,12 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { ggml_backend_reg_t ggml_backend_blas_reg(void) { static struct ggml_backend_reg ggml_backend_blas_reg = { - /* .iface = */ ggml_backend_blas_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_blas_reg_i, + /* .context = */ NULL, }; return &ggml_backend_blas_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 756200b893d02..901327185fb75 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR) file(GLOB GGML_SOURCES_CANN "*.cpp") - add_library(ggml-cann ${GGML_SOURCES_CANN}) - target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES}) - target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) + ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN}) + target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES}) + target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 776340881434d..d96f65936136d 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2064,16 +2064,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() { dev_ctx->name = GGML_CANN_NAME + std::to_string(i); ggml_cann_set_device(i); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cann_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cann_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cann_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cann_reg_interface, + /* .context = */ ctx }; } @@ -2126,3 +2127,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); } + +GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg) diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 2880523331dbd..c2905d1fbf4e8 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,14 +1,13 @@ -add_library(ggml-cpu - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - ) +ggml_add_backend_library(ggml-cpu + ggml-cpu.c + ggml-cpu.cpp + ggml-cpu-aarch64.c + ggml-cpu-aarch64.h + ggml-cpu-quants.c + ggml-cpu-quants.h + ) -target_link_libraries(ggml-cpu PRIVATE ggml-base) -target_include_directories(ggml-cpu PRIVATE . ..) +target_include_directories(ggml-cpu PRIVATE .) if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4b58254e7d108..c6ede19d9d1c0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int #endif // GGML_USE_OPENMP -void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { - p->n_threads = n_threads; - p->prio = 0; // default priority (usually means normal or inherited) - p->poll = 50; // hybrid-polling enabled - p->strict_cpu = false; // no strict placement (all threads share same cpumask) - p->paused = false; // threads are ready to go - memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) -} - -struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { - struct ggml_threadpool_params p; - ggml_threadpool_params_init(&p, n_threads); - return p; -} - -bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { - if (p0->n_threads != p1->n_threads ) return false; - if (p0->prio != p1->prio ) return false; - if (p0->poll != p1->poll ) return false; - if (p0->strict_cpu != p1->strict_cpu ) return false; - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; -} - static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, struct ggml_cgraph * cgraph, diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9b375..febed433ada2b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg return &ggml_backend_cpu_device; } -struct ggml_backend_feature { - const char * name; - const char * value; -}; - -// Not used yet // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically, -// and additionally to allow other backends to expose their own list of features that applications can query using the same API. +// and additionally to allow other backends to expose their own list of features that applications can query using the same API static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static std::vector features = []() { + ggml_cpu_init(); + std::vector features; if (ggml_cpu_has_sse3()) { features.push_back({ "SSE3", "1" }); @@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_avx()) { features.push_back({ "AVX", "1" }); } + if (ggml_cpu_has_avx_vnni()) { + features.push_back({ "AVX_VNNI", "1" }); + } if (ggml_cpu_has_avx2()) { features.push_back({ "AVX2", "1" }); } @@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_fma()) { features.push_back({ "FMA", "1" }); } - if (ggml_cpu_has_avx_vnni()) { - features.push_back({ "AVX_VNNI", "1" }); - } if (ggml_cpu_has_avx512()) { features.push_back({ "AVX512", "1" }); } @@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } + // TODO: rename this + #ifdef GGML_USE_CPU_AARCH64 + features.push_back({ "AARCH64_REPACK", "1" }); + #endif features.push_back({ nullptr, nullptr }); @@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { return (void *)ggml_backend_cpu_get_extra_bufts; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cpu_get_features; + } + if (strcmp(name, "ggml_backend_set_abort_callback") == 0) { + return (void *)ggml_backend_cpu_set_abort_callback; + } + if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) { + return (void *)ggml_numa_init; + } + if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) { + return (void *)ggml_is_numa; + } + + // threadpool - TODO: move to ggml-base + if (strcmp(name, "ggml_threadpool_new") == 0) { + return (void *)ggml_threadpool_new; + } + if (strcmp(name, "ggml_threadpool_free") == 0) { + return (void *)ggml_threadpool_free; + } + if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) { + return (void *)ggml_backend_cpu_set_threadpool; + } return NULL; @@ -655,9 +678,12 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) { ggml_cpu_init(); static struct ggml_backend_reg ggml_backend_cpu_reg = { - /* .iface = */ ggml_backend_cpu_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cpu_reg_i, + /* .context = */ NULL, }; return &ggml_backend_cpu_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index e1482a269d698..b0cb93e070fd3 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -46,13 +46,10 @@ if (CUDAToolkit_FOUND) list(APPEND GGML_SOURCES_CUDA ${SRCS}) endif() - add_library(ggml-cuda - ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_CUDA} - ) - - target_link_libraries(ggml-cuda PRIVATE ggml-base) - target_include_directories(ggml-cuda PRIVATE . ..) + ggml_add_backend_library(ggml-cuda + ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_CUDA} + ) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index dd94ab03d5b6c..2a78a4393d0f7 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re return ctx->devices[index]; } +static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) { + static std::vector features = []() { + std::vector features; + #define _STRINGIFY(...) #__VA_ARGS__ + #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__) + + #ifdef __CUDA_ARCH_LIST__ + features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) }); + #endif + + #ifdef GGML_CUDA_FORCE_MMQ + features.push_back({ "FORCE_MMQ", "1" }); + #endif + + #ifdef GGML_CUDA_FORCE_CUBLAS + features.push_back({ "FORCE_CUBLAS", "1" }); + #endif + + #ifdef GGML_CUDA_NO_VMM + features.push_back({ "NO_VMM", "1" }); + #endif + + #ifdef GGML_CUDA_NO_PEER_COPY + features.push_back({ "NO_PEER_COPY", "1" }); + #endif + + #ifdef GGML_CUDA_F16 + features.push_back({ "F16", "1" }); + #endif + + #ifdef GGML_CUDA_USE_GRAPHS + features.push_back({ "USE_GRAPHS", "1" }); + #endif + + #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE + features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) }); + #endif + + #ifdef GGML_CUDA_FA_ALL_QUANTS + features.push_back({ "FA_ALL_QUANTS", "1" }); + #endif + + #undef _STRINGIFY + #undef STRINGIFY + + features.push_back({ nullptr, nullptr }); + + return features; + }(); + + return features.data(); + + GGML_UNUSED(reg); +} + static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { @@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { return (void *)ggml_backend_cuda_unregister_host_buffer; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cuda_get_features; + } return nullptr; } @@ -3169,16 +3227,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { dev_ctx->description = prop.name; ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cuda_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cuda_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cuda_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cuda_reg_interface, + /* .context = */ ctx }; } @@ -3209,3 +3268,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return cuda_backend; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg) diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index fccf8eb8440b8..b15fbd24d6b36 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -64,12 +64,10 @@ else() list(APPEND GGML_SOURCES_ROCM ${SRCS}) endif() -add_library(ggml-hip - ${GGML_HEADERS_ROCM} - ${GGML_SOURCES_ROCM}) - -target_link_libraries(ggml-hip PRIVATE ggml-base) -target_include_directories(ggml-hip PRIVATE . ..) +ggml_add_backend_library(ggml-hip + ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_ROCM} + ) # TODO: do not use CUDA definitions for HIP target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt index 0bd027c7f537e..dc623926c7685 100644 --- a/ggml/src/ggml-kompute/CMakeLists.txt +++ b/ggml/src/ggml-kompute/CMakeLists.txt @@ -6,13 +6,13 @@ if (NOT glslc_executable) message(FATAL_ERROR "glslc not found") endif() -add_library(ggml-kompute - ggml-kompute.cpp - ../../include/ggml-kompute.h - ) +ggml_add_backend_library(ggml-kompute + ggml-kompute.cpp + ../../include/ggml-kompute.h + ) target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) -target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp index 2fea9e4cc8d38..24566404ded0f 100644 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute/ggml-kompute.cpp @@ -2176,9 +2176,12 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = { ggml_backend_reg_t ggml_backend_kompute_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_kompute_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_kompute_reg_i, + /* .context = */ nullptr, }; return ® } + +GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index b237d79f47ddb..1bad272068244 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) message(STATUS "Metal framework found") -add_library(ggml-metal - ggml-metal.m - ) +ggml_add_backend_library(ggml-metal + ggml-metal.m + ) target_link_libraries(ggml-metal PRIVATE - ggml-base ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} ) -target_include_directories(ggml-metal PRIVATE . ..) - if (GGML_METAL_NDEBUG) add_compile_definitions(GGML_METAL_NDEBUG) endif() diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 3a533d7f9c9af..63baaf163581c 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4372,19 +4372,45 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r GGML_UNUSED(index); } +static struct ggml_backend_feature g_ggml_backend_metal_features[] = { +#if defined(GGML_METAL_EMBED_LIBRARY) + { "EMBED_LIBRARY", "1" }, +#endif +#if defined(GGML_METAL_USE_BF16) + { "BF16", "1" }, +#endif + { nil, nil }, +}; + +static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) { + return g_ggml_backend_metal_features; + + GGML_UNUSED(reg); +} + +static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_metal_get_features; + } + + return NULL; + + GGML_UNUSED(reg); +} static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { /* .get_name = */ ggml_backend_metal_reg_get_name, /* .device_count = */ ggml_backend_metal_reg_device_count, /* .device_get = */ ggml_backend_metal_reg_device_get, - /* .get_proc_address = */ NULL, + /* .get_proc_address = */ ggml_backend_metal_get_proc_address, }; ggml_backend_reg_t ggml_backend_metal_reg(void) { // TODO: make this thread-safe somehow? { g_ggml_backend_metal_reg = (struct ggml_backend_reg) { - /* .iface = */ ggml_backend_metal_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_metal_reg_i, + /* .context = */ NULL, }; g_ggml_backend_metal_device = (struct ggml_backend_device) { @@ -4396,3 +4422,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) { return &g_ggml_backend_metal_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg) diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index f3c0136920540..e1a69186e669f 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND) set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22") endforeach() - add_library(ggml-musa - ${GGML_HEADERS_MUSA} - ${GGML_SOURCES_MUSA}) - - target_link_libraries(ggml-musa PRIVATE ggml-base) - target_include_directories(ggml-musa PRIVATE . ..) + ggml_add_backend_library(ggml-musa + ${GGML_HEADERS_MUSA} + ${GGML_SOURCES_MUSA} + ) # TODO: do not use CUDA definitions for MUSA target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-rpc/CMakeLists.txt b/ggml/src/ggml-rpc/CMakeLists.txt index a2d6770eb053f..f5acb8ec2cb28 100644 --- a/ggml/src/ggml-rpc/CMakeLists.txt +++ b/ggml/src/ggml-rpc/CMakeLists.txt @@ -1,10 +1,8 @@ message(STATUS "Using RPC backend") -add_library(ggml-rpc - ggml-rpc.cpp) - -target_link_libraries(ggml-rpc PRIVATE ggml-base) -target_include_directories(ggml-rpc PRIVATE . ..) +ggml_add_backend_library(ggml-rpc + ggml-rpc.cpp + ) if (WIN32) target_link_libraries(ggml-rpc PRIVATE ws2_32) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 47357daabdf54..43108242639a3 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1369,8 +1369,9 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = { ggml_backend_reg_t ggml_backend_rpc_reg(void) { static struct ggml_backend_reg ggml_backend_rpc_reg = { - /* .iface = */ ggml_backend_rpc_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_rpc_reg_i, + /* .context = */ NULL, }; return &ggml_backend_rpc_reg; @@ -1401,3 +1402,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) { return dev; } + +GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index d1d0ff83d636c..83f223fd7b6fc 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -16,12 +16,10 @@ endif() message(STATUS "SYCL found") #todo: AOT -add_library(ggml-sycl - ggml-sycl.cpp - ../../include/ggml-sycl.h) - -target_link_libraries(ggml-sycl PRIVATE ggml-base) -target_include_directories(ggml-sycl PRIVATE . ..) +ggml_add_backend_library(ggml-sycl + ggml-sycl.cpp + ../../include/ggml-sycl.h + ) if (GGML_SYCL_F16) if (GGML_SYCL_TARGET STREQUAL "AMD") diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 255bc64c6badd..b6392ed8dcc6a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4637,16 +4637,17 @@ ggml_backend_reg_t ggml_backend_sycl_reg() { dev_ctx->description = prop.get_name(); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_sycl_device_interface, - /* .reg = */ ®, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_sycl_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_sycl_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_sycl_reg_interface, + /* .context = */ ctx }; } @@ -4678,3 +4679,4 @@ ggml_backend_t ggml_backend_sycl_init(int device) { return sycl_backend; } +GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 1e85dd15b7ab1..ae0485e04255d 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED) if (Vulkan_FOUND) message(STATUS "Vulkan found") - add_library(ggml-vulkan - ggml-vulkan.cpp - ../../include/ggml-vulkan.h - ) + ggml_add_backend_library(ggml-vulkan + ggml-vulkan.cpp + ../../include/ggml-vulkan.h + ) - target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan) - target_include_directories(ggml-vulkan PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan) + target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ca71da2f7b7f5..49527fdf40e94 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -6738,8 +6738,9 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { ggml_backend_reg_t ggml_backend_vk_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_vk_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_vk_reg_i, + /* .context = */ nullptr, }; return ® @@ -7365,3 +7366,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 78e7874dee04d..1a2318cb188c4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; } + +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; +} diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt index 03e1d2c04be65..d49d14dee4351 100644 --- a/pocs/CMakeLists.txt +++ b/pocs/CMakeLists.txt @@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(vdot) + if (NOT GGML_BACKEND_DL) + add_subdirectory(vdot) + endif() endif() diff --git a/src/llama.cpp b/src/llama.cpp index 20df09b133bfb..83bbc10a57a43 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4866,7 +4866,9 @@ struct llama_model_loader { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); for (const auto & file : files) { - std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn())); mmaps_used.emplace_back(mapping->size, 0); if (mlock_mmaps) { std::unique_ptr mlock_mmap(new llama_mlock()); @@ -9190,7 +9192,7 @@ static bool llm_load_tensors( ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); if (!dev) { // FIXME: workaround for CPU backend buft having a NULL device - dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0); + dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } ggml_backend_dev_props props; ggml_backend_dev_get_props(dev, &props); @@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute( int n_threads, ggml_threadpool * threadpool) { if (lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); - ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(lctx.backend_cpu, threadpool); } // set the number of threads for all the backends @@ -19478,7 +19481,11 @@ void llama_backend_init(void) { void llama_numa_init(enum ggml_numa_strategy numa) { if (numa != GGML_NUMA_STRATEGY_DISABLED) { - ggml_numa_init(numa); + auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + GGML_ASSERT(dev && "CPU backend is not loaded"); + auto * reg = ggml_backend_dev_backend_reg(dev); + auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); + numa_init_fn(numa); } } @@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - ctx->abort_callback = params.abort_callback; - ctx->abort_callback_data = params.abort_callback_data; - ctx->logits_all = params.logits_all; // build worst-case graph for encoder if a model contains encoder @@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model( } // add CPU backend - ctx->backend_cpu = ggml_backend_cpu_init(); + ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (ctx->backend_cpu == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); llama_free(ctx); @@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model( } } + llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); + if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); @@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model( std::vector backend_ptrs; for (auto & backend : ctx->backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) { + auto backend_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { // use the host buffer of the first device CPU for faster transfer of the intermediate state auto * dev = model->devices[0]; auto * host_buft = ggml_backend_dev_host_buffer_type(dev); @@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model( // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) { for (auto & backend : ctx->backends) { - if (ggml_backend_is_cpu(backend.get())) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { // ignore CPU backend continue; } @@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) { void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; + + for (auto & backend : ctx->backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); + } + } } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { @@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int } const char * llama_print_system_info(void) { - ggml_cpu_init(); // some ARM features are detected at runtime - static std::string s; - s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | "; - s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; - s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | "; + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { + auto * reg = ggml_backend_reg_get(i); + auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (get_features_fn) { + ggml_backend_feature * features = get_features_fn(reg); + s += ggml_backend_reg_name(reg); + s += " : "; + for (; features->name; features++) { + s += features->name; + s += " = "; + s += features->value; + s += " | "; + } + } + } return s.c_str(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b06f122e89873..82373ff4e1862 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU # llama_target_and_test(test-double-float.cpp) # SLOW llama_target_and_test(test-log.cpp) llama_target_and_test(test-arg-parser.cpp) -llama_target_and_test(test-quantize-fns.cpp) -llama_target_and_test(test-quantize-perf.cpp) llama_target_and_test(test-sampling.cpp) llama_target_and_test(test-chat-template.cpp) llama_target_and_test(test-grammar-parser.cpp) llama_target_and_test(test-grammar-integration.cpp) llama_target_and_test(test-llama-grammar.cpp) -llama_target_and_test(test-barrier.cpp) # llama_target_and_test(test-opt.cpp) # SLOW llama_target_and_test(test-backend-ops.cpp) -llama_target_and_test(test-rope.cpp) - llama_target_and_test(test-model-load-cancel.cpp LABEL "model") llama_target_and_test(test-autorelease.cpp LABEL "model") +if (NOT GGML_BACKEND_DL) + # these tests use the backends directly and cannot be built with dynamic loading + llama_target_and_test(test-barrier.cpp) + llama_target_and_test(test-quantize-fns.cpp) + llama_target_and_test(test-quantize-perf.cpp) + llama_target_and_test(test-rope.cpp) +endif() + # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b2b5705243ea2..6376b0e4c66cf 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -16,7 +16,6 @@ #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -639,19 +637,20 @@ struct test_case { // determine number of runs int n_runs; + bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU; if (op_flops(out) > 0) { // based on flops const uint64_t GFLOP = 1000 * 1000 * 1000; const uint64_t target_flops_cpu = 8ULL * GFLOP; const uint64_t target_flops_gpu = 100ULL * GFLOP; - uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu; + uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1; } else { // based on memory size const size_t GB = 1ULL << 30; const size_t target_size_cpu = 8 * GB; const size_t target_size_gpu = 32 * GB; - size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu; + size_t target_size = is_cpu ? target_size_cpu : target_size_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; } @@ -3873,7 +3872,11 @@ static std::vector> make_test_cases_perf() { static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { if (mode == MODE_TEST) { auto test_cases = make_test_cases_eval(); - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (backend_cpu == NULL) { + printf(" Failed to initialize CPU backend\n"); + return false; + } size_t n_ok = 0; for (auto & test : test_cases) { @@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) { } } - // enumerate backends + // load and enumerate backends + ggml_backend_load_all(); + printf("Testing %zu devices\n\n", ggml_backend_dev_count()); size_t n_ok = 0; @@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) { continue; } - ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); - GGML_ASSERT(backend != NULL); - - if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) { + if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) { printf(" Skipping CPU backend\n"); - ggml_backend_free(backend); n_ok++; continue; } + ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); + GGML_ASSERT(backend != NULL); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); if (ggml_backend_set_n_threads_fn) {