From d5a3beb0e04a4254a28a8b6022973af994a90773 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 24 Nov 2024 00:00:52 +0100 Subject: [PATCH 01/11] ggml : add support for dynamic loading of backends --- common/common.cpp | 3 + examples/CMakeLists.txt | 27 ++++---- examples/llama-bench/llama-bench.cpp | 15 ++++- examples/main/main.cpp | 12 ++-- examples/simple-chat/simple-chat.cpp | 3 + examples/simple/simple.cpp | 4 ++ ggml/CMakeLists.txt | 1 + ggml/include/ggml-backend.h | 13 ++++ ggml/include/ggml-cpu.h | 38 ++--------- ggml/include/ggml.h | 31 +++++++++ ggml/src/CMakeLists.txt | 19 ++++-- ggml/src/ggml-amx/ggml-amx.cpp | 2 + ggml/src/ggml-backend-impl.h | 24 ++++++- ggml/src/ggml-backend-reg.cpp | 90 +++++++++++++++++++++++++- ggml/src/ggml-blas/ggml-blas.cpp | 2 + ggml/src/ggml-cann/ggml-cann.cpp | 2 + ggml/src/ggml-cpu/ggml-cpu.c | 23 ------- ggml/src/ggml-cpu/ggml-cpu.cpp | 45 ++++++++++--- ggml/src/ggml-cuda/ggml-cuda.cu | 60 +++++++++++++++++ ggml/src/ggml-kompute/ggml-kompute.cpp | 2 + ggml/src/ggml-metal/ggml-metal.m | 2 + ggml/src/ggml-rpc/ggml-rpc.cpp | 2 + ggml/src/ggml-sycl/ggml-sycl.cpp | 1 + ggml/src/ggml-vulkan/ggml-vulkan.cpp | 2 + ggml/src/ggml.c | 23 +++++++ pocs/CMakeLists.txt | 4 +- src/llama.cpp | 77 ++++++++++++---------- tests/CMakeLists.txt | 13 ++-- tests/test-backend-ops.cpp | 26 ++++---- 29 files changed, 422 insertions(+), 144 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index d314523db4c62..1e06cad7248eb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -377,6 +377,9 @@ void common_init() { #endif LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type); + + // load dynamic backends + ggml_backend_load_all(); } std::string common_params_get_system_info(const common_params & params) { diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d63a96c1c2547..bceded4e41349 100644 --- a/examples/CMakeLists.txt +++ 
b/examples/CMakeLists.txt @@ -12,13 +12,10 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(cvector-generator) add_subdirectory(batched-bench) add_subdirectory(batched) - add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) add_subdirectory(eval-callback) - add_subdirectory(export-lora) add_subdirectory(gbnf-validator) add_subdirectory(gguf-hash) add_subdirectory(gguf-split) @@ -27,28 +24,34 @@ else() add_subdirectory(imatrix) add_subdirectory(infill) add_subdirectory(llama-bench) - add_subdirectory(llava) add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(main) add_subdirectory(parallel) add_subdirectory(passkey) add_subdirectory(perplexity) - add_subdirectory(quantize-stats) add_subdirectory(quantize) add_subdirectory(retrieval) - if (GGML_RPC) - add_subdirectory(rpc) - endif() if (LLAMA_BUILD_SERVER) - add_subdirectory(server) - endif() - if (GGML_SYCL) - add_subdirectory(sycl) + add_subdirectory(server) endif() add_subdirectory(save-load-state) add_subdirectory(simple) add_subdirectory(simple-chat) add_subdirectory(speculative) add_subdirectory(tokenize) + if (NOT GGML_BACKEND_DL) + # these examples use the backends directly and cannot be built with dynamic loading + add_subdirectory(convert-llama2c-to-ggml) + add_subdirectory(cvector-generator) + add_subdirectory(export-lora) + add_subdirectory(quantize-stats) + add_subdirectory(llava) + if (GGML_RPC) + add_subdirectory(rpc) + endif() + if (GGML_SYCL) + add_subdirectory(sycl) + endif() + endif() endif() diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 3dc84a75cbec7..bac606f471639 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -1477,6 +1477,17 @@ int main(int argc, char ** argv) { cmd_params params = parse_cmd_params(argc, argv); + // initialize backends + ggml_backend_load_all(); + auto * cpu_dev = 
ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + if (!cpu_dev) { + fprintf(stderr, "%s: error: CPU backend is not loaded\n", __func__); + return 1; + } + auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(cpu_reg, "ggml_threadpool_free"); + // initialize llama.cpp if (!params.verbose) { llama_log_set(llama_null_log_callback, NULL); @@ -1551,7 +1562,7 @@ int main(int argc, char ** argv) { tpp.poll = t.poll; tpp.prio = params.prio; - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { fprintf(stderr, "%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); exit(1); @@ -1612,7 +1623,7 @@ int main(int argc, char ** argv) { llama_free(ctx); - ggml_threadpool_free(threadpool); + ggml_threadpool_free_fn(threadpool); } llama_free_model(lmodel); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 7c4ce4be2abae..775eec1b1fb43 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -165,6 +165,10 @@ int main(int argc, char ** argv) { LOG_INF("%s: llama threadpool init, n_threads = %d\n", __func__, (int) params.cpuparams.n_threads); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * ggml_threadpool_new_fn = (decltype(ggml_threadpool_new) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_new"); + auto * ggml_threadpool_free_fn = (decltype(ggml_threadpool_free) *) ggml_backend_reg_get_proc_address(reg, "ggml_threadpool_free"); + struct ggml_threadpool_params tpp_batch = ggml_threadpool_params_from_cpu_params(params.cpuparams_batch); struct ggml_threadpool_params tpp = @@ -174,7 +178,7 @@ int main(int argc, char ** argv) 
{ struct ggml_threadpool * threadpool_batch = NULL; if (!ggml_threadpool_params_match(&tpp, &tpp_batch)) { - threadpool_batch = ggml_threadpool_new(&tpp_batch); + threadpool_batch = ggml_threadpool_new_fn(&tpp_batch); if (!threadpool_batch) { LOG_ERR("%s: batch threadpool create failed : n_threads %d\n", __func__, tpp_batch.n_threads); return 1; @@ -184,7 +188,7 @@ int main(int argc, char ** argv) { tpp.paused = true; } - struct ggml_threadpool * threadpool = ggml_threadpool_new(&tpp); + struct ggml_threadpool * threadpool = ggml_threadpool_new_fn(&tpp); if (!threadpool) { LOG_ERR("%s: threadpool create failed : n_threads %d\n", __func__, tpp.n_threads); return 1; @@ -890,8 +894,8 @@ int main(int argc, char ** argv) { llama_backend_free(); - ggml_threadpool_free(threadpool); - ggml_threadpool_free(threadpool_batch); + ggml_threadpool_free_fn(threadpool); + ggml_threadpool_free_fn(threadpool_batch); return 0; } diff --git a/examples/simple-chat/simple-chat.cpp b/examples/simple-chat/simple-chat.cpp index 5f9973163732d..7f4da666b08ec 100644 --- a/examples/simple-chat/simple-chat.cpp +++ b/examples/simple-chat/simple-chat.cpp @@ -62,6 +62,9 @@ int main(int argc, char ** argv) { } }, nullptr); + // load dynamic backends + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); model_params.n_gpu_layers = ngl; diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 59760fe95db22..3288c0250a001 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -74,6 +74,10 @@ int main(int argc, char ** argv) { } } + // load dynamic backends + + ggml_backend_load_all(); + // initialize the model llama_model_params model_params = llama_model_default_params(); diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 2d32da1b6d879..70b5cfdf7fbb4 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -33,6 +33,7 @@ else() endif() option(BUILD_SHARED_LIBS "ggml: build shared 
libraries" ${BUILD_SHARED_LIBS_DEFAULT}) +option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF) # # option list diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index cef164764bb1a..d9aca71ae553a 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -190,6 +190,14 @@ extern "C" { typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads); // Get additional buffer types provided by the device (returns a NULL-terminated array) typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device); + // Set the abort callback for the backend + typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data); + // Get a list of feature flags supported by the backend (returns a NULL-terminated array) + struct ggml_backend_feature { + const char * name; + const char * value; + }; + typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); // // Backend registry @@ -214,6 +222,11 @@ extern "C" { // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) GGML_API ggml_backend_t ggml_backend_init_best(void); + // Load a backend from a dynamic library + GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); + // Load all known backends from dynamic libraries + GGML_API void ggml_backend_load_all(void); + // // Backend scheduler // diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index 7571ef9798364..a5358d047a08e 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -7,29 +7,6 @@ extern "C" { #endif - // Scheduling priorities - enum ggml_sched_priority { - GGML_SCHED_PRIO_NORMAL, - GGML_SCHED_PRIO_MEDIUM, - GGML_SCHED_PRIO_HIGH, - GGML_SCHED_PRIO_REALTIME - }; - - // Threadpool params - // Use ggml_threadpool_params_default() or 
ggml_threadpool_params_init() to populate the defaults - struct ggml_threadpool_params { - bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) - int n_threads; // number of threads - enum ggml_sched_priority prio; // thread priority - uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling) - bool strict_cpu; // strict cpu placement - bool paused; // start in paused state - }; - - struct ggml_threadpool; // forward declaration, see ggml.c - - typedef struct ggml_threadpool * ggml_threadpool_t; - // the compute plan that needs to be prepared for ggml_graph_compute() // since https://github.com/ggerganov/ggml/issues/287 struct ggml_cplan { @@ -75,14 +52,11 @@ extern "C" { GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3); GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value); - GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); - GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); - GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); - GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); - GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); - GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); - GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); + GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params); + GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool); + GGML_BACKEND_API int 
ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool); + GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool); // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data @@ -104,10 +78,10 @@ extern "C" { GGML_BACKEND_API int ggml_cpu_has_sse3 (void); GGML_BACKEND_API int ggml_cpu_has_ssse3 (void); GGML_BACKEND_API int ggml_cpu_has_avx (void); + GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx2 (void); GGML_BACKEND_API int ggml_cpu_has_f16c (void); GGML_BACKEND_API int ggml_cpu_has_fma (void); - GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void); GGML_BACKEND_API int ggml_cpu_has_avx512 (void); GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void); GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 69e6a24344b97..9843b09fbe83e 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2215,6 +2215,37 @@ extern "C" { GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type); + // ggml threadpool + // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend + // the goal should be to create an API that other backends can use move everything to the ggml base + + // scheduling priorities + enum ggml_sched_priority { + GGML_SCHED_PRIO_NORMAL, + GGML_SCHED_PRIO_MEDIUM, + GGML_SCHED_PRIO_HIGH, + GGML_SCHED_PRIO_REALTIME + }; + + // threadpool params + // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults + struct ggml_threadpool_params { + bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings) + int n_threads; // number of threads + enum ggml_sched_priority prio; // thread priority + uint32_t poll; // 
polling level (0 - no polling, 100 - aggressive polling) + bool strict_cpu; // strict cpu placement + bool paused; // start in paused state + }; + + struct ggml_threadpool; // forward declaration, see ggml.c + + typedef struct ggml_threadpool * ggml_threadpool_t; + + GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads); + GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads); + GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1); + #ifdef __cplusplus } #endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8df0e85c0d092..c506a413d16ef 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -202,6 +202,10 @@ endif() # ggml +if (GGML_BACKEND_DL) + add_compile_definitions(GGML_BACKEND_DL) +endif() + add_library(ggml-base ../include/ggml.h ../include/ggml-alloc.h @@ -239,11 +243,18 @@ function(ggml_add_backend backend) if (${BUILD_SHARED_LIBS}) target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD) target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED) + if (GGML_BACKEND_DL) + target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_DL) + endif() + endif() + if (GGML_BACKEND_DL) + install(TARGETS ${backend_target} RUNTIME) + else() + install(TARGETS ${backend_target} LIBRARY) + target_link_libraries(ggml PUBLIC ${backend_target}) + string(TOUPPER "GGML_USE_${backend}" backend_use) + target_compile_definitions(ggml PUBLIC ${backend_use}) endif() - install(TARGETS ${backend_target} LIBRARY) - target_link_libraries(ggml PUBLIC ${backend_target}) - string(TOUPPER "GGML_USE_${backend}" backend_use) - target_compile_definitions(ggml PUBLIC ${backend_use}) endif() endif() endfunction() diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp index 8568e7965fd2e..0e13266689e11 100644 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ 
b/ggml/src/ggml-amx/ggml-amx.cpp @@ -444,3 +444,5 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) { } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_amx_reg) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index fa8d5b7fb68c9..e22ddbb02b4fa 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -204,12 +204,30 @@ extern "C" { void * context; }; - // Internal backend registry API void ggml_backend_register(ggml_backend_reg_t reg); void ggml_backend_device_register(ggml_backend_dev_t device); - // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function - // typedef ggml_backend_register_t * (*ggml_backend_init)(void); + + // Add backend dynamic loading support to the backend + #ifdef GGML_BACKEND_DL + #ifdef __cplusplus + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + extern "C" { \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(); \ + } \ + ggml_backend_reg_t ggml_backend_init() { \ + return reg_fn(); \ + } + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(); \ + ggml_backend_reg_t ggml_backend_init() { \ + return reg_fn(); \ + } + #endif + #else + # define GGML_BACKEND_DL_IMPL(reg_fn) + #endif #ifdef __cplusplus } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 63e9d82017457..6b68c956c0bed 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,11 +1,13 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" -#include "ggml-cpu.h" #include "ggml-impl.h" #include #include // Backend registry +#ifdef GGML_USE_CPU +#include "ggml-cpu.h" +#endif #ifdef GGML_USE_CUDA #include "ggml-cuda.h" @@ -75,8 +77,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif - +#ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); +#endif } void register_backend(ggml_backend_reg_t reg) { @@ -193,3 +196,86 @@ 
ggml_backend_t ggml_backend_init_best(void) { } return ggml_backend_dev_init(dev, NULL); } + +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif +# include +#else +# include +#endif + +typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); + +ggml_backend_reg_t ggml_backend_load(const char * path) { +#ifdef _WIN32 + HMODULE handle = LoadLibraryA(path); + if (!handle) { + GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); + return NULL; + } + ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); + if (!backend_init) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); + FreeLibrary(handle); + return NULL; + } + ggml_backend_reg_t reg = backend_init(); + if (!reg) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); + FreeLibrary(handle); + return NULL; + } + GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + ggml_backend_register(reg); + return reg; +#else + void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + if (!handle) { + GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); + return NULL; + } + auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); + if (!backend_init) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); + dlclose(handle); + return NULL; + } + ggml_backend_reg_t reg = backend_init(); + if (!reg) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); + dlclose(handle); + return NULL; + } + GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + ggml_backend_register(reg); + return reg; +#endif +} + +void ggml_backend_load_all() { +#ifdef _WIN32 + #define GGML_BACKEND_PATH(backend) "ggml-" backend ".dll" +#elif defined(__APPLE__) + // path is 
hardcoded to the cmake build directory for now + // FIXME: should also search default system paths + #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".dylib" +#else + #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".so" +#endif + + ggml_backend_load(GGML_BACKEND_PATH("amx")); + ggml_backend_load(GGML_BACKEND_PATH("blas")); + ggml_backend_load(GGML_BACKEND_PATH("cann")); + ggml_backend_load(GGML_BACKEND_PATH("cuda")); + ggml_backend_load(GGML_BACKEND_PATH("hip")); + ggml_backend_load(GGML_BACKEND_PATH("kompute")); + ggml_backend_load(GGML_BACKEND_PATH("metal")); + ggml_backend_load(GGML_BACKEND_PATH("rpc")); + ggml_backend_load(GGML_BACKEND_PATH("sycl")); + ggml_backend_load(GGML_BACKEND_PATH("vulkan")); + ggml_backend_load(GGML_BACKEND_PATH("musa")); + ggml_backend_load(GGML_BACKEND_PATH("cpu")); +} diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index 648c9d875e346..b3f804937a86e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -512,3 +512,5 @@ ggml_backend_reg_t ggml_backend_blas_reg(void) { return &ggml_backend_blas_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg) diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 776340881434d..e592560c67a2d 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2126,3 +2126,5 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, ggml_cann_set_device(device); ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); } + +GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg) diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 4b58254e7d108..c6ede19d9d1c0 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -13578,29 +13578,6 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int #endif // GGML_USE_OPENMP -void 
ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { - p->n_threads = n_threads; - p->prio = 0; // default priority (usually means normal or inherited) - p->poll = 50; // hybrid-polling enabled - p->strict_cpu = false; // no strict placement (all threads share same cpumask) - p->paused = false; // threads are ready to go - memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) -} - -struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { - struct ggml_threadpool_params p; - ggml_threadpool_params_init(&p, n_threads); - return p; -} - -bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { - if (p0->n_threads != p1->n_threads ) return false; - if (p0->prio != p1->prio ) return false; - if (p0->poll != p1->poll ) return false; - if (p0->strict_cpu != p1->strict_cpu ) return false; - return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; -} - static struct ggml_threadpool * ggml_threadpool_new_impl( struct ggml_threadpool_params * tpp, struct ggml_cgraph * cgraph, diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 573b7c5b9b375..55f88992a81e5 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -541,16 +541,12 @@ static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg return &ggml_backend_cpu_device; } -struct ggml_backend_feature { - const char * name; - const char * value; -}; - -// Not used yet // This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically, -// and additionally to allow other backends to expose their own list of features that applications can query using the same API. 
+// and additionally to allow other backends to expose their own list of features that applications can query using the same API static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) { static std::vector features = []() { + ggml_cpu_init(); + std::vector features; if (ggml_cpu_has_sse3()) { features.push_back({ "SSE3", "1" }); @@ -561,6 +557,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_avx()) { features.push_back({ "AVX", "1" }); } + if (ggml_cpu_has_avx_vnni()) { + features.push_back({ "AVX_VNNI", "1" }); + } if (ggml_cpu_has_avx2()) { features.push_back({ "AVX2", "1" }); } @@ -570,9 +569,6 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_fma()) { features.push_back({ "FMA", "1" }); } - if (ggml_cpu_has_avx_vnni()) { - features.push_back({ "AVX_VNNI", "1" }); - } if (ggml_cpu_has_avx512()) { features.push_back({ "AVX512", "1" }); } @@ -619,6 +615,10 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_llamafile()) { features.push_back({ "LLAMAFILE", "1" }); } + // TODO: rename this + #ifdef GGML_USE_CPU_AARCH64 + features.push_back({ "AARCH64_REPACK", "1" }); + #endif features.push_back({ nullptr, nullptr }); @@ -637,6 +637,29 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) { return (void *)ggml_backend_cpu_get_extra_bufts; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cpu_get_features; + } + if (strcmp(name, "ggml_backend_set_abort_callback") == 0) { + return (void *)ggml_backend_cpu_set_abort_callback; + } + if (strcmp(name, "ggml_backend_cpu_numa_init") == 0) { + return (void *)ggml_numa_init; + } + if (strcmp(name, "ggml_backend_cpu_is_numa") == 0) { + return (void *)ggml_is_numa; + } + + // threadpool - TODO: move to ggml-base + if 
(strcmp(name, "ggml_threadpool_new") == 0) { + return (void *)ggml_threadpool_new; + } + if (strcmp(name, "ggml_threadpool_free") == 0) { + return (void *)ggml_threadpool_free; + } + if (strcmp(name, "ggml_backend_cpu_set_threadpool") == 0) { + return (void *)ggml_backend_cpu_set_threadpool; + } return NULL; @@ -661,3 +684,5 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) { return &ggml_backend_cpu_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cpu_reg) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index dd94ab03d5b6c..a66b3a9f3ff60 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3126,6 +3126,61 @@ static ggml_backend_dev_t ggml_backend_cuda_reg_get_device(ggml_backend_reg_t re return ctx->devices[index]; } +static ggml_backend_feature * ggml_backend_cuda_get_features(ggml_backend_reg_t reg) { + static std::vector features = []() { + std::vector features; + #define _STRINGIFY(...) #__VA_ARGS__ + #define STRINGIFY(...) _STRINGIFY(__VA_ARGS__) + + #ifdef __CUDA_ARCH_LIST__ + features.push_back({ "ARCHS", STRINGIFY(__CUDA_ARCH_LIST__) }); + #endif + + #ifdef GGML_CUDA_FORCE_MMQ + features.push_back({ "FORCE_MMQ", "1" }); + #endif + + #ifdef GGML_CUDA_FORCE_CUBLAS + features.push_back({ "FORCE_CUBLAS", "1" }); + #endif + + #ifdef GGML_CUDA_NO_VMM + features.push_back({ "NO_VMM", "1" }); + #endif + + #ifdef GGML_CUDA_NO_PEER_COPY + features.push_back({ "NO_PEER_COPY", "1" }); + #endif + + #ifdef GGML_CUDA_F16 + features.push_back({ "F16", "1" }); + #endif + + #ifdef GGML_CUDA_USE_GRAPHS + features.push_back({ "USE_GRAPHS", "1" }); + #endif + + #ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE + features.push_back({ "PEER_MAX_BATCH_SIZE", STRINGIFY(GGML_CUDA_PEER_MAX_BATCH_SIZE) }); + #endif + + #ifdef GGML_CUDA_FA_ALL_QUANTS + features.push_back({ "FA_ALL_QUANTS", "1" }); + #endif + + #undef _STRINGIFY + #undef STRINGIFY + + features.push_back({ nullptr, nullptr }); + + return features; + }(); + + return 
features.data(); + + GGML_UNUSED(reg); +} + static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { @@ -3137,6 +3192,9 @@ static void * ggml_backend_cuda_reg_get_proc_address(ggml_backend_reg_t reg, con if (strcmp(name, "ggml_backend_unregister_host_buffer") == 0) { return (void *)ggml_backend_cuda_unregister_host_buffer; } + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_cuda_get_features; + } return nullptr; } @@ -3209,3 +3267,5 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return cuda_backend; } + +GGML_BACKEND_DL_IMPL(ggml_backend_cuda_reg) diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp index 2fea9e4cc8d38..c2a7b6afa8a80 100644 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute/ggml-kompute.cpp @@ -2182,3 +2182,5 @@ ggml_backend_reg_t ggml_backend_kompute_reg() { return ® } + +GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index d1abb3cef0ec4..1b1967f945afa 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4396,3 +4396,5 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) { return &g_ggml_backend_metal_reg; } + +GGML_BACKEND_DL_IMPL(ggml_backend_metal_reg) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 47357daabdf54..31c1313b94557 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1401,3 +1401,5 @@ ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) { return dev; } + +GGML_BACKEND_DL_IMPL(ggml_backend_rpc_reg) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 255bc64c6badd..f36640fd64491 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4678,3 +4678,4 @@ 
ggml_backend_t ggml_backend_sycl_init(int device) { return sycl_backend; } +GGML_BACKEND_DL_IMPL(ggml_backend_sycl_reg) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ca71da2f7b7f5..f9e5fabfaa9ee 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -7365,3 +7365,5 @@ static void ggml_vk_check_results_1(ggml_tensor * tensor) { VK_LOG_DEBUG("END ggml_vk_check_results_1(" << tensor->name << ")"); } #endif + +GGML_BACKEND_DL_IMPL(ggml_backend_vk_reg) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 78e7874dee04d..1a2318cb188c4 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -7571,3 +7571,26 @@ void ggml_log_set(ggml_log_callback log_callback, void * user_data) { g_logger_state.log_callback = log_callback ? log_callback : ggml_log_callback_default; g_logger_state.log_callback_user_data = user_data; } + +void ggml_threadpool_params_init(struct ggml_threadpool_params * p, int n_threads) { + p->n_threads = n_threads; + p->prio = 0; // default priority (usually means normal or inherited) + p->poll = 50; // hybrid-polling enabled + p->strict_cpu = false; // no strict placement (all threads share same cpumask) + p->paused = false; // threads are ready to go + memset(p->cpumask, 0, GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited) +} + +struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads) { + struct ggml_threadpool_params p; + ggml_threadpool_params_init(&p, n_threads); + return p; +} + +bool ggml_threadpool_params_match(const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1) { + if (p0->n_threads != p1->n_threads ) return false; + if (p0->prio != p1->prio ) return false; + if (p0->poll != p1->poll ) return false; + if (p0->strict_cpu != p1->strict_cpu ) return false; + return memcmp(p0->cpumask, p1->cpumask, GGML_MAX_N_THREADS) == 0; +} diff --git a/pocs/CMakeLists.txt b/pocs/CMakeLists.txt 
index 03e1d2c04be65..d49d14dee4351 100644 --- a/pocs/CMakeLists.txt +++ b/pocs/CMakeLists.txt @@ -8,5 +8,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}) if (EMSCRIPTEN) else() - add_subdirectory(vdot) + if (NOT GGML_BACKEND_DL) + add_subdirectory(vdot) + endif() endif() diff --git a/src/llama.cpp b/src/llama.cpp index 001711037d5d1..88a802dfbe2d5 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -4866,7 +4866,9 @@ struct llama_model_loader { mappings.reserve(files.size()); mmaps_used.reserve(files.size()); for (const auto & file : files) { - std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU)); + auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa"); + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, is_numa_fn())); mmaps_used.emplace_back(mapping->size, 0); if (mlock_mmaps) { std::unique_ptr mlock_mmap(new llama_mlock()); @@ -9190,7 +9192,7 @@ static bool llm_load_tensors( ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft); if (!dev) { // FIXME: workaround for CPU backend buft having a NULL device - dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0); + dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } ggml_backend_dev_props props; ggml_backend_dev_get_props(dev, &props); @@ -17443,8 +17445,9 @@ static enum ggml_status llama_graph_compute( int n_threads, ggml_threadpool * threadpool) { if (lctx.backend_cpu != nullptr) { - ggml_backend_cpu_set_threadpool(lctx.backend_cpu, threadpool); - ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(lctx.backend_cpu)); + auto * set_threadpool_fn = (decltype(ggml_backend_cpu_set_threadpool) *) ggml_backend_reg_get_proc_address(reg, 
"ggml_backend_cpu_set_threadpool"); + set_threadpool_fn(lctx.backend_cpu, threadpool); } // set the number of threads for all the backends @@ -19478,7 +19481,11 @@ void llama_backend_init(void) { void llama_numa_init(enum ggml_numa_strategy numa) { if (numa != GGML_NUMA_STRATEGY_DISABLED) { - ggml_numa_init(numa); + auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); + GGML_ASSERT(dev && "CPU backend is not loaded"); + auto * reg = ggml_backend_dev_backend_reg(dev); + auto * numa_init_fn = (decltype(ggml_numa_init) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_numa_init"); + numa_init_fn(numa); } } @@ -19752,9 +19759,6 @@ struct llama_context * llama_new_context_with_model( __func__, n_ctx_per_seq, hparams.n_ctx_train); } - ctx->abort_callback = params.abort_callback; - ctx->abort_callback_data = params.abort_callback_data; - ctx->logits_all = params.logits_all; // build worst-case graph for encoder if a model contains encoder @@ -19803,7 +19807,7 @@ struct llama_context * llama_new_context_with_model( } // add CPU backend - ctx->backend_cpu = ggml_backend_cpu_init(); + ctx->backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (ctx->backend_cpu == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); llama_free(ctx); @@ -19823,6 +19827,8 @@ struct llama_context * llama_new_context_with_model( } } + llama_set_abort_callback(ctx, params.abort_callback, params.abort_callback_data); + if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); @@ -19868,7 +19874,8 @@ struct llama_context * llama_new_context_with_model( std::vector backend_ptrs; for (auto & backend : ctx->backends) { auto * buft = ggml_backend_get_default_buffer_type(backend.get()); - if (ggml_backend_is_cpu(backend.get()) && !model->devices.empty()) { + auto backend_type = 
ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (backend_type == GGML_BACKEND_DEVICE_TYPE_CPU && !model->devices.empty()) { // use the host buffer of the first device CPU for faster transfer of the intermediate state auto * dev = model->devices[0]; auto * host_buft = ggml_backend_dev_host_buffer_type(dev); @@ -19896,7 +19903,8 @@ struct llama_context * llama_new_context_with_model( // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) { for (auto & backend : ctx->backends) { - if (ggml_backend_is_cpu(backend.get())) { + auto dev_type = ggml_backend_dev_type(ggml_backend_get_device(backend.get())); + if (dev_type == GGML_BACKEND_DEVICE_TYPE_CPU) { // ignore CPU backend continue; } @@ -21450,6 +21458,14 @@ int32_t llama_n_threads_batch(struct llama_context * ctx) { void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { ctx->abort_callback = abort_callback; ctx->abort_callback_data = abort_callback_data; + + for (auto & backend : ctx->backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); + } + } } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { @@ -22191,32 +22207,23 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int } const char * llama_print_system_info(void) { - ggml_cpu_init(); // some ARM features are detected at runtime - static std::string s; - s = ""; - s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | "; - s += "AVX_VNNI = " + std::to_string(ggml_cpu_has_avx_vnni()) + " | "; - s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | "; - s += "AVX512 = " 
+ std::to_string(ggml_cpu_has_avx512()) + " | "; - s += "AVX512_VBMI = " + std::to_string(ggml_cpu_has_avx512_vbmi()) + " | "; - s += "AVX512_VNNI = " + std::to_string(ggml_cpu_has_avx512_vnni()) + " | "; - s += "AVX512_BF16 = " + std::to_string(ggml_cpu_has_avx512_bf16()) + " | "; - s += "AMX_INT8 = " + std::to_string(ggml_cpu_has_amx_int8()) + " | "; - s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | "; - s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | "; - s += "SVE = " + std::to_string(ggml_cpu_has_sve()) + " | "; - s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | "; - s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | "; - s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | "; - s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | "; - s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | "; - s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; - s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; - s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; - s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; - s += "LLAMAFILE = " + std::to_string(ggml_cpu_has_llamafile()) + " | "; + for (int i = 0; i < ggml_backend_reg_count(); i++) { + auto * reg = ggml_backend_reg_get(i); + auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); + if (get_features_fn) { + ggml_backend_feature * features = get_features_fn(reg); + s += ggml_backend_reg_name(reg); + s += " : "; + for (; features->name; features++) { + s += features->name; + s += " = "; + s += features->value; + s += " | "; + } + } + } return s.c_str(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b06f122e89873..82373ff4e1862 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,23 +110,26 @@ llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CU # 
llama_target_and_test(test-double-float.cpp) # SLOW llama_target_and_test(test-log.cpp) llama_target_and_test(test-arg-parser.cpp) -llama_target_and_test(test-quantize-fns.cpp) -llama_target_and_test(test-quantize-perf.cpp) llama_target_and_test(test-sampling.cpp) llama_target_and_test(test-chat-template.cpp) llama_target_and_test(test-grammar-parser.cpp) llama_target_and_test(test-grammar-integration.cpp) llama_target_and_test(test-llama-grammar.cpp) -llama_target_and_test(test-barrier.cpp) # llama_target_and_test(test-opt.cpp) # SLOW llama_target_and_test(test-backend-ops.cpp) -llama_target_and_test(test-rope.cpp) - llama_target_and_test(test-model-load-cancel.cpp LABEL "model") llama_target_and_test(test-autorelease.cpp LABEL "model") +if (NOT GGML_BACKEND_DL) + # these tests use the backends directly and cannot be built with dynamic loading + llama_target_and_test(test-barrier.cpp) + llama_target_and_test(test-quantize-fns.cpp) + llama_target_and_test(test-quantize-perf.cpp) + llama_target_and_test(test-rope.cpp) +endif() + # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) 
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b2b5705243ea2..6376b0e4c66cf 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -16,7 +16,6 @@ #include -#include #include #include @@ -26,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -639,19 +637,20 @@ struct test_case { // determine number of runs int n_runs; + bool is_cpu = ggml_backend_dev_type(ggml_backend_get_device(backend)) == GGML_BACKEND_DEVICE_TYPE_CPU; if (op_flops(out) > 0) { // based on flops const uint64_t GFLOP = 1000 * 1000 * 1000; const uint64_t target_flops_cpu = 8ULL * GFLOP; const uint64_t target_flops_gpu = 100ULL * GFLOP; - uint64_t target_flops = ggml_backend_is_cpu(backend) ? target_flops_cpu : target_flops_gpu; + uint64_t target_flops = is_cpu ? target_flops_cpu : target_flops_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_flops / op_flops(out)) + 1; } else { // based on memory size const size_t GB = 1ULL << 30; const size_t target_size_cpu = 8 * GB; const size_t target_size_gpu = 32 * GB; - size_t target_size = ggml_backend_is_cpu(backend) ? target_size_cpu : target_size_gpu; + size_t target_size = is_cpu ? 
target_size_cpu : target_size_gpu; n_runs = std::min(ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1; } @@ -3873,7 +3872,11 @@ static std::vector> make_test_cases_perf() { static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op_name) { if (mode == MODE_TEST) { auto test_cases = make_test_cases_eval(); - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + ggml_backend_t backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL); + if (backend_cpu == NULL) { + printf(" Failed to initialize CPU backend\n"); + return false; + } size_t n_ok = 0; for (auto & test : test_cases) { @@ -3953,7 +3956,9 @@ int main(int argc, char ** argv) { } } - // enumerate backends + // load and enumerate backends + ggml_backend_load_all(); + printf("Testing %zu devices\n\n", ggml_backend_dev_count()); size_t n_ok = 0; @@ -3969,16 +3974,15 @@ int main(int argc, char ** argv) { continue; } - ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); - GGML_ASSERT(backend != NULL); - - if (backend_filter == NULL && ggml_backend_is_cpu(backend) && mode != MODE_GRAD) { + if (backend_filter == NULL && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU && mode != MODE_GRAD) { printf(" Skipping CPU backend\n"); - ggml_backend_free(backend); n_ok++; continue; } + ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); + GGML_ASSERT(backend != NULL); + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); if (ggml_backend_set_n_threads_fn) { From ccd8df8a9dda8ff9b044520df4b9fa0bfcd6deef Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 24 Nov 2024 00:59:39 +0100 Subject: [PATCH 02/11] add ggml_backend_unload --- ggml/include/ggml-backend.h | 4 +- ggml/src/ggml-backend-reg.cpp | 85 ++++++++++++++++++++++++----------- 2 files changed, 62 insertions(+), 27 deletions(-) diff 
--git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index d9aca71ae553a..19881a5059f17 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -222,8 +222,10 @@ extern "C" { // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL) GGML_API ggml_backend_t ggml_backend_init_best(void); - // Load a backend from a dynamic library + // Load a backend from a dynamic library and register it GGML_API ggml_backend_reg_t ggml_backend_load(const char * path); + // Unload a backend if loaded dynamically and unregister it + GGML_API void ggml_backend_unload(ggml_backend_reg_t reg); // Load all known backends from dynamic libraries GGML_API void ggml_backend_load_all(void); diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 6b68c956c0bed..78096af183ebd 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -1,6 +1,7 @@ #include "ggml-backend-impl.h" #include "ggml-backend.h" #include "ggml-impl.h" +#include #include #include @@ -45,8 +46,13 @@ #include "ggml-kompute.h" #endif +struct ggml_backend_reg_entry { + ggml_backend_reg_t reg; + void * handle; +}; + struct ggml_backend_registry { - std::vector backends; + std::vector backends; std::vector devices; ggml_backend_registry() { @@ -82,7 +88,13 @@ struct ggml_backend_registry { #endif } - void register_backend(ggml_backend_reg_t reg) { + ~ggml_backend_registry() { + while (!backends.empty()) { + ggml_backend_unload(backends.back().reg); + } + } + + void register_backend(ggml_backend_reg_t reg, void * handle = nullptr) { if (!reg) { return; } @@ -91,7 +103,7 @@ struct ggml_backend_registry { GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n", __func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg)); #endif - backends.push_back(reg); + backends.push_back({ reg, handle }); for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) { 
register_device(ggml_backend_reg_dev_get(reg, i)); } @@ -126,7 +138,7 @@ size_t ggml_backend_reg_count() { ggml_backend_reg_t ggml_backend_reg_get(size_t index) { GGML_ASSERT(index < ggml_backend_reg_count()); - return get_reg().backends[index]; + return get_reg().backends[index].reg; } ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { @@ -136,7 +148,7 @@ ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) { return reg; } } - return NULL; + return nullptr; } // Device enumeration @@ -156,7 +168,7 @@ ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) { return dev; } } - return NULL; + return nullptr; } ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { @@ -166,14 +178,14 @@ ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) { return dev; } } - return NULL; + return nullptr; } // Convenience functions ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_name(name); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -181,7 +193,7 @@ ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) { ggml_backend_dev_t dev = ggml_backend_dev_by_type(type); if (!dev) { - return NULL; + return nullptr; } return ggml_backend_dev_init(dev, params); } @@ -192,9 +204,9 @@ ggml_backend_t ggml_backend_init_best(void) { dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); } if (!dev) { - return NULL; + return nullptr; } - return ggml_backend_dev_init(dev, NULL); + return ggml_backend_dev_init(dev, nullptr); } #ifdef _WIN32 @@ -214,45 +226,66 @@ ggml_backend_reg_t ggml_backend_load(const char * path) { HMODULE handle = LoadLibraryA(path); if (!handle) { GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); - return NULL; + return 
nullptr; } ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); if (!backend_init) { GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); FreeLibrary(handle); - return NULL; - } - ggml_backend_reg_t reg = backend_init(); - if (!reg) { - GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); - FreeLibrary(handle); - return NULL; + return nullptr; } - GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); - ggml_backend_register(reg); - return reg; #else void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); if (!handle) { GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); - return NULL; + return nullptr; } auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); if (!backend_init) { GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); dlclose(handle); - return NULL; + return nullptr; } +#endif ggml_backend_reg_t reg = backend_init(); if (!reg) { GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); dlclose(handle); - return NULL; + return nullptr; } GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); - ggml_backend_register(reg); + get_reg().register_backend(reg, handle); return reg; +} + +void ggml_backend_unload(ggml_backend_reg_t reg) { + auto it = std::find_if(get_reg().backends.begin(), get_reg().backends.end(), + [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); + + if (it == get_reg().backends.end()) { + GGML_LOG_ERROR("%s: backend not found\n", __func__); + return; + } + + GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); + + // remove devices + get_reg().devices.erase( + std::remove_if(get_reg().devices.begin(), get_reg().devices.end(), + [reg](ggml_backend_dev_t dev) { return 
ggml_backend_dev_backend_reg(dev) == reg; }), + get_reg().devices.end()); + + // unload library + if (it->handle) { +#ifdef _WIN32 + FreeLibrary((HMODULE) it->handle); +#else + dlclose(it->handle); #endif + } + + // remove backend + get_reg().backends.erase(it); } void ggml_backend_load_all() { From 1605605b54789c39a2fe775304160170e7737e10 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 24 Nov 2024 01:08:56 +0100 Subject: [PATCH 03/11] link to libdl on linux --- Makefile | 1 + ggml/src/CMakeLists.txt | 4 ++++ ggml/src/ggml-backend-reg.cpp | 5 +++++ 3 files changed, 10 insertions(+) diff --git a/Makefile b/Makefile index 5c899438515e1..bd2a79b93cf71 100644 --- a/Makefile +++ b/Makefile @@ -290,6 +290,7 @@ endif # some memory allocation are available on Linux through GNU extensions in libc ifeq ($(UNAME_S),Linux) MK_CPPFLAGS += -D_GNU_SOURCE + MK_LDFLAGS += -ldl endif # RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1, diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c506a413d16ef..8cf2640fa08e0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -230,6 +230,10 @@ add_library(ggml target_link_libraries(ggml PUBLIC ggml-base) +if (CMAKE_SYSTEM_NAME MATCHES "Linux") + target_link_libraries(ggml PRIVATE dl) +endif() + function(ggml_add_backend backend) string(TOUPPER "GGML_${backend}" backend_id) if (${backend_id}) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 78096af183ebd..742b860bd0c5b 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -250,9 +250,14 @@ ggml_backend_reg_t ggml_backend_load(const char * path) { ggml_backend_reg_t reg = backend_init(); if (!reg) { GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); +#ifdef _WIN32 + FreeLibrary(handle); +#else dlclose(handle); +#endif return nullptr; } + GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); get_reg().register_backend(reg, 
handle); return reg; From 808d434901d067b4991f05efed81755d5999a87a Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 24 Nov 2024 02:05:21 +0100 Subject: [PATCH 04/11] fixes --- Makefile | 2 +- examples/eval-callback/CMakeLists.txt | 4 +- ggml/src/ggml-backend-reg.cpp | 89 +++++++++++++++------------ src/llama.cpp | 2 +- 4 files changed, 56 insertions(+), 41 deletions(-) diff --git a/Makefile b/Makefile index bd2a79b93cf71..14defdd107b7a 100644 --- a/Makefile +++ b/Makefile @@ -251,7 +251,7 @@ endif # # keep standard at C11 and C++11 -MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon +MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon -DGGML_USE_CPU MK_CFLAGS = -std=c11 -fPIC MK_CXXFLAGS = -std=c++11 -fPIC MK_NVCCFLAGS = -std=c++11 diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index a48753d38e16e..89264242850ba 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -5,5 +5,7 @@ target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) -add_test(NAME ${TEST_TARGET} COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +add_test(NAME ${TEST_TARGET} + COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0 + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../) # HACK for dl backends set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 742b860bd0c5b..60ba9d0e291e7 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -5,6 +5,17 @@ #include #include +#ifdef _WIN32 +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# 
endif +# include +#else +# include +#endif + + // Backend registry #ifdef GGML_USE_CPU #include "ggml-cpu.h" @@ -90,7 +101,8 @@ struct ggml_backend_registry { ~ggml_backend_registry() { while (!backends.empty()) { - ggml_backend_unload(backends.back().reg); + // use silent since the log system may have been destroyed at this point + unload_backend(backends.back().reg, true); } } @@ -115,6 +127,43 @@ struct ggml_backend_registry { #endif devices.push_back(device); } + + void unload_backend(ggml_backend_reg_t reg, bool silent) { + if (!silent) { + GGML_LOG_INFO("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); + } + auto it = std::find_if(backends.begin(), backends.end(), + [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); + + if (it == backends.end()) { + if (!silent) { + GGML_LOG_ERROR("%s: backend not found\n", __func__); + } + return; + } + + if (!silent) { + GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); + } + + // remove devices + devices.erase( + std::remove_if(devices.begin(), devices.end(), + [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), + devices.end()); + + // unload library + if (it->handle) { +#ifdef _WIN32 + FreeLibrary((HMODULE) it->handle); +#else + dlclose(it->handle); +#endif + } + + // remove backend + backends.erase(it); + } }; static ggml_backend_registry & get_reg() { @@ -209,16 +258,6 @@ ggml_backend_t ggml_backend_init_best(void) { return ggml_backend_dev_init(dev, nullptr); } -#ifdef _WIN32 -# define WIN32_LEAN_AND_MEAN -# ifndef NOMINMAX -# define NOMINMAX -# endif -# include -#else -# include -#endif - typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); ggml_backend_reg_t ggml_backend_load(const char * path) { @@ -264,33 +303,7 @@ ggml_backend_reg_t ggml_backend_load(const char * path) { } void ggml_backend_unload(ggml_backend_reg_t reg) { - auto it = std::find_if(get_reg().backends.begin(), get_reg().backends.end(), - 
[reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); - - if (it == get_reg().backends.end()) { - GGML_LOG_ERROR("%s: backend not found\n", __func__); - return; - } - - GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); - - // remove devices - get_reg().devices.erase( - std::remove_if(get_reg().devices.begin(), get_reg().devices.end(), - [reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }), - get_reg().devices.end()); - - // unload library - if (it->handle) { -#ifdef _WIN32 - FreeLibrary((HMODULE) it->handle); -#else - dlclose(it->handle); -#endif - } - - // remove backend - get_reg().backends.erase(it); + get_reg().unload_backend(reg, true); } void ggml_backend_load_all() { diff --git a/src/llama.cpp b/src/llama.cpp index 88a802dfbe2d5..44f0b8fadfb2e 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -22209,7 +22209,7 @@ int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int const char * llama_print_system_info(void) { static std::string s; - for (int i = 0; i < ggml_backend_reg_count(); i++) { + for (size_t i = 0; i < ggml_backend_reg_count(); i++) { auto * reg = ggml_backend_reg_get(i); auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features"); if (get_features_fn) { From ad1e27a0af6555509ce5871c6e8c4d8d59fb5985 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 24 Nov 2024 10:53:35 +0200 Subject: [PATCH 05/11] metal : export ggml_backend_get_features() ggml-ci --- ggml/src/ggml-metal/ggml-metal.m | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 1b1967f945afa..a3950e17f5739 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4372,11 +4372,36 @@ static ggml_backend_dev_t ggml_backend_metal_reg_device_get(ggml_backend_reg_t r 
GGML_UNUSED(index); } +static struct ggml_backend_feature g_ggml_backend_metal_features[] = { +#if defined(GGML_METAL_EMBED_LIBRARY) + { "EMBED_LIBRARY", "1" }, +#endif +#if defined(GGML_METAL_USE_BF16) + { "BF16", "1" }, +#endif + { nil, nil }, +}; + +static struct ggml_backend_feature * ggml_backend_metal_get_features(ggml_backend_reg_t reg) { + return g_ggml_backend_metal_features; + + GGML_UNUSED(reg); +} + +static void * ggml_backend_metal_get_proc_address(ggml_backend_reg_t reg, const char * name) { + if (strcmp(name, "ggml_backend_get_features") == 0) { + return (void *)ggml_backend_metal_get_features; + } + + return NULL; + + GGML_UNUSED(reg); +} static struct ggml_backend_reg_i ggml_backend_metal_reg_i = { /* .get_name = */ ggml_backend_metal_reg_get_name, /* .device_count = */ ggml_backend_metal_reg_device_count, /* .device_get = */ ggml_backend_metal_reg_device_get, - /* .get_proc_address = */ NULL, + /* .get_proc_address = */ ggml_backend_metal_get_proc_address, }; ggml_backend_reg_t ggml_backend_metal_reg(void) { From 402a0e94dcf9b895ad9293c870436c3c49c1e6c2 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Sun, 24 Nov 2024 19:12:22 +0100 Subject: [PATCH 06/11] Update ggml/src/ggml-backend-impl.h Co-authored-by: Georgi Gerganov --- ggml/src/ggml-backend-impl.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index e22ddbb02b4fa..8d670b894c33c 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -211,18 +211,18 @@ extern "C" { // Add backend dynamic loading support to the backend #ifdef GGML_BACKEND_DL #ifdef __cplusplus - # define GGML_BACKEND_DL_IMPL(reg_fn) \ - extern "C" { \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(); \ - } \ - ggml_backend_reg_t ggml_backend_init() { \ - return reg_fn(); \ + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + extern "C" { \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); 
\ + } \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ } #else - # define GGML_BACKEND_DL_IMPL(reg_fn) \ - GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(); \ - ggml_backend_reg_t ggml_backend_init() { \ - return reg_fn(); \ + # define GGML_BACKEND_DL_IMPL(reg_fn) \ + GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \ + ggml_backend_reg_t ggml_backend_init(void) { \ + return reg_fn(); \ } #endif #else From bd9f7b42971b364dc23e625d21eb6f5ac5041b5c Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 24 Nov 2024 23:22:16 +0100 Subject: [PATCH 07/11] refactor cmake build use MODULE target type for dl backend set backend output directory to the runtime directory ggml_backend_load_all searches backends in the system path first, then in the executable directory ggml-ci --- ggml/src/CMakeLists.txt | 40 ++++-- ggml/src/ggml-amx/CMakeLists.txt | 10 +- ggml/src/ggml-backend-impl.h | 16 ++- ggml/src/ggml-backend-reg.cpp | 198 ++++++++++++++++++--------- ggml/src/ggml-blas/CMakeLists.txt | 9 +- ggml/src/ggml-cann/CMakeLists.txt | 6 +- ggml/src/ggml-cpu/CMakeLists.txt | 19 ++- ggml/src/ggml-cuda/CMakeLists.txt | 11 +- ggml/src/ggml-hip/CMakeLists.txt | 10 +- ggml/src/ggml-kompute/CMakeLists.txt | 10 +- ggml/src/ggml-metal/CMakeLists.txt | 9 +- ggml/src/ggml-musa/CMakeLists.txt | 10 +- ggml/src/ggml-rpc/CMakeLists.txt | 8 +- ggml/src/ggml-sycl/CMakeLists.txt | 10 +- ggml/src/ggml-vulkan/CMakeLists.txt | 12 +- 15 files changed, 220 insertions(+), 158 deletions(-) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8cf2640fa08e0..071508ddae021 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -202,8 +202,8 @@ endif() # ggml -if (GGML_BACKEND_DL) - add_compile_definitions(GGML_BACKEND_DL) +if (GGML_BACKEND_DL AND NOT BUILD_SHARED_LIBS) + message(FATAL_ERROR "GGML_BACKEND_DL requires BUILD_SHARED_LIBS") endif() add_library(ggml-base @@ -234,6 +234,27 @@ if (CMAKE_SYSTEM_NAME MATCHES "Linux") 
target_link_libraries(ggml PRIVATE dl) endif() +function(ggml_add_backend_library backend) + if (GGML_BACKEND_DL) + add_library(${backend} MODULE ${ARGN}) + # write the shared library to the output directory + set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) + else() + add_library(${backend} ${ARGN}) + target_link_libraries(ggml PUBLIC ${backend}) + install(TARGETS ${backend} LIBRARY) + endif() + + target_link_libraries(${backend} PRIVATE ggml-base) + target_include_directories(${backend} PRIVATE ..) + + if (${BUILD_SHARED_LIBS}) + target_compile_definitions(${backend} PRIVATE GGML_BACKEND_BUILD) + target_compile_definitions(${backend} PUBLIC GGML_BACKEND_SHARED) + endif() +endfunction() + function(ggml_add_backend backend) string(TOUPPER "GGML_${backend}" backend_id) if (${backend_id}) @@ -244,18 +265,7 @@ function(ggml_add_backend backend) # however, currently it is necessary for AMX, since it is enabled by default on llama.cpp if (${backend_id}) message(STATUS "Including ${backend} backend") - if (${BUILD_SHARED_LIBS}) - target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_BUILD) - target_compile_definitions(${backend_target} PUBLIC GGML_BACKEND_SHARED) - if (GGML_BACKEND_DL) - target_compile_definitions(${backend_target} PRIVATE GGML_BACKEND_DL) - endif() - endif() - if (GGML_BACKEND_DL) - install(TARGETS ${backend_target} RUNTIME) - else() - install(TARGETS ${backend_target} LIBRARY) - target_link_libraries(ggml PUBLIC ${backend_target}) + if (NOT GGML_BACKEND_DL) string(TOUPPER "GGML_USE_${backend}" backend_use) target_compile_definitions(ggml PUBLIC ${backend_use}) endif() @@ -271,10 +281,10 @@ ggml_add_backend(CUDA) ggml_add_backend(HIP) ggml_add_backend(Kompute) ggml_add_backend(METAL) +ggml_add_backend(MUSA) ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) -ggml_add_backend(MUSA) foreach (target 
ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-amx/CMakeLists.txt b/ggml/src/ggml-amx/CMakeLists.txt index d6676f3f67b20..cf3ade6f020ed 100644 --- a/ggml/src/ggml-amx/CMakeLists.txt +++ b/ggml/src/ggml-amx/CMakeLists.txt @@ -9,12 +9,10 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MA file(GLOB GGML_SOURCES_AMX "*.cpp") - add_library(ggml-amx - ${GGML_HEADERS_AMX} - ${GGML_SOURCES_AMX}) - - target_link_libraries(ggml-amx PRIVATE ggml-base) - target_include_directories(ggml-amx PRIVATE . ..) + ggml_add_backend_library(ggml-amx + ${GGML_HEADERS_AMX} + ${GGML_SOURCES_AMX} + ) # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags # TODO: integrate AMX backend into the CPU backend diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 8d670b894c33c..4f31104e787ff 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -63,20 +63,20 @@ extern "C" { enum ggml_backend_buffer_usage usage; }; - ggml_backend_buffer_t ggml_backend_buffer_init( + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( ggml_backend_buffer_type_t buft, struct ggml_backend_buffer_i iface, void * context, size_t size); // do not use directly, use ggml_backend_tensor_copy instead - bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); + GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); // multi-buffer // buffer that contains a collection of buffers - ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); - bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); - void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_t 
ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); + GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); // // Backend (stream) @@ -205,10 +205,12 @@ extern "C" { }; // Internal backend registry API - void ggml_backend_register(ggml_backend_reg_t reg); - void ggml_backend_device_register(ggml_backend_dev_t device); + GGML_API void ggml_backend_register(ggml_backend_reg_t reg); + GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); // Add backend dynamic loading support to the backend + typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); + #ifdef GGML_BACKEND_DL #ifdef __cplusplus # define GGML_BACKEND_DL_IMPL(reg_fn) \ diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 60ba9d0e291e7..943691f749a45 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include #include +#include #include #ifdef _WIN32 @@ -11,11 +12,14 @@ # define NOMINMAX # endif # include +#elif defined(__APPLE__) +# include +# include #else # include +# include #endif - // Backend registry #ifdef GGML_USE_CPU #include "ggml-cpu.h" @@ -128,10 +132,59 @@ struct ggml_backend_registry { devices.push_back(device); } - void unload_backend(ggml_backend_reg_t reg, bool silent) { - if (!silent) { - GGML_LOG_INFO("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg)); + ggml_backend_reg_t load_backend(const char * path, bool silent) { +#ifdef _WIN32 + HMODULE handle = LoadLibraryA(path); + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); + } + return nullptr; + } + ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: 
failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); + } + FreeLibrary(handle); + return nullptr; + } +#else + void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + if (!handle) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); + } + return nullptr; + } + auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); + if (!backend_init) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); + } + dlclose(handle); + return nullptr; + } +#endif + ggml_backend_reg_t reg = backend_init(); + if (!reg) { + if (!silent) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); + } + #ifdef _WIN32 + FreeLibrary(handle); + #else + dlclose(handle); + #endif + return nullptr; } + + GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + register_backend(reg, handle); + return reg; + } + + void unload_backend(ggml_backend_reg_t reg, bool silent) { auto it = std::find_if(backends.begin(), backends.end(), [reg](ggml_backend_reg_entry entry) { return entry.reg == reg; }); @@ -258,48 +311,9 @@ ggml_backend_t ggml_backend_init_best(void) { return ggml_backend_dev_init(dev, nullptr); } -typedef ggml_backend_reg_t (*ggml_backend_init_t)(void); - +// Dynamic loading ggml_backend_reg_t ggml_backend_load(const char * path) { -#ifdef _WIN32 - HMODULE handle = LoadLibraryA(path); - if (!handle) { - GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); - return nullptr; - } - ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); - if (!backend_init) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); - FreeLibrary(handle); - return nullptr; - } -#else - void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); - if (!handle) { - GGML_LOG_ERROR("%s: failed 
to load %s: %s\n", __func__, path, dlerror()); - return nullptr; - } - auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); - if (!backend_init) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); - dlclose(handle); - return nullptr; - } -#endif - ggml_backend_reg_t reg = backend_init(); - if (!reg) { - GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); -#ifdef _WIN32 - FreeLibrary(handle); -#else - dlclose(handle); -#endif - return nullptr; - } - - GGML_LOG_DEBUG("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); - get_reg().register_backend(reg, handle); - return reg; + return get_reg().load_backend(path, false); } void ggml_backend_unload(ggml_backend_reg_t reg) { @@ -307,26 +321,82 @@ void ggml_backend_unload(ggml_backend_reg_t reg) { } void ggml_backend_load_all() { + std::vector<std::string> search_prefix; + + // add the executable directory to the search path + // FIXME: this is convenient for development, but it should probably be disabled in production + +#if defined(__APPLE__) + // get executable path + std::vector<char> path; + uint32_t size; + while (true) { + size = path.size(); + if (_NSGetExecutablePath(path.data(), &size) == 0) { + break; + } + path.resize(size); + } + std::string base_path(path.data(), size); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { + base_path = base_path.substr(0, last_slash); + } + search_prefix.push_back(base_path + "/"); +#elif defined(__linux__) + std::string base_path = "."; + std::vector<char> path(1024); + while (true) { + // get executable path + ssize_t len = readlink("/proc/self/exe", path.data(), path.size()); + if (len == -1) { + break; + } + if (len < (ssize_t) path.size()) { + base_path = std::string(path.data(), len); + // remove executable name + auto last_slash = base_path.find_last_of('/'); + if (last_slash != std::string::npos) { +
base_path = base_path.substr(0, last_slash); + } + break; + } + path.resize(path.size() * 2); + } + + search_prefix.push_back(base_path + "/"); +#endif + + auto & reg = get_reg(); + + auto try_load = [&](const std::string & name) { + std::string os_name; #ifdef _WIN32 - #define GGML_BACKEND_PATH(backend) "ggml-" backend ".dll" -#elif defined(__APPLE__) - // path is hardcoded to the cmake build directory for now - // FIXME: should also search default system paths - #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".dylib" + os_name = "ggml-" + name + ".dll"; #else - #define GGML_BACKEND_PATH(backend) "build/ggml/src/ggml-" backend "/libggml-" backend ".so" + os_name = "libggml-" + name + ".so"; #endif - - ggml_backend_load(GGML_BACKEND_PATH("amx")); - ggml_backend_load(GGML_BACKEND_PATH("blas")); - ggml_backend_load(GGML_BACKEND_PATH("cann")); - ggml_backend_load(GGML_BACKEND_PATH("cuda")); - ggml_backend_load(GGML_BACKEND_PATH("hip")); - ggml_backend_load(GGML_BACKEND_PATH("kompute")); - ggml_backend_load(GGML_BACKEND_PATH("metal")); - ggml_backend_load(GGML_BACKEND_PATH("rpc")); - ggml_backend_load(GGML_BACKEND_PATH("sycl")); - ggml_backend_load(GGML_BACKEND_PATH("vulkan")); - ggml_backend_load(GGML_BACKEND_PATH("musa")); - ggml_backend_load(GGML_BACKEND_PATH("cpu")); + if (reg.load_backend(os_name.c_str(), true)) { + return; + } + for (const auto & prefix : search_prefix) { + if (reg.load_backend((prefix + os_name).c_str(), true)) { + return; + } + } + }; + + try_load("amx"); + try_load("blas"); + try_load("cann"); + try_load("cuda"); + try_load("hip"); + try_load("kompute"); + try_load("metal"); + try_load("rpc"); + try_load("sycl"); + try_load("vulkan"); + try_load("musa"); + try_load("cpu"); } diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt index e2cbabf0dae74..0bf3c05d93a89 100644 --- a/ggml/src/ggml-blas/CMakeLists.txt +++ b/ggml/src/ggml-blas/CMakeLists.txt @@ -11,12 +11,9 @@ 
find_package(BLAS) if (BLAS_FOUND) message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}") - add_library(ggml-blas - ggml-blas.cpp - ) - - target_link_libraries(ggml-blas PRIVATE ggml-base) - target_include_directories(ggml-blas PRIVATE . ..) + ggml_add_backend_library(ggml-blas + ggml-blas.cpp + ) if (${GGML_BLAS_VENDOR} MATCHES "Apple") add_compile_definitions(ACCELERATE_NEW_LAPACK) diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 756200b893d02..901327185fb75 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -61,9 +61,9 @@ if (CANN_INSTALL_DIR) file(GLOB GGML_SOURCES_CANN "*.cpp") - add_library(ggml-cann ${GGML_SOURCES_CANN}) - target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES}) - target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS}) + ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN}) + target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES}) + target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS}) target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64) target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 2880523331dbd..c2905d1fbf4e8 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -1,14 +1,13 @@ -add_library(ggml-cpu - ggml-cpu.c - ggml-cpu.cpp - ggml-cpu-aarch64.c - ggml-cpu-aarch64.h - ggml-cpu-quants.c - ggml-cpu-quants.h - ) +ggml_add_backend_library(ggml-cpu + ggml-cpu.c + ggml-cpu.cpp + ggml-cpu-aarch64.c + ggml-cpu-aarch64.h + ggml-cpu-quants.c + ggml-cpu-quants.h + ) -target_link_libraries(ggml-cpu PRIVATE ggml-base) -target_include_directories(ggml-cpu PRIVATE . ..) +target_include_directories(ggml-cpu PRIVATE .) 
if (APPLE AND GGML_ACCELERATE) find_library(ACCELERATE_FRAMEWORK Accelerate) diff --git a/ggml/src/ggml-cuda/CMakeLists.txt b/ggml/src/ggml-cuda/CMakeLists.txt index e1482a269d698..b0cb93e070fd3 100644 --- a/ggml/src/ggml-cuda/CMakeLists.txt +++ b/ggml/src/ggml-cuda/CMakeLists.txt @@ -46,13 +46,10 @@ if (CUDAToolkit_FOUND) list(APPEND GGML_SOURCES_CUDA ${SRCS}) endif() - add_library(ggml-cuda - ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_CUDA} - ) - - target_link_libraries(ggml-cuda PRIVATE ggml-base) - target_include_directories(ggml-cuda PRIVATE . ..) + ggml_add_backend_library(ggml-cuda + ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_CUDA} + ) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) diff --git a/ggml/src/ggml-hip/CMakeLists.txt b/ggml/src/ggml-hip/CMakeLists.txt index fccf8eb8440b8..b15fbd24d6b36 100644 --- a/ggml/src/ggml-hip/CMakeLists.txt +++ b/ggml/src/ggml-hip/CMakeLists.txt @@ -64,12 +64,10 @@ else() list(APPEND GGML_SOURCES_ROCM ${SRCS}) endif() -add_library(ggml-hip - ${GGML_HEADERS_ROCM} - ${GGML_SOURCES_ROCM}) - -target_link_libraries(ggml-hip PRIVATE ggml-base) -target_include_directories(ggml-hip PRIVATE . ..) +ggml_add_backend_library(ggml-hip + ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_ROCM} + ) # TODO: do not use CUDA definitions for HIP target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt index 0bd027c7f537e..dc623926c7685 100644 --- a/ggml/src/ggml-kompute/CMakeLists.txt +++ b/ggml/src/ggml-kompute/CMakeLists.txt @@ -6,13 +6,13 @@ if (NOT glslc_executable) message(FATAL_ERROR "glslc not found") endif() -add_library(ggml-kompute - ggml-kompute.cpp - ../../include/ggml-kompute.h - ) +ggml_add_backend_library(ggml-kompute + ggml-kompute.cpp + ../../include/ggml-kompute.h + ) target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) -target_include_directories(ggml-kompute PRIVATE . .. 
${CMAKE_CURRENT_BINARY_DIR}) +target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) diff --git a/ggml/src/ggml-metal/CMakeLists.txt b/ggml/src/ggml-metal/CMakeLists.txt index b237d79f47ddb..1bad272068244 100644 --- a/ggml/src/ggml-metal/CMakeLists.txt +++ b/ggml/src/ggml-metal/CMakeLists.txt @@ -4,19 +4,16 @@ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED) message(STATUS "Metal framework found") -add_library(ggml-metal - ggml-metal.m - ) +ggml_add_backend_library(ggml-metal + ggml-metal.m + ) target_link_libraries(ggml-metal PRIVATE - ggml-base ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK} ) -target_include_directories(ggml-metal PRIVATE . ..) - if (GGML_METAL_NDEBUG) add_compile_definitions(GGML_METAL_NDEBUG) endif() diff --git a/ggml/src/ggml-musa/CMakeLists.txt b/ggml/src/ggml-musa/CMakeLists.txt index f3c0136920540..e1a69186e669f 100644 --- a/ggml/src/ggml-musa/CMakeLists.txt +++ b/ggml/src/ggml-musa/CMakeLists.txt @@ -47,12 +47,10 @@ if (MUSAToolkit_FOUND) set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22") endforeach() - add_library(ggml-musa - ${GGML_HEADERS_MUSA} - ${GGML_SOURCES_MUSA}) - - target_link_libraries(ggml-musa PRIVATE ggml-base) - target_include_directories(ggml-musa PRIVATE . ..) + ggml_add_backend_library(ggml-musa + ${GGML_HEADERS_MUSA} + ${GGML_SOURCES_MUSA} + ) # TODO: do not use CUDA definitions for MUSA target_compile_definitions(ggml PUBLIC GGML_USE_CUDA) diff --git a/ggml/src/ggml-rpc/CMakeLists.txt b/ggml/src/ggml-rpc/CMakeLists.txt index a2d6770eb053f..f5acb8ec2cb28 100644 --- a/ggml/src/ggml-rpc/CMakeLists.txt +++ b/ggml/src/ggml-rpc/CMakeLists.txt @@ -1,10 +1,8 @@ message(STATUS "Using RPC backend") -add_library(ggml-rpc - ggml-rpc.cpp) - -target_link_libraries(ggml-rpc PRIVATE ggml-base) -target_include_directories(ggml-rpc PRIVATE . ..) 
+ggml_add_backend_library(ggml-rpc + ggml-rpc.cpp + ) if (WIN32) target_link_libraries(ggml-rpc PRIVATE ws2_32) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index d1d0ff83d636c..83f223fd7b6fc 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -16,12 +16,10 @@ endif() message(STATUS "SYCL found") #todo: AOT -add_library(ggml-sycl - ggml-sycl.cpp - ../../include/ggml-sycl.h) - -target_link_libraries(ggml-sycl PRIVATE ggml-base) -target_include_directories(ggml-sycl PRIVATE . ..) +ggml_add_backend_library(ggml-sycl + ggml-sycl.cpp + ../../include/ggml-sycl.h + ) if (GGML_SYCL_F16) if (GGML_SYCL_TARGET STREQUAL "AMD") diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 1e85dd15b7ab1..ae0485e04255d 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -3,13 +3,13 @@ find_package(Vulkan COMPONENTS glslc REQUIRED) if (Vulkan_FOUND) message(STATUS "Vulkan found") - add_library(ggml-vulkan - ggml-vulkan.cpp - ../../include/ggml-vulkan.h - ) + ggml_add_backend_library(ggml-vulkan + ggml-vulkan.cpp + ../../include/ggml-vulkan.h + ) - target_link_libraries(ggml-vulkan PRIVATE ggml-base Vulkan::Vulkan) - target_include_directories(ggml-vulkan PRIVATE . .. 
${CMAKE_CURRENT_BINARY_DIR}) + target_link_libraries(ggml-vulkan PRIVATE Vulkan::Vulkan) + target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector From 53d7f4f658383f2d072e08d703c3975e29e02a91 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 24 Nov 2024 23:54:16 +0100 Subject: [PATCH 08/11] add version checking --- ggml/src/ggml-amx/ggml-amx.cpp | 5 +++-- ggml/src/ggml-backend-impl.h | 4 +++- ggml/src/ggml-backend-reg.cpp | 9 +++++++-- ggml/src/ggml-blas/ggml-blas.cpp | 5 +++-- ggml/src/ggml-cann/ggml-cann.cpp | 11 ++++++----- ggml/src/ggml-cpu/ggml-cpu.cpp | 5 +++-- ggml/src/ggml-cuda/ggml-cuda.cu | 11 ++++++----- ggml/src/ggml-kompute/ggml-kompute.cpp | 5 +++-- ggml/src/ggml-metal/ggml-metal.m | 5 +++-- ggml/src/ggml-rpc/ggml-rpc.cpp | 5 +++-- ggml/src/ggml-sycl/ggml-sycl.cpp | 11 ++++++----- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 5 +++-- 12 files changed, 49 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-amx/ggml-amx.cpp b/ggml/src/ggml-amx/ggml-amx.cpp index 0e13266689e11..6bfb3da274c39 100644 --- a/ggml/src/ggml-amx/ggml-amx.cpp +++ b/ggml/src/ggml-amx/ggml-amx.cpp @@ -409,8 +409,9 @@ static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = { ggml_backend_reg_t ggml_backend_amx_reg(void) { static struct ggml_backend_reg ggml_backend_amx_reg = { - /* .iface = */ ggml_backend_amx_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_amx_reg_i, + /* .context = */ NULL, }; return &ggml_backend_amx_reg; diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h index 4f31104e787ff..dff7749b416dc 100644 --- a/ggml/src/ggml-backend-impl.h +++ b/ggml/src/ggml-backend-impl.h @@ -8,6 +8,8 @@ extern "C" { #endif + #define GGML_BACKEND_API_VERSION 
1 + // // Backend buffer type // @@ -199,7 +201,7 @@ extern "C" { }; struct ggml_backend_reg { - // int api_version; // TODO: for dynamic loading + int api_version; // initialize to GGML_BACKEND_API_VERSION struct ggml_backend_reg_i iface; void * context; }; diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 943691f749a45..e6fb8dd78f46a 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -167,9 +167,14 @@ struct ggml_backend_registry { } #endif ggml_backend_reg_t reg = backend_init(); - if (!reg) { + if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!silent) { - GGML_LOG_ERROR("%s: failed to initialize backend from %s\n", __func__, path); + if (!reg) { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); + } else { + GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", + __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); + } } #ifdef _WIN32 FreeLibrary(handle); diff --git a/ggml/src/ggml-blas/ggml-blas.cpp b/ggml/src/ggml-blas/ggml-blas.cpp index b3f804937a86e..ec158dfac6e3e 100644 --- a/ggml/src/ggml-blas/ggml-blas.cpp +++ b/ggml/src/ggml-blas/ggml-blas.cpp @@ -506,8 +506,9 @@ static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = { ggml_backend_reg_t ggml_backend_blas_reg(void) { static struct ggml_backend_reg ggml_backend_blas_reg = { - /* .iface = */ ggml_backend_blas_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_blas_reg_i, + /* .context = */ NULL, }; return &ggml_backend_blas_reg; diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index e592560c67a2d..d96f65936136d 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -2064,16 +2064,17 @@ ggml_backend_reg_t ggml_backend_cann_reg() { dev_ctx->name = GGML_CANN_NAME + 
std::to_string(i); ggml_cann_set_device(i); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cann_device_interface, - /* .reg = */ &reg, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cann_device_interface, + /* .reg = */ &reg, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cann_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cann_reg_interface, + /* .context = */ ctx }; } diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index 55f88992a81e5..febed433ada2b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -678,8 +678,9 @@ ggml_backend_reg_t ggml_backend_cpu_reg(void) { ggml_cpu_init(); static struct ggml_backend_reg ggml_backend_cpu_reg = { - /* .iface = */ ggml_backend_cpu_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cpu_reg_i, + /* .context = */ NULL, }; return &ggml_backend_cpu_reg; diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index a66b3a9f3ff60..2a78a4393d0f7 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3227,16 +3227,17 @@ ggml_backend_reg_t ggml_backend_cuda_reg() { dev_ctx->description = prop.name; ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_cuda_device_interface, - /* .reg = */ &reg, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_cuda_device_interface, + /* .reg = */ &reg, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_cuda_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cuda_reg_interface, + /* .context = */ ctx }; } diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp
b/ggml/src/ggml-kompute/ggml-kompute.cpp index c2a7b6afa8a80..24566404ded0f 100644 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ b/ggml/src/ggml-kompute/ggml-kompute.cpp @@ -2176,8 +2176,9 @@ static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = { ggml_backend_reg_t ggml_backend_kompute_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_kompute_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_kompute_reg_i, + /* .context = */ nullptr, }; return &reg; diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index a3950e17f5739..6fecb76359ed3 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -4408,8 +4408,9 @@ ggml_backend_reg_t ggml_backend_metal_reg(void) { // TODO: make this thread-safe somehow? { g_ggml_backend_metal_reg = (struct ggml_backend_reg) { - /* .iface = */ ggml_backend_metal_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_metal_reg_i, + /* .context = */ NULL, }; g_ggml_backend_metal_device = (struct ggml_backend_device) { diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 31c1313b94557..43108242639a3 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -1369,8 +1369,9 @@ static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = { ggml_backend_reg_t ggml_backend_rpc_reg(void) { static struct ggml_backend_reg ggml_backend_rpc_reg = { - /* .iface = */ ggml_backend_rpc_reg_i, - /* .context = */ NULL, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_rpc_reg_i, + /* .context = */ NULL, }; return &ggml_backend_rpc_reg; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f36640fd64491..b6392ed8dcc6a 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4637,16 +4637,17 @@
ggml_backend_reg_t ggml_backend_sycl_reg() { dev_ctx->description = prop.get_name(); ggml_backend_dev_t dev = new ggml_backend_device { - /* .interface = */ ggml_backend_sycl_device_interface, - /* .reg = */ &reg, - /* .context = */ dev_ctx + /* .iface = */ ggml_backend_sycl_device_interface, + /* .reg = */ &reg, + /* .context = */ dev_ctx }; ctx->devices.push_back(dev); } reg = ggml_backend_reg { - /* .interface = */ ggml_backend_sycl_reg_interface, - /* .context = */ ctx + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_sycl_reg_interface, + /* .context = */ ctx }; } diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f9e5fabfaa9ee..49527fdf40e94 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -6738,8 +6738,9 @@ static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = { ggml_backend_reg_t ggml_backend_vk_reg() { static ggml_backend_reg reg = { - /* .iface = */ ggml_backend_vk_reg_i, - /* .context = */ nullptr, + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_vk_reg_i, + /* .context = */ nullptr, }; return &reg; From ae99c8fa550ec9a4c707b030637d28b1a24bdc0d Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 25 Nov 2024 03:29:03 +0100 Subject: [PATCH 09/11] suppress error dialogs on windows --- ggml/src/ggml-backend-reg.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index e6fb8dd78f46a..43d03d7fa7385 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -134,14 +134,24 @@ struct ggml_backend_registry { ggml_backend_reg_t load_backend(const char * path, bool silent) { #ifdef _WIN32 + // suppress error dialogs for missing DLLs + DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); + SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); + HMODULE handle = LoadLibraryA(path); + if (!handle) { if
(!silent) { GGML_LOG_ERROR("%s: failed to load %s: %lu\n", __func__, path, GetLastError()); } + SetErrorMode(old_mode); return nullptr; } + ggml_backend_init_t backend_init = (ggml_backend_init_t) GetProcAddress(handle, "ggml_backend_init"); + + SetErrorMode(old_mode); + if (!backend_init) { if (!silent) { GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %lu\n", __func__, path, GetLastError()); @@ -151,13 +161,16 @@ struct ggml_backend_registry { } #else void * handle = dlopen(path, RTLD_NOW | RTLD_LOCAL); + if (!handle) { if (!silent) { GGML_LOG_ERROR("%s: failed to load %s: %s\n", __func__, path, dlerror()); } return nullptr; } + auto * backend_init = (ggml_backend_init_t) dlsym(handle, "ggml_backend_init"); + if (!backend_init) { if (!silent) { GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s: %s\n", __func__, path, dlerror()); @@ -167,6 +180,7 @@ struct ggml_backend_registry { } #endif ggml_backend_reg_t reg = backend_init(); + if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!silent) { if (!reg) { @@ -176,11 +190,11 @@ struct ggml_backend_registry { __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); } } - #ifdef _WIN32 +#ifdef _WIN32 FreeLibrary(handle); - #else +#else dlclose(handle); - #endif +#endif return nullptr; } From 6d19135b9b7f1e41e507b36438af955b6a801143 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 25 Nov 2024 03:34:29 +0100 Subject: [PATCH 10/11] add cpu backend to the swift build --- Package.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Package.swift b/Package.swift index 6b68aecdebec1..d9e8a4e2d21d7 100644 --- a/Package.swift +++ b/Package.swift @@ -43,7 +43,8 @@ linkerSettings.append(.linkedFramework("Accelerate")) cSettings.append( contentsOf: [ .define("GGML_USE_ACCELERATE"), - .define("GGML_USE_METAL") + .define("GGML_USE_METAL"), + .define("GGML_USE_CPU") ] ) #endif From b81e5ca026370fb521c9c292d3abb30c47ee3729 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 25 
Nov 2024 03:47:48 +0100 Subject: [PATCH 11/11] remove eval-callback test hack since the backend loader now checks the executable directory --- examples/eval-callback/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt index 89264242850ba..5d1048aad74b6 100644 --- a/examples/eval-callback/CMakeLists.txt +++ b/examples/eval-callback/CMakeLists.txt @@ -6,6 +6,5 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11) set(TEST_TARGET test-eval-callback) add_test(NAME ${TEST_TARGET} - COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0 - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/../../) # HACK for dl backends + COMMAND llama-eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)