From aa1fcba59fef8f3685f2851ac1de4b4420c69cd1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 3 Oct 2024 14:00:17 +0200 Subject: [PATCH 01/92] feat(llamacpp): initial commit # Conflicts: # Cargo.lock --- Cargo.toml | 2 +- backends/llamacpp/CMakeLists.txt | 28 ++++++++++++ backends/llamacpp/Cargo.toml | 8 ++++ backends/llamacpp/cmake/fmt.cmake | 6 +++ backends/llamacpp/cmake/spdlog.cmake | 17 +++++++ backends/llamacpp/csrc/backend.cpp | 66 ++++++++++++++++++++++++++++ backends/llamacpp/csrc/backend.hpp | 28 ++++++++++++ backends/llamacpp/src/main.rs | 3 ++ 8 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 backends/llamacpp/CMakeLists.txt create mode 100644 backends/llamacpp/Cargo.toml create mode 100644 backends/llamacpp/cmake/fmt.cmake create mode 100644 backends/llamacpp/cmake/spdlog.cmake create mode 100644 backends/llamacpp/csrc/backend.cpp create mode 100644 backends/llamacpp/csrc/backend.hpp create mode 100644 backends/llamacpp/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 9a7e76c412b..f3ab5ee546f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,7 +7,7 @@ members = [ "backends/trtllm", "launcher", "router" -] +, "backends/llamacpp"] default-members = [ "benchmark", "backends/v2", diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt new file mode 100644 index 00000000000..2f9026f1656 --- /dev/null +++ b/backends/llamacpp/CMakeLists.txt @@ -0,0 +1,28 @@ +cmake_minimum_required(VERSION 3.20) + +project(tgi-llama-cpp-backend VERSION 1.0.0) +set(CMAKE_CXX_STANDARD 20) + +include(FetchContent) + +set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against") + + +# Add dependencies +include(cmake/fmt.cmake) +include(cmake/spdlog.cmake) + +# Download llama.cpp repo at the specific version +fetchcontent_declare( + llama +# DOWNLOAD_EXTRACT_TIMESTAMP TRUE + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git + GIT_TAG b3837 + GIT_SHALLOW FALSE +) + 
+fetchcontent_makeavailable(llama) + +add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) +target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) +target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common) diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml new file mode 100644 index 00000000000..2e8ed7ddca9 --- /dev/null +++ b/backends/llamacpp/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "text-generation-backend-llamacpp" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true + +[dependencies] diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake new file mode 100644 index 00000000000..f94a9c5668f --- /dev/null +++ b/backends/llamacpp/cmake/fmt.cmake @@ -0,0 +1,6 @@ +FetchContent_Declare( + fmt + GIT_REPOSITORY https://github.com/fmtlib/fmt + GIT_TAG 11.0.1 +) +FetchContent_MakeAvailable(fmt) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake new file mode 100644 index 00000000000..c4ee5c97a58 --- /dev/null +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -0,0 +1,17 @@ +set(SPDLOG_USE_FMT ON) +set(SPDLOG_BUILD_SHARED OFF) +set(SPDLOG_FMT_EXTERNAL ON) + +# Define the level at which SPDLOG_ compilation level is defined +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) +else () + add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) +endif () + +fetchcontent_declare( + spdlog + GIT_REPOSITORY https://github.com/gabime/spdlog.git + GIT_TAG v1.14.1 +) +fetchcontent_makeavailable(spdlog) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp new file mode 100644 index 00000000000..9ce1dbc92ff --- /dev/null +++ b/backends/llamacpp/csrc/backend.cpp @@ -0,0 +1,66 @@ +// +// Created by Morgan Funtowicz on 9/28/2024. 
+// + +#include +#include +#include +#include +#include "backend.hpp" + +namespace huggingface::tgi::backends::llama { + + std::unique_ptr CreateLlamaCppBackend(std::string_view root) { + SPDLOG_INFO(FMT_STRING("Loading model from {}"), root); + gpt_init(); + + // Fake argv + std::vector args = {"tgi_llama_cpp_backend", "--model", root}; + std::vector argv; + for(const auto& arg : args) { + argv.push_back(const_cast(arg.data())); + } + argv.push_back(nullptr); + + // Create the GPT parameters + gpt_params params; + if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) { + throw std::runtime_error("Failed to create GPT Params from model"); + } + + + // Create the inference engine + SPDLOG_INFO("Allocating llama.cpp model from gpt_params"); + auto result = llama_init_from_gpt_params(params); + + // Unpack all the inference engine components + auto model = result.model; + auto context = result.context; + auto loras = result.lora_adapters; + + // Make sure everything is correctly initialized + if(model == nullptr) + throw std::runtime_error(fmt::format("Failed to load model from {}", root)); + + return std::make_unique(model, context); + } + + TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) + : model(model), ctx(ctx), batch() { + + } + + TgiLlamaCppBackend::~TgiLlamaCppBackend() { + if(model) + { + SPDLOG_DEBUG("Freeing llama.cpp model"); + llama_free_model(model); + } + + if(ctx) + { + SPDLOG_DEBUG("Freeing llama.cpp context"); + llama_free(ctx); + } + } +} \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp new file mode 100644 index 00000000000..a643454e756 --- /dev/null +++ b/backends/llamacpp/csrc/backend.hpp @@ -0,0 +1,28 @@ +// +// Created by Morgan Funtowicz on 9/28/2024. 
+// + +#ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP + +#include +#include + +namespace huggingface::tgi::backends::llama { + const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; + + + class TgiLlamaCppBackend { + private: + llama_model* model; + llama_context* ctx; + llama_batch batch; + public: + TgiLlamaCppBackend(llama_model* const model, llama_context* const); + ~TgiLlamaCppBackend(); + }; + + std::unique_ptr CreateLlamaCppBackend(std::string_view root); +} + +#endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs new file mode 100644 index 00000000000..e7a11a969c0 --- /dev/null +++ b/backends/llamacpp/src/main.rs @@ -0,0 +1,3 @@ +fn main() { + println!("Hello, world!"); +} From 7d1f8a2bd6695be7a3efd6512c70093c6ae22d6d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 3 Oct 2024 15:25:15 +0200 Subject: [PATCH 02/92] feat(llamacpp): correctly handle CMAKE_BUILD_TYPE for spdlog macros --- backends/llamacpp/cmake/spdlog.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index c4ee5c97a58..9cd210dd1d1 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -3,7 +3,7 @@ set(SPDLOG_BUILD_SHARED OFF) set(SPDLOG_FMT_EXTERNAL ON) # Define the level at which SPDLOG_ compilation level is defined -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") +if (CMAKE_BUILD_TYPE STREQUAL "Debug") add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) else () add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) From 52d57dca798f7eb0ba92b91733e33579921fa03a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 4 Oct 2024 10:42:31 +0200 Subject: [PATCH 03/92] feat(llamacpp): initial end2end build --- backends/llamacpp/CMakeLists.txt | 18 ++- backends/llamacpp/Cargo.toml | 17 +++ backends/llamacpp/build.rs | 94 +++++++++++++ 
backends/llamacpp/cmake/spdlog.cmake | 7 +- backends/llamacpp/csrc/backend.cpp | 11 +- backends/llamacpp/csrc/backend.hpp | 7 +- backends/llamacpp/offline/main.cpp | 22 +++ backends/llamacpp/src/backend.rs | 18 +++ backends/llamacpp/src/lib.rs | 11 ++ backends/llamacpp/src/main.rs | 203 ++++++++++++++++++++++++++- backends/trtllm/CMakeLists.txt | 2 + 11 files changed, 398 insertions(+), 12 deletions(-) create mode 100644 backends/llamacpp/build.rs create mode 100644 backends/llamacpp/offline/main.cpp create mode 100644 backends/llamacpp/src/backend.rs create mode 100644 backends/llamacpp/src/lib.rs diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 2f9026f1656..4671314f3dc 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -6,12 +6,18 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against") - +option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") +option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") # Add dependencies include(cmake/fmt.cmake) include(cmake/spdlog.cmake) +if(${LLAMA_CPP_BUILD_CUDA}) + message(STATUS "Enabling llama.cpp CUDA support") + set(GGML_CUDA ON) +endif() + # Download llama.cpp repo at the specific version fetchcontent_declare( llama @@ -25,4 +31,12 @@ fetchcontent_makeavailable(llama) add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llama_cpp_backend_impl fmt::fmt spdlog::spdlog llama common) +target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) + +if(${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) + message(STATUS "Building llama.cpp offline runner") + add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) + target_link_libraries(tgi_llama_cpp_offline_runner 
tgi_llama_cpp_backend_impl) +endif() + + diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 2e8ed7ddca9..fdd980c308f 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -6,3 +6,20 @@ authors.workspace = true homepage.workspace = true [dependencies] +clap = { version = "4.5.19", features = ["derive"] } +cxx = "1.0" +hf-hub = { workspace = true } +image = { version = "0.25.1", features = ["default-formats"] } +metrics = { workspace = true } +metrics-exporter-prometheus = { workspace = true } +serde_json = "1.0.128" +text-generation-router = { path = "../../router" } +thiserror = "1.0.64" +tokio = "1.40.0" +tokio-stream = "0.1.16" +tokenizers = { workspace = true } + +[build-dependencies] +cmake = "0.1" +cxx-build = { version = "1.0", features = ["parallel"] } +pkg-config = "0.3" \ No newline at end of file diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs new file mode 100644 index 00000000000..4e8859aba7e --- /dev/null +++ b/backends/llamacpp/build.rs @@ -0,0 +1,94 @@ +use cxx_build::CFG; +use std::env; +use std::path::PathBuf; + +const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl"; +const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; +const MPI_REQUIRED_VERSION: &str = "4.1"; + +macro_rules! 
probe { + ($name: expr, $version: expr) => { + if let Err(_) = pkg_config::probe_library($name) { + pkg_config::probe_library(&format!("{}-{}", $name, $version)) + .expect(&format!("Failed to locate {}", $name)); + } + }; +} + +fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf { + let install_path = env::var("CMAKE_INSTALL_PREFIX") + .map(|val| PathBuf::from(val)) + .unwrap_or(out_dir.join("dist")); + + let _ = cmake::Config::new(".") + .uses_cxx11() + .generator("Ninja") + .profile(match is_debug { + true => "Debug", + false => "Release", + }) + .env("OPT_LEVEL", opt_level) + .define("CMAKE_INSTALL_PREFIX", &install_path) + // .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") + // .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) + // .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path) + .build(); + + // Additional transitive CMake dependencies + let deps_folder = out_dir.join("build").join("_deps"); + for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES { + let dep_name = match is_debug { + true => format!("{}d", dependency), + false => String::from(dependency), + }; + let dep_path = deps_folder.join(format!("{}-build", dependency)); + println!("cargo:rustc-link-search={}", dep_path.display()); + println!("cargo:rustc-link-lib=static={}", dep_name); + } + + let deps_folder = out_dir.join("build").join("_deps"); + deps_folder +} + +fn build_ffi_layer(deps_folder: &PathBuf) { + println!("cargo:warning={}", &deps_folder.display()); + CFG.include_prefix = "backends/llamacpp"; + cxx_build::bridge("src/lib.rs") + .static_flag(true) + .include(deps_folder.join("fmt-src").join("include")) + .include(deps_folder.join("spdlog-src").join("include")) + .include(deps_folder.join("llama-src").join("common")) + .include(deps_folder.join("llama-src").join("ggml").join("include")) + .include(deps_folder.join("llama-src").join("include")) + .file("csrc/backend.cpp") + .std("c++20") + .compile(CMAKE_LLAMA_CPP_TARGET); + + 
println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=csrc/backend.hpp"); + println!("cargo:rerun-if-changed=csrc/backend.cpp"); +} + +fn main() { + // Misc variables + let out_dir = PathBuf::from(env::var("OUT_DIR").unwrap()); + let build_profile = env::var("PROFILE").unwrap(); + let (is_debug, opt_level) = match build_profile.as_ref() { + "debug" => (true, "0"), + _ => (false, "3"), + }; + + // Build the backend + let deps_folder = build_backend(is_debug, opt_level, &out_dir); + + // Build the FFI layer calling the backend above + build_ffi_layer(&deps_folder); + + // Emit linkage search path + probe!("ompi", MPI_REQUIRED_VERSION); + + // Backend + // BACKEND_DEPS.iter().for_each(|name| { + // println!("cargo:rustc-link-lib=static={}", name); + // }); +} diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index 9cd210dd1d1..68658ba5019 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -4,9 +4,10 @@ set(SPDLOG_FMT_EXTERNAL ON) # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG) -else () - add_compile_definitions(SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO) + message(STATUS "Verbose logging is enabled in debug build") + add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG) +else() + add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO) endif () fetchcontent_declare( diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 9ce1dbc92ff..875fdb684bf 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -46,8 +46,11 @@ namespace huggingface::tgi::backends::llama { } TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx), batch() { - + : model(model), ctx(ctx), batch() + { + char modelName[128]; 
+ llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); + SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); } TgiLlamaCppBackend::~TgiLlamaCppBackend() { @@ -63,4 +66,8 @@ namespace huggingface::tgi::backends::llama { llama_free(ctx); } } + + void TgiLlamaCppBackend::schedule() { + std::vector tokens; + } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index a643454e756..7e3c9020c93 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -1,7 +1,6 @@ // // Created by Morgan Funtowicz on 9/28/2024. // - #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP @@ -9,7 +8,7 @@ #include namespace huggingface::tgi::backends::llama { - const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; +// const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; class TgiLlamaCppBackend { @@ -18,8 +17,10 @@ namespace huggingface::tgi::backends::llama { llama_context* ctx; llama_batch batch; public: - TgiLlamaCppBackend(llama_model* const model, llama_context* const); + TgiLlamaCppBackend(llama_model *model, llama_context *ctx); ~TgiLlamaCppBackend(); + + void schedule(); }; std::unique_ptr CreateLlamaCppBackend(std::string_view root); diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp new file mode 100644 index 00000000000..4009588d4d1 --- /dev/null +++ b/backends/llamacpp/offline/main.cpp @@ -0,0 +1,22 @@ +// +// Created by mfuntowicz on 10/3/24. 
+// + +#include +#include +#include +#include +#include "../csrc/backend.hpp" + +int main(int argc, char** argv) { + if(argc < 2) { + fmt::print("No model folder provider"); + return 1; + } + + spdlog::set_level(spdlog::level::debug); + + const std::string_view model_root = argv[1]; + auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(model_root); + fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", model_root); +} \ No newline at end of file diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs new file mode 100644 index 00000000000..8af1067b9d4 --- /dev/null +++ b/backends/llamacpp/src/backend.rs @@ -0,0 +1,18 @@ +use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; +use text_generation_router::validation::ValidGenerateRequest; +use tokio_stream::wrappers::UnboundedReceiverStream; + +pub struct TgiLlamaCppBakend {} + +impl Backend for TgiLlamaCppBakend { + fn schedule( + &self, + request: ValidGenerateRequest, + ) -> Result>, InferError> { + Err(InferError::GenerationError("Not implemented yet".into())) + } + + async fn health(&self, current_health: bool) -> bool { + todo!() + } +} diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs new file mode 100644 index 00000000000..d4c3caf9a0c --- /dev/null +++ b/backends/llamacpp/src/lib.rs @@ -0,0 +1,11 @@ +pub mod backend; + +#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] +mod ffi { + unsafe extern "C++" { + include!("backends/llamacpp/csrc/backend.cpp"); + + /// Represent an instance of the llama.cpp backend instance on C++ side + type LlamaCppBackendImpl; + } +} diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index e7a11a969c0..7226473c70f 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,3 +1,202 @@ -fn main() { - println!("Hello, world!"); +use clap::{Parser, Subcommand}; +use 
text_generation_router::{server, usage_stats}; +use thiserror::Error; +use text_generation_router::server::ApiDoc; + +/// App Configuration +#[derive(Parser, Debug)] +#[clap(author, version, about, long_about = None)] +struct Args { + #[command(subcommand)] + command: Option, + + #[clap(default_value = "128", long, env)] + max_concurrent_requests: usize, + #[clap(default_value = "2", long, env)] + max_best_of: usize, + #[clap(default_value = "4", long, env)] + max_stop_sequences: usize, + #[clap(default_value = "5", long, env)] + max_top_n_tokens: u32, + #[clap(default_value = "1024", long, env)] + max_input_tokens: usize, + #[clap(default_value = "2048", long, env)] + max_total_tokens: usize, + #[clap(default_value = "1.2", long, env)] + waiting_served_ratio: f32, + #[clap(default_value = "4096", long, env)] + max_batch_prefill_tokens: u32, + #[clap(long, env)] + max_batch_total_tokens: Option, + #[clap(default_value = "20", long, env)] + max_waiting_tokens: usize, + #[clap(long, env)] + max_batch_size: Option, + #[clap(default_value = "0.0.0.0", long, env)] + hostname: String, + #[clap(default_value = "3000", long, short, env)] + port: u16, + #[clap(default_value = "/tmp/text-generation-server-0", long, env)] + master_shard_uds_path: String, + #[clap(default_value = "bigscience/bloom", long, env)] + tokenizer_name: String, + #[clap(long, env)] + tokenizer_config_path: Option, + #[clap(long, env)] + revision: Option, + #[clap(default_value = "2", long, env)] + validation_workers: usize, + #[clap(long, env)] + api_key: Option, + #[clap(long, env)] + json_output: bool, + #[clap(long, env)] + otlp_endpoint: Option, + #[clap(default_value = "text-generation-inference.router", long, env)] + otlp_service_name: String, + #[clap(long, env)] + cors_allow_origin: Option>, + #[clap(long, env)] + ngrok: bool, + #[clap(long, env)] + ngrok_authtoken: Option, + #[clap(long, env)] + ngrok_edge: Option, + #[clap(long, env, default_value_t = false)] + messages_api_enabled: bool, + 
#[clap(long, env, default_value_t = false)] + disable_grammar_support: bool, + #[clap(default_value = "4", long, env)] + max_client_batch_size: usize, + #[clap(default_value = "on", long, env)] + usage_stats: usage_stats::UsageStatsLevel, } + +#[derive(Debug, Subcommand)] +enum Commands { + PrintSchema, +} + +#[tokio::main] +async fn main() -> Result<(), RouterError> { + // Get args + let args = Args::parse(); + // Pattern match configuration + let Args { + command, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + waiting_served_ratio, + max_batch_prefill_tokens, + max_batch_total_tokens, + max_waiting_tokens, + max_batch_size, + hostname, + port, + master_shard_uds_path, + tokenizer_name, + tokenizer_config_path, + revision, + validation_workers, + api_key, + json_output, + otlp_endpoint, + otlp_service_name, + cors_allow_origin, + ngrok, + ngrok_authtoken, + ngrok_edge, + messages_api_enabled, + disable_grammar_support, + max_client_batch_size, + usage_stats, + } = args; + + if let Some(Commands::PrintSchema) = command { + use utoipa::OpenApi; + let api_doc = ApiDoc::openapi(); + let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); + println!("{}", api_doc); + std::process::exit(0); + }; + text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); + + // Validate args + if max_input_tokens >= max_total_tokens { + return Err(RouterError::ArgumentValidation( + "`max_input_tokens` must be < `max_total_tokens`".to_string(), + )); + } + if max_input_tokens as u32 > max_batch_prefill_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be >= `max_input_tokens`. 
Given: {max_batch_prefill_tokens} and {max_input_tokens}"))); + } + + if validation_workers == 0 { + return Err(RouterError::ArgumentValidation( + "`validation_workers` must be > 0".to_string(), + )); + } + + if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { + if max_batch_prefill_tokens > *max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); + } + if max_total_tokens as u32 > *max_batch_total_tokens { + return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. Given: {max_total_tokens} and {max_batch_total_tokens}"))); + } + } + + if let Some(max_batch_size) = max_batch_size { + if max_batch_size == 0 { + return Err(RouterError::ArgumentValidation( + "`max_batch_size` must be > 0".to_string(), + )); + } + } + + let backend = LlamaCppBackend::new(); + + // Run server + server::run( + backend, + max_concurrent_requests, + max_best_of, + max_stop_sequences, + max_top_n_tokens, + max_input_tokens, + max_total_tokens, + validation_workers, + api_key, + tokenizer_name, + tokenizer_config_path, + revision, + hostname, + port, + cors_allow_origin, + ngrok, + ngrok_authtoken, + ngrok_edge, + messages_api_enabled, + disable_grammar_support, + max_client_batch_size, + usage_stats, + ) + .await?; + Ok(()) +} + +#[derive(Debug, Error)] +enum RouterError { + #[error("Argument validation error: {0}")] + ArgumentValidation(String), + #[error("Backend failed: {0}")] + Backend(#[from] V3Error), + #[error("WebServer error: {0}")] + WebServer(#[from] server::WebServerError), + #[error("Tokio runtime failed to start: {0}")] + Tokio(#[from] std::io::Error), +} \ No newline at end of file diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 831372cdf99..80b2b4305af 100644 --- a/backends/trtllm/CMakeLists.txt +++ 
b/backends/trtllm/CMakeLists.txt @@ -18,6 +18,8 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) include(ExternalProject) +set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--allow-unsupported-compiler -ccbin=gcc") + option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") From e4432d36b1dbcdd53d614072cde4f08734e726b1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 18 Oct 2024 17:10:22 +0200 Subject: [PATCH 04/92] misc(cmake): add parameter to build specific cuda arch --- backends/llamacpp/CMakeLists.txt | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 4671314f3dc..890d99daa99 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -1,11 +1,12 @@ -cmake_minimum_required(VERSION 3.20) +cmake_minimum_required(VERSION 3.24) project(tgi-llama-cpp-backend VERSION 1.0.0) -set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD 23) include(FetchContent) -set(LLAMA_CPP_TARGET_VERSION "b3837" STRING "Version of llama.cpp to build against") +set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against") +set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build") option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") @@ -13,18 +14,22 @@ option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama. 
include(cmake/fmt.cmake) include(cmake/spdlog.cmake) -if(${LLAMA_CPP_BUILD_CUDA}) +if (${LLAMA_CPP_BUILD_CUDA}) message(STATUS "Enabling llama.cpp CUDA support") + + if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES ${LLAMA_CPP_TARGET_CUDA_ARCHS}) + endif () set(GGML_CUDA ON) -endif() +endif () # Download llama.cpp repo at the specific version fetchcontent_declare( - llama -# DOWNLOAD_EXTRACT_TIMESTAMP TRUE - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3837 - GIT_SHALLOW FALSE + llama + # DOWNLOAD_EXTRACT_TIMESTAMP TRUE + GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git + GIT_TAG b3837 + GIT_SHALLOW FALSE ) fetchcontent_makeavailable(llama) @@ -33,10 +38,10 @@ add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) -if(${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) +if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) target_link_libraries(tgi_llama_cpp_offline_runner tgi_llama_cpp_backend_impl) -endif() +endif () From fa89d1e613c6f8971e14d84aba821d20984967cd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 09:14:35 +0200 Subject: [PATCH 05/92] misc(cmake): wut --- Cargo.lock | 21 +++++++++++++++++++++ LICENSE | 3 ++- backends/llamacpp/src/lib.rs | 2 +- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 72441430240..4075556bfef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4183,6 +4183,27 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "text-generation-backend-llamacpp" +version = "2.4.1-dev0" +dependencies = [ + "clap 4.5.20", + "cmake", + "cxx", + "cxx-build", + "hf-hub", + "image", + "metrics", + "metrics-exporter-prometheus", + "pkg-config", + "serde_json", + 
"text-generation-router", + "thiserror", + "tokenizers", + "tokio", + "tokio-stream", +] + [[package]] name = "text-generation-backends-trtllm" version = "2.4.1-dev0" diff --git a/LICENSE b/LICENSE index 7d0e80345c7..d6456956733 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,4 @@ + Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ @@ -186,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2022 Hugging Face + Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index d4c3caf9a0c..bea7c06fc65 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,6 +1,6 @@ pub mod backend; -#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] +#[cxx::bridge(namespace = "huggingface::tgi::backends::llama::impl")] mod ffi { unsafe extern "C++" { include!("backends/llamacpp/csrc/backend.cpp"); From 05ad68467625ac9c1c6831b43bd4359454387820 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 21 Oct 2024 09:14:51 +0200 Subject: [PATCH 06/92] feat(llamacpp): enable cuda --- backends/llamacpp/build.rs | 12 +++++++---- backends/llamacpp/csrc/backend.cpp | 33 +++++++++++++++++------------- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 4e8859aba7e..26ea8d929b9 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -2,6 +2,7 @@ use cxx_build::CFG; use std::env; use std::path::PathBuf; +const CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS: &str = "75-real;80-real;86-real;89-real;90-real"; const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl"; const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; const MPI_REQUIRED_VERSION: &str = "4.1"; @@ -20,6 +21,10 
@@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf .map(|val| PathBuf::from(val)) .unwrap_or(out_dir.join("dist")); + let build_cuda = option_env!("LLAMA_CPP_BUILD_CUDA").unwrap_or("OFF"); + let cuda_archs = + option_env!("LLAMA_CPP_TARGET_CUDA_ARCHS").unwrap_or(CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS); + let _ = cmake::Config::new(".") .uses_cxx11() .generator("Ninja") @@ -29,9 +34,8 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf }) .env("OPT_LEVEL", opt_level) .define("CMAKE_INSTALL_PREFIX", &install_path) - // .define("CMAKE_CUDA_COMPILER", "/usr/local/cuda/bin/nvcc") - // .define("TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST", cuda_arch_list) - // .define("TGI_TRTLLM_BACKEND_TRT_ROOT", tensorrt_path) + .define("LLAMA_CPP_BUILD_CUDA", build_cuda) + .define("LLAMA_CPP_TARGET_CUDA_ARCHS", cuda_archs) .build(); // Additional transitive CMake dependencies @@ -61,7 +65,7 @@ fn build_ffi_layer(deps_folder: &PathBuf) { .include(deps_folder.join("llama-src").join("ggml").join("include")) .include(deps_folder.join("llama-src").join("include")) .file("csrc/backend.cpp") - .std("c++20") + .std("c++23") .compile(CMAKE_LLAMA_CPP_TARGET); println!("cargo:rerun-if-changed=CMakeLists.txt"); diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 875fdb684bf..38a94c8ac5d 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -10,14 +10,15 @@ namespace huggingface::tgi::backends::llama { - std::unique_ptr CreateLlamaCppBackend(std::string_view root) { + std::unique_ptr + CreateLlamaCppBackend(std::string_view root) { SPDLOG_INFO(FMT_STRING("Loading model from {}"), root); gpt_init(); // Fake argv std::vector args = {"tgi_llama_cpp_backend", "--model", root}; - std::vector argv; - for(const auto& arg : args) { + std::vector argv; + for (const auto &arg: args) { argv.push_back(const_cast(arg.data())); } argv.push_back(nullptr); @@ -39,35 +40,39 @@ 
namespace huggingface::tgi::backends::llama { auto loras = result.lora_adapters; // Make sure everything is correctly initialized - if(model == nullptr) + if (model == nullptr) throw std::runtime_error(fmt::format("Failed to load model from {}", root)); - return std::make_unique(model, context); + return std::make_unique(model, context); } - TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx), batch() - { + huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, + llama_context *const ctx) + : model(model), ctx(ctx), batch() { char modelName[128]; llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); } - TgiLlamaCppBackend::~TgiLlamaCppBackend() { - if(model) - { + huggingface::tgi::backends::llama::TgiLlamaCppBackend::~TgiLlamaCppBackend() { + if (model) { SPDLOG_DEBUG("Freeing llama.cpp model"); llama_free_model(model); } - if(ctx) - { + if (ctx) { SPDLOG_DEBUG("Freeing llama.cpp context"); llama_free(ctx); } } - void TgiLlamaCppBackend::schedule() { + void huggingface::tgi::backends::llama::TgiLlamaCppBackend::schedule() { std::vector tokens; } + + namespace impl { + class LlamaCppBackendImpl { + + }; + } } \ No newline at end of file From 091107632068a8d9f24ff4505e4264015e5da101 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 22 Oct 2024 15:22:56 +0200 Subject: [PATCH 07/92] feat(backend): correctly load llama.cpp model from llama api and not gpt2 --- backends/llamacpp/csrc/backend.cpp | 54 ++++++++++++------------------ backends/llamacpp/csrc/backend.hpp | 8 ++++- backends/llamacpp/offline/main.cpp | 7 ++-- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 38a94c8ac5d..332bb4d5f0d 100644 --- a/backends/llamacpp/csrc/backend.cpp 
+++ b/backends/llamacpp/csrc/backend.cpp @@ -2,52 +2,40 @@ // Created by Morgan Funtowicz on 9/28/2024. // -#include -#include +#include +#include +#include +#include #include +#include #include #include "backend.hpp" namespace huggingface::tgi::backends::llama { - std::unique_ptr - CreateLlamaCppBackend(std::string_view root) { - SPDLOG_INFO(FMT_STRING("Loading model from {}"), root); - gpt_init(); + std::expected, TgiLlamaCppBackendError> + CreateLlamaCppBackend(const std::filesystem::path& modelPath) { + SPDLOG_INFO(FMT_STRING("Loading model from {}"), modelPath); + llama_backend_init(); + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); - // Fake argv - std::vector args = {"tgi_llama_cpp_backend", "--model", root}; - std::vector argv; - for (const auto &arg: args) { - argv.push_back(const_cast(arg.data())); + // Load the model + if(!exists(modelPath)) { + return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); } - argv.push_back(nullptr); - // Create the GPT parameters - gpt_params params; - if (!gpt_params_parse(args.size(), argv.data(), params, LLAMA_EXAMPLE_SERVER)) { - throw std::runtime_error("Failed to create GPT Params from model"); - } - - - // Create the inference engine - SPDLOG_INFO("Allocating llama.cpp model from gpt_params"); - auto result = llama_init_from_gpt_params(params); - - // Unpack all the inference engine components - auto model = result.model; - auto context = result.context; - auto loras = result.lora_adapters; - - // Make sure everything is correctly initialized - if (model == nullptr) - throw std::runtime_error(fmt::format("Failed to load model from {}", root)); + auto params = llama_model_default_params(); + auto* model = llama_load_model_from_file(modelPath.c_str(), params); + auto* context = llama_new_context_with_model(model, { + .n_batch = 1, + .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, + .flash_attn = true, + }); return std::make_unique(model, context); } - 
huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, - llama_context *const ctx) + huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) : model(model), ctx(ctx), batch() { char modelName[128]; llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 7e3c9020c93..bcf728dbf33 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -4,12 +4,17 @@ #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#include #include #include namespace huggingface::tgi::backends::llama { // const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; + enum TgiLlamaCppBackendError { + MODEL_FILE_DOESNT_EXIST = 1 + }; + class TgiLlamaCppBackend { private: @@ -23,7 +28,8 @@ namespace huggingface::tgi::backends::llama { void schedule(); }; - std::unique_ptr CreateLlamaCppBackend(std::string_view root); + std::expected, TgiLlamaCppBackendError> + CreateLlamaCppBackend(const std::filesystem::path& root); } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 4009588d4d1..2f50cac1ef0 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include "../csrc/backend.hpp" @@ -16,7 +17,7 @@ int main(int argc, char** argv) { spdlog::set_level(spdlog::level::debug); - const std::string_view model_root = argv[1]; - auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(model_root); - fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", model_root); + const auto modelPath = absolute(std::filesystem::path(argv[1])); + if(auto backend = 
huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); backend.has_value()) + fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", modelPath); } \ No newline at end of file From 098c66920d7e70e0221f0ebb34bec29f84b1cfe5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 22 Oct 2024 15:23:16 +0200 Subject: [PATCH 08/92] feat(backend): tell cmake to build llama-common and link to it --- backends/llamacpp/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 890d99daa99..9f08d0f3a0c 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -6,6 +6,7 @@ set(CMAKE_CXX_STANDARD 23) include(FetchContent) set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against") +set(LLAMA_BUILD_COMMON ON) set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build") option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") @@ -28,7 +29,7 @@ fetchcontent_declare( llama # DOWNLOAD_EXTRACT_TIMESTAMP TRUE GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3837 + GIT_TAG b3958 GIT_SHALLOW FALSE ) @@ -41,7 +42,8 @@ target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llama_cpp_offline_runner tgi_llama_cpp_backend_impl) + + target_link_libraries(tgi_llama_cpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) endif () From 45d5a6a8c5b21144cecf1db550822077148925c9 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 00:09:10 +0200 
Subject: [PATCH 09/92] feat(backend): add some initial decoding steps --- backends/llamacpp/csrc/backend.cpp | 103 ++++++++++++++++++++++++----- backends/llamacpp/csrc/backend.hpp | 41 ++++++++++-- backends/llamacpp/offline/main.cpp | 26 ++++++-- 3 files changed, 146 insertions(+), 24 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 332bb4d5f0d..859041c20eb 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -2,20 +2,23 @@ // Created by Morgan Funtowicz on 9/28/2024. // -#include #include +#include + #include #include +#include #include #include #include + #include "backend.hpp" namespace huggingface::tgi::backends::llama { std::expected, TgiLlamaCppBackendError> CreateLlamaCppBackend(const std::filesystem::path& modelPath) { - SPDLOG_INFO(FMT_STRING("Loading model from {}"), modelPath); + SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); @@ -28,39 +31,109 @@ namespace huggingface::tgi::backends::llama { auto* model = llama_load_model_from_file(modelPath.c_str(), params); auto* context = llama_new_context_with_model(model, { .n_batch = 1, + .n_threads = 16, .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, - .flash_attn = true, + .flash_attn = false, }); return std::make_unique(model, context); } huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx), batch() { - char modelName[128]; - llama_model_meta_val_str(model, "general.name", modelName, sizeof(modelName)); + : model(model), ctx(ctx) { +#ifndef NDEBUG + char modelName[256]; + llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); +#endif } 
huggingface::tgi::backends::llama::TgiLlamaCppBackend::~TgiLlamaCppBackend() { - if (model) { + if (ctx) { + SPDLOG_DEBUG("Freeing llama.cpp context"); + llama_free(ctx); + } + + if(model) { SPDLOG_DEBUG("Freeing llama.cpp model"); llama_free_model(model); } + } - if (ctx) { - SPDLOG_DEBUG("Freeing llama.cpp context"); - llama_free(ctx); + std::vector TgiLlamaCppBackend::Tokenize(const std::string &text) const { + std::vector tokens(llama_n_seq_max(ctx)); + + if(auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); nTokens < 0){ + tokens.resize(-nTokens); + llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); + } else { + tokens.resize(nTokens); } + + SPDLOG_DEBUG(FMT_STRING("Tokenized input with {:d} tokens"), tokens.size()); + return tokens; } - void huggingface::tgi::backends::llama::TgiLlamaCppBackend::schedule() { - std::vector tokens; + std::unique_ptr TgiLlamaCppBackend::GetSamplerFromArgs( + const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, const uint64_t seed) { + auto *sampler = llama_sampler_chain_init({.no_perf = false}); + + // Penalties + llama_sampler_chain_add(sampler, llama_sampler_init_penalties( + llama_n_vocab(model), + llama_token_eos(model), + llama_token_nl (model), + 0.0f, + repetitionPenalty, + frequencyPenalty, + 0.0f, + false, + false + )); + llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(topK))); + + if(0 < topP && topP < 1) { + llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1)); + } + + llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); + return std::make_unique(sampler); } - namespace impl { - class LlamaCppBackendImpl { + std::vector huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( + std::span tokens, const uint32_t topK, const float_t topP, const uint32_t maxNewTokens) { + SPDLOG_DEBUG(FMT_STRING("Received 
{:d} tokens to schedule"), tokens.size()); + + // Allocate generation result + std::vector generated; + generated.reserve(llama_n_seq_max(ctx) - tokens.size()); + + // Retrieve decoding context + auto batch = llama_batch_get_one(const_cast(tokens.data()), static_cast(tokens.size())); + auto sampler = GetSamplerFromArgs(topK, topP, 1.0, 1.0, 2014); - }; + // Decode + for(auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { +#ifndef NDEBUG + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(ctx, batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); + SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); +#else + const auto status = llama_decode(ctx, batch); +#endif + if (status == LLAMA_SUCCESS) { + // Sample the new token + auto new_token_id = llama_sampler_sample(*sampler, ctx, -1); + generated.emplace_back(new_token_id); + generating = !llama_token_is_eog(model, new_token_id); + + // Next iteration + batch = llama_batch_get_one(&new_token_id, 1); + } + } + generated.shrink_to_fit(); + return generated; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index bcf728dbf33..e109a158cd7 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -4,28 +4,61 @@ #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#include +#include #include #include #include -namespace huggingface::tgi::backends::llama { -// const char* TGI_BACKEND_LLAMA_CPP_NAME = "llama.cpp"; +#define LLAMA_SUCCESS 0 +namespace huggingface::tgi::backends::llama { enum TgiLlamaCppBackendError { MODEL_FILE_DOESNT_EXIST = 1 }; class TgiLlamaCppBackend { + using TokenId = int32_t; + private: llama_model* model; llama_context* ctx; - llama_batch batch; + + /** + * + * @param topK 
+ * @param topP + * @return + */ + std::unique_ptr GetSamplerFromArgs( + uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed); + public: TgiLlamaCppBackend(llama_model *model, llama_context *ctx); ~TgiLlamaCppBackend(); - void schedule(); + /** + * + * @param text + * @return + */ + [[nodiscard]] std::vector Tokenize(const std::string& text) const; + + /** + * + * @param tokens + * @param topK + * @param topP + * @param maxNewTokens + * @return + */ + [[nodiscard]] std::vector Generate( + std::span tokens, + uint32_t topK, + float_t topP = 1.0f, + uint32_t maxNewTokens = std::numeric_limits::max() + ); }; std::expected, TgiLlamaCppBackendError> diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 2f50cac1ef0..3165261fa9f 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -3,21 +3,37 @@ // #include +#include #include #include -#include +#include #include #include "../csrc/backend.hpp" int main(int argc, char** argv) { - if(argc < 2) { + if (argc < 2) { fmt::print("No model folder provider"); return 1; } spdlog::set_level(spdlog::level::debug); + const auto prompt = "My name is Morgan"; + const auto modelPath = absolute(std::filesystem::path(argv[1])); - if(auto backend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); backend.has_value()) - fmt::print(fmt::emphasis::bold | fg(fmt::color::yellow), "Successfully initialized llama.cpp model from {}\n", modelPath); -} \ No newline at end of file + if (auto maybeBackend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { + // Retrieve the backend + const auto& backend = *maybeBackend; + + // Generate + const auto promptTokens = backend->Tokenize(prompt); + const auto out = backend->Generate(promptTokens, 30, 1.0, 32); + fmt::print(FMT_STRING("Generated: {}"), out); + } else { + switch (maybeBackend.error()) { + case 
huggingface::tgi::backends::llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Specified file {} doesnt exist", modelPath); + return maybeBackend.error(); + } + } +} From 92bb1136533be3dc6fd99f1beba9986ec70efec2 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 00:10:41 +0200 Subject: [PATCH 10/92] feat(backend): use llama_token as TokenId type --- backends/llamacpp/csrc/backend.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e109a158cd7..73bad99cb41 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -19,7 +19,7 @@ namespace huggingface::tgi::backends::llama { class TgiLlamaCppBackend { - using TokenId = int32_t; + using TokenId = llama_token; private: llama_model* model; From d4b5be10f9aeaea16da52ad60a459435c81d8444 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 14:12:32 +0200 Subject: [PATCH 11/92] feat(backend): minor refactor --- backends/llamacpp/csrc/backend.cpp | 3 +-- backends/llamacpp/csrc/backend.hpp | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 859041c20eb..f283b2ac7e9 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -123,7 +123,7 @@ namespace huggingface::tgi::backends::llama { #else const auto status = llama_decode(ctx, batch); #endif - if (status == LLAMA_SUCCESS) { + if (LLAMA_SUCCESS(status)) { // Sample the new token auto new_token_id = llama_sampler_sample(*sampler, ctx, -1); generated.emplace_back(new_token_id); @@ -133,7 +133,6 @@ namespace huggingface::tgi::backends::llama { batch = llama_batch_get_one(&new_token_id, 1); } } - generated.shrink_to_fit(); return generated; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp 
b/backends/llamacpp/csrc/backend.hpp index 73bad99cb41..26d690c8321 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -10,7 +10,7 @@ #include #include -#define LLAMA_SUCCESS 0 +#define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llama { enum TgiLlamaCppBackendError { From 37faeb34b248bc3b6568539b6ae3a7a6d85f2c0d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 14:12:52 +0200 Subject: [PATCH 12/92] feat(backend): expose frequency and repetition penalties --- backends/llamacpp/csrc/backend.cpp | 13 ++++++++++--- backends/llamacpp/csrc/backend.hpp | 17 +++++++++++++---- backends/llamacpp/offline/main.cpp | 11 +++++++++-- 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index f283b2ac7e9..1f6dcfaefdf 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -100,8 +100,15 @@ namespace huggingface::tgi::backends::llama { return std::make_unique(sampler); } - std::vector huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( - std::span tokens, const uint32_t topK, const float_t topP, const uint32_t maxNewTokens) { + std::expected, TgiLlamaCppBackendError> huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( + std::span tokens, + const uint32_t topK, + const float_t topP, + const float_t frequencyPenalty, + const float_t repetitionPenalty, + const uint32_t maxNewTokens, + const uint64_t seed + ) { SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size()); // Allocate generation result @@ -110,7 +117,7 @@ namespace huggingface::tgi::backends::llama { // Retrieve decoding context auto batch = llama_batch_get_one(const_cast(tokens.data()), static_cast(tokens.size())); - auto sampler = GetSamplerFromArgs(topK, topP, 1.0, 1.0, 2014); + auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, repetitionPenalty, seed); // Decode 
for(auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 26d690c8321..5f356bc06b6 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -13,7 +13,7 @@ #define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llama { - enum TgiLlamaCppBackendError { + enum TgiLlamaCppBackendError: uint8_t { MODEL_FILE_DOESNT_EXIST = 1 }; @@ -43,24 +43,33 @@ namespace huggingface::tgi::backends::llama { * @param text * @return */ - [[nodiscard]] std::vector Tokenize(const std::string& text) const; + [[nodiscard("Tokens will be freed after this call if not assigned to an lvalue")]] + std::vector Tokenize(const std::string& text) const; /** * * @param tokens * @param topK * @param topP + * @param frequencyPenalty + * @param repetitionPenalty * @param maxNewTokens + * @param seed * @return */ - [[nodiscard]] std::vector Generate( + [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] + std::expected, TgiLlamaCppBackendError> Generate( std::span tokens, uint32_t topK, float_t topP = 1.0f, - uint32_t maxNewTokens = std::numeric_limits::max() + float_t frequencyPenalty = 0.0f, + float_t repetitionPenalty = 0.0f, + uint32_t maxNewTokens = std::numeric_limits::max() - 1, + uint64_t seed = 2014 ); }; + [[nodiscard("Create backend will be freed after this call if not assigned to an lvalue")]] std::expected, TgiLlamaCppBackendError> CreateLlamaCppBackend(const std::filesystem::path& root); } diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 3165261fa9f..c2ae05c726f 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -27,8 +27,15 @@ int main(int argc, char** argv) { // Generate const auto promptTokens = backend->Tokenize(prompt); - const auto out = backend->Generate(promptTokens, 30, 
1.0, 32); - fmt::print(FMT_STRING("Generated: {}"), out); + const auto out = backend->Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); + + if(out.has_value()) + fmt::print(FMT_STRING("Generated: {}"), *out); + else { + const auto err = out.error(); + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); + } + } else { switch (maybeBackend.error()) { case huggingface::tgi::backends::llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: From f9c248657dbe3b418e97a3039a934d5aa628b777 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 23 Oct 2024 22:11:58 +0200 Subject: [PATCH 13/92] chore(backend): minor formatting --- backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/backend.hpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 1f6dcfaefdf..c8806957bb7 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -15,10 +15,10 @@ #include "backend.hpp" namespace huggingface::tgi::backends::llama { - std::expected, TgiLlamaCppBackendError> CreateLlamaCppBackend(const std::filesystem::path& modelPath) { SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); + llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 5f356bc06b6..e4c31ad6411 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -17,7 +17,6 @@ namespace huggingface::tgi::backends::llama { MODEL_FILE_DOESNT_EXIST = 1 }; - class TgiLlamaCppBackend { using TokenId = llama_token; From 355d8a55b46f4ac56a8741bc3e3960a6bed2c03a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 24 Oct 2024 09:56:40 +0200 Subject: [PATCH 14/92] feat(backend): wip Rust binding --- backends/llamacpp/CMakeLists.txt | 7 +++++++ backends/llamacpp/build.rs | 6 ++++-- 
backends/llamacpp/csrc/backend.hpp | 1 + backends/llamacpp/csrc/ffi.hpp | 19 +++++++++++++++++++ backends/llamacpp/src/backend.rs | 15 ++++++++++++++- backends/llamacpp/src/lib.rs | 9 +++++++-- 6 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 backends/llamacpp/csrc/ffi.hpp diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 9f08d0f3a0c..644db5ae162 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -11,6 +11,13 @@ set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") +if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + message(STATUS "Targeting libc++") + set(CMAKE_CXX_FLAGS -stdlib=libc++ ${CMAKE_CXX_FLAGS}) +else() + message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}") +endif() + # Add dependencies include(cmake/fmt.cmake) include(cmake/spdlog.cmake) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 26ea8d929b9..d84e517f2c6 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -59,18 +59,20 @@ fn build_ffi_layer(deps_folder: &PathBuf) { CFG.include_prefix = "backends/llamacpp"; cxx_build::bridge("src/lib.rs") .static_flag(true) + .std("c++23") .include(deps_folder.join("fmt-src").join("include")) .include(deps_folder.join("spdlog-src").join("include")) .include(deps_folder.join("llama-src").join("common")) .include(deps_folder.join("llama-src").join("ggml").join("include")) .include(deps_folder.join("llama-src").join("include")) - .file("csrc/backend.cpp") - .std("c++23") + .include("csrc/backend.hpp") + .file("csrc/ffi.cpp") .compile(CMAKE_LLAMA_CPP_TARGET); println!("cargo:rerun-if-changed=CMakeLists.txt"); println!("cargo:rerun-if-changed=csrc/backend.hpp"); 
println!("cargo:rerun-if-changed=csrc/backend.cpp"); + println!("cargo:rerun-if-changed=csrc/ffi.hpp"); } fn main() { diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e4c31ad6411..7075642acd5 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #define LLAMA_SUCCESS(x) x == 0 diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp new file mode 100644 index 00000000000..e924316e36e --- /dev/null +++ b/backends/llamacpp/csrc/ffi.hpp @@ -0,0 +1,19 @@ +// +// Created by mfuntowicz on 10/23/24. +// + +#ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP +#define TGI_LLAMA_CPP_BACKEND_FFI_HPP + +#include "backend.hpp" +//#include "backends/llamacpp/src/lib.rs.h" + + +namespace huggingface::tgi::backends::llama { + class LlamaCppBackendImpl { + + }; +} + + +#endif //TGI_LLAMA_CPP_BACKEND_FFI_HPP diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 8af1067b9d4..89daeee3658 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,8 +1,21 @@ +use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; +use cxx::UniquePtr; +use std::path::Path; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; use tokio_stream::wrappers::UnboundedReceiverStream; -pub struct TgiLlamaCppBakend {} +pub struct TgiLlamaCppBakend { + backend: UniquePtr, +} + +impl TgiLlamaCppBakend { + pub fn new>(model_path: P) -> Result { + Ok(Self { + backend: create_llamacpp_backend(model_path.as_ref().to_str().unwrap()), + }) + } +} impl Backend for TgiLlamaCppBakend { fn schedule( diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index bea7c06fc65..d25e3ca0bea 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,11 +1,16 @@ pub mod backend; 
-#[cxx::bridge(namespace = "huggingface::tgi::backends::llama::impl")] +#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] mod ffi { unsafe extern "C++" { - include!("backends/llamacpp/csrc/backend.cpp"); + include!("backends/llamacpp/csrc/ffi.hpp"); /// Represent an instance of the llama.cpp backend instance on C++ side type LlamaCppBackendImpl; + + #[rust_name = "create_llamacpp_backend"] + fn CreateLlamaCppBackend( + engine_folder: &str, + ) -> UniquePtr; } } From e4d803c94ef8a48a172a57f783d2e5f9c9387edd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 24 Oct 2024 16:42:50 +0200 Subject: [PATCH 15/92] feat(backend): build and link through build.rs --- Cargo.lock | 90 ++++++++++++++++++++++++++++-- backends/llamacpp/CMakeLists.txt | 18 +++--- backends/llamacpp/Cargo.toml | 6 ++ backends/llamacpp/build.rs | 86 +++++++++++++++------------- backends/llamacpp/csrc/backend.cpp | 51 +++++++++-------- backends/llamacpp/csrc/backend.hpp | 22 ++++++-- backends/llamacpp/csrc/ffi.hpp | 34 ++++++++++- backends/llamacpp/offline/main.cpp | 6 +- backends/llamacpp/src/backend.rs | 59 ++++++++++++++++---- backends/llamacpp/src/lib.rs | 8 +-- backends/llamacpp/src/main.rs | 28 ++++++---- 11 files changed, 295 insertions(+), 113 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4075556bfef..479e94d7fca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2732,6 +2732,20 @@ dependencies = [ "thiserror", ] +[[package]] +name = "opentelemetry" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" +dependencies = [ + "futures-core", + "futures-sink", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + [[package]] name = "opentelemetry-otlp" version = "0.13.0" @@ -2849,6 +2863,24 @@ dependencies = [ "thiserror", ] +[[package]] +name = "opentelemetry_sdk" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" +dependencies = [ + "async-trait", + "futures-channel", + "futures-executor", + "futures-util", + "glob", + "once_cell", + "opentelemetry 0.26.0", + "percent-encoding", + "rand", + "thiserror", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4187,12 +4219,14 @@ dependencies = [ name = "text-generation-backend-llamacpp" version = "2.4.1-dev0" dependencies = [ + "async-trait", "clap 4.5.20", "cmake", "cxx", "cxx-build", "hf-hub", "image", + "log", "metrics", "metrics-exporter-prometheus", "pkg-config", @@ -4202,6 +4236,10 @@ dependencies = [ "tokenizers", "tokio", "tokio-stream", + "tracing", + "tracing-opentelemetry 0.27.0", + "tracing-subscriber", + "utoipa 5.1.2", ] [[package]] @@ -4330,7 +4368,7 @@ dependencies = [ "tracing-opentelemetry 0.21.0", "tracing-subscriber", "ureq", - "utoipa", + "utoipa 4.2.3", "utoipa-swagger-ui", "uuid", "vergen", @@ -4381,7 +4419,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa", + "utoipa 4.2.3", "utoipa-swagger-ui", ] @@ -4432,7 +4470,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa", + "utoipa 4.2.3", "utoipa-swagger-ui", ] @@ -4946,6 +4984,24 @@ dependencies = [ "web-time 1.1.0", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" +dependencies = [ + "js-sys", + "once_cell", + "opentelemetry 0.26.0", + "opentelemetry_sdk 0.26.0", + "smallvec", + "tracing", + "tracing-core", + "tracing-log 0.2.0", + "tracing-subscriber", + "web-time 1.1.0", +] + [[package]] name = "tracing-opentelemetry-instrumentation-sdk" version = "0.16.0" @@ -5136,7 +5192,19 @@ dependencies = [ "indexmap 2.6.0", "serde", "serde_json", - "utoipa-gen", + "utoipa-gen 4.3.0", +] + +[[package]] +name = "utoipa" +version = 
"5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e12e84f0ff45b6818029cd0f67280e453c80132c1b9897df407ecc20b9f7cfd" +dependencies = [ + "indexmap 2.5.0", + "serde", + "serde_json", + "utoipa-gen 5.1.2", ] [[package]] @@ -5152,6 +5220,18 @@ dependencies = [ "syn 2.0.85", ] +[[package]] +name = "utoipa-gen" +version = "5.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dfc694d3a3118d2b9e80d68be83bf1aab7988510916934db83da61c14e7e6b2" +dependencies = [ + "proc-macro2", + "quote", + "regex", + "syn 2.0.79", +] + [[package]] name = "utoipa-swagger-ui" version = "6.0.0" @@ -5164,7 +5244,7 @@ dependencies = [ "rust-embed", "serde", "serde_json", - "utoipa", + "utoipa 4.2.3", "zip", ] diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 644db5ae162..c4b6f0ce2ff 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -11,12 +11,12 @@ set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") -if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") message(STATUS "Targeting libc++") set(CMAKE_CXX_FLAGS -stdlib=libc++ ${CMAKE_CXX_FLAGS}) -else() +else () message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}") -endif() +endif () # Add dependencies include(cmake/fmt.cmake) @@ -42,15 +42,17 @@ fetchcontent_declare( fetchcontent_makeavailable(llama) -add_library(tgi_llama_cpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) -target_compile_features(tgi_llama_cpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llama_cpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) 
+add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) +target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) +target_link_libraries(tgi_llamacpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) + +install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") - add_executable(tgi_llama_cpp_offline_runner offline/main.cpp) + add_executable(tgi_llama_cppoffline_runner offline/main.cpp) - target_link_libraries(tgi_llama_cpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) endif () diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index fdd980c308f..4a14dcdfd05 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -6,6 +6,7 @@ authors.workspace = true homepage.workspace = true [dependencies] +async-trait = "0.1" clap = { version = "4.5.19", features = ["derive"] } cxx = "1.0" hf-hub = { workspace = true } @@ -18,6 +19,11 @@ thiserror = "1.0.64" tokio = "1.40.0" tokio-stream = "0.1.16" tokenizers = { workspace = true } +tracing = "0.1" +tracing-opentelemetry = "0.27.0" +tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } +utoipa = { version = "5.1.2", features = ["axum_extras"] } +log = "0.4.22" [build-dependencies] cmake = "0.1" diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index d84e517f2c6..642a9665cc0 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -1,12 +1,14 @@ use cxx_build::CFG; use std::env; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; const CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS: &str = "75-real;80-real;86-real;89-real;90-real"; -const CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llama_cpp_backend_impl"; -const ADDITIONAL_BACKEND_LINK_LIBRARIES: [&str; 2] = ["spdlog", "fmt"]; +const 
CMAKE_LLAMA_CPP_TARGET: &str = "tgi_llamacpp_backend_impl"; +const CMAKE_LLAMA_CPP_FFI_TARGET: &str = "tgi_llamacpp_backend"; const MPI_REQUIRED_VERSION: &str = "4.1"; +const BACKEND_DEPS: [&str; 2] = [CMAKE_LLAMA_CPP_TARGET, CMAKE_LLAMA_CPP_FFI_TARGET]; + macro_rules! probe { ($name: expr, $version: expr) => { if let Err(_) = pkg_config::probe_library($name) { @@ -16,11 +18,12 @@ macro_rules! probe { }; } -fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf { - let install_path = env::var("CMAKE_INSTALL_PREFIX") - .map(|val| PathBuf::from(val)) - .unwrap_or(out_dir.join("dist")); - +fn build_backend( + is_debug: bool, + opt_level: &str, + out_dir: &Path, + install_path: &PathBuf, +) -> PathBuf { let build_cuda = option_env!("LLAMA_CPP_BUILD_CUDA").unwrap_or("OFF"); let cuda_archs = option_env!("LLAMA_CPP_TARGET_CUDA_ARCHS").unwrap_or(CMAKE_LLAMA_CPP_DEFAULT_CUDA_ARCHS); @@ -38,41 +41,28 @@ fn build_backend(is_debug: bool, opt_level: &str, out_dir: &PathBuf) -> PathBuf .define("LLAMA_CPP_TARGET_CUDA_ARCHS", cuda_archs) .build(); - // Additional transitive CMake dependencies - let deps_folder = out_dir.join("build").join("_deps"); - for dependency in ADDITIONAL_BACKEND_LINK_LIBRARIES { - let dep_name = match is_debug { - true => format!("{}d", dependency), - false => String::from(dependency), - }; - let dep_path = deps_folder.join(format!("{}-build", dependency)); - println!("cargo:rustc-link-search={}", dep_path.display()); - println!("cargo:rustc-link-lib=static={}", dep_name); - } + let lib_path = install_path.join("lib64"); + println!("cargo:rustc-link-search=native={}", lib_path.display()); let deps_folder = out_dir.join("build").join("_deps"); deps_folder } -fn build_ffi_layer(deps_folder: &PathBuf) { - println!("cargo:warning={}", &deps_folder.display()); +fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { + println!("cargo:warning={}", deps_folder.display()); CFG.include_prefix = "backends/llamacpp"; 
cxx_build::bridge("src/lib.rs") .static_flag(true) .std("c++23") - .include(deps_folder.join("fmt-src").join("include")) - .include(deps_folder.join("spdlog-src").join("include")) - .include(deps_folder.join("llama-src").join("common")) - .include(deps_folder.join("llama-src").join("ggml").join("include")) - .include(deps_folder.join("llama-src").join("include")) - .include("csrc/backend.hpp") - .file("csrc/ffi.cpp") - .compile(CMAKE_LLAMA_CPP_TARGET); - - println!("cargo:rerun-if-changed=CMakeLists.txt"); - println!("cargo:rerun-if-changed=csrc/backend.hpp"); - println!("cargo:rerun-if-changed=csrc/backend.cpp"); - println!("cargo:rerun-if-changed=csrc/ffi.hpp"); + .include(deps_folder.join("spdlog-src").join("include")) // Why spdlog doesnt install headers? + // .include(deps_folder.join("fmt-src").join("include")) // Why spdlog doesnt install headers? + // .include(deps_folder.join("llama-src").join("include")) // Why spdlog doesnt install headers? + .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why spdlog doesnt install headers? + .include(deps_folder.join("llama-src").join("common").join("include")) // Why spdlog doesnt install headers? 
+ .include(install_prefix.join("include")) + .include("csrc") + .file("csrc/ffi.hpp") + .compile(CMAKE_LLAMA_CPP_FFI_TARGET); } fn main() { @@ -84,17 +74,35 @@ fn main() { _ => (false, "3"), }; + let install_path = env::var("CMAKE_INSTALL_PREFIX") + .map(|val| PathBuf::from(val)) + .unwrap_or(out_dir.join("dist")); + // Build the backend - let deps_folder = build_backend(is_debug, opt_level, &out_dir); + let deps_path = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); // Build the FFI layer calling the backend above - build_ffi_layer(&deps_folder); + build_ffi_layer(&deps_path, &install_path); // Emit linkage search path probe!("ompi", MPI_REQUIRED_VERSION); // Backend - // BACKEND_DEPS.iter().for_each(|name| { - // println!("cargo:rustc-link-lib=static={}", name); - // }); + BACKEND_DEPS.iter().for_each(|name| { + println!("cargo:rustc-link-lib=static={}", name); + }); + + // Linkage info + println!("cargo:rustc-link-search=native={}", out_dir.display()); + println!("cargo:rustc-link-lib=static=fmtd"); + println!("cargo:rustc-link-lib=static=spdlogd"); + println!("cargo:rustc-link-lib=static=common"); + println!("cargo:rustc-link-lib=dylib=ggml"); + println!("cargo:rustc-link-lib=dylib=llama"); + + // Rerun if one of these file change + println!("cargo:rerun-if-changed=CMakeLists.txt"); + println!("cargo:rerun-if-changed=csrc/backend.hpp"); + println!("cargo:rerun-if-changed=csrc/backend.cpp"); + println!("cargo:rerun-if-changed=csrc/ffi.hpp"); } diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index c8806957bb7..ba4a02d5fc4 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -14,33 +14,35 @@ #include "backend.hpp" -namespace huggingface::tgi::backends::llama { - std::expected, TgiLlamaCppBackendError> - CreateLlamaCppBackend(const std::filesystem::path& modelPath) { +namespace huggingface::tgi::backends::llamacpp { + [[nodiscard]] + std::expected, 
TgiLlamaCppBackendError> + TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath) noexcept { SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); // Load the model - if(!exists(modelPath)) { + if (!exists(modelPath)) { return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); } auto params = llama_model_default_params(); - auto* model = llama_load_model_from_file(modelPath.c_str(), params); - auto* context = llama_new_context_with_model(model, { - .n_batch = 1, - .n_threads = 16, - .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, - .flash_attn = false, + auto *model = llama_load_model_from_file(modelPath.c_str(), params); + auto *context = llama_new_context_with_model(model, { + .n_batch = 1, + .n_threads = 16, + .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, + .flash_attn = false, }); - return std::make_unique(model, context); + return std::make_pair(model, context); } - huggingface::tgi::backends::llama::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) - : model(model), ctx(ctx) { + huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, + llama_context *const ctx) + : model(model), ctx(ctx) { #ifndef NDEBUG char modelName[256]; llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); @@ -48,13 +50,13 @@ namespace huggingface::tgi::backends::llama { #endif } - huggingface::tgi::backends::llama::TgiLlamaCppBackend::~TgiLlamaCppBackend() { + huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::~TgiLlamaCppBackend() { if (ctx) { SPDLOG_DEBUG("Freeing llama.cpp context"); llama_free(ctx); } - if(model) { + if (model) { SPDLOG_DEBUG("Freeing llama.cpp model"); llama_free_model(model); } @@ -63,7 +65,8 @@ namespace huggingface::tgi::backends::llama { std::vector 
TgiLlamaCppBackend::Tokenize(const std::string &text) const { std::vector tokens(llama_n_seq_max(ctx)); - if(auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); nTokens < 0){ + if (auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, + true); nTokens < 0) { tokens.resize(-nTokens); llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); } else { @@ -75,14 +78,15 @@ namespace huggingface::tgi::backends::llama { } std::unique_ptr TgiLlamaCppBackend::GetSamplerFromArgs( - const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, const uint64_t seed) { + const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, + const uint64_t seed) { auto *sampler = llama_sampler_chain_init({.no_perf = false}); // Penalties llama_sampler_chain_add(sampler, llama_sampler_init_penalties( llama_n_vocab(model), llama_token_eos(model), - llama_token_nl (model), + llama_token_nl(model), 0.0f, repetitionPenalty, frequencyPenalty, @@ -92,15 +96,16 @@ namespace huggingface::tgi::backends::llama { )); llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(topK))); - if(0 < topP && topP < 1) { + if (0 < topP && topP < 1) { llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1)); } llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); - return std::make_unique(sampler); + return std::make_unique(sampler); } - std::expected, TgiLlamaCppBackendError> huggingface::tgi::backends::llama::TgiLlamaCppBackend::Generate( + std::expected, TgiLlamaCppBackendError> + huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::Generate( std::span tokens, const uint32_t topK, const float_t topP, @@ -108,7 +113,7 @@ namespace huggingface::tgi::backends::llama { const float_t repetitionPenalty, const uint32_t maxNewTokens, const 
uint64_t seed - ) { + ) { SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size()); // Allocate generation result @@ -120,7 +125,7 @@ namespace huggingface::tgi::backends::llama { auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, repetitionPenalty, seed); // Decode - for(auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { + for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { #ifndef NDEBUG const auto start = std::chrono::steady_clock::now(); const auto status = llama_decode(ctx, batch); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 7075642acd5..7fa47e84d1c 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -9,12 +9,14 @@ #include #include #include +#include + #include #define LLAMA_SUCCESS(x) x == 0 -namespace huggingface::tgi::backends::llama { - enum TgiLlamaCppBackendError: uint8_t { +namespace huggingface::tgi::backends::llamacpp { + enum TgiLlamaCppBackendError : uint8_t { MODEL_FILE_DOESNT_EXIST = 1 }; @@ -22,8 +24,8 @@ namespace huggingface::tgi::backends::llama { using TokenId = llama_token; private: - llama_model* model; - llama_context* ctx; + llama_model *model; + llama_context *ctx; /** * @@ -35,7 +37,15 @@ namespace huggingface::tgi::backends::llama { uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed); public: + /** + * + * @return + */ + static std::expected, TgiLlamaCppBackendError> + FromGGUF(const std::filesystem::path &) noexcept; + TgiLlamaCppBackend(llama_model *model, llama_context *ctx); + ~TgiLlamaCppBackend(); /** @@ -44,7 +54,7 @@ namespace huggingface::tgi::backends::llama { * @return */ [[nodiscard("Tokens will be freed after this call if not assigned to an lvalue")]] - std::vector Tokenize(const std::string& text) const; + std::vector Tokenize(const std::string 
&text) const; /** * @@ -71,7 +81,7 @@ namespace huggingface::tgi::backends::llama { [[nodiscard("Create backend will be freed after this call if not assigned to an lvalue")]] std::expected, TgiLlamaCppBackendError> - CreateLlamaCppBackend(const std::filesystem::path& root); + CreateLlamaCppBackend(const std::filesystem::path &root); } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index e924316e36e..82f3f29651d 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -5,14 +5,44 @@ #ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP #define TGI_LLAMA_CPP_BACKEND_FFI_HPP +#include +#include +#include + +#include #include "backend.hpp" -//#include "backends/llamacpp/src/lib.rs.h" + +namespace huggingface::tgi::backends::llamacpp::impl { + class LlamaCppBackendImpl; +} + + +#include "backends/llamacpp/src/lib.rs.h" -namespace huggingface::tgi::backends::llama { +namespace huggingface::tgi::backends::llamacpp::impl { + + class LlamaCppBackendException : std::exception { + + }; + class LlamaCppBackendImpl { + private: + TgiLlamaCppBackend _inner; + public: + LlamaCppBackendImpl(llama_model *model, llama_context *context) : _inner(model, context) {} }; + + std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath) { + const auto cxxPath = std::string_view(modelPath); + if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath)); maybe.has_value()) { + auto [model, context] = *maybe; + return std::make_unique(model, context); + } else { + throw LlamaCppBackendException(); + } + } } diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index c2ae05c726f..56eb88c5464 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -10,6 +10,8 @@ #include #include "../csrc/backend.hpp" +using namespace huggingface::tgi::backends::llamacpp; + int main(int argc, char** argv) { if (argc < 2) { fmt::print("No model 
folder provider"); @@ -21,7 +23,7 @@ int main(int argc, char** argv) { const auto prompt = "My name is Morgan"; const auto modelPath = absolute(std::filesystem::path(argv[1])); - if (auto maybeBackend = huggingface::tgi::backends::llama::CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { + if (auto maybeBackend = CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { // Retrieve the backend const auto& backend = *maybeBackend; @@ -38,7 +40,7 @@ int main(int argc, char** argv) { } else { switch (maybeBackend.error()) { - case huggingface::tgi::backends::llama::TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: + case TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Specified file {} doesnt exist", modelPath); return maybeBackend.error(); } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 89daeee3658..7b22e4a2d71 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,31 +1,66 @@ use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; +use async_trait::async_trait; use cxx::UniquePtr; -use std::path::Path; +use std::path::{Path, PathBuf}; +use std::sync::Arc; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; +use thiserror::Error; +use tokio::task::spawn_blocking; use tokio_stream::wrappers::UnboundedReceiverStream; +use tracing::info; -pub struct TgiLlamaCppBakend { - backend: UniquePtr, +unsafe impl Send for LlamaCppBackendImpl {} + +#[derive(Debug, Error)] +pub enum LlamaCppBackendError { + #[error("Provided GGUF model path {0} doesn't exist")] + ModelFileDoesntExist(String), + + #[error("Failed to initialize model from GGUF file {0}: {1}")] + ModelInitializationFailed(PathBuf, String), } -impl TgiLlamaCppBakend { - pub fn new>(model_path: P) -> Result { - Ok(Self { - backend: 
create_llamacpp_backend(model_path.as_ref().to_str().unwrap()), - }) +pub struct LlamaCppBackend {} + +impl LlamaCppBackend { + pub fn new + Send>(model_path: P) -> Result { + let path = Arc::new(model_path.as_ref()); + if !path.exists() { + return Err(LlamaCppBackendError::ModelFileDoesntExist( + path.display().to_string(), + )); + } + + let mut backend = create_llamacpp_backend(path.to_str().unwrap()).map_err(|err| { + LlamaCppBackendError::ModelInitializationFailed( + path.to_path_buf(), + err.what().to_string(), + ) + })?; + + info!( + "Successfully initialized llama.cpp backend from {}", + path.display() + ); + + spawn_blocking(move || scheduler_loop(backend)); + Ok(Self {}) } } -impl Backend for TgiLlamaCppBakend { +async fn scheduler_loop(mut backend: UniquePtr) {} + +#[async_trait] +impl Backend for LlamaCppBackend { fn schedule( &self, - request: ValidGenerateRequest, + _request: ValidGenerateRequest, ) -> Result>, InferError> { Err(InferError::GenerationError("Not implemented yet".into())) } - async fn health(&self, current_health: bool) -> bool { - todo!() + async fn health(&self, _: bool) -> bool { + true } } diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index d25e3ca0bea..2bfc30654a6 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,6 +1,6 @@ pub mod backend; -#[cxx::bridge(namespace = "huggingface::tgi::backends::llama")] +#[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp::impl")] mod ffi { unsafe extern "C++" { include!("backends/llamacpp/csrc/ffi.hpp"); @@ -9,8 +9,8 @@ mod ffi { type LlamaCppBackendImpl; #[rust_name = "create_llamacpp_backend"] - fn CreateLlamaCppBackend( - engine_folder: &str, - ) -> UniquePtr; + fn CreateLlamaCppBackendImpl( + modelPath: &str, + ) -> Result>; } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 7226473c70f..7420e16a518 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,7 +1,8 
@@ use clap::{Parser, Subcommand}; +use std::path::PathBuf; +use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackendError}; use text_generation_router::{server, usage_stats}; use thiserror::Error; -use text_generation_router::server::ApiDoc; /// App Configuration #[derive(Parser, Debug)] @@ -38,6 +39,8 @@ struct Args { port: u16, #[clap(default_value = "/tmp/text-generation-server-0", long, env)] master_shard_uds_path: String, + #[clap(long, env, help = "Path to GGUF model file(s) to load")] + gguf_path: PathBuf, #[clap(default_value = "bigscience/bloom", long, env)] tokenizer_name: String, #[clap(long, env)] @@ -98,6 +101,7 @@ async fn main() -> Result<(), RouterError> { hostname, port, master_shard_uds_path, + gguf_path, tokenizer_name, tokenizer_config_path, revision, @@ -116,13 +120,13 @@ async fn main() -> Result<(), RouterError> { usage_stats, } = args; - if let Some(Commands::PrintSchema) = command { - use utoipa::OpenApi; - let api_doc = ApiDoc::openapi(); - let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); - println!("{}", api_doc); - std::process::exit(0); - }; + // if let Some(Commands::PrintSchema) = command { + // use utoipa::OpenApi; + // let api_doc = ApiDoc::openapi(); + // let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); + // println!("{}", api_doc); + // std::process::exit(0); + // }; text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); // Validate args @@ -158,7 +162,7 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(); + let backend = LlamaCppBackend::new(gguf_path)?; // Run server server::run( @@ -185,7 +189,7 @@ async fn main() -> Result<(), RouterError> { max_client_batch_size, usage_stats, ) - .await?; + .await?; Ok(()) } @@ -194,9 +198,9 @@ enum RouterError { #[error("Argument validation error: {0}")] ArgumentValidation(String), #[error("Backend failed: {0}")] - Backend(#[from] V3Error), + Backend(#[from] 
LlamaCppBackendError), #[error("WebServer error: {0}")] WebServer(#[from] server::WebServerError), #[error("Tokio runtime failed to start: {0}")] Tokio(#[from] std::io::Error), -} \ No newline at end of file +} From f0859c247f0dadbfe6d26920221ba270bb99f258 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 25 Oct 2024 07:27:12 +0200 Subject: [PATCH 16/92] misc(build): handle different lib destination folder lib/lib64 --- backends/llamacpp/build.rs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 642a9665cc0..6d6bd514957 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -41,7 +41,12 @@ fn build_backend( .define("LLAMA_CPP_TARGET_CUDA_ARCHS", cuda_archs) .build(); - let lib_path = install_path.join("lib64"); + // On some x64 and ARM mainly the lib install destination is "lib" and not "lib64" + let lib_path = if install_path.join("lib64").exists() { + install_path.join("lib64") + } else { + install_path.join("lib") + }; println!("cargo:rustc-link-search=native={}", lib_path.display()); let deps_folder = out_dir.join("build").join("_deps"); @@ -55,14 +60,12 @@ fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { .static_flag(true) .std("c++23") .include(deps_folder.join("spdlog-src").join("include")) // Why spdlog doesnt install headers? - // .include(deps_folder.join("fmt-src").join("include")) // Why spdlog doesnt install headers? - // .include(deps_folder.join("llama-src").join("include")) // Why spdlog doesnt install headers? - .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why spdlog doesnt install headers? - .include(deps_folder.join("llama-src").join("common").join("include")) // Why spdlog doesnt install headers? + .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why ggml doesnt install headers? 
+ .include(deps_folder.join("llama-src").join("common").join("include")) // Why common doesnt install headers? .include(install_prefix.join("include")) .include("csrc") .file("csrc/ffi.hpp") - .compile(CMAKE_LLAMA_CPP_FFI_TARGET); + .compile(CMAKE_LLAMA_CPP_FFI_TARGET); // Make sure this target is not the same as cmake above } fn main() { @@ -94,8 +97,15 @@ fn main() { // Linkage info println!("cargo:rustc-link-search=native={}", out_dir.display()); - println!("cargo:rustc-link-lib=static=fmtd"); - println!("cargo:rustc-link-lib=static=spdlogd"); + + if is_debug { + println!("cargo:rustc-link-lib=static=fmtd"); + println!("cargo:rustc-link-lib=static=spdlogd"); + } else { + println!("cargo:rustc-link-lib=static=fmt"); + println!("cargo:rustc-link-lib=static=spdlog"); + } + println!("cargo:rustc-link-lib=static=common"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); From 179309b364ebafc4a3da0179f03fee7a5277799a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 25 Oct 2024 08:02:45 +0200 Subject: [PATCH 17/92] misc(build): refactor build type detection in cmake --- backends/llamacpp/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index c4b6f0ce2ff..adcc6af29dc 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -45,12 +45,15 @@ fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) target_link_libraries(tgi_llamacpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) - install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + target_compile_definitions(tgi_llamacpp_backend_impl PRIVATE TGI_LLAMACPP_BACKEND_DEBUG=1) +endif () + if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building 
llama.cpp offline runner") - add_executable(tgi_llama_cppoffline_runner offline/main.cpp) + add_executable(tgi_llamacpp_offline_runner offline/main.cpp) target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) endif () From a316c532550a76485fcf152cbb911144b0b80231 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 25 Oct 2024 08:11:42 +0200 Subject: [PATCH 18/92] feat(llamacpp): expose number of threads for the backend when constructing the model --- backends/llamacpp/csrc/backend.cpp | 12 ++++++++---- backends/llamacpp/csrc/backend.hpp | 2 +- backends/llamacpp/csrc/ffi.hpp | 4 ++-- backends/llamacpp/src/backend.rs | 18 +++++++++++------- backends/llamacpp/src/lib.rs | 1 + backends/llamacpp/src/main.rs | 19 +++++++++---------- 6 files changed, 32 insertions(+), 24 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index ba4a02d5fc4..907fe58e688 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -17,12 +17,16 @@ namespace huggingface::tgi::backends::llamacpp { [[nodiscard]] std::expected, TgiLlamaCppBackendError> - TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath) noexcept { + TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath, const uint16_t nThreads) noexcept { SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); llama_backend_init(); llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + llama_print_system_info(); +#endif + // Load the model if (!exists(modelPath)) { return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); @@ -32,7 +36,7 @@ namespace huggingface::tgi::backends::llamacpp { auto *model = llama_load_model_from_file(modelPath.c_str(), params); auto *context = llama_new_context_with_model(model, { .n_batch = 1, - .n_threads = 16, + .n_threads = nThreads, .attention_type = 
llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, .flash_attn = false, }); @@ -43,7 +47,7 @@ namespace huggingface::tgi::backends::llamacpp { huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, llama_context *const ctx) : model(model), ctx(ctx) { -#ifndef NDEBUG +#ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); @@ -126,7 +130,7 @@ namespace huggingface::tgi::backends::llamacpp { // Decode for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { -#ifndef NDEBUG +#ifdef TGI_LLAMACPP_BACKEND_DEBUG const auto start = std::chrono::steady_clock::now(); const auto status = llama_decode(ctx, batch); const auto end = std::chrono::steady_clock::now(); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 7fa47e84d1c..24b49949612 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -42,7 +42,7 @@ namespace huggingface::tgi::backends::llamacpp { * @return */ static std::expected, TgiLlamaCppBackendError> - FromGGUF(const std::filesystem::path &) noexcept; + FromGGUF(const std::filesystem::path &, uint16_t) noexcept; TgiLlamaCppBackend(llama_model *model, llama_context *ctx); diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 82f3f29651d..09d8af2d9bd 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -34,9 +34,9 @@ namespace huggingface::tgi::backends::llamacpp::impl { LlamaCppBackendImpl(llama_model *model, llama_context *context) : _inner(model, context) {} }; - std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath) { + std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath, uint16_t nThreads) { const auto cxxPath = 
std::string_view(modelPath); - if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath)); maybe.has_value()) { + if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath), nThreads); maybe.has_value()) { auto [model, context] = *maybe; return std::make_unique(model, context); } else { diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 7b22e4a2d71..0693ed34a8b 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -24,7 +24,10 @@ pub enum LlamaCppBackendError { pub struct LlamaCppBackend {} impl LlamaCppBackend { - pub fn new + Send>(model_path: P) -> Result { + pub fn new + Send>( + model_path: P, + n_threads: u16, + ) -> Result { let path = Arc::new(model_path.as_ref()); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( @@ -32,12 +35,13 @@ impl LlamaCppBackend { )); } - let mut backend = create_llamacpp_backend(path.to_str().unwrap()).map_err(|err| { - LlamaCppBackendError::ModelInitializationFailed( - path.to_path_buf(), - err.what().to_string(), - ) - })?; + let mut backend = + create_llamacpp_backend(path.to_str().unwrap(), n_threads).map_err(|err| { + LlamaCppBackendError::ModelInitializationFailed( + path.to_path_buf(), + err.what().to_string(), + ) + })?; info!( "Successfully initialized llama.cpp backend from {}", diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 2bfc30654a6..673fe130255 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -11,6 +11,7 @@ mod ffi { #[rust_name = "create_llamacpp_backend"] fn CreateLlamaCppBackendImpl( modelPath: &str, + n_threads: u16, ) -> Result>; } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 7420e16a518..3920da21d9d 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -23,24 +23,25 @@ struct Args { max_input_tokens: usize, #[clap(default_value = "2048", long, env)] 
max_total_tokens: usize, - #[clap(default_value = "1.2", long, env)] - waiting_served_ratio: f32, #[clap(default_value = "4096", long, env)] max_batch_prefill_tokens: u32, #[clap(long, env)] max_batch_total_tokens: Option, - #[clap(default_value = "20", long, env)] - max_waiting_tokens: usize, #[clap(long, env)] max_batch_size: Option, #[clap(default_value = "0.0.0.0", long, env)] hostname: String, #[clap(default_value = "3000", long, short, env)] port: u16, - #[clap(default_value = "/tmp/text-generation-server-0", long, env)] - master_shard_uds_path: String, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, + #[clap( + long, + env, + default_value = "1", + help = "Number of CPU threads allocated to one llama.cpp model" + )] + cores_per_instance: u16, #[clap(default_value = "bigscience/bloom", long, env)] tokenizer_name: String, #[clap(long, env)] @@ -93,15 +94,13 @@ async fn main() -> Result<(), RouterError> { max_top_n_tokens, max_input_tokens, max_total_tokens, - waiting_served_ratio, max_batch_prefill_tokens, max_batch_total_tokens, - max_waiting_tokens, max_batch_size, hostname, port, - master_shard_uds_path, gguf_path, + cores_per_instance, tokenizer_name, tokenizer_config_path, revision, @@ -162,7 +161,7 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(gguf_path)?; + let backend = LlamaCppBackend::new(gguf_path, cores_per_instance)?; // Run server server::run( From 0c1dd0ed2b3d38dfaa2aa5409b39c7b73eca9493 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 29 Oct 2024 22:30:36 +0100 Subject: [PATCH 19/92] feat(llamacpp): wip explosion --- backends/llamacpp/csrc/backend.cpp | 172 ++++++++++------------------- backends/llamacpp/csrc/backend.hpp | 95 +++++++++------- backends/llamacpp/csrc/ffi.hpp | 4 +- 3 files changed, 116 insertions(+), 155 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 907fe58e688..080a4401409 100644 --- 
a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -15,140 +15,86 @@ #include "backend.hpp" namespace huggingface::tgi::backends::llamacpp { - [[nodiscard]] - std::expected, TgiLlamaCppBackendError> - TgiLlamaCppBackend::FromGGUF(const std::filesystem::path &modelPath, const uint16_t nThreads) noexcept { - SPDLOG_DEBUG(FMT_STRING("Loading model from {}"), modelPath); - llama_backend_init(); - llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); + std::unique_ptr SamplingParams::IntoLlamaSampler(const llama_model *pModel) const { + auto *pSampler = llama_sampler_chain_init({.no_perf = false}); -#ifdef TGI_LLAMACPP_BACKEND_DEBUG - llama_print_system_info(); -#endif + // Penalties + llama_sampler_chain_add(pSampler, llama_sampler_init_penalties( + llama_n_vocab(pModel), + llama_token_eos(pModel), + llama_token_nl(pModel), + 0.0f, + repetitionPenalty, + frequencyPenalty, + 0.0f, + false, + false + )); + llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(topK))); - // Load the model - if (!exists(modelPath)) { - return std::unexpected(TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST); + if (0 < topP && topP < 1) { + llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(topP, 1)); } - auto params = llama_model_default_params(); - auto *model = llama_load_model_from_file(modelPath.c_str(), params); - auto *context = llama_new_context_with_model(model, { - .n_batch = 1, - .n_threads = nThreads, - .attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL, - .flash_attn = false, - }); - - return std::make_pair(model, context); + llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); + return std::unique_ptr(pSampler); } - huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::TgiLlamaCppBackend(llama_model *const model, - llama_context *const ctx) - : model(model), ctx(ctx) { + Worker::Worker(std::shared_ptr pModel, const llama_context_params ¶ms) + : mModel_(pModel), mParams_(params) 
{ + #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; - llama_model_meta_val_str(llama_get_model(ctx), "general.name", modelName, sizeof(modelName)); + llama_model_meta_val_str(pModel.get(), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } - huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::~TgiLlamaCppBackend() { - if (ctx) { - SPDLOG_DEBUG("Freeing llama.cpp context"); - llama_free(ctx); - } - - if (model) { - SPDLOG_DEBUG("Freeing llama.cpp model"); - llama_free_model(model); - } - } + void Worker::Loop(std::atomic_flag &running, std::atomic_uint8_t &waiting, std::queue &backlog) { + auto *context = llama_new_context_with_model(mModel_.get(), mParams_); - std::vector TgiLlamaCppBackend::Tokenize(const std::string &text) const { - std::vector tokens(llama_n_seq_max(ctx)); + while (running.test(std::memory_order_acquire)) { + if (waiting.load(std::memory_order_acquire) > 0) { + --waiting; - if (auto nTokens = llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, - true); nTokens < 0) { - tokens.resize(-nTokens); - llama_tokenize(model, text.c_str(), text.length(), tokens.data(), tokens.capacity(), true, true); - } else { - tokens.resize(nTokens); - } + auto request = backlog.front(); + auto sampler = request.IntoLlamaSampler(mModel_.get()); - SPDLOG_DEBUG(FMT_STRING("Tokenized input with {:d} tokens"), tokens.size()); - return tokens; - } + // Retrieve decoding context + auto batch = llama_batch_get_one(tokens.data(), tokens.size()); + // Decode + for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < 1; ++nDecoded) { +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(context, batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); + 
SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); +#else + const auto status = llama_decode(ctx, batch); +#endif + if (LLAMA_SUCCESS(status)) { + // Sample the new token + auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); + generated.emplace_back(new_token_id); + generating = !llama_token_is_eog(mModel_.get(), new_token_id); - std::unique_ptr TgiLlamaCppBackend::GetSamplerFromArgs( - const uint32_t topK, const float_t topP, const float_t frequencyPenalty, const float_t repetitionPenalty, - const uint64_t seed) { - auto *sampler = llama_sampler_chain_init({.no_perf = false}); + // Next iteration + batch = llama_batch_get_one(&new_token_id, 1); + } + } - // Penalties - llama_sampler_chain_add(sampler, llama_sampler_init_penalties( - llama_n_vocab(model), - llama_token_eos(model), - llama_token_nl(model), - 0.0f, - repetitionPenalty, - frequencyPenalty, - 0.0f, - false, - false - )); - llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(topK))); + backlog.pop(); - if (0 < topP && topP < 1) { - llama_sampler_chain_add(sampler, llama_sampler_init_top_p(topP, 1)); + } } - llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); - return std::make_unique(sampler); + llama_free(context); } - std::expected, TgiLlamaCppBackendError> - huggingface::tgi::backends::llamacpp::TgiLlamaCppBackend::Generate( - std::span tokens, - const uint32_t topK, - const float_t topP, - const float_t frequencyPenalty, - const float_t repetitionPenalty, - const uint32_t maxNewTokens, - const uint64_t seed - ) { - SPDLOG_DEBUG(FMT_STRING("Received {:d} tokens to schedule"), tokens.size()); - - // Allocate generation result - std::vector generated; - generated.reserve(llama_n_seq_max(ctx) - tokens.size()); - - // Retrieve decoding context - auto batch = llama_batch_get_one(const_cast(tokens.data()), static_cast(tokens.size())); - auto sampler = GetSamplerFromArgs(topK, topP, frequencyPenalty, 
repetitionPenalty, seed); - - // Decode - for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < maxNewTokens; ++nDecoded) { -#ifdef TGI_LLAMACPP_BACKEND_DEBUG - const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(ctx, batch); - const auto end = std::chrono::steady_clock::now(); - const auto latency = std::chrono::duration_cast(end - start); - SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); -#else - const auto status = llama_decode(ctx, batch); -#endif - if (LLAMA_SUCCESS(status)) { - // Sample the new token - auto new_token_id = llama_sampler_sample(*sampler, ctx, -1); - generated.emplace_back(new_token_id); - generating = !llama_token_is_eog(model, new_token_id); - - // Next iteration - batch = llama_batch_get_one(&new_token_id, 1); - } - } - return generated; - } + huggingface::tgi::backends::llamacpp::BackendBase::BackendBase(llama_model *model) + : mModel_(model, llama_free_model) { llama_backend_init(); } + + BackendBase::~BackendBase() { llama_backend_free(); } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 24b49949612..e4814d45689 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -4,9 +4,11 @@ #ifndef TGI_LLAMA_CPP_BACKEND_BACKEND_HPP #define TGI_LLAMA_CPP_BACKEND_BACKEND_HPP +#include #include #include #include +#include #include #include #include @@ -16,72 +18,85 @@ #define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llamacpp { - enum TgiLlamaCppBackendError : uint8_t { + enum BackendError : uint8_t { MODEL_FILE_DOESNT_EXIST = 1 }; - class TgiLlamaCppBackend { - using TokenId = llama_token; - - private: - llama_model *model; - llama_context *ctx; + struct SamplingParams { + uint32_t topK = std::numeric_limits::max(); + float_t topP = 1.0f; + float_t frequencyPenalty = 0.0f; + float_t repetitionPenalty = 0.0f; + 
uint64_t seed = 2014; /** - * - * @param topK - * @param topP + * Convert this GenerationParams to the respective llama_sampler structure + * @param Pointer to the model data * @return */ - std::unique_ptr GetSamplerFromArgs( - uint32_t topK, float_t topP, float_t frequencyPenalty, float_t repetitionPenalty, uint64_t seed); + std::unique_ptr IntoLlamaSampler(const llama_model *) const; + }; + + class Worker { + protected: + constexpr static auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; public: - /** - * - * @return - */ - static std::expected, TgiLlamaCppBackendError> - FromGGUF(const std::filesystem::path &, uint16_t) noexcept; + using model_ptr_type = std::shared_ptr; + using context_params_type = llama_context_params; + using token_id_type = llama_token; + + private: + const model_ptr_type mModel_; + context_params_type mParams_; - TgiLlamaCppBackend(llama_model *model, llama_context *ctx); + public: + Worker(std::shared_ptr pModel, const llama_context_params ¶ms); - ~TgiLlamaCppBackend(); + void Loop(std::atomic_flag &, std::atomic_uint8_t &, std::queue &) const; + }; + + + class BackendBase { + + private: + std::shared_ptr mModel_; + + public: + explicit BackendBase(llama_model *model); + + ~BackendBase(); /** * - * @param text + * @param tokens + * @params out + * @param params + * @param maxNewTokens * @return */ - [[nodiscard("Tokens will be freed after this call if not assigned to an lvalue")]] - std::vector Tokenize(const std::string &text) const; + [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] + std::expected, BackendError> Generate( + std::span tokens, + std::span out, + const SamplingParams ¶ms, + uint32_t maxNewTokens = std::numeric_limits::max() - 1 + ); /** * * @param tokens - * @param topK - * @param topP - * @param frequencyPenalty - * @param repetitionPenalty + * @param params * @param maxNewTokens - * @param seed * @return */ [[nodiscard("Generated tokens will 
be freed after this call if not assigned to an lvalue")]] - std::expected, TgiLlamaCppBackendError> Generate( - std::span tokens, - uint32_t topK, - float_t topP = 1.0f, - float_t frequencyPenalty = 0.0f, - float_t repetitionPenalty = 0.0f, - uint32_t maxNewTokens = std::numeric_limits::max() - 1, - uint64_t seed = 2014 + std::expected, BackendError> Generate( + std::span tokens, + const SamplingParams ¶ms, + uint32_t maxNewTokens = std::numeric_limits::max() - 1 ); }; - - [[nodiscard("Create backend will be freed after this call if not assigned to an lvalue")]] - std::expected, TgiLlamaCppBackendError> - CreateLlamaCppBackend(const std::filesystem::path &root); } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 09d8af2d9bd..d15728b9a02 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -28,10 +28,10 @@ namespace huggingface::tgi::backends::llamacpp::impl { class LlamaCppBackendImpl { private: - TgiLlamaCppBackend _inner; + BackendBase _inner; public: - LlamaCppBackendImpl(llama_model *model, llama_context *context) : _inner(model, context) {} + LlamaCppBackendImpl(llama_model *model) : _inner(model) {} }; std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath, uint16_t nThreads) { From dbc5b7a0f7defc463ca43ec6eeae43e0a1f2182b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 26 Oct 2024 22:24:05 +0200 Subject: [PATCH 20/92] misc(offline): link correctly --- backends/llamacpp/offline/main.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 56eb88c5464..d8121d3df37 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -2,7 +2,6 @@ // Created by mfuntowicz on 10/3/24. 
// -#include #include #include #include @@ -12,7 +11,7 @@ using namespace huggingface::tgi::backends::llamacpp; -int main(int argc, char** argv) { +int main(int argc, char **argv) { if (argc < 2) { fmt::print("No model folder provider"); return 1; @@ -23,15 +22,16 @@ int main(int argc, char** argv) { const auto prompt = "My name is Morgan"; const auto modelPath = absolute(std::filesystem::path(argv[1])); - if (auto maybeBackend = CreateLlamaCppBackend(modelPath); maybeBackend.has_value()) { + if (auto maybeBackend = TgiLlamaCppBackend::FromGGUF(modelPath); maybeBackend.has_value()) { // Retrieve the backend - const auto& backend = *maybeBackend; + auto [model, context] = *maybeBackend; + auto backend = TgiLlamaCppBackend(model, context); // Generate - const auto promptTokens = backend->Tokenize(prompt); - const auto out = backend->Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); + const auto promptTokens = backend.Tokenize(prompt); + const auto out = backend.Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); - if(out.has_value()) + if (out.has_value()) fmt::print(FMT_STRING("Generated: {}"), *out); else { const auto err = out.error(); From 611590440dd0b0bf23d04ba5604ecead8ef509b3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 28 Oct 2024 22:44:47 +0100 Subject: [PATCH 21/92] misc(offline): expose more parameters for generate --- backends/llamacpp/src/backend.rs | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 0693ed34a8b..af50470d2a3 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,12 +1,13 @@ use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; use async_trait::async_trait; -use cxx::UniquePtr; +use cxx::{Exception, UniquePtr}; +use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::Arc; +use std::thread::spawn; use text_generation_router::infer::{Backend, InferError, 
InferStreamResponse}; use text_generation_router::validation::ValidGenerateRequest; use thiserror::Error; -use tokio::task::spawn_blocking; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::info; @@ -48,12 +49,27 @@ impl LlamaCppBackend { path.display() ); - spawn_blocking(move || scheduler_loop(backend)); + let j = spawn(|| scheduler_loop(backend)); + j.join().ok(); Ok(Self {}) } } -async fn scheduler_loop(mut backend: UniquePtr) {} +fn scheduler_loop(mut backend: UniquePtr) { + println!("Scheduler loop"); + let tokens = [128000i32, 5159, 836, 374, 23809]; + let mut generated = vec![0i32; 128]; + match backend + .pin_mut() + .generate(&tokens, &mut generated, 40, 32, 1.0, 1.0, 1.0, 1.0, 2014) + { + Ok(n_tokens) => { + generated.truncate(n_tokens); + println!("Generated {} tokens -> {:?}", n_tokens, generated); + } + Err(err) => println!("Error: {}", err), + } +} #[async_trait] impl Backend for LlamaCppBackend { From b98c635781e365a9669eca8ba7f2b770b4893855 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 30 Oct 2024 22:40:37 +0100 Subject: [PATCH 22/92] feat(backend): entirely rewrite backend --- backends/llamacpp/csrc/backend.cpp | 155 +++++++++++++++++++++-------- backends/llamacpp/csrc/backend.hpp | 148 +++++++++++++++++++++------ 2 files changed, 230 insertions(+), 73 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 080a4401409..daf8de54484 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -16,85 +16,156 @@ namespace huggingface::tgi::backends::llamacpp { - std::unique_ptr SamplingParams::IntoLlamaSampler(const llama_model *pModel) const { + void llama_batch_fill_prompt(llama_batch &batch, std::span input_tokens) { + for (auto i = 0; i < input_tokens.size(); ++i) { + batch.token[i] = input_tokens[i]; + batch.pos[i] = i; + batch.n_seq_id[i] = 1; + batch.seq_id[i] = 0; + batch.logits[i] = false; + ++batch.n_tokens; + } + + 
batch.logits[batch.n_tokens] = true; + } + + std::unique_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { auto *pSampler = llama_sampler_chain_init({.no_perf = false}); // Penalties llama_sampler_chain_add(pSampler, llama_sampler_init_penalties( - llama_n_vocab(pModel), - llama_token_eos(pModel), - llama_token_nl(pModel), + llama_n_vocab(model), + llama_token_eos(model), + llama_token_nl(model), 0.0f, - repetitionPenalty, - frequencyPenalty, + repetition_penalty, + frequency_penalty, 0.0f, false, false )); - llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(topK))); + llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); - if (0 < topP && topP < 1) { - llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(topP, 1)); + if (0 < top_p && top_p < 1) { + llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); } llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); return std::unique_ptr(pSampler); } - Worker::Worker(std::shared_ptr pModel, const llama_context_params ¶ms) - : mModel_(pModel), mParams_(params) { + worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) + : mModel_(model), mParams_(params) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; - llama_model_meta_val_str(pModel.get(), "general.name", modelName, sizeof(modelName)); + llama_model_meta_val_str(model.get(), "general.name", modelName, sizeof(modelName)); SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } - void Worker::Loop(std::atomic_flag &running, std::atomic_uint8_t &waiting, std::queue &backlog) { + void worker_t::loop(std::stop_source &driver, std::queue &backlog) const { auto *context = llama_new_context_with_model(mModel_.get(), mParams_); - while (running.test(std::memory_order_acquire)) { - if (waiting.load(std::memory_order_acquire) > 0) { - --waiting; + while (!driver.stop_requested()) { + const auto 
generation_context = backlog.front(); + + generate(context, generation_context, std::nullopt); + backlog.pop(); + + SPDLOG_DEBUG("Processed request ({:d} remaining)", backlog.size()); + } + + llama_free(context); + } + + size_t worker_t::generate( + llama_context *context, + const generation_context_t &generation_context, + const std::optional &callback) const { + // Store information about context and generation size + auto prompt_length = std::ssize(generation_context.input_tokens); + auto max_new_tokens = generation_context.generation_params.max_new_tokens; + + // Convert sampling params to what llama.cpp is looking for + auto sampler = generation_context.sampling_params.into_llama_sampler(mModel_.get()); - auto request = backlog.front(); - auto sampler = request.IntoLlamaSampler(mModel_.get()); + // Setup the prompt + auto copy = std::vector(generation_context.input_tokens.begin(), generation_context.input_tokens.end()); + auto batch = llama_batch_get_one(copy.data(), copy.size()); + + // Decode + auto n_decoded_tokens = 0; + for (bool generating = true; generating && n_decoded_tokens < max_new_tokens; ++n_decoded_tokens) { + const auto callback_ = callback.value_or(llama_void_callback); - // Retrieve decoding context - auto batch = llama_batch_get_one(tokens.data(), tokens.size()); - // Decode - for (auto [generating, nDecoded] = std::pair{true, 0uz}; generating && nDecoded < 1; ++nDecoded) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG - const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(context, batch); - const auto end = std::chrono::steady_clock::now(); - const auto latency = std::chrono::duration_cast(end - start); - SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(context, batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); 
+ SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(ctx, batch); + const auto status = llama_decode(ctx, batch); #endif - if (LLAMA_SUCCESS(status)) { - // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); - generated.emplace_back(new_token_id); - generating = !llama_token_is_eog(mModel_.get(), new_token_id); + batch.n_tokens = 0; + if (LLAMA_SUCCESS(status)) { + // Sample the new token + auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); + auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); - // Next iteration - batch = llama_batch_get_one(&new_token_id, 1); - } - } + generation_context.generated_tokens[n_decoded_tokens] = new_token_id; + generating = !is_eos; - backlog.pop(); + // Bubble up the generated token if a callback is provided + std::invoke(std::forward(callback_), new_token_id, is_eos); + batch = llama_batch_get_one(&new_token_id, 1); } } - llama_free(context); + return n_decoded_tokens; + } + + + backend_base_t::backend_base_t(llama_model *model) : mModel_(model, llama_free_model) { llama_backend_init(); } + + backend_base_t::~backend_base_t() { llama_backend_free(); } + + std::expected, backend_error_t> backend_base_t::generate( + std::span tokens, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) { + // TODO: Should we provide a way to change this value? 
+ auto generated = std::vector(2 << 8); + + auto nTokensGenerated = generate(tokens, generated, generation_params, sampling_params, callback); + if (nTokensGenerated.has_value()) + generated.resize(*nTokensGenerated); + return generated; } - huggingface::tgi::backends::llamacpp::BackendBase::BackendBase(llama_model *model) - : mModel_(model, llama_free_model) { llama_backend_init(); } - BackendBase::~BackendBase() { llama_backend_free(); } + /** Single worker_t Backend impl **/ + + single_worker_backend_t::single_worker_backend_t(llama_model *model, + const std::optional ¶ms) + : backend_base_t(model), + mContext_(llama_context_factory(model)), + mWorker_(mModel_, params.value_or(llama_context_default_params())) { + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); + } + + std::expected + single_worker_backend_t::generate( + std::span tokens, + std::span out, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) { + return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens, out}, callback); + } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e4814d45689..e7545a3c9e1 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -8,25 +8,42 @@ #include #include #include +#include #include #include +#include #include +#include #include #include +#include #define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llamacpp { - enum BackendError : uint8_t { + + static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; + typedef std::unique_ptr llama_context_smart_ptr; + + typedef std::function llama_decode_callback; + static constexpr auto llama_void_callback = [](llama_token token_id, bool is_eos) {}; + + /** + * + */ + enum backend_error_t : uint8_t { MODEL_FILE_DOESNT_EXIST = 1 }; - struct 
SamplingParams { - uint32_t topK = std::numeric_limits::max(); - float_t topP = 1.0f; - float_t frequencyPenalty = 0.0f; - float_t repetitionPenalty = 0.0f; + /** + * + */ + struct sampling_params_t { + uint32_t top_k = std::numeric_limits::max(); + float_t top_p = 1.0f; + float_t frequency_penalty = 0.0f; + float_t repetition_penalty = 0.0f; uint64_t seed = 2014; /** @@ -34,38 +51,72 @@ namespace huggingface::tgi::backends::llamacpp { * @param Pointer to the model data * @return */ - std::unique_ptr IntoLlamaSampler(const llama_model *) const; + std::unique_ptr into_llama_sampler(const llama_model *pModel) const; }; - class Worker { - protected: - constexpr static auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; + /** + * + */ + struct generation_params_t { + uint32_t max_new_tokens = std::numeric_limits::max(); + }; - public: - using model_ptr_type = std::shared_ptr; - using context_params_type = llama_context_params; - using token_id_type = llama_token; + struct generation_context_t { + generation_params_t generation_params; + sampling_params_t sampling_params; + std::span input_tokens; + std::span generated_tokens; + }; + /** + * + */ + class worker_t { private: - const model_ptr_type mModel_; - context_params_type mParams_; + const std::shared_ptr mModel_; + const llama_context_params mParams_; public: - Worker(std::shared_ptr pModel, const llama_context_params ¶ms); + /** + * + * @param model + * @param params + */ + worker_t(std::shared_ptr model, const llama_context_params ¶ms); - void Loop(std::atomic_flag &, std::atomic_uint8_t &, std::queue &) const; + /** + * + * @param context + * @param generation_context + * @param callback + */ + size_t + generate(llama_context *, const generation_context_t &, const std::optional &) const; + + /** + * + */ + void loop(std::stop_source &driver, std::queue &backlog) const; }; - class BackendBase { + class backend_base_t { - private: + protected: std::shared_ptr mModel_; public: - 
explicit BackendBase(llama_model *model); - ~BackendBase(); + /** + * + * @param model + */ + explicit backend_base_t(llama_model *model); + + /** + * Destructor + */ + ~backend_base_t(); /** * @@ -76,12 +127,13 @@ namespace huggingface::tgi::backends::llamacpp { * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, BackendError> Generate( - std::span tokens, - std::span out, - const SamplingParams ¶ms, - uint32_t maxNewTokens = std::numeric_limits::max() - 1 - ); + virtual std::expected generate( + std::span input_tokens, + std::span generated_tokens, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) = 0; /** * @@ -91,12 +143,46 @@ namespace huggingface::tgi::backends::llamacpp { * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, BackendError> Generate( + std::expected, backend_error_t> generate( std::span tokens, - const SamplingParams ¶ms, - uint32_t maxNewTokens = std::numeric_limits::max() - 1 + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback = std::nullopt ); }; + + + class single_worker_backend_t : backend_base_t { + private: + constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_smart_ptr { + auto llParams = llama_context_default_params(); + llParams.flash_attn = true; + llParams.n_batch = 1; + llParams.no_perf = true; + llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL; + + return {llama_new_context_with_model(pModel, llParams), llama_context_deleter}; + }; + + llama_context_smart_ptr mContext_; + worker_t mWorker_; + + public: + explicit single_worker_backend_t(llama_model *pModel, const std::optional &); + + using backend_base_t::generate; + + std::expected + generate( + std::span tokens, + std::span 
out, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) override; + + + }; } #endif //TGI_LLAMA_CPP_BACKEND_BACKEND_HPP From 6a5f6b07551bf59d45b1b800779bda4b98709722 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 30 Oct 2024 22:40:49 +0100 Subject: [PATCH 23/92] misc(offline): update offline tester --- backends/llamacpp/CMakeLists.txt | 2 +- backends/llamacpp/offline/main.cpp | 36 ++++++++++++------------------ 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index adcc6af29dc..e536efc57a2 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -55,7 +55,7 @@ if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llamacpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llama_cpp_backend_impl llama common) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama common spdlog::spdlog fmt::fmt) endif () diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index d8121d3df37..57e55efefe0 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -22,27 +22,19 @@ int main(int argc, char **argv) { const auto prompt = "My name is Morgan"; const auto modelPath = absolute(std::filesystem::path(argv[1])); - if (auto maybeBackend = TgiLlamaCppBackend::FromGGUF(modelPath); maybeBackend.has_value()) { - // Retrieve the backend - auto [model, context] = *maybeBackend; - auto backend = TgiLlamaCppBackend(model, context); - - // Generate - const auto promptTokens = backend.Tokenize(prompt); - const auto out = backend.Generate(promptTokens, 30, 1.0, 2.0, 0.0, 32); - - if (out.has_value()) - fmt::print(FMT_STRING("Generated: {}"), *out); - else { - const auto err = out.error(); - 
fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); - } - - } else { - switch (maybeBackend.error()) { - case TgiLlamaCppBackendError::MODEL_FILE_DOESNT_EXIST: - fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Specified file {} doesnt exist", modelPath); - return maybeBackend.error(); - } + const auto params = llama_model_default_params(); + auto *model = llama_load_model_from_file(modelPath.c_str(), params); + + auto backend = single_worker_backend_t(model, {}); + + // generate + const auto promptTokens = {128000, 9906, 856, 836, 374, 23809, 128001}; + const auto out = backend.generate(promptTokens, {.max_new_tokens = 32}, {.top_k = 40}); + + if (out.has_value()) + fmt::print(FMT_STRING("Generated: {}"), *out); + else { + const auto err = out.error(); + fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); } } From d52b4c497887e097ea189fae8d29431e16e1e905 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 31 Oct 2024 17:51:57 +0100 Subject: [PATCH 24/92] feat(backend): full rework of the backend internal to safer c++ --- backends/llamacpp/csrc/backend.cpp | 16 ++++-- backends/llamacpp/csrc/backend.hpp | 12 +++++ backends/llamacpp/csrc/ffi.hpp | 86 ++++++++++++++++++++++++------ backends/llamacpp/src/backend.rs | 43 ++++++++------- backends/llamacpp/src/lib.rs | 51 +++++++++++++++--- backends/llamacpp/src/main.rs | 2 +- 6 files changed, 166 insertions(+), 44 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index daf8de54484..f2f5d4c6aca 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -21,7 +21,7 @@ namespace huggingface::tgi::backends::llamacpp { batch.token[i] = input_tokens[i]; batch.pos[i] = i; batch.n_seq_id[i] = 1; - batch.seq_id[i] = 0; + batch.seq_id[i] = nullptr; batch.logits[i] = false; ++batch.n_tokens; } @@ -84,13 +84,12 @@ namespace 
huggingface::tgi::backends::llamacpp { const generation_context_t &generation_context, const std::optional &callback) const { // Store information about context and generation size - auto prompt_length = std::ssize(generation_context.input_tokens); auto max_new_tokens = generation_context.generation_params.max_new_tokens; // Convert sampling params to what llama.cpp is looking for auto sampler = generation_context.sampling_params.into_llama_sampler(mModel_.get()); - // Setup the prompt + // Set up the prompt auto copy = std::vector(generation_context.input_tokens.begin(), generation_context.input_tokens.end()); auto batch = llama_batch_get_one(copy.data(), copy.size()); @@ -168,4 +167,15 @@ namespace huggingface::tgi::backends::llamacpp { ) { return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens, out}, callback); } + + std::expected + multi_worker_backend_t::generate( + std::span, + std::span, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback) { + SPDLOG_ERROR("Not implemented yet"); + return 0uz; + } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e7545a3c9e1..871490f255b 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -180,8 +180,20 @@ namespace huggingface::tgi::backends::llamacpp { const sampling_params_t &sampling_params, const std::optional &callback ) override; + }; + class multi_worker_backend_t : backend_base_t { + private: + llama_context_smart_ptr mContext_; + public: + std::expected generate( + std::span, + std::span, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + const std::optional &callback + ) override; }; } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index d15728b9a02..182541141c6 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ 
b/backends/llamacpp/csrc/ffi.hpp @@ -12,36 +12,92 @@ #include #include "backend.hpp" -namespace huggingface::tgi::backends::llamacpp::impl { - class LlamaCppBackendImpl; +namespace huggingface::tgi::backends::llamacpp { + struct generation_params_t; + struct sampling_params_t; + + class llama_cpp_backend_impl_t; } #include "backends/llamacpp/src/lib.rs.h" -namespace huggingface::tgi::backends::llamacpp::impl { +namespace huggingface::tgi::backends::llamacpp { + + // Concept identifying types which have a .generate() -> size_t method to do in-place generation + template + concept has_emplace_generate = requires( + T t, + std::span input_tokens, + std::span generated_tokens, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + llama_decode_callback callback + ) { + { + t.generate(input_tokens, generated_tokens, generation_params, sampling_params, callback) + } -> std::same_as>; + }; + + static_assert(has_emplace_generate, + "single_worker_backend_t doesn't meet concept is_generate_emplace_capable"); + static_assert(has_emplace_generate, + "multi_worker_backend_t doesn't meet concept is_generate_emplace_capable"); - class LlamaCppBackendException : std::exception { + class llama_cpp_backend_exception_t : std::exception { }; - class LlamaCppBackendImpl { + /** + * Llama.cpp backend interfacing with Rust FFI layer + */ + class llama_cpp_backend_impl_t { private: - BackendBase _inner; + std::variant mInner_; public: - LlamaCppBackendImpl(llama_model *model) : _inner(model) {} - }; + explicit llama_cpp_backend_impl_t(single_worker_backend_t &&backend) : mInner_(std::move(backend)) {} + + explicit llama_cpp_backend_impl_t(multi_worker_backend_t &&backend) : mInner_(std::move(backend)) {} - std::unique_ptr CreateLlamaCppBackendImpl(rust::Str modelPath, uint16_t nThreads) { - const auto cxxPath = std::string_view(modelPath); - if (auto maybe = TgiLlamaCppBackend::FromGGUF(std::filesystem::path(cxxPath), nThreads); maybe.has_value()) { 
- auto [model, context] = *maybe; - return std::make_unique(model, context); - } else { - throw LlamaCppBackendException(); + size_t generate( + rust::Slice input_tokens, + rust::Slice generated_tokens, + const generation_params_t &generation_params, + const sampling_params_t &sampling_params, + rust::Fn callback + ) { + // Define the visitor lambda function which requires the has_emplace_generate constraint on T + static auto inner_fw = [=, &generation_params, &sampling_params](T &&backend) + -> std::expected { + + // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* + auto input_tokens_v = + std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); + auto generated_tokens_v = + std::span(reinterpret_cast(generated_tokens.data()), generated_tokens.size()); + + return backend.generate( + input_tokens_v, generated_tokens_v, generation_params, sampling_params, callback); + }; + + if (const auto result = std::visit(inner_fw, mInner_); result.has_value()) { + return *result; + } else { + throw llama_cpp_backend_exception_t(); + } } + }; + + std::unique_ptr create_single_worker_backend(rust::Str modelPath) { + const auto cxxPath = std::string(modelPath); + auto params = llama_model_default_params(); + params.use_mmap = true; + + auto *model = llama_load_model_from_file(cxxPath.c_str(), params); + auto backend = single_worker_backend_t(model, std::nullopt); + return std::make_unique(std::move(backend)); } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index af50470d2a3..6e9e8d2d8af 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,7 +1,8 @@ -use crate::ffi::{create_llamacpp_backend, LlamaCppBackendImpl}; +use crate::ffi::{ + create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, +}; use async_trait::async_trait; use cxx::{Exception, UniquePtr}; -use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::Arc; 
use std::thread::spawn; @@ -25,10 +26,7 @@ pub enum LlamaCppBackendError { pub struct LlamaCppBackend {} impl LlamaCppBackend { - pub fn new + Send>( - model_path: P, - n_threads: u16, - ) -> Result { + pub fn new + Send>(model_path: P) -> Result { let path = Arc::new(model_path.as_ref()); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( @@ -36,13 +34,12 @@ impl LlamaCppBackend { )); } - let mut backend = - create_llamacpp_backend(path.to_str().unwrap(), n_threads).map_err(|err| { - LlamaCppBackendError::ModelInitializationFailed( - path.to_path_buf(), - err.what().to_string(), - ) - })?; + let mut backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { + LlamaCppBackendError::ModelInitializationFailed( + path.to_path_buf(), + err.what().to_string(), + ) + })?; info!( "Successfully initialized llama.cpp backend from {}", @@ -57,12 +54,20 @@ impl LlamaCppBackend { fn scheduler_loop(mut backend: UniquePtr) { println!("Scheduler loop"); - let tokens = [128000i32, 5159, 836, 374, 23809]; - let mut generated = vec![0i32; 128]; - match backend - .pin_mut() - .generate(&tokens, &mut generated, 40, 32, 1.0, 1.0, 1.0, 1.0, 2014) - { + let tokens = [128000u32, 5159, 836, 374, 23809]; + let mut generated = vec![0u32; 16]; + let generation_params = GenerationParams { + max_new_tokens: generated.len() as u32, + }; + let sampling_params = SamplingParams::default(); + + match backend.pin_mut().generate( + &tokens, + &mut generated, + &generation_params, + &sampling_params, + |new_token_id: u32, is_eos: bool| println!("Generated {new_token_id} (is_eos: {is_eos})"), + ) { Ok(n_tokens) => { generated.truncate(n_tokens); println!("Generated {} tokens -> {:?}", n_tokens, generated); diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 673fe130255..9fb79501ba9 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,17 +1,56 @@ +use crate::ffi::SamplingParams; + pub mod backend; 
-#[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp::impl")] +impl Default for SamplingParams { + fn default() -> Self { + Self { + top_k: u32::MAX, + top_p: 1.0f32, + frequency_penalty: 0.0f32, + repetition_penalty: 0.0f32, + seed: 2014u64, + } + } +} + +#[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { + struct GenerationParams { + max_new_tokens: u32, + } + + struct SamplingParams { + top_k: u32, + top_p: f32, + frequency_penalty: f32, + repetition_penalty: f32, + seed: u64, + } + unsafe extern "C++" { include!("backends/llamacpp/csrc/ffi.hpp"); + #[cxx_name = "generation_params_t"] + type GenerationParams; + + #[cxx_name = "sampling_params_t"] + type SamplingParams; + /// Represent an instance of the llama.cpp backend instance on C++ side + #[cxx_name = "llama_cpp_backend_impl_t"] type LlamaCppBackendImpl; - #[rust_name = "create_llamacpp_backend"] - fn CreateLlamaCppBackendImpl( - modelPath: &str, - n_threads: u16, - ) -> Result>; + #[rust_name = "create_single_worker_backend"] + fn create_single_worker_backend(modelPath: &str) -> Result>; + + fn generate( + self: Pin<&mut LlamaCppBackendImpl>, + tokens: &[u32], + generated: &mut [u32], + generation_params: &GenerationParams, + sampling_params: &SamplingParams, + callback: fn(u32, bool), + ) -> Result; } } diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 3920da21d9d..62f81848744 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -161,7 +161,7 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(gguf_path, cores_per_instance)?; + let backend = LlamaCppBackend::new(gguf_path)?; // Run server server::run( From 3af2c6837c77d0d38a7feaf8819289615ce9821a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 31 Oct 2024 17:52:18 +0100 Subject: [PATCH 25/92] misc(offline): match rework --- backends/llamacpp/offline/main.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 
deletions(-) diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 57e55efefe0..7eb7dbde0a9 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -18,9 +18,7 @@ int main(int argc, char **argv) { } spdlog::set_level(spdlog::level::debug); - - const auto prompt = "My name is Morgan"; - + const auto modelPath = absolute(std::filesystem::path(argv[1])); const auto params = llama_model_default_params(); auto *model = llama_load_model_from_file(modelPath.c_str(), params); @@ -28,7 +26,7 @@ int main(int argc, char **argv) { auto backend = single_worker_backend_t(model, {}); // generate - const auto promptTokens = {128000, 9906, 856, 836, 374, 23809, 128001}; + const auto promptTokens = {128000, 5159, 836, 374, 23809, 11}; const auto out = backend.generate(promptTokens, {.max_new_tokens = 32}, {.top_k = 40}); if (out.has_value()) From f39edc72ff4eaa3226d3ea469ebad6c107dfd5cb Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 31 Oct 2024 21:32:29 +0100 Subject: [PATCH 26/92] feat(backend): add mapping for ignore_eos_token stopping criteria --- backends/llamacpp/csrc/backend.cpp | 6 ++++-- backends/llamacpp/csrc/backend.hpp | 3 ++- backends/llamacpp/src/lib.rs | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index f2f5d4c6aca..665f78df789 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -113,8 +113,10 @@ namespace huggingface::tgi::backends::llamacpp { auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); - generation_context.generated_tokens[n_decoded_tokens] = new_token_id; - generating = !is_eos; + if (!generation_context.generation_params.ignore_eos_token) { + generation_context.generated_tokens[n_decoded_tokens] = new_token_id; + generating = !is_eos; + } // Bubble up the 
generated token if a callback is provided std::invoke(std::forward(callback_), new_token_id, is_eos); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 871490f255b..44952a5ddbf 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -27,7 +27,7 @@ namespace huggingface::tgi::backends::llamacpp { typedef std::unique_ptr llama_context_smart_ptr; typedef std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token token_id, bool is_eos) {}; + static constexpr auto llama_void_callback = [](llama_token, bool) {}; /** * @@ -59,6 +59,7 @@ namespace huggingface::tgi::backends::llamacpp { */ struct generation_params_t { uint32_t max_new_tokens = std::numeric_limits::max(); + bool ignore_eos_token = false; }; struct generation_context_t { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 9fb79501ba9..33088d54c25 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -18,6 +18,7 @@ impl Default for SamplingParams { mod ffi { struct GenerationParams { max_new_tokens: u32, + ignore_eos_token: bool, } struct SamplingParams { From d4aee42fd8dc16113c42c1d6032f405717c5794b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 1 Nov 2024 00:49:50 +0100 Subject: [PATCH 27/92] feat(backend): add logit parameter in the callback fn --- backends/llamacpp/csrc/backend.cpp | 4 +++- backends/llamacpp/csrc/backend.hpp | 4 ++-- backends/llamacpp/csrc/ffi.hpp | 2 +- backends/llamacpp/src/lib.rs | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 665f78df789..50d5897cb25 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -111,6 +111,7 @@ namespace huggingface::tgi::backends::llamacpp { if (LLAMA_SUCCESS(status)) { // Sample the new token auto new_token_id = 
llama_sampler_sample(sampler.get(), context, -1); + auto new_token_logits = 0.0f; // TODO: return logit auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); if (!generation_context.generation_params.ignore_eos_token) { @@ -119,7 +120,8 @@ namespace huggingface::tgi::backends::llamacpp { } // Bubble up the generated token if a callback is provided - std::invoke(std::forward(callback_), new_token_id, is_eos); + std::invoke( + std::forward(callback_), new_token_id, new_token_logits, is_eos); batch = llama_batch_get_one(&new_token_id, 1); } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 44952a5ddbf..288bf36afce 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -26,8 +26,8 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; typedef std::unique_ptr llama_context_smart_ptr; - typedef std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token, bool) {}; + typedef std::function llama_decode_callback; + static constexpr auto llama_void_callback = [](llama_token, float_t, bool) {}; /** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 182541141c6..5c404b01176 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -66,7 +66,7 @@ namespace huggingface::tgi::backends::llamacpp { rust::Slice generated_tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - rust::Fn callback + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T static auto inner_fw = [=, &generation_params, &sampling_params](T &&backend) diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 33088d54c25..8d51a15a1bb 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs 
@@ -51,7 +51,7 @@ mod ffi { generated: &mut [u32], generation_params: &GenerationParams, sampling_params: &SamplingParams, - callback: fn(u32, bool), + callback: fn(u32, f32, bool), ) -> Result; } } From 612f2f939f2b40d76db4a77032695fb90e1fd084 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 1 Nov 2024 00:50:42 +0100 Subject: [PATCH 28/92] feat(backend): bind incoming request to the server --- backends/llamacpp/src/backend.rs | 158 +++++++++++++++++++++++++------ backends/llamacpp/src/lib.rs | 2 + 2 files changed, 129 insertions(+), 31 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 6e9e8d2d8af..670f4397901 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -2,18 +2,54 @@ use crate::ffi::{ create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, }; use async_trait::async_trait; -use cxx::{Exception, UniquePtr}; +use cxx::UniquePtr; use std::path::{Path, PathBuf}; +use std::sync::mpsc::{channel, Receiver, SendError, Sender}; use std::sync::Arc; -use std::thread::spawn; +use std::thread::{spawn, JoinHandle}; use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; -use text_generation_router::validation::ValidGenerateRequest; +use text_generation_router::validation::{ + ValidGenerateRequest, ValidParameters, ValidStoppingParameters, +}; +use text_generation_router::Token; use thiserror::Error; +use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; +use tokio::sync::TryAcquireError; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::info; +use tracing::{error, info}; unsafe impl Send for LlamaCppBackendImpl {} +impl From<&ValidParameters> for SamplingParams { + fn from(v: &ValidParameters) -> Self { + Self { + top_k: v.top_k, + top_p: v.top_p, + frequency_penalty: v.frequency_penalty, + repetition_penalty: v.repetition_penalty, + seed: v.seed, + } + } +} + +impl From<&ValidStoppingParameters> for 
GenerationParams { + fn from(v: &ValidStoppingParameters) -> Self { + Self { + max_new_tokens: v.max_new_tokens, + ignore_eos_token: v.ignore_eos_token, + } + } +} + +#[cfg_attr(debug_assertions, derive(Debug))] +struct InferContext { + pub(crate) stream: UnboundedSender>, + pub(crate) input_tokens: Arc>, + pub(crate) generated_tokens: Vec, + pub(crate) generation_params: GenerationParams, + pub(crate) sampling_params: SamplingParams, +} + #[derive(Debug, Error)] pub enum LlamaCppBackendError { #[error("Provided GGUF model path {0} doesn't exist")] @@ -23,7 +59,10 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -pub struct LlamaCppBackend {} +pub struct LlamaCppBackend { + backlog: Sender, + scheduler_handle: JoinHandle<()>, +} impl LlamaCppBackend { pub fn new + Send>(model_path: P) -> Result { @@ -34,7 +73,7 @@ impl LlamaCppBackend { )); } - let mut backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { + let backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { LlamaCppBackendError::ModelInitializationFailed( path.to_path_buf(), err.what().to_string(), @@ -46,33 +85,67 @@ impl LlamaCppBackend { path.display() ); - let j = spawn(|| scheduler_loop(backend)); - j.join().ok(); - Ok(Self {}) + let (submitter, receiver) = channel(); + let handle = spawn(|| scheduler_loop(backend, receiver)); + Ok(Self { + backlog: submitter, + scheduler_handle: handle, + }) } } -fn scheduler_loop(mut backend: UniquePtr) { - println!("Scheduler loop"); - let tokens = [128000u32, 5159, 836, 374, 23809]; - let mut generated = vec![0u32; 16]; - let generation_params = GenerationParams { - max_new_tokens: generated.len() as u32, - }; - let sampling_params = SamplingParams::default(); - - match backend.pin_mut().generate( - &tokens, - &mut generated, - &generation_params, - &sampling_params, - |new_token_id: u32, is_eos: bool| println!("Generated {new_token_id} (is_eos: {is_eos})"), - ) { - Ok(n_tokens) => { - 
generated.truncate(n_tokens); - println!("Generated {} tokens -> {:?}", n_tokens, generated); +fn scheduler_loop( + mut backend: UniquePtr, + mut backlog: Receiver, +) { + loop { + println!("Looping"); + if let Ok(mut ctx) = backlog.recv() { + println!("{ctx:?}, {}", &ctx.generated_tokens.capacity()); + match backend.pin_mut().generate( + &ctx.input_tokens, + &mut ctx.generated_tokens, + &ctx.generation_params, + &ctx.sampling_params, + |new_token_id: u32, new_token_logit: f32, is_eos: bool| { + let response = InferStreamResponse::Intermediate { + token: Token { + id: new_token_id, + text: "".to_string(), + logprob: new_token_logit, + special: false, + }, + top_tokens: vec![], + }; + println!("Generated token: {response:?}"); + // let _ = tokio::spawn(async { + // match ctx.stream.send(Ok(response)) { + // Ok(_) => {} + // Err(ref err) => { + // error!( + // "Failed to send back token to the client: {}", + // err.to_string() + // ); + // } + // } + // }); + }, + ) { + Ok(n_tokens) => { + unsafe { + ctx.generated_tokens.set_len(n_tokens); + } + println!( + "Generated {} tokens -> {:?}", + n_tokens, &ctx.generated_tokens + ); + } + Err(err) => println!("Error: {}", err), + } + } else { + info!("IPC channel is closed, exiting the scheduler loop"); + break; } - Err(err) => println!("Error: {}", err), } } @@ -80,9 +153,32 @@ fn scheduler_loop(mut backend: UniquePtr) { impl Backend for LlamaCppBackend { fn schedule( &self, - _request: ValidGenerateRequest, + request: ValidGenerateRequest, ) -> Result>, InferError> { - Err(InferError::GenerationError("Not implemented yet".into())) + if let Some(input_ids) = request.input_ids { + let (sx, rx) = unbounded_channel(); + let sampling_params = SamplingParams::from(&request.parameters); + let generation_params = GenerationParams::from(&request.stopping_parameters); + + let ctx = InferContext { + stream: sx, + input_tokens: Arc::clone(&input_ids), + generated_tokens: Vec::with_capacity(generation_params.max_new_tokens as usize), 
+ generation_params, + sampling_params, + }; + + match self.backlog.send(ctx) { + Ok(_) => Ok(UnboundedReceiverStream::new(rx)), + Err(_) => Err(InferError::GenerationError( + "Failed to sent the request".to_string(), + )), + } + } else { + Err(InferError::GenerationError( + "Unsupported modalities".to_string(), + )) + } } async fn health(&self, _: bool) -> bool { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 8d51a15a1bb..489188c1a6f 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -16,11 +16,13 @@ impl Default for SamplingParams { #[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { + #[derive(Debug, Copy, Clone)] struct GenerationParams { max_new_tokens: u32, ignore_eos_token: bool, } + #[derive(Debug, Copy, Clone)] struct SamplingParams { top_k: u32, top_p: f32, From b50dcddbb8d5b02633083dbcb626d33b531fc9b3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:36:32 +0100 Subject: [PATCH 29/92] feat(backend): avoid dropping the boxed stream at the end of the callback --- backends/llamacpp/csrc/ffi.hpp | 21 +++++++-- backends/llamacpp/src/backend.rs | 80 +++++++++++++++++++------------- backends/llamacpp/src/lib.rs | 23 +++++++-- 3 files changed, 84 insertions(+), 40 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 5c404b01176..c823b72b83f 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -21,6 +21,7 @@ namespace huggingface::tgi::backends::llamacpp { #include "backends/llamacpp/src/lib.rs.h" +#include "rust/cxx.h" namespace huggingface::tgi::backends::llamacpp { @@ -61,17 +62,22 @@ namespace huggingface::tgi::backends::llamacpp { explicit llama_cpp_backend_impl_t(multi_worker_backend_t &&backend) : mInner_(std::move(backend)) {} - size_t generate( + size_t stream( rust::Slice input_tokens, rust::Slice generated_tokens, - const generation_params_t &generation_params, + const 
generation_params_t generation_params, const sampling_params_t &sampling_params, - rust::Fn callback + OpaqueStream *stream, + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T - static auto inner_fw = [=, &generation_params, &sampling_params](T &&backend) + static auto inner_fw = [=, &sampling_params, &stream, &callback](T &&backend) -> std::expected { + auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos){ + callback(stream, new_token_id, logits, is_eos); + }; + // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* auto input_tokens_v = std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); @@ -79,7 +85,12 @@ namespace huggingface::tgi::backends::llamacpp { std::span(reinterpret_cast(generated_tokens.data()), generated_tokens.size()); return backend.generate( - input_tokens_v, generated_tokens_v, generation_params, sampling_params, callback); + input_tokens_v, + generated_tokens_v, + generation_params, + sampling_params, + context_forwarding_callback + ); }; if (const auto result = std::visit(inner_fw, mInner_); result.has_value()) { diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 670f4397901..09afbc7bec0 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,23 +1,27 @@ use crate::ffi::{ create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, }; +use crate::OpaqueStream; use async_trait::async_trait; use cxx::UniquePtr; use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Receiver, SendError, Sender}; use std::sync::Arc; use std::thread::{spawn, JoinHandle}; -use text_generation_router::infer::{Backend, InferError, InferStreamResponse}; +use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::{ 
ValidGenerateRequest, ValidParameters, ValidStoppingParameters, }; -use text_generation_router::Token; +use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::sync::TryAcquireError; +use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{error, info}; +type BoxedOpaqueStream = Box; + unsafe impl Send for LlamaCppBackendImpl {} impl From<&ValidParameters> for SamplingParams { @@ -86,7 +90,7 @@ impl LlamaCppBackend { ); let (submitter, receiver) = channel(); - let handle = spawn(|| scheduler_loop(backend, receiver)); + let handle = unsafe { spawn(|| scheduler_loop(backend, receiver)) }; Ok(Self { backlog: submitter, scheduler_handle: handle, @@ -94,47 +98,59 @@ impl LlamaCppBackend { } } -fn scheduler_loop( +fn llama_generate_callback( + channel: *mut OpaqueStream, + new_token_id: u32, + new_token_logit: f32, + is_eos: bool, +) { + let response = InferStreamResponse::Intermediate { + token: Token { + id: new_token_id, + text: "".to_string(), + logprob: new_token_logit, + special: false, + }, + top_tokens: vec![], + }; + println!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); + + unsafe { + if let Err(ref err) = (*channel).0.send(Ok(response)) { + error!( + "Failed to send back token to the client: {}", + err.to_string() + ); + } + } +} + +unsafe fn scheduler_loop( mut backend: UniquePtr, mut backlog: Receiver, ) { loop { - println!("Looping"); if let Ok(mut ctx) = backlog.recv() { - println!("{ctx:?}, {}", &ctx.generated_tokens.capacity()); - match backend.pin_mut().generate( + let stream = BoxedOpaqueStream::new(OpaqueStream(ctx.stream)); + let stream_ptr = Box::into_raw(stream); + let result = backend.pin_mut().stream( &ctx.input_tokens, &mut ctx.generated_tokens, - &ctx.generation_params, + ctx.generation_params, &ctx.sampling_params, - |new_token_id: u32, new_token_logit: f32, is_eos: bool| { - let 
response = InferStreamResponse::Intermediate { - token: Token { - id: new_token_id, - text: "".to_string(), - logprob: new_token_logit, - special: false, - }, - top_tokens: vec![], - }; - println!("Generated token: {response:?}"); - // let _ = tokio::spawn(async { - // match ctx.stream.send(Ok(response)) { - // Ok(_) => {} - // Err(ref err) => { - // error!( - // "Failed to send back token to the client: {}", - // err.to_string() - // ); - // } - // } - // }); - }, - ) { + stream_ptr, + llama_generate_callback, + ); + + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(stream_ptr); + + match result { Ok(n_tokens) => { unsafe { ctx.generated_tokens.set_len(n_tokens); } + println!( "Generated {} tokens -> {:?}", n_tokens, &ctx.generated_tokens diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 489188c1a6f..f923526f98b 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,4 +1,6 @@ use crate::ffi::SamplingParams; +use text_generation_router::infer::{InferError, InferStreamResponse}; +use tokio::sync::mpsc::UnboundedSender; pub mod backend; @@ -14,6 +16,8 @@ impl Default for SamplingParams { } } +struct OpaqueStream(UnboundedSender>); + #[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { #[derive(Debug, Copy, Clone)] @@ -31,6 +35,10 @@ mod ffi { seed: u64, } + extern "Rust" { + type OpaqueStream; + } + unsafe extern "C++" { include!("backends/llamacpp/csrc/ffi.hpp"); @@ -47,13 +55,22 @@ mod ffi { #[rust_name = "create_single_worker_backend"] fn create_single_worker_backend(modelPath: &str) -> Result>; - fn generate( + // fn generate( + // self: Pin<&mut LlamaCppBackendImpl>, + // tokens: &[u32], + // generated: &mut [u32], + // generation_params: GenerationParams, + // sampling_params: &SamplingParams, + // ) -> Result; + + unsafe fn stream( self: Pin<&mut LlamaCppBackendImpl>, tokens: &[u32], generated: &mut [u32], - generation_params: &GenerationParams, + 
generation_params: GenerationParams, sampling_params: &SamplingParams, - callback: fn(u32, f32, bool), + stream: *mut OpaqueStream, + callback: unsafe fn(*mut OpaqueStream, u32, f32, bool), ) -> Result; } } From 3e82f14f577fd2ac3c8b2b4352e0c0bfbca8373d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:46:04 +0100 Subject: [PATCH 30/92] feat(backend): somewhat generates the final infer response --- backends/llamacpp/src/backend.rs | 34 +++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 09afbc7bec0..5262bd8a919 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -18,7 +18,7 @@ use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::sync::TryAcquireError; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{error, info}; +use tracing::{debug, error, info}; type BoxedOpaqueStream = Box; @@ -113,7 +113,7 @@ fn llama_generate_callback( }, top_tokens: vec![], }; - println!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); + debug!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); unsafe { if let Err(ref err) = (*channel).0.send(Ok(response)) { @@ -121,7 +121,7 @@ fn llama_generate_callback( "Failed to send back token to the client: {}", err.to_string() ); - } + }; } } @@ -131,6 +131,7 @@ unsafe fn scheduler_loop( ) { loop { if let Ok(mut ctx) = backlog.recv() { + let start = Instant::now(); let stream = BoxedOpaqueStream::new(OpaqueStream(ctx.stream)); let stream_ptr = Box::into_raw(stream); let result = backend.pin_mut().stream( @@ -143,7 +144,7 @@ unsafe fn scheduler_loop( ); // Make sure we re-keep track of the OpaqueStream box - let _ = Box::from_raw(stream_ptr); + let stream = Box::from_raw(stream_ptr); match result { Ok(n_tokens) => { @@ -151,12 +152,27 @@ unsafe fn 
scheduler_loop( ctx.generated_tokens.set_len(n_tokens); } - println!( - "Generated {} tokens -> {:?}", - n_tokens, &ctx.generated_tokens - ); + let _ = stream.0.send(Ok(InferStreamResponse::End { + token: Token { + id: ctx.generated_tokens[n_tokens - 1], + text: "".to_string(), + logprob: 0.0, + special: false, + }, + top_tokens: vec![], + generated_text: GeneratedText { + text: "".to_string(), + generated_tokens: n_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.sampling_params.seed), + }, + start, + queued: start, + })); + + debug!("Generated {n_tokens} tokens -> {:?}", ctx.generated_tokens); } - Err(err) => println!("Error: {}", err), + Err(err) => println!("Error: {err}"), } } else { info!("IPC channel is closed, exiting the scheduler loop"); From bd8f0f15e11d433e31d64ae101d6e5c62b1765d6 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:52:58 +0100 Subject: [PATCH 31/92] feat(backend): fix invalid reference to ctx instead of context in release build --- backends/llamacpp/csrc/backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 50d5897cb25..b88067f8b1a 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -105,7 +105,7 @@ namespace huggingface::tgi::backends::llamacpp { const auto latency = std::chrono::duration_cast(end - start); SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(ctx, batch); + const auto status = llama_decode(context, batch); #endif batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) { From 2cdfed94d92299479d9d022f9d88dd11cf785a29 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:53:17 +0100 Subject: [PATCH 32/92] feat(backend): correctly link to shared fmt and spdlog instead of static --- backends/llamacpp/build.rs | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 6d6bd514957..eefc6403278 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -102,8 +102,8 @@ fn main() { println!("cargo:rustc-link-lib=static=fmtd"); println!("cargo:rustc-link-lib=static=spdlogd"); } else { - println!("cargo:rustc-link-lib=static=fmt"); - println!("cargo:rustc-link-lib=static=spdlog"); + println!("cargo:rustc-link-lib=fmt"); + println!("cargo:rustc-link-lib=spdlog"); } println!("cargo:rustc-link-lib=static=common"); From 86a2ae6ba2ad74b28521cb9f1732b4af96811709 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 00:53:34 +0100 Subject: [PATCH 33/92] chore: unsued variables --- backends/llamacpp/src/backend.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 5262bd8a919..bfdac34b520 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -5,7 +5,7 @@ use crate::OpaqueStream; use async_trait::async_trait; use cxx::UniquePtr; use std::path::{Path, PathBuf}; -use std::sync::mpsc::{channel, Receiver, SendError, Sender}; +use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; use std::thread::{spawn, JoinHandle}; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; @@ -15,7 +15,6 @@ use text_generation_router::validation::{ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; -use tokio::sync::TryAcquireError; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; @@ -127,7 +126,7 @@ fn llama_generate_callback( unsafe fn scheduler_loop( mut backend: UniquePtr, - mut backlog: Receiver, + backlog: Receiver, ) { loop { if let Ok(mut ctx) = backlog.recv() { From 7b0a56f40fc5766bef8c707a800b53462399f31c Mon 
Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 11:17:02 +0100 Subject: [PATCH 34/92] feat(backend): fix memory leaking on llama_sampler when the decode ends --- backends/llamacpp/csrc/backend.cpp | 4 ++-- backends/llamacpp/csrc/backend.hpp | 13 ++++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index b88067f8b1a..4b6086200aa 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -29,7 +29,7 @@ namespace huggingface::tgi::backends::llamacpp { batch.logits[batch.n_tokens] = true; } - std::unique_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { + llama_sampler_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { auto *pSampler = llama_sampler_chain_init({.no_perf = false}); // Penalties @@ -51,7 +51,7 @@ namespace huggingface::tgi::backends::llamacpp { } llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); - return std::unique_ptr(pSampler); + return llama_sampler_ptr(pSampler, llama_sampler_deleter); } worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 288bf36afce..70f992687f7 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -24,7 +24,10 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; - typedef std::unique_ptr llama_context_smart_ptr; + typedef std::unique_ptr llama_context_ptr; + + static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); }; + typedef std::unique_ptr llama_sampler_ptr; typedef std::function llama_decode_callback; static constexpr auto llama_void_callback = [](llama_token, float_t, bool) {}; @@ -51,7 +54,7 @@ namespace 
huggingface::tgi::backends::llamacpp { * @param Pointer to the model data * @return */ - std::unique_ptr into_llama_sampler(const llama_model *pModel) const; + llama_sampler_ptr into_llama_sampler(const llama_model *pModel) const; }; /** @@ -155,7 +158,7 @@ namespace huggingface::tgi::backends::llamacpp { class single_worker_backend_t : backend_base_t { private: - constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_smart_ptr { + constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { auto llParams = llama_context_default_params(); llParams.flash_attn = true; llParams.n_batch = 1; @@ -165,7 +168,7 @@ namespace huggingface::tgi::backends::llamacpp { return {llama_new_context_with_model(pModel, llParams), llama_context_deleter}; }; - llama_context_smart_ptr mContext_; + llama_context_ptr mContext_; worker_t mWorker_; public: @@ -185,7 +188,7 @@ namespace huggingface::tgi::backends::llamacpp { class multi_worker_backend_t : backend_base_t { private: - llama_context_smart_ptr mContext_; + llama_context_ptr mContext_; public: std::expected generate( From 31d925477600564da81668727f86b954b1a13e26 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 11:25:12 +0100 Subject: [PATCH 35/92] feat(backend): remove static from inner_fw visitor as it leads to invalid memory locations --- backends/llamacpp/csrc/ffi.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index c823b72b83f..63f8d3b6c7f 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -71,7 +71,7 @@ namespace huggingface::tgi::backends::llamacpp { rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T - static auto inner_fw = [=, &sampling_params, &stream, &callback](T &&backend) + auto inner_fw = [=, &sampling_params, &stream, &callback](T 
&&backend) -> std::expected { auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos){ From 188442f67dd68520896b81fe56abf49f55c7082d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 14:26:57 +0100 Subject: [PATCH 36/92] misc(lint): make clippy happier --- Cargo.lock | 36 ++++++----------------------------- backends/llamacpp/Cargo.toml | 2 +- backends/llamacpp/src/main.rs | 36 ++++++++++++++--------------------- 3 files changed, 21 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 479e94d7fca..6b6cb7a7e18 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4239,7 +4239,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.27.0", "tracing-subscriber", - "utoipa 5.1.2", + "utoipa", ] [[package]] @@ -4368,7 +4368,7 @@ dependencies = [ "tracing-opentelemetry 0.21.0", "tracing-subscriber", "ureq", - "utoipa 4.2.3", + "utoipa", "utoipa-swagger-ui", "uuid", "vergen", @@ -4419,7 +4419,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa 4.2.3", + "utoipa", "utoipa-swagger-ui", ] @@ -4470,7 +4470,7 @@ dependencies = [ "tracing", "tracing-opentelemetry 0.21.0", "tracing-subscriber", - "utoipa 4.2.3", + "utoipa", "utoipa-swagger-ui", ] @@ -5192,19 +5192,7 @@ dependencies = [ "indexmap 2.6.0", "serde", "serde_json", - "utoipa-gen 4.3.0", -] - -[[package]] -name = "utoipa" -version = "5.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e12e84f0ff45b6818029cd0f67280e453c80132c1b9897df407ecc20b9f7cfd" -dependencies = [ - "indexmap 2.5.0", - "serde", - "serde_json", - "utoipa-gen 5.1.2", + "utoipa-gen", ] [[package]] @@ -5220,18 +5208,6 @@ dependencies = [ "syn 2.0.85", ] -[[package]] -name = "utoipa-gen" -version = "5.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dfc694d3a3118d2b9e80d68be83bf1aab7988510916934db83da61c14e7e6b2" -dependencies = [ - "proc-macro2", - "quote", - 
"regex", - "syn 2.0.79", -] - [[package]] name = "utoipa-swagger-ui" version = "6.0.0" @@ -5244,7 +5220,7 @@ dependencies = [ "rust-embed", "serde", "serde_json", - "utoipa 4.2.3", + "utoipa", "zip", ] diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 4a14dcdfd05..48a0bb84362 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -22,7 +22,7 @@ tokenizers = { workspace = true } tracing = "0.1" tracing-opentelemetry = "0.27.0" tracing-subscriber = { version = "0.3", features = ["json", "env-filter"] } -utoipa = { version = "5.1.2", features = ["axum_extras"] } +utoipa = { version = "4.2.3", features = ["axum_extras"] } log = "0.4.22" [build-dependencies] diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index 62f81848744..f128a6a3fc6 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,6 +1,7 @@ use clap::{Parser, Subcommand}; use std::path::PathBuf; use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackendError}; +use text_generation_router::server::ApiDoc; use text_generation_router::{server, usage_stats}; use thiserror::Error; @@ -35,13 +36,8 @@ struct Args { port: u16, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, - #[clap( - long, - env, - default_value = "1", - help = "Number of CPU threads allocated to one llama.cpp model" - )] - cores_per_instance: u16, + #[clap(long, env, default_value = "1", help = "Number of model instance(s)")] + num_model_instance: u16, #[clap(default_value = "bigscience/bloom", long, env)] tokenizer_name: String, #[clap(long, env)] @@ -67,8 +63,6 @@ struct Args { #[clap(long, env)] ngrok_edge: Option, #[clap(long, env, default_value_t = false)] - messages_api_enabled: bool, - #[clap(long, env, default_value_t = false)] disable_grammar_support: bool, #[clap(default_value = "4", long, env)] max_client_batch_size: usize, @@ -100,7 +94,7 @@ async fn main() -> Result<(), 
RouterError> { hostname, port, gguf_path, - cores_per_instance, + num_model_instance, tokenizer_name, tokenizer_config_path, revision, @@ -113,19 +107,17 @@ async fn main() -> Result<(), RouterError> { ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, } = args; - // if let Some(Commands::PrintSchema) = command { - // use utoipa::OpenApi; - // let api_doc = ApiDoc::openapi(); - // let api_doc = serde_json::to_string_pretty(&api_doc).unwrap(); - // println!("{}", api_doc); - // std::process::exit(0); - // }; + if let Some(Commands::PrintSchema) = command { + use utoipa::OpenApi; + let api_doc = ApiDoc::openapi().to_pretty_json().unwrap(); + println!("{}", api_doc); + std::process::exit(0); + }; text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output); // Validate args @@ -144,11 +136,11 @@ async fn main() -> Result<(), RouterError> { )); } - if let Some(ref max_batch_total_tokens) = max_batch_total_tokens { - if max_batch_prefill_tokens > *max_batch_total_tokens { + if let Some(max_batch_total_tokens) = max_batch_total_tokens { + if max_batch_prefill_tokens > max_batch_total_tokens { return Err(RouterError::ArgumentValidation(format!("`max_batch_prefill_tokens` must be <= `max_batch_total_tokens`. Given: {max_batch_prefill_tokens} and {max_batch_total_tokens}"))); } - if max_total_tokens as u32 > *max_batch_total_tokens { + if max_total_tokens as u32 > max_batch_total_tokens { return Err(RouterError::ArgumentValidation(format!("`max_total_tokens` must be <= `max_batch_total_tokens`. 
Given: {max_total_tokens} and {max_batch_total_tokens}"))); } } @@ -177,13 +169,13 @@ async fn main() -> Result<(), RouterError> { tokenizer_name, tokenizer_config_path, revision, + false, hostname, port, cors_allow_origin, ngrok, ngrok_authtoken, ngrok_edge, - messages_api_enabled, disable_grammar_support, max_client_batch_size, usage_stats, From 05ff551950dad2948f5f8fa10234496179dffd42 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 23:07:22 +0100 Subject: [PATCH 37/92] feat(backend): add number of generated tokens in the callback --- backends/llamacpp/csrc/backend.cpp | 4 ++-- backends/llamacpp/csrc/backend.hpp | 4 ++-- backends/llamacpp/csrc/ffi.hpp | 6 +++--- backends/llamacpp/src/backend.rs | 3 ++- backends/llamacpp/src/lib.rs | 2 +- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 4b6086200aa..54e41a14312 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -120,8 +120,8 @@ namespace huggingface::tgi::backends::llamacpp { } // Bubble up the generated token if a callback is provided - std::invoke( - std::forward(callback_), new_token_id, new_token_logits, is_eos); + std::invoke(std::forward(callback_), + new_token_id, new_token_logits, is_eos, n_decoded_tokens); batch = llama_batch_get_one(&new_token_id, 1); } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 70f992687f7..ebae7fb0db5 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -29,8 +29,8 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); }; typedef std::unique_ptr llama_sampler_ptr; - typedef std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token, float_t, bool) {}; + typedef std::function llama_decode_callback; + static 
constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) {}; /** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 63f8d3b6c7f..df924cb7fd1 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -68,14 +68,14 @@ namespace huggingface::tgi::backends::llamacpp { const generation_params_t generation_params, const sampling_params_t &sampling_params, OpaqueStream *stream, - rust::Fn callback + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T auto inner_fw = [=, &sampling_params, &stream, &callback](T &&backend) -> std::expected { - auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos){ - callback(stream, new_token_id, logits, is_eos); + auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ + callback(stream, new_token_id, logits, is_eos, n_generated_tokens); }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index bfdac34b520..c3fff6979b6 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -102,6 +102,7 @@ fn llama_generate_callback( new_token_id: u32, new_token_logit: f32, is_eos: bool, + n_generated_tokens: usize, ) { let response = InferStreamResponse::Intermediate { token: Token { @@ -112,7 +113,7 @@ fn llama_generate_callback( }, top_tokens: vec![], }; - debug!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos}"); + info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos} ({n_generated_tokens})"); unsafe { if let Err(ref err) = (*channel).0.send(Ok(response)) { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index f923526f98b..277f77cbf04 100644 --- 
a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -70,7 +70,7 @@ mod ffi { generation_params: GenerationParams, sampling_params: &SamplingParams, stream: *mut OpaqueStream, - callback: unsafe fn(*mut OpaqueStream, u32, f32, bool), + callback: unsafe fn(*mut OpaqueStream, u32, f32, bool, usize), ) -> Result; } } From 06424aa9ff44a7d3edee24cb8ce7de5681222184 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 3 Nov 2024 23:50:46 +0100 Subject: [PATCH 38/92] feat(backend): correctly handle the max_new_tokens case for is_eos --- backends/llamacpp/csrc/backend.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 54e41a14312..733a826a70b 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -113,6 +113,7 @@ namespace huggingface::tgi::backends::llamacpp { auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); auto new_token_logits = 0.0f; // TODO: return logit auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); + auto effective_n_decoded_tokens = n_decoded_tokens + 1; if (!generation_context.generation_params.ignore_eos_token) { generation_context.generated_tokens[n_decoded_tokens] = new_token_id; @@ -121,7 +122,10 @@ namespace huggingface::tgi::backends::llamacpp { // Bubble up the generated token if a callback is provided std::invoke(std::forward(callback_), - new_token_id, new_token_logits, is_eos, n_decoded_tokens); + new_token_id, + new_token_logits, + is_eos || effective_n_decoded_tokens == max_new_tokens, + effective_n_decoded_tokens); batch = llama_batch_get_one(&new_token_id, 1); } From 11c593dc69f9c7b800cd0dbac73e1e00d696867a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 00:11:55 +0100 Subject: [PATCH 39/92] feat(backend): make eog clearer on c++ side --- backends/llamacpp/csrc/backend.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 
10 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 733a826a70b..79c09a26c6a 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -95,7 +95,7 @@ namespace huggingface::tgi::backends::llamacpp { // Decode auto n_decoded_tokens = 0; - for (bool generating = true; generating && n_decoded_tokens < max_new_tokens; ++n_decoded_tokens) { + for (bool generating = true; generating; ++n_decoded_tokens) { const auto callback_ = callback.value_or(llama_void_callback); #ifdef TGI_LLAMACPP_BACKEND_DEBUG @@ -108,24 +108,27 @@ namespace huggingface::tgi::backends::llamacpp { const auto status = llama_decode(context, batch); #endif batch.n_tokens = 0; - if (LLAMA_SUCCESS(status)) { + if (LLAMA_SUCCESS(status)) [[likely]] { // Sample the new token auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); + auto is_eog = llama_token_is_eog(mModel_.get(), new_token_id); auto new_token_logits = 0.0f; // TODO: return logit - auto is_eos = llama_token_is_eog(mModel_.get(), new_token_id); - auto effective_n_decoded_tokens = n_decoded_tokens + 1; - if (!generation_context.generation_params.ignore_eos_token) { - generation_context.generated_tokens[n_decoded_tokens] = new_token_id; - generating = !is_eos; - } + // Push the token to the generated vector on Rust side + generation_context.generated_tokens[n_decoded_tokens] = new_token_id; + + // Handle termination cases + const auto has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; + const auto has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; + + generating = !(has_reach_max_tokens | has_reach_eog); // Bubble up the generated token if a callback is provided std::invoke(std::forward(callback_), new_token_id, new_token_logits, - is_eos || effective_n_decoded_tokens == max_new_tokens, - effective_n_decoded_tokens); + !generating, + n_decoded_tokens + 1); batch = llama_batch_get_one(&new_token_id, 
1); } From 5b7a951389216a58cc603c28b1c3ea8e87930bca Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 16:17:43 +0100 Subject: [PATCH 40/92] feat(backend): refactor the callback to handle intermediate and end inference message --- backends/llamacpp/csrc/backend.cpp | 35 +++---- backends/llamacpp/csrc/backend.hpp | 44 ++++---- backends/llamacpp/csrc/ffi.hpp | 27 ++--- backends/llamacpp/src/backend.rs | 157 ++++++++++++++++------------- backends/llamacpp/src/lib.rs | 12 +-- 5 files changed, 138 insertions(+), 137 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 79c09a26c6a..65898dfe772 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -114,9 +114,6 @@ namespace huggingface::tgi::backends::llamacpp { auto is_eog = llama_token_is_eog(mModel_.get(), new_token_id); auto new_token_logits = 0.0f; // TODO: return logit - // Push the token to the generated vector on Rust side - generation_context.generated_tokens[n_decoded_tokens] = new_token_id; - // Handle termination cases const auto has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; const auto has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; @@ -150,10 +147,15 @@ namespace huggingface::tgi::backends::llamacpp { ) { // TODO: Should we provide a way to change this value? 
auto generated = std::vector(2 << 8); + auto inner_callback = [&](uint32_t new_token_id, float_t new_token_logit, bool is_eos, + size_t num_generated_tokens) { + generated.emplace_back(new_token_id); + + if (callback.has_value()) + (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); + }; - auto nTokensGenerated = generate(tokens, generated, generation_params, sampling_params, callback); - if (nTokensGenerated.has_value()) - generated.resize(*nTokensGenerated); + auto nTokensGenerated = stream(tokens, generation_params, sampling_params, inner_callback); return generated; } @@ -168,25 +170,24 @@ namespace huggingface::tgi::backends::llamacpp { llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); } - std::expected - single_worker_backend_t::generate( + std::expected + single_worker_backend_t::stream( std::span tokens, - std::span out, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback + const llama_decode_callback &callback ) { - return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens, out}, callback); + return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens}, callback); } std::expected - multi_worker_backend_t::generate( - std::span, - std::span, + multi_worker_backend_t::stream( + std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback) { - SPDLOG_ERROR("Not implemented yet"); - return 0uz; + const llama_decode_callback &callback + ) { + SPDLOG_WARN("Not implemented for multi_worker_t"); + return 0; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index ebae7fb0db5..1fef7fb8931 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -69,7 +69,6 @@ namespace huggingface::tgi::backends::llamacpp { generation_params_t 
generation_params; sampling_params_t sampling_params; std::span input_tokens; - std::span generated_tokens; }; /** @@ -125,34 +124,34 @@ namespace huggingface::tgi::backends::llamacpp { /** * * @param tokens - * @params out - * @param params - * @param maxNewTokens + * @param generation_params + * @param sampling_params + * @param callback * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - virtual std::expected generate( - std::span input_tokens, - std::span generated_tokens, + std::expected, backend_error_t> generate( + std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback - ) = 0; + const std::optional &callback = std::nullopt + ); /** * * @param tokens - * @param params - * @param maxNewTokens + * @param generation_params + * @param sampling_params + * @params callback * @return */ [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, backend_error_t> generate( + virtual std::expected stream( std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback = std::nullopt - ); + const llama_decode_callback &callback + ) = 0; }; @@ -174,16 +173,11 @@ namespace huggingface::tgi::backends::llamacpp { public: explicit single_worker_backend_t(llama_model *pModel, const std::optional &); - using backend_base_t::generate; - - std::expected - generate( + std::expected stream( std::span tokens, - std::span out, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback - ) override; + const llama_decode_callback &callback) override; }; class multi_worker_backend_t : backend_base_t { @@ -191,13 +185,11 @@ namespace huggingface::tgi::backends::llamacpp { llama_context_ptr mContext_; public: - std::expected generate( - std::span, - std::span, + 
std::expected stream( + std::span tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, - const std::optional &callback - ) override; + const llama_decode_callback &callback) override; }; } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index df924cb7fd1..3ae392f624c 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -28,23 +28,20 @@ namespace huggingface::tgi::backends::llamacpp { // Concept identifying types which have a .generate() -> size_t method to do in-place generation template - concept has_emplace_generate = requires( + concept has_stream_method = requires( T t, std::span input_tokens, - std::span generated_tokens, const generation_params_t &generation_params, const sampling_params_t &sampling_params, llama_decode_callback callback ) { { - t.generate(input_tokens, generated_tokens, generation_params, sampling_params, callback) + t.stream(input_tokens, generation_params, sampling_params, callback) } -> std::same_as>; }; - static_assert(has_emplace_generate, - "single_worker_backend_t doesn't meet concept is_generate_emplace_capable"); - static_assert(has_emplace_generate, - "multi_worker_backend_t doesn't meet concept is_generate_emplace_capable"); + static_assert(has_stream_method, "single_worker_backend_t doesn't meet concept has_stream_method"); + static_assert(has_stream_method, "multi_worker_backend_t doesn't meet concept has_stream_method"); class llama_cpp_backend_exception_t : std::exception { @@ -64,29 +61,25 @@ namespace huggingface::tgi::backends::llamacpp { size_t stream( rust::Slice input_tokens, - rust::Slice generated_tokens, const generation_params_t generation_params, const sampling_params_t &sampling_params, - OpaqueStream *stream, - rust::Fn callback + InferContext *ctx, + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T - auto inner_fw = [=, &sampling_params, &stream, 
&callback](T &&backend) + auto inner_fw = [=, &sampling_params, &ctx, &callback](T &&backend) -> std::expected { - auto context_forwarding_callback = [=, &stream](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ - callback(stream, new_token_id, logits, is_eos, n_generated_tokens); + auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ + callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* auto input_tokens_v = std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); - auto generated_tokens_v = - std::span(reinterpret_cast(generated_tokens.data()), generated_tokens.size()); - return backend.generate( + return backend.stream( input_tokens_v, - generated_tokens_v, generation_params, sampling_params, context_forwarding_callback diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index c3fff6979b6..06e8d43ed9b 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,7 +1,6 @@ use crate::ffi::{ create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, }; -use crate::OpaqueStream; use async_trait::async_trait; use cxx::UniquePtr; use std::path::{Path, PathBuf}; @@ -14,12 +13,13 @@ use text_generation_router::validation::{ }; use text_generation_router::{FinishReason, Token}; use thiserror::Error; +use tokio::sync::mpsc::error::SendError; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; -type BoxedOpaqueStream = Box; +type InferResult = Result; unsafe impl Send for LlamaCppBackendImpl {} @@ -45,14 +45,19 @@ impl From<&ValidStoppingParameters> for GenerationParams { } #[cfg_attr(debug_assertions, derive(Debug))] -struct InferContext { - 
pub(crate) stream: UnboundedSender>, +struct GenerationContext { pub(crate) input_tokens: Arc>, pub(crate) generated_tokens: Vec, pub(crate) generation_params: GenerationParams, pub(crate) sampling_params: SamplingParams, } +pub(crate) struct InferContext { + pub(crate) start: Instant, + pub(crate) stream: UnboundedSender, + pub(crate) generation: GenerationContext, +} + #[derive(Debug, Error)] pub enum LlamaCppBackendError { #[error("Provided GGUF model path {0} doesn't exist")] @@ -63,7 +68,7 @@ pub enum LlamaCppBackendError { } pub struct LlamaCppBackend { - backlog: Sender, + backlog: Sender<(GenerationContext, UnboundedSender)>, scheduler_handle: JoinHandle<()>, } @@ -98,81 +103,96 @@ impl LlamaCppBackend { } fn llama_generate_callback( - channel: *mut OpaqueStream, + ctx: *mut InferContext, new_token_id: u32, new_token_logit: f32, - is_eos: bool, + is_final: bool, n_generated_tokens: usize, ) { - let response = InferStreamResponse::Intermediate { - token: Token { - id: new_token_id, - text: "".to_string(), - logprob: new_token_logit, - special: false, + info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); + + // Decode token + let token = Token { + id: new_token_id, + text: "".to_string(), + logprob: new_token_logit, + special: false, + }; + + let ctx = unsafe { &mut *ctx }; + + // Append the new token to the generated ones + ctx.generation.generated_tokens.push(new_token_id); + + // Create the streamed response + let response = match is_final { + false => InferStreamResponse::Intermediate { + token, + top_tokens: vec![], }, - top_tokens: vec![], + true => { + // Decode the whole text + let text = String::new(); + + // Stream end response + InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text, + generated_tokens: n_generated_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.generation.sampling_params.seed), + }, + start: ctx.start, + 
queued: ctx.start, + } + } }; - info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_eos={is_eos} ({n_generated_tokens})"); - - unsafe { - if let Err(ref err) = (*channel).0.send(Ok(response)) { - error!( - "Failed to send back token to the client: {}", - err.to_string() - ); - }; + + // Send back to the client + if let Err(ref err) = ctx.stream.send(Ok(response)) { + error!("Failed to send back the response to the client, cancelling request"); + // TODO: cancel the request } } unsafe fn scheduler_loop( mut backend: UniquePtr, - backlog: Receiver, + backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { + // This loop will mostly decode single token at every step, so no need to rely on parallelism + tokenizers::utils::parallelism::set_parallelism(false); + loop { - if let Ok(mut ctx) = backlog.recv() { + if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); - let stream = BoxedOpaqueStream::new(OpaqueStream(ctx.stream)); - let stream_ptr = Box::into_raw(stream); - let result = backend.pin_mut().stream( - &ctx.input_tokens, - &mut ctx.generated_tokens, - ctx.generation_params, - &ctx.sampling_params, - stream_ptr, - llama_generate_callback, - ); - - // Make sure we re-keep track of the OpaqueStream box - let stream = Box::from_raw(stream_ptr); - - match result { - Ok(n_tokens) => { - unsafe { - ctx.generated_tokens.set_len(n_tokens); - } - - let _ = stream.0.send(Ok(InferStreamResponse::End { - token: Token { - id: ctx.generated_tokens[n_tokens - 1], - text: "".to_string(), - logprob: 0.0, - special: false, - }, - top_tokens: vec![], - generated_text: GeneratedText { - text: "".to_string(), - generated_tokens: n_tokens as u32, - finish_reason: FinishReason::Length, - seed: Some(ctx.sampling_params.seed), - }, - start, - queued: start, - })); - - debug!("Generated {n_tokens} tokens -> {:?}", ctx.generated_tokens); + let generation_params = generation.generation_params; // copy + let sampling_params = 
generation.sampling_params; // copy + let input_tokens = Arc::clone(&generation.input_tokens); + + // Creating the whole InferContext and pushing it to the heap + { + let ctx = Box::new(InferContext { + start, + stream, + generation, + }); + + let boxed_ctx = Box::into_raw(ctx); + + if let Err(e) = backend.pin_mut().stream( + &input_tokens, + generation_params, + &sampling_params, + boxed_ctx, + llama_generate_callback, + ) { + error!("Error while decoding tokens... {}", e.what()); } - Err(err) => println!("Error: {err}"), + + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(boxed_ctx); } } else { info!("IPC channel is closed, exiting the scheduler loop"); @@ -186,21 +206,20 @@ impl Backend for LlamaCppBackend { fn schedule( &self, request: ValidGenerateRequest, - ) -> Result>, InferError> { + ) -> Result, InferError> { if let Some(input_ids) = request.input_ids { let (sx, rx) = unbounded_channel(); let sampling_params = SamplingParams::from(&request.parameters); let generation_params = GenerationParams::from(&request.stopping_parameters); - let ctx = InferContext { - stream: sx, + let ctx = GenerationContext { input_tokens: Arc::clone(&input_ids), generated_tokens: Vec::with_capacity(generation_params.max_new_tokens as usize), generation_params, sampling_params, }; - match self.backlog.send(ctx) { + match self.backlog.send((ctx, sx)) { Ok(_) => Ok(UnboundedReceiverStream::new(rx)), Err(_) => Err(InferError::GenerationError( "Failed to sent the request".to_string(), diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 277f77cbf04..01f2054db89 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -1,6 +1,5 @@ +use crate::backend::InferContext; use crate::ffi::SamplingParams; -use text_generation_router::infer::{InferError, InferStreamResponse}; -use tokio::sync::mpsc::UnboundedSender; pub mod backend; @@ -16,8 +15,6 @@ impl Default for SamplingParams { } } -struct 
OpaqueStream(UnboundedSender>); - #[cxx::bridge(namespace = "huggingface::tgi::backends::llamacpp")] mod ffi { #[derive(Debug, Copy, Clone)] @@ -36,7 +33,7 @@ mod ffi { } extern "Rust" { - type OpaqueStream; + type InferContext; } unsafe extern "C++" { @@ -66,11 +63,10 @@ mod ffi { unsafe fn stream( self: Pin<&mut LlamaCppBackendImpl>, tokens: &[u32], - generated: &mut [u32], generation_params: GenerationParams, sampling_params: &SamplingParams, - stream: *mut OpaqueStream, - callback: unsafe fn(*mut OpaqueStream, u32, f32, bool, usize), + stream: *mut InferContext, + callback: unsafe fn(*mut InferContext, u32, f32, bool, usize), ) -> Result; } } From 958c72a44a4bba4f8cdcb12d09d4038de7dc95bf Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 16:26:05 +0100 Subject: [PATCH 41/92] misc(ffi): remove unused ffi mapping --- backends/llamacpp/src/lib.rs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 01f2054db89..006c7387ae3 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -52,14 +52,6 @@ mod ffi { #[rust_name = "create_single_worker_backend"] fn create_single_worker_backend(modelPath: &str) -> Result>; - // fn generate( - // self: Pin<&mut LlamaCppBackendImpl>, - // tokens: &[u32], - // generated: &mut [u32], - // generation_params: GenerationParams, - // sampling_params: &SamplingParams, - // ) -> Result; - unsafe fn stream( self: Pin<&mut LlamaCppBackendImpl>, tokens: &[u32], From 1473259f84fb0272b357392a13eaa168d39bc1c4 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 17:01:22 +0100 Subject: [PATCH 42/92] feat(backend): add early stopping criteria from TGI stream callback --- backends/llamacpp/csrc/backend.cpp | 16 +++++++++------- backends/llamacpp/csrc/backend.hpp | 4 ++-- backends/llamacpp/csrc/ffi.hpp | 6 +++--- backends/llamacpp/src/backend.rs | 13 ++++++++----- backends/llamacpp/src/lib.rs | 2 +- 5 files changed, 23 
insertions(+), 18 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 65898dfe772..f69563811da 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -121,11 +121,12 @@ namespace huggingface::tgi::backends::llamacpp { generating = !(has_reach_max_tokens | has_reach_eog); // Bubble up the generated token if a callback is provided - std::invoke(std::forward(callback_), - new_token_id, - new_token_logits, - !generating, - n_decoded_tokens + 1); + const auto should_stop = std::invoke(std::forward(callback_), + new_token_id, + new_token_logits, + !generating, + n_decoded_tokens + 1); + generating ^= should_stop; batch = llama_batch_get_one(&new_token_id, 1); } @@ -148,11 +149,12 @@ namespace huggingface::tgi::backends::llamacpp { // TODO: Should we provide a way to change this value? auto generated = std::vector(2 << 8); auto inner_callback = [&](uint32_t new_token_id, float_t new_token_logit, bool is_eos, - size_t num_generated_tokens) { + size_t num_generated_tokens) -> bool { generated.emplace_back(new_token_id); if (callback.has_value()) - (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); + return (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); + return true; }; auto nTokensGenerated = stream(tokens, generation_params, sampling_params, inner_callback); diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 1fef7fb8931..bf9df5cca0e 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -29,8 +29,8 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); }; typedef std::unique_ptr llama_sampler_ptr; - typedef std::function llama_decode_callback; - static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) {}; + typedef std::function 
llama_decode_callback; + static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; }; /** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 3ae392f624c..f33a2f1ad57 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -64,14 +64,14 @@ namespace huggingface::tgi::backends::llamacpp { const generation_params_t generation_params, const sampling_params_t &sampling_params, InferContext *ctx, - rust::Fn callback + rust::Fn callback ) { // Define the visitor lambda function which requires the has_emplace_generate constraint on T auto inner_fw = [=, &sampling_params, &ctx, &callback](T &&backend) -> std::expected { - auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens){ - callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); + auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { + return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 06e8d43ed9b..531a07dc6fb 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -13,11 +13,10 @@ use text_generation_router::validation::{ }; use text_generation_router::{FinishReason, Token}; use thiserror::Error; -use tokio::sync::mpsc::error::SendError; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use tracing::{debug, error, info}; +use tracing::{error, info}; type InferResult = Result; @@ -45,7 +44,7 @@ impl From<&ValidStoppingParameters> for GenerationParams { } #[cfg_attr(debug_assertions, derive(Debug))] -struct GenerationContext { +pub(crate) 
struct GenerationContext { pub(crate) input_tokens: Arc>, pub(crate) generated_tokens: Vec, pub(crate) generation_params: GenerationParams, @@ -108,7 +107,7 @@ fn llama_generate_callback( new_token_logit: f32, is_final: bool, n_generated_tokens: usize, -) { +) -> bool { info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); // Decode token @@ -151,10 +150,14 @@ fn llama_generate_callback( }; // Send back to the client - if let Err(ref err) = ctx.stream.send(Ok(response)) { + if let Err(ref _err) = ctx.stream.send(Ok(response)) { error!("Failed to send back the response to the client, cancelling request"); // TODO: cancel the request + return true; // should_stop } + + // should_stop + false } unsafe fn scheduler_loop( diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 006c7387ae3..abcdd1fad06 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -58,7 +58,7 @@ mod ffi { generation_params: GenerationParams, sampling_params: &SamplingParams, stream: *mut InferContext, - callback: unsafe fn(*mut InferContext, u32, f32, bool, usize), + callback: unsafe fn(*mut InferContext, u32, f32, bool, usize) -> bool, ) -> Result; } } From 1149186794a919d58cf5d43c7a497d81555f20c5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 23:01:57 +0100 Subject: [PATCH 43/92] feat(backend): expose tokenizer to the GenerationContext to decode token --- backends/llamacpp/src/backend.rs | 65 +++++++++++++++++++++----------- backends/llamacpp/src/main.rs | 21 ++++++++--- 2 files changed, 58 insertions(+), 28 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 531a07dc6fb..08fac6755a7 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -13,6 +13,7 @@ use text_generation_router::validation::{ }; use text_generation_router::{FinishReason, Token}; use thiserror::Error; +use 
tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -54,6 +55,7 @@ pub(crate) struct GenerationContext { pub(crate) struct InferContext { pub(crate) start: Instant, pub(crate) stream: UnboundedSender, + pub(crate) tokenizer: Tokenizer, pub(crate) generation: GenerationContext, } @@ -72,7 +74,10 @@ pub struct LlamaCppBackend { } impl LlamaCppBackend { - pub fn new + Send>(model_path: P) -> Result { + pub fn new + Send>( + model_path: P, + tokenizer: Tokenizer, + ) -> Result { let path = Arc::new(model_path.as_ref()); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( @@ -93,7 +98,7 @@ impl LlamaCppBackend { ); let (submitter, receiver) = channel(); - let handle = unsafe { spawn(|| scheduler_loop(backend, receiver)) }; + let handle = unsafe { spawn(|| scheduler_loop(backend, tokenizer, receiver)) }; Ok(Self { backlog: submitter, scheduler_handle: handle, @@ -110,19 +115,25 @@ fn llama_generate_callback( ) -> bool { info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); - // Decode token - let token = Token { - id: new_token_id, - text: "".to_string(), - logprob: new_token_logit, - special: false, - }; - let ctx = unsafe { &mut *ctx }; // Append the new token to the generated ones ctx.generation.generated_tokens.push(new_token_id); + // Decode token + let token = match ctx.tokenizer.decode(&[new_token_id], false) { + Ok(text) => { + let special = ctx.tokenizer.get_added_vocabulary().is_special_token(&text); + Token { + id: new_token_id, + text, + logprob: new_token_logit, + special, + } + } + Err(_) => panic!("Failed to decode token"), + }; + // Create the streamed response let response = match is_final { false => InferStreamResponse::Intermediate { @@ -131,21 +142,26 @@ fn llama_generate_callback( }, true => { // Decode the whole text - let text = String::new(); - - // 
Stream end response - InferStreamResponse::End { - token, - top_tokens: vec![], - generated_text: GeneratedText { - text, - generated_tokens: n_generated_tokens as u32, - finish_reason: FinishReason::Length, - seed: Some(ctx.generation.sampling_params.seed), + match ctx + .tokenizer + .decode(&ctx.generation.generated_tokens, false) + { + Ok(text) => InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text, + generated_tokens: n_generated_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.generation.sampling_params.seed), + }, + start: ctx.start, + queued: ctx.start, }, - start: ctx.start, - queued: ctx.start, + Err(_) => panic!("Failed to decode token"), } + + // Stream end response } }; @@ -162,6 +178,7 @@ fn llama_generate_callback( unsafe fn scheduler_loop( mut backend: UniquePtr, + tokenizer: Tokenizer, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism @@ -170,6 +187,7 @@ unsafe fn scheduler_loop( loop { if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); + let tokenizer = tokenizer.clone(); let generation_params = generation.generation_params; // copy let sampling_params = generation.sampling_params; // copy let input_tokens = Arc::clone(&generation.input_tokens); @@ -179,6 +197,7 @@ unsafe fn scheduler_loop( let ctx = Box::new(InferContext { start, stream, + tokenizer, generation, }); diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index f128a6a3fc6..c5d735ab719 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -4,6 +4,7 @@ use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackend use text_generation_router::server::ApiDoc; use text_generation_router::{server, usage_stats}; use thiserror::Error; +use tokenizers::FromPretrainedParameters; /// App Configuration #[derive(Parser, Debug)] @@ 
-36,9 +37,9 @@ struct Args { port: u16, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, - #[clap(long, env, default_value = "1", help = "Number of model instance(s)")] - num_model_instance: u16, - #[clap(default_value = "bigscience/bloom", long, env)] + // #[clap(long, env, default_value = "1", help = "Number of model instance(s)")] + // num_model_instance: u16, + #[clap(long, env, required = true)] tokenizer_name: String, #[clap(long, env)] tokenizer_config_path: Option, @@ -94,7 +95,7 @@ async fn main() -> Result<(), RouterError> { hostname, port, gguf_path, - num_model_instance, + // num_model_instance, tokenizer_name, tokenizer_config_path, revision, @@ -153,7 +154,17 @@ async fn main() -> Result<(), RouterError> { } } - let backend = LlamaCppBackend::new(gguf_path)?; + let auth_token = std::env::var("HF_TOKEN") + .or_else(|_| std::env::var("HUGGING_FACE_HUB_TOKEN")) + .ok(); + let options = FromPretrainedParameters { + revision: revision.clone().unwrap_or("main".to_string()), + user_agent: Default::default(), + auth_token, + }; + let tokenizer = tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) + .expect("Failed to retrieve tokenizer"); + let backend = LlamaCppBackend::new(gguf_path, tokenizer)?; // Run server server::run( From 52208f5b78fd8cc31d01f440b5f5e250896c1e64 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 4 Nov 2024 23:24:50 +0100 Subject: [PATCH 44/92] misc(backend): decrease log verbosity in callback --- backends/llamacpp/src/backend.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 08fac6755a7..62b4743daac 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -17,7 +17,7 @@ use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; -use 
tracing::{error, info}; +use tracing::{debug, error, info}; type InferResult = Result; @@ -113,7 +113,7 @@ fn llama_generate_callback( is_final: bool, n_generated_tokens: usize, ) -> bool { - info!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); + debug!("Generated token: {new_token_id} -> logits={new_token_logit}, is_final={is_final} ({n_generated_tokens})"); let ctx = unsafe { &mut *ctx }; From 62dba1a878ba7e3c8151485adfb6159457c34c5a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:46:52 +0100 Subject: [PATCH 45/92] misc(cmake): use url deps and not git repo --- backends/llamacpp/cmake/fmt.cmake | 3 +-- backends/llamacpp/cmake/spdlog.cmake | 7 +++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake index f94a9c5668f..840280ca8ba 100644 --- a/backends/llamacpp/cmake/fmt.cmake +++ b/backends/llamacpp/cmake/fmt.cmake @@ -1,6 +1,5 @@ FetchContent_Declare( fmt - GIT_REPOSITORY https://github.com/fmtlib/fmt - GIT_TAG 11.0.1 + URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz ) FetchContent_MakeAvailable(fmt) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index 68658ba5019..04c218b5814 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -1,18 +1,17 @@ set(SPDLOG_USE_FMT ON) -set(SPDLOG_BUILD_SHARED OFF) +set(SPDLOG_BUILD_SHARED ON) set(SPDLOG_FMT_EXTERNAL ON) # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") message(STATUS "Verbose logging is enabled in debug build") add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_DEBUG) -else() +else () add_compile_definitions(SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO) endif () fetchcontent_declare( spdlog - GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.14.1 + URL 
https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz ) fetchcontent_makeavailable(spdlog) From 588421833c53b1ee6328b3d19650f6d93623e910 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:47:22 +0100 Subject: [PATCH 46/92] misc(backend): missing header --- backends/llamacpp/csrc/ffi.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index f33a2f1ad57..a3d14ee52f7 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include "backend.hpp" From a1154b17ec8489ad77217af8cf52027e9413ecb6 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:47:38 +0100 Subject: [PATCH 47/92] feat(backend): avoid copy constructor --- backends/llamacpp/csrc/ffi.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index a3d14ee52f7..9daacf2c84d 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -101,8 +101,7 @@ namespace huggingface::tgi::backends::llamacpp { params.use_mmap = true; auto *model = llama_load_model_from_file(cxxPath.c_str(), params); - auto backend = single_worker_backend_t(model, std::nullopt); - return std::make_unique(std::move(backend)); + return std::make_unique(single_worker_backend_t { model, std::nullopt }); } } From 7eec0f704f05cbcc55e8b2a8132679b497d02fe3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:48:13 +0100 Subject: [PATCH 48/92] chore(backend): minor fixes mostly format --- backends/llamacpp/build.rs | 8 ++++---- backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/backend.hpp | 7 ++++++- backends/llamacpp/src/backend.rs | 4 ++-- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index eefc6403278..1ab926d4635 100644 --- 
a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -99,11 +99,11 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); if is_debug { - println!("cargo:rustc-link-lib=static=fmtd"); - println!("cargo:rustc-link-lib=static=spdlogd"); + println!("cargo:rustc-link-lib=dylib=fmtd"); + println!("cargo:rustc-link-lib=dylib=spdlogd"); } else { - println!("cargo:rustc-link-lib=fmt"); - println!("cargo:rustc-link-lib=spdlog"); + println!("cargo:rustc-link-lib=dylib=fmt"); + println!("cargo:rustc-link-lib=dylib=spdlog"); } println!("cargo:rustc-link-lib=static=common"); diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index f69563811da..739b84a1d36 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -170,7 +170,7 @@ namespace huggingface::tgi::backends::llamacpp { mContext_(llama_context_factory(model)), mWorker_(mModel_, params.value_or(llama_context_default_params())) { llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); - } + }; std::expected single_worker_backend_t::stream( diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index bf9df5cca0e..4abc202ded6 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -157,10 +157,11 @@ namespace huggingface::tgi::backends::llamacpp { class single_worker_backend_t : backend_base_t { private: - constexpr const static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { + constexpr static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { auto llParams = llama_context_default_params(); llParams.flash_attn = true; llParams.n_batch = 1; + llParams.n_threads = 1; llParams.no_perf = true; llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL; @@ -173,6 +174,8 @@ namespace huggingface::tgi::backends::llamacpp { public: explicit single_worker_backend_t(llama_model *pModel, 
const std::optional &); + using backend_base_t::generate; + std::expected stream( std::span tokens, const generation_params_t &generation_params, @@ -185,6 +188,8 @@ namespace huggingface::tgi::backends::llamacpp { llama_context_ptr mContext_; public: + using backend_base_t::generate; + std::expected stream( std::span tokens, const generation_params_t &generation_params, diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 62b4743daac..609c8405767 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -70,7 +70,7 @@ pub enum LlamaCppBackendError { pub struct LlamaCppBackend { backlog: Sender<(GenerationContext, UnboundedSender)>, - scheduler_handle: JoinHandle<()>, + _scheduler_handle: JoinHandle<()>, } impl LlamaCppBackend { @@ -101,7 +101,7 @@ impl LlamaCppBackend { let handle = unsafe { spawn(|| scheduler_loop(backend, tokenizer, receiver)) }; Ok(Self { backlog: submitter, - scheduler_handle: handle, + _scheduler_handle: handle, }) } } From a7afde41a94776e7324137b1091a8883f0afde00 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 5 Nov 2024 23:48:22 +0100 Subject: [PATCH 49/92] feat(backend): dockerfile --- Dockerfile.llamacpp | 51 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 Dockerfile.llamacpp diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp new file mode 100644 index 00000000000..e24ce9bd32b --- /dev/null +++ b/Dockerfile.llamacpp @@ -0,0 +1,51 @@ +# Build dependencies resolver stage +FROM lukemathwalker/cargo-chef:latest AS chef +WORKDIR /usr/src/text-generation-inference/ + +FROM chef AS planner +COPY Cargo.lock Cargo.lock +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY backends backends +COPY benchmark benchmark +COPY clients clients +COPY launcher launcher +COPY router router + +RUN cargo chef prepare --recipe-path recipe.json + +FROM chef AS builder +RUN apt update && 
DEBIAN_FRONTEND=noninteractive apt install -y \ + clang \ + cmake \ + gcc g++ \ + libc++-dev \ + libopenmpi-dev \ + libssl-dev \ + ninja-build \ + openssl \ + python3-dev + + +RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang 10 \ + && update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang 10 \ + && update-alternatives --auto cc \ + && update-alternatives --auto c++ \ + && update-alternatives --display cc \ + && update-alternatives --display c++ \ + && cc --version \ + && c++ --version + +COPY --from=planner usr/src/text-generation-inference/recipe.json recipe.json +RUN cargo chef cook --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --recipe-path recipe.json + +COPY Cargo.lock Cargo.lock +COPY Cargo.toml Cargo.toml +COPY rust-toolchain.toml rust-toolchain.toml +COPY backends backends +COPY benchmark benchmark +COPY launcher launcher +COPY router router + +ENV RUSTFLAGS="-L/usr/lib" +RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen \ No newline at end of file From 20652824d99076f58e989f46430f68b2d619f489 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 6 Nov 2024 17:33:37 +0100 Subject: [PATCH 50/92] feat(dockerfile): build process --- Dockerfile.llamacpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index e24ce9bd32b..0864c1bad08 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -15,6 +15,7 @@ COPY router router RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder +ENV CMAKE_INSTALL_PREFIX=${CWD}/dist RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ clang \ cmake \ @@ -26,7 +27,6 @@ RUN apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ openssl \ python3-dev - RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang 10 \ && update-alternatives --install /usr/bin/c++ 
c++ /usr/bin/clang 10 \ && update-alternatives --auto cc \ @@ -36,7 +36,7 @@ RUN update-alternatives --install /usr/bin/cc cc /usr/bin/clang 10 \ && cc --version \ && c++ --version -COPY --from=planner usr/src/text-generation-inference/recipe.json recipe.json +COPY --from=planner /usr/src/text-generation-inference/recipe.json recipe.json RUN cargo chef cook --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --recipe-path recipe.json COPY Cargo.lock Cargo.lock @@ -48,4 +48,8 @@ COPY launcher launcher COPY router router ENV RUSTFLAGS="-L/usr/lib" -RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen \ No newline at end of file +RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen + +FROM ubuntu:24.04 +COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher +COPY --from=builder /usr/src/text-generation-inference/dist /usr/ \ No newline at end of file From 26d0266cec6f327bd41c0a8050dbc1725e670f32 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 6 Nov 2024 17:46:46 +0100 Subject: [PATCH 51/92] feat(backend): handle all the tokenization failure and send back to the client --- backends/llamacpp/src/backend.rs | 69 +++++++++++++++++--------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 609c8405767..8214c36a73b 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -124,56 +124,59 @@ fn llama_generate_callback( let token = match ctx.tokenizer.decode(&[new_token_id], false) { Ok(text) => { let special = ctx.tokenizer.get_added_vocabulary().is_special_token(&text); - Token { + Ok(Token { id: new_token_id, text, logprob: 
new_token_logit, special, - } + }) } - Err(_) => panic!("Failed to decode token"), + Err(ref err) => Err(InferError::GenerationError(err.to_string())), }; // Create the streamed response - let response = match is_final { - false => InferStreamResponse::Intermediate { - token, - top_tokens: vec![], - }, - true => { - // Decode the whole text - match ctx - .tokenizer - .decode(&ctx.generation.generated_tokens, false) - { - Ok(text) => InferStreamResponse::End { + let response = match token { + Ok(token) => { + match is_final { + false => Ok(InferStreamResponse::Intermediate { token, top_tokens: vec![], - generated_text: GeneratedText { - text, - generated_tokens: n_generated_tokens as u32, - finish_reason: FinishReason::Length, - seed: Some(ctx.generation.sampling_params.seed), - }, - start: ctx.start, - queued: ctx.start, - }, - Err(_) => panic!("Failed to decode token"), + }), + true => { + // Decode the whole text + match ctx + .tokenizer + .decode(&ctx.generation.generated_tokens, false) + { + Ok(text) => Ok(InferStreamResponse::End { + token, + top_tokens: vec![], + generated_text: GeneratedText { + text, + generated_tokens: n_generated_tokens as u32, + finish_reason: FinishReason::Length, + seed: Some(ctx.generation.sampling_params.seed), + }, + start: ctx.start, + queued: ctx.start, + }), + Err(err) => Err(InferError::GenerationError(err.to_string())), + } + } } - - // Stream end response } + Err(err) => Err(err), }; // Send back to the client - if let Err(ref _err) = ctx.stream.send(Ok(response)) { + let should_stop = if let Err(ref _err) = ctx.stream.send(response) { error!("Failed to send back the response to the client, cancelling request"); - // TODO: cancel the request - return true; // should_stop - } + true + } else { + true + }; - // should_stop - false + should_stop } unsafe fn scheduler_loop( From cf17928f83fdb0d1224d5286b2953effde8cf28a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 8 Nov 2024 00:53:53 +0100 Subject: [PATCH 52/92] 
misc(cmake): remove dependency on fmt --- backends/llamacpp/CMakeLists.txt | 1 - backends/llamacpp/build.rs | 4 ++-- backends/llamacpp/cmake/fmt.cmake | 5 ----- backends/llamacpp/cmake/spdlog.cmake | 2 +- backends/llamacpp/csrc/backend.cpp | 4 +--- 5 files changed, 4 insertions(+), 12 deletions(-) delete mode 100644 backends/llamacpp/cmake/fmt.cmake diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index e536efc57a2..938f7360011 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -19,7 +19,6 @@ else () endif () # Add dependencies -include(cmake/fmt.cmake) include(cmake/spdlog.cmake) if (${LLAMA_CPP_BUILD_CUDA}) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 1ab926d4635..5331e87d451 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -99,10 +99,10 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); if is_debug { - println!("cargo:rustc-link-lib=dylib=fmtd"); + // println!("cargo:rustc-link-lib=dylib=fmtd"); println!("cargo:rustc-link-lib=dylib=spdlogd"); } else { - println!("cargo:rustc-link-lib=dylib=fmt"); + // println!("cargo:rustc-link-lib=dylib=fmt"); println!("cargo:rustc-link-lib=dylib=spdlog"); } diff --git a/backends/llamacpp/cmake/fmt.cmake b/backends/llamacpp/cmake/fmt.cmake deleted file mode 100644 index 840280ca8ba..00000000000 --- a/backends/llamacpp/cmake/fmt.cmake +++ /dev/null @@ -1,5 +0,0 @@ -FetchContent_Declare( - fmt - URL https://github.com/fmtlib/fmt/archive/refs/tags/11.0.2.tar.gz -) -FetchContent_MakeAvailable(fmt) diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index 04c218b5814..bd81d6d51a3 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -1,6 +1,6 @@ set(SPDLOG_USE_FMT ON) set(SPDLOG_BUILD_SHARED ON) -set(SPDLOG_FMT_EXTERNAL ON) +set(SPDLOG_FMT_EXTERNAL OFF) # Define the level at which SPDLOG_ compilation 
level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 739b84a1d36..11781273aed 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -7,9 +7,7 @@ #include #include -#include -#include -#include +#include #include #include "backend.hpp" From 4f5397c4147aab2e5818426e162321f7179dc2d1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 8 Nov 2024 00:54:05 +0100 Subject: [PATCH 53/92] misc(cmake): use URL base llama.cpp repo --- backends/llamacpp/CMakeLists.txt | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 938f7360011..f92bbe68661 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -33,17 +33,14 @@ endif () # Download llama.cpp repo at the specific version fetchcontent_declare( llama - # DOWNLOAD_EXTRACT_TIMESTAMP TRUE - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b3958 - GIT_SHALLOW FALSE + URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4048.tar.gz ) fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llamacpp_backend_impl PUBLIC fmt::fmt spdlog::spdlog llama common) +target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama common) install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") @@ -54,7 +51,7 @@ if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llamacpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama common spdlog::spdlog fmt::fmt) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC 
tgi_llamacpp_backend_impl llama common spdlog::spdlog) endif () From 86d30aea43c6b858fa260aaa49b2c95320f97236 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 9 Nov 2024 22:10:33 +0100 Subject: [PATCH 54/92] feat(backend): simplify overall cpp structure --- backends/llamacpp/csrc/backend.cpp | 103 ++++---------------------- backends/llamacpp/csrc/backend.hpp | 110 ++-------------------------- backends/llamacpp/csrc/ffi.hpp | 79 +++++++------------- backends/llamacpp/offline/main.cpp | 43 +++++++---- backends/llamacpp/src/backend.rs | 113 +++++++++++++++++------------ backends/llamacpp/src/lib.rs | 9 +-- backends/llamacpp/src/main.rs | 8 +- 7 files changed, 144 insertions(+), 321 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 11781273aed..837f87ea052 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -49,43 +49,28 @@ namespace huggingface::tgi::backends::llamacpp { } llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); - return llama_sampler_ptr(pSampler, llama_sampler_deleter); + return {pSampler, llama_sampler_deleter}; } worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) - : mModel_(model), mParams_(params) { + : model_(model), context_(llama_new_context_with_model(model_.get(), params)) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; llama_model_meta_val_str(model.get(), "general.name", modelName, sizeof(modelName)); - SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); + SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } - void worker_t::loop(std::stop_source &driver, std::queue &backlog) const { - auto *context = llama_new_context_with_model(mModel_.get(), mParams_); - - while (!driver.stop_requested()) { - const auto generation_context = backlog.front(); - - generate(context, generation_context, 
std::nullopt); - backlog.pop(); - - SPDLOG_DEBUG("Processed request ({:d} remaining)", backlog.size()); - } - - llama_free(context); - } - - size_t worker_t::generate( - llama_context *context, - const generation_context_t &generation_context, - const std::optional &callback) const { + std::expected + worker_t::generate(const generation_context_t &generation_context, + const std::optional &callback) const { // Store information about context and generation size + const auto callback_ = callback.value_or(llama_void_callback); auto max_new_tokens = generation_context.generation_params.max_new_tokens; // Convert sampling params to what llama.cpp is looking for - auto sampler = generation_context.sampling_params.into_llama_sampler(mModel_.get()); + auto sampler = generation_context.sampling_params.into_llama_sampler(model_.get()); // Set up the prompt auto copy = std::vector(generation_context.input_tokens.begin(), generation_context.input_tokens.end()); @@ -94,11 +79,10 @@ namespace huggingface::tgi::backends::llamacpp { // Decode auto n_decoded_tokens = 0; for (bool generating = true; generating; ++n_decoded_tokens) { - const auto callback_ = callback.value_or(llama_void_callback); #ifdef TGI_LLAMACPP_BACKEND_DEBUG const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(context, batch); + const auto status = llama_decode(context_.get(), batch); const auto end = std::chrono::steady_clock::now(); const auto latency = std::chrono::duration_cast(end - start); SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); @@ -108,8 +92,8 @@ namespace huggingface::tgi::backends::llamacpp { batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) [[likely]] { // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context, -1); - auto is_eog = llama_token_is_eog(mModel_.get(), new_token_id); + auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); + auto is_eog = 
llama_token_is_eog(model_.get(), new_token_id); auto new_token_logits = 0.0f; // TODO: return logit // Handle termination cases @@ -119,11 +103,8 @@ namespace huggingface::tgi::backends::llamacpp { generating = !(has_reach_max_tokens | has_reach_eog); // Bubble up the generated token if a callback is provided - const auto should_stop = std::invoke(std::forward(callback_), - new_token_id, - new_token_logits, - !generating, - n_decoded_tokens + 1); + const auto should_stop = + std::invoke(callback_, new_token_id, new_token_logits, !generating, n_decoded_tokens + 1); generating ^= should_stop; batch = llama_batch_get_one(&new_token_id, 1); @@ -132,62 +113,4 @@ namespace huggingface::tgi::backends::llamacpp { return n_decoded_tokens; } - - - backend_base_t::backend_base_t(llama_model *model) : mModel_(model, llama_free_model) { llama_backend_init(); } - - backend_base_t::~backend_base_t() { llama_backend_free(); } - - std::expected, backend_error_t> backend_base_t::generate( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const std::optional &callback - ) { - // TODO: Should we provide a way to change this value? 
- auto generated = std::vector(2 << 8); - auto inner_callback = [&](uint32_t new_token_id, float_t new_token_logit, bool is_eos, - size_t num_generated_tokens) -> bool { - generated.emplace_back(new_token_id); - - if (callback.has_value()) - return (*callback)(new_token_id, new_token_logit, is_eos, num_generated_tokens); - return true; - }; - - auto nTokensGenerated = stream(tokens, generation_params, sampling_params, inner_callback); - return generated; - } - - - /** Single worker_t Backend impl **/ - - single_worker_backend_t::single_worker_backend_t(llama_model *model, - const std::optional ¶ms) - : backend_base_t(model), - mContext_(llama_context_factory(model)), - mWorker_(mModel_, params.value_or(llama_context_default_params())) { - llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); - }; - - std::expected - single_worker_backend_t::stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback - ) { - return mWorker_.generate(mContext_.get(), {generation_params, sampling_params, tokens}, callback); - } - - std::expected - multi_worker_backend_t::stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback - ) { - SPDLOG_WARN("Not implemented for multi_worker_t"); - return 0; - } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 4abc202ded6..de37df75eb5 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -76,8 +76,8 @@ namespace huggingface::tgi::backends::llamacpp { */ class worker_t { private: - const std::shared_ptr mModel_; - const llama_context_params mParams_; + std::shared_ptr model_; + llama_context_ptr context_; public: /** @@ -85,7 +85,7 @@ namespace huggingface::tgi::backends::llamacpp { * @param model * @param params */ - 
worker_t(std::shared_ptr model, const llama_context_params ¶ms); + worker_t(std::shared_ptr, const llama_context_params &); /** * @@ -93,108 +93,8 @@ namespace huggingface::tgi::backends::llamacpp { * @param generation_context * @param callback */ - size_t - generate(llama_context *, const generation_context_t &, const std::optional &) const; - - /** - * - */ - void loop(std::stop_source &driver, std::queue &backlog) const; - }; - - - class backend_base_t { - - protected: - std::shared_ptr mModel_; - - public: - - /** - * - * @param model - */ - explicit backend_base_t(llama_model *model); - - /** - * Destructor - */ - ~backend_base_t(); - - /** - * - * @param tokens - * @param generation_params - * @param sampling_params - * @param callback - * @return - */ - [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - std::expected, backend_error_t> generate( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const std::optional &callback = std::nullopt - ); - - /** - * - * @param tokens - * @param generation_params - * @param sampling_params - * @params callback - * @return - */ - [[nodiscard("Generated tokens will be freed after this call if not assigned to an lvalue")]] - virtual std::expected stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback - ) = 0; - }; - - - class single_worker_backend_t : backend_base_t { - private: - constexpr static auto llama_context_factory = [](llama_model *pModel) -> llama_context_ptr { - auto llParams = llama_context_default_params(); - llParams.flash_attn = true; - llParams.n_batch = 1; - llParams.n_threads = 1; - llParams.no_perf = true; - llParams.attention_type = llama_attention_type::LLAMA_ATTENTION_TYPE_CAUSAL; - - return {llama_new_context_with_model(pModel, llParams), llama_context_deleter}; - }; - - llama_context_ptr 
mContext_; - worker_t mWorker_; - - public: - explicit single_worker_backend_t(llama_model *pModel, const std::optional &); - - using backend_base_t::generate; - - std::expected stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback) override; - }; - - class multi_worker_backend_t : backend_base_t { - private: - llama_context_ptr mContext_; - - public: - using backend_base_t::generate; - - std::expected stream( - std::span tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - const llama_decode_callback &callback) override; + [[nodiscard]] std::expected + generate(const generation_context_t &, const std::optional &) const; }; } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 9daacf2c84d..51a524cbbdd 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -7,58 +7,41 @@ #include #include +#include #include #include #include -#include "backend.hpp" namespace huggingface::tgi::backends::llamacpp { - struct generation_params_t; - struct sampling_params_t; - - class llama_cpp_backend_impl_t; + class llama_cpp_worker_frontend_t; } - +#include "backend.hpp" #include "backends/llamacpp/src/lib.rs.h" #include "rust/cxx.h" namespace huggingface::tgi::backends::llamacpp { - // Concept identifying types which have a .generate() -> size_t method to do in-place generation - template - concept has_stream_method = requires( - T t, - std::span input_tokens, - const generation_params_t &generation_params, - const sampling_params_t &sampling_params, - llama_decode_callback callback - ) { - { - t.stream(input_tokens, generation_params, sampling_params, callback) - } -> std::same_as>; + auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); }; + auto make_shared_llama_model = [](llama_model *model) { + return std::shared_ptr(model, 
llama_model_deleter); }; - static_assert(has_stream_method, "single_worker_backend_t doesn't meet concept has_stream_method"); - static_assert(has_stream_method, "multi_worker_backend_t doesn't meet concept has_stream_method"); - - class llama_cpp_backend_exception_t : std::exception { - - }; + class llama_cpp_backend_exception_t : std::exception {}; /** - * Llama.cpp backend interfacing with Rust FFI layer + * Llama.cpp frontend over the worker interfacing with Rust FFI layer */ - class llama_cpp_backend_impl_t { + class llama_cpp_worker_frontend_t { private: - std::variant mInner_; + std::shared_ptr model_; + worker_t worker_; public: - explicit llama_cpp_backend_impl_t(single_worker_backend_t &&backend) : mInner_(std::move(backend)) {} - - explicit llama_cpp_backend_impl_t(multi_worker_backend_t &&backend) : mInner_(std::move(backend)) {} + explicit llama_cpp_worker_frontend_t(llama_model *model): + model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {} size_t stream( rust::Slice input_tokens, @@ -67,41 +50,31 @@ namespace huggingface::tgi::backends::llamacpp { InferContext *ctx, rust::Fn callback ) { - // Define the visitor lambda function which requires the has_emplace_generate constraint on T - auto inner_fw = [=, &sampling_params, &ctx, &callback](T &&backend) - -> std::expected { - - auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { - return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); - }; - - // Ask the compiler to create view over Rust slice transmuting from uint32_t* to int32_t* - auto input_tokens_v = - std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); - - return backend.stream( - input_tokens_v, - generation_params, - sampling_params, - context_forwarding_callback - ); + auto context_forwarding_callback = + [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { 
+ return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); }; - if (const auto result = std::visit(inner_fw, mInner_); result.has_value()) { + // Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token* + auto input_tokens_v = + std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); + + const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v}; + if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] { return *result; } else { - throw llama_cpp_backend_exception_t(); + throw llama_cpp_backend_exception_t {}; } } }; - std::unique_ptr create_single_worker_backend(rust::Str modelPath) { + std::unique_ptr create_worker_frontend(rust::Str modelPath) { const auto cxxPath = std::string(modelPath); auto params = llama_model_default_params(); params.use_mmap = true; - auto *model = llama_load_model_from_file(cxxPath.c_str(), params); - return std::make_unique(single_worker_backend_t { model, std::nullopt }); + auto *model = (llama_load_model_from_file(cxxPath.c_str(), params)); + return std::make_unique(model); } } diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 7eb7dbde0a9..721abf051f5 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -1,16 +1,17 @@ // // Created by mfuntowicz on 10/3/24. 
// +#include -#include -#include -#include -#include +#include #include +#include s #include "../csrc/backend.hpp" using namespace huggingface::tgi::backends::llamacpp; +const auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); }; + int main(int argc, char **argv) { if (argc < 2) { fmt::print("No model folder provider"); @@ -18,21 +19,31 @@ int main(int argc, char **argv) { } spdlog::set_level(spdlog::level::debug); - + const auto modelPath = absolute(std::filesystem::path(argv[1])); const auto params = llama_model_default_params(); - auto *model = llama_load_model_from_file(modelPath.c_str(), params); + auto model = std::unique_ptr( + llama_load_model_from_file(modelPath.c_str(), params) + ); - auto backend = single_worker_backend_t(model, {}); + auto prompt = "My name is Morgan"; + auto tokens = std::vector(16); + const auto nb_tokens = llama_tokenize(model.get(), prompt, sizeof(prompt), tokens.data(), tokens.size(), true, + false); + tokens.resize(nb_tokens); + auto backend = worker_t{std::move(model), {.n_batch = 1, .n_threads = 4}}; + + fmt::println("Tokenized: {}", tokens); // generate - const auto promptTokens = {128000, 5159, 836, 374, 23809, 11}; - const auto out = backend.generate(promptTokens, {.max_new_tokens = 32}, {.top_k = 40}); - - if (out.has_value()) - fmt::print(FMT_STRING("Generated: {}"), *out); - else { - const auto err = out.error(); - fmt::print(fmt::emphasis::bold | fg(fmt::color::red), "Got an error: {:d}", static_cast(err)); - } + auto generated_tokens = std::vector(32); + const auto n_generated_tokens = backend.generate( + {{.max_new_tokens = 32}, {.top_k = 40}, tokens}, + [&generated_tokens](llama_token new_token_id, float_t logit, bool is_eos, size_t step) -> bool { + generated_tokens.emplace(generated_tokens.begin() + (step - 1), new_token_id); + return false; + } + ); + generated_tokens.resize(n_generated_tokens.value()); + fmt::println("Generated {} tokens", generated_tokens); } diff --git 
a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 8214c36a73b..8e36aa63160 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,8 +1,9 @@ use crate::ffi::{ - create_single_worker_backend, GenerationParams, LlamaCppBackendImpl, SamplingParams, + create_worker_frontend, GenerationParams, LlamaCppWorkerFrontend, SamplingParams, }; use async_trait::async_trait; use cxx::UniquePtr; +use std::ops::Deref; use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; @@ -21,7 +22,7 @@ use tracing::{debug, error, info}; type InferResult = Result; -unsafe impl Send for LlamaCppBackendImpl {} +unsafe impl Send for LlamaCppWorkerFrontend {} impl From<&ValidParameters> for SamplingParams { fn from(v: &ValidParameters) -> Self { @@ -68,41 +69,54 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -pub struct LlamaCppBackend { - backlog: Sender<(GenerationContext, UnboundedSender)>, - _scheduler_handle: JoinHandle<()>, +// pub struct LlamaCppBackend { +// backlog: Sender<(GenerationContext, UnboundedSender)>, +// _scheduler_handle: JoinHandle<()>, +// } + +struct LlamaCppWorker { + sender: Sender<(GenerationContext, UnboundedSender)>, + handle: JoinHandle<()>, +} + +pub enum LlamaCppBackend { + Single(LlamaCppWorker), + // Multi(Vec) } impl LlamaCppBackend { - pub fn new + Send>( + fn allocate_worker( + path: &Path, + ) -> Result, LlamaCppBackendError> { + create_worker_frontend(&path.display().to_string()).map_err(|ref err| { + LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string()) + }) + } + + pub fn new>( model_path: P, tokenizer: Tokenizer, + num_cores_per_instance: u16, ) -> Result { - let path = Arc::new(model_path.as_ref()); + let shared_path = Arc::new(model_path); + let path = shared_path.deref().as_ref(); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( path.display().to_string(), )); 
} - let backend = create_single_worker_backend(path.to_str().unwrap()).map_err(|err| { - LlamaCppBackendError::ModelInitializationFailed( - path.to_path_buf(), - err.what().to_string(), - ) - })?; - - info!( - "Successfully initialized llama.cpp backend from {}", - path.display() - ); + let worker = match num_cores_per_instance { + 0 => { + let worker = Self::allocate_worker(path)?; + let (sender, receiver) = channel(); + let handle = spawn(|| scheduler_loop(worker, tokenizer, receiver)); + LlamaCppBackend::Single(LlamaCppWorker { sender, handle }) + } + _ => panic!("No supported yet"), + }; - let (submitter, receiver) = channel(); - let handle = unsafe { spawn(|| scheduler_loop(backend, tokenizer, receiver)) }; - Ok(Self { - backlog: submitter, - _scheduler_handle: handle, - }) + Ok(worker) } } @@ -169,18 +183,16 @@ fn llama_generate_callback( }; // Send back to the client - let should_stop = if let Err(ref _err) = ctx.stream.send(response) { + if let Err(ref _err) = ctx.stream.send(response) { error!("Failed to send back the response to the client, cancelling request"); true } else { - true - }; - - should_stop + false + } } -unsafe fn scheduler_loop( - mut backend: UniquePtr, +fn scheduler_loop( + mut backend: UniquePtr, tokenizer: Tokenizer, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { @@ -204,20 +216,23 @@ unsafe fn scheduler_loop( generation, }); - let boxed_ctx = Box::into_raw(ctx); + // We leak the box to avoid it being freed after the first callback call + // when going out of scope + unsafe { + let boxed_ctx = Box::into_raw(ctx); + if let Err(e) = backend.pin_mut().stream( + &input_tokens, + generation_params, + &sampling_params, + boxed_ctx, + llama_generate_callback, + ) { + error!("Error while decoding tokens... {}", e.what()); + } - if let Err(e) = backend.pin_mut().stream( - &input_tokens, - generation_params, - &sampling_params, - boxed_ctx, - llama_generate_callback, - ) { - error!("Error while decoding tokens... 
{}", e.what()); + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(boxed_ctx); } - - // Make sure we re-keep track of the OpaqueStream box - let _ = Box::from_raw(boxed_ctx); } } else { info!("IPC channel is closed, exiting the scheduler loop"); @@ -244,11 +259,13 @@ impl Backend for LlamaCppBackend { sampling_params, }; - match self.backlog.send((ctx, sx)) { - Ok(_) => Ok(UnboundedReceiverStream::new(rx)), - Err(_) => Err(InferError::GenerationError( - "Failed to sent the request".to_string(), - )), + match self { + LlamaCppBackend::Single(worker) => match worker.sender.send((ctx, sx)) { + Ok(_) => Ok(UnboundedReceiverStream::new(rx)), + Err(_) => Err(InferError::GenerationError( + "Failed to sent the request".to_string(), + )), + }, } } else { Err(InferError::GenerationError( diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index abcdd1fad06..4f0fa800276 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -46,14 +46,13 @@ mod ffi { type SamplingParams; /// Represent an instance of the llama.cpp backend instance on C++ side - #[cxx_name = "llama_cpp_backend_impl_t"] - type LlamaCppBackendImpl; + #[cxx_name = "llama_cpp_worker_frontend_t"] + type LlamaCppWorkerFrontend; - #[rust_name = "create_single_worker_backend"] - fn create_single_worker_backend(modelPath: &str) -> Result>; + fn create_worker_frontend(modelPath: &str) -> Result>; unsafe fn stream( - self: Pin<&mut LlamaCppBackendImpl>, + self: Pin<&mut LlamaCppWorkerFrontend>, tokens: &[u32], generation_params: GenerationParams, sampling_params: &SamplingParams, diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index c5d735ab719..a2abd5556a8 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -37,8 +37,8 @@ struct Args { port: u16, #[clap(long, env, help = "Path to GGUF model file(s) to load")] gguf_path: PathBuf, - // #[clap(long, env, default_value = "1", help = "Number of 
model instance(s)")] - // num_model_instance: u16, + #[clap(long, env, help = "Number of CPU core per instance(s)")] + num_cores_per_instance: Option, #[clap(long, env, required = true)] tokenizer_name: String, #[clap(long, env)] @@ -95,7 +95,7 @@ async fn main() -> Result<(), RouterError> { hostname, port, gguf_path, - // num_model_instance, + num_cores_per_instance, tokenizer_name, tokenizer_config_path, revision, @@ -164,7 +164,7 @@ async fn main() -> Result<(), RouterError> { }; let tokenizer = tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) .expect("Failed to retrieve tokenizer"); - let backend = LlamaCppBackend::new(gguf_path, tokenizer)?; + let backend = LlamaCppBackend::new(gguf_path, tokenizer, num_cores_per_instance.unwrap_or(0))?; // Run server server::run( From 6915fa3441e3ab0026d996e3c6c100930b1e5dda Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sat, 9 Nov 2024 22:19:38 +0100 Subject: [PATCH 55/92] feat(backend): remove reinterpret_cast converting from uint32_t to llama_token(int32_t) --- backends/llamacpp/csrc/ffi.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 51a524cbbdd..70669b7cdb9 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -56,8 +56,8 @@ namespace huggingface::tgi::backends::llamacpp { }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token* - auto input_tokens_v = - std::span(reinterpret_cast(input_tokens.data()), input_tokens.size()); + auto input_tokens_v = std::vector(input_tokens.size()); + std::memcpy(input_tokens_v.data(), input_tokens.data(), input_tokens.size()); const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v}; if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] { From 
7e2890fe2cf14270e6e7ecd92500072b4655ab8c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 11 Nov 2024 19:50:11 +0100 Subject: [PATCH 56/92] feat(backend): remove unused function --- backends/llamacpp/csrc/backend.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 837f87ea052..66017fc5513 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -14,19 +14,6 @@ namespace huggingface::tgi::backends::llamacpp { - void llama_batch_fill_prompt(llama_batch &batch, std::span input_tokens) { - for (auto i = 0; i < input_tokens.size(); ++i) { - batch.token[i] = input_tokens[i]; - batch.pos[i] = i; - batch.n_seq_id[i] = 1; - batch.seq_id[i] = nullptr; - batch.logits[i] = false; - ++batch.n_tokens; - } - - batch.logits[batch.n_tokens] = true; - } - llama_sampler_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { auto *pSampler = llama_sampler_chain_init({.no_perf = false}); From 488ba938983ec7b0cf47e4a53ff28b590fb3de31 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 11 Nov 2024 19:50:33 +0100 Subject: [PATCH 57/92] feat(backend): fix invalid reference to context in release mode --- backends/llamacpp/csrc/backend.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 66017fc5513..eb91e51782c 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -74,7 +74,7 @@ namespace huggingface::tgi::backends::llamacpp { const auto latency = std::chrono::duration_cast(end - start); SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(context, batch); + const auto status = llama_decode(context_.get(), batch); #endif batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) [[likely]] { From 
363d5e45de275b3c2739e2a4f9abad5cfa7e9baa Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:07:59 +0100 Subject: [PATCH 58/92] feat(backend): use std::ranges to map uint32_t to llama_token --- backends/llamacpp/csrc/ffi.hpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 70669b7cdb9..948e96a0d1d 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -8,8 +8,8 @@ #include #include #include +#include #include -#include #include @@ -56,9 +56,16 @@ namespace huggingface::tgi::backends::llamacpp { }; // Ask the compiler to create view over Rust slice transmuting from uint32_t* to llama_token* - auto input_tokens_v = std::vector(input_tokens.size()); - std::memcpy(input_tokens_v.data(), input_tokens.data(), input_tokens.size()); + static auto as_llama_token = [](const uint32_t x){ return static_cast(x); }; +#ifdef __cpp_lib_ranges_to_container + auto input_tokens_v = input_tokens | std::views::transform(as_llama_token) | std::ranges::to(); +#else + auto input_tokens_ = input_tokens | std::views::transform(as_llama_token); + auto input_tokens_v = std::vector(input_tokens_.begin(), input_tokens_.end()); +#endif + + // Defer the generation to the actual worker_t const auto generation_context = generation_context_t {generation_params, sampling_params, input_tokens_v}; if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] { return *result; From 02cd6fe427b8ba705a4a138926971f8dc5562a9f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:08:26 +0100 Subject: [PATCH 59/92] chore(backend): minor improvements --- backends/llamacpp/csrc/ffi.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 948e96a0d1d..43694fa3276 100644 --- 
a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -50,6 +50,8 @@ namespace huggingface::tgi::backends::llamacpp { InferContext *ctx, rust::Fn callback ) { + // Wrapper around the provided Rust callback to inject the InferContext when returning from the C++ FFI boundaries + // It captures the context (ctx) using reference and will automatically call the Rust callback forwarding the InferContext auto context_forwarding_callback = [=, &ctx](uint32_t new_token_id, float_t logits, bool is_eos, size_t n_generated_tokens) -> bool { return callback(ctx, new_token_id, logits, is_eos, n_generated_tokens); @@ -76,11 +78,18 @@ namespace huggingface::tgi::backends::llamacpp { }; std::unique_ptr create_worker_frontend(rust::Str modelPath) { - const auto cxxPath = std::string(modelPath); + // Initialize the numa context from numactl + static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){ + llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL); + return true; + }(); + + // Allocate model weights parameters auto params = llama_model_default_params(); params.use_mmap = true; - auto *model = (llama_load_model_from_file(cxxPath.c_str(), params)); + // Allocate the model from the Rust provided, string path + auto *model = (llama_load_model_from_file(static_cast(modelPath).c_str(), params)); return std::make_unique(model); } } From daf1631e09710343c3e208be3282bd53c4cf3ccd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:08:49 +0100 Subject: [PATCH 60/92] dockerfile(backend): initial working version of llama.cpp container --- Dockerfile.llamacpp | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 0864c1bad08..3dab2a2968d 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -15,8 +15,10 @@ COPY router router RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder -ENV CMAKE_INSTALL_PREFIX=${CWD}/dist -RUN apt update && 
DEBIAN_FRONTEND=noninteractive apt install -y \ +ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt update && DEBIAN_FRONTEND=noninteractive apt install -y \ clang \ cmake \ gcc g++ \ @@ -48,8 +50,23 @@ COPY launcher launcher COPY router router ENV RUSTFLAGS="-L/usr/lib" +ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen -FROM ubuntu:24.04 +FROM ubuntu:22.04 +ENV DEBIAN_FRONTEND=noninteractive + +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt update && \ + apt upgrade -y && \ + apt install -y \ + openssl \ + python3.11-dev + COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher -COPY --from=builder /usr/src/text-generation-inference/dist /usr/ \ No newline at end of file +COPY --from=builder /usr/src/text-generation-inference/dist /usr/ + +ENV PORT=8080 +WORKDIR /usr/src/text-generation-inference +ENTRYPOINT ["text-generation-launcher"] \ No newline at end of file From 57b215467bc28b37a2b7a4ca98ea74b4a171d179 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 13 Nov 2024 00:22:11 +0100 Subject: [PATCH 61/92] feat(backend): simplify Rust callback --- backends/llamacpp/src/backend.rs | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 8e36aa63160..2dd5b70d116 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -134,23 +134,18 @@ fn llama_generate_callback( // Append the new token to the generated ones 
ctx.generation.generated_tokens.push(new_token_id); - // Decode token - let token = match ctx.tokenizer.decode(&[new_token_id], false) { + // Generate response + let response = match ctx.tokenizer.decode(&[new_token_id], false) { Ok(text) => { let special = ctx.tokenizer.get_added_vocabulary().is_special_token(&text); - Ok(Token { + let token = Token { id: new_token_id, text, logprob: new_token_logit, special, - }) - } - Err(ref err) => Err(InferError::GenerationError(err.to_string())), - }; + }; - // Create the streamed response - let response = match token { - Ok(token) => { + // Should we generate an ending or intermediate response? match is_final { false => Ok(InferStreamResponse::Intermediate { token, @@ -179,16 +174,14 @@ fn llama_generate_callback( } } } - Err(err) => Err(err), + Err(ref err) => Err(InferError::GenerationError(err.to_string())), }; // Send back to the client - if let Err(ref _err) = ctx.stream.send(response) { - error!("Failed to send back the response to the client, cancelling request"); - true - } else { - false - } + let status = ctx.stream.send(response).inspect_err(|err| { + error!("Failed to send back the response: {}", err); + }); + status.is_err() } fn scheduler_loop( From 6f059c4b5ddc7ddc3e5654767c9b7d24caa517da Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 14 Nov 2024 08:41:38 +0100 Subject: [PATCH 62/92] feat(backend): wrap Arc tokenizer to avoid duplicating --- backends/llamacpp/src/backend.rs | 60 ++++++++++++++------------------ backends/llamacpp/src/lib.rs | 2 +- backends/llamacpp/src/main.rs | 7 ++-- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 2dd5b70d116..dc29b707a1e 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -53,10 +53,10 @@ pub(crate) struct GenerationContext { pub(crate) sampling_params: SamplingParams, } -pub(crate) struct InferContext { +pub(crate) struct 
InferContext<'a> { pub(crate) start: Instant, pub(crate) stream: UnboundedSender, - pub(crate) tokenizer: Tokenizer, + pub(crate) tokenizer: &'a Tokenizer, pub(crate) generation: GenerationContext, } @@ -69,11 +69,6 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -// pub struct LlamaCppBackend { -// backlog: Sender<(GenerationContext, UnboundedSender)>, -// _scheduler_handle: JoinHandle<()>, -// } - struct LlamaCppWorker { sender: Sender<(GenerationContext, UnboundedSender)>, handle: JoinHandle<()>, @@ -95,7 +90,7 @@ impl LlamaCppBackend { pub fn new>( model_path: P, - tokenizer: Tokenizer, + tokenizer: Arc, num_cores_per_instance: u16, ) -> Result { let shared_path = Arc::new(model_path); @@ -110,7 +105,7 @@ impl LlamaCppBackend { 0 => { let worker = Self::allocate_worker(path)?; let (sender, receiver) = channel(); - let handle = spawn(|| scheduler_loop(worker, tokenizer, receiver)); + let handle = spawn(move || scheduler_loop(worker, tokenizer, receiver)); LlamaCppBackend::Single(LlamaCppWorker { sender, handle }) } _ => panic!("No supported yet"), @@ -186,7 +181,7 @@ fn llama_generate_callback( fn scheduler_loop( mut backend: UniquePtr, - tokenizer: Tokenizer, + tokenizer: Arc, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism @@ -195,37 +190,34 @@ fn scheduler_loop( loop { if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); - let tokenizer = tokenizer.clone(); let generation_params = generation.generation_params; // copy let sampling_params = generation.sampling_params; // copy let input_tokens = Arc::clone(&generation.input_tokens); // Creating the whole InferContext and pushing it to the heap - { - let ctx = Box::new(InferContext { - start, - stream, - tokenizer, - generation, - }); - - // We leak the box to avoid it being freed after the first callback call - // when going out of scope - unsafe { 
- let boxed_ctx = Box::into_raw(ctx); - if let Err(e) = backend.pin_mut().stream( - &input_tokens, - generation_params, - &sampling_params, - boxed_ctx, - llama_generate_callback, - ) { - error!("Error while decoding tokens... {}", e.what()); - } + let ctx = Box::new(InferContext { + start, + stream, + tokenizer: &tokenizer, + generation, + }); - // Make sure we re-keep track of the OpaqueStream box - let _ = Box::from_raw(boxed_ctx); + // We leak the box to avoid it being freed after the first callback call + // when going out of scope + unsafe { + let boxed_ctx = Box::into_raw(ctx); + if let Err(e) = backend.pin_mut().stream( + &input_tokens, + generation_params, + &sampling_params, + boxed_ctx, + llama_generate_callback, + ) { + error!("Error while decoding tokens... {}", e.what()); } + + // Make sure we re-keep track of the OpaqueStream box + let _ = Box::from_raw(boxed_ctx); } } else { info!("IPC channel is closed, exiting the scheduler loop"); diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 4f0fa800276..8fc989552be 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -33,7 +33,7 @@ mod ffi { } extern "Rust" { - type InferContext; + type InferContext<'a>; } unsafe extern "C++" { diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index a2abd5556a8..adc183edc5b 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -1,5 +1,6 @@ use clap::{Parser, Subcommand}; use std::path::PathBuf; +use std::sync::Arc; use text_generation_backend_llamacpp::backend::{LlamaCppBackend, LlamaCppBackendError}; use text_generation_router::server::ApiDoc; use text_generation_router::{server, usage_stats}; @@ -162,8 +163,10 @@ async fn main() -> Result<(), RouterError> { user_agent: Default::default(), auth_token, }; - let tokenizer = tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) - .expect("Failed to retrieve tokenizer"); + let tokenizer = 
Arc::new( + tokenizers::Tokenizer::from_pretrained(tokenizer_name.clone(), Some(options)) + .expect("Failed to retrieve tokenizer"), + ); let backend = LlamaCppBackend::new(gguf_path, tokenizer, num_cores_per_instance.unwrap_or(0))?; // Run server From 70c90ad93395bf5ea317efc10fdcd21d916cd89a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 14 Nov 2024 09:04:06 +0100 Subject: [PATCH 63/92] feat(backend): update llamacpp to 4077 --- backends/llamacpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index f92bbe68661..73369935594 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -33,7 +33,7 @@ endif () # Download llama.cpp repo at the specific version fetchcontent_declare( llama - URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4048.tar.gz + URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4077.tar.gz ) fetchcontent_makeavailable(llama) From 23d2bcf28dbbb01f5391a5ec56a3163ef9f018eb Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 14 Nov 2024 09:38:13 +0100 Subject: [PATCH 64/92] misc(build): improve build process --- backends/llamacpp/CMakeLists.txt | 7 +++---- backends/llamacpp/build.rs | 14 ++------------ backends/llamacpp/cmake/spdlog.cmake | 8 +++++++- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 73369935594..f6dd2db1db7 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -6,7 +6,6 @@ set(CMAKE_CXX_STANDARD 23) include(FetchContent) set(LLAMA_CPP_TARGET_VERSION "b3837" CACHE STRING "Version of llama.cpp to build against") -set(LLAMA_BUILD_COMMON ON) set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE STRING "CUDA arch(s) to build") option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") 
option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") @@ -40,8 +39,8 @@ fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) -target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama common) -install(TARGETS tgi_llamacpp_backend_impl spdlog llama common) +target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama) +install(TARGETS tgi_llamacpp_backend_impl spdlog llama) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") target_compile_definitions(tgi_llamacpp_backend_impl PRIVATE TGI_LLAMACPP_BACKEND_DEBUG=1) @@ -51,7 +50,7 @@ if (${LLAMA_CPP_BUILD_OFFLINE_RUNNER}) message(STATUS "Building llama.cpp offline runner") add_executable(tgi_llamacpp_offline_runner offline/main.cpp) - target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama common spdlog::spdlog) + target_link_libraries(tgi_llamacpp_offline_runner PUBLIC tgi_llamacpp_backend_impl llama spdlog::spdlog) endif () diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 5331e87d451..0e9f2ae9afe 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -59,9 +59,6 @@ fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { cxx_build::bridge("src/lib.rs") .static_flag(true) .std("c++23") - .include(deps_folder.join("spdlog-src").join("include")) // Why spdlog doesnt install headers? - .include(deps_folder.join("llama-src").join("ggml").join("include")) // Why ggml doesnt install headers? - .include(deps_folder.join("llama-src").join("common").join("include")) // Why common doesnt install headers? 
.include(install_prefix.join("include")) .include("csrc") .file("csrc/ffi.hpp") @@ -98,15 +95,8 @@ fn main() { // Linkage info println!("cargo:rustc-link-search=native={}", out_dir.display()); - if is_debug { - // println!("cargo:rustc-link-lib=dylib=fmtd"); - println!("cargo:rustc-link-lib=dylib=spdlogd"); - } else { - // println!("cargo:rustc-link-lib=dylib=fmt"); - println!("cargo:rustc-link-lib=dylib=spdlog"); - } - - println!("cargo:rustc-link-lib=static=common"); + let spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; + println!("cargo:rustc-link-lib=static={spdlog_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); diff --git a/backends/llamacpp/cmake/spdlog.cmake b/backends/llamacpp/cmake/spdlog.cmake index bd81d6d51a3..f9d590a7847 100644 --- a/backends/llamacpp/cmake/spdlog.cmake +++ b/backends/llamacpp/cmake/spdlog.cmake @@ -1,6 +1,12 @@ set(SPDLOG_USE_FMT ON) -set(SPDLOG_BUILD_SHARED ON) +set(SPDLOG_BUILD_SHARED OFF) set(SPDLOG_FMT_EXTERNAL OFF) +set(SPDLOG_INSTALL ON) +set(SPDLOG_NO_ATOMIC_LEVELS ON) # We are not modifying log levels concurrently + +if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + set(SPDLOG_CLOCK_COARSE ON) +endif () # Define the level at which SPDLOG_ compilation level is defined if (CMAKE_BUILD_TYPE STREQUAL "Debug") From 5335bf973b2fef2c592a3061ccbf9e5c4fec7ab7 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 21 Nov 2024 00:03:05 +0100 Subject: [PATCH 65/92] feat(backend): multistream inference on CPU --- Cargo.lock | 1 + backends/llamacpp/CMakeLists.txt | 6 + backends/llamacpp/Cargo.toml | 1 + backends/llamacpp/build.rs | 5 +- backends/llamacpp/cmake/numa.cmake | 20 ++++ backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/ffi.hpp | 23 ++++ backends/llamacpp/src/backend.rs | 173 +++++++++++++++++++++++------ backends/llamacpp/src/lib.rs | 2 + 9 files changed, 198 insertions(+), 35 deletions(-) create mode 100644 
backends/llamacpp/cmake/numa.cmake diff --git a/Cargo.lock b/Cargo.lock index 6b6cb7a7e18..81b7c282a7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4229,6 +4229,7 @@ dependencies = [ "log", "metrics", "metrics-exporter-prometheus", + "num_cpus", "pkg-config", "serde_json", "text-generation-router", diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index f6dd2db1db7..13107e0abce 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -18,6 +18,7 @@ else () endif () # Add dependencies +include(cmake/numa.cmake) include(cmake/spdlog.cmake) if (${LLAMA_CPP_BUILD_CUDA}) @@ -40,6 +41,11 @@ fetchcontent_makeavailable(llama) add_library(tgi_llamacpp_backend_impl STATIC csrc/backend.hpp csrc/backend.cpp) target_compile_features(tgi_llamacpp_backend_impl PRIVATE cxx_std_11) target_link_libraries(tgi_llamacpp_backend_impl PUBLIC spdlog::spdlog llama) + +if (NUMA_FOUND) + target_link_libraries(tgi_llamacpp_backend_impl PUBLIC numa) +endif () + install(TARGETS tgi_llamacpp_backend_impl spdlog llama) if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 48a0bb84362..0a5039b3034 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -9,6 +9,7 @@ homepage.workspace = true async-trait = "0.1" clap = { version = "4.5.19", features = ["derive"] } cxx = "1.0" +num_cpus = "1" hf-hub = { workspace = true } image = { version = "0.25.1", features = ["default-formats"] } metrics = { workspace = true } diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 0e9f2ae9afe..22726db1843 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -86,6 +86,7 @@ fn main() { // Emit linkage search path probe!("ompi", MPI_REQUIRED_VERSION); + probe!("numa", "2.0"); // Backend BACKEND_DEPS.iter().for_each(|name| { @@ -96,7 +97,9 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); let 
spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; - println!("cargo:rustc-link-lib=static={spdlog_linkage_target}"); + let fmt_linkage_target = if is_debug { "fmtd" } else { "fmt" }; + println!("cargo:rustc-link-lib=dylib={spdlog_linkage_target}"); + println!("cargo:rustc-link-lib=dylib={fmt_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); diff --git a/backends/llamacpp/cmake/numa.cmake b/backends/llamacpp/cmake/numa.cmake new file mode 100644 index 00000000000..0399b752ce9 --- /dev/null +++ b/backends/llamacpp/cmake/numa.cmake @@ -0,0 +1,20 @@ +# Find the numa policy library. +# Output variables: +# NUMA_INCLUDE_DIR : e.g., /usr/include/. +# NUMA_LIBRARY : Library path of numa library +# NUMA_FOUND : True if found. +FIND_PATH(NUMA_INCLUDE_DIR NAME numa.h + HINTS $ENV{HOME}/local/include /opt/local/include /usr/local/include /usr/include) + +FIND_LIBRARY(NUMA_LIBRARY NAME numa + HINTS $ENV{HOME}/local/lib64 $ENV{HOME}/local/lib /usr/local/lib64 /usr/local/lib /opt/local/lib64 /opt/local/lib /usr/lib64 /usr/lib +) + +IF (NUMA_INCLUDE_DIR AND NUMA_LIBRARY) + SET(NUMA_FOUND TRUE) + MESSAGE(STATUS "Found numa library: inc=${NUMA_INCLUDE_DIR}, lib=${NUMA_LIBRARY}") +ELSE () + SET(NUMA_FOUND FALSE) + MESSAGE(STATUS "WARNING: Numa library not found.") + MESSAGE(STATUS "Try: 'sudo apt-get install libnuma libnuma-dev' (or sudo yum install numactl numactl-devel)") +ENDIF () \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index eb91e51782c..a30eb217e95 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -45,7 +45,7 @@ namespace huggingface::tgi::backends::llamacpp { #ifdef TGI_LLAMACPP_BACKEND_DEBUG char modelName[256]; llama_model_meta_val_str(model.get(), "general.name", modelName, sizeof(modelName)); - SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), 
std::string_view(modelName)); + SPDLOG_DEBUG(FMT_STRING("Created llama.cpp backend for model: '{}'"), std::string_view(modelName)); #endif } diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 43694fa3276..9700f52e201 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -5,13 +5,19 @@ #ifndef TGI_LLAMA_CPP_BACKEND_FFI_HPP #define TGI_LLAMA_CPP_BACKEND_FFI_HPP +#include #include #include #include #include #include +#include #include +#include +#include + +#include namespace huggingface::tgi::backends::llamacpp { class llama_cpp_worker_frontend_t; @@ -92,6 +98,23 @@ namespace huggingface::tgi::backends::llamacpp { auto *model = (llama_load_model_from_file(static_cast(modelPath).c_str(), params)); return std::make_unique(model); } + + void set_numactl_core_affinity(rust::Slice affinity) { + SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id()); +// auto nodes = std::unordered_set(); + auto cpumask = numa_allocate_cpumask(); + for(auto core : affinity) { + numa_bitmask_setbit(cpumask, core); + numa_sched_setaffinity(0, cpumask); + } + +//#ifdef TGI_LLAMACPP_BACKEND_DEBUG + auto cpumask_check = numa_allocate_cpumask(); + numa_sched_getaffinity(0, cpumask_check); + SPDLOG_DEBUG(FMT_STRING("numa_sched_affinity for thread {} -> {:b}"), std::this_thread::get_id(), *cpumask_check->maskp); +//#endif + + } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index dc29b707a1e..fa5bfbab0e3 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,13 +1,17 @@ use crate::ffi::{ - create_worker_frontend, GenerationParams, LlamaCppWorkerFrontend, SamplingParams, + create_worker_frontend, set_numactl_core_affinity, GenerationParams, LlamaCppWorkerFrontend, + SamplingParams, }; use async_trait::async_trait; use cxx::UniquePtr; -use std::ops::Deref; +use log::warn; +use std::cell::RefCell; +use std::ops::Range; 
use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; -use std::thread::{spawn, JoinHandle}; +use std::thread::spawn; +use text_generation_router::infer::InferError::GenerationError; use text_generation_router::infer::{Backend, GeneratedText, InferError, InferStreamResponse}; use text_generation_router::validation::{ ValidGenerateRequest, ValidParameters, ValidStoppingParameters, @@ -15,11 +19,41 @@ use text_generation_router::validation::{ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokenizers::Tokenizer; -use tokio::sync::mpsc::{unbounded_channel, UnboundedSender}; +use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; +use tokio::sync::{Semaphore, SemaphorePermit, TryAcquireError}; +use tokio::task::JoinHandle; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; +macro_rules! send_or_warn { + ($send: expr, $err: expr) => { + if let Err(se) = $send.send(err) { + warn!( + "Failed to send message back to the user: {}. Originating error: {}", + se, e + ); + } + }; +} + +fn get_num_cores() -> usize { + match option_env!("TGI_USE_PHYSICAL_CORES") + .unwrap_or("OFF") + .to_uppercase() + .as_str() + { + "ON" => { + info!("Using only physical cores on the machine"); + num_cpus::get_physical() + } + _ => { + info!("Using physical and logical cores on the machine"); + num_cpus::get() + } + } +} + type InferResult = Result; unsafe impl Send for LlamaCppWorkerFrontend {} @@ -71,12 +105,19 @@ pub enum LlamaCppBackendError { struct LlamaCppWorker { sender: Sender<(GenerationContext, UnboundedSender)>, - handle: JoinHandle<()>, } -pub enum LlamaCppBackend { - Single(LlamaCppWorker), - // Multi(Vec) +impl LlamaCppWorker { + fn submit(&self, ctx: GenerationContext, sx: UnboundedSender) { + if let Err(err) = self.sender.send((ctx, sx)) { + // TODO: What do we do? 
+ } + } +} + +pub struct LlamaCppBackend { + scheduler_sender: UnboundedSender<(GenerationContext, UnboundedSender)>, + scheduler_handle: JoinHandle<()>, } impl LlamaCppBackend { @@ -93,28 +134,67 @@ impl LlamaCppBackend { tokenizer: Arc, num_cores_per_instance: u16, ) -> Result { - let shared_path = Arc::new(model_path); - let path = shared_path.deref().as_ref(); + let path = model_path.as_ref(); if !path.exists() { return Err(LlamaCppBackendError::ModelFileDoesntExist( path.display().to_string(), )); } - let worker = match num_cores_per_instance { - 0 => { - let worker = Self::allocate_worker(path)?; - let (sender, receiver) = channel(); - let handle = spawn(move || scheduler_loop(worker, tokenizer, receiver)); - LlamaCppBackend::Single(LlamaCppWorker { sender, handle }) - } - _ => panic!("No supported yet"), - }; + let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); + + // Allocate all the workers + let streams = cores_allocation + .iter() + .map(|affinity| match Self::allocate_worker(path) { + Ok(worker) => { + let tokenizer = Arc::clone(&tokenizer); + let (sender, receiver) = channel(); + let affinity = affinity.clone().collect::>(); + spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); - Ok(worker) + Ok(LlamaCppWorker { sender }) + } + Err(e) => Err(e), + }) + .collect::, _>>()?; + + // Start the scheduler loop + let (scheduler_sender, scheduler_receiver) = unbounded_channel(); + let scheduler_handle = tokio::spawn(scheduler_loop(scheduler_receiver, streams)); + Ok(Self { + scheduler_sender, + scheduler_handle, + }) } } +fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { + // Get the total number of cores on the CPU + let cores_count = get_num_cores(); + + // Make sure each instance has some cores available + let mut effective_num_cores_per_instance = match num_cores_per_instance { + 0 => cores_count, + _ => num_cores_per_instance, + }; + + // If we have spare cores, let's see if we can give everyone one 
more core + let mut num_instances = cores_count / effective_num_cores_per_instance; + if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { + effective_num_cores_per_instance = effective_num_cores_per_instance + 1; + warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); + } + + (0..num_instances) + .map(|ordinal| { + let start = ordinal * effective_num_cores_per_instance; + let end = (ordinal + 1) * effective_num_cores_per_instance - 1; + (start..end) + }) + .collect() +} + fn llama_generate_callback( ctx: *mut InferContext, new_token_id: u32, @@ -164,12 +244,12 @@ fn llama_generate_callback( start: ctx.start, queued: ctx.start, }), - Err(err) => Err(InferError::GenerationError(err.to_string())), + Err(err) => Err(GenerationError(err.to_string())), } } } } - Err(ref err) => Err(InferError::GenerationError(err.to_string())), + Err(ref err) => Err(GenerationError(err.to_string())), }; // Send back to the client @@ -179,14 +259,43 @@ fn llama_generate_callback( status.is_err() } -fn scheduler_loop( +async fn scheduler_loop( + mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender)>, + mut workers: Vec, +) { + // Semaphore allows us to wait for a worker to become available + let permits = Semaphore::new(workers.len()); + + // Let's receive incoming requests + loop { + match queue.recv().await { + None => break, + Some((ctx, sender)) => { + let permit = permits.try_acquire(); + if let Err(err) = permit { + let _ = sender.send(Err(InferError::Overloaded(err))); + } + + // We can unwrap because we wouldn't have a semaphore available otherwise + let worker = workers.pop().unwrap(); + worker.submit(ctx, sender); + } + } + } +} + +fn worker_loop( mut backend: UniquePtr, + affinity: Vec, tokenizer: Arc, backlog: Receiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism 
tokenizers::utils::parallelism::set_parallelism(false); + // Bind cores for the current thread + set_numactl_core_affinity(&affinity); + loop { if let Ok((generation, stream)) = backlog.recv() { let start = Instant::now(); @@ -214,6 +323,7 @@ fn scheduler_loop( llama_generate_callback, ) { error!("Error while decoding tokens... {}", e.what()); + // TODO: What error to give back to the user? } // Make sure we re-keep track of the OpaqueStream box @@ -244,18 +354,15 @@ impl Backend for LlamaCppBackend { sampling_params, }; - match self { - LlamaCppBackend::Single(worker) => match worker.sender.send((ctx, sx)) { - Ok(_) => Ok(UnboundedReceiverStream::new(rx)), - Err(_) => Err(InferError::GenerationError( - "Failed to sent the request".to_string(), - )), - }, + // We send the workload to the scheduler + if let Err(e) = self.scheduler_sender.send((ctx, sx)) { + Err(InferError::IncompleteGenerationStream) + } else { + // We are returning the associated channel as early as we can, potentially closing it up + Ok(UnboundedReceiverStream::new(rx)) } } else { - Err(InferError::GenerationError( - "Unsupported modalities".to_string(), - )) + Err(GenerationError("Unsupported modalities".to_string())) } } diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 8fc989552be..f9fc72e513f 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -51,6 +51,8 @@ mod ffi { fn create_worker_frontend(modelPath: &str) -> Result>; + fn set_numactl_core_affinity(affinity: &[usize]); + unsafe fn stream( self: Pin<&mut LlamaCppWorkerFrontend>, tokens: &[u32], From 50c376612cd49a0a9c16c67b3ee61bd5add96766 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 21 Nov 2024 13:52:38 +0100 Subject: [PATCH 66/92] feat(backend): bind thread and memory affinity for thread --- backends/llamacpp/build.rs | 38 +++++++++----- backends/llamacpp/cmake/numa.cmake | 2 +- backends/llamacpp/csrc/ffi.hpp | 84 +++++++++++++++++++++++++----- 
backends/llamacpp/src/backend.rs | 7 ++- 4 files changed, 101 insertions(+), 30 deletions(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 22726db1843..023ccfbaadb 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -12,8 +12,12 @@ const BACKEND_DEPS: [&str; 2] = [CMAKE_LLAMA_CPP_TARGET, CMAKE_LLAMA_CPP_FFI_TAR macro_rules! probe { ($name: expr, $version: expr) => { if let Err(_) = pkg_config::probe_library($name) { - pkg_config::probe_library(&format!("{}-{}", $name, $version)) - .expect(&format!("Failed to locate {}", $name)); + match pkg_config::probe_library(&format!("{}-{}", $name, $version)) { + Ok(_) => Ok(()), + Err(_) => Err(()), + } + } else { + Ok(()) } }; } @@ -53,16 +57,27 @@ fn build_backend( deps_folder } -fn build_ffi_layer(deps_folder: &Path, install_prefix: &Path) { - println!("cargo:warning={}", deps_folder.display()); +fn build_ffi_layer(is_debug: bool, install_prefix: &Path) { CFG.include_prefix = "backends/llamacpp"; - cxx_build::bridge("src/lib.rs") + + let mut bridge = cxx_build::bridge("src/lib.rs"); + + bridge .static_flag(true) .std("c++23") .include(install_prefix.join("include")) .include("csrc") - .file("csrc/ffi.hpp") - .compile(CMAKE_LLAMA_CPP_FFI_TARGET); // Make sure this target is not the same as cmake above + .file("csrc/ffi.hpp"); + + if is_debug { + bridge.define("TGI_LLAMACPP_BACKEND_DEBUG", ""); + } + + if probe!("numa", "2.0").is_ok() { + bridge.define("NUMA_AVAILABLE", ""); + }; + + bridge.compile(CMAKE_LLAMA_CPP_FFI_TARGET); // Make sure this target is not the same as cmake above } fn main() { @@ -82,11 +97,12 @@ fn main() { let deps_path = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); // Build the FFI layer calling the backend above - build_ffi_layer(&deps_path, &install_path); + build_ffi_layer(is_debug, &install_path); // Emit linkage search path - probe!("ompi", MPI_REQUIRED_VERSION); - probe!("numa", "2.0"); + if probe!("ompi", 
MPI_REQUIRED_VERSION).is_err() { + panic!("An implement of MPI is required"); + } // Backend BACKEND_DEPS.iter().for_each(|name| { @@ -97,9 +113,7 @@ fn main() { println!("cargo:rustc-link-search=native={}", out_dir.display()); let spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; - let fmt_linkage_target = if is_debug { "fmtd" } else { "fmt" }; println!("cargo:rustc-link-lib=dylib={spdlog_linkage_target}"); - println!("cargo:rustc-link-lib=dylib={fmt_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); println!("cargo:rustc-link-lib=dylib=llama"); diff --git a/backends/llamacpp/cmake/numa.cmake b/backends/llamacpp/cmake/numa.cmake index 0399b752ce9..94dfddc2779 100644 --- a/backends/llamacpp/cmake/numa.cmake +++ b/backends/llamacpp/cmake/numa.cmake @@ -13,8 +13,8 @@ FIND_LIBRARY(NUMA_LIBRARY NAME numa IF (NUMA_INCLUDE_DIR AND NUMA_LIBRARY) SET(NUMA_FOUND TRUE) MESSAGE(STATUS "Found numa library: inc=${NUMA_INCLUDE_DIR}, lib=${NUMA_LIBRARY}") + add_compile_definitions(NUMA_AVAILABLE) ELSE () SET(NUMA_FOUND FALSE) MESSAGE(STATUS "WARNING: Numa library not found.") - MESSAGE(STATUS "Try: 'sudo apt-get install libnuma libnuma-dev' (or sudo yum install numactl numactl-devel)") ENDIF () \ No newline at end of file diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 9700f52e201..147f81aef02 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -17,7 +17,12 @@ #include #include +#ifdef NUMA_AVAILABLE +#define CURRENT_THREAD 0 +#include +#include #include +#endif namespace huggingface::tgi::backends::llamacpp { class llama_cpp_worker_frontend_t; @@ -84,6 +89,10 @@ namespace huggingface::tgi::backends::llamacpp { }; std::unique_ptr create_worker_frontend(rust::Str modelPath) { +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + spdlog::set_level(spdlog::level::debug); +#endif + // Initialize the numa context from numactl static const bool INITIALIZED_NUMA_CONTEXT_ONCE = [](){ 
llama_numa_init(GGML_NUMA_STRATEGY_NUMACTL); @@ -99,21 +108,70 @@ namespace huggingface::tgi::backends::llamacpp { return std::make_unique(model); } - void set_numactl_core_affinity(rust::Slice affinity) { - SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id()); -// auto nodes = std::unordered_set(); - auto cpumask = numa_allocate_cpumask(); - for(auto core : affinity) { - numa_bitmask_setbit(cpumask, core); - numa_sched_setaffinity(0, cpumask); - } + struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; + typedef std::unique_ptr unique_cpumask_ptr; -//#ifdef TGI_LLAMACPP_BACKEND_DEBUG - auto cpumask_check = numa_allocate_cpumask(); - numa_sched_getaffinity(0, cpumask_check); - SPDLOG_DEBUG(FMT_STRING("numa_sched_affinity for thread {} -> {:b}"), std::this_thread::get_id(), *cpumask_check->maskp); -//#endif + void set_numactl_core_affinity(rust::Slice affinity) { +// void set_numactl_core_affinity(std::vector affinity) { +#ifdef NUMA_AVAILABLE + if(numa_available()) { + SPDLOG_INFO("Setting numactl cores affinity to {} for thread {}", affinity, std::this_thread::get_id()); + + auto cpumask = unique_cpumask_ptr(numa_allocate_cpumask()); + std::ranges::for_each(affinity, [&cpumask](size_t cpu) { numa_bitmask_setbit(cpumask.get(), cpu); }); + numa_sched_setaffinity(CURRENT_THREAD, cpumask.get()); + + // Retrieve some information about the current setup + if(const auto numa_num_nodes = numa_num_configured_nodes(); numa_num_nodes > 1) { + const auto *numa_all_cpus = numa_all_cpus_ptr; + SPDLOG_INFO(FMT_STRING("All CPUs: {:b} (# Nodes: {:d}"), *numa_all_cpus->maskp, numa_num_nodes); + + // Retrieve the cpumask specific for the current node + auto cpus_per_node = unique_cpumask_ptr(numa_allocate_cpumask()); + + // Allocate a set which keeps track of which nodes is being targeted + auto numa_spawning_nodes = std::unordered_set(); + for(auto node = 0; node < numa_num_nodes; 
++node) { + // Retrieve the cpumask for the target node + numa_node_to_cpus(node, cpus_per_node.get()); + + // intersect which cores on the nodes are targeted, in no one on that specific node + // the value of allocated_cpus_on_node will be 0 as the result of the AND operation. + const auto allocated_cpus_on_node = *cpus_per_node->maskp & *cpumask->maskp; + if(allocated_cpus_on_node > 0) { + + // If we have some cores on the node, attempt to insert in the set of targeted node + if(const auto [_, was_inserted] = numa_spawning_nodes.emplace(node); was_inserted) { + SPDLOG_DEBUG("Allocated thread spawning node: {:d}", node); + } + } + + // Clear all the bits relative to the current node + numa_bitmask_clearall(cpus_per_node.get()); + } + + // Bind the memory if we spawn a single node, otherwise, let's display a warning + if(numa_spawning_nodes.size() == 1) { + SPDLOG_INFO(FMT_STRING("Setting memory affinity to node: {:d}"), *numa_spawning_nodes.begin()); + numa_set_preferred(*numa_spawning_nodes.begin()); + } else { + SPDLOG_WARN(FMT_STRING("Specified thread affinity spawn multiple NUMA nodes: {}"), numa_spawning_nodes); + } + } +#ifdef TGI_LLAMACPP_BACKEND_DEBUG + // Sanity check in the logs... 
+ auto *cpumask_check = numa_allocate_cpumask(); + numa_sched_getaffinity(CURRENT_THREAD, cpumask_check); + SPDLOG_DEBUG( + FMT_STRING("numa_sched_affinity for thread {} -> {:b}"), + std::this_thread::get_id(), *cpumask_check->maskp); + numa_free_cpumask(cpumask_check); +#endif + } +#else + SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support"); +#endif } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index fa5bfbab0e3..1ef959a82c0 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -5,7 +5,6 @@ use crate::ffi::{ use async_trait::async_trait; use cxx::UniquePtr; use log::warn; -use std::cell::RefCell; use std::ops::Range; use std::path::{Path, PathBuf}; use std::sync::mpsc::{channel, Receiver, Sender}; @@ -20,7 +19,7 @@ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; -use tokio::sync::{Semaphore, SemaphorePermit, TryAcquireError}; +use tokio::sync::Semaphore; use tokio::task::JoinHandle; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -180,7 +179,7 @@ fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { }; // If we have spare cores, let's see if we can give everyone one more core - let mut num_instances = cores_count / effective_num_cores_per_instance; + let num_instances = cores_count / effective_num_cores_per_instance; if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { effective_num_cores_per_instance = effective_num_cores_per_instance + 1; warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); @@ -190,7 +189,7 @@ fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { .map(|ordinal| { let start = ordinal * effective_num_cores_per_instance; let end = (ordinal + 1) * effective_num_cores_per_instance - 1; - (start..end) + 
start..end }) .collect() } From 84eead219af7e4cb5068d87ff4311afbb7b5b55a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 21 Nov 2024 21:43:50 +0100 Subject: [PATCH 67/92] feat(backend): correctly setup llama_context providing n_threads and n_ubatch --- backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/backend.hpp | 2 +- backends/llamacpp/csrc/ffi.hpp | 8 ++++---- backends/llamacpp/src/backend.rs | 27 +++++++++++++++------------ backends/llamacpp/src/lib.rs | 5 ++++- 5 files changed, 25 insertions(+), 19 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index a30eb217e95..54f1cf73683 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -39,7 +39,7 @@ namespace huggingface::tgi::backends::llamacpp { return {pSampler, llama_sampler_deleter}; } - worker_t::worker_t(std::shared_ptr model, const llama_context_params ¶ms) + worker_t::worker_t(std::shared_ptr model, const llama_context_params &¶ms) : model_(model), context_(llama_new_context_with_model(model_.get(), params)) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index de37df75eb5..039d4eac9f1 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -85,7 +85,7 @@ namespace huggingface::tgi::backends::llamacpp { * @param model * @param params */ - worker_t(std::shared_ptr, const llama_context_params &); + worker_t(std::shared_ptr, const llama_context_params &&); /** * diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 147f81aef02..f9eec781967 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -51,8 +51,8 @@ namespace huggingface::tgi::backends::llamacpp { worker_t worker_; public: - explicit llama_cpp_worker_frontend_t(llama_model *model): - model_{ make_shared_llama_model(model) }, worker_(model_, {.no_perf = true}) {} + explicit 
llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads): + model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {} size_t stream( rust::Slice input_tokens, @@ -88,7 +88,7 @@ namespace huggingface::tgi::backends::llamacpp { } }; - std::unique_ptr create_worker_frontend(rust::Str modelPath) { + std::unique_ptr create_worker_frontend(rust::Str modelPath, uint32_t num_threads) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG spdlog::set_level(spdlog::level::debug); #endif @@ -105,7 +105,7 @@ namespace huggingface::tgi::backends::llamacpp { // Allocate the model from the Rust provided, string path auto *model = (llama_load_model_from_file(static_cast(modelPath).c_str(), params)); - return std::make_unique(model); + return std::make_unique(model, static_cast(num_threads)); } struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 1ef959a82c0..e846a476e16 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -122,8 +122,9 @@ pub struct LlamaCppBackend { impl LlamaCppBackend { fn allocate_worker( path: &Path, + num_threads: u32, ) -> Result, LlamaCppBackendError> { - create_worker_frontend(&path.display().to_string()).map_err(|ref err| { + create_worker_frontend(&path.display().to_string(), num_threads).map_err(|ref err| { LlamaCppBackendError::ModelInitializationFailed(path.to_path_buf(), err.to_string()) }) } @@ -145,17 +146,19 @@ impl LlamaCppBackend { // Allocate all the workers let streams = cores_allocation .iter() - .map(|affinity| match Self::allocate_worker(path) { - Ok(worker) => { - let tokenizer = Arc::clone(&tokenizer); - let (sender, receiver) = channel(); - let affinity = affinity.clone().collect::>(); - spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); - - Ok(LlamaCppWorker { sender }) - } - Err(e) => Err(e), - 
}) + .map( + |affinity| match Self::allocate_worker(path, num_cores_per_instance as u32) { + Ok(worker) => { + let tokenizer = Arc::clone(&tokenizer); + let (sender, receiver) = channel(); + let affinity = affinity.clone().collect::>(); + spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); + + Ok(LlamaCppWorker { sender }) + } + Err(e) => Err(e), + }, + ) .collect::, _>>()?; // Start the scheduler loop diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index f9fc72e513f..6b047bf53ff 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -49,7 +49,10 @@ mod ffi { #[cxx_name = "llama_cpp_worker_frontend_t"] type LlamaCppWorkerFrontend; - fn create_worker_frontend(modelPath: &str) -> Result>; + fn create_worker_frontend( + modelPath: &str, + num_threads: u32, + ) -> Result>; fn set_numactl_core_affinity(affinity: &[usize]); From 5a856616610d08ae58c6db78c8fe7d84327b7a19 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 13:32:56 +0100 Subject: [PATCH 68/92] feat(backend): rely on multi consumer queue to scheduler workers --- Cargo.lock | 49 ++++++++++++++++++++++++ backends/llamacpp/Cargo.toml | 1 + backends/llamacpp/src/backend.rs | 65 +++++++++++--------------------- 3 files changed, 71 insertions(+), 44 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 81b7c282a7e..4b4e7670ee7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -142,6 +142,18 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "async-channel" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89b47800b0be77592da0afd425cc03468052844aff33b84e33cc696f64e77b6a" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + [[package]] name = "async-rustls" version = "0.3.0" @@ -758,6 +770,15 @@ 
dependencies = [ "static_assertions", ] +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "console" version = "0.15.8" @@ -1158,6 +1179,27 @@ dependencies = [ "cc", ] +[[package]] +name = "event-listener" +version = "5.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6032be9bd27023a771701cc49f9f053c751055f71efb2e0ae5c15809093675ba" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "exr" version = "1.72.0" @@ -2922,6 +2964,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.3" @@ -4219,6 +4267,7 @@ dependencies = [ name = "text-generation-backend-llamacpp" version = "2.4.1-dev0" dependencies = [ + "async-channel", "async-trait", "clap 4.5.20", "cmake", diff --git a/backends/llamacpp/Cargo.toml b/backends/llamacpp/Cargo.toml index 0a5039b3034..df2c3421866 100644 --- a/backends/llamacpp/Cargo.toml +++ b/backends/llamacpp/Cargo.toml @@ -7,6 +7,7 @@ homepage.workspace = true [dependencies] async-trait = "0.1" +async-channel = "2.3" clap = { version = "4.5.19", features = ["derive"] } cxx = "1.0" num_cpus = "1" diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index e846a476e16..5bcb913b776 100644 --- a/backends/llamacpp/src/backend.rs +++ 
b/backends/llamacpp/src/backend.rs @@ -2,6 +2,7 @@ use crate::ffi::{ create_worker_frontend, set_numactl_core_affinity, GenerationParams, LlamaCppWorkerFrontend, SamplingParams, }; +use async_channel::{unbounded as mpmc_unbounded, Receiver as MpmcReceiver, Sender as MpmcSender}; use async_trait::async_trait; use cxx::UniquePtr; use log::warn; @@ -19,7 +20,6 @@ use text_generation_router::{FinishReason, Token}; use thiserror::Error; use tokenizers::Tokenizer; use tokio::sync::mpsc::{unbounded_channel, UnboundedReceiver, UnboundedSender}; -use tokio::sync::Semaphore; use tokio::task::JoinHandle; use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; @@ -102,18 +102,6 @@ pub enum LlamaCppBackendError { ModelInitializationFailed(PathBuf, String), } -struct LlamaCppWorker { - sender: Sender<(GenerationContext, UnboundedSender)>, -} - -impl LlamaCppWorker { - fn submit(&self, ctx: GenerationContext, sx: UnboundedSender) { - if let Err(err) = self.sender.send((ctx, sx)) { - // TODO: What do we do? 
- } - } -} - pub struct LlamaCppBackend { scheduler_sender: UnboundedSender<(GenerationContext, UnboundedSender)>, scheduler_handle: JoinHandle<()>, @@ -141,29 +129,26 @@ impl LlamaCppBackend { )); } - let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); + // Allocate the multi-consumer queue to orchestrate all the workers + let (backlog_submitter, backlog_receiver) = mpmc_unbounded(); // Allocate all the workers - let streams = cores_allocation - .iter() - .map( - |affinity| match Self::allocate_worker(path, num_cores_per_instance as u32) { - Ok(worker) => { - let tokenizer = Arc::clone(&tokenizer); - let (sender, receiver) = channel(); - let affinity = affinity.clone().collect::>(); - spawn(move || worker_loop(worker, affinity, tokenizer, receiver)); - - Ok(LlamaCppWorker { sender }) - } - Err(e) => Err(e), - }, - ) - .collect::, _>>()?; + let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); + cores_allocation.iter().for_each(|affinity| { + match Self::allocate_worker(path, num_cores_per_instance as u32) { + Ok(worker) => { + let tokenizer = Arc::clone(&tokenizer); + let affinity = affinity.clone().collect::>(); + let backlog_receiver = backlog_receiver.clone(); + spawn(move || worker_loop(worker, affinity, tokenizer, backlog_receiver)); + } + Err(e) => {} + } + }); // Start the scheduler loop let (scheduler_sender, scheduler_receiver) = unbounded_channel(); - let scheduler_handle = tokio::spawn(scheduler_loop(scheduler_receiver, streams)); + let scheduler_handle = tokio::spawn(scheduler_loop(scheduler_receiver, backlog_submitter)); Ok(Self { scheduler_sender, scheduler_handle, @@ -263,24 +248,16 @@ fn llama_generate_callback( async fn scheduler_loop( mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender)>, - mut workers: Vec, + backlog: MpmcSender<(GenerationContext, UnboundedSender)>, ) { - // Semaphore allows us to wait for a worker to become available - let permits = Semaphore::new(workers.len()); - 
// Let's receive incoming requests loop { match queue.recv().await { None => break, Some((ctx, sender)) => { - let permit = permits.try_acquire(); - if let Err(err) = permit { - let _ = sender.send(Err(InferError::Overloaded(err))); + if let Err(e) = backlog.send((ctx, sender)).await { + todo!("What do we do") } - - // We can unwrap because we wouldn't have a semaphore available otherwise - let worker = workers.pop().unwrap(); - worker.submit(ctx, sender); } } } @@ -290,7 +267,7 @@ fn worker_loop( mut backend: UniquePtr, affinity: Vec, tokenizer: Arc, - backlog: Receiver<(GenerationContext, UnboundedSender)>, + backlog: MpmcReceiver<(GenerationContext, UnboundedSender)>, ) { // This loop will mostly decode single token at every step, so no need to rely on parallelism tokenizers::utils::parallelism::set_parallelism(false); @@ -299,7 +276,7 @@ fn worker_loop( set_numactl_core_affinity(&affinity); loop { - if let Ok((generation, stream)) = backlog.recv() { + if let Ok((generation, stream)) = backlog.recv_blocking() { let start = Instant::now(); let generation_params = generation.generation_params; // copy let sampling_params = generation.sampling_params; // copy From 30ae99631c0c028afe482e460b2bad316918e0f2 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 13:34:52 +0100 Subject: [PATCH 69/92] misc(docker): add numa lib as dependency --- Dockerfile.llamacpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 3dab2a2968d..916f885a522 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -23,6 +23,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ cmake \ gcc g++ \ libc++-dev \ + libnumactl-dev \ libopenmpi-dev \ libssl-dev \ ninja-build \ @@ -61,6 +62,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt update && \ apt upgrade -y && \ apt install -y \ + numactl \ openssl \ python3.11-dev From 2d9465d181e0778a5456e5d99503264c98318f65 Mon Sep 17 00:00:00 2001 
From: Morgan Funtowicz Date: Fri, 22 Nov 2024 14:02:58 +0100 Subject: [PATCH 70/92] misc(backend): allow rebinding numa core affinity --- backends/llamacpp/csrc/backend.cpp | 1 - backends/llamacpp/csrc/ffi.hpp | 10 +++++++++- backends/llamacpp/src/backend.rs | 21 +++++---------------- backends/llamacpp/src/lib.rs | 3 ++- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 54f1cf73683..b60c3ddc0a3 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -3,7 +3,6 @@ // #include -#include #include #include diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index f9eec781967..d33a4c7b105 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -111,7 +111,7 @@ namespace huggingface::tgi::backends::llamacpp { struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; typedef std::unique_ptr unique_cpumask_ptr; - void set_numactl_core_affinity(rust::Slice affinity) { + void set_numa_core_affinity(rust::Slice affinity) { // void set_numactl_core_affinity(std::vector affinity) { #ifdef NUMA_AVAILABLE if(numa_available()) { @@ -173,6 +173,14 @@ namespace huggingface::tgi::backends::llamacpp { SPDLOG_WARN("TGI's llama.cpp backend was compiled without NUMA support"); #endif } + + /** + * + */ + void update_numa_affinity() { + SPDLOG_INFO("Rebinding NUMA affinity for current worker on thread: {}", std::this_thread::get_id()); + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_NUMACTL); + } } diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 5bcb913b776..709e5d42500 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -1,6 +1,6 @@ use crate::ffi::{ - create_worker_frontend, set_numactl_core_affinity, GenerationParams, LlamaCppWorkerFrontend, - SamplingParams, + 
create_worker_frontend, set_numa_core_affinity, update_numa_affinity, GenerationParams, + LlamaCppWorkerFrontend, SamplingParams, }; use async_channel::{unbounded as mpmc_unbounded, Receiver as MpmcReceiver, Sender as MpmcSender}; use async_trait::async_trait; @@ -8,7 +8,6 @@ use cxx::UniquePtr; use log::warn; use std::ops::Range; use std::path::{Path, PathBuf}; -use std::sync::mpsc::{channel, Receiver, Sender}; use std::sync::Arc; use std::thread::spawn; use text_generation_router::infer::InferError::GenerationError; @@ -25,17 +24,6 @@ use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; -macro_rules! send_or_warn { - ($send: expr, $err: expr) => { - if let Err(se) = $send.send(err) { - warn!( - "Failed to send message back to the user: {}. Originating error: {}", - se, e - ); - } - }; -} - fn get_num_cores() -> usize { match option_env!("TGI_USE_PHYSICAL_CORES") .unwrap_or("OFF") @@ -272,8 +260,9 @@ fn worker_loop( // This loop will mostly decode single token at every step, so no need to rely on parallelism tokenizers::utils::parallelism::set_parallelism(false); - // Bind cores for the current thread - set_numactl_core_affinity(&affinity); + // Bind cores for the current thread and make sure it's taken into account + set_numa_core_affinity(&affinity); + update_numa_affinity(); loop { if let Ok((generation, stream)) = backlog.recv_blocking() { diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index 6b047bf53ff..e06220f2f84 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -54,7 +54,8 @@ mod ffi { num_threads: u32, ) -> Result>; - fn set_numactl_core_affinity(affinity: &[usize]); + fn set_numa_core_affinity(affinity: &[usize]); + fn update_numa_affinity(); unsafe fn stream( self: Pin<&mut LlamaCppWorkerFrontend>, From 4ee2ee58c9f2b528a95d78129aced91c9ca3e7f3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 14:48:39 +0100 Subject: 
[PATCH 71/92] misc(license): update LICENSE --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index d6456956733..faa86e9b0a6 100644 --- a/LICENSE +++ b/LICENSE @@ -187,7 +187,7 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright [yyyy] [name of copyright owner] + Copyright 2024 Hugging Face Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From b9c04b9c0726d66c30e8c7108fe306e1193b22c1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 15:13:54 +0100 Subject: [PATCH 72/92] misc(doc): c++ documentation --- backends/llamacpp/csrc/backend.hpp | 29 +++++++++++++-------- backends/llamacpp/csrc/ffi.hpp | 42 +++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 039d4eac9f1..0e1a13ac167 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -33,14 +33,15 @@ namespace huggingface::tgi::backends::llamacpp { static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; }; /** - * + * Represent an error which can be returned as part of an std::expected */ enum backend_error_t : uint8_t { + // Provided model filepath doesnt exist MODEL_FILE_DOESNT_EXIST = 1 }; /** - * + * Hold all the parameters provided by TGI to sample from the final distribution of tokens */ struct sampling_params_t { uint32_t top_k = std::numeric_limits::max(); @@ -58,13 +59,19 @@ namespace huggingface::tgi::backends::llamacpp { }; /** - * + * Hold all the parameters provided by TGI to control the generation process */ struct generation_params_t { uint32_t max_new_tokens = std::numeric_limits::max(); bool ignore_eos_token = false; }; + /** + * Container structure wrapping up the current generation 
context composed by: + * - a non-owning view over the prompt tokens + * - the sampling parameters + * - the generation parameters + */ struct generation_context_t { generation_params_t generation_params; sampling_params_t sampling_params; @@ -72,7 +79,7 @@ namespace huggingface::tgi::backends::llamacpp { }; /** - * + * Represent the actual model execution (i.e. "forward") and generation loop for llama.cpp */ class worker_t { private: @@ -81,17 +88,17 @@ namespace huggingface::tgi::backends::llamacpp { public: /** - * - * @param model - * @param params + * Create a new llama.cpp worker from the provided llama_model and the context parameters + * @param model Previously allocated `llama_model` holding the weights of the neural network + * @param params Parameters to allocate the execution context of the model */ worker_t(std::shared_ptr, const llama_context_params &&); /** - * - * @param context - * @param generation_context - * @param callback + * Generate multiple successive tokens, sampled from the distribution generated by executing a forward pass + * over the neural network operations and matrices + * @param generation_context The generation context holding sampling and generation parameters along with prompt tokens + * @param callback An optional callback function which would be called everytime a new token is sampled */ [[nodiscard]] std::expected generate(const generation_context_t &, const std::optional &) const; diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index d33a4c7b105..3645526344f 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -35,11 +35,18 @@ namespace huggingface::tgi::backends::llamacpp { namespace huggingface::tgi::backends::llamacpp { + /** + * Smart pointer to drop a llama_model when going out of scope + */ auto llama_model_deleter = [](llama_model *model) { llama_free_model(model); }; auto make_shared_llama_model = [](llama_model *model) { return std::shared_ptr(model, 
llama_model_deleter); }; + /** + * llama.cpp backend specific exception mapped from `backend_exception_t` to throw at the FFI level and + * allow automatic implementation of Result<_, Exception> from C++ to Rust + */ class llama_cpp_backend_exception_t : std::exception {}; /** @@ -51,9 +58,29 @@ namespace huggingface::tgi::backends::llamacpp { worker_t worker_; public: + /** + * Create a new llama.cpp worker frontend allowing to map custom Rust FFI types from CXX crate to c++ boundary + * @param model The `llama_model` to use on the worker + * @param num_threads The number of threads the worker is allowed to spawn accross for its threadpool + */ explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads): model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {} + /** + * Generate a new set of tokens from the provided `input_tokens`, streaming each individual token generated + * through the `callback`. + * Individual tokens are generated using the sampling parameters provided through `sampling_params` and the + * generation parameters, provided through `generation_params` allowing to define the behaviour of the generation loop. + * `ctx` is an opaque structure defined on Rust side which holds stream information to send tokens back to the originating client. + * @param input_tokens Prompt input tokens originating from the tokenization of the request's text input + * @param generation_params Parameters controlling the generation loop such as ignoring the end of sentence token or + * the maximum number of tokens to generate + * @param sampling_params Parameters controlling the sampling process on the final token distribution + * @param ctx Opaque structure from Rust holding HTTP channel to stream back response to the client + * @param callback Function pointer called everytime a new token is generated during the generation loop. 
+ * If this callback returns `true` it signals an early termination request on the Rust side. + * @return Number of generated tokens + */ size_t stream( rust::Slice input_tokens, const generation_params_t generation_params, @@ -88,6 +115,12 @@ namespace huggingface::tgi::backends::llamacpp { } }; + /** + * Utility method to allocate a new worker frontend from Rust + * @param modelPath The GGUF model path as an UTF-8 string from Rust + * @param num_threads Integer greater than zero representing the number of threads the worker is allowed to use for computations + * @return unique ownership of `llama_cpp_worker_frontend_t` pointer + */ std::unique_ptr create_worker_frontend(rust::Str modelPath, uint32_t num_threads) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG spdlog::set_level(spdlog::level::debug); @@ -108,9 +141,16 @@ namespace huggingface::tgi::backends::llamacpp { return std::make_unique(model, static_cast(num_threads)); } + /** + * Smart pointer to automatically destroy the underlying numa_bitset * when going out of scope + */ struct numa_cpumask_deleter { void operator()(struct bitmask* cpumask){ numa_free_cpumask(cpumask); }}; typedef std::unique_ptr unique_cpumask_ptr; + /** + * Define the NUMA core and memory affinity for the current thread by binding cores and memory to respective NUMA node(s) + * @param affinity The set of allowed execution cores to inform the scheduler for the current thread + */ void set_numa_core_affinity(rust::Slice affinity) { // void set_numactl_core_affinity(std::vector affinity) { #ifdef NUMA_AVAILABLE @@ -175,7 +215,7 @@ namespace huggingface::tgi::backends::llamacpp { } /** - * + * Force an update of the llama.cpp/ggml threadpool, reading from NUMA cores affinity */ void update_numa_affinity() { SPDLOG_INFO("Rebinding NUMA affinity for current worker on thread: {}", std::this_thread::get_id()); From 862a519fdd818fd492696021bb3d19e993ec7b8b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 15:35:55 +0100 Subject: 
[PATCH 73/92] misc(doc): rust documentation --- backends/llamacpp/src/backend.rs | 135 +++++++++++++++++++++++++------ backends/llamacpp/src/lib.rs | 60 ++++++++++++++ 2 files changed, 169 insertions(+), 26 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 709e5d42500..32547655664 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -24,6 +24,10 @@ use tokio::time::Instant; use tokio_stream::wrappers::UnboundedReceiverStream; use tracing::{debug, error, info}; +/// Detect the number of CPU cores on the machine +/// +/// returns: usize Integer greater than 0 representing the number of CPU cores on the machine +/// fn get_num_cores() -> usize { match option_env!("TGI_USE_PHYSICAL_CORES") .unwrap_or("OFF") @@ -41,6 +45,45 @@ fn get_num_cores() -> usize { } } +/// Subdivide the set of CPU cores available on the system to equal, non-overlapping, subsets of CPU cores +/// +/// # Arguments +/// +/// * `num_cores_per_instance`: Minimum number of cores for each instance +/// +/// returns: Vec, Global> +/// +/// # Examples +/// +/// ``` +/// +/// ``` +fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { + // Get the total number of cores on the CPU + let cores_count = get_num_cores(); + + // Make sure each instance has some cores available + let mut effective_num_cores_per_instance = match num_cores_per_instance { + 0 => cores_count, + _ => num_cores_per_instance, + }; + + // If we have spare cores, let's see if we can give everyone one more core + let num_instances = cores_count / effective_num_cores_per_instance; + if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { + effective_num_cores_per_instance = effective_num_cores_per_instance + 1; + warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); + } + + (0..num_instances) + .map(|ordinal| { + let start = ordinal * effective_num_cores_per_instance; + let end = 
(ordinal + 1) * effective_num_cores_per_instance - 1; + start..end + }) + .collect() +} + type InferResult = Result; unsafe impl Send for LlamaCppWorkerFrontend {} @@ -96,6 +139,20 @@ pub struct LlamaCppBackend { } impl LlamaCppBackend { + /// Attempt to create a new llama.cpp worker from the provided model path + /// + /// # Arguments + /// + /// * `path`: Path to the GGUF model file to load + /// * `num_threads`: Number of cores the model is allowed to spawn for its computations + /// + /// returns: Result, LlamaCppBackendError> + /// + /// # Examples + /// + /// ``` + /// + /// ``` fn allocate_worker( path: &Path, num_threads: u32, @@ -144,32 +201,27 @@ impl LlamaCppBackend { } } -fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { - // Get the total number of cores on the CPU - let cores_count = get_num_cores(); - - // Make sure each instance has some cores available - let mut effective_num_cores_per_instance = match num_cores_per_instance { - 0 => cores_count, - _ => num_cores_per_instance, - }; - - // If we have spare cores, let's see if we can give everyone one more core - let num_instances = cores_count / effective_num_cores_per_instance; - if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { - effective_num_cores_per_instance = effective_num_cores_per_instance + 1; - warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); - } - - (0..num_instances) - .map(|ordinal| { - let start = ordinal * effective_num_cores_per_instance; - let end = (ordinal + 1) * effective_num_cores_per_instance - 1; - start..end - }) - .collect() -} - +/// llama.cpp worker actual streaming callback, called everytime a new token is being generated +/// +/// # Arguments +/// +/// * `ctx`: InferContext holding the channel to stream back generated token to the client. 
+/// *UNSAFE* This parameter is unsafe and represented as a mutable pointer to avoid automatic drop of its +/// referenced resources after the first iteration step. +/// It's the responsibility of the caller to ensure a `Box::from_raw` is taking back full ownership of the pointer +/// for correct deletion. +/// * `new_token_id`: The sampled token identifier +/// * `new_token_logit`: the sampled token identifier log probability +/// * `is_final`: Flag indicating if the sampled token is a final one +/// * `n_generated_tokens`: Counter representing the actual number of token generated at this stage +/// +/// returns: bool `true` if the worker should stop the generation at this stage, `false` to continue +/// +/// # Examples +/// +/// ``` +/// +/// ``` fn llama_generate_callback( ctx: *mut InferContext, new_token_id: u32, @@ -234,6 +286,20 @@ fn llama_generate_callback( status.is_err() } +/// Main loop allowing scheduling incoming requests without blocking the main execution thread +/// +/// # Arguments +/// +/// * `queue`: Synchronized container to receive new request +/// * `backlog`: Synchronized container to dispatch new request towards all the workers for one to pick it up. +/// +/// returns: () +/// +/// # Examples +/// +/// ``` +/// +/// ``` async fn scheduler_loop( mut queue: UnboundedReceiver<(GenerationContext, UnboundedSender)>, backlog: MpmcSender<(GenerationContext, UnboundedSender)>, @@ -251,6 +317,23 @@ async fn scheduler_loop( } } +/// llama.cpp worker thread receiving incoming requests from the scheduler and handling all generation +/// process along with the streaming logic back to the client. 
+/// +/// # Arguments +/// +/// * `backend`: Owned llama.cpp worker with allocated execution resources +/// * `affinity`: Set of CPUs to bind the worker's thread for scheduling +/// * `tokenizer`: Tokenizer to use to decode generated token +/// * `backlog`: Multi-consumers queue holding the requests waiting to be handled by a worker +/// +/// returns: () +/// +/// # Examples +/// +/// ``` +/// +/// ``` fn worker_loop( mut backend: UniquePtr, affinity: Vec, diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index e06220f2f84..d844bb9fcd6 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -49,14 +49,74 @@ mod ffi { #[cxx_name = "llama_cpp_worker_frontend_t"] type LlamaCppWorkerFrontend; + /// Create a new llama.cpp worker + /// + /// # Arguments + /// + /// * `modelPath`: Path to the GGUF model file to load + /// * `num_threads`: Number of threads the worker is allowed to spawn to run computations + /// + /// returns: Result<, > + /// + /// # Examples + /// + /// ``` + /// + /// ``` fn create_worker_frontend( modelPath: &str, num_threads: u32, ) -> Result>; + /// Define the NUMA cores affinity on which the current thread is allowed to be scheduled. + /// + /// # Arguments + /// + /// * `affinity`: Set of CPU cores allowed for scheduling + /// + /// returns: () + /// + /// # Examples + /// + /// ``` + /// // Bind the current thread for execution on cores 0, 1, 2, 3 + /// set_numa_core_affinity(&[0, 1, 2, 3]); + /// ``` fn set_numa_core_affinity(affinity: &[usize]); + + /// Force llama.cpp to reevaluate the allowed NUMA context (core and memory affinity) for + /// its internal threads scheduling. 
+ /// This method can potentially cause llama.cpp / ggml to reallocate its internal threadpool to + /// match the new affinity constraints + /// + /// returns: () + /// + /// # Examples + /// + /// ``` + /// set_numa_core_affinity(&[0, 1, 2, 3]); + /// update_numa_affinity(); + /// ``` fn update_numa_affinity(); + /// Generate new tokens from the provided prompt input `tokens` and generation and sampling parameters, + /// streaming back each generated individual token through the `callback`. + /// + /// # Arguments + /// + /// * `tokens`: Prompt input tokenized from the request's text input + /// * `generation_params`: Parameters controlling the generation loop + /// * `sampling_params`: Parameters controlling the sampling from the token distribution + /// * `stream`: Opaque structure mapping HTTP client transport to stream back token + /// * `callback`: Function pointer called every time a new token is generated + /// + /// returns: Result> + /// + /// # Examples + /// + /// ``` + /// + /// ``` unsafe fn stream( self: Pin<&mut LlamaCppWorkerFrontend>, tokens: &[u32], From 9025a26ceae2a109cdcf66988309072b1cc58e5b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 22 Nov 2024 15:42:09 +0100 Subject: [PATCH 74/92] chore: remove unrelated change to trtllm --- backends/trtllm/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/trtllm/CMakeLists.txt b/backends/trtllm/CMakeLists.txt index 80b2b4305af..831372cdf99 100644 --- a/backends/trtllm/CMakeLists.txt +++ b/backends/trtllm/CMakeLists.txt @@ -18,8 +18,6 @@ set(CMAKE_CXX_STANDARD 20) include(FetchContent) include(ExternalProject) -set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "--allow-unsupported-compiler -ccbin=gcc") - option(TGI_TRTLLM_BACKEND_BUILD_TESTS "Enable building the unittests suite" OFF) option(TGI_TRTLLM_BACKEND_BUILD_EXAMPLES "Enable building the examples suite" OFF) set(TGI_TRTLLM_BACKEND_TARGET_CUDA_ARCH_LIST "89-real" CACHE STRING "List of CUDA architectures to support") From
bbe95ca9e9079c5c7cd29dff83fb759d1771c89b Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 28 Nov 2024 09:53:15 +0100 Subject: [PATCH 75/92] Update Dockerfile.llamacpp as per review Co-authored-by: Hugo Larcher --- Dockerfile.llamacpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 916f885a522..44583b0989f 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -64,7 +64,9 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt install -y \ numactl \ openssl \ - python3.11-dev + python3.11-dev \ + python3.11-venv \ + ibgomp1 COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher COPY --from=builder /usr/src/text-generation-inference/dist /usr/ From d918e6a159ce5d1067fddf6a79b41f48867190fc Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 28 Nov 2024 09:53:59 +0100 Subject: [PATCH 76/92] Update Dockerfile.llamacpp as per review Co-authored-by: Hugo Larcher --- Dockerfile.llamacpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 44583b0989f..92b1882a3a7 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ cmake \ gcc g++ \ libc++-dev \ - libnumactl-dev \ + libnuma-dev \ libopenmpi-dev \ libssl-dev \ ninja-build \ From 274cfce435a5b72806edc429327d6d7f710cb5cf Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 10:59:50 +0100 Subject: [PATCH 77/92] feat(backend): remove core overriding in the Rust backend --- backends/llamacpp/src/backend.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 32547655664..557c14b4921 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -70,10 
+70,6 @@ fn get_cores_allocation(num_cores_per_instance: usize) -> Vec> { // If we have spare cores, let's see if we can give everyone one more core let num_instances = cores_count / effective_num_cores_per_instance; - if cores_count - (num_instances * effective_num_cores_per_instance) >= num_instances { - effective_num_cores_per_instance = effective_num_cores_per_instance + 1; - warn!("Overriding cores allocation to {effective_num_cores_per_instance} per instance"); - } (0..num_instances) .map(|ordinal| { From 8e8979351480298277d70ff264a0e82cbe1f34d1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 14:52:48 +0100 Subject: [PATCH 78/92] feat(backend): use the new batch api from llama --- backends/llamacpp/csrc/backend.cpp | 95 ++++++++++++++++++++---------- backends/llamacpp/csrc/backend.hpp | 2 +- 2 files changed, 64 insertions(+), 33 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index b60c3ddc0a3..17709b72704 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -38,6 +38,31 @@ namespace huggingface::tgi::backends::llamacpp { return {pSampler, llama_sampler_deleter}; } + + std::expected get_batch_from_prompt(std::span prompt) { + auto batch = llama_batch_init(static_cast(prompt.size()), 0, 1); + std::for_each(prompt.begin(), prompt.end(), [&batch](const llama_token token) { + const auto n_token = batch.n_tokens; + + batch.token[n_token] = token; + batch.pos[n_token] = n_token; + batch.n_seq_id[n_token] = 1; + batch.seq_id[n_token][0] = 1; + batch.logits[n_token] = false; + batch.n_tokens++; + }); + + batch.logits[batch.n_tokens - 1] = true; + return batch; + } + + void update_batch_for_decoding(llama_batch &batch, llama_token token, size_t position) { + batch.n_tokens = 1; + batch.logits[0] = true; + batch.token[0] = token; + batch.pos[0] = static_cast(position); + } + worker_t::worker_t(std::shared_ptr model, const llama_context_params &¶ms) : 
model_(model), context_(llama_new_context_with_model(model_.get(), params)) { @@ -59,44 +84,50 @@ namespace huggingface::tgi::backends::llamacpp { auto sampler = generation_context.sampling_params.into_llama_sampler(model_.get()); // Set up the prompt - auto copy = std::vector(generation_context.input_tokens.begin(), generation_context.input_tokens.end()); - auto batch = llama_batch_get_one(copy.data(), copy.size()); - - // Decode - auto n_decoded_tokens = 0; - for (bool generating = true; generating; ++n_decoded_tokens) { + if (auto maybe_batch = get_batch_from_prompt(generation_context.input_tokens); maybe_batch.has_value()) { + // Decode + auto batch = *maybe_batch; + auto n_decoded_tokens = 0; + const auto prompt_size = generation_context.input_tokens.size(); + for (bool generating = true; generating; ++n_decoded_tokens) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG - const auto start = std::chrono::steady_clock::now(); - const auto status = llama_decode(context_.get(), batch); - const auto end = std::chrono::steady_clock::now(); - const auto latency = std::chrono::duration_cast(end - start); - SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); + const auto start = std::chrono::steady_clock::now(); + const auto status = llama_decode(context_.get(), batch); + const auto end = std::chrono::steady_clock::now(); + const auto latency = std::chrono::duration_cast(end - start); + SPDLOG_DEBUG(FMT_STRING("Successfully decoded {:d} token(s) in {}"), batch.n_tokens, latency); #else - const auto status = llama_decode(context_.get(), batch); + const auto status = llama_decode(context_.get(), batch); #endif - batch.n_tokens = 0; - if (LLAMA_SUCCESS(status)) [[likely]] { - // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); - auto is_eog = llama_token_is_eog(model_.get(), new_token_id); - auto new_token_logits = 0.0f; // TODO: return logit - - // Handle termination cases - const auto 
has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; - const auto has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; - - generating = !(has_reach_max_tokens | has_reach_eog); + batch.n_tokens = 0; + if (LLAMA_SUCCESS(status)) [[likely]] { + // Sample the new token + auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); + const auto is_eog = llama_token_is_eog(model_.get(), new_token_id); + const auto new_token_logits = llama_get_logits_ith(context_.get(), -1); // TODO: return logit + + // Handle termination cases + const bool has_reach_max_tokens = n_decoded_tokens >= max_new_tokens - 1; + const bool has_reach_eog = !generation_context.generation_params.ignore_eos_token & is_eog; + const bool is_final = has_reach_max_tokens | has_reach_eog; + + // Bubble up the generated token if a callback is provided + const auto should_stop = callback_(new_token_id, *new_token_logits, is_final, n_decoded_tokens + 1); + + // Compute the continuation flag + generating = !(should_stop | is_final); + + // Update the batch for the next generation + update_batch_for_decoding(batch, new_token_id, prompt_size + n_decoded_tokens); + } + } - // Bubble up the generated token if a callback is provided - const auto should_stop = - std::invoke(callback_, new_token_id, new_token_logits, !generating, n_decoded_tokens + 1); - generating ^= should_stop; + llama_batch_free(batch); - batch = llama_batch_get_one(&new_token_id, 1); - } + return n_decoded_tokens; + } else { + return maybe_batch.error(); } - - return n_decoded_tokens; } } \ No newline at end of file diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 0e1a13ac167..321b667ae49 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -75,7 +75,7 @@ namespace huggingface::tgi::backends::llamacpp { struct generation_context_t { generation_params_t generation_params; sampling_params_t sampling_params; - 
std::span input_tokens; + std::span input_tokens; }; /** From 298367cdfd2f33cc9699b2748f1d8edfbf3c66e1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 14:53:35 +0100 Subject: [PATCH 79/92] feat(backend): fix when num_cores_per_instance is equals to zero with the size of the generated core allocation --- backends/llamacpp/src/backend.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index 557c14b4921..e662e207c00 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -176,7 +176,7 @@ impl LlamaCppBackend { // Allocate all the workers let cores_allocation = get_cores_allocation(num_cores_per_instance as usize); cores_allocation.iter().for_each(|affinity| { - match Self::allocate_worker(path, num_cores_per_instance as u32) { + match Self::allocate_worker(path, affinity.len() as u32) { Ok(worker) => { let tokenizer = Arc::clone(&tokenizer); let affinity = affinity.clone().collect::>(); From 929a2fc718a16d1c19cbc44fb0966f0c1c3f5903 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 14:53:46 +0100 Subject: [PATCH 80/92] feat(backend): add some test to the backend for core allocation --- backends/llamacpp/build.rs | 2 +- backends/llamacpp/src/backend.rs | 76 ++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index 023ccfbaadb..e22fa07c7db 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -94,7 +94,7 @@ fn main() { .unwrap_or(out_dir.join("dist")); // Build the backend - let deps_path = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); + let _ = build_backend(is_debug, opt_level, out_dir.as_path(), &install_path); // Build the FFI layer calling the backend above build_ffi_layer(is_debug, &install_path); diff --git a/backends/llamacpp/src/backend.rs 
b/backends/llamacpp/src/backend.rs index e662e207c00..d8f28ab9da1 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -28,6 +28,7 @@ use tracing::{debug, error, info}; /// /// returns: usize Integer greater than 0 representing the number of CPU cores on the machine /// +#[cfg(not(test))] fn get_num_cores() -> usize { match option_env!("TGI_USE_PHYSICAL_CORES") .unwrap_or("OFF") @@ -45,6 +46,18 @@ fn get_num_cores() -> usize { } } +#[cfg(test)] +fn get_num_cores() -> usize { + match option_env!("TGI_USE_PHYSICAL_CORES") + .unwrap_or("OFF") + .to_uppercase() + .as_str() + { + "ON" => 16, + _ => 32, + } +} + /// Subdivide the set of CPU cores available on the system to equal, non-overlapping, subsets of CPU cores /// /// # Arguments @@ -417,3 +430,66 @@ impl Backend for LlamaCppBackend { true } } + +#[cfg(test)] +mod tests { + use crate::backend::{get_cores_allocation, get_num_cores}; + + fn test_get_num_cores() { + std::env::set_var("TGI_USE_PHYSICAL_CORES", "OFF"); + assert_eq!(get_num_cores(), 32); + + std::env::set_var("TGI_USE_PHYSICAL_CORES", "ON"); + assert_eq!(get_num_cores(), 16); + } + + fn test_get_cores_allocation_single_instance() { + std::env::set_var("TGI_USE_PHYSICAL_CORES", "OFF"); + let smt_allocation = get_cores_allocation(0); + assert_eq!(smt_allocation.len(), 1); + assert_eq!( + smt_allocation[0].clone().collect::>(), + (0..32).collect::>() + ); + + std::env::set_var("TGI_USE_PHYSICAL_CORES", "ON"); + let smt_allocation = get_cores_allocation(0); + assert_eq!(smt_allocation.len(), 1); + assert_eq!( + smt_allocation[0].clone().collect::>(), + (0..16).collect::>() + ); + } + + fn test_get_cores_allocation_multi_instances() { + for cores_per_instance in [1, 2, 4, 8, 16, 3, 7] { + std::env::set_var("TGI_USE_PHYSICAL_CORES", "OFF"); + + let num_instances = 32 / cores_per_instance; + let smt_allocation = get_cores_allocation(cores_per_instance); + + for i in 0..num_instances { + let start = i * cores_per_instance; + 
let end = start + cores_per_instance; + assert_eq!( + smt_allocation[i].clone().collect::>(), + (start..end).collect::>() + ); + } + + std::env::set_var("TGI_USE_PHYSICAL_CORES", "ON"); + let num_instances = 16 / cores_per_instance; + let smt_allocation = get_cores_allocation(cores_per_instance); + assert_eq!(smt_allocation.len(), num_instances); + + for i in 0..num_instances { + let start = i * cores_per_instance; + let end = start + cores_per_instance; + assert_eq!( + smt_allocation[i].clone().collect::>(), + (start..end).collect::>() + ); + } + } + } +} From df72c56b5b57adf51fb32a342406c45fda144947 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 16:30:20 +0100 Subject: [PATCH 81/92] feat(backend): add guard in case top_k = 0 --- backends/llamacpp/csrc/backend.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 17709b72704..4605243588d 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -28,7 +28,10 @@ namespace huggingface::tgi::backends::llamacpp { false, false )); - llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); + + if (top_k > 0) { + llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); + } if (0 < top_p && top_p < 1) { llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); From 9d659f1e23f0f4eea9c6df8a7b8339d8b8af0af8 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 16:49:29 +0100 Subject: [PATCH 82/92] feat(backend): add missing temperature parameter --- backends/llamacpp/csrc/backend.cpp | 1 + backends/llamacpp/csrc/backend.hpp | 1 + backends/llamacpp/src/backend.rs | 1 + backends/llamacpp/src/lib.rs | 2 ++ 4 files changed, 5 insertions(+) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 4605243588d..00692ea88e8 100644 --- 
a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -37,6 +37,7 @@ namespace huggingface::tgi::backends::llamacpp { llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); } + llama_sampler_chain_add(pSampler, llama_sampler_init_temp(temperature)); llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); return {pSampler, llama_sampler_deleter}; } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 321b667ae49..38fd3aad676 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -48,6 +48,7 @@ namespace huggingface::tgi::backends::llamacpp { float_t top_p = 1.0f; float_t frequency_penalty = 0.0f; float_t repetition_penalty = 0.0f; + float_t temperature = 0.0f; uint64_t seed = 2014; /** diff --git a/backends/llamacpp/src/backend.rs b/backends/llamacpp/src/backend.rs index d8f28ab9da1..e1575b1d027 100644 --- a/backends/llamacpp/src/backend.rs +++ b/backends/llamacpp/src/backend.rs @@ -104,6 +104,7 @@ impl From<&ValidParameters> for SamplingParams { top_p: v.top_p, frequency_penalty: v.frequency_penalty, repetition_penalty: v.repetition_penalty, + temperature: v.temperature, seed: v.seed, } } diff --git a/backends/llamacpp/src/lib.rs b/backends/llamacpp/src/lib.rs index d844bb9fcd6..3507217ff86 100644 --- a/backends/llamacpp/src/lib.rs +++ b/backends/llamacpp/src/lib.rs @@ -10,6 +10,7 @@ impl Default for SamplingParams { top_p: 1.0f32, frequency_penalty: 0.0f32, repetition_penalty: 0.0f32, + temperature: 1.0f32, seed: 2014u64, } } @@ -29,6 +30,7 @@ mod ffi { top_p: f32, frequency_penalty: f32, repetition_penalty: f32, + temperature: f32, seed: u64, } From 6c5a75b593cc72de9f944c400c572d3178f5f0e3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 17:45:22 +0100 Subject: [PATCH 83/92] misc(offline): update model creation as std::shared_ptr --- backends/llamacpp/offline/main.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 
deletions(-) diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index 721abf051f5..e5c70e77a4f 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -5,7 +5,7 @@ #include #include -#include s +#include #include "../csrc/backend.hpp" using namespace huggingface::tgi::backends::llamacpp; @@ -22,8 +22,9 @@ int main(int argc, char **argv) { const auto modelPath = absolute(std::filesystem::path(argv[1])); const auto params = llama_model_default_params(); - auto model = std::unique_ptr( - llama_load_model_from_file(modelPath.c_str(), params) + auto model = std::shared_ptr( + llama_load_model_from_file(modelPath.c_str(), params), + llama_model_deleter ); auto prompt = "My name is Morgan"; @@ -31,7 +32,7 @@ int main(int argc, char **argv) { const auto nb_tokens = llama_tokenize(model.get(), prompt, sizeof(prompt), tokens.data(), tokens.size(), true, false); tokens.resize(nb_tokens); - auto backend = worker_t{std::move(model), {.n_batch = 1, .n_threads = 4}}; + auto backend = worker_t(std::move(model), {.n_batch = 1, .n_threads = 4}); fmt::println("Tokenized: {}", tokens); From b1ebc8f73bbab6b8d683f58a9b48e150f5af2919 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 23:56:57 +0100 Subject: [PATCH 84/92] feat(backend): update llama.cpp to 4215 --- backends/llamacpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/CMakeLists.txt b/backends/llamacpp/CMakeLists.txt index 13107e0abce..05fce9227d2 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -33,7 +33,7 @@ endif () # Download llama.cpp repo at the specific version fetchcontent_declare( llama - URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4077.tar.gz + URL https://github.com/ggerganov/llama.cpp/archive/refs/tags/b4215.tar.gz ) fetchcontent_makeavailable(llama) From dc6435e3a58decb93d6d06c6d03052be4eb57411 Mon Sep 17 00:00:00 2001 
From: Morgan Funtowicz Date: Thu, 28 Nov 2024 23:57:08 +0100 Subject: [PATCH 85/92] feat(backend): create llama_context_params with default factory --- backends/llamacpp/csrc/ffi.hpp | 11 ++++++++++- backends/llamacpp/offline/main.cpp | 25 +++++++++++++++++++------ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 3645526344f..99679fdb0cb 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -43,6 +43,15 @@ namespace huggingface::tgi::backends::llamacpp { return std::shared_ptr(model, llama_model_deleter); }; + auto get_llama_context_params = [](size_t num_threads) { + auto params = llama_context_default_params(); + params.n_threads = num_threads; + params.n_threads_batch = num_threads; + params.flash_attn = true; + params.no_perf = false; + return params; + }; + /** * llama.cpp backend specific exception mapped from `backend_exception_t` to throw at the FFI level and * allow automatic implementation of Result<_, Exception> from C++ to Rust @@ -64,7 +73,7 @@ namespace huggingface::tgi::backends::llamacpp { * @param num_threads The number of threads the worker is allowed to spawn accross for its threadpool */ explicit llama_cpp_worker_frontend_t(llama_model *model, int32_t num_threads): - model_{ make_shared_llama_model(model) }, worker_(model_, {.n_ubatch = 1, .n_threads = num_threads, .no_perf = true}) {} + model_{ make_shared_llama_model(model) }, worker_(model_, get_llama_context_params(num_threads)) {} /** * Generate a new set of tokens from the provided `input_tokens`, streaming each individual token generated diff --git a/backends/llamacpp/offline/main.cpp b/backends/llamacpp/offline/main.cpp index e5c70e77a4f..fad97b3a1ed 100644 --- a/backends/llamacpp/offline/main.cpp +++ b/backends/llamacpp/offline/main.cpp @@ -27,24 +27,37 @@ int main(int argc, char **argv) { llama_model_deleter ); - auto prompt = "My name is Morgan"; - auto tokens = 
std::vector(16); - const auto nb_tokens = llama_tokenize(model.get(), prompt, sizeof(prompt), tokens.data(), tokens.size(), true, + auto prompt = std::string("My name is Morgan"); + auto tokens = std::vector(128); + const auto nb_tokens = llama_tokenize(model.get(), prompt.c_str(), prompt.size(), tokens.data(), tokens.size(), + true, false); tokens.resize(nb_tokens); - auto backend = worker_t(std::move(model), {.n_batch = 1, .n_threads = 4}); + llama_numa_init(ggml_numa_strategy::GGML_NUMA_STRATEGY_DISTRIBUTE); + auto backend = worker_t(model, llama_context_default_params()); fmt::println("Tokenized: {}", tokens); // generate auto generated_tokens = std::vector(32); const auto n_generated_tokens = backend.generate( - {{.max_new_tokens = 32}, {.top_k = 40}, tokens}, + {{.max_new_tokens = 32}, {.top_k = 40, .top_p = 0.95, .temperature = 0.8}, + tokens}, [&generated_tokens](llama_token new_token_id, float_t logit, bool is_eos, size_t step) -> bool { generated_tokens.emplace(generated_tokens.begin() + (step - 1), new_token_id); return false; } ); generated_tokens.resize(n_generated_tokens.value()); - fmt::println("Generated {} tokens", generated_tokens); + + std::string decoded = std::string(256, 'a'); + const size_t length = llama_detokenize(model.get(), + generated_tokens.data(), + generated_tokens.size(), + decoded.data(), + decoded.size(), + false, false); + decoded.resize(std::min(length, decoded.size())); + fmt::println("Generated tokens: {}", generated_tokens); + fmt::println("Generated text: {}", decoded); } From b10eaab9f30f7c92ec9d3f73170e69de69c185fa Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 28 Nov 2024 23:57:24 +0100 Subject: [PATCH 86/92] feat(backend): use new batch API to generate tokens --- backends/llamacpp/csrc/backend.cpp | 55 +++++++++++++++--------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index 00692ea88e8..f7e4cde288d 100644 --- 
a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -14,10 +14,10 @@ namespace huggingface::tgi::backends::llamacpp { llama_sampler_ptr sampling_params_t::into_llama_sampler(const llama_model *model) const { - auto *pSampler = llama_sampler_chain_init({.no_perf = false}); + auto *sampler = llama_sampler_chain_init({.no_perf = false}); // Penalties - llama_sampler_chain_add(pSampler, llama_sampler_init_penalties( + llama_sampler_chain_add(sampler, llama_sampler_init_penalties( llama_n_vocab(model), llama_token_eos(model), llama_token_nl(model), @@ -28,31 +28,27 @@ namespace huggingface::tgi::backends::llamacpp { false, false )); - - if (top_k > 0) { - llama_sampler_chain_add(pSampler, llama_sampler_init_top_k(static_cast(top_k))); - } + llama_sampler_chain_add(sampler, llama_sampler_init_top_k(static_cast(top_k))); if (0 < top_p && top_p < 1) { - llama_sampler_chain_add(pSampler, llama_sampler_init_top_p(top_p, 1)); + llama_sampler_chain_add(sampler, llama_sampler_init_top_p(top_p, 0)); } - llama_sampler_chain_add(pSampler, llama_sampler_init_temp(temperature)); - llama_sampler_chain_add(pSampler, llama_sampler_init_dist(seed)); - return {pSampler, llama_sampler_deleter}; + llama_sampler_chain_add(sampler, llama_sampler_init_temp(temperature)); + llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); + return {sampler, llama_sampler_deleter}; } - std::expected get_batch_from_prompt(std::span prompt) { auto batch = llama_batch_init(static_cast(prompt.size()), 0, 1); - std::for_each(prompt.begin(), prompt.end(), [&batch](const llama_token token) { - const auto n_token = batch.n_tokens; + batch.n_tokens = 0; - batch.token[n_token] = token; - batch.pos[n_token] = n_token; - batch.n_seq_id[n_token] = 1; - batch.seq_id[n_token][0] = 1; - batch.logits[n_token] = false; + std::for_each(prompt.begin(), prompt.end(), [&batch](const llama_token token) { + batch.token[batch.n_tokens] = token; + batch.pos[batch.n_tokens] = batch.n_tokens; + 
batch.n_seq_id[batch.n_tokens] = 1; + batch.seq_id[batch.n_tokens][0] = 0; + batch.logits[batch.n_tokens] = false; batch.n_tokens++; }); @@ -60,11 +56,12 @@ namespace huggingface::tgi::backends::llamacpp { return batch; } - void update_batch_for_decoding(llama_batch &batch, llama_token token, size_t position) { - batch.n_tokens = 1; - batch.logits[0] = true; + int32_t update_batch_for_decoding(llama_batch &batch, llama_token token, size_t position) { batch.token[0] = token; batch.pos[0] = static_cast(position); + batch.logits[0] = true; + batch.n_tokens = 1; + return 0; // Decoding will always happen at position 0 } worker_t::worker_t(std::shared_ptr model, const llama_context_params &¶ms) @@ -89,10 +86,14 @@ namespace huggingface::tgi::backends::llamacpp { // Set up the prompt if (auto maybe_batch = get_batch_from_prompt(generation_context.input_tokens); maybe_batch.has_value()) { - // Decode auto batch = *maybe_batch; + + // Keep track of where we are auto n_decoded_tokens = 0; - const auto prompt_size = generation_context.input_tokens.size(); + auto position = batch.n_tokens; + auto sampling_index = batch.n_tokens - 1; + + // Decode for (bool generating = true; generating; ++n_decoded_tokens) { #ifdef TGI_LLAMACPP_BACKEND_DEBUG @@ -104,12 +105,11 @@ namespace huggingface::tgi::backends::llamacpp { #else const auto status = llama_decode(context_.get(), batch); #endif - batch.n_tokens = 0; if (LLAMA_SUCCESS(status)) [[likely]] { // Sample the new token - auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), -1); + auto new_token_id = llama_sampler_sample(sampler.get(), context_.get(), sampling_index); const auto is_eog = llama_token_is_eog(model_.get(), new_token_id); - const auto new_token_logits = llama_get_logits_ith(context_.get(), -1); // TODO: return logit + const auto *new_token_logits = llama_get_logits_ith(context_.get(), sampling_index) + new_token_id; // Handle termination cases const bool has_reach_max_tokens = n_decoded_tokens >= 
max_new_tokens - 1; @@ -123,7 +123,8 @@ namespace huggingface::tgi::backends::llamacpp { generating = !(should_stop | is_final); // Update the batch for the next generation - update_batch_for_decoding(batch, new_token_id, prompt_size + n_decoded_tokens); + sampling_index = update_batch_for_decoding(batch, new_token_id, position); + position += 1; } } From 59b0ef30189c55e52b20f229d0d39ea74a5bd02d Mon Sep 17 00:00:00 2001 From: Hugo Larcher Date: Fri, 29 Nov 2024 00:31:36 +0100 Subject: [PATCH 87/92] feat: Fix Cmakelist to allow building on Darwin platform (#2785) * feat: Fix Cmakelist to allow building on Darwin platform * fix: Fix tokenizer in llama.cpp Dockerfile --- Dockerfile.llamacpp | 7 +++++-- backends/llamacpp/CMakeLists.txt | 8 +++++++- backends/llamacpp/README.md | 17 +++++++++++++++++ backends/llamacpp/requirements.txt | 1 + 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 backends/llamacpp/README.md create mode 100644 backends/llamacpp/requirements.txt diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index 92b1882a3a7..e8896ad4915 100644 --- a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -66,11 +66,14 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ openssl \ python3.11-dev \ python3.11-venv \ - ibgomp1 + libgomp1 COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher COPY --from=builder /usr/src/text-generation-inference/dist /usr/ - +COPY --from=builder /usr/src/text-generation-inference/backends/llamacpp/requirements.txt requirements.txt +RUN /usr/bin/python3.11 -m venv /usr/src/text-generation-inference/venv +ENV PATH="/usr/src/text-generation-inference/venv/bin:$PATH" +RUN pip3 install --no-cache-dir -r requirements.txt ENV PORT=8080 WORKDIR /usr/src/text-generation-inference ENTRYPOINT ["text-generation-launcher"] \ No newline at end of file diff --git a/backends/llamacpp/CMakeLists.txt 
b/backends/llamacpp/CMakeLists.txt index 05fce9227d2..6599fd692e9 100644 --- a/backends/llamacpp/CMakeLists.txt +++ b/backends/llamacpp/CMakeLists.txt @@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.24) project(tgi-llama-cpp-backend VERSION 1.0.0) set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED ON) include(FetchContent) @@ -10,13 +11,18 @@ set(LLAMA_CPP_TARGET_CUDA_ARCHS "75-real;80-real;86-real;89-real;90-real" CACHE option(LLAMA_CPP_BUILD_OFFLINE_RUNNER "Flag to build the standalone c++ backend runner") option(LLAMA_CPP_BUILD_CUDA "Flag to build CUDA enabled inference through llama.cpp") -if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND ${CMAKE_SYSTEM_NAME} STREQUAL "Linux") +if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang" AND (${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")) message(STATUS "Targeting libc++") set(CMAKE_CXX_FLAGS -stdlib=libc++ ${CMAKE_CXX_FLAGS}) else () message(STATUS "Not using libc++ ${CMAKE_CXX_COMPILER_ID} ${CMAKE_SYSTEM_NAME}") endif () +# add linker options for Darwin +if (${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L$HOMEBREW_PREFIX/opt/llvm/lib/c++ -L$HOMEBREW_PREFIX/opt/llvm/lib/unwind -lunwind") +endif () + # Add dependencies include(cmake/numa.cmake) include(cmake/spdlog.cmake) diff --git a/backends/llamacpp/README.md b/backends/llamacpp/README.md new file mode 100644 index 00000000000..0931339c40d --- /dev/null +++ b/backends/llamacpp/README.md @@ -0,0 +1,17 @@ +## Compiling with MacOS + +To compile the Llama.cpp backend on MacOS, you need to install `clang` and `cmake` via Homebrew: + +```bash +brew install llvm cmake +``` + +You then need to configure CMakelists.txt to use the newly installed clang compiler. 
+You can do this by configuring your IDE or adding the following lines to the top of the file: + +```cmake +set(CMAKE_C_COMPILER /opt/homebrew/opt/llvm/bin/clang) +set(CMAKE_CXX_COMPILER /opt/homebrew/opt/llvm/bin/clang++) +``` + +CMakeLists.txt assumes that Homebrew installs libc++ in `$HOMEBREW_PREFIX/opt/llvm/lib/c++`. \ No newline at end of file diff --git a/backends/llamacpp/requirements.txt b/backends/llamacpp/requirements.txt new file mode 100644 index 00000000000..2372d58ba53 --- /dev/null +++ b/backends/llamacpp/requirements.txt @@ -0,0 +1 @@ +transformers==4.45.2 ; python_version >= "3.9" and python_version < "3.13" \ No newline at end of file From f5c4cee364ca61f74103f9cb8aec992b670eb7e5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 29 Nov 2024 16:22:43 +0100 Subject: [PATCH 88/92] feat(backend): correctly link to all libraries --- backends/llamacpp/build.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/backends/llamacpp/build.rs b/backends/llamacpp/build.rs index e22fa07c7db..b5fd7bc0463 100644 --- a/backends/llamacpp/build.rs +++ b/backends/llamacpp/build.rs @@ -115,6 +115,9 @@ fn main() { let spdlog_linkage_target = if is_debug { "spdlogd" } else { "spdlog" }; println!("cargo:rustc-link-lib=dylib={spdlog_linkage_target}"); println!("cargo:rustc-link-lib=dylib=ggml"); + println!("cargo:rustc-link-lib=dylib=ggml-base"); + println!("cargo:rustc-link-lib=dylib=ggml-cpu"); + println!("cargo:rustc-link-lib=dylib=ggml-amx"); println!("cargo:rustc-link-lib=dylib=llama"); // Rerun if one of these file change From db41776a0e151f0a57170d4b614d5c4c5ef6ca27 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 29 Nov 2024 16:22:55 +0100 Subject: [PATCH 89/92] feat(backend): add mimalloc memory allocator to the container --- Dockerfile.llamacpp | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/Dockerfile.llamacpp b/Dockerfile.llamacpp index e8896ad4915..78b3636bcf4 100644 ---
a/Dockerfile.llamacpp +++ b/Dockerfile.llamacpp @@ -54,6 +54,26 @@ ENV RUSTFLAGS="-L/usr/lib" ENV CMAKE_INSTALL_PREFIX=/usr/src/text-generation-inference/dist RUN cargo build --profile release-opt --package text-generation-backend-llamacpp --bin text-generation-backend-llamacpp --frozen +FROM ubuntu:22.04 AS mimalloc-builder +ENV DEBIAN_FRONTEND=noninteractive +ENV MIMALLOC_VERSION=2.1.7 +RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ + --mount=type=cache,target=/var/lib/apt,sharing=locked \ + apt update && \ + apt upgrade -y && \ + apt install -y \ + clang \ + cmake \ + ninja-build \ + wget + +RUN wget https://github.com/microsoft/mimalloc/archive/refs/tags/v${MIMALLOC_VERSION}.tar.gz -O mimalloc-${MIMALLOC_VERSION}.tar.gz && \ + tar -xzf mimalloc-${MIMALLOC_VERSION}.tar.gz && \ + cd mimalloc-${MIMALLOC_VERSION} && \ + cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -B build . && \ + cmake --build build --parallel && \ + cmake --install build + FROM ubuntu:22.04 ENV DEBIAN_FRONTEND=noninteractive @@ -62,18 +82,20 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt update && \ apt upgrade -y && \ apt install -y \ + libopenmpi3 \ numactl \ openssl \ python3.11-dev \ - python3.11-venv \ - libgomp1 + python3.11-venv COPY --from=builder /usr/src/text-generation-inference/target/release-opt/text-generation-backend-llamacpp /usr/src/text-generation-inference/text-generation-launcher COPY --from=builder /usr/src/text-generation-inference/dist /usr/ COPY --from=builder /usr/src/text-generation-inference/backends/llamacpp/requirements.txt requirements.txt +COPY --from=mimalloc-builder /usr/local/lib/libmimalloc.so.2.1 /usr/lib/libmimalloc.so.2.1 + RUN /usr/bin/python3.11 -m venv /usr/src/text-generation-inference/venv ENV PATH="/usr/src/text-generation-inference/venv/bin:$PATH" RUN pip3 install --no-cache-dir -r requirements.txt ENV PORT=8080 WORKDIR /usr/src/text-generation-inference -ENTRYPOINT ["text-generation-launcher"] \ No newline at end of 
file +ENTRYPOINT ["/usr/bin/env", "LD_PRELOAD=/usr/lib/libmimalloc.so.2.1", "text-generation-launcher"] \ No newline at end of file From c9f6c3a8f79d12346372ba786db9be9cd010a40b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 29 Nov 2024 23:34:16 +0100 Subject: [PATCH 90/92] feat(backend): better map exception throw on C++ side --- backends/llamacpp/csrc/backend.cpp | 6 ++++++ backends/llamacpp/csrc/backend.hpp | 4 +++- backends/llamacpp/csrc/ffi.hpp | 33 +++++++++++++++++++++++++++--- 3 files changed, 39 insertions(+), 4 deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index f7e4cde288d..b6b3de00477 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -125,6 +125,12 @@ namespace huggingface::tgi::backends::llamacpp { // Update the batch for the next generation sampling_index = update_batch_for_decoding(batch, new_token_id, position); position += 1; + } else { + if (status == 1) { + return std::unexpected(backend_error_t::NO_KV_SLOT_AVAILABLE); + } else { + return std::unexpected(backend_error_t::DECODING_ERROR); + } } } diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index 38fd3aad676..e1ab1e6504f 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -37,7 +37,9 @@ namespace huggingface::tgi::backends::llamacpp { */ enum backend_error_t : uint8_t { // Provided model filepath doesnt exist - MODEL_FILE_DOESNT_EXIST = 1 + MODEL_FILE_DOESNT_EXIST = 1, + NO_KV_SLOT_AVAILABLE = 2, + DECODING_ERROR = 3 }; /** diff --git a/backends/llamacpp/csrc/ffi.hpp b/backends/llamacpp/csrc/ffi.hpp index 99679fdb0cb..2f1437397ca 100644 --- a/backends/llamacpp/csrc/ffi.hpp +++ b/backends/llamacpp/csrc/ffi.hpp @@ -32,7 +32,6 @@ namespace huggingface::tgi::backends::llamacpp { #include "backends/llamacpp/src/lib.rs.h" #include "rust/cxx.h" - namespace huggingface::tgi::backends::llamacpp { /** @@ -56,7 +55,12 @@ namespace huggingface::tgi::backends::llamacpp { * 
llama.cpp backend specific exception mapped from `backend_exception_t` to throw at the FFI level and * allow automatic implementation of Result<_, Exception> from C++ to Rust */ - class llama_cpp_backend_exception_t : std::exception {}; + class llama_cpp_backend_exception_t : public std::exception { + public: + backend_error_t error; + + llama_cpp_backend_exception_t(const backend_error_t error): error(error) {}; + }; /** * Llama.cpp frontend over the worker interfacing with Rust FFI layer @@ -119,7 +123,7 @@ namespace huggingface::tgi::backends::llamacpp { if(const auto result = worker_.generate(generation_context, context_forwarding_callback); result.has_value()) [[likely]] { return *result; } else { - throw llama_cpp_backend_exception_t {}; + throw llama_cpp_backend_exception_t(result.error()); } } }; @@ -232,5 +236,28 @@ namespace huggingface::tgi::backends::llamacpp { } } +// Error handler: reports each backend_error_t exactly once, as the message of the Rust-side Err +template <typename Try, typename Fail> +static void trycatch(Try &&func, Fail &&fail) noexcept try { + func(); +} catch (const huggingface::tgi::backends::llamacpp::llama_cpp_backend_exception_t &e) { + switch (e.error) { + case huggingface::tgi::backends::llamacpp::backend_error_t::MODEL_FILE_DOESNT_EXIST: { + fail("Specified model path doesn't exist."); + return; + } + case huggingface::tgi::backends::llamacpp::backend_error_t::NO_KV_SLOT_AVAILABLE: { + fail("Keys/Values cache is full, no slot available for the new batch."); + return; + } + case huggingface::tgi::backends::llamacpp::backend_error_t::DECODING_ERROR: { + fail("An error was detected during the generation."); + return; + } + } + fail("Unknown llama.cpp backend error."); +} + + #endif //TGI_LLAMA_CPP_BACKEND_FFI_HPP From e0dda9b614d285f3ee9e4053f9306c946a753721 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 29 Nov 2024 23:38:27 +0100 Subject: [PATCH 91/92] feat(backend): use c++ defined types for llama.cpp --- backends/llamacpp/csrc/backend.cpp | 2 +- backends/llamacpp/csrc/backend.hpp | 8 +------- 2 files changed, 2 insertions(+), 8 
deletions(-) diff --git a/backends/llamacpp/csrc/backend.cpp b/backends/llamacpp/csrc/backend.cpp index b6b3de00477..d3f89adca61 100644 --- a/backends/llamacpp/csrc/backend.cpp +++ b/backends/llamacpp/csrc/backend.cpp @@ -36,7 +36,7 @@ namespace huggingface::tgi::backends::llamacpp { llama_sampler_chain_add(sampler, llama_sampler_init_temp(temperature)); llama_sampler_chain_add(sampler, llama_sampler_init_dist(seed)); - return {sampler, llama_sampler_deleter}; + return llama_sampler_ptr(sampler); } std::expected get_batch_from_prompt(std::span prompt) { diff --git a/backends/llamacpp/csrc/backend.hpp b/backends/llamacpp/csrc/backend.hpp index e1ab1e6504f..84602e77d08 100644 --- a/backends/llamacpp/csrc/backend.hpp +++ b/backends/llamacpp/csrc/backend.hpp @@ -17,18 +17,12 @@ #include #include +#include #include #define LLAMA_SUCCESS(x) x == 0 namespace huggingface::tgi::backends::llamacpp { - - static constexpr auto llama_context_deleter = [](llama_context *pContext) { llama_free(pContext); }; - typedef std::unique_ptr llama_context_ptr; - - static constexpr auto llama_sampler_deleter = [](llama_sampler *pSampler) { llama_sampler_free(pSampler); }; - typedef std::unique_ptr llama_sampler_ptr; - typedef std::function llama_decode_callback; static constexpr auto llama_void_callback = [](llama_token, float_t, bool, size_t) -> bool { return false; }; From 182ffaf06415e12a0feac4ff3700e06fdcc9dd5d Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Thu, 12 Dec 2024 16:04:05 +0100 Subject: [PATCH 92/92] misc: use return Ok(()) Co-authored-by: Corentin REGAL --- backends/llamacpp/src/main.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/llamacpp/src/main.rs b/backends/llamacpp/src/main.rs index adc183edc5b..77340affbb2 100644 --- a/backends/llamacpp/src/main.rs +++ b/backends/llamacpp/src/main.rs @@ -118,7 +118,7 @@ async fn main() -> Result<(), RouterError> { use utoipa::OpenApi; let api_doc = ApiDoc::openapi().to_pretty_json().unwrap(); 
println!("{}", api_doc); - std::process::exit(0); + return Ok(()); }; text_generation_router::logging::init_logging(otlp_endpoint, otlp_service_name, json_output);