From cdb4ccd2e6682223d8321892d3ee9707528f68a2 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Tue, 26 Nov 2024 19:01:29 -0800 Subject: [PATCH] [shortfin] Add C++ tokenizer wrapper library. (#610) * This is gated by SHORTFIN_ENABLE_TOKENIZERS (presently off). * I'd like to either take over the wrapper or get https://github.com/mlc-ai/tokenizers-cpp/issues/50 before putting much weight on this. * There is no great C++ option for this component, so we go to the trouble of integrating a Rust component. We will need to do a bit of prep on our CI systems to enable this by default. * Python API will be added in a subsequent commit. This should be more efficient than the tokenizers Python API since we will allow direct access to the tokens vs doing a lot of conversions. * Size analysis: Prior to this patch, libshortfin was 1.8MB, which gave us an entire GPU and CPU runtime stack. After this patch (stripped) it is 8.4MB. Given how important the use case is, I'm willing to tolerate this for the moment. It seems like there is room for something better here, which is why I did not expose the underlying vendor'd API directly (edit: by switching to a nightly rust and activating a bunch of options from https://github.com/johnthagen/min-sized-rust, I was able to produce a binary that is 4.2MB, which is more reasonable). 
--- shortfin/CMakeLists.txt | 77 ++++++++++++++++++- .../build_tools/cmake/shortfin_library.cmake | 5 +- .../build_tools/cmake/shortfin_testing.cmake | 50 ++++++++++++ shortfin/setup.py | 1 + shortfin/src/shortfin/CMakeLists.txt | 1 + .../components/tokenizers/CMakeLists.txt | 41 ++++++++++ .../components/tokenizers/tokenizers.cc | 63 +++++++++++++++ .../components/tokenizers/tokenizers.h | 52 +++++++++++++ .../components/tokenizers/tokenizers_test.cc | 56 ++++++++++++++ 9 files changed, 341 insertions(+), 5 deletions(-) create mode 100644 shortfin/build_tools/cmake/shortfin_testing.cmake create mode 100644 shortfin/src/shortfin/components/tokenizers/CMakeLists.txt create mode 100644 shortfin/src/shortfin/components/tokenizers/tokenizers.cc create mode 100644 shortfin/src/shortfin/components/tokenizers/tokenizers.h create mode 100644 shortfin/src/shortfin/components/tokenizers/tokenizers_test.cc diff --git a/shortfin/CMakeLists.txt b/shortfin/CMakeLists.txt index f025eccfe..16baa1675 100644 --- a/shortfin/CMakeLists.txt +++ b/shortfin/CMakeLists.txt @@ -48,6 +48,7 @@ option(SHORTFIN_BUILD_TESTS "Builds C++ tests" ON) option(SHORTFIN_BUNDLE_DEPS "Download dependencies instead of using system libraries" ON) option(SHORTFIN_ENABLE_TRACING "Enable runtime tracing for iree and shortfin" OFF) option(SHORTFIN_ENABLE_LTO "Enables LTO if supported" ON) +option(SHORTFIN_ENABLE_TOKENIZERS "Enables integration of native tokenizers library" OFF) set(SHORTFIN_IREE_SOURCE_DIR "" CACHE FILEPATH "Path to IREE source") @@ -80,6 +81,7 @@ list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/build_tools/cmake/ ) include(shortfin_library) +include(shortfin_testing) include(CheckCXXCompilerFlag) include(FetchContent) @@ -90,7 +92,9 @@ include(FetchContent) if(SHORTFIN_ENABLE_LTO) include(CheckIPOSupported) check_ipo_supported(RESULT SHORTFIN_LTO_SUPPORTED OUTPUT SHORTFIN_LTO_ERROR) - if(SHORTFIN_LTO_SUPPORTED) + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + message(STATUS "Not enabling LTO 
for debug build") + elseif(SHORTFIN_LTO_SUPPORTED) message(STATUS "Shortfin LTO Enabled") set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) else() @@ -126,7 +130,9 @@ endif() message(STATUS " - Host") ################################################################################ -# Dependencies +# Bundled Dependencies +# These dependencies are either bundled or used via installed packages based +# on the SHORTFIN_BUNDLE_DEPS option. ################################################################################ if(SHORTFIN_BUNDLE_DEPS) @@ -164,15 +170,19 @@ if(SHORTFIN_BUNDLE_DEPS) shortfin_push_bundled_lib_options() # Enable spdlog shared library options so we can export it. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPDLOG_SHARED_LIB -Dspdlog_EXPORTS") + message(STATUS "Fetching bundled projects") + list(APPEND CMAKE_MESSAGE_INDENT " ") FetchContent_MakeAvailable(fmt spdlog xtl xtensor) shortfin_pop_bundled_lib_options() + list(POP_BACK CMAKE_MESSAGE_INDENT) else() find_package(spdlog) find_package(xtensor) endif() ################################################################################ -# IREE +# IREE Dependency +# This is always a source dependency on the IREE runtime. ################################################################################ # Set IREE build flags. @@ -237,6 +247,65 @@ else() endif() shortfin_pop_bundled_lib_options() +################################################################################ +# Tokenizer Library +################################################################################ + +function(shortfin_check_tokenizers) + # Make sure that rust/cargo is installed and usable. + # Consider switching this to a cached variable once the tokenizers_cpp project + # will accept an override vs running whatever is on the path. For now, just + # verify the path is sane as that is what will get used. 
+ find_program(SHORTFIN_CARGO_PATH NAMES cargo NO_CACHE) + if(NOT SHORTFIN_CARGO_PATH) + message(SEND_ERROR + "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool). " + "Please follow Rust documentation to install. On Ubuntu, this can typically be accomplished with:\n" + " sudo apt install rustup && rustup default stable\n" + "See https://www.rust-lang.org/tools/install" + ) + endif() + + # Make sure cargo is functional. + execute_process( + COMMAND ${SHORTFIN_CARGO_PATH} + RESULT_VARIABLE _CARGO_RESULT + OUTPUT_VARIABLE _CARGO_OUT + ERROR_VARIABLE _CARGO_ERR + ) + if(NOT "${_CARGO_RESULT}" STREQUAL "0") + message(SEND_ERROR + "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool) " + "to be configured properly. It was found (${SHORTFIN_CARGO_PATH}) but returned an " + "error. Output below:\n" + "${_CARGO_OUT}\n" + "${_CARGO_ERR}" + ) + endif() +endfunction() + +if(SHORTFIN_ENABLE_TOKENIZERS) + # TODO: submit a patch to tokenizers_cpp to allow explicit configuration of the + # cargo location and pass that vs relying on environmental alignment. 
+ shortfin_check_tokenizers() + + shortfin_push_bundled_lib_options() + set(CMAKE_C_VISIBILITY_PRESET "hidden") + set(CMAKE_CXX_VISIBILITY_PRESET "hidden") + set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) + set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER OFF) + + FetchContent_Declare( + tokenizers_cpp # From CMake project() declaration + GIT_REPOSITORY https://github.com/mlc-ai/tokenizers-cpp.git + GIT_TAG 4bb753377680e249345b54c6b10e6d0674c8af03 # 2024 Nov 15 + EXCLUDE_FROM_ALL + ) + message(STATUS "Fetching tokenizers_cpp") + FetchContent_MakeAvailable(tokenizers_cpp) + shortfin_pop_bundled_lib_options() +endif() + ################################################################################ # Tests ################################################################################ @@ -254,9 +323,9 @@ if(SHORTFIN_BUILD_TESTS) endif() include(GoogleTest) enable_testing() + add_custom_target(shortfin_testdata_deps) endif() - add_subdirectory(src) if(SHORTFIN_BUILD_PYTHON_BINDINGS) diff --git a/shortfin/build_tools/cmake/shortfin_library.cmake b/shortfin/build_tools/cmake/shortfin_library.cmake index aaa97a6c1..103fdf1c5 100644 --- a/shortfin/build_tools/cmake/shortfin_library.cmake +++ b/shortfin/build_tools/cmake/shortfin_library.cmake @@ -182,7 +182,10 @@ function(shortfin_gtest_test) GTest::gmock GTest::gtest_main ) - gtest_discover_tests(${_RULE_NAME}) + gtest_discover_tests( + ${_RULE_NAME} + WORKING_DIRECTORY "${libshortfin_BINARY_DIR}" + ) endfunction() diff --git a/shortfin/build_tools/cmake/shortfin_testing.cmake b/shortfin/build_tools/cmake/shortfin_testing.cmake new file mode 100644 index 000000000..e462b7023 --- /dev/null +++ b/shortfin/build_tools/cmake/shortfin_testing.cmake @@ -0,0 +1,50 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Downloads some test data file as part of configure. +# This does a download->rename in an attempt to be robust to partial downloads. +# It should not be used to manage large test data files or anything sensitive +# enough to require a hash check. +# The output file is added as an additional clean file on the global +# shortfin_testdata_deps target, meaning the "ninja clean" will remove it. +# It is also added to the current directories list of configure depends, which +# means that if ninja is run and it is not present, cmake will be re-invoked. +function(shortfin_download_test_data) + cmake_parse_arguments( + _RULE + "" + "URL;OUTPUT_FILE" + "" + ${ARGN} + ) + if(NOT SHORTFIN_BUILD_TESTS) + return() + endif() + if(NOT EXISTS "${_RULE_OUTPUT_FILE}") + set(_stage_file "${_RULE_OUTPUT_FILE}.stage") + message(STATUS "Downloading test data ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}") + file(DOWNLOAD "${_RULE_URL}" "${_stage_file}" STATUS _status) + list(POP_FRONT _status _status_code) + if(_status_code EQUAL "0") + file(RENAME "${_stage_file}" "${_RULE_OUTPUT_FILE}") + else() + message(SEND_ERROR "Error downloading file ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}") + endif() + endif() + + # Make clean remove it. + set_property( + TARGET shortfin_testdata_deps + APPEND PROPERTY ADDITIONAL_CLEAN_FILES + "${_RULE_OUTPUT_FILE}" + ) + + # And make us reconfigure if it isn't there. 
+ set_property( + DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}" + APPEND PROPERTY + CMAKE_CONFIGURE_DEPENDS "${_RULE_OUTPUT_FILE}") +endfunction() diff --git a/shortfin/setup.py b/shortfin/setup.py index cf3762950..e15b38d89 100644 --- a/shortfin/setup.py +++ b/shortfin/setup.py @@ -225,6 +225,7 @@ def build_cmake_configuration(CMAKE_BUILD_DIR: Path, extra_cmake_args=[]): add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_LTO", default_value="ON") add_env_cmake_setting(cmake_args, "SHORTFIN_IREE_SOURCE_DIR") add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_ASAN") + add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_TOKENIZERS", default_value="OFF") # Only do a from-scratch configure if not already configured. cmake_cache_file = os.path.join(CMAKE_BUILD_DIR, "CMakeCache.txt") diff --git a/shortfin/src/shortfin/CMakeLists.txt b/shortfin/src/shortfin/CMakeLists.txt index 058e0e336..73df08e7c 100644 --- a/shortfin/src/shortfin/CMakeLists.txt +++ b/shortfin/src/shortfin/CMakeLists.txt @@ -5,5 +5,6 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception add_subdirectory(array) +add_subdirectory(components/tokenizers) add_subdirectory(local) add_subdirectory(support) diff --git a/shortfin/src/shortfin/components/tokenizers/CMakeLists.txt b/shortfin/src/shortfin/components/tokenizers/CMakeLists.txt new file mode 100644 index 000000000..6b9f794b1 --- /dev/null +++ b/shortfin/src/shortfin/components/tokenizers/CMakeLists.txt @@ -0,0 +1,41 @@ +# Copyright 2024 Advanced Micro Devices, Inc. +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if(NOT SHORTFIN_ENABLE_TOKENIZERS) + return() +endif() + +shortfin_cc_component( + NAME + shortfin_tokenizers + HDRS + tokenizers.h + SRCS + tokenizers.cc + DEFINES + SHORTFIN_HAVE_TOKENIZERS + COMPONENTS + shortfin_support + DEPS + tokenizers_cpp +) +set_property(GLOBAL APPEND + PROPERTY SHORTFIN_LIB_OPTIONAL_COMPONENTS + shortfin_tokenizers) +target_compile_definitions(shortfin_public_defs INTERFACE SHORTFIN_HAVE_TOKENIZERS) + +# Download test data. +shortfin_download_test_data( + URL "https://huggingface.co/google-bert/bert-base-cased/resolve/cd5ef92a9fb2f889e972770a36d4ed042daf221e/tokenizer.json" + OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/tokenizer.json" +) + +# Note that tests run from the binary dir of the project. +shortfin_gtest_test( + NAME shortfin_tokenizers_test + SRCS + tokenizers_test.cc +) diff --git a/shortfin/src/shortfin/components/tokenizers/tokenizers.cc b/shortfin/src/shortfin/components/tokenizers/tokenizers.cc new file mode 100644 index 000000000..118bc0c1b --- /dev/null +++ b/shortfin/src/shortfin/components/tokenizers/tokenizers.cc @@ -0,0 +1,63 @@ +// Copyright 2024 Advanced Micro Devices, Inc. +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "shortfin/components/tokenizers/tokenizers.h"
+
+#include <stdexcept>
+
+#include "shortfin/support/logging.h"
+#include "tokenizers_cpp.h"
+
+namespace shortfin::tokenizers {
+
+namespace {
+// Re-exports the protected handle so the file-local Get() helper can read it.
+class AccessibleTokenizer : public Tokenizer {
+ public:
+  using Tokenizer::vendor_tokenizer_;
+};
+// Returns the vendor tokenizer behind `self`; throws std::logic_error if null.
+::tokenizers::Tokenizer *Get(Tokenizer *self) {
+  void *ptr = static_cast<AccessibleTokenizer *>(self)->vendor_tokenizer_;
+  if (!ptr) throw std::logic_error("Tokenizer is null");
+  return static_cast<::tokenizers::Tokenizer *>(ptr);
+}
+
+}  // namespace
+
+Tokenizer::~Tokenizer() {  // Must not throw: handle may be null after a move.
+  delete static_cast<::tokenizers::Tokenizer *>(vendor_tokenizer_);
+}
+
+Tokenizer Tokenizer::FromBlobJSON(const std::string &json_blob) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::FromBlobJSON");
+  return Tokenizer(::tokenizers::Tokenizer::FromBlobJSON(json_blob).release());
+}
+
+std::vector<int32_t> Tokenizer::Encode(const std::string &text) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Encode");
+  return Get(this)->Encode(text);
+}
+
+std::vector<std::vector<int32_t>> Tokenizer::EncodeBatch(
+    const std::vector<std::string> &texts) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::EncodeBatch");
+  return Get(this)->EncodeBatch(texts);
+}
+
+std::string Tokenizer::Decode(const std::vector<int32_t> &ids) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Decode");
+  return Get(this)->Decode(ids);
+}
+size_t Tokenizer::GetVocabSize() { return Get(this)->GetVocabSize(); }
+std::string Tokenizer::IdToToken(int32_t token_id) {
+  return Get(this)->IdToToken(token_id);
+}
+int32_t Tokenizer::TokenToId(const std::string &token) {
+  return Get(this)->TokenToId(token);
+}
+
+}  // namespace shortfin::tokenizers
diff --git a/shortfin/src/shortfin/components/tokenizers/tokenizers.h b/shortfin/src/shortfin/components/tokenizers/tokenizers.h
new file mode 100644
index 000000000..d263eace6
--- /dev/null
+++ b/shortfin/src/shortfin/components/tokenizers/tokenizers.h
@@ -0,0 +1,52 @@
+// Copyright 2024 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
+#define SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
+
+#include <string>
+#include <vector>
+
+#include "shortfin/support/api.h"
+
+namespace shortfin::tokenizers {
+
+// A vendored Tokenizer class that does not export the details of the backing
+// implementation. While a little bit gross, this keeps us from needing to
+// re-export a vendor'ed API as part of our public API.
+// The current vendor tokenizer is based on mlc-ai/tokenizers-cpp. The API
+// is fairly close to that implementation.
+// See: https://github.com/mlc-ai/tokenizers-cpp
+class SHORTFIN_API Tokenizer {
+ public:
+  Tokenizer(const Tokenizer &) = delete;
+  Tokenizer &operator=(const Tokenizer &) = delete;
+  Tokenizer(Tokenizer &&other) : vendor_tokenizer_(other.vendor_tokenizer_) {
+    other.vendor_tokenizer_ = nullptr;  // Source gives up the handle.
+  }
+  ~Tokenizer();  // Must tolerate a null (moved-from) handle.
+
+  // Factory functions.
+  static Tokenizer FromBlobJSON(const std::string &json_blob);
+
+  std::vector<int32_t> Encode(const std::string &text);
+  std::vector<std::vector<int32_t>> EncodeBatch(
+      const std::vector<std::string> &texts);
+  std::string Decode(const std::vector<int32_t> &ids);
+  size_t GetVocabSize();
+  std::string IdToToken(int32_t token_id);
+  int32_t TokenToId(const std::string &token);
+
+ private:
+  Tokenizer(void *vendor_tokenizer) : vendor_tokenizer_(vendor_tokenizer) {}
+
+ protected:
+  void *vendor_tokenizer_;  // Type-erased vendor tokenizer; null once moved.
+};
+
+}  // namespace shortfin::tokenizers
+
+#endif  // SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
diff --git a/shortfin/src/shortfin/components/tokenizers/tokenizers_test.cc b/shortfin/src/shortfin/components/tokenizers/tokenizers_test.cc
new file mode 100644
index 000000000..674721653
--- /dev/null
+++ b/shortfin/src/shortfin/components/tokenizers/tokenizers_test.cc
@@ -0,0 +1,56 @@
+// Copyright 2024 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "shortfin/components/tokenizers/tokenizers.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <filesystem>
+#include <fstream>
+
+using namespace shortfin::tokenizers;
+
+namespace {
+// Reads the whole file at `path` into a string (empty if it cannot be read).
+std::string ReadFile(std::filesystem::path path) {
+  std::ifstream in(path);
+  std::ostringstream out;
+  out << in.rdbuf();
+  return out.str();
+}
+
+}  // namespace
+
+// TODO: Enable once upstream changes with error handling have landed.
+// Currently aborts.
+// See: https://github.com/mlc-ai/tokenizers-cpp/issues/50
+// TEST(TokenizersTest, FromIllegalBlobJson) {
+//   auto tok = Tokenizer::FromBlobJSON("foobar");
+// }
+
+TEST(TokenizersTest, BasicTokenizerJson) {
+  std::filesystem::path tokenizer_path(
+      "src/shortfin/components/tokenizers/tokenizer.json");
+  auto tokenizer_json = ReadFile(tokenizer_path);
+  ASSERT_GT(tokenizer_json.size(), 0)
+      << "reading " << tokenizer_path
+      << " (cwd: " << std::filesystem::current_path() << ")";
+  auto tok = Tokenizer::FromBlobJSON(tokenizer_json);
+  EXPECT_GT(tok.GetVocabSize(), 100);  // Sanity check
+  auto encoded = tok.Encode("hello world");
+  EXPECT_THAT(encoded,
+              ::testing::ContainerEq(std::vector<int32_t>{19082, 1362}));
+  auto batch_encoded = tok.EncodeBatch({"hello", "world"});
+  ASSERT_EQ(batch_encoded.size(), 2);
+  EXPECT_THAT(batch_encoded[0],
+              ::testing::ContainerEq(std::vector<int32_t>{19082}));
+  EXPECT_THAT(batch_encoded[1],
+              ::testing::ContainerEq(std::vector<int32_t>{1362}));
+  EXPECT_EQ(tok.TokenToId("hello"), 19082);
+  auto decoded = tok.Decode(encoded);
+  EXPECT_EQ(decoded, "hello world");
+}