diff --git a/shortfin/CMakeLists.txt b/shortfin/CMakeLists.txt
index f025eccfe..61d47e19a 100644
--- a/shortfin/CMakeLists.txt
+++ b/shortfin/CMakeLists.txt
@@ -48,6 +48,7 @@ option(SHORTFIN_BUILD_TESTS "Builds C++ tests" ON)
 option(SHORTFIN_BUNDLE_DEPS "Download dependencies instead of using system libraries" ON)
 option(SHORTFIN_ENABLE_TRACING "Enable runtime tracing for iree and shortfin" OFF)
 option(SHORTFIN_ENABLE_LTO "Enables LTO if supported" ON)
+option(SHORTFIN_ENABLE_TOKENIZERS "Enables integration of native tokenizers library" OFF)
 
 set(SHORTFIN_IREE_SOURCE_DIR "" CACHE FILEPATH "Path to IREE source")
 
@@ -80,6 +81,7 @@ list(APPEND CMAKE_MODULE_PATH
   ${CMAKE_CURRENT_LIST_DIR}/build_tools/cmake/
 )
 include(shortfin_library)
+include(shortfin_testing)
 include(CheckCXXCompilerFlag)
 include(FetchContent)
 
@@ -90,7 +92,9 @@ include(FetchContent)
 if(SHORTFIN_ENABLE_LTO)
   include(CheckIPOSupported)
   check_ipo_supported(RESULT SHORTFIN_LTO_SUPPORTED OUTPUT SHORTFIN_LTO_ERROR)
-  if(SHORTFIN_LTO_SUPPORTED)
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    message(STATUS "Not enabling LTO for debug build")
+  elseif(SHORTFIN_LTO_SUPPORTED)
     message(STATUS "Shortfin LTO Enabled")
     set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
   else()
@@ -126,7 +130,9 @@ endif()
 message(STATUS " - Host")
 
 ################################################################################
-# Dependencies
+# Bundled Dependencies
+# These dependencies are either bundled or used via installed packages based
+# on the SHORTFIN_BUNDLE_DEPS option.
 ################################################################################
 
 if(SHORTFIN_BUNDLE_DEPS)
@@ -164,6 +170,7 @@ if(SHORTFIN_BUNDLE_DEPS)
   shortfin_push_bundled_lib_options()
   # Enable spdlog shared library options so we can export it.
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPDLOG_SHARED_LIB -Dspdlog_EXPORTS")
+  message(STATUS "Fetching bundled projects")
   FetchContent_MakeAvailable(fmt spdlog xtl xtensor)
   shortfin_pop_bundled_lib_options()
 else()
@@ -172,7 +179,8 @@ else()
 endif()
 
 ################################################################################
-# IREE
+# IREE Dependency
+# This is always a source dependency on the IREE runtime.
 ################################################################################
 
 # Set IREE build flags.
@@ -237,6 +245,67 @@ else()
 endif()
 shortfin_pop_bundled_lib_options()
 
+################################################################################
+# Tokenizer Library
+################################################################################
+
+# Verifies that cargo (Rust's build tool), which building tokenizers_cpp relies
+# on, is installed and runs. Problems are reported with message(SEND_ERROR), so
+# the remaining configure steps still run but no build system is generated.
+function(shortfin_check_tokenizers)
+  # Make sure that rust/cargo is installed and usable.
+  find_program(SHORTFIN_CARGO_PATH NAMES cargo NO_CACHE)
+  if(NOT SHORTFIN_CARGO_PATH)
+    message(SEND_ERROR
+      "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool). "
+      "Please follow Rust documentation to install. On Ubuntu, this can typically be accomplished with:\n"
+      " sudo apt install rustup && rustup default stable"
+    )
+    # SEND_ERROR does not stop processing. Return so that we do not go on to
+    # execute the NOTFOUND placeholder and report a second, misleading error.
+    return()
+  endif()
+
+  # Make sure cargo is functional.
+  execute_process(
+    COMMAND "${SHORTFIN_CARGO_PATH}" --version
+    RESULT_VARIABLE _CARGO_RESULT
+    OUTPUT_VARIABLE _CARGO_OUT
+    ERROR_VARIABLE _CARGO_ERR
+  )
+  if(NOT "${_CARGO_RESULT}" STREQUAL "0")
+    message(SEND_ERROR
+      "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool) "
+      "to be configured properly. It was found (${SHORTFIN_CARGO_PATH}) but returned an "
+      "error. Output below:\n"
+      "${_CARGO_OUT}\n"
+      "${_CARGO_ERR}"
+    )
+  endif()
+endfunction()
+
+if(SHORTFIN_ENABLE_TOKENIZERS)
+  # TODO: submit a patch to tokenizers_cpp to allow explicit configuration of the
+  # cargo location and pass that vs relying on environmental alignment.
+  shortfin_check_tokenizers()
+
+  shortfin_push_bundled_lib_options()
+  set(CMAKE_C_VISIBILITY_PRESET "hidden")
+  set(CMAKE_CXX_VISIBILITY_PRESET "hidden")
+  set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
+  set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER OFF)
+
+  FetchContent_Declare(
+    tokenizers_cpp  # From CMake project() declaration
+    GIT_REPOSITORY https://github.com/mlc-ai/tokenizers-cpp.git
+    GIT_TAG 4bb753377680e249345b54c6b10e6d0674c8af03 # 2024 Nov 15
+    EXCLUDE_FROM_ALL
+  )
+  message(STATUS "Fetching tokenizers_cpp")
+  FetchContent_MakeAvailable(tokenizers_cpp)
+  shortfin_pop_bundled_lib_options()
+endif()
+
 ################################################################################
 # Tests
 ################################################################################
@@ -256,7 +325,7 @@ if(SHORTFIN_BUILD_TESTS)
   enable_testing()
 endif()
 
-
+add_custom_target(shortfin_testdata_deps)
 add_subdirectory(src)
 
 if(SHORTFIN_BUILD_PYTHON_BINDINGS)
diff --git a/shortfin/build_tools/cmake/shortfin_library.cmake b/shortfin/build_tools/cmake/shortfin_library.cmake
index aaa97a6c1..103fdf1c5 100644
--- a/shortfin/build_tools/cmake/shortfin_library.cmake
+++ b/shortfin/build_tools/cmake/shortfin_library.cmake
@@ -182,7 +182,10 @@ function(shortfin_gtest_test)
     GTest::gmock
     GTest::gtest_main
   )
-  gtest_discover_tests(${_RULE_NAME})
+  gtest_discover_tests(
+    ${_RULE_NAME}
+    WORKING_DIRECTORY "${libshortfin_BINARY_DIR}"
+  )
 endfunction()
 
 
diff --git a/shortfin/build_tools/cmake/shortfin_testing.cmake b/shortfin/build_tools/cmake/shortfin_testing.cmake
new file mode 100644
index 000000000..6c43ecf48
--- /dev/null
+++ b/shortfin/build_tools/cmake/shortfin_testing.cmake
@@ -0,0 +1,47 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Downloads some test data file as part of configure.
+# This does a download->rename in an attempt to be robust to partial downloads.
+# It should not be used to manage large test data files or anything sensitive
+# enough to require a hash check.
+# The output file is added as an additional clean file on the global
+# shortfin_testdata_deps target, meaning that "ninja clean" will remove it.
+# It is also added to the current directory's list of configure depends, which
+# means that if ninja is run and it is not present, cmake will be re-invoked.
+# Arguments:
+#   URL: Location to download from (required).
+#   OUTPUT_FILE: Path the downloaded file is written to (required).
+function(shortfin_download_test_data)
+  cmake_parse_arguments(_RULE "" "URL;OUTPUT_FILE" "" ${ARGN})
+  if(NOT _RULE_URL OR NOT _RULE_OUTPUT_FILE)
+    message(FATAL_ERROR "shortfin_download_test_data requires URL and OUTPUT_FILE")
+  endif()
+
+  if(NOT EXISTS "${_RULE_OUTPUT_FILE}")
+    set(_stage_file "${_RULE_OUTPUT_FILE}.stage")
+    message(STATUS "Downloading test data ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}")
+    file(DOWNLOAD "${_RULE_URL}" "${_stage_file}" STATUS _status)
+    list(POP_FRONT _status _status_code)
+    if(_status_code EQUAL "0")
+      file(RENAME "${_stage_file}" "${_RULE_OUTPUT_FILE}")
+    else()
+      # Do not leave a partial download behind.
+      file(REMOVE "${_stage_file}")
+      message(SEND_ERROR "Error downloading file ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}")
+    endif()
+  endif()
+
+  # Make clean remove the file this call produced rather than a fixed name.
+  set_property(TARGET shortfin_testdata_deps
+    APPEND PROPERTY ADDITIONAL_CLEAN_FILES "${_RULE_OUTPUT_FILE}")
+
+  # And make us reconfigure if it isn't there.
+  set_property(
+    DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
+    APPEND PROPERTY
+    CMAKE_CONFIGURE_DEPENDS "${_RULE_OUTPUT_FILE}")
+endfunction()
diff --git a/shortfin/setup.py b/shortfin/setup.py
index cf3762950..e15b38d89 100644
--- a/shortfin/setup.py
+++ b/shortfin/setup.py
@@ -225,6 +225,7 @@ def build_cmake_configuration(CMAKE_BUILD_DIR: Path, extra_cmake_args=[]):
     add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_LTO", default_value="ON")
     add_env_cmake_setting(cmake_args, "SHORTFIN_IREE_SOURCE_DIR")
     add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_ASAN")
+    add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_TOKENIZERS", default_value="OFF")
 
     # Only do a from-scratch configure if not already configured.
     cmake_cache_file = os.path.join(CMAKE_BUILD_DIR, "CMakeCache.txt")
diff --git a/shortfin/src/shortfin/CMakeLists.txt b/shortfin/src/shortfin/CMakeLists.txt
index 058e0e336..73df08e7c 100644
--- a/shortfin/src/shortfin/CMakeLists.txt
+++ b/shortfin/src/shortfin/CMakeLists.txt
@@ -5,5 +5,6 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 add_subdirectory(array)
+add_subdirectory(components/tokenizers)
 add_subdirectory(local)
 add_subdirectory(support)
diff --git a/shortfin/src/shortfin/components/tokenizers/CMakeLists.txt b/shortfin/src/shortfin/components/tokenizers/CMakeLists.txt
new file mode 100644
index 000000000..168388d28
--- /dev/null
+++ b/shortfin/src/shortfin/components/tokenizers/CMakeLists.txt
@@ -0,0 +1,39 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+if(SHORTFIN_ENABLE_TOKENIZERS)
+  shortfin_cc_component(
+    NAME
+      shortfin_tokenizers
+    HDRS
+      tokenizers.h
+    SRCS
+      tokenizers.cc
+    DEFINES
+      SHORTFIN_HAVE_TOKENIZERS
+    COMPONENTS
+      shortfin_support
+    DEPS
+      tokenizers_cpp
+  )
+  set_property(GLOBAL APPEND
+    PROPERTY SHORTFIN_LIB_OPTIONAL_COMPONENTS
+    shortfin_tokenizers)
+  target_compile_definitions(shortfin_public_defs INTERFACE SHORTFIN_HAVE_TOKENIZERS)
+
+  # Download test data.
+  shortfin_download_test_data(
+    URL "https://huggingface.co/google-bert/bert-base-cased/resolve/main/tokenizer.json"
+    OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/tokenizer.json"
+  )
+
+  # Note that tests run from the binary dir of the project.
+  shortfin_gtest_test(
+    NAME shortfin_tokenizers_test
+    SRCS
+      tokenizers_test.cc
+  )
+endif()
diff --git a/shortfin/src/shortfin/components/tokenizers/tokenizers.cc b/shortfin/src/shortfin/components/tokenizers/tokenizers.cc
new file mode 100644
index 000000000..118bc0c1b
--- /dev/null
+++ b/shortfin/src/shortfin/components/tokenizers/tokenizers.cc
@@ -0,0 +1,74 @@
+// Copyright 2024 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "shortfin/components/tokenizers/tokenizers.h"
+
+#include <stdexcept>
+
+#include "shortfin/support/logging.h"
+#include "tokenizers_cpp.h"
+
+namespace shortfin::tokenizers {
+
+namespace {
+
+// Exposes the protected vendor_tokenizer_ member to the helper below.
+class AccessibleTokenizer : public Tokenizer {
+ public:
+  using Tokenizer::vendor_tokenizer_;
+};
+
+// Returns the vendor tokenizer backing |self|, throwing std::logic_error if
+// |self| does not hold one.
+::tokenizers::Tokenizer *Get(Tokenizer *self) {
+  void *ptr = static_cast<AccessibleTokenizer *>(self)->vendor_tokenizer_;
+  if (!ptr) {
+    throw std::logic_error("Tokenizer is null");
+  }
+  return static_cast<::tokenizers::Tokenizer *>(ptr);
+}
+
+}  // namespace
+
+// Deliberately bypasses Get(): it throws on a null pointer, and an exception
+// escaping a destructor terminates the process. Deleting nullptr is a no-op,
+// so an instance without a vendor tokenizer is destroyed cleanly.
+Tokenizer::~Tokenizer() {
+  delete static_cast<::tokenizers::Tokenizer *>(vendor_tokenizer_);
+}
+
+Tokenizer Tokenizer::FromBlobJSON(const std::string &json_blob) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::FromBlobJSON");
+  return Tokenizer(::tokenizers::Tokenizer::FromBlobJSON(json_blob).release());
+}
+
+std::vector<int32_t> Tokenizer::Encode(const std::string &text) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Encode");
+  return Get(this)->Encode(text);
+}
+
+std::vector<std::vector<int32_t>> Tokenizer::EncodeBatch(
+    const std::vector<std::string> &texts) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::EncodeBatch");
+  return Get(this)->EncodeBatch(texts);
+}
+
+std::string Tokenizer::Decode(const std::vector<int32_t> &ids) {
+  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Decode");
+  return Get(this)->Decode(ids);
+}
+
+size_t Tokenizer::GetVocabSize() { return Get(this)->GetVocabSize(); }
+
+std::string Tokenizer::IdToToken(int32_t token_id) {
+  return Get(this)->IdToToken(token_id);
+}
+
+int32_t Tokenizer::TokenToId(const std::string &token) {
+  return Get(this)->TokenToId(token);
+}
+
+}  // namespace shortfin::tokenizers
diff --git a/shortfin/src/shortfin/components/tokenizers/tokenizers.h b/shortfin/src/shortfin/components/tokenizers/tokenizers.h
new file mode 100644
index 000000000..d263eace6
--- /dev/null
+++ b/shortfin/src/shortfin/components/tokenizers/tokenizers.h
@@ -0,0 +1,58 @@
+// Copyright 2024 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#ifndef SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
+#define SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
+
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <vector>
+
+#include "shortfin/support/api.h"
+
+namespace shortfin::tokenizers {
+
+// A vendored Tokenizer class that does not export the details of the backing
+// implementation. While a little bit gross, this keeps us from needing to
+// re-export a vendor'ed API as part of our public API.
+// The current vendor tokenizer is based on mlc-ai/tokenizers-cpp. The API
+// is fairly close to that implementation.
+// See: https://github.com/mlc-ai/tokenizers-cpp
+class SHORTFIN_API Tokenizer {
+ public:
+  Tokenizer(const Tokenizer &) = delete;
+  Tokenizer &operator=(const Tokenizer &) = delete;
+  // Takes over the backing tokenizer of |other|, which is left holding
+  // nullptr: it may still be destroyed, but every other member function
+  // called on it throws std::logic_error.
+  Tokenizer(Tokenizer &&other) noexcept
+      : vendor_tokenizer_(other.vendor_tokenizer_) {
+    other.vendor_tokenizer_ = nullptr;
+  }
+  ~Tokenizer();
+
+  // Factory functions.
+  static Tokenizer FromBlobJSON(const std::string &json_blob);
+
+  std::vector<int32_t> Encode(const std::string &text);
+  std::vector<std::vector<int32_t>> EncodeBatch(
+      const std::vector<std::string> &texts);
+  std::string Decode(const std::vector<int32_t> &ids);
+  size_t GetVocabSize();
+  std::string IdToToken(int32_t token_id);
+  int32_t TokenToId(const std::string &token);
+
+ private:
+  Tokenizer(void *vendor_tokenizer) : vendor_tokenizer_(vendor_tokenizer) {}
+
+ protected:
+  void *vendor_tokenizer_;
+};
+
+}  // namespace shortfin::tokenizers
+
+#endif  // SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
diff --git a/shortfin/src/shortfin/components/tokenizers/tokenizers_test.cc b/shortfin/src/shortfin/components/tokenizers/tokenizers_test.cc
new file mode 100644
index 000000000..674721653
--- /dev/null
+++ b/shortfin/src/shortfin/components/tokenizers/tokenizers_test.cc
@@ -0,0 +1,56 @@
+// Copyright 2024 Advanced Micro Devices, Inc.
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "shortfin/components/tokenizers/tokenizers.h"
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+#include <filesystem>
+#include <fstream>
+#include <sstream>
+
+using namespace shortfin::tokenizers;
+
+namespace {
+
+// Returns the contents of |path|, or an empty string if it cannot be read.
+std::string ReadFile(std::filesystem::path path) {
+  std::ostringstream out;
+  out << std::ifstream(path).rdbuf();
+  return out.str();
+}
+
+}  // namespace
+
+// TODO: Enable once upstream error handling has landed (currently aborts).
+// See: https://github.com/mlc-ai/tokenizers-cpp/issues/50
+// TEST(TokenizersTest, FromIllegalBlobJson) {
+//   auto tok = Tokenizer::FromBlobJSON("foobar");
+// }
+
+TEST(TokenizersTest, BasicTokenizerJson) {
+  std::filesystem::path tokenizer_path(
+      "src/shortfin/components/tokenizers/tokenizer.json");
+  auto tokenizer_json = ReadFile(tokenizer_path);
+  ASSERT_GT(tokenizer_json.size(), 0)
+      << "reading " << tokenizer_path
+      << " (cwd: " << std::filesystem::current_path() << ")";
+  auto tok = Tokenizer::FromBlobJSON(tokenizer_json);
+  EXPECT_GT(tok.GetVocabSize(), 100);  // Sanity check
+  auto encoded = tok.Encode("hello world");
+  EXPECT_THAT(encoded,
+              ::testing::ContainerEq(std::vector<int32_t>{19082, 1362}));
+  auto batch_encoded = tok.EncodeBatch({"hello", "world"});
+  ASSERT_EQ(batch_encoded.size(), 2);
+  EXPECT_THAT(batch_encoded[0],
+              ::testing::ContainerEq(std::vector<int32_t>{19082}));
+  EXPECT_THAT(batch_encoded[1],
+              ::testing::ContainerEq(std::vector<int32_t>{1362}));
+  EXPECT_EQ(tok.TokenToId("hello"), 19082);
+  auto decoded = tok.Decode(encoded);
+  EXPECT_EQ(decoded, "hello world");
+}