Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[shortfin] Add C++ tokenizer wrapper library. #610

Merged
merged 2 commits into from
Nov 27, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 67 additions & 4 deletions shortfin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ option(SHORTFIN_BUILD_TESTS "Builds C++ tests" ON)
option(SHORTFIN_BUNDLE_DEPS "Download dependencies instead of using system libraries" ON)
option(SHORTFIN_ENABLE_TRACING "Enable runtime tracing for iree and shortfin" OFF)
option(SHORTFIN_ENABLE_LTO "Enables LTO if supported" ON)
option(SHORTFIN_ENABLE_TOKENIZERS "Enables integration of native tokenizers library" OFF)

set(SHORTFIN_IREE_SOURCE_DIR "" CACHE FILEPATH "Path to IREE source")

Expand Down Expand Up @@ -80,6 +81,7 @@ list(APPEND CMAKE_MODULE_PATH
${CMAKE_CURRENT_LIST_DIR}/build_tools/cmake/
)
include(shortfin_library)
include(shortfin_testing)
include(CheckCXXCompilerFlag)
include(FetchContent)

Expand All @@ -90,7 +92,9 @@ include(FetchContent)
if(SHORTFIN_ENABLE_LTO)
include(CheckIPOSupported)
check_ipo_supported(RESULT SHORTFIN_LTO_SUPPORTED OUTPUT SHORTFIN_LTO_ERROR)
if(SHORTFIN_LTO_SUPPORTED)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
message(STATUS "Not enabling LTO for debug build")
stellaraccident marked this conversation as resolved.
Show resolved Hide resolved
elseif(SHORTFIN_LTO_SUPPORTED)
message(STATUS "Shortfin LTO Enabled")
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
else()
Expand Down Expand Up @@ -126,7 +130,9 @@ endif()
message(STATUS " - Host")

################################################################################
# Dependencies
# Bundled Dependencies
# These dependencies are either bundled or used via installed packages based
# on the SHORTFIN_BUNDLE_DEPS option.
################################################################################

if(SHORTFIN_BUNDLE_DEPS)
Expand Down Expand Up @@ -164,6 +170,7 @@ if(SHORTFIN_BUNDLE_DEPS)
shortfin_push_bundled_lib_options()
# Enable spdlog shared library options so we can export it.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSPDLOG_SHARED_LIB -Dspdlog_EXPORTS")
message(STATUS "Fetching bundled projects")
stellaraccident marked this conversation as resolved.
Show resolved Hide resolved
FetchContent_MakeAvailable(fmt spdlog xtl xtensor)
shortfin_pop_bundled_lib_options()
else()
Expand All @@ -172,7 +179,8 @@ else()
endif()

################################################################################
# IREE
# IREE Dependency
# This is always a source dependency on the IREE runtime.
################################################################################

# Set IREE build flags.
Expand Down Expand Up @@ -237,6 +245,61 @@ else()
endif()
shortfin_pop_bundled_lib_options()

################################################################################
# Tokenizer Library
################################################################################

# Verifies that cargo (Rust's build tool) can be located and executed.
#
# Takes no arguments. Problems are reported with message(SEND_ERROR), so
# configuration keeps processing (allowing other errors to surface in the same
# run) but generation is skipped.
function(shortfin_check_tokenizers)
  # Make sure that rust/cargo is installed and usable.
  # NO_CACHE keeps the result out of the cache so the lookup is repeated on
  # every configure.
  find_program(SHORTFIN_CARGO_PATH NAMES cargo NO_CACHE)
  if(NOT SHORTFIN_CARGO_PATH)
    message(SEND_ERROR
      "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool). "
      "Please follow Rust documentation to install. On Ubuntu, this can typically be accomplished with:\n"
      " sudo apt install rustup && rustup default stable"
    )
    # The variable now holds "SHORTFIN_CARGO_PATH-NOTFOUND". Trying to execute
    # that would only add a second, misleading "was found but returned an
    # error" report, so stop here.
    return()
  endif()

  # Make sure cargo is functional by running it with no arguments and checking
  # the exit status.
  execute_process(
    COMMAND "${SHORTFIN_CARGO_PATH}"
    RESULT_VARIABLE _CARGO_RESULT
    OUTPUT_VARIABLE _CARGO_OUT
    ERROR_VARIABLE _CARGO_ERR
  )
  if(NOT "${_CARGO_RESULT}" STREQUAL "0")
    message(SEND_ERROR
      "Building with -DSHORTFIN_ENABLE_TOKENIZERS=ON requires cargo (Rust's build tool) "
      "to be configured properly. It was found (${SHORTFIN_CARGO_PATH}) but returned an "
      "error. Output below:\n"
      "${_CARGO_OUT}\n"
      "${_CARGO_ERR}"
    )
  endif()
endfunction()

if(SHORTFIN_ENABLE_TOKENIZERS)
  # Fail early with an actionable message if cargo is missing or broken.
  # TODO: submit a patch to tokenizers_cpp to allow explicit configuration of the
  # cargo location and pass that vs relying on environmental alignment.
  shortfin_check_tokenizers()

  # Everything between push/pop applies to the bundled tokenizers_cpp build.
  shortfin_push_bundled_lib_options()

  # Option for the fetched project.
  # NOTE(review): presumably read by tokenizers_cpp to leave out SentencePiece
  # support - confirm against that project's CMakeLists.
  set(MLC_ENABLE_SENTENCEPIECE_TOKENIZER OFF)

  # Build the bundled code with hidden symbol visibility.
  set(CMAKE_VISIBILITY_INLINES_HIDDEN ON)
  set(CMAKE_CXX_VISIBILITY_PRESET "hidden")
  set(CMAKE_C_VISIBILITY_PRESET "hidden")

  message(STATUS "Fetching tokenizers_cpp")
  FetchContent_Declare(
    tokenizers_cpp  # From CMake project() declaration
    GIT_REPOSITORY https://github.com/mlc-ai/tokenizers-cpp.git
    GIT_TAG 4bb753377680e249345b54c6b10e6d0674c8af03  # 2024 Nov 15
    EXCLUDE_FROM_ALL
  )
  FetchContent_MakeAvailable(tokenizers_cpp)

  shortfin_pop_bundled_lib_options()
endif()

################################################################################
# Tests
################################################################################
Expand All @@ -256,7 +319,7 @@ if(SHORTFIN_BUILD_TESTS)
enable_testing()
endif()


add_custom_target(shortfin_testdata_deps)
stellaraccident marked this conversation as resolved.
Show resolved Hide resolved
add_subdirectory(src)

if(SHORTFIN_BUILD_PYTHON_BINDINGS)
Expand Down
5 changes: 4 additions & 1 deletion shortfin/build_tools/cmake/shortfin_library.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,10 @@ function(shortfin_gtest_test)
GTest::gmock
GTest::gtest_main
)
gtest_discover_tests(${_RULE_NAME})
gtest_discover_tests(
${_RULE_NAME}
WORKING_DIRECTORY "${libshortfin_BINARY_DIR}"
)
endfunction()


Expand Down
47 changes: 47 additions & 0 deletions shortfin/build_tools/cmake/shortfin_testing.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# Downloads a test data file as part of configure.
#
# Arguments:
#   URL          Location to download from.
#   OUTPUT_FILE  Path the downloaded file is stored at.
#
# Example:
#   shortfin_download_test_data(
#     URL "https://example.com/data.json"
#     OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/data.json"
#   )
#
# This does a download->rename in an attempt to be robust to partial downloads.
# It should not be used to manage large test data files or anything sensitive
# enough to require a hash check.
# The output file is added as an additional clean file on the global
# shortfin_testdata_deps target, meaning that "ninja clean" will remove it.
# It is also added to the current directory's list of configure depends, which
# means that if ninja is run and it is not present, cmake will be re-invoked.
function(shortfin_download_test_data)
  cmake_parse_arguments(
    _RULE
    ""
    "URL;OUTPUT_FILE"
    ""
    ${ARGN}
  )
  # Both arguments are required: without them the download below would target
  # a bogus ".stage" path.
  if(NOT _RULE_URL OR NOT _RULE_OUTPUT_FILE)
    message(FATAL_ERROR
      "shortfin_download_test_data requires both URL and OUTPUT_FILE")
  endif()

  if(NOT EXISTS "${_RULE_OUTPUT_FILE}")
    # Download to a staging name and rename only on success, so a partial
    # download is never mistaken for the finished file.
    set(_stage_file "${_RULE_OUTPUT_FILE}.stage")
    message(STATUS "Downloading test data ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}")
    file(DOWNLOAD "${_RULE_URL}" "${_stage_file}" STATUS _status)
    # STATUS is a two-element list; the first element is the numeric code.
    list(POP_FRONT _status _status_code)
    if(_status_code EQUAL "0")
      file(RENAME "${_stage_file}" "${_RULE_OUTPUT_FILE}")
    else()
      # Do not leave a partial staging file behind.
      file(REMOVE "${_stage_file}")
      message(SEND_ERROR "Error downloading file ${_RULE_URL} -> ${_RULE_OUTPUT_FILE}")
    endif()
  endif()

  # Make clean remove it. Register the actual output path rather than a
  # hard-coded file name so this works for any caller.
  set_property(
    TARGET shortfin_testdata_deps
    APPEND PROPERTY ADDITIONAL_CLEAN_FILES
    "${_RULE_OUTPUT_FILE}"
  )

  # And make us reconfigure if it isn't there.
  set_property(
    DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}"
    APPEND PROPERTY
    CMAKE_CONFIGURE_DEPENDS "${_RULE_OUTPUT_FILE}")
endfunction()
1 change: 1 addition & 0 deletions shortfin/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,7 @@ def build_cmake_configuration(CMAKE_BUILD_DIR: Path, extra_cmake_args=[]):
add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_LTO", default_value="ON")
add_env_cmake_setting(cmake_args, "SHORTFIN_IREE_SOURCE_DIR")
add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_ASAN")
add_env_cmake_setting(cmake_args, "SHORTFIN_ENABLE_TOKENIZERS", default_value="OFF")

# Only do a from-scratch configure if not already configured.
cmake_cache_file = os.path.join(CMAKE_BUILD_DIR, "CMakeCache.txt")
Expand Down
1 change: 1 addition & 0 deletions shortfin/src/shortfin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

add_subdirectory(array)
add_subdirectory(components/tokenizers)
add_subdirectory(local)
add_subdirectory(support)
39 changes: 39 additions & 0 deletions shortfin/src/shortfin/components/tokenizers/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

if(SHORTFIN_ENABLE_TOKENIZERS)
  # Optional component wrapping the vendored tokenizers_cpp library.
  shortfin_cc_component(
    NAME
      shortfin_tokenizers
    HDRS
      tokenizers.h
    SRCS
      tokenizers.cc
    DEFINES
      SHORTFIN_HAVE_TOKENIZERS
    COMPONENTS
      shortfin_support
    DEPS
      tokenizers_cpp
  )
  # Record the component in the global list of optional library components.
  set_property(GLOBAL APPEND
    PROPERTY SHORTFIN_LIB_OPTIONAL_COMPONENTS
    shortfin_tokenizers)
  # Expose the feature macro to anything that links shortfin_public_defs.
  target_compile_definitions(shortfin_public_defs INTERFACE SHORTFIN_HAVE_TOKENIZERS)

  # The test data is only needed by the test below, so skip the configure-time
  # network download entirely when tests are disabled.
  if(SHORTFIN_BUILD_TESTS)
    # Download test data.
    shortfin_download_test_data(
      URL "https://huggingface.co/google-bert/bert-base-cased/resolve/main/tokenizer.json"
      OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/tokenizer.json"
    )

    # Note that tests run from the binary dir of the project.
    shortfin_gtest_test(
      NAME shortfin_tokenizers_test
      SRCS
        tokenizers_test.cc
    )
  endif()
endif()
63 changes: 63 additions & 0 deletions shortfin/src/shortfin/components/tokenizers/tokenizers.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Copyright 2024 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "shortfin/components/tokenizers/tokenizers.h"

#include <exception>
#include <stdexcept>

#include "shortfin/support/logging.h"
#include "tokenizers_cpp.h"

namespace shortfin::tokenizers {

namespace {

// Access shim: re-declares the protected Tokenizer::vendor_tokenizer_ member
// as public so that file-local helpers can name it. It is only used to name
// the member, never as the type of an object.
class AccessibleTokenizer : public Tokenizer {
 public:
  using Tokenizer::vendor_tokenizer_;
};

// Returns the vendor tokenizer backing `self`.
// Throws std::logic_error if the backing pointer is null.
::tokenizers::Tokenizer *Get(Tokenizer *self) {
  // &AccessibleTokenizer::vendor_tokenizer_ has type `void *Tokenizer::*`, so
  // it can be applied to a plain Tokenizer directly. This avoids downcasting
  // `self` to AccessibleTokenizer, which is undefined behavior because the
  // pointed-to object is not actually of that derived type.
  void *ptr = self->*(&AccessibleTokenizer::vendor_tokenizer_);
  if (!ptr) {
    throw std::logic_error("Tokenizer is null");
  }
  return static_cast<::tokenizers::Tokenizer *>(ptr);
}

}  // namespace

// Releases the backing vendor tokenizer, if any.
// The pointer is deleted directly instead of going through Get(): Get() throws
// when the pointer is null, and an exception escaping a destructor (implicitly
// noexcept) calls std::terminate. Deleting a null pointer is a no-op, so an
// instance without a backing tokenizer is destroyed cleanly.
Tokenizer::~Tokenizer() {
  delete static_cast<::tokenizers::Tokenizer *>(vendor_tokenizer_);
}

// Builds a Tokenizer from an in-memory JSON blob by delegating to the vendor
// factory and taking ownership of the object it returns.
Tokenizer Tokenizer::FromBlobJSON(const std::string &json_blob) {
  SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::FromBlobJSON");
  auto vendor = ::tokenizers::Tokenizer::FromBlobJSON(json_blob);
  // Returned as a prvalue so the result is constructed in place.
  return Tokenizer(vendor.release());
}

std::vector<int32_t> Tokenizer::Encode(const std::string &text) {
SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Encode");
return Get(this)->Encode(text);
}

std::vector<std::vector<int32_t>> Tokenizer::EncodeBatch(
const std::vector<std::string> &texts) {
SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::EncodeBatch");
return Get(this)->EncodeBatch(texts);
}

std::string Tokenizer::Decode(const std::vector<int32_t> &ids) {
SHORTFIN_TRACE_SCOPE_NAMED("Tokenizer::Decode");
return Get(this)->Decode(ids);
}
size_t Tokenizer::GetVocabSize() { return Get(this)->GetVocabSize(); }
std::string Tokenizer::IdToToken(int32_t token_id) {
return Get(this)->IdToToken(token_id);
}
int32_t Tokenizer::TokenToId(const std::string &token) {
return Get(this)->TokenToId(token);
}

} // namespace shortfin::tokenizers
52 changes: 52 additions & 0 deletions shortfin/src/shortfin/components/tokenizers/tokenizers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright 2024 Advanced Micro Devices, Inc.
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#ifndef SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
#define SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H

#include <string>
#include <vector>

#include "shortfin/support/api.h"

namespace shortfin::tokenizers {

// A vendored Tokenizer class that does not export the details of the backing
// implementation. While a little bit gross, this keeps us from needing to
// re-export a vendor'ed API as part of our public API.
// The current vendor tokenizer is based on mlc-ai/tokenizers-cpp. The API
// is fairly close to that implementation.
// See: https://github.com/mlc-ai/tokenizers-cpp
class SHORTFIN_API Tokenizer {
 public:
  // Not copyable: each instance holds the only reference to its backing
  // vendor tokenizer.
  Tokenizer(const Tokenizer &) = delete;
  Tokenizer &operator=(const Tokenizer &) = delete;
  // Move construction transfers the backing tokenizer to the new instance and
  // leaves `other` holding null. (The source object must be the one that is
  // cleared; clearing the destination would discard the pointer that was just
  // taken and leave `other` still owning it.)
  // NOTE(review): the out-of-line destructor needs to tolerate a null backing
  // pointer for moved-from instances - confirm in tokenizers.cc.
  Tokenizer(Tokenizer &&other) noexcept
      : vendor_tokenizer_(other.vendor_tokenizer_) {
    other.vendor_tokenizer_ = nullptr;
  }
  ~Tokenizer();

  // Factory functions.
  // Creates a tokenizer from a JSON blob held in memory.
  static Tokenizer FromBlobJSON(const std::string &json_blob);

  // Encodes a single string into token ids.
  std::vector<int32_t> Encode(const std::string &text);
  // Encodes several strings, returning one id sequence per input.
  std::vector<std::vector<int32_t>> EncodeBatch(
      const std::vector<std::string> &texts);
  // Decodes token ids back into a string.
  std::string Decode(const std::vector<int32_t> &ids);
  // Returns the size of the vocabulary.
  size_t GetVocabSize();
  // Converts between a token id and its string form.
  std::string IdToToken(int32_t token_id);
  int32_t TokenToId(const std::string &token);

 private:
  // Wraps an already-created vendor tokenizer; used by the factory functions.
  Tokenizer(void *vendor_tokenizer) : vendor_tokenizer_(vendor_tokenizer) {}

 protected:
  // Type-erased pointer to the vendor tokenizer, kept as void* so the vendor
  // headers do not leak into this public header.
  void *vendor_tokenizer_;
};

} // namespace shortfin::tokenizers

#endif // SHORTFIN_COMPONENTS_TOKENIZERS_TOKENIZERS_H
Loading
Loading