Move bpe and tiktoken tokenizer into extension/llm #4271

Status: Closed · 7 commits
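This PR moves the shared BPE and Tiktoken tokenizer sources, together with the re2 and abseil-cpp submodules they depend on, from examples/models/llama2 into extension/llm. The llama2 and Qualcomm runners, Buck targets, and CMake test builds are updated to reference the new paths; the llama-specific wrapper (llama_tiktoken) stays in the example and now depends on the shared //executorch/extension/llm/tokenizer:tiktoken target, and the generic tokenizer unit tests move out of the example's test suite.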
12 changes: 6 additions & 6 deletions .gitmodules
@@ -55,15 +55,15 @@
[submodule "examples/third-party/LLaVA"]
path = examples/third-party/LLaVA
url = https://github.com/haotian-liu/LLaVA.git
[submodule "examples/models/llama2/third-party/re2"]
path = examples/models/llama2/third-party/re2
url = https://github.com/google/re2.git
[submodule "examples/models/llama2/third-party/abseil-cpp"]
path = examples/models/llama2/third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
[submodule "third-party/ios-cmake"]
path = third-party/ios-cmake
url = https://github.com/leetal/ios-cmake
[submodule "examples/models/phi-3-mini/third-party/sentencepiece"]
path = examples/models/phi-3-mini/third-party/sentencepiece
url = https://github.com/google/sentencepiece.git
[submodule "extension/llm/third-party/re2"]
path = extension/llm/third-party/re2
url = https://github.com/google/re2.git
[submodule "extension/llm/third-party/abseil-cpp"]
path = extension/llm/third-party/abseil-cpp
url = https://github.com/abseil/abseil-cpp.git
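Note: because the re2 and abseil-cpp submodules change paths here, existing checkouts will need to re-sync submodules (e.g. `git submodule sync` followed by `git submodule update --init`) after pulling this change.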
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.cpp
@@ -13,7 +13,7 @@
#if ET_USE_TIKTOKEN
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#else /* BPE */
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#endif /* ET_USE_TIKTOKEN*/
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/runner_util/managed_tensor.h>
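For context, a minimal sketch (not part of this diff) of the build-time tokenizer selection this include change supports. `get_tiktoken_for_llama()` and the header paths come from the diff; the `BPETokenizer` class name and its default constructor are assumptions:

```cpp
// Sketch under the assumptions above: pick a tokenizer at build time.
#if ET_USE_TIKTOKEN
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#else /* BPE */
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#endif /* ET_USE_TIKTOKEN */

#include <memory>

std::unique_ptr<torch::executor::Tokenizer> make_tokenizer() {
#if ET_USE_TIKTOKEN
  // Llama-specific tiktoken factory, still provided by the example.
  return torch::executor::get_tiktoken_for_llama();
#else
  // Plain BPE tokenizer, now living under extension/llm. (Name assumed.)
  return std::make_unique<torch::executor::BPETokenizer>();
#endif
}
```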
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.h
@@ -18,7 +18,7 @@
#include <unordered_map>

#include <executorch/examples/models/llama2/sampler/sampler.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <executorch/extension/runner_util/managed_tensor.h>

2 changes: 1 addition & 1 deletion examples/models/llama2/runner/targets.bzl
@@ -43,7 +43,7 @@ def define_common_targets():
] + ([
"//executorch/examples/models/llama2/tokenizer:tiktoken",
] if use_tiktoken() else [
"//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
]) + (_get_operator_lib(aten)) + ([
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
# Therefore enable it explicitly for now to avoid failing tests
1 change: 0 additions & 1 deletion examples/models/llama2/third-party/abseil-cpp
Submodule abseil-cpp deleted from 854193
1 change: 0 additions & 1 deletion examples/models/llama2/third-party/re2
Submodule re2 deleted from ac82d4
2 changes: 1 addition & 1 deletion examples/models/llama2/tokenizer/llama_tiktoken.h
@@ -8,7 +8,7 @@

#pragma once

#include <executorch/examples/models/llama2/tokenizer/tiktoken.h>
#include <executorch/extension/llm/tokenizer/tiktoken.h>

namespace torch {
namespace executor {
32 changes: 6 additions & 26 deletions examples/models/llama2/tokenizer/targets.bzl
@@ -1,44 +1,24 @@
load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
runtime.cxx_library(
name = "bpe_tokenizer",
srcs = [
"bpe_tokenizer.cpp",
],
exported_headers = [
"tokenizer.h",
"bpe_tokenizer.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
)
"""Defines targets that should be shared between fbcode and xplat.

The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""

runtime.cxx_library(
name = "tiktoken",
srcs = [
"tiktoken.cpp",
"llama_tiktoken.cpp",
],
exported_headers = [
"tokenizer.h",
"tiktoken.h",
"llama_tiktoken.h",
"base64.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
"//executorch/extension/llm/tokenizer:tiktoken",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
exported_external_deps = [
"re2",
],
)
8 changes: 3 additions & 5 deletions examples/models/llama2/tokenizer/test/CMakeLists.txt
@@ -24,19 +24,17 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake)
set(
_tokenizer_test_srcs
test_tiktoken.cpp
test_bpe_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
)

set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

et_cxx_test(
17 changes: 0 additions & 17 deletions examples/models/llama2/tokenizer/test/targets.bzl
@@ -6,20 +6,6 @@ def define_common_targets():
The directory containing this targets.bzl file should also contain both
TARGETS and BUCK files that call this function.
"""

runtime.cxx_test(
name = "test_bpe_tokenizer",
srcs = [
"test_bpe_tokenizer.cpp",
],
deps = [
"//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
],
env = {
"RESOURCES_PATH": "$(location :resources)/resources",
},
)

runtime.cxx_test(
name = "test_tiktoken",
srcs = [
@@ -31,9 +17,6 @@ def define_common_targets():
env = {
"RESOURCES_PATH": "$(location :resources)/resources",
},
external_deps = [
"re2",
],
)

runtime.filegroup(
94 changes: 0 additions & 94 deletions examples/models/llama2/tokenizer/test/test_tiktoken.cpp
@@ -7,7 +7,6 @@
*/

#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/runtime/platform/runtime.h>
#include <gtest/gtest.h>
#include <vector>
@@ -17,19 +16,6 @@ using namespace ::testing;
namespace torch {
namespace executor {

class TiktokenExtensionTest : public Test {
public:
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = get_tiktoken_for_llama();
modelPath_ = std::getenv("RESOURCES_PATH") +
std::string("/test_tiktoken_tokenizer.model");
}

std::unique_ptr<Tokenizer> tokenizer_;
std::string modelPath_;
};

class MultimodalTiktokenV5ExtensionTest : public Test {
public:
void SetUp() override {
@@ -43,24 +29,6 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
std::string modelPath_;
};

TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) {
Result<std::vector<uint64_t>> res = tokenizer_->encode("hello world", 0, 0);
EXPECT_EQ(res.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
auto result = tokenizer_->decode(0, 0);
EXPECT_EQ(result.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
EXPECT_EQ(tokenizer_->vocab_size(), 128256);
EXPECT_EQ(tokenizer_->bos_tok(), 128000);
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
@@ -69,17 +37,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
}

TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
EXPECT_EQ(out.error(), Error::Ok);
EXPECT_EQ(out.get().size(), 3);
EXPECT_EQ(out.get()[0], 128000);
EXPECT_EQ(out.get()[1], 15339);
EXPECT_EQ(out.get()[2], 1917);
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
@@ -101,18 +58,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
}
}

TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
std::vector<uint64_t> tokens = {128000, 15339, 1917};
for (size_t i = 0; i < tokens.size(); i++) {
Result<std::string> out = tokenizer_->decode(0, tokens[i]);
EXPECT_EQ(out.error(), Error::Ok);
EXPECT_EQ(out.get(), expected[i]);
}
}

TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
@@ -134,44 +79,5 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
EXPECT_EQ(out.get(), expected[i]);
}
}

TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// The vocab size is 128256, addes 256 just so the token is out of vocab
// range.
Result<std::string> out = tokenizer_->decode(0, 128256 + 256);
EXPECT_EQ(out.error(), Error::NotSupported);
}

TEST_F(TiktokenExtensionTest, ConstructionWithInvalidBOSIndex) {
// gtest death test doesn't work on iOS:
// https://github.com/google/googletest/issues/2834
#if !GTEST_OS_IOS
EXPECT_EXIT(
std::make_unique<Tiktoken>(
std::make_unique<std::vector<std::string>>(
std::vector<std::string>{"<|end_of_text|>"}),
1,
0),
::testing::KilledBySignal(SIGABRT),
"");
#endif
}

TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) {
// gtest death test doesn't work on iOS:
// https://github.com/google/googletest/issues/2834
#if !GTEST_OS_IOS
EXPECT_EXIT(
std::make_unique<Tiktoken>(
std::make_unique<std::vector<std::string>>(
std::vector<std::string>{"<|begin_of_text|>"}),
0,
1),
::testing::KilledBySignal(SIGABRT),
"");
#endif
}
} // namespace executor
} // namespace torch
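The llama Tiktoken tests deleted above move out of the example (the multimodal ones stay). For reference, a minimal sketch of the encode/decode API they exercised, with expected values taken from the deleted assertions; the `main`-style wrapper is illustrative, not the relocated test file:

```cpp
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/runtime/platform/runtime.h>

#include <cstdlib>
#include <string>
#include <vector>

using namespace torch::executor;

int main() {
  runtime_init();
  auto tokenizer = get_tiktoken_for_llama();
  // The deleted fixture resolved the model from $RESOURCES_PATH.
  const std::string model_path = std::getenv("RESOURCES_PATH") +
      std::string("/test_tiktoken_tokenizer.model");
  if (tokenizer->load(model_path.c_str()) != Error::Ok) {
    return 1;
  }
  // Per the deleted assertions: vocab_size() == 128256, bos_tok() == 128000,
  // eos_tok() == 128001, and "hello world" with bos=1, eos=0 encodes to
  // {128000, 15339, 1917}.
  Result<std::vector<uint64_t>> tokens = tokenizer->encode("hello world", 1, 0);
  if (tokens.error() != Error::Ok) {
    return 1;
  }
  // decode() maps (prev_token, token) to one text piece, e.g.
  // decode(0, 15339) == "hello" and decode(0, 1917) == " world".
  Result<std::string> piece = tokenizer->decode(0, tokens.get()[1]);
  return piece.error() == Error::Ok ? 0 : 1;
}
```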
2 changes: 1 addition & 1 deletion examples/qualcomm/llama2/runner/runner.cpp
@@ -9,9 +9,9 @@
// A simple llama2 runner that includes preprocessing and post processing logic.
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/examples/qualcomm/llama2/runner/runner.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/runner_util/managed_tensor.h>

#include <ctime>
1 change: 1 addition & 0 deletions extension/llm/third-party/abseil-cpp
Submodule abseil-cpp added at eb8522
1 change: 1 addition & 0 deletions extension/llm/third-party/re2
Submodule re2 added at 6dcd83
extension/llm/tokenizer/bpe_tokenizer.cpp (moved from examples/models/llama2/tokenizer/bpe_tokenizer.cpp)
@@ -6,7 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

#include <string>

extension/llm/tokenizer/bpe_tokenizer.h (moved from examples/models/llama2/tokenizer/bpe_tokenizer.h)
@@ -8,7 +8,7 @@

#pragma once

#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <cstdint>

namespace torch {
40 changes: 40 additions & 0 deletions extension/llm/tokenizer/targets.bzl
@@ -37,3 +37,43 @@ def define_common_targets():
":tokenizer_py_lib",
],
)

runtime.cxx_library(
name = "bpe_tokenizer",
srcs = [
"bpe_tokenizer.cpp",
],
exported_headers = [
"tokenizer.h",
"bpe_tokenizer.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
)

runtime.cxx_library(
name = "tiktoken",
srcs = [
"tiktoken.cpp",
],
exported_headers = [
"tokenizer.h",
"tiktoken.h",
"base64.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
"//executorch/runtime/core/exec_aten/util:scalar_type_util",
],
visibility = [
"@EXECUTORCH_CLIENTS",
],
exported_external_deps = [
"re2",
],
)
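With these targets in place, downstream Buck dependents reference //executorch/extension/llm/tokenizer:bpe_tokenizer or //executorch/extension/llm/tokenizer:tiktoken directly, while the llama2 example's tiktoken target (above) shrinks to the llama-specific wrapper that re-exports the shared implementation.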