
Commit 0cde6b8

helunwencser authored and facebook-github-bot committed
Move tokenizer into extension/llm/tokenizer (#4278)
Summary:
This PR moves the tiktoken and BPE tokenizers into `extension/llm/tokenizer` so that they can be reused by other models.

Note: the tiktoken tokenizer currently has two sets of unit tests based on llama2's tokenizer:
- default
- multimodal

This PR moves only the default unit tests into the extension and keeps the multimodal unit tests inside llama2/tokenizer.

Pull Request resolved: #4278

Test Plan:
- test/run_oss_cpp_tests.sh examples/models/llama2/tokenizer/test
- test/run_oss_cpp_tests.sh extension/llm/tokenizer/test

Reviewed By: larryliu0820

Differential Revision: D59822702

Pulled By: helunwencser

fbshipit-source-id: 5d51ba3e44c9b2d9dc77b9f4349b58947ed68502
1 parent 740a0a5 · commit 0cde6b8

31 files changed (+128324, −180 lines)
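After this change, downstream code pulls the shared tokenizer interface from extension/llm/tokenizer instead of examples/models/llama2/tokenizer. A minimal usage sketch (not part of this commit; the standalone main(), the tokenizer file path, and BPETokenizer's default construction are assumptions) built on the Tokenizer API exercised by the unit tests in this diff, i.e. load(), encode(text, n_bos, n_eos), and decode(prev_token, token):

// Sketch only: assumes the BPETokenizer class declared in the relocated
// bpe_tokenizer.h; the model path below is a placeholder.
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

#include <cstdio>
#include <memory>
#include <string>
#include <vector>

using namespace torch::executor;

int main() {
  std::unique_ptr<Tokenizer> tokenizer = std::make_unique<BPETokenizer>();
  if (tokenizer->load("/path/to/tokenizer.bin") != Error::Ok) {
    return 1;
  }
  // encode(text, n_bos, n_eos) and decode(prev_token, token), as exercised by
  // the unit tests moved in this PR.
  Result<std::vector<uint64_t>> tokens = tokenizer->encode("hello world", 1, 0);
  if (tokens.error() != Error::Ok) {
    return 1;
  }
  for (uint64_t t : tokens.get()) {
    Result<std::string> piece = tokenizer->decode(0, t);
    if (piece.error() == Error::Ok) {
      std::printf("%llu -> %s\n", static_cast<unsigned long long>(t), piece.get().c_str());
    }
  }
  return 0;
}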

.gitmodules

Lines changed: 6 additions & 6 deletions
@@ -55,15 +55,15 @@
 [submodule "examples/third-party/LLaVA"]
 	path = examples/third-party/LLaVA
 	url = https://github.com/haotian-liu/LLaVA.git
-[submodule "examples/models/llama2/third-party/re2"]
-	path = examples/models/llama2/third-party/re2
-	url = https://github.com/google/re2.git
-[submodule "examples/models/llama2/third-party/abseil-cpp"]
-	path = examples/models/llama2/third-party/abseil-cpp
-	url = https://github.com/abseil/abseil-cpp.git
 [submodule "third-party/ios-cmake"]
 	path = third-party/ios-cmake
 	url = https://github.com/leetal/ios-cmake
 [submodule "examples/models/phi-3-mini/third-party/sentencepiece"]
 	path = examples/models/phi-3-mini/third-party/sentencepiece
 	url = https://github.com/google/sentencepiece.git
+[submodule "extension/llm/third-party/re2"]
+	path = extension/llm/third-party/re2
+	url = https://github.com/google/re2.git
+[submodule "extension/llm/third-party/abseil-cpp"]
+	path = extension/llm/third-party/abseil-cpp
+	url = https://github.com/abseil/abseil-cpp.git

examples/models/llama2/CMakeLists.txt

Lines changed: 8 additions & 2 deletions
@@ -97,8 +97,14 @@ if(EXECUTORCH_USE_TIKTOKEN)
   set(ABSL_PROPAGATE_CXX_STD ON)
   set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/abseil-cpp)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2)
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/abseil-cpp
+    ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
+  )
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/re2
+    ${CMAKE_CURRENT_BINARY_DIR}/re2
+  )
   set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
   target_link_libraries(llama_runner PUBLIC re2::re2)
 endif()

examples/models/llama2/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ target_include_directories(

 if(EXECUTORCH_USE_TIKTOKEN)
   list(APPEND _llama_runner__srcs
-    ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/tiktoken.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
   )
   list(APPEND _llama_runner__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp

examples/models/llama2/runner/runner.cpp

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 #if ET_USE_TIKTOKEN
 #include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
 #else /* BPE */
-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
 #endif /* ET_USE_TIKTOKEN*/
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>
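The hunk above is the heart of the runner change: the BPE header now resolves to the extension, while the llama-specific tiktoken wrapper stays under examples/. A hedged sketch of that compile-time selection pattern follows; the make_tokenizer() helper is hypothetical, and BPETokenizer is assumed to be the class declared in the relocated header.

// Hypothetical helper illustrating the compile-time switch used by the runner.
#if ET_USE_TIKTOKEN
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#else /* BPE */
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#endif /* ET_USE_TIKTOKEN */

#include <memory>

namespace torch {
namespace executor {

std::unique_ptr<Tokenizer> make_tokenizer() {
#if ET_USE_TIKTOKEN
  // Llama-specific special tokens; this wrapper remains under examples/.
  return get_tiktoken_for_llama();
#else
  // Generic BPE tokenizer, now shared via extension/llm/tokenizer.
  return std::make_unique<BPETokenizer>();
#endif
}

} // namespace executor
} // namespace torch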

examples/models/llama2/runner/runner.h

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 #include <unordered_map>

 #include <executorch/examples/models/llama2/sampler/sampler.h>
-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

examples/models/llama2/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def define_common_targets():
        ] + ([
            "//executorch/examples/models/llama2/tokenizer:tiktoken",
        ] if use_tiktoken() else [
-           "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
+           "//executorch/extension/llm/tokenizer:bpe_tokenizer",
        ]) + (_get_operator_lib(aten)) + ([
            # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
            # Therefore enable it explicitly for now to avoid failing tests
Lines changed: 0 additions & 1 deletion
This file was deleted.
Lines changed: 0 additions & 1 deletion
This file was deleted.

examples/models/llama2/tokenizer/llama_tiktoken.h

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@

 #pragma once

-#include <executorch/examples/models/llama2/tokenizer/tiktoken.h>
+#include <executorch/extension/llm/tokenizer/tiktoken.h>

 namespace torch {
 namespace executor {
Lines changed: 5 additions & 26 deletions
@@ -1,44 +1,23 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

 def define_common_targets():
-    runtime.cxx_library(
-        name = "bpe_tokenizer",
-        srcs = [
-            "bpe_tokenizer.cpp",
-        ],
-        exported_headers = [
-            "tokenizer.h",
-            "bpe_tokenizer.h",
-        ],
-        exported_deps = [
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
-        ],
-        visibility = [
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+    """Defines targets that should be shared between fbcode and xplat.

+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
     runtime.cxx_library(
         name = "tiktoken",
         srcs = [
-            "tiktoken.cpp",
             "llama_tiktoken.cpp",
         ],
         exported_headers = [
-            "tokenizer.h",
-            "tiktoken.h",
             "llama_tiktoken.h",
-            "base64.h",
         ],
         exported_deps = [
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+            "//executorch/extension/llm/tokenizer:tiktoken",
         ],
         visibility = [
            "@EXECUTORCH_CLIENTS",
         ],
-        exported_external_deps = [
-            "re2",
-        ],
     )

examples/models/llama2/tokenizer/test/CMakeLists.txt

Lines changed: 17 additions & 17 deletions
@@ -21,31 +21,31 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)

 include(${EXECUTORCH_ROOT}/build/Test.cmake)

-set(
-  _tokenizer_test_srcs
-  test_tiktoken.cpp
-  test_bpe_tokenizer.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
+set(_tokenizer_test_srcs
+    test_tiktoken.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/tokenizer/tiktoken.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
 )

 set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
 set(ABSL_ENABLE_INSTALL ON)
 set(ABSL_PROPAGATE_CXX_STD ON)
 set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp)
-add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2)
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
+)
+add_subdirectory(
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/re2
+  ${CMAKE_CURRENT_BINARY_DIR}/re2
+)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

-et_cxx_test(
-  tokenizer_test
-  SOURCES
-  ${_tokenizer_test_srcs}
-  EXTRA_LIBS
-  re2::re2
-)
+et_cxx_test(tokenizer_test SOURCES ${_tokenizer_test_srcs} EXTRA_LIBS re2::re2)
 target_include_directories(
-  tokenizer_test PRIVATE ${CMAKE_INSTALL_PREFIX}/include ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
+  tokenizer_test
+  PRIVATE
+    ${CMAKE_INSTALL_PREFIX}/include
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../../../extension/llm/third-party/abseil-cpp
 )

examples/models/llama2/tokenizer/test/targets.bzl

Lines changed: 0 additions & 17 deletions
@@ -6,20 +6,6 @@ def define_common_targets():
     The directory containing this targets.bzl file should also contain both
     TARGETS and BUCK files that call this function.
     """
-
-    runtime.cxx_test(
-        name = "test_bpe_tokenizer",
-        srcs = [
-            "test_bpe_tokenizer.cpp",
-        ],
-        deps = [
-            "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
-        ],
-        env = {
-            "RESOURCES_PATH": "$(location :resources)/resources",
-        },
-    )
-
     runtime.cxx_test(
         name = "test_tiktoken",
         srcs = [
@@ -31,9 +17,6 @@ def define_common_targets():
         env = {
            "RESOURCES_PATH": "$(location :resources)/resources",
         },
-        external_deps = [
-            "re2",
-        ],
     )

     runtime.filegroup(

examples/models/llama2/tokenizer/test/test_tiktoken.cpp

Lines changed: 0 additions & 94 deletions
@@ -7,7 +7,6 @@
  */

 #include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <gtest/gtest.h>
 #include <vector>
@@ -17,19 +16,6 @@ using namespace ::testing;
 namespace torch {
 namespace executor {

-class TiktokenExtensionTest : public Test {
- public:
-  void SetUp() override {
-    torch::executor::runtime_init();
-    tokenizer_ = get_tiktoken_for_llama();
-    modelPath_ = std::getenv("RESOURCES_PATH") +
-        std::string("/test_tiktoken_tokenizer.model");
-  }
-
-  std::unique_ptr<Tokenizer> tokenizer_;
-  std::string modelPath_;
-};
-
 class MultimodalTiktokenV5ExtensionTest : public Test {
  public:
   void SetUp() override {
@@ -43,24 +29,6 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
   std::string modelPath_;
 };

-TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) {
-  Result<std::vector<uint64_t>> res = tokenizer_->encode("hello world", 0, 0);
-  EXPECT_EQ(res.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
-  auto result = tokenizer_->decode(0, 0);
-  EXPECT_EQ(result.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  EXPECT_EQ(tokenizer_->vocab_size(), 128256);
-  EXPECT_EQ(tokenizer_->bos_tok(), 128000);
-  EXPECT_EQ(tokenizer_->eos_tok(), 128001);
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -69,17 +37,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
 }

-TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
-  EXPECT_EQ(out.error(), Error::Ok);
-  EXPECT_EQ(out.get().size(), 3);
-  EXPECT_EQ(out.get()[0], 128000);
-  EXPECT_EQ(out.get()[1], 15339);
-  EXPECT_EQ(out.get()[2], 1917);
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -101,18 +58,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   }
 }

-TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
-  std::vector<uint64_t> tokens = {128000, 15339, 1917};
-  for (size_t i = 0; i < tokens.size(); i++) {
-    Result<std::string> out = tokenizer_->decode(0, tokens[i]);
-    EXPECT_EQ(out.error(), Error::Ok);
-    EXPECT_EQ(out.get(), expected[i]);
-  }
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -134,44 +79,5 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
     EXPECT_EQ(out.get(), expected[i]);
   }
 }
-
-TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  // The vocab size is 128256, addes 256 just so the token is out of vocab
-  // range.
-  Result<std::string> out = tokenizer_->decode(0, 128256 + 256);
-  EXPECT_EQ(out.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, ConstructionWithInvalidBOSIndex) {
-  // gtest death test doesn't work on iOS:
-  // https://github.com/google/googletest/issues/2834
-#if !GTEST_OS_IOS
-  EXPECT_EXIT(
-      std::make_unique<Tiktoken>(
-          std::make_unique<std::vector<std::string>>(
-              std::vector<std::string>{"<|end_of_text|>"}),
-          1,
-          0),
-      ::testing::KilledBySignal(SIGABRT),
-      "");
-#endif
-}
-
-TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) {
-  // gtest death test doesn't work on iOS:
-  // https://github.com/google/googletest/issues/2834
-#if !GTEST_OS_IOS
-  EXPECT_EXIT(
-      std::make_unique<Tiktoken>(
-          std::make_unique<std::vector<std::string>>(
-              std::vector<std::string>{"<|begin_of_text|>"}),
-          0,
-          1),
-      ::testing::KilledBySignal(SIGABRT),
-      "");
-#endif
-}
 } // namespace executor
 } // namespace torch
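Per the summary, the TiktokenExtensionTest cases removed above were not dropped; they moved to extension/llm/tokenizer/test. A hedged sketch of what such a relocated default test could look like follows; the fixture name, the two-token special-token list, and the named constructor arguments are assumptions, while the constructor shape and API calls mirror the tests removed above.

// Sketch of a relocated default tiktoken test; the multimodal tests above
// stay under examples/models/llama2/tokenizer/test.
#include <executorch/extension/llm/tokenizer/tiktoken.h>
#include <executorch/runtime/platform/runtime.h>
#include <gtest/gtest.h>

#include <cstdlib>
#include <memory>
#include <string>
#include <vector>

using namespace ::testing;

namespace torch {
namespace executor {

class TiktokenExtensionTest : public Test {
 public:
  void SetUp() override {
    torch::executor::runtime_init();
    // The llama tokenizer defines more special tokens via
    // get_tiktoken_for_llama(); two are assumed sufficient for this sketch.
    tokenizer_ = std::make_unique<Tiktoken>(
        std::make_unique<std::vector<std::string>>(std::vector<std::string>{
            "<|begin_of_text|>", "<|end_of_text|>"}),
        /*bos_token_index=*/0,
        /*eos_token_index=*/1);
    modelPath_ = std::getenv("RESOURCES_PATH") +
        std::string("/test_tiktoken_tokenizer.model");
  }

  std::unique_ptr<Tokenizer> tokenizer_;
  std::string modelPath_;
};

TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) {
  Result<std::vector<uint64_t>> res = tokenizer_->encode("hello world", 0, 0);
  EXPECT_EQ(res.error(), Error::NotSupported);
}

} // namespace executor
} // namespace torch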

examples/qualcomm/llama2/runner/runner.cpp

Lines changed: 1 addition & 1 deletion
@@ -9,9 +9,9 @@
 // A simple llama2 runner that includes preprocessing and post processing logic.
 // The module takes in a string as input and emits a string as output.

-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #include <executorch/examples/qualcomm/llama2/runner/runner.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

 #include <ctime>

extension/android/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -128,11 +128,11 @@ if(EXECUTORCH_BUILD_LLAMA_JNI)
   set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
   add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/abseil-cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/abseil-cpp
     ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
   )
   add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../examples/models/llama2/third-party/re2
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/third-party/re2
     ${CMAKE_CURRENT_BINARY_DIR}/re2
   )
   set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

extension/llm/third-party/abseil-cpp

Submodule abseil-cpp added at eb85220

extension/llm/third-party/re2

Submodule re2 added at 6dcd83d

examples/models/llama2/tokenizer/bpe_tokenizer.cpp renamed to extension/llm/tokenizer/bpe_tokenizer.cpp

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */

-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

 #include <string>

examples/models/llama2/tokenizer/bpe_tokenizer.h renamed to extension/llm/tokenizer/bpe_tokenizer.h

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@

 #pragma once

-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <cstdint>

 namespace torch {
