
Commit 45db5ce

Move bpe and tiktoken tokenizer into extension/llm
Pull Request resolved: #4271
ghstack-source-id: 233839158
Differential Revision: [D59779781](https://our.internmc.facebook.com/intern/diff/D59779781/)
1 parent 4bd9487 commit 45db5ce

23 files changed (+128,288 -153 lines)
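This commit relocates the shared BPE and tiktoken tokenizer sources from the llama2 example into extension/llm/tokenizer, so the per-file diffs below are mostly include-path and build-target updates; the tokenizer classes and APIs are unchanged. A minimal sketch of the migration for downstream C++ code, mirroring the ET_USE_TIKTOKEN switch in examples/models/llama2/runner/runner.cpp (the path in the comment is the one removed by this commit):

```cpp
// Tokenizer include paths after this commit: the BPE tokenizer is now a
// shared extension, while llama_tiktoken.h stays with the llama2 example
// and internally pulls in <executorch/extension/llm/tokenizer/tiktoken.h>.
#if ET_USE_TIKTOKEN
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#else /* BPE */
// Previously: <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#endif /* ET_USE_TIKTOKEN */
```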

examples/models/llama2/runner/runner.cpp

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 #if ET_USE_TIKTOKEN
 #include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
 #else /* BPE */
-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
 #endif /* ET_USE_TIKTOKEN*/
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

examples/models/llama2/runner/runner.h

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 #include <unordered_map>

 #include <executorch/examples/models/llama2/sampler/sampler.h>
-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

examples/models/llama2/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def define_common_targets():
         ] + ([
             "//executorch/examples/models/llama2/tokenizer:tiktoken",
         ] if use_tiktoken() else [
-            "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
+            "//executorch/extension/llm/tokenizer:bpe_tokenizer",
         ]) + (_get_operator_lib(aten)) + ([
             # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
             # Therefore enable it explicitly for now to avoid failing tests

examples/models/llama2/tokenizer/llama_tiktoken.h

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@

 #pragma once

-#include <executorch/examples/models/llama2/tokenizer/tiktoken.h>
+#include <executorch/extension/llm/tokenizer/tiktoken.h>

 namespace torch {
 namespace executor {
examples/models/llama2/tokenizer/targets.bzl

Lines changed: 7 additions & 27 deletions
@@ -1,44 +1,24 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

 def define_common_targets():
-    runtime.cxx_library(
-        name = "bpe_tokenizer",
-        srcs = [
-            "bpe_tokenizer.cpp",
-        ],
-        exported_headers = [
-            "tokenizer.h",
-            "bpe_tokenizer.h",
-        ],
-        exported_deps = [
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
-        ],
-        visibility = [
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """

     runtime.cxx_library(
         name = "tiktoken",
         srcs = [
-            "tiktoken.cpp",
             "llama_tiktoken.cpp",
         ],
         exported_headers = [
-            "tokenizer.h",
-            "tiktoken.h",
             "llama_tiktoken.h",
-            "base64.h",
         ],
         exported_deps = [
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+            "//executorch/extension/llm/tokenizer:tiktoken",
         ],
         visibility = [
-            "@EXECUTORCH_CLIENTS",
-        ],
-        exported_external_deps = [
-            "re2",
+            "//executorch/examples/models/llama2/...",
         ],
     )

examples/models/llama2/tokenizer/test/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
@@ -24,10 +24,7 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake)
 set(
   _tokenizer_test_srcs
   test_tiktoken.cpp
-  test_bpe_tokenizer.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
 )

 set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)

examples/models/llama2/tokenizer/test/targets.bzl

Lines changed: 0 additions & 17 deletions
@@ -6,20 +6,6 @@ def define_common_targets():
     The directory containing this targets.bzl file should also contain both
     TARGETS and BUCK files that call this function.
     """
-
-    runtime.cxx_test(
-        name = "test_bpe_tokenizer",
-        srcs = [
-            "test_bpe_tokenizer.cpp",
-        ],
-        deps = [
-            "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
-        ],
-        env = {
-            "RESOURCES_PATH": "$(location :resources)/resources",
-        },
-    )
-
     runtime.cxx_test(
         name = "test_tiktoken",
         srcs = [
@@ -31,9 +17,6 @@ def define_common_targets():
         env = {
             "RESOURCES_PATH": "$(location :resources)/resources",
         },
-        external_deps = [
-            "re2",
-        ],
     )

     runtime.filegroup(

examples/models/llama2/tokenizer/test/test_tiktoken.cpp

Lines changed: 0 additions & 94 deletions
@@ -7,7 +7,6 @@
  */

 #include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <gtest/gtest.h>
 #include <vector>
@@ -17,19 +16,6 @@ using namespace ::testing;
 namespace torch {
 namespace executor {

-class TiktokenExtensionTest : public Test {
- public:
-  void SetUp() override {
-    torch::executor::runtime_init();
-    tokenizer_ = get_tiktoken_for_llama();
-    modelPath_ = std::getenv("RESOURCES_PATH") +
-        std::string("/test_tiktoken_tokenizer.model");
-  }
-
-  std::unique_ptr<Tokenizer> tokenizer_;
-  std::string modelPath_;
-};
-
 class MultimodalTiktokenV5ExtensionTest : public Test {
  public:
   void SetUp() override {
@@ -43,24 +29,6 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
   std::string modelPath_;
 };

-TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) {
-  Result<std::vector<uint64_t>> res = tokenizer_->encode("hello world", 0, 0);
-  EXPECT_EQ(res.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
-  auto result = tokenizer_->decode(0, 0);
-  EXPECT_EQ(result.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  EXPECT_EQ(tokenizer_->vocab_size(), 128256);
-  EXPECT_EQ(tokenizer_->bos_tok(), 128000);
-  EXPECT_EQ(tokenizer_->eos_tok(), 128001);
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -69,17 +37,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
 }

-TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
-  EXPECT_EQ(out.error(), Error::Ok);
-  EXPECT_EQ(out.get().size(), 3);
-  EXPECT_EQ(out.get()[0], 128000);
-  EXPECT_EQ(out.get()[1], 15339);
-  EXPECT_EQ(out.get()[2], 1917);
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -101,18 +58,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   }
 }

-TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
-  std::vector<uint64_t> tokens = {128000, 15339, 1917};
-  for (size_t i = 0; i < tokens.size(); i++) {
-    Result<std::string> out = tokenizer_->decode(0, tokens[i]);
-    EXPECT_EQ(out.error(), Error::Ok);
-    EXPECT_EQ(out.get(), expected[i]);
-  }
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -134,44 +79,5 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
     EXPECT_EQ(out.get(), expected[i]);
   }
 }
-
-TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  // The vocab size is 128256, addes 256 just so the token is out of vocab
-  // range.
-  Result<std::string> out = tokenizer_->decode(0, 128256 + 256);
-  EXPECT_EQ(out.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, ConstructionWithInvalidBOSIndex) {
-  // gtest death test doesn't work on iOS:
-  // https://github.com/google/googletest/issues/2834
-#if !GTEST_OS_IOS
-  EXPECT_EXIT(
-      std::make_unique<Tiktoken>(
-          std::make_unique<std::vector<std::string>>(
-              std::vector<std::string>{"<|end_of_text|>"}),
-          1,
-          0),
-      ::testing::KilledBySignal(SIGABRT),
-      "");
-#endif
-}
-
-TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) {
-  // gtest death test doesn't work on iOS:
-  // https://github.com/google/googletest/issues/2834
-#if !GTEST_OS_IOS
-  EXPECT_EXIT(
-      std::make_unique<Tiktoken>(
-          std::make_unique<std::vector<std::string>>(
-              std::vector<std::string>{"<|begin_of_text|>"}),
-          0,
-          1),
-      ::testing::KilledBySignal(SIGABRT),
-      "");
-#endif
-}
 } // namespace executor
 } // namespace torch
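The TiktokenExtensionTest cases removed above move with the tokenizer rather than disappearing, and they document the tokenizer API: load, encode, decode, vocab_size, bos_tok, eos_tok. Below is a minimal standalone sketch assembled from those test cases; the main() wrapper and the literal model path are illustrative (the tests resolve the path from the RESOURCES_PATH environment variable), and only the calls shown in the removed tests are used.

```cpp
// Sketch, not part of the diff: exercises the tokenizer API covered by the
// removed TiktokenExtensionTest cases; the Tiktoken implementation now lives
// under extension/llm/tokenizer.
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/runtime/platform/runtime.h>

#include <cstdio>
#include <memory>
#include <string>
#include <vector>

using namespace torch::executor;

int main() {
  runtime_init();

  std::unique_ptr<Tokenizer> tokenizer = get_tiktoken_for_llama();

  // encode()/decode() fail with Error::NotSupported until load() succeeds.
  Error err = tokenizer->load("test_tiktoken_tokenizer.model");
  if (err != Error::Ok) {
    return 1;
  }

  // The removed TokenizerVocabSizeIsExpected test expects vocab_size() ==
  // 128256, bos_tok() == 128000, eos_tok() == 128001 for this model.
  std::printf("vocab size: %llu\n",
              (unsigned long long)tokenizer->vocab_size());

  // encode(text, n_bos, n_eos); "hello world" with one BOS token yields
  // {128000, 15339, 1917} in the removed TokenizerEncodeCorrectly test.
  Result<std::vector<uint64_t>> ids = tokenizer->encode("hello world", 1, 0);
  if (ids.error() != Error::Ok) {
    return 1;
  }

  // decode(prev_token, token) maps a single token id back to its text piece.
  for (uint64_t id : ids.get()) {
    Result<std::string> piece = tokenizer->decode(0, id);
    if (piece.error() == Error::Ok) {
      std::printf("%llu -> %s\n", (unsigned long long)id, piece.get().c_str());
    }
  }
  return 0;
}
```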

examples/qualcomm/llama2/runner/runner.cpp

Lines changed: 1 addition & 1 deletion
@@ -9,9 +9,9 @@
 // A simple llama2 runner that includes preprocessing and post processing logic.
 // The module takes in a string as input and emits a string as output.

-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #include <executorch/examples/qualcomm/llama2/runner/runner.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

 #include <ctime>

examples/models/llama2/tokenizer/bpe_tokenizer.cpp renamed to extension/llm/tokenizer/bpe_tokenizer.cpp

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */

-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

 #include <string>

examples/models/llama2/tokenizer/bpe_tokenizer.h renamed to extension/llm/tokenizer/bpe_tokenizer.h

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@

 #pragma once

-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <cstdint>

 namespace torch {

extension/llm/tokenizer/targets.bzl

Lines changed: 40 additions & 0 deletions
@@ -37,3 +37,43 @@ def define_common_targets():
             ":tokenizer_py_lib",
         ],
     )
+
+    runtime.cxx_library(
+        name = "bpe_tokenizer",
+        srcs = [
+            "bpe_tokenizer.cpp",
+        ],
+        exported_headers = [
+            "tokenizer.h",
+            "bpe_tokenizer.h",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "tiktoken",
+        srcs = [
+            "tiktoken.cpp",
+        ],
+        exported_headers = [
+            "tokenizer.h",
+            "tiktoken.h",
+            "base64.h",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        exported_external_deps = [
+            "re2",
+        ],
+    )
extension/llm/tokenizer/test/CMakeLists.txt

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+
+cmake_minimum_required(VERSION 3.19)
+project(tokenizer_test)
+
+# Use C++17 for test.
+set(CMAKE_CXX_STANDARD 17)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+
+include(${EXECUTORCH_ROOT}/build/Test.cmake)
+
+set(
+  _tokenizer_test_srcs
+  test_tiktoken.cpp
+  test_bpe_tokenizer.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
+)
+
+set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
+set(ABSL_ENABLE_INSTALL ON)
+set(ABSL_PROPAGATE_CXX_STD ON)
+set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2)
+set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
+
+et_cxx_test(
+  tokenizer_test
+  SOURCES
+  ${_tokenizer_test_srcs}
+  EXTRA_LIBS
+  re2::re2
+)
+target_include_directories(
+  tokenizer_test PRIVATE ${CMAKE_INSTALL_PREFIX}/include ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
+)
