
Commit 71ea375

larryliu0820 authored and facebook-github-bot committed
Migrate users of llm tokenizer to use pytorch-labs/tokenizers (#9114)
Summary: Pull Request resolved: #9114

Finally migrate llm tokenizer usages to pytorch-labs/tokenizers.

Reviewed By: iseeyuan

Differential Revision: D70932091
1 parent f789df2 commit 71ea375

36 files changed · +178 −101 lines changed
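
At a glance, this migration swaps the `executorch::extension::llm` tokenizer types for their `tokenizers::` counterparts from pytorch-labs/tokenizers. Below is a minimal sketch of the post-migration call pattern, assembled from the call sites changed in this commit; the file path, prompt, and bos/eos values are illustrative, not from the commit:

```cpp
#include <memory>
#include <vector>

#include <pytorch/tokenizers/llama2c_tokenizer.h>

int main() {
  // Concrete tokenizers now live in the top-level `tokenizers` namespace
  // and share the `tokenizers::Tokenizer` base class.
  std::unique_ptr<tokenizers::Tokenizer> tokenizer =
      std::make_unique<tokenizers::Llama2cTokenizer>();

  // load() now reports tokenizers::Error instead of executorch's Error.
  if (tokenizer->load("/path/to/tokenizer.bin") != tokenizers::Error::Ok) {
    return 1;
  }

  // encode() returns tokenizers::Result<T>, checked via error() as in
  // runner.cpp below.
  tokenizers::Result<std::vector<uint64_t>> res =
      tokenizer->encode("hello world", /*bos=*/1, /*eos=*/0);
  return res.error() == tokenizers::Error::Ok ? 0 : 1;
}
```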

.ci/scripts/utils.sh

Lines changed: 8 additions & 0 deletions
```diff
@@ -20,6 +20,14 @@ clean_executorch_install_folders() {
   ./install_executorch.sh --clean
 }
 
+update_tokenizers_git_submodule() {
+  echo "Updating tokenizers git submodule..."
+  git submodule update --init
+  pushd extension/llm/tokenizers
+  git submodule update --init
+  popd
+}
+
 install_executorch() {
   which pip
   # Install executorch, this assumes that Executorch is checked out in the
```

examples/mediatek/CMakeLists.txt

Lines changed: 4 additions & 4 deletions
```diff
@@ -137,18 +137,18 @@ if(${ANDROID})
   set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
 
   # Build tokenizers
-  set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizer)
+  set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizers)
   add_library(tokenizer STATIC)
   target_include_directories(
     tokenizer PUBLIC ${_common_include_directories} ${THIRD_PARTY_ABSL_DIR}
-                     ${THIRD_PARTY_RE2_DIR}
+                     ${THIRD_PARTY_RE2_DIR} ${LLAMA2_TOKENIZER_DIR}/include
   )
   target_link_libraries(tokenizer PRIVATE re2::re2)
   target_sources(
     tokenizer
     PRIVATE
-      ${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp
-      ${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp
+      ${LLAMA2_TOKENIZER_DIR}/src/tiktoken.cpp
+      ${LLAMA2_TOKENIZER_DIR}/src/llama2c_tokenizer.cpp
       ${CMAKE_CURRENT_SOURCE_DIR}/../models/llama/tokenizer/llama_tiktoken.cpp
   )
```

examples/mediatek/executor_runner/mtk_llama_executor_runner.cpp

Lines changed: 4 additions & 4 deletions
```diff
@@ -68,8 +68,8 @@
 #include "llama_runner/llm_helper/include/llm_types.h"
 
 #include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
-#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
-#include <executorch/extension/llm/tokenizer/tiktoken.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <pytorch/tokenizers/tiktoken.h>
 
 // Llama model options
 DEFINE_uint64(
@@ -140,10 +140,10 @@ using example::utils::read_file;
 using example::utils::split;
 using example::utils::Timer;
 using example::utils::to_string;
-using executorch::extension::llm::BPETokenizer;
-using executorch::extension::llm::Tokenizer;
 using executorch::runtime::Error;
 using executorch::runtime::Result;
+using tokenizers::Llama2cTokenizer;
+using tokenizers::Tokenizer;
 
 LlamaModelOptions get_model_options() {
   LlamaModelOptions options = {
```
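
Since both concrete tokenizers now derive from `tokenizers::Tokenizer`, a runner can keep selecting the implementation at runtime behind one interface. A hypothetical factory sketching the pattern this runner follows (the `use_tiktoken` flag and the helper itself are illustrative, not part of the commit):

```cpp
#include <memory>

#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>

// Hypothetical factory: both Tiktoken (via get_tiktoken_for_llama) and
// Llama2cTokenizer convert to the common tokenizers::Tokenizer base.
std::unique_ptr<tokenizers::Tokenizer> make_tokenizer(bool use_tiktoken) {
  if (use_tiktoken) {
    return example::get_tiktoken_for_llama();
  }
  return std::make_unique<tokenizers::Llama2cTokenizer>();
}
```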

examples/mediatek/executor_runner/mtk_llama_runner.h

Lines changed: 3 additions & 3 deletions
```diff
@@ -14,8 +14,8 @@
 #include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
 #include <executorch/extension/llm/runner/irunner.h>
 #include <executorch/extension/llm/runner/stats.h>
-#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
-#include <executorch/extension/llm/tokenizer/tiktoken.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
+#include <pytorch/tokenizers/tiktoken.h>
 #include <cstdint>
 #include <functional>
 #include <memory>
@@ -28,9 +28,9 @@ using Stats = ::executorch::llm::Stats;
 using example::LlamaModelOptions;
 using example::LlamaModelPaths;
 using example::LlamaRuntime;
-using executorch::extension::llm::Tokenizer;
 using executorch::runtime::Error;
 using executorch::runtime::Result;
+using tokenizers::Tokenizer;
 
 class MTKLlamaRunner : public executorch::extension::llm::IRunner {
  public:
```

examples/models/llama/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
```diff
@@ -209,7 +209,11 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release")
   endif()
 endif()
 
-target_include_directories(llama_main PUBLIC ${_common_include_directories})
+target_include_directories(
+  llama_main
+  PUBLIC ${_common_include_directories}
+         ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
 target_link_libraries(llama_main PUBLIC llama_runner ${link_libraries})
 target_compile_options(llama_main PUBLIC ${_common_compile_options})
```

examples/models/llama/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -43,7 +43,7 @@ target_include_directories(
 
 list(
   APPEND _llama_runner__srcs
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/src/tiktoken.cpp
 )
 list(APPEND _llama_runner__srcs
   ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp
```

examples/models/llama/runner/runner.cpp

Lines changed: 13 additions & 8 deletions
```diff
@@ -16,7 +16,7 @@
 #include <executorch/extension/llm/runner/util.h>
 
 #include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
-#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 
 namespace example {
 
@@ -78,17 +78,21 @@ Error Runner::load() {
   // load tokenizer. Assuming tiktoken is the default tokenizer
   tokenizer_ = nullptr;
   tokenizer_ = get_tiktoken_for_llama();
-  Error err = tokenizer_->load(tokenizer_path_);
+  ::tokenizers::Error err = tokenizer_->load(tokenizer_path_);
   // Rely on tiktoken to throw error if the artifact is incompatible. Then we
   // fallback to BPE tokenizer.
-  if (err == Error::InvalidArgument) {
+  if (err != ::tokenizers::Error::Ok) {
     ET_LOG(
         Info,
         "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
         tokenizer_path_.c_str());
     tokenizer_.reset();
-    tokenizer_ = std::make_unique<llm::BPETokenizer>();
-    tokenizer_->load(tokenizer_path_);
+    tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
+    err = tokenizer_->load(tokenizer_path_);
+    ET_CHECK_TK_OK_OR_RETURN_ERROR(
+        err,
+        "Failed to load %s as a llama2.c tokenizer artifact",
+        tokenizer_path_.c_str());
   }
 
   ET_LOG(Info, "Reading metadata from model");
@@ -201,12 +205,12 @@ Error Runner::generate(
       ? seq_len
       : metadata_.at(kMaxSeqLen);
 
-  Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
+  ::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
       prompt,
       /* bos */ 0,
      /* eos */ 0);
 
-  ET_CHECK_OK_OR_RETURN_ERROR(
+  ET_CHECK_TK_OK_OR_RETURN_ERROR(
       encode_res.error(), "Failed to encode prompt %s", prompt.c_str());
 
   // encode the (string) prompt into tokens sequence
@@ -242,7 +246,8 @@
   uint64_t cur_token = prefill_res.get();
 
   // print the first token from prefill. No prev_token so use cur_token for it.
-  wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
+  wrapped_callback(
+      ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
   RUNNER_ET_LOG(
       warmup,
       "RSS after prompt prefill: %f MiB (0 if unsupported)",
```

examples/models/llama/runner/runner.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -23,8 +23,8 @@
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
-#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <executorch/extension/module/module.h>
+#include <pytorch/tokenizers/tokenizer.h>
 
 namespace example {
 
@@ -58,7 +58,7 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
   // model
   std::unique_ptr<::executorch::extension::Module> module_;
   std::string tokenizer_path_;
-  std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_;
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
   std::unordered_map<std::string, int64_t> metadata_;
   std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
       text_decoder_runner_;
```

examples/models/llama/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
```diff
@@ -48,7 +48,7 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
             "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
             "//executorch/examples/models/llama/tokenizer:tiktoken",
-            "//executorch/extension/llm/tokenizer:bpe_tokenizer",
+            "//pytorch/tokenizers:llama2c_tokenizer",
         ] + (_get_operator_lib(aten)) + ([
             # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
             # Therefore enable it explicitly for now to avoid failing tests
```

examples/models/llama/tokenizer/llama_tiktoken.cpp

Lines changed: 18 additions & 19 deletions
```diff
@@ -10,7 +10,7 @@
 
 namespace example {
 
-using ::executorch::extension::llm::Tiktoken;
+using ::tokenizers::Tiktoken;
 
 namespace {
 static constexpr int32_t kSpecialTokensSize = 256;
@@ -42,8 +42,23 @@ _get_default_special_tokens() {
   return special_tokens;
 }
 
-static inline std::unique_ptr<std::vector<std::string>>
-_get_multimodal_special_tokens() {
+std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
+  switch (version) {
+    case Version::Multimodal:
+      return get_multimodal_special_tokens();
+    default:
+      return _get_default_special_tokens();
+  }
+}
+
+} // namespace
+
+std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
+  return std::make_unique<Tiktoken>(
+      _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
+}
+
+std::unique_ptr<std::vector<std::string>> get_multimodal_special_tokens() {
   auto special_tokens =
       std::make_unique<std::vector<std::string>>(std::vector<std::string>{
           "<|begin_of_text|>",
@@ -72,20 +87,4 @@
   return special_tokens;
 }
 
-std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
-  switch (version) {
-    case Version::Multimodal:
-      return _get_multimodal_special_tokens();
-    default:
-      return _get_default_special_tokens();
-  }
-}
-
-} // namespace
-
-std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
-  return std::make_unique<Tiktoken>(
-      _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
-}
-
 } // namespace example
```

examples/models/llama/tokenizer/llama_tiktoken.h

Lines changed: 4 additions & 2 deletions
```diff
@@ -8,7 +8,7 @@
 
 #pragma once
 
-#include <executorch/extension/llm/tokenizer/tiktoken.h>
+#include <pytorch/tokenizers/tiktoken.h>
 
 namespace example {
 
@@ -17,7 +17,9 @@ enum class Version {
   Multimodal,
 };
 
-std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama(
+std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama(
     Version version = Version::Default);
 
+std::unique_ptr<std::vector<std::string>> get_multimodal_special_tokens();
+
 } // namespace example
```

examples/models/llama/tokenizer/targets.bzl

Lines changed: 2 additions & 1 deletion
```diff
@@ -15,7 +15,8 @@ def define_common_targets():
             "llama_tiktoken.h",
         ],
         exported_deps = [
-            "//executorch/extension/llm/tokenizer:tiktoken",
+            "//pytorch/tokenizers:tiktoken",
+            "//executorch/extension/llm/tokenizer:tiktoken",  # TODO: remove
         ],
         visibility = [
             "@EXECUTORCH_CLIENTS",
```

examples/models/llama/tokenizer/test/test_tiktoken.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -10,7 +10,7 @@
 
 #include <vector>
 
-#include <executorch/runtime/platform/runtime.h>
+#include <executorch/extension/llm/tokenizer/tiktoken.h>
 
 #include <gtest/gtest.h>
 
@@ -36,8 +36,8 @@ static std::string get_resource_path(const std::string& name) {
 class MultimodalTiktokenV5ExtensionTest : public Test {
  public:
   void SetUp() override {
-    executorch::runtime::runtime_init();
-    tokenizer_ = get_tiktoken_for_llama(Version::Multimodal);
+    tokenizer_ = std::make_unique<executorch::extension::llm::Tiktoken>(
+        example::get_multimodal_special_tokens(), 0, 1);
     modelPath_ = get_resource_path("test_tiktoken_tokenizer.model");
   }
 
```

examples/models/llava/runner/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
```diff
@@ -29,7 +29,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 set(_llava_runner__srcs
   "${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp"
   "${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp"
-  "${EXECUTORCH_ROOT}/extension/llm/tokenizer/bpe_tokenizer.cpp"
+  "${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/llama2c_tokenizer.cpp"
 )
 
 # extension llm runner lib
@@ -47,5 +47,6 @@ set(llava_runner_deps executorch extension_data_loader extension_llm_runner
 target_link_libraries(llava_runner PUBLIC ${llava_runner_deps})
 
 target_include_directories(
-  llava_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
+  llava_runner INTERFACE ${_common_include_directories}
+                         ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
 )
```

examples/models/llava/runner/llava_runner.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -13,7 +13,7 @@
 #include <executorch/examples/models/llava/runner/llava_image_prefiller.h>
 #include <executorch/examples/models/llava/runner/llava_runner.h>
 #include <executorch/examples/models/llava/runner/llava_text_decoder_runner.h>
-#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 
 #include <ctime>
 #include <memory>
@@ -43,7 +43,7 @@ Error LlavaRunner::load() {
   stats_.model_load_start_ms = llm::time_in_ms();
 
   // Load the tokenizer
-  tokenizer_ = std::make_unique<llm::BPETokenizer>();
+  tokenizer_ = std::make_unique<tokenizers::Llama2cTokenizer>();
   tokenizer_->load(tokenizer_path_);
 
   // Load the text decoder runner
@@ -90,7 +90,7 @@ Result<uint64_t> LlavaRunner::prefill_prompt(
     int8_t bos,
     int8_t eos) {
   std::vector<uint64_t> prompt_tokens =
-      ET_UNWRAP(tokenizer_->encode(prompt, bos, eos));
+      ET_UNWRAP_TOKENIZER(tokenizer_->encode(prompt, bos, eos));
 
   return text_prefiller_->prefill(prompt_tokens, start_pos);
 }
```
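
For completeness, the same `Llama2cTokenizer` used standalone; a minimal sketch assuming a llama2.c-format tokenizer artifact, and assuming `tokenizers::Result` exposes a `get()` accessor alongside the `error()` accessor seen above (path and prompt are illustrative):

```cpp
#include <cstdint>
#include <string>
#include <vector>

#include <pytorch/tokenizers/llama2c_tokenizer.h>

int main() {
  tokenizers::Llama2cTokenizer tokenizer;
  if (tokenizer.load("/path/to/tokenizer.bin") != tokenizers::Error::Ok) {
    return 1;
  }

  // prefill_prompt() above forwards its int8_t bos/eos arguments straight
  // into encode(); here we ask for one BOS token and no EOS token.
  auto res = tokenizer.encode("a photo of a cat", /*bos=*/1, /*eos=*/0);
  if (res.error() != tokenizers::Error::Ok) {
    return 1;
  }
  const std::vector<uint64_t>& ids = res.get();  // assumption: get() accessor
  return ids.empty() ? 1 : 0;
}
```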

examples/models/llava/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
```diff
@@ -14,7 +14,6 @@ def define_common_targets():
         exported_deps = [
             "//executorch/backends/xnnpack:xnnpack_backend",
             "//executorch/extension/llm/runner:runner_lib",
-            "//executorch/extension/llm/tokenizer:bpe_tokenizer",
             "//executorch/extension/evalue_util:print_evalue",
             "//executorch/extension/module:module",
             "//executorch/extension/tensor:tensor",
@@ -23,5 +22,6 @@ def define_common_targets():
             "//executorch/runtime/core/exec_aten/util:tensor_util",
             "//executorch/configurations:optimized_native_cpu_ops",
             "//executorch/extension/llm/custom_ops:custom_ops",
+            "//pytorch/tokenizers:llama2c_tokenizer",
         ],
     )
```

examples/models/phi-3-mini/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
```diff
@@ -41,11 +41,12 @@ add_executable(
   phi_3_mini_runner
   main.cpp runner.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizer/bpe_tokenizer.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp
 )
 target_include_directories(
   phi_3_mini_runner
   PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
+         ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include
 )
 target_link_libraries(
   phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor
```
