
Commit 8262bad

larryliu0820 authored and facebook-github-bot committed
Migrate users of llm tokenizer to use pytorch-labs/tokenizers (#9114)
Summary: Finally migrate llm tokenizer usages to pytorch-labs/tokenizers. Differential Revision: D70932091
1 parent 366ad75 commit 8262bad

18 files changed: +93 additions, -56 deletions


.ci/scripts/utils.sh

Lines changed: 8 additions & 0 deletions
@@ -20,6 +20,14 @@ clean_executorch_install_folders() {
   ./install_executorch.sh --clean
 }
 
+update_tokenizers_git_submodule() {
+  echo "Updating tokenizers git submodule..."
+  git submodule update --init
+  pushd extension/llm/tokenizers
+  git submodule update --init
+  popd
+}
+
 install_executorch() {
   which pip
   # Install executorch, this assumes that Executorch is checked out in the

examples/models/llama/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ target_include_directories(
 
 list(
     APPEND _llama_runner__srcs
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/src/tiktoken.cpp
 )
 list(APPEND _llama_runner__srcs
      ${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp

examples/models/llama/runner/runner.cpp

Lines changed: 7 additions & 7 deletions
@@ -16,7 +16,7 @@
 #include <executorch/extension/llm/runner/util.h>
 
 #include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
-#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 
 namespace example {
 
@@ -78,16 +78,16 @@ Error Runner::load() {
   // load tokenizer. Assuming tiktoken is the default tokenizer
   tokenizer_ = nullptr;
   tokenizer_ = get_tiktoken_for_llama();
-  Error err = tokenizer_->load(tokenizer_path_);
+  ::tokenizers::Error err = tokenizer_->load(tokenizer_path_);
   // Rely on tiktoken to throw error if the artifact is incompatible. Then we
   // fallback to BPE tokenizer.
-  if (err == Error::InvalidArgument) {
+  if (err == ::tokenizers::Error::LoadFailure) {
     ET_LOG(
         Info,
         "Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
         tokenizer_path_.c_str());
     tokenizer_.reset();
-    tokenizer_ = std::make_unique<llm::BPETokenizer>();
+    tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
     tokenizer_->load(tokenizer_path_);
   }
 
@@ -201,12 +201,12 @@ Error Runner::generate(
       ? seq_len
       : metadata_.at(kMaxSeqLen);
 
-  Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
+  ::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
       prompt,
       /* bos */ 0,
       /* eos */ 0);
 
-  ET_CHECK_OK_OR_RETURN_ERROR(
+  ET_CHECK_TK_OK_OR_RETURN_ERROR(
       encode_res.error(), "Failed to encode prompt %s", prompt.c_str());
 
   // encode the (string) prompt into tokens sequence
@@ -242,7 +242,7 @@ Error Runner::generate(
   uint64_t cur_token = prefill_res.get();
 
   // print the first token from prefill. No prev_token so use cur_token for it.
-  wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
+  wrapped_callback(ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
   RUNNER_ET_LOG(
       warmup,
       "RSS after prompt prefill: %f MiB (0 if unsupported)",

examples/models/llama/runner/runner.h

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
-#include <executorch/extension/llm/tokenizer/tokenizer.h>
+#include <pytorch/tokenizers/tokenizer.h>
 #include <executorch/extension/module/module.h>
 
 namespace example {
@@ -58,7 +58,7 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
   // model
   std::unique_ptr<::executorch::extension::Module> module_;
   std::string tokenizer_path_;
-  std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_;
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
   std::unordered_map<std::string, int64_t> metadata_;
   std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
       text_decoder_runner_;

examples/models/llama/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def define_common_targets():
         "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
         "//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
         "//executorch/examples/models/llama/tokenizer:tiktoken",
-        "//executorch/extension/llm/tokenizer:bpe_tokenizer",
+        "//pytorch/tokenizers:llama2c_tokenizer",
     ] + (_get_operator_lib(aten)) + ([
         # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
         # Therefore enable it explicitly for now to avoid failing tests

examples/models/llama/tokenizer/llama_tiktoken.cpp

Lines changed: 20 additions & 19 deletions
@@ -10,7 +10,7 @@
 
 namespace example {
 
-using ::executorch::extension::llm::Tiktoken;
+using ::tokenizers::Tiktoken;
 
 namespace {
 static constexpr int32_t kSpecialTokensSize = 256;
@@ -42,8 +42,25 @@ _get_default_special_tokens() {
   return special_tokens;
 }
 
-static inline std::unique_ptr<std::vector<std::string>>
-_get_multimodal_special_tokens() {
+
+std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
+  switch (version) {
+    case Version::Multimodal:
+      return get_multimodal_special_tokens();
+    default:
+      return _get_default_special_tokens();
+  }
+}
+
+} // namespace
+
+std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
+  return std::make_unique<Tiktoken>(
+      _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
+}
+
+std::unique_ptr<std::vector<std::string>>
+get_multimodal_special_tokens() {
   auto special_tokens =
       std::make_unique<std::vector<std::string>>(std::vector<std::string>{
           "<|begin_of_text|>",
@@ -72,20 +89,4 @@ get_multimodal_special_tokens() {
   return special_tokens;
 }
 
-std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
-  switch (version) {
-    case Version::Multimodal:
-      return _get_multimodal_special_tokens();
-    default:
-      return _get_default_special_tokens();
-  }
-}
-
-} // namespace
-
-std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
-  return std::make_unique<Tiktoken>(
-      _get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
-}
-
 } // namespace example

examples/models/llama/tokenizer/llama_tiktoken.h

Lines changed: 4 additions & 2 deletions
@@ -8,7 +8,7 @@
 
 #pragma once
 
-#include <executorch/extension/llm/tokenizer/tiktoken.h>
+#include <pytorch/tokenizers/tiktoken.h>
 
 namespace example {
 
@@ -17,7 +17,9 @@ enum class Version {
   Multimodal,
 };
 
-std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama(
+std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama(
    Version version = Version::Default);
 
+std::unique_ptr<std::vector<std::string>> get_multimodal_special_tokens();
+
 } // namespace example
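With get_multimodal_special_tokens() now part of the public header, callers can build a multimodal tokenizer without going through the factory. A sketch, assuming the ::tokenizers::Tiktoken constructor takes (special_tokens, bos_index, eos_index) as the .cpp above does; the indices 0 and 1 match the values used in the test change further down:

#include <memory>

#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>

std::unique_ptr<::tokenizers::Tiktoken> make_multimodal_tokenizer() {
  // Equivalent to example::get_tiktoken_for_llama(example::Version::Multimodal),
  // but built directly from the newly exported special-token list.
  return std::make_unique<::tokenizers::Tiktoken>(
      example::get_multimodal_special_tokens(),
      /* bos_token_index */ 0,
      /* eos_token_index */ 1);
}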

examples/models/llama/tokenizer/targets.bzl

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@ def define_common_targets():
             "llama_tiktoken.h",
         ],
         exported_deps = [
-            "//executorch/extension/llm/tokenizer:tiktoken",
+            "//pytorch/tokenizers:tiktoken",
+            "//executorch/extension/llm/tokenizer:tiktoken",  # TODO: remove
         ],
         visibility = [
             "@EXECUTORCH_CLIENTS",

examples/models/llama/tokenizer/test/test_tiktoken.cpp

Lines changed: 2 additions & 3 deletions
@@ -10,7 +10,7 @@
 
 #include <vector>
 
-#include <executorch/runtime/platform/runtime.h>
+#include <executorch/extension/llm/tokenizer/tiktoken.h>
 
 #include <gtest/gtest.h>
 
@@ -36,8 +36,7 @@ static std::string get_resource_path(const std::string& name) {
 class MultimodalTiktokenV5ExtensionTest : public Test {
  public:
   void SetUp() override {
-    executorch::runtime::runtime_init();
-    tokenizer_ = get_tiktoken_for_llama(Version::Multimodal);
+    tokenizer_ = std::make_unique<executorch::extension::llm::Tiktoken>(example::get_multimodal_special_tokens(), 0, 1);
     modelPath_ = get_resource_path("test_tiktoken_tokenizer.model");
   }

examples/models/llava/runner/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
@@ -29,7 +29,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 set(_llava_runner__srcs
     "${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp"
     "${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp"
-    "${EXECUTORCH_ROOT}/extension/llm/tokenizer/bpe_tokenizer.cpp"
+    "${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/llama2c_tokenizer.cpp"
 )
 
 # extension llm runner lib
@@ -47,5 +47,6 @@ set(llava_runner_deps executorch extension_data_loader extension_llm_runner
 target_link_libraries(llava_runner PUBLIC ${llava_runner_deps})
 
 target_include_directories(
-  llava_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
+  llava_runner INTERFACE ${_common_include_directories}
+                         ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
 )

extension/llm/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -51,5 +51,5 @@ target_link_libraries(extension_llm_runner PUBLIC ${runner_deps})
 
 target_include_directories(
   extension_llm_runner INTERFACE ${_common_include_directories}
-                                 ${EXECUTORCH_ROOT}
+                                 ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
 )

extension/llm/runner/multimodal_runner.h

Lines changed: 2 additions & 2 deletions
@@ -26,8 +26,8 @@
 #include <executorch/extension/llm/runner/text_prefiller.h>
 #include <executorch/extension/llm/runner/text_token_generator.h>
 #include <executorch/extension/llm/sampler/sampler.h>
-#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <executorch/extension/module/module.h>
+#include <pytorch/tokenizers/tokenizer.h>
 
 namespace executorch {
 namespace extension {
@@ -129,7 +129,7 @@ class ET_EXPERIMENTAL MultimodalRunner {
   std::unique_ptr<ImagePrefiller> image_prefiller_;
   std::unique_ptr<TextTokenGenerator> text_token_generator_;
   std::string tokenizer_path_;
-  std::unique_ptr<Tokenizer> tokenizer_;
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
 
   // stats
   Stats stats_;

extension/llm/runner/targets.bzl

Lines changed: 2 additions & 2 deletions
@@ -49,7 +49,7 @@ def define_common_targets():
         ],
         exported_deps = [
             ":text_decoder_runner" + aten_suffix,
-            "//executorch/extension/llm/tokenizer:tokenizer_header",
+            "//pytorch/tokenizers:headers",
             "//executorch/extension/module:module" + aten_suffix,
             "//executorch/extension/tensor:tensor" + aten_suffix,
         ],
@@ -63,7 +63,7 @@ def define_common_targets():
         ],
         exported_deps = [
             ":text_decoder_runner" + aten_suffix,
-            "//executorch/extension/llm/tokenizer:tokenizer_header",
+            "//pytorch/tokenizers:headers",
             "//executorch/extension/module:module" + aten_suffix,
             "//executorch/extension/tensor:tensor" + aten_suffix,
         ],

extension/llm/runner/text_prefiller.h

Lines changed: 0 additions & 2 deletions
@@ -12,8 +12,6 @@
 #pragma once
 
 #include <executorch/extension/llm/runner/text_decoder_runner.h>
-#include <executorch/extension/llm/tokenizer/tokenizer.h>
-#include <functional>
 
 namespace executorch {
 namespace extension {

extension/llm/runner/text_token_generator.h

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,37 @@
1111

1212
#include <executorch/extension/llm/runner/stats.h>
1313
#include <executorch/extension/llm/runner/text_decoder_runner.h>
14-
#include <executorch/extension/llm/tokenizer/tokenizer.h>
14+
#include <pytorch/tokenizers/tokenizer.h>
1515
#include <executorch/extension/tensor/tensor.h>
1616

17+
#define ET_UNWRAP_TOKENIZER(result__) \
18+
({ \
19+
auto tk_result__ = (result__); \
20+
if (!tk_result__.ok()) { \
21+
ET_LOG(Error, "Tokenizers error code %d", static_cast<uint32_t>(tk_result__.error())); \
22+
return ::executorch::runtime::Error::InvalidArgument; \
23+
} \
24+
std::move(*tk_result__); \
25+
})
26+
27+
#define ET_CHECK_TK_OK_OR_RETURN_ERROR(result__, ...) \
28+
({ \
29+
auto tk_result__ = (result__); \
30+
if (tk_result__ != ::tokenizers::Error::Ok) { \
31+
ET_LOG(Error, "Tokenizer error: %d", static_cast<uint32_t>(tk_result__)); \
32+
ET_LOG(Error, __VA_ARGS__); \
33+
return ::executorch::runtime::Error::InvalidArgument; \
34+
} \
35+
})
36+
1737
namespace executorch {
1838
namespace extension {
1939
namespace llm {
2040

2141
class ET_EXPERIMENTAL TextTokenGenerator {
2242
public:
2343
TextTokenGenerator(
24-
Tokenizer* tokenizer,
44+
::tokenizers::Tokenizer* tokenizer,
2545
TextDecoderRunner* text_decoder_runner,
2646
bool use_kv_cache,
2747
std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids,
@@ -106,7 +126,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
106126
}
107127

108128
// print the token as string, decode it with the Tokenizer object
109-
token_callback(ET_UNWRAP(tokenizer_->decode(prev_token, cur_token)));
129+
token_callback(ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
110130

111131
if (should_stop_) {
112132
break;
@@ -130,7 +150,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
130150
}
131151

132152
private:
133-
Tokenizer* tokenizer_;
153+
::tokenizers::Tokenizer* tokenizer_;
134154
TextDecoderRunner* text_decoder_runner_;
135155
std::unique_ptr<std::unordered_set<uint64_t>> eos_ids_;
136156
bool use_kv_cache_;
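The two new macros are the bridge between ::tokenizers error types and ::executorch::runtime::Error: ET_CHECK_TK_OK_OR_RETURN_ERROR checks a plain ::tokenizers::Error, while ET_UNWRAP_TOKENIZER unwraps a ::tokenizers::Result<T> or returns early, mapping the failure to Error::InvalidArgument. A hedged sketch of a call site (the echo_first_token function is illustrative, not part of this commit; load/encode/decode signatures follow their uses in runner.cpp above):

#include <string>
#include <vector>

#include <executorch/extension/llm/runner/text_token_generator.h>
#include <pytorch/tokenizers/tokenizer.h>

::executorch::runtime::Error echo_first_token(
    ::tokenizers::Tokenizer* tokenizer,
    const std::string& path,
    const std::string& prompt) {
  // load() returns a plain ::tokenizers::Error, so use the CHECK macro.
  ET_CHECK_TK_OK_OR_RETURN_ERROR(
      tokenizer->load(path), "Failed to load tokenizer at %s", path.c_str());

  // encode()/decode() return ::tokenizers::Result<T>; the UNWRAP macro
  // yields the value or returns Error::InvalidArgument early.
  std::vector<uint64_t> tokens =
      ET_UNWRAP_TOKENIZER(tokenizer->encode(prompt, /* bos */ 0, /* eos */ 0));
  if (tokens.empty()) {
    return ::executorch::runtime::Error::InvalidArgument;
  }
  std::string piece =
      ET_UNWRAP_TOKENIZER(tokenizer->decode(tokens[0], tokens[0]));
  ET_LOG(Info, "%s", piece.c_str());
  return ::executorch::runtime::Error::Ok;
}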

install_executorch.py

Lines changed: 1 addition & 0 deletions
@@ -66,6 +66,7 @@ def clean():
         "pthreadpool": "CMakeLists.txt",
         "pybind11": "CMakeLists.txt",
         "shim": "BUCK",
+        "tokenizers": "CMakeLists.txt",
         "XNNPACK": "CMakeLists.txt",
     }
 

shim_et/xplat/executorch/build/env_interface.bzl

Lines changed: 12 additions & 6 deletions
@@ -10,6 +10,8 @@ load(":type_defs.bzl", "is_list", "is_tuple")
 
 _ET_TARGET_PREFIX = "executorch"
 
+_TOKENIZER_TARGET_PREFIX = "pytorch/tokenizers"
+
 # Indicates that an external_dep entry should fall through to the underlying
 # buck rule.
 _EXTERNAL_DEP_FALLTHROUGH = "<fallthrough>"
@@ -46,7 +48,6 @@ _EXTERNAL_DEPS = {
     "re2": "//extension/llm/tokenizers/third-party:re2",
     "sentencepiece": [],  # Intentionally not supporting OSS buck build of sentencepiece.
     "sentencepiece-py": [],
-    "tiktoken": "//extension/llm/tokenizers:tiktoken",
     # Core C++ PyTorch functionality like Tensor and ScalarType.
     "torch-core-cpp": "//third-party:libtorch",
     "torchgen": "//third-party:torchgen",
@@ -66,10 +67,11 @@ def _resolve_external_dep(name):
         return [res]
 
 def _start_with_et_targets(target):
-    prefix = "//" + _ET_TARGET_PREFIX
-    for suffix in ("/", ":"):
-        if target.startswith(prefix + suffix):
-            return True
+    for prefix in [_ET_TARGET_PREFIX, _TOKENIZER_TARGET_PREFIX]:
+        prefix = "//" + prefix
+        for suffix in ("/", ":"):
+            if target.startswith(prefix + suffix):
+                return True
     return False
 
 def _patch_platforms(kwargs):
@@ -199,7 +201,11 @@ def _target_needs_patch(target):
     return _start_with_et_targets(target) or target.startswith(":")
 
 def _patch_target_for_env(target):
-    return target.replace("//executorch/", "//", 1)
+    if _ET_TARGET_PREFIX in target:
+        return target.replace("//executorch/", "//", 1)
+    elif _TOKENIZER_TARGET_PREFIX in target:
+        return target.replace("//pytorch/tokenizers", "//extension/llm/tokenizers", 1)
+    return target
 
 def _struct_to_json(object):
     # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode.
