Skip to content

Commit 2b5ca16

Browse files
authored
Use external hf_tokenizer in llama runner (#9112)
### Summary Use https://github.com/pytorch-labs/tokenizers huggingface tokenizer in the Llama runner. Results on Qwen2.5 with `extension/llm/tokenizers` checked out to pytorch-labs/tokenizers#50: ``` Once upon a time, there was a little girl named Lily. She was very happy. She had a big garden in the back of her house. She planted many flowers in it. They were red, yellow and blue. They were very pretty. Lily loved them very much. One day, she was watering them. Suddenly, she heard a noise. It was a noise in the tree. She looked up. There was a big bird in the tree. It was eating one of Lily's flowers. Lily was very angry. She ran to the tree. "Hello!" she said to the bird. "What are you doing in my I 00:00:08.624959 executorch:runner.cpp:294] RSS after finishing text generation: 2147.121094 MiB (0 if unsupported) PyTorchObserver {"prompt_tokens":4,"generated_tokens":123,"model_load_start_ms":1744936315023,"model_load_end_ms":1744936318524,"inference_start_ms":1744936318524,"inference_end_ms":1744936323646,"prompt_eval_end_ms":1744936318580,"first_token_ms":1744936318580,"aggregate_sampling_time_ms":274877907025,"SCALING_FACTOR_UNITS_PER_SECOND":1000} I 00:00:08.625019 executorch:stats.h:106] Prompt Tokens: 4 Generated Tokens: 123 I 00:00:08.625021 executorch:stats.h:112] Model Load Time: 3.501000 (seconds) I 00:00:08.625023 executorch:stats.h:119] Total inference time: 5.122000 (seconds) Rate: 24.014057 (tokens/second) I 00:00:08.625033 executorch:stats.h:129] Prompt evaluation: 0.056000 (seconds) Rate: 71.428571 (tokens/second) I 00:00:08.625038 executorch:stats.h:138] Generated 123 tokens: 5.066000 (seconds) Rate: 24.279510 (tokens/second) I 00:00:08.625045 executorch:stats.h:149] Time to first generated token: 0.056000 (seconds) I 00:00:08.625047 executorch:stats.h:155] Sampling time over 127 tokens: 274877907.025000 (seconds) ``` ### Test plan Build llama runner locally (note the inclusion of `-DSUPPORT_REGEX_LOOKAHEAD=ON`): ``` cmake 
-DPYTHON_EXECUTABLE=python \ -DCMAKE_INSTALL_PREFIX=cmake-out \ -DCMAKE_BUILD_TYPE=Release \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON \ -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \ -DSUPPORT_REGEX_LOOKAHEAD=ON \ -Bcmake-out/examples/models/llama \ examples/models/llama cmake --build cmake-out/examples/models/llama -j16 --config Release ``` Run on Qwen2.5: ``` cmake-out/examples/models/llama/llama_main --model_path=qwen2_5.pte --tokenizer_path ~/hf/models--Qwen--Qwen2.5-1.5B/snapshots/8faed761d45a263340a0528343f099c05c9a4323/tokenizer.json --prompt="Once upon a time" --temperature 0 ```
1 parent 5a643b3 commit 2b5ca16

File tree

7 files changed

+67
-36
lines changed

7 files changed

+67
-36
lines changed

examples/models/llama/runner/CMakeLists.txt

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -41,14 +41,6 @@ target_include_directories(
4141
extension_module INTERFACE ${_common_include_directories}
4242
)
4343

44-
list(
45-
APPEND _llama_runner__srcs
46-
${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/tiktoken.cpp
47-
)
48-
list(APPEND _llama_runner__srcs
49-
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp
50-
)
51-
5244
if(CMAKE_TOOLCHAIN_IOS
5345
OR ANDROID
5446
OR APPLE
@@ -60,23 +52,8 @@ else()
6052
add_library(llama_runner SHARED ${_llama_runner__srcs})
6153
endif()
6254

63-
# find RE2 for tokenizer, build tiktoken
64-
set(ABSL_ENABLE_INSTALL ON)
65-
set(ABSL_PROPAGATE_CXX_STD ON)
66-
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
67-
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
68-
add_subdirectory(
69-
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/abseil-cpp
70-
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
71-
)
72-
add_subdirectory(
73-
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/re2
74-
${CMAKE_CURRENT_BINARY_DIR}/re2
75-
)
76-
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
77-
7855
set(llama_runner_deps executorch extension_data_loader extension_module
79-
extension_tensor re2::re2
56+
extension_tensor
8057
)
8158

8259
target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})
@@ -85,6 +62,17 @@ target_include_directories(
8562
llama_runner
8663
INTERFACE ${_common_include_directories}
8764
)
65+
66+
# Include tokenizers dependency
67+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
68+
add_subdirectory(
69+
${EXECUTORCH_ROOT}/extension/llm/tokenizers
70+
${CMAKE_CURRENT_BINARY_DIR}/tokenizers
71+
)
72+
target_link_libraries(
73+
llama_runner PUBLIC tokenizers
74+
)
75+
8876
target_include_directories(
8977
llama_runner
9078
PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include

examples/models/llama/runner/runner.cpp

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <executorch/extension/llm/runner/util.h>
1818

1919
#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
20+
#include <pytorch/tokenizers/hf_tokenizer.h>
2021
#include <pytorch/tokenizers/llama2c_tokenizer.h>
2122

2223
namespace example {
@@ -36,6 +37,29 @@ static constexpr auto kMaxContextLen = "get_max_context_len";
3637
static constexpr auto kVocabSize = "get_vocab_size";
3738
static constexpr auto kUseKVCache = "use_kv_cache";
3839
static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
40+
41+
std::unique_ptr<::tokenizers::Tokenizer> load_tokenizer(
42+
const std::string& tokenizer_path) {
43+
auto json_tokenizer = std::make_unique<tokenizers::HFTokenizer>();
44+
if (json_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
45+
ET_LOG(Info, "Loaded json tokenizer");
46+
return json_tokenizer;
47+
}
48+
49+
auto tiktoken_tokenizer = get_tiktoken_for_llama();
50+
if (tiktoken_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
51+
ET_LOG(Info, "Loaded TikToken tokenizer");
52+
return tiktoken_tokenizer;
53+
}
54+
55+
auto bpe_tokenizer = std::make_unique<::tokenizers::Llama2cTokenizer>();
56+
if (bpe_tokenizer->load(tokenizer_path) == ::tokenizers::Error::Ok) {
57+
ET_LOG(Info, "Loaded BPE tokenizer");
58+
return bpe_tokenizer;
59+
}
60+
61+
return nullptr;
62+
}
3963
} // namespace
4064

4165
Runner::Runner(
@@ -87,25 +111,23 @@ Error Runner::load() {
87111
return Error::Ok;
88112
}
89113
ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
90-
// load tokenizer. Assuming tiktoken is the default tokenizer
91-
tokenizer_ = nullptr;
92-
tokenizer_ = get_tiktoken_for_llama();
93-
::tokenizers::Error err = tokenizer_->load(tokenizer_path_);
94-
// Rely on tiktoken to throw error if the artifact is incompatible. Then we
95-
// fallback to BPE tokenizer.
96-
if (err != ::tokenizers::Error::Ok) {
114+
115+
// Load tokenizer.
116+
tokenizer_ = load_tokenizer(tokenizer_path_);
117+
if (tokenizer_ == nullptr) {
97118
ET_LOG(
98119
Info,
99120
"Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
100121
tokenizer_path_.c_str());
101122
tokenizer_.reset();
102123
// @lint-ignore CLANGTIDY facebook-hte-Deprecated
103124
tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
104-
err = tokenizer_->load(tokenizer_path_);
125+
auto err = tokenizer_->load(tokenizer_path_);
105126
ET_CHECK_TK_OK_OR_RETURN_ERROR(
106127
err,
107128
"Failed to load %s as a llama2.c tokenizer artifact",
108129
tokenizer_path_.c_str());
130+
return ::executorch::runtime::Error::InvalidArgument;
109131
}
110132

111133
ET_LOG(Info, "Reading metadata from model");

examples/models/llama/runner/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ def define_common_targets():
4949
"//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
5050
"//executorch/examples/models/llama/tokenizer:tiktoken",
5151
"//pytorch/tokenizers:llama2c_tokenizer",
52+
"//pytorch/tokenizers:hf_tokenizer",
5253
] + (_get_operator_lib(aten)) + ([
5354
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
5455
# Therefore enable it explicitly for now to avoid failing tests

examples/qualcomm/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ find_package(gflags REQUIRED)
3535
set(_common_compile_options -Wno-deprecated-declarations -fPIC)
3636

3737
# Let files say "include <executorch/path/to/header.h>".
38-
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
38+
set(_common_include_directories ${EXECUTORCH_ROOT}/.. ${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/json/single_include)
3939

4040
#
4141
# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
@@ -67,6 +67,9 @@ target_include_directories(
6767
PUBLIC
6868
${_common_include_directories}
6969
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/include
70+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/json/single_include
71+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/llama.cpp-unicode/include
72+
${CMAKE_CURRENT_SOURCE_DIR}/../../extension/llm/tokenizers/third-party/llama.cpp-unicode/src
7073
)
7174

7275
# find RE2 for tokenizer

examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,12 @@ list(PREPEND _qaihub_llama2_7b_runner__srcs
2727
# build qaihub llama2 7b runner
2828
add_executable(qaihub_llama2_7b_runner ${_qaihub_llama2_7b_runner__srcs})
2929
target_include_directories(
30-
qaihub_llama2_7b_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include
30+
qaihub_llama2_7b_runner PUBLIC
31+
${_common_include_directories}
32+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include
33+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/json/single_include
34+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/include
35+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/src
3136
)
3237
target_link_libraries(
3338
qaihub_llama2_7b_runner
@@ -69,7 +74,12 @@ list(
6974
# build qaihub llama3 8b runner
7075
add_executable(qaihub_llama3_8b_runner ${_qaihub_llama3_8b_runner__srcs})
7176
target_include_directories(
72-
qaihub_llama3_8b_runner PUBLIC ${_common_include_directories} ${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include
77+
qaihub_llama3_8b_runner PUBLIC
78+
${_common_include_directories}
79+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/include
80+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/json/single_include
81+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/include
82+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/third-party/llama.cpp-unicode/src
7383
)
7484

7585
target_link_libraries(

extension/llm/runner/CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ set(runner_deps executorch extension_data_loader extension_module
4949

5050
target_link_libraries(extension_llm_runner PUBLIC ${runner_deps})
5151

52+
target_include_directories(
53+
extension_llm_runner
54+
PUBLIC
55+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/llama.cpp-unicode/include
56+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/third-party/llama.cpp-unicode/src
57+
)
58+
5259
target_include_directories(
5360
extension_llm_runner INTERFACE ${_common_include_directories}
5461
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include

extension/llm/tokenizers

0 commit comments

Comments
 (0)