Skip to content

Commit 70675c3

Browse files
larryliu0820 authored and facebook-github-bot committed
Migrate users of llm tokenizer to use pytorch-labs/tokenizers
Summary: Finally migrate llm tokenizer usages to pytorch-labs/tokenizers. Differential Revision: D70932091
1 parent cf8ce89 commit 70675c3

File tree

13 files changed

+47
-27
lines changed

13 files changed

+47
-27
lines changed

examples/models/llama/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ target_include_directories(
4343

4444
list(
4545
APPEND _llama_runner__srcs
46-
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
46+
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizers/src/tiktoken.cpp
4747
)
4848
list(APPEND _llama_runner__srcs
4949
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp

examples/models/llama/runner/runner.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include <executorch/extension/llm/runner/util.h>
1717

1818
#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
19-
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
19+
#include <pytorch/tokenizers/llama2c_tokenizer.h>
2020

2121
namespace example {
2222

@@ -78,16 +78,16 @@ Error Runner::load() {
7878
// load tokenizer. Assuming tiktoken is the default tokenizer
7979
tokenizer_ = nullptr;
8080
tokenizer_ = get_tiktoken_for_llama();
81-
Error err = tokenizer_->load(tokenizer_path_);
81+
::tokenizers::Error err = tokenizer_->load(tokenizer_path_);
8282
// Rely on tiktoken to throw error if the artifact is incompatible. Then we
8383
// fallback to BPE tokenizer.
84-
if (err == Error::InvalidArgument) {
84+
if (err == ::tokenizers::Error::LoadFailure) {
8585
ET_LOG(
8686
Info,
8787
"Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
8888
tokenizer_path_.c_str());
8989
tokenizer_.reset();
90-
tokenizer_ = std::make_unique<llm::BPETokenizer>();
90+
tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
9191
tokenizer_->load(tokenizer_path_);
9292
}
9393

@@ -201,12 +201,12 @@ Error Runner::generate(
201201
? seq_len
202202
: metadata_.at(kMaxSeqLen);
203203

204-
Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
204+
::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
205205
prompt,
206206
/* bos */ 0,
207207
/* eos */ 0);
208208

209-
ET_CHECK_OK_OR_RETURN_ERROR(
209+
ET_CHECK_TK_OK_OR_RETURN_ERROR(
210210
encode_res.error(), "Failed to encode prompt %s", prompt.c_str());
211211

212212
// encode the (string) prompt into tokens sequence
@@ -242,7 +242,7 @@ Error Runner::generate(
242242
uint64_t cur_token = prefill_res.get();
243243

244244
// print the first token from prefill. No prev_token so use cur_token for it.
245-
wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
245+
wrapped_callback(ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
246246
RUNNER_ET_LOG(
247247
warmup,
248248
"RSS after prompt prefill: %f MiB (0 if unsupported)",

examples/models/llama/runner/runner.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
#include <executorch/extension/llm/runner/text_decoder_runner.h>
2424
#include <executorch/extension/llm/runner/text_prefiller.h>
2525
#include <executorch/extension/llm/runner/text_token_generator.h>
26-
#include <executorch/extension/llm/tokenizer/tokenizer.h>
26+
#include <pytorch/tokenizers/tokenizer.h>
2727
#include <executorch/extension/module/module.h>
2828

2929
namespace example {
@@ -58,7 +58,7 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
5858
// model
5959
std::unique_ptr<::executorch::extension::Module> module_;
6060
std::string tokenizer_path_;
61-
std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_;
61+
std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
6262
std::unordered_map<std::string, int64_t> metadata_;
6363
std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
6464
text_decoder_runner_;

examples/models/llama/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def define_common_targets():
4848
"//executorch/runtime/core/exec_aten:lib" + aten_suffix,
4949
"//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
5050
"//executorch/examples/models/llama/tokenizer:tiktoken",
51-
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
51+
"//pytorch/tokenizers:llama2c_tokenizer",
5252
] + (_get_operator_lib(aten)) + ([
5353
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
5454
# Therefore enable it explicitly for now to avoid failing tests

examples/models/llama/tokenizer/llama_tiktoken.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
namespace example {
1212

13-
using ::executorch::extension::llm::Tiktoken;
13+
using ::tokenizers::Tiktoken;
1414

1515
namespace {
1616
static constexpr int32_t kSpecialTokensSize = 256;

examples/models/llama/tokenizer/llama_tiktoken.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#pragma once
1010

11-
#include <executorch/extension/llm/tokenizer/tiktoken.h>
11+
#include <pytorch/tokenizers/tiktoken.h>
1212

1313
namespace example {
1414

@@ -17,7 +17,7 @@ enum class Version {
1717
Multimodal,
1818
};
1919

20-
std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama(
20+
std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama(
2121
Version version = Version::Default);
2222

2323
} // namespace example

examples/models/llama/tokenizer/targets.bzl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def define_common_targets():
1515
"llama_tiktoken.h",
1616
],
1717
exported_deps = [
18-
"//executorch/extension/llm/tokenizer:tiktoken",
18+
"//pytorch/tokenizers:tiktoken",
1919
],
2020
visibility = [
2121
"@EXECUTORCH_CLIENTS",

examples/models/llava/runner/CMakeLists.txt

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..)
2929
set(_llava_runner__srcs
3030
"${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp"
3131
"${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp"
32-
"${EXECUTORCH_ROOT}/extension/llm/tokenizer/bpe_tokenizer.cpp"
32+
"${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/llama2c_tokenizer.cpp"
3333
)
3434

3535
# extension llm runner lib
@@ -47,5 +47,6 @@ set(llava_runner_deps executorch extension_data_loader extension_llm_runner
4747
target_link_libraries(llava_runner PUBLIC ${llava_runner_deps})
4848

4949
target_include_directories(
50-
llava_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
50+
llava_runner INTERFACE ${_common_include_directories}
51+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
5152
)

extension/llm/runner/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,5 @@ target_link_libraries(extension_llm_runner PUBLIC ${runner_deps})
5151

5252
target_include_directories(
5353
extension_llm_runner INTERFACE ${_common_include_directories}
54-
${EXECUTORCH_ROOT}
54+
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
5555
)

extension/llm/runner/multimodal_runner.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626
#include <executorch/extension/llm/runner/text_prefiller.h>
2727
#include <executorch/extension/llm/runner/text_token_generator.h>
2828
#include <executorch/extension/llm/sampler/sampler.h>
29-
#include <executorch/extension/llm/tokenizer/tokenizer.h>
3029
#include <executorch/extension/module/module.h>
30+
#include <pytorch/tokenizers/tokenizer.h>
3131

3232
namespace executorch {
3333
namespace extension {
@@ -129,7 +129,7 @@ class ET_EXPERIMENTAL MultimodalRunner {
129129
std::unique_ptr<ImagePrefiller> image_prefiller_;
130130
std::unique_ptr<TextTokenGenerator> text_token_generator_;
131131
std::string tokenizer_path_;
132-
std::unique_ptr<Tokenizer> tokenizer_;
132+
std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
133133

134134
// stats
135135
Stats stats_;

extension/llm/runner/targets.bzl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def define_common_targets():
4949
],
5050
exported_deps = [
5151
":text_decoder_runner" + aten_suffix,
52-
"//executorch/extension/llm/tokenizer:tokenizer_header",
52+
"//pytorch/tokenizers:headers",
5353
"//executorch/extension/module:module" + aten_suffix,
5454
"//executorch/extension/tensor:tensor" + aten_suffix,
5555
],
@@ -63,7 +63,7 @@ def define_common_targets():
6363
],
6464
exported_deps = [
6565
":text_decoder_runner" + aten_suffix,
66-
"//executorch/extension/llm/tokenizer:tokenizer_header",
66+
"//pytorch/tokenizers:headers",
6767
"//executorch/extension/module:module" + aten_suffix,
6868
"//executorch/extension/tensor:tensor" + aten_suffix,
6969
],

extension/llm/runner/text_token_generator.h

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,37 @@
1111

1212
#include <executorch/extension/llm/runner/stats.h>
1313
#include <executorch/extension/llm/runner/text_decoder_runner.h>
14-
#include <executorch/extension/llm/tokenizer/tokenizer.h>
14+
#include <pytorch/tokenizers/tokenizer.h>
1515
#include <executorch/extension/tensor/tensor.h>
1616

17+
#define ET_UNWRAP_TOKENIZER(result__) \
18+
({ \
19+
auto tk_result__ = (result__); \
20+
if (!tk_result__.ok()) { \
21+
ET_LOG(Error, "Tokenizers error code %d", static_cast<uint32_t>(tk_result__.error())); \
22+
return ::executorch::runtime::Error::InvalidArgument; \
23+
} \
24+
std::move(*tk_result__); \
25+
})
26+
27+
#define ET_CHECK_TK_OK_OR_RETURN_ERROR(result__, ...) \
28+
({ \
29+
auto tk_result__ = (result__); \
30+
if (tk_result__ != ::tokenizers::Error::Ok) { \
31+
ET_LOG(Error, "Tokenizer error: %d", static_cast<uint32_t>(tk_result__)); \
32+
ET_LOG(Error, __VA_ARGS__); \
33+
return ::executorch::runtime::Error::InvalidArgument; \
34+
} \
35+
})
36+
1737
namespace executorch {
1838
namespace extension {
1939
namespace llm {
2040

2141
class ET_EXPERIMENTAL TextTokenGenerator {
2242
public:
2343
TextTokenGenerator(
24-
Tokenizer* tokenizer,
44+
::tokenizers::Tokenizer* tokenizer,
2545
TextDecoderRunner* text_decoder_runner,
2646
bool use_kv_cache,
2747
std::unique_ptr<std::unordered_set<uint64_t>>&& eos_ids,
@@ -106,7 +126,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
106126
}
107127

108128
// print the token as string, decode it with the Tokenizer object
109-
token_callback(ET_UNWRAP(tokenizer_->decode(prev_token, cur_token)));
129+
token_callback(ET_UNWRAP_TOKENIZER(tokenizer_->decode(prev_token, cur_token)));
110130

111131
if (should_stop_) {
112132
break;
@@ -130,7 +150,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
130150
}
131151

132152
private:
133-
Tokenizer* tokenizer_;
153+
::tokenizers::Tokenizer* tokenizer_;
134154
TextDecoderRunner* text_decoder_runner_;
135155
std::unique_ptr<std::unordered_set<uint64_t>> eos_ids_;
136156
bool use_kv_cache_;

shim_et/xplat/executorch/build/env_interface.bzl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ _EXTERNAL_DEPS = {
4646
"re2": "//extension/llm/tokenizers/third-party:re2",
4747
"sentencepiece": [], # Intentionally not supporting OSS buck build of sentencepiece.
4848
"sentencepiece-py": [],
49-
"tiktoken": "//extension/llm/tokenizers:tiktoken",
5049
# Core C++ PyTorch functionality like Tensor and ScalarType.
5150
"torch-core-cpp": "//third-party:libtorch",
5251
"torchgen": "//third-party:torchgen",

0 commit comments

Comments (0)