Migrate users of llm tokenizer to use pytorch-labs/tokenizers #9114

Merged
merged 1 commit on Mar 17, 2025
8 changes: 8 additions & 0 deletions .ci/scripts/utils.sh
@@ -20,6 +20,14 @@ clean_executorch_install_folders() {
./install_executorch.sh --clean
}

update_tokenizers_git_submodule() {
echo "Updating tokenizers git submodule..."
git submodule update --init
pushd extension/llm/tokenizers
git submodule update --init
popd
}

install_executorch() {
which pip
# Install executorch, this assumes that Executorch is checked out in the
8 changes: 4 additions & 4 deletions examples/mediatek/CMakeLists.txt
@@ -137,18 +137,18 @@ if(${ANDROID})
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})

# Build tokenizers
set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizer)
set(LLAMA2_TOKENIZER_DIR ${EXTENSIONS_LLM_DIR}/tokenizers)
add_library(tokenizer STATIC)
target_include_directories(
tokenizer PUBLIC ${_common_include_directories} ${THIRD_PARTY_ABSL_DIR}
${THIRD_PARTY_RE2_DIR}
${THIRD_PARTY_RE2_DIR} ${LLAMA2_TOKENIZER_DIR}/include
)
target_link_libraries(tokenizer PRIVATE re2::re2)
target_sources(
tokenizer
PRIVATE
${LLAMA2_TOKENIZER_DIR}/tiktoken.cpp
${LLAMA2_TOKENIZER_DIR}/bpe_tokenizer.cpp
${LLAMA2_TOKENIZER_DIR}/src/tiktoken.cpp
${LLAMA2_TOKENIZER_DIR}/src/llama2c_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../models/llama/tokenizer/llama_tiktoken.cpp
)

@@ -68,8 +68,8 @@
#include "llama_runner/llm_helper/include/llm_types.h"

#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/tiktoken.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/tiktoken.h>

// Llama model options
DEFINE_uint64(
@@ -140,10 +140,10 @@ using example::utils::read_file;
using example::utils::split;
using example::utils::Timer;
using example::utils::to_string;
using executorch::extension::llm::BPETokenizer;
using executorch::extension::llm::Tokenizer;
using executorch::runtime::Error;
using executorch::runtime::Result;
using tokenizers::Llama2cTokenizer;
using tokenizers::Tokenizer;

LlamaModelOptions get_model_options() {
LlamaModelOptions options = {
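Note (reviewer orientation, not part of the diff): the mechanical renames applied throughout this PR map the old ExecuTorch tokenizer headers and types onto pytorch-labs/tokenizers. A minimal C++ sketch of the new spellings, using only names that appear in the hunks of this PR:

#include <pytorch/tokenizers/llama2c_tokenizer.h> // replaces executorch/extension/llm/tokenizer/bpe_tokenizer.h
#include <pytorch/tokenizers/tiktoken.h>          // replaces executorch/extension/llm/tokenizer/tiktoken.h
#include <pytorch/tokenizers/tokenizer.h>         // replaces executorch/extension/llm/tokenizer/tokenizer.h

// Old ExecuTorch type                       ->  pytorch-labs/tokenizers type
// executorch::extension::llm::BPETokenizer  ->  tokenizers::Llama2cTokenizer
// executorch::extension::llm::Tiktoken      ->  tokenizers::Tiktoken
// executorch::extension::llm::Tokenizer     ->  tokenizers::Tokenizer
using ::tokenizers::Llama2cTokenizer;
using ::tokenizers::Tiktoken;
using ::tokenizers::Tokenizer;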
6 changes: 3 additions & 3 deletions examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -14,8 +14,8 @@
#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
#include <executorch/extension/llm/runner/irunner.h>
#include <executorch/extension/llm/runner/stats.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/tiktoken.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/tiktoken.h>
#include <cstdint>
#include <functional>
#include <memory>
@@ -28,9 +28,9 @@ using Stats = ::executorch::llm::Stats;
using example::LlamaModelOptions;
using example::LlamaModelPaths;
using example::LlamaRuntime;
using executorch::extension::llm::Tokenizer;
using executorch::runtime::Error;
using executorch::runtime::Result;
using tokenizers::Tokenizer;

class MTKLlamaRunner : public executorch::extension::llm::IRunner {
public:
6 changes: 5 additions & 1 deletion examples/models/llama/CMakeLists.txt
@@ -209,7 +209,11 @@ if(CMAKE_BUILD_TYPE STREQUAL "Release")
endif()
endif()

target_include_directories(llama_main PUBLIC ${_common_include_directories})
target_include_directories(
llama_main
PUBLIC ${_common_include_directories}
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
)
target_link_libraries(llama_main PUBLIC llama_runner ${link_libraries})
target_compile_options(llama_main PUBLIC ${_common_compile_options})

9 changes: 6 additions & 3 deletions examples/models/llama/runner/CMakeLists.txt
@@ -43,7 +43,7 @@ target_include_directories(

list(
APPEND _llama_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/tiktoken.cpp
)
list(APPEND _llama_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../tokenizer/llama_tiktoken.cpp
@@ -83,7 +83,10 @@ target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})

target_include_directories(
llama_runner
INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
INTERFACE ${_common_include_directories}
)
target_include_directories(
llama_runner
PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
)
target_compile_options(llama_runner PUBLIC ${_preprocessor_flag})
21 changes: 13 additions & 8 deletions examples/models/llama/runner/runner.cpp
@@ -16,7 +16,7 @@
#include <executorch/extension/llm/runner/util.h>

#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>

namespace example {

@@ -78,17 +78,21 @@ Error Runner::load() {
// load tokenizer. Assuming tiktoken is the default tokenizer
tokenizer_ = nullptr;
tokenizer_ = get_tiktoken_for_llama();
Error err = tokenizer_->load(tokenizer_path_);
::tokenizers::Error err = tokenizer_->load(tokenizer_path_);
// Rely on tiktoken to throw error if the artifact is incompatible. Then we
// fallback to BPE tokenizer.
if (err == Error::InvalidArgument) {
if (err != ::tokenizers::Error::Ok) {
ET_LOG(
Info,
"Failed to load %s as a Tiktoken artifact, trying BPE tokenizer",
tokenizer_path_.c_str());
tokenizer_.reset();
tokenizer_ = std::make_unique<llm::BPETokenizer>();
tokenizer_->load(tokenizer_path_);
tokenizer_ = std::make_unique<::tokenizers::Llama2cTokenizer>();
err = tokenizer_->load(tokenizer_path_);
ET_CHECK_TK_OK_OR_RETURN_ERROR(
err,
"Failed to load %s as a llama2.c tokenizer artifact",
tokenizer_path_.c_str());
}

ET_LOG(Info, "Reading metadata from model");
@@ -201,12 +205,12 @@ Error Runner::generate(
? seq_len
: metadata_.at(kMaxSeqLen);

Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
prompt,
/* bos */ 0,
/* eos */ 0);

ET_CHECK_OK_OR_RETURN_ERROR(
ET_CHECK_TK_OK_OR_RETURN_ERROR(
encode_res.error(), "Failed to encode prompt %s", prompt.c_str());

// encode the (string) prompt into tokens sequence
@@ -242,7 +246,8 @@
uint64_t cur_token = prefill_res.get();

// print the first token from prefill. No prev_token so use cur_token for it.
wrapped_callback(ET_UNWRAP(tokenizer_->decode(cur_token, cur_token)));
wrapped_callback(
ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
RUNNER_ET_LOG(
warmup,
"RSS after prompt prefill: %f MiB (0 if unsupported)",
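Note (reviewer sketch, not part of the diff): the load/encode error handling above now goes through ::tokenizers::Error and ::tokenizers::Result rather than executorch::runtime::Error/Result, with ET_CHECK_TK_OK_OR_RETURN_ERROR and ET_UNWRAP_TOKENIZER bridging back into the runner's own error type. A self-contained sketch of the same pattern; the tokenizer path is hypothetical, and the ok()/get() accessors on Result are assumed to mirror ExecuTorch's Result (the hunk itself only uses error() and the macros):

#include <memory>
#include <string>
#include <vector>

#include <pytorch/tokenizers/llama2c_tokenizer.h>
#include <pytorch/tokenizers/tokenizer.h>

int main() {
  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
      std::make_unique<::tokenizers::Llama2cTokenizer>();

  // load() now reports ::tokenizers::Error instead of executorch::runtime::Error.
  ::tokenizers::Error err = tokenizer->load("/path/to/tokenizer.bin"); // hypothetical path
  if (err != ::tokenizers::Error::Ok) {
    return 1; // Runner::load() falls back to another tokenizer or returns an error here.
  }

  // encode() now returns ::tokenizers::Result<std::vector<uint64_t>>.
  ::tokenizers::Result<std::vector<uint64_t>> encoded =
      tokenizer->encode("hello world", /* bos */ 0, /* eos */ 0);
  if (!encoded.ok()) { // assumed accessor, mirroring executorch::runtime::Result
    return 1;
  }
  std::vector<uint64_t> tokens = encoded.get(); // assumed accessor
  return tokens.empty() ? 1 : 0;
}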
4 changes: 2 additions & 2 deletions examples/models/llama/runner/runner.h
@@ -23,8 +23,8 @@
#include <executorch/extension/llm/runner/text_decoder_runner.h>
#include <executorch/extension/llm/runner/text_prefiller.h>
#include <executorch/extension/llm/runner/text_token_generator.h>
#include <executorch/extension/llm/tokenizer/tokenizer.h>
#include <executorch/extension/module/module.h>
#include <pytorch/tokenizers/tokenizer.h>

namespace example {

@@ -58,7 +58,7 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
// model
std::unique_ptr<::executorch::extension::Module> module_;
std::string tokenizer_path_;
std::unique_ptr<::executorch::extension::llm::Tokenizer> tokenizer_;
std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
std::unordered_map<std::string, int64_t> metadata_;
std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
text_decoder_runner_;
2 changes: 1 addition & 1 deletion examples/models/llama/runner/targets.bzl
@@ -48,7 +48,7 @@ def define_common_targets():
"//executorch/runtime/core/exec_aten:lib" + aten_suffix,
"//executorch/runtime/core/exec_aten/util:tensor_util" + aten_suffix,
"//executorch/examples/models/llama/tokenizer:tiktoken",
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
"//pytorch/tokenizers:llama2c_tokenizer",
] + (_get_operator_lib(aten)) + ([
# Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
# Therefore enable it explicitly for now to avoid failing tests
37 changes: 18 additions & 19 deletions examples/models/llama/tokenizer/llama_tiktoken.cpp
@@ -10,7 +10,7 @@

namespace example {

using ::executorch::extension::llm::Tiktoken;
using ::tokenizers::Tiktoken;

namespace {
static constexpr int32_t kSpecialTokensSize = 256;
@@ -42,8 +42,23 @@ _get_default_special_tokens() {
return special_tokens;
}

static inline std::unique_ptr<std::vector<std::string>>
_get_multimodal_special_tokens() {
std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
switch (version) {
case Version::Multimodal:
return get_multimodal_special_tokens();
default:
return _get_default_special_tokens();
}
}

} // namespace

std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
return std::make_unique<Tiktoken>(
_get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
}

std::unique_ptr<std::vector<std::string>> get_multimodal_special_tokens() {
auto special_tokens =
std::make_unique<std::vector<std::string>>(std::vector<std::string>{
"<|begin_of_text|>",
@@ -72,20 +87,4 @@ _get_multimodal_special_tokens() {
return special_tokens;
}

std::unique_ptr<std::vector<std::string>> _get_special_tokens(Version version) {
switch (version) {
case Version::Multimodal:
return _get_multimodal_special_tokens();
default:
return _get_default_special_tokens();
}
}

} // namespace

std::unique_ptr<Tiktoken> get_tiktoken_for_llama(Version version) {
return std::make_unique<Tiktoken>(
_get_special_tokens(version), kBOSTokenIndex, kEOSTokenIndex);
}

} // namespace example
6 changes: 4 additions & 2 deletions examples/models/llama/tokenizer/llama_tiktoken.h
@@ -8,7 +8,7 @@

#pragma once

#include <executorch/extension/llm/tokenizer/tiktoken.h>
#include <pytorch/tokenizers/tiktoken.h>

namespace example {

@@ -17,7 +17,9 @@ enum class Version {
Multimodal,
};

std::unique_ptr<::executorch::extension::llm::Tiktoken> get_tiktoken_for_llama(
std::unique_ptr<::tokenizers::Tiktoken> get_tiktoken_for_llama(
Version version = Version::Default);

std::unique_ptr<std::vector<std::string>> get_multimodal_special_tokens();

} // namespace example
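Note (usage sketch, not part of the diff): get_tiktoken_for_llama() now returns the pytorch-labs ::tokenizers::Tiktoken, and the multimodal special-token list is exposed so callers (such as the updated test below) can build a tokenizer themselves. A minimal example; the model path is hypothetical:

#include <memory>

#include <executorch/examples/models/llama/tokenizer/llama_tiktoken.h>

std::unique_ptr<::tokenizers::Tiktoken> make_multimodal_tiktoken() {
  // Factory from this header; BOS/EOS token indices are supplied by the implementation.
  auto tiktoken = example::get_tiktoken_for_llama(example::Version::Multimodal);
  ::tokenizers::Error err = tiktoken->load("/path/to/tokenizer.model"); // hypothetical path
  if (err != ::tokenizers::Error::Ok) {
    return nullptr;
  }
  return tiktoken;
}

// get_multimodal_special_tokens() is now public; the updated test uses it to construct
// an executorch::extension::llm::Tiktoken directly with BOS index 0 and EOS index 1.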
3 changes: 2 additions & 1 deletion examples/models/llama/tokenizer/targets.bzl
@@ -15,7 +15,8 @@ def define_common_targets():
"llama_tiktoken.h",
],
exported_deps = [
"//executorch/extension/llm/tokenizer:tiktoken",
"//pytorch/tokenizers:tiktoken",
"//executorch/extension/llm/tokenizer:tiktoken", # TODO: remove
],
visibility = [
"@EXECUTORCH_CLIENTS",
6 changes: 3 additions & 3 deletions examples/models/llama/tokenizer/test/test_tiktoken.cpp
@@ -10,7 +10,7 @@

#include <vector>

#include <executorch/runtime/platform/runtime.h>
#include <executorch/extension/llm/tokenizer/tiktoken.h>

#include <gtest/gtest.h>

@@ -36,8 +36,8 @@ static std::string get_resource_path(const std::string& name) {
class MultimodalTiktokenV5ExtensionTest : public Test {
public:
void SetUp() override {
executorch::runtime::runtime_init();
tokenizer_ = get_tiktoken_for_llama(Version::Multimodal);
tokenizer_ = std::make_unique<executorch::extension::llm::Tiktoken>(
example::get_multimodal_special_tokens(), 0, 1);
modelPath_ = get_resource_path("test_tiktoken_tokenizer.model");
}

5 changes: 3 additions & 2 deletions examples/models/llava/runner/CMakeLists.txt
@@ -29,7 +29,7 @@ set(_common_include_directories ${EXECUTORCH_ROOT}/..)
set(_llava_runner__srcs
"${CMAKE_CURRENT_SOURCE_DIR}/llava_runner.cpp"
"${EXECUTORCH_ROOT}/extension/llm/sampler/sampler.cpp"
"${EXECUTORCH_ROOT}/extension/llm/tokenizer/bpe_tokenizer.cpp"
"${EXECUTORCH_ROOT}/extension/llm/tokenizers/src/llama2c_tokenizer.cpp"
)

# extension llm runner lib
@@ -47,5 +47,6 @@ set(llava_runner_deps executorch extension_data_loader extension_llm_runner
target_link_libraries(llava_runner PUBLIC ${llava_runner_deps})

target_include_directories(
llava_runner INTERFACE ${_common_include_directories} ${EXECUTORCH_ROOT}
llava_runner INTERFACE ${_common_include_directories}
${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
)
6 changes: 3 additions & 3 deletions examples/models/llava/runner/llava_runner.cpp
@@ -13,7 +13,7 @@
#include <executorch/examples/models/llava/runner/llava_image_prefiller.h>
#include <executorch/examples/models/llava/runner/llava_runner.h>
#include <executorch/examples/models/llava/runner/llava_text_decoder_runner.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#include <pytorch/tokenizers/llama2c_tokenizer.h>

#include <ctime>
#include <memory>
@@ -43,7 +43,7 @@ Error LlavaRunner::load() {
stats_.model_load_start_ms = llm::time_in_ms();

// Load the tokenizer
tokenizer_ = std::make_unique<llm::BPETokenizer>();
tokenizer_ = std::make_unique<tokenizers::Llama2cTokenizer>();
tokenizer_->load(tokenizer_path_);

// Load the text decoder runner
@@ -90,7 +90,7 @@ Result<uint64_t> LlavaRunner::prefill_prompt(
int8_t bos,
int8_t eos) {
std::vector<uint64_t> prompt_tokens =
ET_UNWRAP(tokenizer_->encode(prompt, bos, eos));
ET_UNWRAP_TOKENIZER(tokenizer_->encode(prompt, bos, eos));

return text_prefiller_->prefill(prompt_tokens, start_pos);
}
2 changes: 1 addition & 1 deletion examples/models/llava/runner/targets.bzl
@@ -14,7 +14,6 @@ def define_common_targets():
exported_deps = [
"//executorch/backends/xnnpack:xnnpack_backend",
"//executorch/extension/llm/runner:runner_lib",
"//executorch/extension/llm/tokenizer:bpe_tokenizer",
"//executorch/extension/evalue_util:print_evalue",
"//executorch/extension/module:module",
"//executorch/extension/tensor:tensor",
@@ -23,5 +22,6 @@ def define_common_targets():
"//executorch/runtime/core/exec_aten/util:tensor_util",
"//executorch/configurations:optimized_native_cpu_ops",
"//executorch/extension/llm/custom_ops:custom_ops",
"//pytorch/tokenizers:llama2c_tokenizer",
],
)
3 changes: 2 additions & 1 deletion examples/models/phi-3-mini/CMakeLists.txt
@@ -41,11 +41,12 @@ add_executable(
phi_3_mini_runner
main.cpp runner.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizer/bpe_tokenizer.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/src/llama2c_tokenizer.cpp
)
target_include_directories(
phi_3_mini_runner
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizers/include
)
target_link_libraries(
phi_3_mini_runner PRIVATE executorch extension_module_static extension_tensor