
Commit 45db5ce

Move bpe and tiktoken tokenizer into extension/llm
Pull Request resolved: #4271
ghstack-source-id: 233839158
Differential Revision: [D59779781](https://our.internmc.facebook.com/intern/diff/D59779781/)
1 parent 4bd9487 commit 45db5ce

23 files changed (+128,288 -153 lines)
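This commit relocates the shared BPE and tiktoken tokenizer sources from the llama2 example into extension/llm/tokenizer, so the per-file diffs below are mostly include-path and build-target updates; the tokenizer classes and APIs are unchanged. A minimal sketch of the migration for downstream C++ code, mirroring the ET_USE_TIKTOKEN switch in examples/models/llama2/runner/runner.cpp (the path in the comment is the one removed by this commit):

```cpp
// Tokenizer include paths after this commit: the BPE tokenizer is now a
// shared extension, while llama_tiktoken.h stays with the llama2 example
// and internally pulls in <executorch/extension/llm/tokenizer/tiktoken.h>.
#if ET_USE_TIKTOKEN
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#else /* BPE */
// Previously: <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
#endif /* ET_USE_TIKTOKEN */
```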

examples/models/llama2/runner/runner.cpp

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@
 #if ET_USE_TIKTOKEN
 #include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
 #else /* BPE */
-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
 #endif /* ET_USE_TIKTOKEN*/
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

examples/models/llama2/runner/runner.h

Lines changed: 1 addition & 1 deletion
@@ -18,7 +18,7 @@
 #include <unordered_map>

 #include <executorch/examples/models/llama2/sampler/sampler.h>
-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

examples/models/llama2/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ def define_common_targets():
         ] + ([
             "//executorch/examples/models/llama2/tokenizer:tiktoken",
         ] if use_tiktoken() else [
-            "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
+            "//executorch/extension/llm/tokenizer:bpe_tokenizer",
         ]) + (_get_operator_lib(aten)) + ([
             # Vulkan API currently cannot build on some platforms (e.g. Apple, FBCODE)
             # Therefore enable it explicitly for now to avoid failing tests

examples/models/llama2/tokenizer/llama_tiktoken.h

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@

 #pragma once

-#include <executorch/examples/models/llama2/tokenizer/tiktoken.h>
+#include <executorch/extension/llm/tokenizer/tiktoken.h>

 namespace torch {
 namespace executor {
examples/models/llama2/tokenizer/targets.bzl

Lines changed: 7 additions & 27 deletions
@@ -1,44 +1,24 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

 def define_common_targets():
-    runtime.cxx_library(
-        name = "bpe_tokenizer",
-        srcs = [
-            "bpe_tokenizer.cpp",
-        ],
-        exported_headers = [
-            "tokenizer.h",
-            "bpe_tokenizer.h",
-        ],
-        exported_deps = [
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
-        ],
-        visibility = [
-            "@EXECUTORCH_CLIENTS",
-        ],
-    )
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """

     runtime.cxx_library(
         name = "tiktoken",
         srcs = [
-            "tiktoken.cpp",
             "llama_tiktoken.cpp",
         ],
         exported_headers = [
-            "tokenizer.h",
-            "tiktoken.h",
             "llama_tiktoken.h",
-            "base64.h",
         ],
         exported_deps = [
-            "//executorch/runtime/core/exec_aten:lib",
-            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+            "//executorch/extension/llm/tokenizer:tiktoken",
         ],
         visibility = [
-            "@EXECUTORCH_CLIENTS",
-        ],
-        exported_external_deps = [
-            "re2",
+            "//executorch/examples/models/llama2/...",
         ],
     )

examples/models/llama2/tokenizer/test/CMakeLists.txt

Lines changed: 0 additions & 3 deletions
@@ -24,10 +24,7 @@ include(${EXECUTORCH_ROOT}/build/Test.cmake)
 set(
   _tokenizer_test_srcs
   test_tiktoken.cpp
-  test_bpe_tokenizer.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/../llama_tiktoken.cpp
-  ${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
 )

 set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)

examples/models/llama2/tokenizer/test/targets.bzl

Lines changed: 0 additions & 17 deletions
@@ -6,20 +6,6 @@ def define_common_targets():
     The directory containing this targets.bzl file should also contain both
     TARGETS and BUCK files that call this function.
     """
-
-    runtime.cxx_test(
-        name = "test_bpe_tokenizer",
-        srcs = [
-            "test_bpe_tokenizer.cpp",
-        ],
-        deps = [
-            "//executorch/examples/models/llama2/tokenizer:bpe_tokenizer",
-        ],
-        env = {
-            "RESOURCES_PATH": "$(location :resources)/resources",
-        },
-    )
-
     runtime.cxx_test(
         name = "test_tiktoken",
         srcs = [
@@ -31,9 +17,6 @@ def define_common_targets():
         env = {
             "RESOURCES_PATH": "$(location :resources)/resources",
         },
-        external_deps = [
-            "re2",
-        ],
     )

     runtime.filegroup(

examples/models/llama2/tokenizer/test/test_tiktoken.cpp

Lines changed: 0 additions & 94 deletions
@@ -7,7 +7,6 @@
  */

 #include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <gtest/gtest.h>
 #include <vector>
@@ -17,19 +16,6 @@ using namespace ::testing;
 namespace torch {
 namespace executor {

-class TiktokenExtensionTest : public Test {
- public:
-  void SetUp() override {
-    torch::executor::runtime_init();
-    tokenizer_ = get_tiktoken_for_llama();
-    modelPath_ = std::getenv("RESOURCES_PATH") +
-        std::string("/test_tiktoken_tokenizer.model");
-  }
-
-  std::unique_ptr<Tokenizer> tokenizer_;
-  std::string modelPath_;
-};
-
 class MultimodalTiktokenV5ExtensionTest : public Test {
  public:
   void SetUp() override {
@@ -43,24 +29,6 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
   std::string modelPath_;
 };

-TEST_F(TiktokenExtensionTest, EncodeWithoutLoadFails) {
-  Result<std::vector<uint64_t>> res = tokenizer_->encode("hello world", 0, 0);
-  EXPECT_EQ(res.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
-  auto result = tokenizer_->decode(0, 0);
-  EXPECT_EQ(result.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  EXPECT_EQ(tokenizer_->vocab_size(), 128256);
-  EXPECT_EQ(tokenizer_->bos_tok(), 128000);
-  EXPECT_EQ(tokenizer_->eos_tok(), 128001);
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -69,17 +37,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
 }

-TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
-  EXPECT_EQ(out.error(), Error::Ok);
-  EXPECT_EQ(out.get().size(), 3);
-  EXPECT_EQ(out.get()[0], 128000);
-  EXPECT_EQ(out.get()[1], 15339);
-  EXPECT_EQ(out.get()[2], 1917);
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -101,18 +58,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   }
 }

-TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
-  std::vector<uint64_t> tokens = {128000, 15339, 1917};
-  for (size_t i = 0; i < tokens.size(); i++) {
-    Result<std::string> out = tokenizer_->decode(0, tokens[i]);
-    EXPECT_EQ(out.error(), Error::Ok);
-    EXPECT_EQ(out.get(), expected[i]);
-  }
-}
-
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
@@ -134,44 +79,5 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
     EXPECT_EQ(out.get(), expected[i]);
   }
 }
-
-TEST_F(TiktokenExtensionTest, TokenizerDecodeOutOfRangeFails) {
-  Error res = tokenizer_->load(modelPath_.c_str());
-  EXPECT_EQ(res, Error::Ok);
-  // The vocab size is 128256, addes 256 just so the token is out of vocab
-  // range.
-  Result<std::string> out = tokenizer_->decode(0, 128256 + 256);
-  EXPECT_EQ(out.error(), Error::NotSupported);
-}
-
-TEST_F(TiktokenExtensionTest, ConstructionWithInvalidBOSIndex) {
-  // gtest death test doesn't work on iOS:
-  // https://github.com/google/googletest/issues/2834
-#if !GTEST_OS_IOS
-  EXPECT_EXIT(
-      std::make_unique<Tiktoken>(
-          std::make_unique<std::vector<std::string>>(
-              std::vector<std::string>{"<|end_of_text|>"}),
-          1,
-          0),
-      ::testing::KilledBySignal(SIGABRT),
-      "");
-#endif
-}
-
-TEST_F(TiktokenExtensionTest, ConstructionWithInvalidEOSIndex) {
-  // gtest death test doesn't work on iOS:
-  // https://github.com/google/googletest/issues/2834
-#if !GTEST_OS_IOS
-  EXPECT_EXIT(
-      std::make_unique<Tiktoken>(
-          std::make_unique<std::vector<std::string>>(
-              std::vector<std::string>{"<|begin_of_text|>"}),
-          0,
-          1),
-      ::testing::KilledBySignal(SIGABRT),
-      "");
-#endif
-}
 } // namespace executor
 } // namespace torch
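The TiktokenExtensionTest cases removed above move with the tokenizer rather than disappearing, and they document the tokenizer API: load, encode, decode, vocab_size, bos_tok, eos_tok. Below is a minimal standalone sketch assembled from those test cases; the main() wrapper and the literal model path are illustrative (the tests resolve the path from the RESOURCES_PATH environment variable), and only the calls shown in the removed tests are used.

```cpp
// Sketch, not part of the diff: exercises the tokenizer API covered by the
// removed TiktokenExtensionTest cases; the Tiktoken implementation now lives
// under extension/llm/tokenizer.
#include <executorch/examples/models/llama2/tokenizer/llama_tiktoken.h>
#include <executorch/runtime/platform/runtime.h>

#include <cstdio>
#include <memory>
#include <string>
#include <vector>

using namespace torch::executor;

int main() {
  runtime_init();

  std::unique_ptr<Tokenizer> tokenizer = get_tiktoken_for_llama();

  // encode()/decode() fail with Error::NotSupported until load() succeeds.
  Error err = tokenizer->load("test_tiktoken_tokenizer.model");
  if (err != Error::Ok) {
    return 1;
  }

  // The removed TokenizerVocabSizeIsExpected test expects vocab_size() ==
  // 128256, bos_tok() == 128000, eos_tok() == 128001 for this model.
  std::printf("vocab size: %llu\n",
              (unsigned long long)tokenizer->vocab_size());

  // encode(text, n_bos, n_eos); "hello world" with one BOS token yields
  // {128000, 15339, 1917} in the removed TokenizerEncodeCorrectly test.
  Result<std::vector<uint64_t>> ids = tokenizer->encode("hello world", 1, 0);
  if (ids.error() != Error::Ok) {
    return 1;
  }

  // decode(prev_token, token) maps a single token id back to its text piece.
  for (uint64_t id : ids.get()) {
    Result<std::string> piece = tokenizer->decode(0, id);
    if (piece.error() == Error::Ok) {
      std::printf("%llu -> %s\n", (unsigned long long)id, piece.get().c_str());
    }
  }
  return 0;
}
```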

examples/qualcomm/llama2/runner/runner.cpp

Lines changed: 1 addition & 1 deletion
@@ -9,9 +9,9 @@
 // A simple llama2 runner that includes preprocessing and post processing logic.
 // The module takes in a string as input and emits a string as output.

-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #include <executorch/examples/qualcomm/llama2/runner/runner.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

 #include <ctime>

examples/models/llama2/tokenizer/bpe_tokenizer.cpp renamed to extension/llm/tokenizer/bpe_tokenizer.cpp

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */

-#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>

 #include <string>

examples/models/llama2/tokenizer/bpe_tokenizer.h renamed to extension/llm/tokenizer/bpe_tokenizer.h

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@

 #pragma once

-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/extension/llm/tokenizer/tokenizer.h>
 #include <cstdint>

 namespace torch {

extension/llm/tokenizer/targets.bzl

Lines changed: 40 additions & 0 deletions
@@ -37,3 +37,43 @@ def define_common_targets():
             ":tokenizer_py_lib",
         ],
     )
+
+    runtime.cxx_library(
+        name = "bpe_tokenizer",
+        srcs = [
+            "bpe_tokenizer.cpp",
+        ],
+        exported_headers = [
+            "tokenizer.h",
+            "bpe_tokenizer.h",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "tiktoken",
+        srcs = [
+            "tiktoken.cpp",
+        ],
+        exported_headers = [
+            "tokenizer.h",
+            "tiktoken.h",
+            "base64.h",
+        ],
+        exported_deps = [
+            "//executorch/runtime/core/exec_aten:lib",
+            "//executorch/runtime/core/exec_aten/util:scalar_type_util",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        exported_external_deps = [
+            "re2",
+        ],
+    )
extension/llm/tokenizer/test/CMakeLists.txt

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+
+cmake_minimum_required(VERSION 3.19)
+project(tokenizer_test)
+
+# Use C++17 for test.
+set(CMAKE_CXX_STANDARD 17)
+
+set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../../../..)
+
+include(${EXECUTORCH_ROOT}/build/Test.cmake)
+
+set(
+  _tokenizer_test_srcs
+  test_tiktoken.cpp
+  test_bpe_tokenizer.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../tiktoken.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../bpe_tokenizer.cpp
+)
+
+set(ENV{RESOURCES_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/resources)
+set(ABSL_ENABLE_INSTALL ON)
+set(ABSL_PROPAGATE_CXX_STD ON)
+set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp ${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/re2 ${CMAKE_CURRENT_BINARY_DIR}/re2)
+set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
+
+et_cxx_test(
+  tokenizer_test
+  SOURCES
+  ${_tokenizer_test_srcs}
+  EXTRA_LIBS
+  re2::re2
+)
+target_include_directories(
+  tokenizer_test PRIVATE ${CMAKE_INSTALL_PREFIX}/include ${CMAKE_CURRENT_SOURCE_DIR}/../../third-party/abseil-cpp
+)
