Buckify tokenizers

larryliu0820 · web-flow · commit 03744cee5896 · 2025-02-13T21:08:13.000-08:00
Differential Revision: D69509028 Pull Request resolved: #17
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -32,25 +32,27 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece)
 set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})
 
 file(GLOB tokenizers_source_files ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp)
-file(GLOB unicode_source_files ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
-add_library(tokenizers STATIC ${tokenizers_source_files} ${unicode_source_files})
+file(GLOB unicode_source_files
+     ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/src/*.cpp)
+add_library(tokenizers STATIC ${tokenizers_source_files}
+                              ${unicode_source_files})
 
 # Using abseil from sentencepiece/third_party
 target_include_directories(
-  tokenizers PUBLIC
-  ${CMAKE_CURRENT_SOURCE_DIR}/include
-  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
-  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
-  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
-  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
-  ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
+  tokenizers
+  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece/src
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
+         ${CMAKE_CURRENT_SOURCE_DIR}/third-party/llama.cpp-unicode/include)
 
 target_link_libraries(tokenizers PUBLIC sentencepiece-static re2::re2)
 
 # Build test
 if(TOKENIZERS_BUILD_TEST)
-    enable_testing()
-    include(FetchContent)
+  enable_testing()
+  include(FetchContent)
   # CMAKE
   FetchContent_Declare(
     googletest
@@ -63,20 +65,22 @@ if(TOKENIZERS_BUILD_TEST)
   FetchContent_MakeAvailable(googletest)
 
   file(GLOB test_source_files ${CMAKE_CURRENT_SOURCE_DIR}/test/test_*.cpp)
+
+  set(test_env "RESOURCES_PATH=${CMAKE_CURRENT_SOURCE_DIR}/test/resources")
   foreach(test_source_file ${test_source_files})
-      get_filename_component(test_name ${test_source_file} NAME_WE)
-      message(STATUS "Configuring unit test ${test_name}")
-      add_executable(${test_name} ${test_source_file})
-      target_include_directories(${test_name} PRIVATE
-        GTEST_INCLUDE_PATH
-        ${CMAKE_CURRENT_SOURCE_DIR}/include
-        ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
-        ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
-        ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include
-        )
-      target_link_libraries(${test_name} gtest_main tokenizers)
-      target_compile_definitions(${test_name} PRIVATE RESOURCES_PATH="${CMAKE_CURRENT_SOURCE_DIR}/test/resources")
-      add_test(${test_name} "${test_name}")
+    get_filename_component(test_name ${test_source_file} NAME_WE)
+    message(STATUS "Configuring unit test ${test_name}")
+    add_executable(${test_name} ${test_source_file})
+    target_include_directories(
+      ${test_name}
+      PRIVATE GTEST_INCLUDE_PATH
+              ${CMAKE_CURRENT_SOURCE_DIR}/include
+              ${CMAKE_CURRENT_SOURCE_DIR}/third-party/sentencepiece
+              ${CMAKE_CURRENT_SOURCE_DIR}/third-party/re2
+              ${CMAKE_CURRENT_SOURCE_DIR}/third-party/json/single_include)
+    target_link_libraries(${test_name} gtest_main tokenizers)
+    add_test(${test_name} "${test_name}")
+    set_tests_properties(${test_name} PROPERTIES ENVIRONMENT ${test_env})
   endforeach()
 endif()
 
diff --git a/TARGETS b/TARGETS
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
diff --git a/include/detail/bpe_tokenizer_base.h b/include/detail/bpe_tokenizer_base.h
@@ -5,11 +5,15 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
 
 // Base class for all BPE tokenizer implementations
 #pragma once
 
 // Standard
+#include <memory>
+#include <optional>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
diff --git a/include/pre_tokenizer.h b/include/pre_tokenizer.h
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once
 
 // Standard
@@ -41,6 +43,8 @@ class PreTokenizer {
    */
   virtual std::vector<std::string> pre_tokenize(
       re2::StringPiece input) const = 0;
+
+  virtual ~PreTokenizer() = default;
 }; // end class PreTokenizer
 
 // -- Factory ------------------------------------------------------------------
diff --git a/include/sentencepiece.h b/include/sentencepiece.h
@@ -5,8 +5,9 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
 
-// A tokenizer that works with sentencepiece.
+// A tokenizer that works with sentencepiece. Used by Llama2.
 #pragma once
 
 #include <memory>
diff --git a/include/token_decoder.h b/include/token_decoder.h
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once
 
 // Standard
@@ -45,6 +47,9 @@ class TokenDecoder {
    */
   virtual std::string decode(re2::StringPiece token) const = 0;
 
+  // Virtual destructor: allows deletion through a TokenDecoder pointer
+  virtual ~TokenDecoder() = default;
+
 }; // end class TokenDecoder
 
 // -- Factory ------------------------------------------------------------------
diff --git a/src/bpe_tokenizer_base.cpp b/src/bpe_tokenizer_base.cpp
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "detail/bpe_tokenizer_base.h"
 
 // Standard
@@ -56,7 +58,7 @@ static std::vector<uint64_t> _byte_pair_merge(
     if (rank) {
       // usize::MAX is a sentinel value and cannot be a valid rank
       if (*rank == _max_size()) {
-        fprintf(stderr, "at %" PRIu32 " rank is too large\n", i);
+        TK_LOG(Error, "at %" PRIu32 " rank is too large\n", i);
       }
       parts[i].second = *rank;
     }
@@ -177,8 +179,8 @@ BPETokenizerBase::encode_with_special_token_(
       } catch (const std::out_of_range&) {
         // Should never go here, since special pattern includes all special
         // chars.
-        fprintf(stderr, "unknown special token: %s\n", special->c_str());
-        exit(EXIT_FAILURE);
+        TK_LOG(Error, "unknown special token: %s\n", special->c_str());
+        return Error::EncodeFailure;
       }
 
       tokens.push_back(token);
@@ -259,8 +261,8 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
     if (iter != special_token_decoder_.end()) {
       token_bytes = iter->second;
     } else {
-      fprintf(stderr, "unknown token: %" PRIu64 "\n", cur);
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown token: %" PRIu64 "\n", cur);
+      return Error::DecodeFailure;
     }
   }
   _decode(token_bytes, ret);
diff --git a/src/hf_tokenizer.cpp b/src/hf_tokenizer.cpp
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "hf_tokenizer.h"
 
 // Standard
@@ -127,17 +129,17 @@ Error HFTokenizer::load(const std::string& path) {
   // If a tokenizer config file is found, parse it to look up the eos/bos tokens
   if (!model_config_json.empty()) {
     // Load it and parse it as json
-    std::ifstream file(model_config_json);
-    if (!file) {
+    std::ifstream config_file(model_config_json);
+    if (!config_file) {
       fprintf(stderr, "failed to open encoder file: %s\n", path.c_str());
       return Error::LoadFailure;
     }
-    std::string contents(
-        (std::istreambuf_iterator<char>(file)),
+    std::string config_contents(
+        (std::istreambuf_iterator<char>(config_file)),
         std::istreambuf_iterator<char>());
-    json parsed_json;
+    json parsed_config_json;
     try {
-      parsed_json = json::parse(contents);
+      parsed_config_json = json::parse(config_contents);
     } catch (const json::exception& e) {
       std::cout << "Error parsing model config json json file: " << e.what()
                 << std::endl;
@@ -146,8 +148,8 @@ Error HFTokenizer::load(const std::string& path) {
 
     // Pull out the token strings
     try {
-      const std::string bos_token = parsed_json.at("bos_token");
-      const std::string eos_token = parsed_json.at("eos_token");
+      const std::string bos_token = parsed_config_json.at("bos_token");
+      const std::string eos_token = parsed_config_json.at("eos_token");
       const auto& bos_it = special_token_encoder_.find(bos_token);
       const auto& eos_it = special_token_encoder_.find(eos_token);
       if (bos_it == special_token_encoder_.end()) {
@@ -256,7 +258,9 @@ void HFTokenizer::_decode(re2::StringPiece input, std::string& ret) const {
   if (_decoder) {
     ret += _decoder->decode(input);
   } else {
-    ret += input;
+    // Append the raw bytes through data()/size() so no preprocessor switch
+    // between string-view flavours and no temporary std::string is needed.
+    ret.append(input.data(), input.size());
   }
 }
 
diff --git a/src/pre_tokenizer.cpp b/src/pre_tokenizer.cpp
@@ -129,7 +129,7 @@ namespace {
 
 // Standard GPT2 regex
 // https://github.com/openai/gpt-2/blob/master/src/encoder.py#L53
-static const std::string GPT2_EXPR =
+constexpr char GPT2_EXPR[] =
     R"('s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+)";
 
 } // namespace
diff --git a/src/tiktoken.cpp b/src/tiktoken.cpp
@@ -183,7 +183,9 @@ Error Tiktoken::_encode(
 }
 
 void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {
-  ret += input;
+  // Append the raw bytes through data()/size() so no preprocessor switch
+  // between string-view flavours and no temporary std::string is needed.
+  ret.append(input.data(), input.size());
 }
 
 template <typename T>
diff --git a/src/token_decoder.cpp b/src/token_decoder.cpp
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "token_decoder.h"
 
 // Standard
@@ -60,7 +62,9 @@ static std::string format(const char* fmt, ...) {
   int size = vsnprintf(NULL, 0, fmt, ap);
   // GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
   std::vector<char> buf(size + 1);
-  int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+  // Second pass: this call is what writes the formatted text into buf, so it
+  // must stay. Only the unused return value is dropped (cast to void).
+  (void)vsnprintf(buf.data(), size + 1, fmt, ap2);
   // GGML_ASSERT(size2 == size);
   va_end(ap2);
   va_end(ap);
diff --git a/targets.bzl b/targets.bzl
@@ -0,0 +1,115 @@
+load("@fbsource//tools/build_defs:glob_defs.bzl", "subdir_glob")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    # Public headers under include/, exported with an empty header namespace.
+    runtime.cxx_library(
+        name = "headers",
+        exported_headers = subdir_glob([
+            ("include", "*.h"),
+            ("include", "**/*.h"),
+        ]),
+        header_namespace = "",
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "sentencepiece",
+        srcs = [
+            "src/sentencepiece.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        external_deps = [
+            "sentencepiece",
+        ],
+    )
+
+    # Shared BPE base, compiled once so that a binary depending on both
+    # :tiktoken and :hf_tokenizer does not get two copies of its symbols.
+    runtime.cxx_library(
+        name = "bpe_tokenizer_base",
+        srcs = [
+            "src/bpe_tokenizer_base.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "tiktoken",
+        srcs = [
+            "src/tiktoken.cpp",
+        ],
+        exported_deps = [
+            ":bpe_tokenizer_base",
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "unicode",
+        srcs = [
+            "third-party/llama.cpp-unicode/src/unicode.cpp",
+            "third-party/llama.cpp-unicode/src/unicode-data.cpp",
+        ],
+        exported_headers = subdir_glob([
+            ("third-party/llama.cpp-unicode/include", "*.h"),
+        ]),
+        header_namespace = "",
+    )
+
+    runtime.cxx_library(
+        name = "hf_tokenizer",
+        srcs = [
+            "src/hf_tokenizer.cpp",
+            "src/pre_tokenizer.cpp",
+            "src/token_decoder.cpp",
+        ],
+        exported_deps = [
+            ":bpe_tokenizer_base",
+            ":headers",
+            ":unicode",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+            "nlohmann_json",
+        ],
+    )
diff --git a/test/resources/test_bpe_tokenizer.bin b/test/resources/test_bpe_tokenizer.bin
diff --git a/test/resources/test_tiktoken_invalid_base64.model b/test/resources/test_tiktoken_invalid_base64.model
@@ -0,0 +1 @@
+tet 0
diff --git a/test/resources/test_tiktoken_invalid_rank.model b/test/resources/test_tiktoken_invalid_rank.model
@@ -0,0 +1 @@
+ICAgICAgIA== 18446744073709551616
diff --git a/test/resources/test_tiktoken_no_space.model b/test/resources/test_tiktoken_no_space.model
@@ -0,0 +1 @@
+ICAgICAgIA==10
diff --git a/test/resources/test_tiktoken_tokenizer.model b/test/resources/test_tiktoken_tokenizer.model
diff --git a/test/test_pre_tokenizer.cpp b/test/test_pre_tokenizer.cpp
@@ -19,7 +19,7 @@ using namespace tokenizers;
 
 // Helpers /////////////////////////////////////////////////////////////////////
 
-void assert_split_match(
+static void assert_split_match(
     const PreTokenizer& ptok,
     const std::string& prompt,
     const std::vector<std::string>& expected) {
diff --git a/test/test_sentencepiece.cpp b/test/test_sentencepiece.cpp
diff --git a/test/test_tiktoken.cpp b/test/test_tiktoken.cpp
diff --git a/third-party/llama.cpp-unicode/include/unicode-data.h b/third-party/llama.cpp-unicode/include/unicode-data.h
diff --git a/third-party/llama.cpp-unicode/src/unicode-data.cpp b/third-party/llama.cpp-unicode/src/unicode-data.cpp
diff --git a/third-party/llama.cpp-unicode/src/unicode.cpp b/third-party/llama.cpp-unicode/src/unicode.cpp

Original file line number	Diff line number	Diff line change
`@@ -183,7 +183,11 @@ Error Tiktoken::_encode(`
`183`	`183`	`}`
`184`	`184`
`185`	`185`	`void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {`
	`186`	`+#ifdef _USE_INTERNAL_STRING_VIEW`
	`187`	`+ ret += input.as_string();`
	`188`	`+#else`
`186`	`189`	`ret += input;`
	`190`	`+#endif`
`187`	`191`	`}`
`188`	`192`
`189`	`193`	`template <typename T>`