Commit 70a84b6

larryliu0820 authored and facebook-github-bot committed
Buckify tokenizers (#17)
Summary:
X-link: pytorch/executorch#8408
Pull Request resolved: #17

So that it can be used by ET internally.

Reviewed By: jackzhxng

Differential Revision: D69509028
1 parent f2fc3d6 commit 70a84b6

17 files changed (+314, -45 lines)

TARGETS

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+# Any targets that should be shared between fbcode and xplat must be defined in
+# targets.bzl. This file can contain fbcode-only targets.
+
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()

include/detail/bpe_tokenizer_base.h

Lines changed: 4 additions & 0 deletions
@@ -5,11 +5,15 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT

 // Base class for all BPE tokenizer implementations
 #pragma once

 // Standard
+#include <memory>
+#include <optional>
+#include <string>
 #include <unordered_map>
 #include <vector>

include/pre_tokenizer.h

Lines changed: 4 additions & 0 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once

 // Standard

@@ -41,6 +43,8 @@ class PreTokenizer {
   */
  virtual std::vector<std::string> pre_tokenize(
      re2::StringPiece input) const = 0;
+
+  virtual ~PreTokenizer() = default;
 }; // end class PreTokenizer

 // -- Factory ------------------------------------------------------------------

include/sentencepiece.h

Lines changed: 2 additions & 1 deletion
@@ -5,8 +5,9 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT

-// A tokenizer that works with sentencepiece.
+// A tokenizer that works with sentencepiece. Used by Llama2.
 #pragma once

 #include <memory>

include/token_decoder.h

Lines changed: 5 additions & 0 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #pragma once

 // Standard

@@ -45,6 +47,9 @@ class TokenDecoder {
   */
  virtual std::string decode(re2::StringPiece token) const = 0;

+  // virtual destructor
+  virtual ~TokenDecoder() = default;
+
 }; // end class TokenDecoder

 // -- Factory ------------------------------------------------------------------

src/bpe_tokenizer_base.cpp

Lines changed: 7 additions & 5 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "detail/bpe_tokenizer_base.h"

 // Standard

@@ -56,7 +58,7 @@ static std::vector<uint64_t> _byte_pair_merge(
     if (rank) {
       // usize::MAX is a sentinel value and cannot be a valid rank
       if (*rank == _max_size()) {
-        fprintf(stderr, "at %" PRIu32 " rank is too large\n", i);
+        TK_LOG(Error, "at %" PRIu32 " rank is too large\n", i);
       }
       parts[i].second = *rank;
     }

@@ -177,8 +179,8 @@ BPETokenizerBase::encode_with_special_token_(
     } catch (const std::out_of_range&) {
       // Should never go here, since special pattern includes all special
       // chars.
-      fprintf(stderr, "unknown special token: %s\n", special->c_str());
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown special token: %s\n", special->c_str());
+      return Error::EncodeFailure;
     }

     tokens.push_back(token);

@@ -259,8 +261,8 @@ Result<std::string> BPETokenizerBase::decode(uint64_t prev, uint64_t cur)
     if (iter != special_token_decoder_.end()) {
       token_bytes = iter->second;
     } else {
-      fprintf(stderr, "unknown token: %" PRIu64 "\n", cur);
-      exit(EXIT_FAILURE);
+      TK_LOG(Error, "unknown token: %" PRIu64 "\n", cur);
+      return Error::DecodeFailure;
     }
   }
   _decode(token_bytes, ret);
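The net effect of this file's changes is that malformed input no longer kills the process: encode_with_special_token_ now reports Error::EncodeFailure and decode reports Error::DecodeFailure through TK_LOG instead of fprintf plus exit(EXIT_FAILURE). A minimal call-site sketch, assuming the Result type exposes ok()/get() accessors in the style of ExecuTorch's runtime Result (the accessor names are not shown in this diff):

#include <cstdint>
#include <string>

#include "detail/bpe_tokenizer_base.h"

// Hypothetical caller, not part of this commit: with Error returns instead
// of exit(EXIT_FAILURE), an unknown token becomes recoverable.
std::string decode_or_empty(
    tokenizers::BPETokenizerBase& tok, uint64_t prev, uint64_t cur) {
  auto piece = tok.decode(prev, cur);  // Result<std::string>
  if (!piece.ok()) {
    return "";  // e.g. Error::DecodeFailure; the embedder decides what to do
  }
  return piece.get();
}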

src/hf_tokenizer.cpp

Lines changed: 6 additions & 0 deletions
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "hf_tokenizer.h"

 // Standard

@@ -256,7 +258,11 @@ void HFTokenizer::_decode(re2::StringPiece input, std::string& ret) const {
   if (_decoder) {
     ret += _decoder->decode(input);
   } else {
+#ifdef _USE_INTERNAL_STRING_VIEW
+    ret += input.as_string();
+#else
     ret += input;
+#endif
   }
 }
src/tiktoken.cpp

Lines changed: 4 additions & 0 deletions
@@ -183,7 +183,11 @@ Error Tiktoken::_encode(
 }

 void Tiktoken::_decode(re2::StringPiece input, std::string& ret) const {
+#ifdef _USE_INTERNAL_STRING_VIEW
+  ret += input.as_string();
+#else
   ret += input;
+#endif
 }

 template <typename T>
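HFTokenizer::_decode and Tiktoken::_decode now carry the same preprocessor guard, which exists because the internal (-D_USE_INTERNAL_STRING_VIEW) StringPiece is a string_view-like type without re2::StringPiece's implicit std::string append. A sketch of the shared pattern as a standalone helper (illustrative only; the helper and the header path are assumptions, not part of the commit):

#include <string>

#include <re2/stringpiece.h>

// Guarded append used verbatim in both _decode overrides above.
static void append_piece(re2::StringPiece input, std::string& ret) {
#ifdef _USE_INTERNAL_STRING_VIEW
  ret += input.as_string();  // internal type needs an explicit conversion
#else
  ret += input;  // OSS re2::StringPiece appends directly
#endif
}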

src/token_decoder.cpp

Lines changed: 3 additions & 1 deletion
@@ -5,6 +5,8 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT
+
 #include "token_decoder.h"

 // Standard

@@ -60,7 +62,7 @@ static std::string format(const char* fmt, ...) {
   int size = vsnprintf(NULL, 0, fmt, ap);
   // GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
   std::vector<char> buf(size + 1);
-  int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+  // int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
   // GGML_ASSERT(size2 == size);
   va_end(ap2);
   va_end(ap);

targets.bzl

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+load("@fbsource//tools/build_defs:glob_defs.bzl", "subdir_glob")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+def define_common_targets():
+    """Defines targets that should be shared between fbcode and xplat.
+
+    The directory containing this targets.bzl file should also contain both
+    TARGETS and BUCK files that call this function.
+    """
+
+    runtime.cxx_library(
+        name = "headers",
+        exported_headers = subdir_glob([
+            ("include", "*.h"),
+            ("include", "**/*.h"),
+        ]),
+        header_namespace = "",
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "sentencepiece",
+        srcs = [
+            "src/sentencepiece.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        deps = [
+            "fbsource//third-party/sentencepiece:sentencepiece",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "tiktoken",
+        srcs = [
+            "src/tiktoken.cpp",
+            "src/bpe_tokenizer_base.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "unicode",
+        srcs = [
+            "third-party/llama.cpp-unicode/src/unicode.cpp",
+            "third-party/llama.cpp-unicode/src/unicode-data.cpp",
+        ],
+        exported_headers = subdir_glob([
+            ("third-party/llama.cpp-unicode/include", "*.h"),
+        ]),
+        header_namespace = "",
+    )
+
+    runtime.cxx_library(
+        name = "hf_tokenizer",
+        srcs = [
+            "src/hf_tokenizer.cpp",
+            "src/bpe_tokenizer_base.cpp",
+            "src/pre_tokenizer.cpp",
+            "src/token_decoder.cpp",
+        ],
+        exported_deps = [
+            ":headers",
+            ":unicode",
+        ],
+        visibility = [
+            "@EXECUTORCH_CLIENTS",
+        ],
+        compiler_flags = [
+            "-D_USE_INTERNAL_STRING_VIEW",
+        ],
+        exported_external_deps = [
+            "re2",
+            "nlohmann_json",
+        ],
+    )
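With these rules in place, any Buck target covered by the @EXECUTORCH_CLIENTS visibility group can depend on the tokenizers like any other cxx_library. A hypothetical client rule (the rule name, source file, and label prefix below are made up for illustration; substitute the cell/path where this library actually lives):

runtime.cxx_library(
    name = "tokenizer_user",  # hypothetical client target
    srcs = ["tokenizer_user.cpp"],
    deps = [
        # Illustrative label; use the real package path in your repo.
        "//pytorch/tokenizers:hf_tokenizer",
    ],
)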

test/resources/test_bpe_tokenizer.bin

16 Bytes (binary file not shown)
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+tet 0

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+ICAgICAgIA== 18446744073709551616

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+ICAgICAgIA==10

test/test_sentencepiece.cpp

Lines changed: 20 additions & 4 deletions
@@ -5,12 +5,28 @@
  * This source code is licensed under the BSD-style license found in the
  * LICENSE file in the root directory of this source tree.
  */
+// @lint-ignore-every LICENSELINT

-#include "gtest/gtest.h"
+#ifdef TOKENIZERS_FB_BUCK
+#include <TestResourceUtils/TestResourceUtils.h>
+#endif
+#include <gtest/gtest.h>
 #include "sentencepiece.h"

 namespace tokenizers {

+namespace {
+static inline std::string _get_resource_path(const std::string& name) {
+#ifdef TOKENIZERS_FB_BUCK
+  return facebook::xplat::testing::getPathForTestResource(
+      "test/resources/" + name);
+#else
+  return std::getenv("RESOURCES_PATH") + std::string("/") + name;
+#endif
+}
+
+} // namespace
+
 TEST(SPTokenizerTest, TestEncodeWithoutLoad) {
   SPTokenizer tokenizer;
   std::string text = "Hello world!";

@@ -26,7 +42,7 @@ TEST(SPTokenizerTest, TestDecodeWithoutLoad) {

 TEST(SPTokenizerTest, TestLoad) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
 }

@@ -39,7 +55,7 @@ TEST(SPTokenizerTest, TestLoadInvalidPath) {

 TEST(SPTokenizerTest, TestEncode) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
   std::string text = "Hello world!";

@@ -54,7 +70,7 @@ TEST(SPTokenizerTest, TestEncode) {

 TEST(SPTokenizerTest, TestDecode) {
   SPTokenizer tokenizer;
-  auto path = RESOURCES_PATH + std::string("/test_sentencepiece.model");
+  auto path = _get_resource_path("test_sentencepiece.model");
   auto error = tokenizer.load(path);
   EXPECT_EQ(error, Error::Ok);
   std::vector<uint64_t> tokens = {1, 15043, 3186, 29991};
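The new _get_resource_path helper is what lets the same test build under both systems: under Buck (TOKENIZERS_FB_BUCK) resources resolve through TestResourceUtils, while the OSS branch now reads RESOURCES_PATH from the environment at run time instead of receiving it as a compile-time definition. One practical consequence: std::getenv returns a null pointer when the variable is unset, so OSS test runners need RESOURCES_PATH exported (pointing at the test/resources directory) before the binary starts.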
