Skip to content

Commit 4d1d502

Browse files
larryliu0820 authored and facebook-github-bot committed
Change tokenizer name to bpe_tokenizer and extract a base class (#3009)
Summary: Pull Request resolved: #3009 We want to be able to support more than 1 implementation of tokenizer. Currently `tokenizer.cpp` is adopted from `llama2.c` but we also wanted to support `Tiktoken` (will be added in next PR). This PR extract out a base class `Tokenizer` and make it extendable by different implementations. Reviewed By: mergennachin Differential Revision: D56052583
1 parent 17c64a3 commit 4d1d502

File tree

8 files changed

+75
-36
lines changed

8 files changed

+75
-36
lines changed

docs/source/llm/getting-started.md

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -394,7 +394,7 @@ acceleration and optimization. Any portions of the computation graph not
394394
delegated will be executed by the ExecuTorch operator implementations.
395395

396396
To delegate the exported model to the specific backend, we need to import its
397-
partitioner as well as edge compile config from Executorch Codebase first, then
397+
partitioner as well as edge compile config from ExecuTorch Codebase first, then
398398
call `to_backend` with an instance of partitioner on the `EdgeProgramManager`
399399
object `to_edge` function created.
400400

@@ -482,7 +482,7 @@ target_link_libraries(
482482
```
483483

484484
Keep the rest of the code the same. For more details refer to
485-
[Exporting to Executorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch)
485+
[Exporting to ExecuTorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch)
486486
and
487487
[Invoking the Runtime](https://pytorch.org/executorch/main/llm/getting-started.html#step-2-invoking-the-runtime)
488488
for more details

examples/models/llama2/runner/runner.cpp

Lines changed: 5 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -10,6 +10,7 @@
1010
// The module takes in a string as input and emits a string as output.
1111

1212
#include <executorch/examples/models/llama2/runner/runner.h>
13+
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
1314
#include <executorch/extension/evalue_util/print_evalue.h>
1415
#include <executorch/extension/runner_util/managed_tensor.h>
1516

@@ -76,7 +77,7 @@ Error Runner::load() {
7677
append_eos_ = getMetadataHelper("append_eos_to_prompt", false);
7778

7879
// Load tokenizer
79-
tokenizer_ = std::make_unique<Tokenizer>(vocab_size_, bos_id_, eos_id_);
80+
tokenizer_ = std::make_unique<BPETokenizer>(vocab_size_, bos_id_, eos_id_);
8081
tokenizer_->load(tokenizer_path_);
8182
if (tokenizer_->bos_tok() != bos_id_) {
8283
ET_LOG(
@@ -105,7 +106,7 @@ Error Runner::load() {
105106
}
106107

107108
template <typename T>
108-
T Runner::getMetadataHelper(std::string method_name, T default_val) {
109+
T Runner::getMetadataHelper(const std::string& method_name, T default_val) {
109110
T res = default_val;
110111
if (model_methods_.count(method_name)) {
111112
Result<std::vector<EValue>> outputs = module_->execute(method_name);
@@ -484,9 +485,9 @@ void Runner::stop() {
484485

485486
// explicit instantiation of template methods
486487
template int64_t Runner::getMetadataHelper<int64_t>(
487-
std::string method_name,
488+
const std::string& method_name,
488489
int64_t default_val);
489490
template bool Runner::getMetadataHelper<bool>(
490-
std::string method_name,
491+
const std::string& method_name,
491492
bool default_val);
492493
} // namespace torch::executor

examples/models/llama2/runner/runner.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -69,7 +69,7 @@ class Runner {
6969
private:
7070
// metadata
7171
template <typename T>
72-
T getMetadataHelper(std::string method_name, T default_val);
72+
T getMetadataHelper(const std::string& method_name, T default_val);
7373
template <typename T>
7474
int32_t
7575
logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _);

examples/models/llama2/tokenizer/tokenizer.cpp renamed to examples/models/llama2/tokenizer/bpe_tokenizer.cpp

Lines changed: 10 additions & 10 deletions
Original file line number · Diff line number · Diff line change
@@ -6,7 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
9+
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
1010

1111
#include <string>
1212

@@ -23,11 +23,11 @@ static int compare_tokens(const void* a, const void* b) {
2323
return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
2424
}
2525

26-
Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
27-
: initialized_(false),
28-
vocab_size_(vocab_size),
29-
bos_tok_(bos_tok),
30-
eos_tok_(eos_tok),
26+
BPETokenizer::BPETokenizer(
27+
int32_t vocab_size,
28+
uint64_t bos_tok,
29+
uint64_t eos_tok)
30+
: Tokenizer(vocab_size, bos_tok, eos_tok),
3131
vocab_(std::make_unique<char*[]>(vocab_size)),
3232
vocab_scores_(std::make_unique<float[]>(vocab_size)),
3333
sorted_vocab_(std::make_unique<TokenIndex[]>(vocab_size)) {
@@ -47,7 +47,7 @@ Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
4747
* @param tokenizer_path The path to the tokenizer file.
4848
* @return Error
4949
*/
50-
Error Tokenizer::load(const std::string& tokenizer_path) {
50+
Error BPETokenizer::load(const std::string& tokenizer_path) {
5151
if (initialized_) {
5252
ET_LOG(Info, "Tokenizer already initialized");
5353
return Error::Ok;
@@ -131,7 +131,7 @@ Error Tokenizer::load(const std::string& tokenizer_path) {
131131
return Error::Ok;
132132
}
133133

134-
Tokenizer::~Tokenizer() {
134+
BPETokenizer::~BPETokenizer() {
135135
for (int i = 0; i < vocab_size_; i++) {
136136
delete[] vocab_[i];
137137
}
@@ -145,7 +145,7 @@ Tokenizer::~Tokenizer() {
145145
* @return Result<std::string> A pointer to the string representation of the
146146
* token.
147147
*/
148-
Result<std::string> Tokenizer::decode(uint64_t prev_token, uint64_t token) {
148+
Result<std::string> BPETokenizer::decode(uint64_t prev_token, uint64_t token) {
149149
if (!initialized_) {
150150
ET_LOG(Error, "Tokenizer not initialized");
151151
return Error::NotSupported;
@@ -187,7 +187,7 @@ str_lookup(const char* str, TokenIndex* sorted_vocab, int32_t vocab_size) {
187187
* @return Result<std::vector<uint64_t>>
188188
*/
189189
Result<std::vector<uint64_t>>
190-
Tokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
190+
BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
191191
if (!initialized_) {
192192
ET_LOG(Error, "Tokenizer not initialized");
193193
return Error::NotSupported;
Lines changed: 42 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
12+
#include <cstdint>
13+
14+
namespace torch {
15+
namespace executor {
16+
17+
struct TokenIndex {
18+
const char* str;
19+
int32_t id;
20+
};
21+
22+
class BPETokenizer : public Tokenizer {
23+
public:
24+
explicit BPETokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
25+
~BPETokenizer() override;
26+
27+
Error load(const std::string& tokenizer_path) override;
28+
29+
Result<std::vector<uint64_t>>
30+
encode(const std::string& input, int8_t bos, int8_t eos) override;
31+
32+
Result<std::string> decode(uint64_t prev_token, uint64_t token) override;
33+
34+
private:
35+
std::unique_ptr<char*[]> vocab_;
36+
std::unique_ptr<float[]> vocab_scores_;
37+
std::unique_ptr<TokenIndex[]> sorted_vocab_;
38+
unsigned int max_token_length_;
39+
unsigned char byte_pieces_[512]; // stores all single-byte strings
40+
};
41+
} // namespace executor
42+
} // namespace torch

examples/models/llama2/tokenizer/targets.bzl

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -4,10 +4,11 @@ def define_common_targets():
44
runtime.cxx_library(
55
name = "tokenizer",
66
srcs = [
7-
"tokenizer.cpp",
7+
"bpe_tokenizer.cpp",
88
],
99
exported_headers = [
1010
"tokenizer.h",
11+
"bpe_tokenizer.h",
1112
],
1213
exported_deps = [
1314
"//executorch/runtime/core/exec_aten:lib",

examples/models/llama2/tokenizer/test/test_tokenizer.cpp

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
910
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
1011
#include <executorch/runtime/platform/runtime.h>
1112
#include <gtest/gtest.h>
@@ -20,7 +21,7 @@ class TokenizerExtensionTest : public Test {
2021
public:
2122
void SetUp() override {
2223
torch::executor::runtime_init();
23-
tokenizer_ = std::make_unique<Tokenizer>(32000, 1, 2);
24+
tokenizer_ = std::make_unique<BPETokenizer>(32000, 1, 2);
2425
modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
2526
}
2627

examples/models/llama2/tokenizer/tokenizer.h

Lines changed: 11 additions & 17 deletions
Original file line number · Diff line number · Diff line change
@@ -26,22 +26,21 @@
2626
namespace torch {
2727
namespace executor {
2828

29-
struct TokenIndex {
30-
const char* str;
31-
int32_t id;
32-
};
33-
3429
class Tokenizer {
3530
public:
36-
explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
37-
~Tokenizer();
31+
explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
32+
: initialized_(false),
33+
vocab_size_(vocab_size),
34+
bos_tok_(bos_tok),
35+
eos_tok_(eos_tok) {}
36+
virtual ~Tokenizer() {}
3837

39-
Error load(const std::string& tokenizer_path);
38+
virtual Error load(const std::string& tokenizer_path) = 0;
4039

41-
Result<std::vector<uint64_t>>
42-
encode(const std::string& input, int8_t bos, int8_t eos);
40+
virtual Result<std::vector<uint64_t>>
41+
encode(const std::string& input, int8_t bos, int8_t eos) = 0;
4342

44-
Result<std::string> decode(uint64_t prev_token, uint64_t token);
43+
virtual Result<std::string> decode(uint64_t prev_token, uint64_t token) = 0;
4544

4645
// getters
4746
int32_t vocab_size() const {
@@ -56,15 +55,10 @@ class Tokenizer {
5655
return eos_tok_;
5756
}
5857

59-
private:
58+
protected:
6059
bool initialized_;
6160
const int32_t vocab_size_;
6261
uint64_t bos_tok_, eos_tok_;
63-
std::unique_ptr<char*[]> vocab_;
64-
std::unique_ptr<float[]> vocab_scores_;
65-
std::unique_ptr<TokenIndex[]> sorted_vocab_;
66-
unsigned int max_token_length_;
67-
unsigned char byte_pieces_[512]; // stores all single-byte strings
6862
};
6963

7064
} // namespace executor

0 commit comments

Comments (0)