
Commit d948c1a

guangy10 authored and facebook-github-bot committed
Bug fix in bpe tokenizer (#4149)
Summary:
- Record bos/eos in the binary format
- Updated tests

Pull Request resolved: #4149
Reviewed By: larryliu0820
Differential Revision: D59349794
Pulled By: guangy10
fbshipit-source-id: ecdd5bd22dfcdc60429d179f07a61d46c832ef87
1 parent 5f2ab0e commit d948c1a
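In short: the exported tokenizer binary now records the bos/eos token ids in its header (four int32 fields instead of two), and the C++ loader reads them instead of hardcoding 1/2. A minimal sketch of parsing the new 16-byte header, assuming native-endian unsigned 32-bit fields as written by struct.pack("IIII", ...) in tokenizer.py; the file path is illustrative, not from the repo:

import struct

# Header after this fix: vocab_size, bos_id, eos_id, max_token_length,
# each packed as a 32-bit unsigned int by tokenizer.py's export().
with open("tokenizer.bin", "rb") as f:  # illustrative path
    vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
        "IIII", f.read(16)
    )
print(vocab_size, bos_id, eos_id, max_token_length)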

10 files changed: 46 additions & 38 deletions


.gitignore
Lines changed: 3 additions & 0 deletions

@@ -11,7 +11,10 @@ __pycache__/
 
 # Any exported models and profiling outputs
 *.pte
+*.model
+!test_tiktoken_tokenizer.model
 *.bin
+!test_bpe_tokenizer.bin
 
 # Editor temporaries
 *.swa

examples/models/llama2/tokenizer/bpe_tokenizer.cpp
Lines changed: 11 additions & 10 deletions

@@ -24,12 +24,12 @@ static int compare_tokens(const void* a, const void* b) {
 }
 
 BPETokenizer::BPETokenizer() : Tokenizer() {
-  vocab_size_ = kVocabSize;
-  vocab_ = std::make_unique<char*[]>(kVocabSize);
-  vocab_scores_ = std::make_unique<float[]>(kVocabSize);
-  sorted_vocab_ = std::make_unique<TokenIndex[]>(kVocabSize);
-  bos_tok_ = 1;
-  eos_tok_ = 2;
+  vocab_size_ = kDefaultVocabSize;
+  vocab_ = std::make_unique<char*[]>(kDefaultVocabSize);
+  vocab_scores_ = std::make_unique<float[]>(kDefaultVocabSize);
+  sorted_vocab_ = std::make_unique<TokenIndex[]>(kDefaultVocabSize);
+  bos_tok_ = kDefaultBosTokenId;
+  eos_tok_ = kDefaultEosTokenId;
   for (int i = 0; i < 256; i++) {
     byte_pieces_[i * 2] = (unsigned char)i;
     byte_pieces_[i * 2 + 1] = '\0';
@@ -57,8 +57,8 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
     ET_LOG(Error, "couldn't load %s", tokenizer_path.c_str());
     return Error::InvalidArgument;
   }
-  int32_t metadata[2];
-  for (int i = 0; i < 2; i++) {
+  int32_t metadata[4];
+  for (int i = 0; i < 4; i++) {
     if (fread(metadata + i, sizeof(int32_t), 1, file) != 1) {
       ET_LOG(
           Error,
@@ -72,8 +72,9 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
   // tokenizer file.
   int32_t tokenizer_vocab_size = metadata[0];
   vocab_size_ = tokenizer_vocab_size;
-
-  max_token_length_ = metadata[1];
+  bos_tok_ = metadata[1];
+  eos_tok_ = metadata[2];
+  max_token_length_ = metadata[3];
 
   // allocate space for the vocabulary
   vocab_ = std::make_unique<char*[]>(vocab_size_);

examples/models/llama2/tokenizer/bpe_tokenizer.h
Lines changed: 4 additions & 1 deletion

@@ -14,7 +14,10 @@
 namespace torch {
 namespace executor {
 
-constexpr int32_t kVocabSize = 32000;
+// Default values for llama2
+constexpr int32_t kDefaultVocabSize = 32000;
+constexpr uint64_t kDefaultBosTokenId = 1;
+constexpr uint64_t kDefaultEosTokenId = 2;
 
 struct TokenIndex {
   const char* str;

examples/models/llama2/tokenizer/test/targets.bzl
Lines changed: 2 additions & 2 deletions

@@ -44,9 +44,9 @@ def define_common_targets():
     )
 
     runtime.python_test(
-        name = "test_tokenizer_py",
+        name = "test_bpe_tokenizer_py",
         srcs = [
-            "test_tokenizer.py",
+            "test_bpe_tokenizer.py",
         ],
         visibility = [
             "//executorch/examples/...",

examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
Lines changed: 6 additions & 5 deletions

@@ -22,7 +22,8 @@ class TokenizerExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<BPETokenizer>();
-    modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
+    modelPath_ =
+        std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -47,13 +48,13 @@ TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
   EXPECT_EQ(result.error(), Error::NotSupported);
 }
 
-TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
+TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0.
+  // test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded.
   EXPECT_EQ(tokenizer_->vocab_size(), 0);
-  EXPECT_EQ(tokenizer_->bos_tok(), 1);
-  EXPECT_EQ(tokenizer_->eos_tok(), 2);
+  EXPECT_EQ(tokenizer_->bos_tok(), 0);
+  EXPECT_EQ(tokenizer_->eos_tok(), 0);
 }
 
 } // namespace executor

examples/models/llama2/tokenizer/test/test_tokenizer.py renamed to examples/models/llama2/tokenizer/test/test_bpe_tokenizer.py
Lines changed: 7 additions & 1 deletion

@@ -20,6 +20,8 @@ class TestTokenizer(unittest.TestCase):
     def test_export(self, mock_sp):
         # Set up the mock SentencePieceProcessor
         mock_sp.return_value.vocab_size.return_value = 0
+        mock_sp.return_value.bos_id.return_value = 0
+        mock_sp.return_value.eos_id.return_value = 0
         mock_sp.return_value.get_piece_size.return_value = 0
         # Create a temporary file
         with tempfile.NamedTemporaryFile(delete=True) as temp:
@@ -32,8 +34,12 @@ def test_export(self, mock_sp):
             with open(output.name, "rb") as f:
                 data = f.read(16)
                 # Unpack the data as 4 integers
-                vocab_size, max_token_length = struct.unpack("II", data)
+                vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
+                    "IIII", data
+                )
                 # Check that the integers match the properties of the tokenizer
                 self.assertEqual(vocab_size, 0)
+                self.assertEqual(bos_id, 0)
+                self.assertEqual(eos_id, 0)
                 # Check that the max token length is correct
                 self.assertEqual(max_token_length, 0)

examples/models/llama2/tokenizer/test/test_tiktoken.cpp
Lines changed: 4 additions & 16 deletions

@@ -22,8 +22,8 @@ class TiktokenExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>();
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -35,8 +35,8 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
    tokenizer_ = std::make_unique<Tiktoken>(MULTIMODAL);
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -56,8 +56,6 @@ TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
 TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -66,8 +64,6 @@ TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -76,8 +72,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
   EXPECT_EQ(out.error(), Error::Ok);
   EXPECT_EQ(out.get().size(), 3);
@@ -89,8 +83,6 @@ TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode(
       "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>What do you think is going on in this snapshot?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAmidst a scenic garden backdrop, a man dressed in a suit with a distinct button on its lower portion stands prominently.<|eom_id|>",
       0,
@@ -112,8 +104,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
   std::vector<uint64_t> tokens = {128000, 15339, 1917};
   for (size_t i = 0; i < tokens.size(); i++) {
@@ -126,8 +116,6 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {
       "<|begin_of_text|>",
       "<|start_header_id|>",

examples/models/llama2/tokenizer/tokenizer.py
Lines changed: 9 additions & 3 deletions

@@ -58,8 +58,10 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
 
         The binary format is:
         1. vocab size: int32
-        2. max token length: int32
-        3. score: float32, len of bytes: int32, token bytes: [byte] for each token
+        2. bos token id: int32
+        3. eos token id: int32
+        4. max token length: int32
+        5. score: float32, len of bytes: int32, token bytes: [byte] for each token
 
         :param output_path: output path of the new binary.
         :param prepend_padding: a boolean to control if we want to prepend a padding token.
@@ -99,7 +101,11 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
         # write to a binary file
         with open(output_path, "wb") as f:
             # write the vocab size, bos/eos ids and max token length
-            f.write(struct.pack("II", self.n_words, max_token_length))
+            f.write(
+                struct.pack(
+                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
+                )
+            )
             for bytes, score in zip(tokens, scores):
                 f.write(struct.pack("fI", score, len(bytes)))
                 f.write(bytes)
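For reference, a sketch that walks the full format as documented in the docstring above: the four-field header followed by one (score: float32, byte length: int32, token bytes) record per token. This is illustrative code, not part of the commit; the function name is made up:

import struct

def read_tokenizer_bin(path):
    # Parse the exported binary per the format docstring above (a sketch).
    with open(path, "rb") as f:
        vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
            "IIII", f.read(16)
        )
        tokens = []
        for _ in range(vocab_size):
            # Each vocab entry: score (float32), then length-prefixed bytes,
            # mirroring struct.pack("fI", score, len(bytes)) in export().
            score, length = struct.unpack("fI", f.read(8))
            tokens.append((f.read(length), score))
    return (bos_id, eos_id, max_token_length), tokens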
