
Commit d948c1a

guangy10 authored and facebook-github-bot committed
Bug fix in bpe tokenizer (#4149)
Summary:
- Record bos/eos in the binary format
- Updated tests

Pull Request resolved: #4149
Reviewed By: larryliu0820
Differential Revision: D59349794
Pulled By: guangy10
fbshipit-source-id: ecdd5bd22dfcdc60429d179f07a61d46c832ef87
1 parent 5f2ab0e commit d948c1a
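In short: the exported tokenizer binary now records the bos/eos token ids in its header (four int32 fields instead of two), and the C++ loader reads them instead of hardcoding 1/2. A minimal sketch of parsing the new 16-byte header, assuming native-endian unsigned 32-bit fields as written by struct.pack("IIII", ...) in tokenizer.py; the file path is illustrative, not from the repo:

import struct

# Header after this fix: vocab_size, bos_id, eos_id, max_token_length,
# each packed as a 32-bit unsigned int by tokenizer.py's export().
with open("tokenizer.bin", "rb") as f:  # illustrative path
    vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
        "IIII", f.read(16)
    )
print(vocab_size, bos_id, eos_id, max_token_length)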

10 files changed: 46 additions & 38 deletions


.gitignore
Lines changed: 3 additions & 0 deletions

@@ -11,7 +11,10 @@ __pycache__/
 
 # Any exported models and profiling outputs
 *.pte
+*.model
+!test_tiktoken_tokenizer.model
 *.bin
+!test_bpe_tokenizer.bin
 
 # Editor temporaries
 *.swa

examples/models/llama2/tokenizer/bpe_tokenizer.cpp
Lines changed: 11 additions & 10 deletions

@@ -24,12 +24,12 @@ static int compare_tokens(const void* a, const void* b) {
 }
 
 BPETokenizer::BPETokenizer() : Tokenizer() {
-  vocab_size_ = kVocabSize;
-  vocab_ = std::make_unique<char*[]>(kVocabSize);
-  vocab_scores_ = std::make_unique<float[]>(kVocabSize);
-  sorted_vocab_ = std::make_unique<TokenIndex[]>(kVocabSize);
-  bos_tok_ = 1;
-  eos_tok_ = 2;
+  vocab_size_ = kDefaultVocabSize;
+  vocab_ = std::make_unique<char*[]>(kDefaultVocabSize);
+  vocab_scores_ = std::make_unique<float[]>(kDefaultVocabSize);
+  sorted_vocab_ = std::make_unique<TokenIndex[]>(kDefaultVocabSize);
+  bos_tok_ = kDefaultBosTokenId;
+  eos_tok_ = kDefaultEosTokenId;
   for (int i = 0; i < 256; i++) {
     byte_pieces_[i * 2] = (unsigned char)i;
     byte_pieces_[i * 2 + 1] = '\0';
@@ -57,8 +57,8 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
     ET_LOG(Error, "couldn't load %s", tokenizer_path.c_str());
     return Error::InvalidArgument;
   }
-  int32_t metadata[2];
-  for (int i = 0; i < 2; i++) {
+  int32_t metadata[4];
+  for (int i = 0; i < 4; i++) {
     if (fread(metadata + i, sizeof(int32_t), 1, file) != 1) {
       ET_LOG(
           Error,
@@ -72,8 +72,9 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
   // tokenizer file.
   int32_t tokenizer_vocab_size = metadata[0];
   vocab_size_ = tokenizer_vocab_size;
-
-  max_token_length_ = metadata[1];
+  bos_tok_ = metadata[1];
+  eos_tok_ = metadata[2];
+  max_token_length_ = metadata[3];
 
   // allocate space for the vocabulary
   vocab_ = std::make_unique<char*[]>(vocab_size_);

examples/models/llama2/tokenizer/bpe_tokenizer.h
Lines changed: 4 additions & 1 deletion

@@ -14,7 +14,10 @@
 namespace torch {
 namespace executor {
 
-constexpr int32_t kVocabSize = 32000;
+// Default values for llama2
+constexpr int32_t kDefaultVocabSize = 32000;
+constexpr uint64_t kDefaultBosTokenId = 1;
+constexpr uint64_t kDefaultEosTokenId = 2;
 
 struct TokenIndex {
   const char* str;

examples/models/llama2/tokenizer/test/targets.bzl
Lines changed: 2 additions & 2 deletions

@@ -44,9 +44,9 @@ def define_common_targets():
     )
 
     runtime.python_test(
-        name = "test_tokenizer_py",
+        name = "test_bpe_tokenizer_py",
         srcs = [
-            "test_tokenizer.py",
+            "test_bpe_tokenizer.py",
         ],
         visibility = [
             "//executorch/examples/...",

examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
Lines changed: 6 additions & 5 deletions

@@ -22,7 +22,8 @@ class TokenizerExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<BPETokenizer>();
-    modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
+    modelPath_ =
+        std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -47,13 +48,13 @@ TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
   EXPECT_EQ(result.error(), Error::NotSupported);
 }
 
-TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
+TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0.
+  // test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded.
   EXPECT_EQ(tokenizer_->vocab_size(), 0);
-  EXPECT_EQ(tokenizer_->bos_tok(), 1);
-  EXPECT_EQ(tokenizer_->eos_tok(), 2);
+  EXPECT_EQ(tokenizer_->bos_tok(), 0);
+  EXPECT_EQ(tokenizer_->eos_tok(), 0);
 }
 
 } // namespace executor

examples/models/llama2/tokenizer/test/test_tokenizer.py renamed to examples/models/llama2/tokenizer/test/test_bpe_tokenizer.py
Lines changed: 7 additions & 1 deletion

@@ -20,6 +20,8 @@ class TestTokenizer(unittest.TestCase):
     def test_export(self, mock_sp):
         # Set up the mock SentencePieceProcessor
         mock_sp.return_value.vocab_size.return_value = 0
+        mock_sp.return_value.bos_id.return_value = 0
+        mock_sp.return_value.eos_id.return_value = 0
         mock_sp.return_value.get_piece_size.return_value = 0
         # Create a temporary file
         with tempfile.NamedTemporaryFile(delete=True) as temp:
@@ -32,8 +34,12 @@ def test_export(self, mock_sp):
             with open(output.name, "rb") as f:
                 data = f.read(16)
                 # Unpack the data as 4 integers
-                vocab_size, max_token_length = struct.unpack("II", data)
+                vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
+                    "IIII", data
+                )
                 # Check that the integers match the properties of the tokenizer
                 self.assertEqual(vocab_size, 0)
+                self.assertEqual(bos_id, 0)
+                self.assertEqual(eos_id, 0)
                 # Check that the max token length is correct
                 self.assertEqual(max_token_length, 0)

examples/models/llama2/tokenizer/test/test_tiktoken.cpp
Lines changed: 4 additions & 16 deletions

@@ -22,8 +22,8 @@ class TiktokenExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
     tokenizer_ = std::make_unique<Tiktoken>();
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -35,8 +35,8 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
   void SetUp() override {
     torch::executor::runtime_init();
    tokenizer_ = std::make_unique<Tiktoken>(MULTIMODAL);
-    modelPath_ =
-        std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
+    modelPath_ = std::getenv("RESOURCES_PATH") +
+        std::string("/test_tiktoken_tokenizer.model");
   }
 
   std::unique_ptr<Tokenizer> tokenizer_;
@@ -56,8 +56,6 @@ TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
 TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -66,8 +64,6 @@ TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   EXPECT_EQ(tokenizer_->vocab_size(), 128256);
   EXPECT_EQ(tokenizer_->bos_tok(), 128000);
   EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -76,8 +72,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
 TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
   EXPECT_EQ(out.error(), Error::Ok);
   EXPECT_EQ(out.get().size(), 3);
@@ -89,8 +83,6 @@ TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   Result<std::vector<uint64_t>> out = tokenizer_->encode(
       "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>What do you think is going on in this snapshot?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAmidst a scenic garden backdrop, a man dressed in a suit with a distinct button on its lower portion stands prominently.<|eom_id|>",
       0,
@@ -112,8 +104,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
 TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
   std::vector<uint64_t> tokens = {128000, 15339, 1917};
   for (size_t i = 0; i < tokens.size(); i++) {
@@ -126,8 +116,6 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
 TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
   Error res = tokenizer_->load(modelPath_.c_str());
   EXPECT_EQ(res, Error::Ok);
-  // test.bin has vocab size 0 but the tokenizer respects the vocab size being
-  // passed in and add placeholder tokens.
   std::vector<std::string> expected = {
       "<|begin_of_text|>",
       "<|start_header_id|>",

examples/models/llama2/tokenizer/tokenizer.py
Lines changed: 9 additions & 3 deletions

@@ -58,8 +58,10 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
 
         The binary format is:
         1. vocab size: int32
-        2. max token length: int32
-        3. score: float32, len of bytes: int32, token bytes: [byte] for each token
+        2. bos token id: int32
+        3. eos token id: int32
+        4. max token length: int32
+        5. score: float32, len of bytes: int32, token bytes: [byte] for each token
 
         :param output_path: output path of the new binary.
         :param prepend_padding: a boolean to control if we want to prepend a padding token.
@@ -99,7 +101,11 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
         # write to a binary file
         with open(output_path, "wb") as f:
             # write the vocab size, bos/eos ids and max token length
-            f.write(struct.pack("II", self.n_words, max_token_length))
+            f.write(
+                struct.pack(
+                    "IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
+                )
+            )
             for bytes, score in zip(tokens, scores):
                 f.write(struct.pack("fI", score, len(bytes)))
                 f.write(bytes)
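For reference, a sketch that walks the full format as documented in the docstring above: the four-field header followed by one (score: float32, byte length: int32, token bytes) record per token. This is illustrative code, not part of the commit; the function name is made up:

import struct

def read_tokenizer_bin(path):
    # Parse the exported binary per the format docstring above (a sketch).
    with open(path, "rb") as f:
        vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
            "IIII", f.read(16)
        )
        tokens = []
        for _ in range(vocab_size):
            # Each vocab entry: score (float32), then length-prefixed bytes,
            # mirroring struct.pack("fI", score, len(bytes)) in export().
            score, length = struct.unpack("fI", f.read(8))
            tokens.append((f.read(length), score))
    return (bos_id, eos_id, max_token_length), tokens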
