Bug fix in bpe tokenizer #4149

Closed · wants to merge 1 commit

3 changes: 3 additions & 0 deletions .gitignore
@@ -11,7 +11,10 @@ __pycache__/

# Any exported models and profiling outputs
*.pte
*.model
!test_tiktoken_tokenizer.model
*.bin
!test_bpe_tokenizer.bin

# Editor temporaries
*.swa
21 changes: 11 additions & 10 deletions examples/models/llama2/tokenizer/bpe_tokenizer.cpp
@@ -24,12 +24,12 @@ static int compare_tokens(const void* a, const void* b) {
}

BPETokenizer::BPETokenizer() : Tokenizer() {
vocab_size_ = kVocabSize;
vocab_ = std::make_unique<char*[]>(kVocabSize);
vocab_scores_ = std::make_unique<float[]>(kVocabSize);
sorted_vocab_ = std::make_unique<TokenIndex[]>(kVocabSize);
bos_tok_ = 1;
eos_tok_ = 2;
vocab_size_ = kDefaultVocabSize;
vocab_ = std::make_unique<char*[]>(kDefaultVocabSize);
vocab_scores_ = std::make_unique<float[]>(kDefaultVocabSize);
sorted_vocab_ = std::make_unique<TokenIndex[]>(kDefaultVocabSize);
bos_tok_ = kDefaultBosTokenId;
eos_tok_ = kDefaultEosTokenId;
for (int i = 0; i < 256; i++) {
byte_pieces_[i * 2] = (unsigned char)i;
byte_pieces_[i * 2 + 1] = '\0';
@@ -57,8 +57,8 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
ET_LOG(Error, "couldn't load %s", tokenizer_path.c_str());
return Error::InvalidArgument;
}
int32_t metadata[2];
for (int i = 0; i < 2; i++) {
int32_t metadata[4];
for (int i = 0; i < 4; i++) {
if (fread(metadata + i, sizeof(int32_t), 1, file) != 1) {
ET_LOG(
Error,
@@ -72,8 +72,9 @@ Error BPETokenizer::load(const std::string& tokenizer_path) {
// tokenizer file.
int32_t tokenizer_vocab_size = metadata[0];
vocab_size_ = tokenizer_vocab_size;

max_token_length_ = metadata[1];
bos_tok_ = metadata[1];
eos_tok_ = metadata[2];
max_token_length_ = metadata[3];

// allocate space for the vocabulary
vocab_ = std::make_unique<char*[]>(vocab_size_);
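
With this fix the loader reads four metadata fields instead of two, in the same order the Python exporter packs them: vocab size, bos id, eos id, max token length. A minimal Python sketch of that 16-byte header, with a placeholder file name:

import struct

# Header layout shared by tokenizer.py (writer) and bpe_tokenizer.cpp (reader):
# four 32-bit unsigned integers: vocab_size, bos_id, eos_id, max_token_length.
with open("tokenizer.bin", "rb") as f:  # placeholder path
    vocab_size, bos_id, eos_id, max_token_length = struct.unpack("IIII", f.read(16))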
5 changes: 4 additions & 1 deletion examples/models/llama2/tokenizer/bpe_tokenizer.h
@@ -14,7 +14,10 @@
namespace torch {
namespace executor {

constexpr int32_t kVocabSize = 32000;
// Default values for llama2
constexpr int32_t kDefaultVocabSize = 32000;
constexpr uint64_t kDefaultBosTokenId = 1;
constexpr uint64_t kDefaultEosTokenId = 2;

struct TokenIndex {
const char* str;
Binary file not shown.
4 changes: 2 additions & 2 deletions examples/models/llama2/tokenizer/test/targets.bzl
@@ -44,9 +44,9 @@ def define_common_targets():
)

runtime.python_test(
name = "test_tokenizer_py",
name = "test_bpe_tokenizer_py",
srcs = [
"test_tokenizer.py",
"test_bpe_tokenizer.py",
],
visibility = [
"//executorch/examples/...",
11 changes: 6 additions & 5 deletions examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
@@ -22,7 +22,8 @@ class TokenizerExtensionTest : public Test {
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = std::make_unique<BPETokenizer>();
modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
modelPath_ =
std::getenv("RESOURCES_PATH") + std::string("/test_bpe_tokenizer.bin");
}

std::unique_ptr<Tokenizer> tokenizer_;
@@ -47,13 +48,13 @@ TEST_F(TokenizerExtensionTest, DecodeOutOfRangeFails) {
EXPECT_EQ(result.error(), Error::NotSupported);
}

TEST_F(TokenizerExtensionTest, TokenizerVocabSizeIsExpected) {
TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// test.bin has vocab size 0.
// test_bpe_tokenizer.bin has vocab_size 0, bos_id 0, eos_id 0 recorded.
EXPECT_EQ(tokenizer_->vocab_size(), 0);
EXPECT_EQ(tokenizer_->bos_tok(), 1);
EXPECT_EQ(tokenizer_->eos_tok(), 2);
EXPECT_EQ(tokenizer_->bos_tok(), 0);
EXPECT_EQ(tokenizer_->eos_tok(), 0);
}

} // namespace executor
examples/models/llama2/tokenizer/test/test_bpe_tokenizer.py
@@ -20,6 +20,8 @@ class TestTokenizer(unittest.TestCase):
def test_export(self, mock_sp):
# Set up the mock SentencePieceProcessor
mock_sp.return_value.vocab_size.return_value = 0
mock_sp.return_value.bos_id.return_value = 0
mock_sp.return_value.eos_id.return_value = 0
mock_sp.return_value.get_piece_size.return_value = 0
# Create a temporary file
with tempfile.NamedTemporaryFile(delete=True) as temp:
@@ -32,8 +34,12 @@ def test_export(self, mock_sp):
with open(output.name, "rb") as f:
data = f.read(16)
# Unpack the data as 4 integers
vocab_size, max_token_length = struct.unpack("II", data)
vocab_size, bos_id, eos_id, max_token_length = struct.unpack(
"IIII", data
)
# Check that the integers match the properties of the tokenizer
self.assertEqual(vocab_size, 0)
self.assertEqual(bos_id, 0)
self.assertEqual(eos_id, 0)
# Check that the max token length is correct
self.assertEqual(max_token_length, 0)
20 changes: 4 additions & 16 deletions examples/models/llama2/tokenizer/test/test_tiktoken.cpp
@@ -22,8 +22,8 @@ class TiktokenExtensionTest : public Test {
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = std::make_unique<Tiktoken>();
modelPath_ =
std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
modelPath_ = std::getenv("RESOURCES_PATH") +
std::string("/test_tiktoken_tokenizer.model");
}

std::unique_ptr<Tokenizer> tokenizer_;
@@ -35,8 +35,8 @@ class MultimodalTiktokenV5ExtensionTest : public Test {
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = std::make_unique<Tiktoken>(MULTIMODAL);
modelPath_ =
std::getenv("RESOURCES_PATH") + std::string("/tokenizer.model");
modelPath_ = std::getenv("RESOURCES_PATH") +
std::string("/test_tiktoken_tokenizer.model");
}

std::unique_ptr<Tokenizer> tokenizer_;
@@ -56,8 +56,6 @@ TEST_F(TiktokenExtensionTest, DecodeWithoutLoadFails) {
TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// test.bin has vocab size 0 but the tokenizer respects the vocab size being
// passed in and add placeholder tokens.
EXPECT_EQ(tokenizer_->vocab_size(), 128256);
EXPECT_EQ(tokenizer_->bos_tok(), 128000);
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -66,8 +64,6 @@ TEST_F(TiktokenExtensionTest, TokenizerVocabSizeIsExpected) {
TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// test.bin has vocab size 0 but the tokenizer respects the vocab size being
// passed in and add placeholder tokens.
EXPECT_EQ(tokenizer_->vocab_size(), 128256);
EXPECT_EQ(tokenizer_->bos_tok(), 128000);
EXPECT_EQ(tokenizer_->eos_tok(), 128001);
@@ -76,8 +72,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerVocabSizeIsExpected) {
TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// test.bin has vocab size 0 but the tokenizer respects the vocab size being
// passed in and add placeholder tokens.
Result<std::vector<uint64_t>> out = tokenizer_->encode("hello world", 1, 0);
EXPECT_EQ(out.error(), Error::Ok);
EXPECT_EQ(out.get().size(), 3);
@@ -89,8 +83,6 @@ TEST_F(TiktokenExtensionTest, TokenizerEncodeCorrectly) {
TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// test.bin has vocab size 0 but the tokenizer respects the vocab size being
// passed in and add placeholder tokens.
Result<std::vector<uint64_t>> out = tokenizer_->encode(
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>What do you think is going on in this snapshot?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nAmidst a scenic garden backdrop, a man dressed in a suit with a distinct button on its lower portion stands prominently.<|eom_id|>",
0,
@@ -112,8 +104,6 @@ TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerEncodeCorrectly) {
TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// test.bin has vocab size 0 but the tokenizer respects the vocab size being
// passed in and add placeholder tokens.
std::vector<std::string> expected = {"<|begin_of_text|>", "hello", " world"};
std::vector<uint64_t> tokens = {128000, 15339, 1917};
for (size_t i = 0; i < tokens.size(); i++) {
@@ -126,8 +116,6 @@ TEST_F(TiktokenExtensionTest, TokenizerDecodeCorrectly) {
TEST_F(MultimodalTiktokenV5ExtensionTest, TokenizerDecodeCorrectly) {
Error res = tokenizer_->load(modelPath_.c_str());
EXPECT_EQ(res, Error::Ok);
// test.bin has vocab size 0 but the tokenizer respects the vocab size being
// passed in and add placeholder tokens.
std::vector<std::string> expected = {
"<|begin_of_text|>",
"<|start_header_id|>",
12 changes: 9 additions & 3 deletions examples/models/llama2/tokenizer/tokenizer.py
@@ -58,8 +58,10 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:

The binary format is:
1. vocab size: int32
2. max token length: int32
3. score: float32, len of bytes: int32, token bytes: [byte] for each token
2. bos token id: int32
3. eos token id: int32
4. max token length: int32
5. score: float32, len of bytes: int32, token bytes: [byte] for each token

:param output_path: output path of the new binary.
:param prepend_padding: a boolean to control if we want to prepend a padding token.
@@ -99,7 +101,11 @@ def export(self, output_path: str, *, prepend_padding: bool = False) -> None:
# write to a binary file
with open(output_path, "wb") as f:
# write the vocab size, bos/eos ids and max token length
f.write(struct.pack("II", self.n_words, max_token_length))
f.write(
struct.pack(
"IIII", self.n_words, self.bos_id, self.eos_id, max_token_length
)
)
for bytes, score in zip(tokens, scores):
f.write(struct.pack("fI", score, len(bytes)))
f.write(bytes)
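
For reference, a minimal sketch of reading a file produced by this export routine end to end, following the layout in the docstring above; the function name and return shape are illustrative, not part of the PR:

import struct

def read_tokenizer_bin(path):
    """Sketch: parse a binary written by Tokenizer.export()."""
    with open(path, "rb") as f:
        # fields 1-4: vocab size, bos id, eos id, max token length (uint32 each)
        vocab_size, bos_id, eos_id, max_token_length = struct.unpack("IIII", f.read(16))
        scores, tokens = [], []
        # field 5, per token: float32 score, uint32 byte length, then the raw bytes
        for _ in range(vocab_size):
            score, length = struct.unpack("fI", f.read(8))
            scores.append(score)
            tokens.append(f.read(length))
    return vocab_size, bos_id, eos_id, max_token_length, scores, tokens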