Skip to content

Commit 994ae7b

Browse files
committed
Remove llama related stuff out of bpe_tokenizer
Pull Request resolved: #4235. We don't need to initialize `vocab_`, `vocab_scores_`, etc., in the constructor: they are initialized anyway while loading the tokenizer binary. A benefit of removing them is that we can drop the llama-related default values and make `bpe_tokenizer` agnostic to the model. ghstack-source-id: 233769845 Differential Revision: [D59664556](https://our.internmc.facebook.com/intern/diff/D59664556/)
1 parent 75d3f4a commit 994ae7b

File tree

4 files changed

+19
-19
lines changed

4 files changed

+19
-19
lines changed

examples/models/llama2/tokenizer/bpe_tokenizer.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,6 @@ static int compare_tokens(const void* a, const void* b) {
2424
}
2525

2626
BPETokenizer::BPETokenizer() : Tokenizer() {
27-
vocab_size_ = kDefaultVocabSize;
28-
vocab_ = std::make_unique<char*[]>(kDefaultVocabSize);
29-
vocab_scores_ = std::make_unique<float[]>(kDefaultVocabSize);
30-
sorted_vocab_ = std::make_unique<TokenIndex[]>(kDefaultVocabSize);
31-
bos_tok_ = kDefaultBosTokenId;
32-
eos_tok_ = kDefaultEosTokenId;
3327
for (int i = 0; i < 256; i++) {
3428
byte_pieces_[i * 2] = (unsigned char)i;
3529
byte_pieces_[i * 2 + 1] = '\0';

examples/models/llama2/tokenizer/bpe_tokenizer.h

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,6 @@
1414
namespace torch {
1515
namespace executor {
1616

17-
// Default values for llama2
18-
constexpr int32_t kDefaultVocabSize = 32000;
19-
constexpr uint64_t kDefaultBosTokenId = 1;
20-
constexpr uint64_t kDefaultEosTokenId = 2;
21-
2217
struct TokenIndex {
2318
const char* str;
2419
int32_t id;
@@ -38,10 +33,10 @@ class BPETokenizer : public Tokenizer {
3833
const override;
3934

4035
private:
41-
std::unique_ptr<char*[]> vocab_;
42-
std::unique_ptr<float[]> vocab_scores_;
43-
std::unique_ptr<TokenIndex[]> sorted_vocab_;
44-
unsigned int max_token_length_;
36+
std::unique_ptr<char*[]> vocab_ = nullptr;
37+
std::unique_ptr<float[]> vocab_scores_ = nullptr;
38+
std::unique_ptr<TokenIndex[]> sorted_vocab_ = nullptr;
39+
unsigned int max_token_length_ = 0;
4540
unsigned char byte_pieces_[512]; // stores all single-byte strings
4641
};
4742
} // namespace executor

examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,5 +57,15 @@ TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
5757
EXPECT_EQ(tokenizer_->eos_tok(), 0);
5858
}
5959

60+
TEST_F(TokenizerExtensionTest, SafeToDestruct) {
61+
// Safe to destruct initialized tokenizer.
62+
tokenizer_->load(modelPath_);
63+
tokenizer_.reset();
64+
65+
// Safe to destruct uninitialized tokenizer.
66+
tokenizer_ = std::make_unique<BPETokenizer>();
67+
tokenizer_.reset();
68+
}
69+
6070
} // namespace executor
6171
} // namespace torch

examples/models/llama2/tokenizer/tokenizer.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ namespace executor {
2828

2929
class Tokenizer {
3030
public:
31-
explicit Tokenizer() : initialized_(false) {}
31+
explicit Tokenizer() {}
3232
virtual ~Tokenizer() {}
3333

3434
virtual Error load(const std::string& tokenizer_path) = 0;
@@ -69,9 +69,10 @@ class Tokenizer {
6969
}
7070

7171
protected:
72-
bool initialized_;
73-
int32_t vocab_size_;
74-
uint64_t bos_tok_, eos_tok_;
72+
bool initialized_ = false;
73+
int32_t vocab_size_ = 0;
74+
uint64_t bos_tok_ = 0;
75+
uint64_t eos_tok_ = 0;
7576
};
7677

7778
} // namespace executor

0 commit comments

Comments (0)