Skip to content

Remove llama related stuff out of bpe_tokenizer #4235

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions examples/models/llama2/tokenizer/bpe_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,6 @@ static int compare_tokens(const void* a, const void* b) {
}

BPETokenizer::BPETokenizer() : Tokenizer() {
vocab_size_ = kDefaultVocabSize;
vocab_ = std::make_unique<char*[]>(kDefaultVocabSize);
vocab_scores_ = std::make_unique<float[]>(kDefaultVocabSize);
sorted_vocab_ = std::make_unique<TokenIndex[]>(kDefaultVocabSize);
bos_tok_ = kDefaultBosTokenId;
eos_tok_ = kDefaultEosTokenId;
for (int i = 0; i < 256; i++) {
byte_pieces_[i * 2] = (unsigned char)i;
byte_pieces_[i * 2 + 1] = '\0';
Expand Down
13 changes: 4 additions & 9 deletions examples/models/llama2/tokenizer/bpe_tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,6 @@
namespace torch {
namespace executor {

// Default values for llama2
constexpr int32_t kDefaultVocabSize = 32000;
constexpr uint64_t kDefaultBosTokenId = 1;
constexpr uint64_t kDefaultEosTokenId = 2;

struct TokenIndex {
const char* str;
int32_t id;
Expand All @@ -38,10 +33,10 @@ class BPETokenizer : public Tokenizer {
const override;

private:
std::unique_ptr<char*[]> vocab_;
std::unique_ptr<float[]> vocab_scores_;
std::unique_ptr<TokenIndex[]> sorted_vocab_;
unsigned int max_token_length_;
std::unique_ptr<char*[]> vocab_ = nullptr;
std::unique_ptr<float[]> vocab_scores_ = nullptr;
std::unique_ptr<TokenIndex[]> sorted_vocab_ = nullptr;
unsigned int max_token_length_ = 0;
unsigned char byte_pieces_[512]; // stores all single-byte strings
};
} // namespace executor
Expand Down
10 changes: 10 additions & 0 deletions examples/models/llama2/tokenizer/test/test_bpe_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,5 +57,15 @@ TEST_F(TokenizerExtensionTest, TokenizerMetadataIsExpected) {
EXPECT_EQ(tokenizer_->eos_tok(), 0);
}

TEST_F(TokenizerExtensionTest, SafeToDestruct) {
  // Destroying a tokenizer after a successful load must not crash or leak.
  tokenizer_->load(modelPath_);
  tokenizer_ = nullptr;

  // Destroying a freshly constructed, never-loaded tokenizer must be
  // equally safe (members are default-initialized, not set by load()).
  tokenizer_ = std::make_unique<BPETokenizer>();
  tokenizer_ = nullptr;
}

} // namespace executor
} // namespace torch
9 changes: 5 additions & 4 deletions examples/models/llama2/tokenizer/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace executor {

class Tokenizer {
public:
explicit Tokenizer() : initialized_(false) {}
explicit Tokenizer() {}
virtual ~Tokenizer() {}

virtual Error load(const std::string& tokenizer_path) = 0;
Expand Down Expand Up @@ -69,9 +69,10 @@ class Tokenizer {
}

protected:
bool initialized_;
int32_t vocab_size_;
uint64_t bos_tok_, eos_tok_;
bool initialized_ = false;
int32_t vocab_size_ = 0;
uint64_t bos_tok_ = 0;
uint64_t eos_tok_ = 0;
};

} // namespace executor
Expand Down
Loading