
Commit d68a45b

larryliu0820 authored and facebook-github-bot committed
Change tokenizer name to bpe_tokenizer and extract a base class
Summary: We want to be able to support more than one tokenizer implementation. Currently `tokenizer.cpp` is adapted from `llama2.c`, but we also want to support `Tiktoken` (to be added in the next PR). This PR extracts a base class `Tokenizer` and makes it extensible by different implementations. Differential Revision: D56052583
1 parent 6acc86f commit d68a45b
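
The resulting design: `Tokenizer` becomes an abstract base class whose `load`, `encode`, and `decode` are pure virtual, while the existing `llama2.c`-style implementation moves into the `BPETokenizer` subclass. A minimal sketch of what a future implementation (for example the planned Tiktoken one) would need to provide — the class below is a hypothetical stub, not code from this commit:

#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>

namespace torch {
namespace executor {

// Hypothetical skeleton of a future tokenizer implementation; only the
// three pure-virtual methods of Tokenizer need to be overridden.
class TiktokenTokenizer : public Tokenizer {
 public:
  TiktokenTokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
      : Tokenizer(vocab_size, bos_tok, eos_tok) {}

  Error load(const std::string& tokenizer_path) override {
    (void)tokenizer_path; // a real version parses the tokenizer artifact here
    initialized_ = true;  // inherited from Tokenizer, now protected
    return Error::Ok;
  }

  Result<std::vector<uint64_t>>
  encode(const std::string& input, int8_t bos, int8_t eos) override {
    return std::vector<uint64_t>{}; // real merging logic would go here
  }

  Result<std::string> decode(uint64_t prev_token, uint64_t token) override {
    return std::string(); // map a token id back to its text piece
  }
};

} // namespace executor
} // namespace torch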

File tree

7 files changed, +74 -34 lines

examples/models/llama2/runner/runner.cpp

Lines changed: 5 additions & 4 deletions

@@ -10,6 +10,7 @@
 // The module takes in a string as input and emits a string as output.

 #include <executorch/examples/models/llama2/runner/runner.h>
+#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #include <executorch/extension/evalue_util/print_evalue.h>
 #include <executorch/extension/runner_util/managed_tensor.h>

@@ -76,7 +77,7 @@ Error Runner::load() {
   append_eos_ = getMetadataHelper("append_eos_to_prompt", false);

   // Load tokenizer
-  tokenizer_ = std::make_unique<Tokenizer>(vocab_size_, bos_id_, eos_id_);
+  tokenizer_ = std::make_unique<BPETokenizer>(vocab_size_, bos_id_, eos_id_);
   tokenizer_->load(tokenizer_path_);
   if (tokenizer_->bos_tok() != bos_id_) {
     ET_LOG(
@@ -105,7 +106,7 @@ Error Runner::load() {
 }

 template <typename T>
-T Runner::getMetadataHelper(std::string method_name, T default_val) {
+T Runner::getMetadataHelper(const std::string& method_name, T default_val) {
   T res = default_val;
   if (model_methods_.count(method_name)) {
     Result<std::vector<EValue>> outputs = module_->execute(method_name);
@@ -484,9 +485,9 @@ void Runner::stop() {

 // explicit instantiation of template methods
 template int64_t Runner::getMetadataHelper<int64_t>(
-    std::string method_name,
+    const std::string& method_name,
     int64_t default_val);
 template bool Runner::getMetadataHelper<bool>(
-    std::string method_name,
+    const std::string& method_name,
     bool default_val);
 } // namespace torch::executor
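
One detail worth noting in the hunk above: `getMetadataHelper` is a member template defined in the .cpp file, so it is explicitly instantiated for each type the program uses (`int64_t` and `bool`), and the signature change to `const std::string&` has to be mirrored in those instantiations. A minimal sketch of the pattern, with hypothetical names:

// Minimal sketch (hypothetical Config::get) of the pattern used above: a
// member template defined in a .cpp file must be explicitly instantiated
// for every type the rest of the program uses, or the linker cannot find it.
#include <cstdint>
#include <string>

class Config {
 public:
  template <typename T>
  T get(const std::string& key, T default_val); // declared in the header
};

// definition lives in the .cpp file
template <typename T>
T Config::get(const std::string& key, T default_val) {
  (void)key; // a real implementation would consult a lookup table here
  return default_val;
}

// explicit instantiations, mirroring the commit's int64_t and bool cases
template int64_t Config::get<int64_t>(const std::string& key, int64_t default_val);
template bool Config::get<bool>(const std::string& key, bool default_val);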

examples/models/llama2/runner/runner.h

Lines changed: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ class Runner {
  private:
   // metadata
   template <typename T>
-  T getMetadataHelper(std::string method_name, T default_val);
+  T getMetadataHelper(const std::string& method_name, T default_val);
   template <typename T>
   int32_t
   logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _);
examples/models/llama2/tokenizer/tokenizer.cpp renamed to examples/models/llama2/tokenizer/bpe_tokenizer.cpp

Lines changed: 10 additions & 10 deletions

@@ -6,7 +6,7 @@
  * LICENSE file in the root directory of this source tree.
  */

-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>

 #include <string>

@@ -23,11 +23,11 @@ static int compare_tokens(const void* a, const void* b) {
   return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
 }

-Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
-    : initialized_(false),
-      vocab_size_(vocab_size),
-      bos_tok_(bos_tok),
-      eos_tok_(eos_tok),
+BPETokenizer::BPETokenizer(
+    int32_t vocab_size,
+    uint64_t bos_tok,
+    uint64_t eos_tok)
+    : Tokenizer(vocab_size, bos_tok, eos_tok),
       vocab_(std::make_unique<char*[]>(vocab_size)),
       vocab_scores_(std::make_unique<float[]>(vocab_size)),
       sorted_vocab_(std::make_unique<TokenIndex[]>(vocab_size)) {
@@ -47,7 +47,7 @@ Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
  * @param tokenizer_path The path to the tokenizer file.
  * @return Error
  */
-Error Tokenizer::load(const std::string& tokenizer_path) {
+Error BPETokenizer::load(const std::string& tokenizer_path) {
   if (initialized_) {
     ET_LOG(Info, "Tokenizer already initialized");
     return Error::Ok;
@@ -131,7 +131,7 @@ Error Tokenizer::load(const std::string& tokenizer_path) {
   return Error::Ok;
 }

-Tokenizer::~Tokenizer() {
+BPETokenizer::~BPETokenizer() {
   for (int i = 0; i < vocab_size_; i++) {
     delete[] vocab_[i];
   }
@@ -145,7 +145,7 @@ Tokenizer::~Tokenizer() {
  * @return Result<std::string> A pointer to the string representation of the
  * token.
  */
-Result<std::string> Tokenizer::decode(uint64_t prev_token, uint64_t token) {
+Result<std::string> BPETokenizer::decode(uint64_t prev_token, uint64_t token) {
   if (!initialized_) {
     ET_LOG(Error, "Tokenizer not initialized");
     return Error::NotSupported;
@@ -187,7 +187,7 @@ str_lookup(const char* str, TokenIndex* sorted_vocab, int32_t vocab_size) {
  * @return Result<std::vector<uint64_t>>
  */
 Result<std::vector<uint64_t>>
-Tokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
+BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
   if (!initialized_) {
     ET_LOG(Error, "Tokenizer not initialized");
     return Error::NotSupported;
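
The constructor change above follows the usual base-delegation pattern: state shared by all tokenizers moves to the `Tokenizer` base constructor, while BPE-specific buffers stay in the derived initializer list. A stripped-down sketch with hypothetical names:

// Hypothetical sketch of the initializer-list pattern used by BPETokenizer:
// shared state is forwarded to the base-class constructor, and only the
// implementation-specific buffers are initialized in the derived class.
#include <cstdint>
#include <memory>

class Base {
 public:
  Base(int32_t n, uint64_t bos, uint64_t eos) : n_(n), bos_(bos), eos_(eos) {}
  virtual ~Base() = default;

 protected:
  int32_t n_;
  uint64_t bos_, eos_;
};

class Derived : public Base {
 public:
  Derived(int32_t n, uint64_t bos, uint64_t eos)
      : Base(n, bos, eos),                       // delegate shared fields
        scores_(std::make_unique<float[]>(n)) {} // derived-only buffer

 private:
  std::unique_ptr<float[]> scores_;
};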
examples/models/llama2/tokenizer/bpe_tokenizer.h

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <cstdint>
+
+namespace torch {
+namespace executor {
+
+struct TokenIndex {
+  const char* str;
+  int32_t id;
+};
+
+class BPETokenizer : public Tokenizer {
+ public:
+  explicit BPETokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
+  ~BPETokenizer();
+
+  Error load(const std::string& tokenizer_path);
+
+  Result<std::vector<uint64_t>>
+  encode(const std::string& input, int8_t bos, int8_t eos);
+
+  Result<std::string> decode(uint64_t prev_token, uint64_t token);
+
+ private:
+  std::unique_ptr<char*[]> vocab_;
+  std::unique_ptr<float[]> vocab_scores_;
+  std::unique_ptr<TokenIndex[]> sorted_vocab_;
+  unsigned int max_token_length_;
+  unsigned char byte_pieces_[512]; // stores all single-byte strings
+};
+} // namespace executor
+} // namespace torch

examples/models/llama2/tokenizer/targets.bzl

Lines changed: 2 additions & 1 deletion

@@ -4,10 +4,11 @@ def define_common_targets():
     runtime.cxx_library(
         name = "tokenizer",
         srcs = [
-            "tokenizer.cpp",
+            "bpe_tokenizer.cpp",
         ],
         exported_headers = [
             "tokenizer.h",
+            "bpe_tokenizer.h",
         ],
         exported_deps = [
             "//executorch/runtime/core/exec_aten:lib",

examples/models/llama2/tokenizer/test/test_tokenizer.cpp

Lines changed: 3 additions & 1 deletion

@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */

+#include <bpe_tokenizer.h>
+#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
 #include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
 #include <executorch/runtime/platform/runtime.h>
 #include <gtest/gtest.h>
@@ -20,7 +22,7 @@ class TokenizerExtensionTest : public Test {
  public:
   void SetUp() override {
     torch::executor::runtime_init();
-    tokenizer_ = std::make_unique<Tokenizer>(32000, 1, 2);
+    tokenizer_ = std::make_unique<BPETokenizer>(32000, 1, 2);
     modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
   }

examples/models/llama2/tokenizer/tokenizer.h

Lines changed: 11 additions & 17 deletions

@@ -26,22 +26,21 @@
 namespace torch {
 namespace executor {

-struct TokenIndex {
-  const char* str;
-  int32_t id;
-};
-
 class Tokenizer {
  public:
-  explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
-  ~Tokenizer();
+  explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
+      : initialized_(false),
+        vocab_size_(vocab_size),
+        bos_tok_(bos_tok),
+        eos_tok_(eos_tok){};
+  virtual ~Tokenizer(){};

-  Error load(const std::string& tokenizer_path);
+  virtual Error load(const std::string& tokenizer_path) = 0;

-  Result<std::vector<uint64_t>>
-  encode(const std::string& input, int8_t bos, int8_t eos);
+  virtual Result<std::vector<uint64_t>>
+  encode(const std::string& input, int8_t bos, int8_t eos) = 0;

-  Result<std::string> decode(uint64_t prev_token, uint64_t token);
+  virtual Result<std::string> decode(uint64_t prev_token, uint64_t token) = 0;

   // getters
   int32_t vocab_size() const {
@@ -56,15 +55,10 @@ class Tokenizer {
     return eos_tok_;
   }

- private:
+ protected:
   bool initialized_;
   const int32_t vocab_size_;
   uint64_t bos_tok_, eos_tok_;
-  std::unique_ptr<char*[]> vocab_;
-  std::unique_ptr<float[]> vocab_scores_;
-  std::unique_ptr<TokenIndex[]> sorted_vocab_;
-  unsigned int max_token_length_;
-  unsigned char byte_pieces_[512]; // stores all single-byte strings
 };

 } // namespace executor
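
With the interface virtual, call sites can hold any implementation behind a `Tokenizer` pointer, as the runner now does with `BPETokenizer`. A short usage sketch — the `make_tokenizer` factory is hypothetical, not part of this commit:

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>

#include <memory>
#include <string>

using namespace torch::executor;

// Hypothetical factory: choose an implementation at runtime while callers
// only ever see the abstract Tokenizer interface.
std::unique_ptr<Tokenizer> make_tokenizer(
    int32_t vocab_size, uint64_t bos, uint64_t eos) {
  // A future Tiktoken subclass could be returned here instead.
  return std::make_unique<BPETokenizer>(vocab_size, bos, eos);
}

void run(const std::string& path, const std::string& prompt) {
  std::unique_ptr<Tokenizer> tok = make_tokenizer(32000, 1, 2);
  tok->load(path);                      // dispatches to BPETokenizer::load
  auto ids = tok->encode(prompt, 1, 0); // virtual dispatch as well
  (void)ids;
}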
