Skip to content

Commit 4d1d502

Browse files
larryliu0820 authored and facebook-github-bot committed
Change tokenizer name to bpe_tokenizer and extract a base class (#3009)
Summary: Pull Request resolved: #3009 We want to be able to support more than 1 implementation of tokenizer. Currently `tokenizer.cpp` is adopted from `llama2.c` but we also wanted to support `Tiktoken` (will be added in next PR). This PR extract out a base class `Tokenizer` and make it extendable by different implementations. Reviewed By: mergennachin Differential Revision: D56052583
1 parent 17c64a3 commit 4d1d502

File tree

8 files changed

+75
-36
lines changed

8 files changed

+75
-36
lines changed

docs/source/llm/getting-started.md

Lines changed: 2 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -394,7 +394,7 @@ acceleration and optimization. Any portions of the computation graph not
394394
delegated will be executed by the ExecuTorch operator implementations.
395395

396396
To delegate the exported model to the specific backend, we need to import its
397-
partitioner as well as edge compile config from Executorch Codebase first, then
397+
partitioner as well as edge compile config from ExecuTorch Codebase first, then
398398
call `to_backend` with an instance of partitioner on the `EdgeProgramManager`
399399
object `to_edge` function created.
400400

@@ -482,7 +482,7 @@ target_link_libraries(
482482
```
483483

484484
Keep the rest of the code the same. For more details refer to
485-
[Exporting to Executorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch)
485+
[Exporting to ExecuTorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch)
486486
and
487487
[Invoking the Runtime](https://pytorch.org/executorch/main/llm/getting-started.html#step-2-invoking-the-runtime)
488488
for more details

examples/models/llama2/runner/runner.cpp

Lines changed: 5 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -10,6 +10,7 @@
1010
// The module takes in a string as input and emits a string as output.
1111

1212
#include <executorch/examples/models/llama2/runner/runner.h>
13+
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
1314
#include <executorch/extension/evalue_util/print_evalue.h>
1415
#include <executorch/extension/runner_util/managed_tensor.h>
1516

@@ -76,7 +77,7 @@ Error Runner::load() {
7677
append_eos_ = getMetadataHelper("append_eos_to_prompt", false);
7778

7879
// Load tokenizer
79-
tokenizer_ = std::make_unique<Tokenizer>(vocab_size_, bos_id_, eos_id_);
80+
tokenizer_ = std::make_unique<BPETokenizer>(vocab_size_, bos_id_, eos_id_);
8081
tokenizer_->load(tokenizer_path_);
8182
if (tokenizer_->bos_tok() != bos_id_) {
8283
ET_LOG(
@@ -105,7 +106,7 @@ Error Runner::load() {
105106
}
106107

107108
template <typename T>
108-
T Runner::getMetadataHelper(std::string method_name, T default_val) {
109+
T Runner::getMetadataHelper(const std::string& method_name, T default_val) {
109110
T res = default_val;
110111
if (model_methods_.count(method_name)) {
111112
Result<std::vector<EValue>> outputs = module_->execute(method_name);
@@ -484,9 +485,9 @@ void Runner::stop() {
484485

485486
// explicit instantiation of template methods
486487
template int64_t Runner::getMetadataHelper<int64_t>(
487-
std::string method_name,
488+
const std::string& method_name,
488489
int64_t default_val);
489490
template bool Runner::getMetadataHelper<bool>(
490-
std::string method_name,
491+
const std::string& method_name,
491492
bool default_val);
492493
} // namespace torch::executor

examples/models/llama2/runner/runner.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -69,7 +69,7 @@ class Runner {
6969
private:
7070
// metadata
7171
template <typename T>
72-
T getMetadataHelper(std::string method_name, T default_val);
72+
T getMetadataHelper(const std::string& method_name, T default_val);
7373
template <typename T>
7474
int32_t
7575
logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _);

examples/models/llama2/tokenizer/tokenizer.cpp renamed to examples/models/llama2/tokenizer/bpe_tokenizer.cpp

Lines changed: 10 additions & 10 deletions
Original file line number · Diff line number · Diff line change
@@ -6,7 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9-
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
9+
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
1010

1111
#include <string>
1212

@@ -23,11 +23,11 @@ static int compare_tokens(const void* a, const void* b) {
2323
return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
2424
}
2525

26-
Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
27-
: initialized_(false),
28-
vocab_size_(vocab_size),
29-
bos_tok_(bos_tok),
30-
eos_tok_(eos_tok),
26+
BPETokenizer::BPETokenizer(
27+
int32_t vocab_size,
28+
uint64_t bos_tok,
29+
uint64_t eos_tok)
30+
: Tokenizer(vocab_size, bos_tok, eos_tok),
3131
vocab_(std::make_unique<char*[]>(vocab_size)),
3232
vocab_scores_(std::make_unique<float[]>(vocab_size)),
3333
sorted_vocab_(std::make_unique<TokenIndex[]>(vocab_size)) {
@@ -47,7 +47,7 @@ Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
4747
* @param tokenizer_path The path to the tokenizer file.
4848
* @return Error
4949
*/
50-
Error Tokenizer::load(const std::string& tokenizer_path) {
50+
Error BPETokenizer::load(const std::string& tokenizer_path) {
5151
if (initialized_) {
5252
ET_LOG(Info, "Tokenizer already initialized");
5353
return Error::Ok;
@@ -131,7 +131,7 @@ Error Tokenizer::load(const std::string& tokenizer_path) {
131131
return Error::Ok;
132132
}
133133

134-
Tokenizer::~Tokenizer() {
134+
BPETokenizer::~BPETokenizer() {
135135
for (int i = 0; i < vocab_size_; i++) {
136136
delete[] vocab_[i];
137137
}
@@ -145,7 +145,7 @@ Tokenizer::~Tokenizer() {
145145
* @return Result<std::string> A pointer to the string representation of the
146146
* token.
147147
*/
148-
Result<std::string> Tokenizer::decode(uint64_t prev_token, uint64_t token) {
148+
Result<std::string> BPETokenizer::decode(uint64_t prev_token, uint64_t token) {
149149
if (!initialized_) {
150150
ET_LOG(Error, "Tokenizer not initialized");
151151
return Error::NotSupported;
@@ -187,7 +187,7 @@ str_lookup(const char* str, TokenIndex* sorted_vocab, int32_t vocab_size) {
187187
* @return Result<std::vector<uint64_t>>
188188
*/
189189
Result<std::vector<uint64_t>>
190-
Tokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
190+
BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
191191
if (!initialized_) {
192192
ET_LOG(Error, "Tokenizer not initialized");
193193
return Error::NotSupported;
Lines changed: 42 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
12+
#include <cstdint>
13+
14+
namespace torch {
15+
namespace executor {
16+
17+
struct TokenIndex {
18+
const char* str;
19+
int32_t id;
20+
};
21+
22+
class BPETokenizer : public Tokenizer {
23+
public:
24+
explicit BPETokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
25+
~BPETokenizer() override;
26+
27+
Error load(const std::string& tokenizer_path) override;
28+
29+
Result<std::vector<uint64_t>>
30+
encode(const std::string& input, int8_t bos, int8_t eos) override;
31+
32+
Result<std::string> decode(uint64_t prev_token, uint64_t token) override;
33+
34+
private:
35+
std::unique_ptr<char*[]> vocab_;
36+
std::unique_ptr<float[]> vocab_scores_;
37+
std::unique_ptr<TokenIndex[]> sorted_vocab_;
38+
unsigned int max_token_length_;
39+
unsigned char byte_pieces_[512]; // stores all single-byte strings
40+
};
41+
} // namespace executor
42+
} // namespace torch

examples/models/llama2/tokenizer/targets.bzl

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -4,10 +4,11 @@ def define_common_targets():
44
runtime.cxx_library(
55
name = "tokenizer",
66
srcs = [
7-
"tokenizer.cpp",
7+
"bpe_tokenizer.cpp",
88
],
99
exported_headers = [
1010
"tokenizer.h",
11+
"bpe_tokenizer.h",
1112
],
1213
exported_deps = [
1314
"//executorch/runtime/core/exec_aten:lib",

examples/models/llama2/tokenizer/test/test_tokenizer.cpp

Lines changed: 2 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -6,6 +6,7 @@
66
* LICENSE file in the root directory of this source tree.
77
*/
88

9+
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
910
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
1011
#include <executorch/runtime/platform/runtime.h>
1112
#include <gtest/gtest.h>
@@ -20,7 +21,7 @@ class TokenizerExtensionTest : public Test {
2021
public:
2122
void SetUp() override {
2223
torch::executor::runtime_init();
23-
tokenizer_ = std::make_unique<Tokenizer>(32000, 1, 2);
24+
tokenizer_ = std::make_unique<BPETokenizer>(32000, 1, 2);
2425
modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
2526
}
2627

examples/models/llama2/tokenizer/tokenizer.h

Lines changed: 11 additions & 17 deletions
Original file line number · Diff line number · Diff line change
@@ -26,22 +26,21 @@
2626
namespace torch {
2727
namespace executor {
2828

29-
struct TokenIndex {
30-
const char* str;
31-
int32_t id;
32-
};
33-
3429
class Tokenizer {
3530
public:
36-
explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
37-
~Tokenizer();
31+
explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
32+
: initialized_(false),
33+
vocab_size_(vocab_size),
34+
bos_tok_(bos_tok),
35+
eos_tok_(eos_tok) {}
36+
virtual ~Tokenizer() {}
3837

39-
Error load(const std::string& tokenizer_path);
38+
virtual Error load(const std::string& tokenizer_path) = 0;
4039

41-
Result<std::vector<uint64_t>>
42-
encode(const std::string& input, int8_t bos, int8_t eos);
40+
virtual Result<std::vector<uint64_t>>
41+
encode(const std::string& input, int8_t bos, int8_t eos) = 0;
4342

44-
Result<std::string> decode(uint64_t prev_token, uint64_t token);
43+
virtual Result<std::string> decode(uint64_t prev_token, uint64_t token) = 0;
4544

4645
// getters
4746
int32_t vocab_size() const {
@@ -56,15 +55,10 @@ class Tokenizer {
5655
return eos_tok_;
5756
}
5857

59-
private:
58+
protected:
6059
bool initialized_;
6160
const int32_t vocab_size_;
6261
uint64_t bos_tok_, eos_tok_;
63-
std::unique_ptr<char*[]> vocab_;
64-
std::unique_ptr<float[]> vocab_scores_;
65-
std::unique_ptr<TokenIndex[]> sorted_vocab_;
66-
unsigned int max_token_length_;
67-
unsigned char byte_pieces_[512]; // stores all single-byte strings
6862
};
6963

7064
} // namespace executor

0 commit comments

Comments (0)