Skip to content

Change tokenizer name to bpe_tokenizer and extract a base class #3009

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/llm/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -394,7 +394,7 @@ acceleration and optimization. Any portions of the computation graph not
delegated will be executed by the ExecuTorch operator implementations.

To delegate the exported model to the specific backend, we need to import its
partitioner as well as edge compile config from Executorch Codebase first, then
partitioner as well as edge compile config from ExecuTorch Codebase first, then
call `to_backend` with an instance of partitioner on the `EdgeProgramManager`
object `to_edge` function created.

Expand Down Expand Up @@ -482,7 +482,7 @@ target_link_libraries(
```

Keep the rest of the code the same. For more details refer to
[Exporting to Executorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch)
[Exporting to ExecuTorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch)
and
[Invoking the Runtime](https://pytorch.org/executorch/main/llm/getting-started.html#step-2-invoking-the-runtime).
Expand Down
9 changes: 5 additions & 4 deletions examples/models/llama2/runner/runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/runner/runner.h>
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/runner_util/managed_tensor.h>

Expand Down Expand Up @@ -76,7 +77,7 @@ Error Runner::load() {
append_eos_ = getMetadataHelper("append_eos_to_prompt", false);

// Load tokenizer
tokenizer_ = std::make_unique<Tokenizer>(vocab_size_, bos_id_, eos_id_);
tokenizer_ = std::make_unique<BPETokenizer>(vocab_size_, bos_id_, eos_id_);
tokenizer_->load(tokenizer_path_);
if (tokenizer_->bos_tok() != bos_id_) {
ET_LOG(
Expand Down Expand Up @@ -105,7 +106,7 @@ Error Runner::load() {
}

template <typename T>
T Runner::getMetadataHelper(std::string method_name, T default_val) {
T Runner::getMetadataHelper(const std::string& method_name, T default_val) {
T res = default_val;
if (model_methods_.count(method_name)) {
Result<std::vector<EValue>> outputs = module_->execute(method_name);
Expand Down Expand Up @@ -484,9 +485,9 @@ void Runner::stop() {

// explicit instantiation of template methods
template int64_t Runner::getMetadataHelper<int64_t>(
std::string method_name,
const std::string& method_name,
int64_t default_val);
template bool Runner::getMetadataHelper<bool>(
std::string method_name,
const std::string& method_name,
bool default_val);
} // namespace torch::executor
2 changes: 1 addition & 1 deletion examples/models/llama2/runner/runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class Runner {
private:
// metadata
template <typename T>
T getMetadataHelper(std::string method_name, T default_val);
T getMetadataHelper(const std::string& method_name, T default_val);
template <typename T>
int32_t
logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>

#include <string>

Expand All @@ -23,11 +23,11 @@ static int compare_tokens(const void* a, const void* b) {
return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
}

Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
: initialized_(false),
vocab_size_(vocab_size),
bos_tok_(bos_tok),
eos_tok_(eos_tok),
BPETokenizer::BPETokenizer(
int32_t vocab_size,
uint64_t bos_tok,
uint64_t eos_tok)
: Tokenizer(vocab_size, bos_tok, eos_tok),
vocab_(std::make_unique<char*[]>(vocab_size)),
vocab_scores_(std::make_unique<float[]>(vocab_size)),
sorted_vocab_(std::make_unique<TokenIndex[]>(vocab_size)) {
Expand All @@ -47,7 +47,7 @@ Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
* @param tokenizer_path The path to the tokenizer file.
* @return Error
*/
Error Tokenizer::load(const std::string& tokenizer_path) {
Error BPETokenizer::load(const std::string& tokenizer_path) {
if (initialized_) {
ET_LOG(Info, "Tokenizer already initialized");
return Error::Ok;
Expand Down Expand Up @@ -131,7 +131,7 @@ Error Tokenizer::load(const std::string& tokenizer_path) {
return Error::Ok;
}

Tokenizer::~Tokenizer() {
BPETokenizer::~BPETokenizer() {
for (int i = 0; i < vocab_size_; i++) {
delete[] vocab_[i];
}
Expand All @@ -145,7 +145,7 @@ Tokenizer::~Tokenizer() {
* @return Result<std::string> A pointer to the string representation of the
* token.
*/
Result<std::string> Tokenizer::decode(uint64_t prev_token, uint64_t token) {
Result<std::string> BPETokenizer::decode(uint64_t prev_token, uint64_t token) {
if (!initialized_) {
ET_LOG(Error, "Tokenizer not initialized");
return Error::NotSupported;
Expand Down Expand Up @@ -187,7 +187,7 @@ str_lookup(const char* str, TokenIndex* sorted_vocab, int32_t vocab_size) {
* @return Result<std::vector<uint64_t>>
*/
Result<std::vector<uint64_t>>
Tokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
if (!initialized_) {
ET_LOG(Error, "Tokenizer not initialized");
return Error::NotSupported;
Expand Down
42 changes: 42 additions & 0 deletions examples/models/llama2/tokenizer/bpe_tokenizer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <cstdint>
#include <memory>
#include <string>
#include <vector>

namespace torch {
namespace executor {

// Pairs a vocabulary string with its token id. Kept in a sorted array
// (sorted_vocab_) so tokens can be looked up by string via binary search
// (see compare_tokens/str_lookup in bpe_tokenizer.cpp).
struct TokenIndex {
const char* str;
int32_t id;
};

// Byte-pair-encoding (BPE) tokenizer, the concrete implementation extracted
// from the former Tokenizer class. Loads a binary vocabulary file via load(),
// then converts between text and token ids with encode()/decode().
// NOTE(review): presumably the vocab file follows the llama2.c binary layout —
// confirm against bpe_tokenizer.cpp's load() parsing.
class BPETokenizer : public Tokenizer {
public:
// vocab_size: number of entries to allocate for the vocab tables.
// bos_tok/eos_tok: ids of the begin/end-of-sequence tokens, forwarded to the
// Tokenizer base class.
explicit BPETokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
// Frees the per-token C strings owned by vocab_ (see the .cpp destructor).
~BPETokenizer() override;

// Reads the vocabulary/scores from the file at tokenizer_path.
// Returns Error::Ok on success; idempotent once initialized.
Error load(const std::string& tokenizer_path) override;

// Tokenizes `input`; bos/eos control whether begin/end-of-sequence tokens
// are prepended/appended. Fails with NotSupported before load().
Result<std::vector<uint64_t>>
encode(const std::string& input, int8_t bos, int8_t eos) override;

// Maps `token` back to its string piece; prev_token is needed for
// BPE-specific handling of leading whitespace after BOS.
Result<std::string> decode(uint64_t prev_token, uint64_t token) override;

private:
std::unique_ptr<char*[]> vocab_; // owned C strings, one per token id
std::unique_ptr<float[]> vocab_scores_; // merge scores parallel to vocab_
std::unique_ptr<TokenIndex[]> sorted_vocab_; // vocab_ sorted for binary search
unsigned int max_token_length_;
unsigned char byte_pieces_[512]; // stores all single-byte strings
};
} // namespace executor
} // namespace torch
3 changes: 2 additions & 1 deletion examples/models/llama2/tokenizer/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ def define_common_targets():
runtime.cxx_library(
name = "tokenizer",
srcs = [
"tokenizer.cpp",
"bpe_tokenizer.cpp",
],
exported_headers = [
"tokenizer.h",
"bpe_tokenizer.h",
],
exported_deps = [
"//executorch/runtime/core/exec_aten:lib",
Expand Down
3 changes: 2 additions & 1 deletion examples/models/llama2/tokenizer/test/test_tokenizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* LICENSE file in the root directory of this source tree.
*/

#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
#include <executorch/runtime/platform/runtime.h>
#include <gtest/gtest.h>
Expand All @@ -20,7 +21,7 @@ class TokenizerExtensionTest : public Test {
public:
void SetUp() override {
torch::executor::runtime_init();
tokenizer_ = std::make_unique<Tokenizer>(32000, 1, 2);
tokenizer_ = std::make_unique<BPETokenizer>(32000, 1, 2);
modelPath_ = std::getenv("RESOURCES_PATH") + std::string("/test.bin");
}

Expand Down
28 changes: 11 additions & 17 deletions examples/models/llama2/tokenizer/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,21 @@
namespace torch {
namespace executor {

struct TokenIndex {
const char* str;
int32_t id;
};

class Tokenizer {
public:
explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok);
~Tokenizer();
explicit Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
: initialized_(false),
vocab_size_(vocab_size),
bos_tok_(bos_tok),
eos_tok_(eos_tok) {}
virtual ~Tokenizer() {}

Error load(const std::string& tokenizer_path);
virtual Error load(const std::string& tokenizer_path) = 0;

Result<std::vector<uint64_t>>
encode(const std::string& input, int8_t bos, int8_t eos);
virtual Result<std::vector<uint64_t>>
encode(const std::string& input, int8_t bos, int8_t eos) = 0;

Result<std::string> decode(uint64_t prev_token, uint64_t token);
virtual Result<std::string> decode(uint64_t prev_token, uint64_t token) = 0;

// getters
int32_t vocab_size() const {
Expand All @@ -56,15 +55,10 @@ class Tokenizer {
return eos_tok_;
}

private:
protected:
bool initialized_;
const int32_t vocab_size_;
uint64_t bos_tok_, eos_tok_;
std::unique_ptr<char*[]> vocab_;
std::unique_ptr<float[]> vocab_scores_;
std::unique_ptr<TokenIndex[]> sorted_vocab_;
unsigned int max_token_length_;
unsigned char byte_pieces_[512]; // stores all single-byte strings
};

} // namespace executor
Expand Down