
Commit 864e0b0

helunwencser authored and facebook-github-bot committed
Implement runner for phi-3-mini (#4500)
Summary: This PR mainly does the following things:
- implement a runner for phi-3-mini
- switch phi-3-mini to the BPE tokenizer, which is now shared across all LLMs
- fix a small bug in the BPE tokenizer

Pull Request resolved: #4500

Test Plan:
```
./build/phi_3_mini_runner --model_path phi-3-mini-kv-128.pte --tokenizer_path tokenizer.bin --prompt "Tell me a story" --temperature 0
Prefilling tokens ...
24948 592 263 5828
Generating tokens ...
about a time when you had to overcome a challenge. I remember when I was in high school, I had to prepare for a big exam that would determine my future. I had to study hard, but I also had to balance my schoolwork, my hobbies, and my social life. It was not easy, but I managed to do it. I made a study schedule, set goals, and rewarded myself for my achievements. I also asked for help from my teachers, friends, and family. I faced many difficulties, but I never gave up. I passed the exam with flying colors and
```

Reviewed By: larryliu0820

Differential Revision: D60609165

Pulled By: helunwencser

fbshipit-source-id: 9abab0ba8ea8e50559272c6001fa868e49f40a96
1 parent 14c2473 commit 864e0b0
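
Not part of the commit itself: a minimal build sketch for producing the `phi_3_mini_runner` binary invoked in the Test Plan, assuming the example is configured from its own directory of an ExecuTorch checkout with submodules initialized; the exact configure step and flags may differ in practice.

```
# Hypothetical build flow -- adjust paths and flags to your checkout.
cd examples/models/phi-3-mini
# XNNPACK and the optimized kernels default to ON in this example's CMakeLists.txt.
cmake -S . -B build -DCMAKE_BUILD_TYPE=Release
cmake --build build -j8 --target phi_3_mini_runner
```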

6 files changed (+215, −126 lines)


examples/models/phi-3-mini/CMakeLists.txt

Lines changed: 29 additions & 16 deletions
```diff
@@ -4,6 +4,15 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+# ### Editing this file ###
+#
+# This file should be formatted with
+# ~~~
+# cmake-format -i CMakeLists.txt
+# ~~~
+# It should also be cmake-lint clean.
+#
+
 cmake_minimum_required(VERSION 3.19)
 project(phi_3_mini_runner)
 
@@ -18,22 +27,26 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "" ON)
 option(EXECUTORCH_BUILD_XNNPACK "" ON)
 
 add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../..
-  ${CMAKE_BINARY_DIR}/../../..)
-add_subdirectory(
-  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/sentencepiece
-  ${CMAKE_BINARY_DIR}/sentencepiece)
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../.. ${CMAKE_BINARY_DIR}/../../..
+)
+if(NOT TARGET gflags)
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags
+    ${CMAKE_BINARY_DIR}/gflags
+  )
+endif()
 
-add_executable(phi_3_mini_runner main.cpp)
+add_executable(
+  phi_3_mini_runner
+  main.cpp runner.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/sampler/sampler.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/tokenizer/bpe_tokenizer.cpp
+)
 target_include_directories(
-  phi_3_mini_runner
-  PUBLIC
-    ${CMAKE_CURRENT_SOURCE_DIR}/../../../extension/llm/third-party/sentencepiece/src)
+  phi_3_mini_runner
+  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/../../../third-party/gflags/src
+)
 target_link_libraries(
-  phi_3_mini_runner
-  PRIVATE
-    executorch
-    extension_module_static
-    optimized_native_cpu_ops_lib
-    xnnpack_backend
-    sentencepiece)
+  phi_3_mini_runner PRIVATE executorch extension_module_static
+                            optimized_native_cpu_ops_lib xnnpack_backend gflags
+)
```

examples/models/phi-3-mini/main.cpp

Lines changed: 26 additions & 66 deletions
```diff
@@ -6,85 +6,45 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// main.cpp
+#include <gflags/gflags.h>
 
-#include <iostream>
+#include <executorch/examples/models/phi-3-mini/runner.h>
 
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/runner_util/managed_tensor.h>
+DEFINE_string(
+    model_path,
+    "phi-3-mini.pte",
+    "File path for model serialized in flatbuffer format.");
 
-#include "sentence_piece_tokenizer.h"
+DEFINE_string(tokenizer_path, "tokenizer.bin", "File path for tokenizer.");
 
-using namespace torch::executor;
+DEFINE_string(prompt, "Tell me a story", "Prompt.");
 
-// The value of the phi-3-mini `<|endoftext|>` token.
-#define ENDOFTEXT_TOKEN 32000
-#define VOCABULARY_SIZE 32064
+DEFINE_double(
+    temperature,
+    0.8f,
+    "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
 
-// TODO(lunwenh): refactor and share with llama
-void generate(
-    Module& llm_model,
-    std::string& prompt,
-    SentencePieceTokenizer& tokenizer,
-    size_t max_output_length) {
-  // Convert the input text into a list of integers (tokens) that represents
-  // it, using the string-to-token mapping that the model was trained on.
-  // Each token is an integer that represents a word or part of a word.
-  std::vector<int64_t> input_tokens = tokenizer.encode(prompt);
+DEFINE_int32(
+    seq_len,
+    128,
+    "Total number of tokens to generate (prompt + output).");
 
-  std::cout << "Generating tokens ..." << std::endl;
+int main(int32_t argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
 
-  std::vector<int64_t> output_tokens;
+  const char* model_path = FLAGS_model_path.c_str();
 
-  for (size_t i = 0; i < max_output_length; i++) {
-    ManagedTensor tensor_tokens(
-        input_tokens.data(),
-        {1, static_cast<int>(input_tokens.size())},
-        ScalarType::Long);
-    std::vector<EValue> inputs = {tensor_tokens.get_aliasing_tensor()};
+  const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
 
-    Result<std::vector<EValue>> result_evalue = llm_model.forward(inputs);
+  const char* prompt = FLAGS_prompt.c_str();
 
-    const auto error = result_evalue.error();
-    Tensor logits_tensor = result_evalue.get()[0].toTensor();
-    const auto sentence_length = logits_tensor.size(1);
-    std::vector<float> logits(
-        logits_tensor.data_ptr<float>() +
-            (sentence_length - 1) * VOCABULARY_SIZE,
-        logits_tensor.data_ptr<float>() + sentence_length * VOCABULARY_SIZE);
+  double temperature = FLAGS_temperature;
 
-    // Sample the next token from the logits.
-    int64_t next_token =
-        std::max_element(logits.begin(), logits.end()) - logits.begin();
+  int32_t seq_len = FLAGS_seq_len;
 
-    std::cout << next_token << "\t";
-    std::cout.flush();
+  ::torch::executor::Runner runner(model_path, tokenizer_path, temperature);
 
-    // Break if we reached the end of the text.
-    if (next_token == ENDOFTEXT_TOKEN) {
-      break;
-    }
+  runner.generate(prompt, seq_len);
 
-    output_tokens.push_back(next_token);
-
-    // Update next input.
-    input_tokens.push_back(next_token);
-  }
-
-  std::cout << std::endl;
-  std::cout << tokenizer.decode(output_tokens) << std::endl;
-}
-
-int main() {
-  // Set up the prompt. This provides the seed text for the model to elaborate.
-  std::cout << "Enter model prompt: ";
-  std::string prompt;
-  std::getline(std::cin, prompt);
-
-  SentencePieceTokenizer tokenizer("tokenizer.model");
-
-  Module model("phi-3-mini.pte", Module::LoadMode::MmapUseMlockIgnoreErrors);
-
-  const auto max_output_tokens = 128;
-  generate(model, prompt, tokenizer, max_output_tokens);
+  return 0;
 }
```
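
The gflags-based entry point above exposes every knob on the command line; a usage sketch with all flags written out explicitly (values taken from the Test Plan, and `--seq_len` shown at its declared default of 128):

```
./build/phi_3_mini_runner \
  --model_path phi-3-mini-kv-128.pte \
  --tokenizer_path tokenizer.bin \
  --prompt "Tell me a story" \
  --temperature 0 \
  --seq_len 128
```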

examples/models/phi-3-mini/runner.cpp

Lines changed: 109 additions & 0 deletions
```diff
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/examples/models/phi-3-mini/runner.h>
+
+#include <ctime>
+#include <iostream>
+
+#include <executorch/extension/llm/tokenizer/bpe_tokenizer.h>
+#include <executorch/extension/runner_util/managed_tensor.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace torch::executor {
+
+#define SAMPLER_TOP 0.9f
+#define ENDOFTEXT_TOKEN 32000
+#define VOCABULARY_SIZE 32064
+
+Runner::Runner(
+    const std::string& model_path,
+    const std::string& tokenizer_path,
+    const float temperature)
+    : module_(std::make_unique<Module>(model_path, Module::LoadMode::File)),
+      tokenizer_(std::make_unique<BPETokenizer>()),
+      sampler_(std::make_unique<Sampler>(
+          VOCABULARY_SIZE,
+          temperature,
+          SAMPLER_TOP,
+          static_cast<unsigned long long>(std::time(nullptr)))) {
+  ET_CHECK_MSG(
+      tokenizer_->load(tokenizer_path) == Error::Ok,
+      "Failed to load tokenizer at %s",
+      tokenizer_path.c_str());
+  ET_LOG(
+      Info,
+      "Created Phi-3-mini runner: model_path=%s, tokenizer_path=%s",
+      model_path.c_str(),
+      tokenizer_path.c_str());
+}
+
+void Runner::generate(const std::string& prompt, std::size_t max_seq_len) {
+  auto encode_res = tokenizer_->encode(prompt, 0, 0);
+  ET_CHECK_MSG(
+      encode_res.error() == Error::Ok, "Failed to encode %", prompt.c_str());
+  auto input_tokens = encode_res.get();
+
+  std::cout << "Prefilling tokens ..." << std::endl;
+  for (auto token : input_tokens) {
+    std::cout << token << " ";
+  }
+  std::cout << std::endl;
+  std::cout.flush();
+  auto prev_token = input_tokens.back();
+  auto current_token = prefill(input_tokens);
+
+  std::cout << "Generating tokens ..." << std::endl;
+  std::cout << tokenizer_->decode(prev_token, current_token).get();
+  std::cout.flush();
+
+  std::size_t seq_len = input_tokens.size() + 1;
+
+  while (current_token != ENDOFTEXT_TOKEN && seq_len < max_seq_len) {
+    prev_token = current_token;
+    current_token = run_model_step(current_token);
+    std::cout << tokenizer_->decode(prev_token, current_token).get();
+    std::cout.flush();
+
+    ++seq_len;
+  }
+
+  std::cout << std::endl;
+}
+
+uint64_t Runner::logits_to_token(const exec_aten::Tensor& logits_tensor) {
+  return sampler_->sample(logits_tensor.data_ptr<float>());
+}
+
+uint64_t Runner::prefill(std::vector<uint64_t>& tokens) {
+  ManagedTensor input_tokens(
+      tokens.data(),
+      {1, static_cast<exec_aten::SizesType>(tokens.size())},
+      ScalarType::Long);
+  std::vector<EValue> inputs = {input_tokens.get_aliasing_tensor()};
+
+  auto result = module_->forward(inputs);
+  ET_CHECK_MSG(result.error() == Error::Ok, "Failed to prefill tokens");
+
+  return logits_to_token(result.get()[0].toTensor());
+}
+
+uint64_t Runner::run_model_step(uint64_t token) {
+  ManagedTensor input_token(&token, {1, 1}, ScalarType::Long);
+  std::vector<EValue> inputs = {input_token.get_aliasing_tensor()};
+
+  auto result = module_->forward(inputs);
+  ET_CHECK_MSG(
+      result.error() == Error::Ok,
+      "Failed to run forward() for token %" PRIu64,
+      token);
+
+  return logits_to_token(result.get()[0].toTensor());
+}
+
+} // namespace torch::executor
```

examples/models/phi-3-mini/runner.h

Lines changed: 50 additions & 0 deletions
```diff
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// A simple phi-3-mini runner that includes preprocessing and post processing
+// logic. The module takes in a string as input and emits a string as output.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <executorch/extension/llm/sampler/sampler.h>
+#include <executorch/extension/llm/tokenizer/tokenizer.h>
+#include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+
+namespace torch::executor {
+
+class Runner {
+ public:
+  explicit Runner(
+      const std::string& model_path,
+      const std::string& tokenizer_path,
+      const float temperature = 0.8f);
+
+  /**
+   * Generates response for a given prompt.
+   *
+   * @param[in] prompt The prompt to generate a response for.
+   * @param[in] max_seq_len The maximum length of the sequence to generate,
+   * including prompt.
+   */
+  void generate(const std::string& prompt, std::size_t max_seq_len);
+
+ private:
+  uint64_t logits_to_token(const exec_aten::Tensor& logits_tensor);
+  uint64_t prefill(std::vector<uint64_t>& tokens);
+  uint64_t run_model_step(uint64_t token);
+
+  std::unique_ptr<Module> module_;
+  std::unique_ptr<Tokenizer> tokenizer_;
+  std::unique_ptr<Sampler> sampler_;
+};
+
+} // namespace torch::executor
```

examples/models/phi-3-mini/sentence_piece_tokenizer.h

Lines changed: 0 additions & 43 deletions
This file was deleted.

extension/llm/tokenizer/bpe_tokenizer.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -190,7 +190,7 @@ BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) const {
   std::vector<uint64_t> tokens;
 
   // add optional BOS token, if desired
-  if (bos > 0) {
+  if (bos >= 0) {
     while (bos--) {
       tokens.push_back(bos_tok_);
     }
```