Introduce GenerationConfig #10228

Merged 1 commit on Apr 18, 2025
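
The hunks below move the runners to a config-based generate() API. The GenerationConfig struct itself is defined under extension/llm/runner and is not shown in this diff; the C++ sketch below is reconstructed only from the fields and the resolve_max_new_tokens() helper that the hunks reference, so the member order and defaults are assumptions.

#include <cstdint>

// Sketch only: inferred from the call sites in this PR, not copied from the
// real header. Defaults are assumptions.
namespace executorch {
namespace extension {
namespace llm {

struct GenerationConfig {
  // Echo the prompt back through the token callback before generation.
  bool echo = true;
  // Hard cap on newly generated tokens; -1 means "derive it from seq_len and
  // the model's max context length".
  int32_t max_new_tokens = -1;
  // True for warmup runs: suppresses printing and the stats report.
  bool warming = false;
  // Total sequence budget (prompt + generated tokens); -1 means "use the max
  // context length".
  int32_t seq_len = -1;
  // Sampling temperature handed to the token generator.
  float temperature = 0.8f;

  // Resolves the effective number of new tokens from the context budget and
  // the prompt length (used in examples/models/llama/runner/runner.cpp below).
  int32_t resolve_max_new_tokens(
      int32_t max_context_len,
      int32_t num_prompt_tokens) const;
};

} // namespace llm
} // namespace extension
} // namespace executorch
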
8 changes: 6 additions & 2 deletions CMakeLists.txt
@@ -761,12 +761,16 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor)
endif()

if(EXECUTORCH_BUILD_EXTENSION_MODULE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
endif()

if(EXECUTORCH_BUILD_EXTENSION_LLM)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizers)
endif()

if(EXECUTORCH_BUILD_EXTENSION_MODULE)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/runner)
endif()

if(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL)

@@ -12,6 +12,7 @@
#import <executorch/examples/models/llama/runner/runner.h>
#import <executorch/examples/models/llava/runner/llava_runner.h>

using executorch::extension::llm::GenerationConfig;
using executorch::extension::llm::Image;
using executorch::runtime::Error;

@@ -61,8 +62,11 @@ - (BOOL)generate:(NSString*)prompt
sequenceLength:(NSInteger)seq_len
withTokenCallback:(nullable void (^)(NSString*))callback
error:(NSError**)error {
const GenerationConfig config{
.seq_len = static_cast<int32_t>(seq_len)
};
const auto status = _runner->generate(
prompt.UTF8String, seq_len, [callback](const std::string& token) {
prompt.UTF8String, config, [callback](const std::string& token) {
callback(@(token.c_str()));
});
if (status != Error::Ok) {

6 changes: 2 additions & 4 deletions examples/mediatek/executor_runner/mtk_llama_runner.cpp
@@ -80,11 +80,9 @@ bool MTKLlamaRunner::is_loaded() const {

Error MTKLlamaRunner::generate(
const std::string& prompt,
int32_t seq_len,
executorch::extension::llm::GenerationConfig config,
std::function<void(const std::string&)> token_callback,
std::function<void(const Stats&)> stats_callback,
bool echo,
bool warming) {
std::function<void(const Stats&)> stats_callback) {
if (!is_loaded()) {
ET_CHECK_OK_OR_RETURN_ERROR(load());
}

6 changes: 2 additions & 4 deletions examples/mediatek/executor_runner/mtk_llama_runner.h
@@ -43,11 +43,9 @@ class MTKLlamaRunner : public executorch::extension::llm::IRunner {
Error load();
Error generate(
const std::string& prompt,
int32_t seq_len = 128,
executorch::extension::llm::GenerationConfig config,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const Stats&)> stats_callback = {},
bool echo = true,
bool warming = false);
std::function<void(const Stats&)> stats_callback = {});
void stop();

LlamaModelOptions get_model_options();
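
MTKLlamaRunner above and the llama example Runner further down both implement executorch::extension::llm::IRunner, so the interface itself presumably moved to the config-based signature as well. That header is not part of this diff; the sketch below is inferred from the overrides in this PR (the llama Runner passes the config by const reference and marks its methods override), and details may differ.

#include <functional>
#include <string>

// Assumed shape of the updated IRunner interface; not copied from
// extension/llm/runner. Error, Stats and GenerationConfig come from the real
// ExecuTorch headers.
namespace executorch {
namespace extension {
namespace llm {

class IRunner {
 public:
  virtual ~IRunner() = default;
  virtual bool is_loaded() const = 0;
  virtual ::executorch::runtime::Error load() = 0;
  virtual ::executorch::runtime::Error generate(
      const std::string& prompt,
      const GenerationConfig& config,
      std::function<void(const std::string&)> token_callback = {},
      std::function<void(const Stats&)> stats_callback = {}) = 0;
  virtual void stop() = 0;
};

} // namespace llm
} // namespace extension
} // namespace executorch
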
13 changes: 9 additions & 4 deletions examples/models/llama/main.cpp
@@ -53,7 +53,7 @@ int32_t main(int32_t argc, char** argv) {

const char* prompt = FLAGS_prompt.c_str();

double temperature = FLAGS_temperature;
float temperature = FLAGS_temperature;

int32_t seq_len = FLAGS_seq_len;

@@ -73,13 +73,18 @@ int32_t main(int32_t argc, char** argv) {
}
#endif
// create llama runner
example::Runner runner(model_path, tokenizer_path, temperature);
// @lint-ignore CLANGTIDY facebook-hte-Deprecated
example::Runner runner(model_path, tokenizer_path);

if (warmup) {
runner.warmup(prompt, seq_len);
// @lint-ignore CLANGTIDY facebook-hte-Deprecated
runner.warmup(prompt, /*max_new_tokens=*/seq_len);
Contributor: Would it be added in the internal runner as well?

Contributor Author: which internal runner?

}
// generate
runner.generate(prompt, seq_len);
executorch::extension::llm::GenerationConfig config{
.seq_len = seq_len, .temperature = temperature};
// @lint-ignore CLANGTIDY facebook-hte-Deprecated
runner.generate(prompt, config);

return 0;
}
92 changes: 50 additions & 42 deletions examples/models/llama/runner/runner.cpp
@@ -41,13 +41,11 @@ static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
Runner::Runner(
const std::string& model_path,
const std::string& tokenizer_path,
const float temperature,
std::optional<const std::string> data_path)
// NOTE: we observed ~2x loading performance increase on iPhone 15
// and a ~5% improvement on Galaxy S22 by switching to
// FileDataLoader instead of MmapDataLoader + UseMlockIgnoreErrors.
: temperature_(temperature),
tokenizer_path_(tokenizer_path),
: tokenizer_path_(tokenizer_path),
metadata_({
{kEnableDynamicShape, false},
{kMaxSeqLen, 128},
Expand All @@ -68,6 +66,17 @@ Runner::Runner(
tokenizer_path.c_str());
}

[[deprecated(
"This constructor is deprecated. Use the constructor without temperature parameter instead.")]]
Runner::Runner(
const std::string& model_path,
const std::string& tokenizer_path,
const float temperature,
std::optional<const std::string> data_path)
: Runner(model_path, tokenizer_path, std::move(data_path)) {
temperature_ = temperature;
}

bool Runner::is_loaded() const {
return module_->is_loaded() && tokenizer_ && text_decoder_runner_ &&
text_prefiller_ && text_token_generator_;
@@ -133,11 +142,9 @@ Error Runner::load() {
ET_LOG(Info, "eos_id = %" PRId64, value);
}
}
// @lint-ignore CLANGTIDY facebook-hte-Deprecated
text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
module_.get(),
metadata_.at(kUseKVCache),
metadata_.at(kVocabSize),
temperature_);
module_.get(), metadata_.at(kUseKVCache));
text_prefiller_ = std::make_unique<llm::TextPrefiller>(
text_decoder_runner_.get(),
metadata_.at(kUseKVCache),
@@ -164,11 +171,9 @@

Error Runner::generate(
const std::string& prompt,
int32_t seq_len,
const ::executorch::extension::llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback,
std::function<void(const llm::Stats&)> stats_callback,
bool echo,
bool warmup) {
std::function<void(const llm::Stats&)> stats_callback) {
// Prepare the inputs.
// Use ones-initialized inputs.
ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
@@ -178,19 +183,19 @@ Error Runner::generate(
stats_.model_load_end_ms = llm::time_in_ms();
}

if (warmup) {
if (config.warming) {
ET_LOG(Info, "Doing a warmup run...");
}

RUNNER_ET_LOG(
warmup,
config.warming,
"RSS after loading model: %f MiB (0 if unsupported)",
llm::get_rss_bytes() / 1024.0 / 1024.0);

// Wrap the token_callback with print function
std::function<void(const std::string&)> wrapped_callback =
[token_callback, warmup](const std::string& piece) {
if (!warmup) {
[token_callback, config](const std::string& piece) {
if (!config.warming) {
llm::safe_printf(piece.c_str());
fflush(stdout);
}
@@ -204,11 +209,6 @@ Error Runner::generate(
stats_.inference_start_ms = llm::time_in_ms();
shouldStop_ = false;

// Set the sequence length to the max seq length if not provided
seq_len = (seq_len > 0 && seq_len <= metadata_.at(kMaxContextLen))
? seq_len
: metadata_.at(kMaxContextLen);

::tokenizers::Result<std::vector<uint64_t>> encode_res = tokenizer_->encode(
prompt,
/* bos */ 0,
@@ -225,21 +225,22 @@
ET_CHECK_MSG(
num_prompt_tokens < metadata_.at(kMaxContextLen),
"num_prompt_tokens %d >= max_seq_len_ %" PRId64
", Max seq length exceeded - please increase max seq len value in .../llama2/model.py",
", Max seq length exceeded - please increase max seq len value in your export script",
num_prompt_tokens,
metadata_.at(kMaxContextLen));
ET_CHECK_MSG(
num_prompt_tokens < seq_len,
"num_prompt_tokens %d >= seq_len %d, Sequence length exceeded - please increase the seq_len value passed to generate()",
num_prompt_tokens,
seq_len);

// Determine max_new_tokens using the GenerationConfig's resolve method
int max_new_tokens = config.resolve_max_new_tokens(
metadata_.at(kMaxContextLen), num_prompt_tokens);

ET_LOG(Info, "Max new tokens resolved: %d", max_new_tokens);

// Prefill first
// Here feed all tokens to the model and get the next predicted token
// after the prompt. After that we will enter generate loop.

// print prompts
if (echo) {
if (config.echo) {
wrapped_callback(prompt);
}
int64_t pos = 0;
@@ -253,32 +254,38 @@ Error Runner::generate(
wrapped_callback(
ET_UNWRAP_TOKENIZER(tokenizer_->decode(cur_token, cur_token)));
RUNNER_ET_LOG(
warmup,
config.warming,
"RSS after prompt prefill: %f MiB (0 if unsupported)",
llm::get_rss_bytes() / 1024.0 / 1024.0);

// start the main loop
prompt_tokens.push_back(cur_token);

// Generate max_new_tokens - 1 because prefill already generated 1 token.
int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
prompt_tokens, num_prompt_tokens, seq_len, wrapped_callback));
prompt_tokens,
num_prompt_tokens,
max_new_tokens - 1,
temperature_ == -1.0f ? config.temperature : temperature_,
wrapped_callback));

stats_.inference_end_ms = llm::time_in_ms();
if (!warmup) {
if (!config.warming) {
printf("\n");
}
RUNNER_ET_LOG(
warmup,
config.warming,
"RSS after finishing text generation: %f MiB (0 if unsupported)",
llm::get_rss_bytes() / 1024.0 / 1024.0);

if (num_prompt_tokens + num_generated_tokens == seq_len) {
RUNNER_ET_LOG(warmup, "Sequence length (%i tokens) reached!", seq_len);
if (num_generated_tokens == max_new_tokens) {
RUNNER_ET_LOG(config.warming, "Max new tokens %i reached!", max_new_tokens);
}

stats_.num_prompt_tokens = num_prompt_tokens;
stats_.num_generated_tokens = num_generated_tokens;

if (warmup) {
if (config.warming) {
ET_LOG(Info, "Warmup run finished!");
} else {
// Do not print report during warmup
Expand All @@ -291,14 +298,15 @@ Error Runner::generate(
return Error::Ok;
}

Error Runner::warmup(const std::string& prompt, int32_t seq_len) {
Error err = generate(
prompt,
seq_len,
/*token_callback=*/nullptr,
/*stats_callbak=*/nullptr,
/*echo=*/false,
/*warmup=*/true);
Error Runner::warmup(const std::string& prompt, int32_t max_new_tokens) {
// Create a GenerationConfig for warmup
llm::GenerationConfig config{
.echo = false, .max_new_tokens = max_new_tokens, .warming = true};

// Call generate with the warmup config
Error err = generate(prompt, config);

// Reset stats after warmup
stats_.reset();
return err;
}
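
The generate() hunks above replace the old seq_len clamping with config.resolve_max_new_tokens(max_context_len, num_prompt_tokens), whose implementation is not part of this diff. A plausible version, consistent with how max_new_tokens is consumed above and with the GenerationConfig sketch near the top of this page, is:

#include <algorithm>
#include <cstdint>

// Hypothetical implementation; the real helper ships with GenerationConfig in
// extension/llm/runner and may handle edge cases differently. Assumes the
// GenerationConfig declaration sketched above is in scope.
int32_t executorch::extension::llm::GenerationConfig::resolve_max_new_tokens(
    int32_t max_context_len,
    int32_t num_prompt_tokens) const {
  // Tokens left in the context window once the prompt is accounted for.
  int32_t context_budget = max_context_len - num_prompt_tokens;
  // An explicit total sequence length tightens that budget further.
  int32_t seq_len_budget = (seq_len > 0 && seq_len <= max_context_len)
      ? seq_len - num_prompt_tokens
      : context_budget;
  // An explicit max_new_tokens acts as a hard cap on top of both.
  int32_t resolved = (max_new_tokens > 0)
      ? std::min(max_new_tokens, seq_len_budget)
      : seq_len_budget;
  // Never report a negative budget.
  return std::max(resolved, 0);
}
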
28 changes: 18 additions & 10 deletions examples/models/llama/runner/runner.h
@@ -33,26 +33,30 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {
explicit Runner(
const std::string& model_path,
const std::string& tokenizer_path,
const float temperature = 0.8f,
std::optional<const std::string> data_path = std::nullopt);

bool is_loaded() const;
::executorch::runtime::Error load();
[[deprecated(
"This constructor is deprecated. Use the constructor without temperature parameter instead.")]]
explicit Runner(
const std::string& model_path,
const std::string& tokenizer_path,
const float temperature,
std::optional<const std::string> data_path = std::nullopt);

bool is_loaded() const override;
::executorch::runtime::Error load() override;
::executorch::runtime::Error generate(
const std::string& prompt,
int32_t seq_len = 128,
const ::executorch::extension::llm::GenerationConfig& config,
std::function<void(const std::string&)> token_callback = {},
std::function<void(const ::executorch::extension::llm::Stats&)>
stats_callback = {},
bool echo = true,
bool warming = false);
stats_callback = {}) override;
::executorch::runtime::Error warmup(
const std::string& prompt,
int32_t seq_len = 128);
void stop();
int32_t max_new_tokens);
void stop() override;

private:
float temperature_;
bool shouldStop_{false};

// model
Expand All @@ -68,6 +72,10 @@ class ET_EXPERIMENTAL Runner : public executorch::extension::llm::IRunner {

// stats
::executorch::extension::llm::Stats stats_;

// temperature.
// Deprecated, we should rely on the temperature in GenerationConfig instead.
float temperature_ = -1.0f;
};

} // namespace example
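
Taken together with main.cpp above, the caller-side migration looks roughly like the sketch below. The helper function, prompt handling, and values are placeholders; the include path matches the import used by the Objective-C bridge earlier in this diff.

#include <executorch/examples/models/llama/runner/runner.h>

#include <cstdio>
#include <string>

// Before/after sketch of the calling convention; not part of this PR.
void run_example(
    const std::string& model_path,
    const std::string& tokenizer_path,
    const std::string& prompt) {
  // Before: temperature was fixed at construction and seq_len was passed
  // straight to generate().
  //   example::Runner runner(model_path, tokenizer_path, /*temperature=*/0.8f);
  //   runner.generate(prompt, /*seq_len=*/128);

  // After: the runner carries no sampling state and each call brings its own
  // GenerationConfig.
  example::Runner runner(model_path, tokenizer_path);

  executorch::extension::llm::GenerationConfig config;
  config.seq_len = 128;        // total budget: prompt plus generated tokens
  config.max_new_tokens = 64;  // optional hard cap on generated tokens
  config.temperature = 0.8f;   // now a per-call setting

  const auto status = runner.generate(
      prompt, config, [](const std::string& piece) {
        printf("%s", piece.c_str());  // stream tokens as they arrive
      });
  if (status != executorch::runtime::Error::Ok) {
    fprintf(stderr, "generate() failed\n");
  }
}
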
12 changes: 9 additions & 3 deletions examples/models/llava/runner/llava_runner.cpp
@@ -47,8 +47,10 @@ Error LlavaRunner::load() {
tokenizer_->load(tokenizer_path_);

// Load the text decoder runner
text_decoder_runner_ = std::make_unique<LlavaTextDecoderRunner>(
module_.get(), tokenizer_->vocab_size(), temperature_);
text_decoder_runner_ =
// @lint-ignore CLANGTIDY facebook-hte-Deprecated
std::make_unique<LlavaTextDecoderRunner>(module_.get());
// @lint-ignore CLANGTIDY facebook-hte-Deprecated
text_decoder_runner_->load();

// Load the text prefiller
@@ -117,7 +119,11 @@ Error LlavaRunner::generate_from_pos(

// Generate tokens
int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
{prefill_next_token}, start_pos, seq_len, token_callback));
/*tokens=*/{prefill_next_token},
/*start_pos=*/start_pos,
/*max_new_tokens=*/seq_len - start_pos + 1,
/*temperature=*/temperature_,
/*token_callback=*/token_callback));

// Bookkeeping
stats_.num_generated_tokens = num_generated_tokens;

7 changes: 2 additions & 5 deletions examples/models/llava/runner/llava_text_decoder_runner.h
@@ -17,11 +17,8 @@ namespace example {
class ET_EXPERIMENTAL LlavaTextDecoderRunner
: public executorch::extension::llm::TextDecoderRunner {
public:
LlavaTextDecoderRunner(
executorch::extension::Module* module,
int32_t vocab_size,
float temperature)
: TextDecoderRunner(module, true, vocab_size, temperature){};
explicit LlavaTextDecoderRunner(executorch::extension::Module* module)
: TextDecoderRunner(module, true) {}

inline executorch::runtime::Result<executorch::aten::Tensor> step(
executorch::extension::TensorPtr& tokens,