examples : do not use common library in simple example #9803

Merged (2 commits) on Oct 10, 2024

2 changes: 1 addition & 1 deletion examples/simple/CMakeLists.txt
@@ -1,5 +1,5 @@
set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
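
For reference, the resulting examples/simple/CMakeLists.txt after this hunk (reconstructed from the lines above) links the example only against llama and the system thread library, with the common dependency removed:

    set(TARGET llama-simple)
    add_executable(${TARGET} simple.cpp)
    install(TARGETS ${TARGET} RUNTIME)
    target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
    target_compile_features(${TARGET} PRIVATE cxx_std_11)

With a standard CMake build of the repository this produces the llama-simple binary, which, per the new print_usage below, is invoked as: llama-simple -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]
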
224 changes: 128 additions & 96 deletions examples/simple/simple.cpp
@@ -1,50 +1,112 @@
#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"

#include <cstdio>
#include <cstring>
#include <string>
#include <vector>

static void print_usage(int, char ** argv) {
LOG("\nexample usage:\n");
LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
LOG("\n");
printf("\nexample usage:\n");
printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
printf("\n");
}

int main(int argc, char ** argv) {
gpt_params params;

params.prompt = "Hello my name is";
params.n_predict = 32;

if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
// path to the model gguf file
std::string model_path;
// prompt to generate text from
std::string prompt = "Hello my name is";
// number of layers to offload to the GPU
int ngl = 99;
// number of tokens to predict
int n_predict = 32;

// parse command line arguments

{
int i = 1;
for (; i < argc; i++) {
if (strcmp(argv[i], "-m") == 0) {
if (i + 1 < argc) {
model_path = argv[++i];
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-n") == 0) {
if (i + 1 < argc) {
try {
n_predict = std::stoi(argv[++i]);
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else if (strcmp(argv[i], "-ngl") == 0) {
if (i + 1 < argc) {
try {
ngl = std::stoi(argv[++i]);
} catch (...) {
print_usage(argc, argv);
return 1;
}
} else {
print_usage(argc, argv);
return 1;
}
} else {
// prompt starts here
break;
}
}
if (model_path.empty()) {
print_usage(argc, argv);
return 1;
}
if (i < argc) {
prompt = argv[i++];
for (; i < argc; i++) {
prompt += " ";
prompt += argv[i];
}
}
}

gpt_init();

// total length of the sequence including the prompt
const int n_predict = params.n_predict;

// init LLM

llama_backend_init();
llama_numa_init(params.numa);

// initialize the model

llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = ngl;

llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);

if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}

// tokenize the prompt

// find the number of tokens in the prompt
const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);

// allocate space for the tokens and tokenize the prompt
std::vector<llama_token> prompt_tokens(n_prompt);
if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
return 1;
}

// initialize the context

llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
llama_context_params ctx_params = llama_context_default_params();
// n_ctx is the context size
ctx_params.n_ctx = n_prompt + n_predict - 1;
// n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
ctx_params.n_batch = n_prompt;
// enable performance counters
ctx_params.no_perf = false;

llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -53,117 +115,87 @@ int main(int argc, char ** argv) {
return 1;
}

auto sparams = llama_sampler_chain_default_params();
// initialize the sampler

auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;

llama_sampler * smpl = llama_sampler_chain_init(sparams);

llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

// tokenize the prompt

std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(ctx, params.prompt, true);

const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());

LOG("\n");
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}

// print the prompt token-by-token

LOG("\n");

for (auto id : tokens_list) {
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}

// create a llama_batch with size 512
// we use this object to submit token data for decoding

llama_batch batch = llama_batch_init(512, 0, 1);

// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); i++) {
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
for (auto id : prompt_tokens) {
char buf[128];
int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
}

// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;
// prepare a batch for the prompt

if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0);

// main loop

int n_cur = batch.n_tokens;
const auto t_main_start = ggml_time_us();
int n_decode = 0;
llama_token new_token_id;

const auto t_main_start = ggml_time_us();
for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}

n_pos += batch.n_tokens;

while (n_cur <= n_predict) {
// sample the next token
{
const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
new_token_id = llama_sampler_sample(smpl, ctx, -1);

// is it an end of generation?
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
LOG("\n");

if (llama_token_is_eog(model, new_token_id)) {
break;
}

LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
char buf[128];
int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
fflush(stdout);

// prepare the next batch
llama_batch_clear(batch);

// push this new token for next evaluation
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
// prepare the next batch with the sampled token
batch = llama_batch_get_one(&new_token_id, 1, n_pos, 0);

n_decode += 1;
}

n_cur += 1;

// evaluate the current batch with the transformer model
if (llama_decode(ctx, batch)) {
LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
return 1;
}
}

LOG("\n");
printf("\n");

const auto t_main_end = ggml_time_us();

LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

LOG("\n");
fprintf(stderr, "\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
fprintf(stderr, "\n");

LOG("\n");

llama_batch_free(batch);
llama_sampler_free(smpl);
llama_free(ctx);
llama_free_model(model);

llama_backend_free();

return 0;
}
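
One detail of the new code that is easy to miss is the leading minus sign in const int n_prompt = -llama_tokenize(...): when llama_tokenize is given a NULL output buffer it fails and returns the negative of the number of tokens it would have written, so the example negates that value to size the vector before tokenizing for real. Below is a minimal standalone sketch of that two-pass idiom; the helper name is hypothetical and not part of this PR:

#include "llama.h"

#include <string>
#include <vector>

// hypothetical helper illustrating the idiom used in simple.cpp above:
// first call with a NULL buffer to learn the required size, then tokenize for real
static std::vector<llama_token> tokenize_prompt(const llama_model * model, const std::string & prompt) {
    // llama_tokenize returns a negative count when the output buffer is too small (or NULL)
    const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);

    std::vector<llama_token> tokens(n_prompt);
    if (llama_tokenize(model, prompt.c_str(), prompt.size(), tokens.data(), tokens.size(), true, true) < 0) {
        return {}; // tokenization failed
    }
    return tokens;
}
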