Commit 5449c17

cont : update all examples except server
ggml-ci
1 parent d206f87 commit 5449c17

27 files changed (+975 / -910 lines)

Makefile

Lines changed: 1 addition & 1 deletion
@@ -1348,7 +1348,7 @@ llama-cvector-generator: examples/cvector-generator/cvector-generator.cpp \
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
 llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp \
-	$(OBJ_GGML) $(OBJ_LLAMA)
+	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

common/arg.cpp

Lines changed: 3 additions & 3 deletions
@@ -419,7 +419,7 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
         [](gpt_params & params) {
             params.use_color = true;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL, LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP}));
     add_opt(llama_arg(
         {"-t", "--threads"}, "N",
         format("number of threads to use during generation (default: %d)", params.cpuparams.n_threads),
@@ -870,15 +870,15 @@ gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex,
             params.input_prefix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"--in-suffix"}, "STRING",
         "string to suffix after user inputs with (default: empty)",
         [](gpt_params & params, const std::string & value) {
             params.input_suffix = value;
             params.enable_chat_template = false;
         }
-    ).set_examples({LLAMA_EXAMPLE_MAIN}));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_INFILL}));
     add_opt(llama_arg(
         {"--no-warmup"},
         "skip warming up the model with an empty run",

common/common.cpp

Lines changed: 24 additions & 2 deletions
@@ -442,7 +442,29 @@ void string_replace_all(std::string & s, const std::string & search, const std::
     s = std::move(builder);
 }
 
-std::string string_from_tokens(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
+std::string string_from(bool value) {
+    return value ? "true" : "false";
+}
+
+std::string string_from(const std::vector<int> & values) {
+    std::stringstream buf;
+
+    buf << "[ ";
+    bool first = true;
+    for (auto e : values) {
+        if (first) {
+            first = false;
+        } else {
+            buf << ", ";
+        }
+        buf << std::to_string(e);
+    }
+    buf << " ]";
+
+    return buf.str();
+}
+
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens) {
     std::stringstream buf;
 
     buf << "[ ";
@@ -473,7 +495,7 @@ std::string string_from_tokens(const struct llama_context * ctx, const std::vect
     return buf.str();
 }
 
-std::string string_from_batch(const struct llama_context * ctx, const struct llama_batch & batch) {
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch) {
     std::stringstream buf;
 
     buf << "[ ";

common/common.h

Lines changed: 4 additions & 2 deletions
@@ -374,8 +374,10 @@ static std::vector<T> string_split(const std::string & str, char delim) {
 bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 void string_process_escapes(std::string & input);
 
-std::string string_from_tokens(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
-std::string string_from_batch (const struct llama_context * ctx, const struct llama_batch & batch);
+std::string string_from(bool value);
+std::string string_from(const std::vector<int> & values);
+std::string string_from(const struct llama_context * ctx, const std::vector<llama_token> & tokens);
+std::string string_from(const struct llama_context * ctx, const struct llama_batch & batch);
 
 //
 // Filesystem utils
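
Note: the old string_from_tokens/string_from_batch helpers are folded into a single string_from overload set. A minimal usage sketch (not part of the commit; it assumes the common.h/log.h headers from this change plus an already-initialized llama_context, and dump_debug_info itself is a hypothetical helper):

#include "common.h"
#include "log.h"
#include "llama.h"

#include <vector>

// hypothetical debug helper built on the new string_from overloads
static void dump_debug_info(const llama_context * ctx,
                            const std::vector<llama_token> & prompt_tokens,
                            bool add_bos) {
    const std::vector<int> n_pp = { 128, 256, 512 };

    LOG("add_bos = %s\n", string_from(add_bos).c_str());            // bool overload
    LOG("n_pp    = %s\n", string_from(n_pp).c_str());               // std::vector<int> overload
    LOG("prompt  = %s\n", string_from(ctx, prompt_tokens).c_str()); // detokenizes each id via ctx
}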

examples/batched-bench/batched-bench.cpp

Lines changed: 23 additions & 16 deletions
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
@@ -8,12 +9,18 @@
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
-    LOG_TEE("\n");
+    LOG("\nexample usage:\n");
+    LOG("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
@@ -76,7 +83,7 @@ int main(int argc, char ** argv) {
 
         const int ret = llama_decode(ctx, batch_view);
         if (ret != 0) {
-            LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
+            LOG_ERR("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }
 
@@ -93,17 +100,17 @@ int main(int argc, char ** argv) {
         }
 
         if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-            LOG_TEE("%s: llama_decode() failed\n", __func__);
+            LOG_ERR("%s: llama_decode() failed\n", __func__);
             return 1;
         }
     }
 
     if (!params.batched_bench_output_jsonl) {
-        LOG_TEE("\n");
-        LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
-        LOG_TEE("\n");
-        LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
-        LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
+        LOG("\n");
+        LOG("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+        LOG("\n");
+        LOG("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
+        LOG("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
     }
 
     for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) {
@@ -133,7 +140,7 @@ int main(int argc, char ** argv) {
                 llama_kv_cache_clear(ctx);
 
                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                     return 1;
                 }
 
@@ -155,7 +162,7 @@ int main(int argc, char ** argv) {
                 }
 
                 if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
-                    LOG_TEE("%s: llama_decode() failed\n", __func__);
+                    LOG_ERR("%s: llama_decode() failed\n", __func__);
                     return 1;
                 }
             }
@@ -173,20 +180,20 @@ int main(int argc, char ** argv) {
                const float speed = n_kv / t;
 
                if(params.batched_bench_output_jsonl) {
-                    LOG_TEE(
+                    LOG(
                         "{\"n_kv_max\": %d, \"n_batch\": %d, \"n_ubatch\": %d, \"flash_attn\": %d, \"is_pp_shared\": %d, \"n_gpu_layers\": %d, \"n_threads\": %u, \"n_threads_batch\": %u, "
                         "\"pp\": %d, \"tg\": %d, \"pl\": %d, \"n_kv\": %d, \"t_pp\": %f, \"speed_pp\": %f, \"t_tg\": %f, \"speed_tg\": %f, \"t\": %f, \"speed\": %f}\n",
                         n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch,
                         pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed
                     );
                 } else {
-                    LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
+                    LOG("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed);
                 }
             }
         }
     }
 
-    LOG_TEE("\n");
+    LOG("\n");
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
     llama_batch_free(batch);
@@ -196,7 +203,7 @@ int main(int argc, char ** argv) {
 
     llama_backend_free();
 
-    fprintf(stderr, "\n\n");
+    LOG("\n\n");
 
     return 0;
 }
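
The two changes above repeat across every example touched by this commit: LOG_TEE/fprintf calls become LOG/LOG_INF/LOG_ERR, and each main() installs a llama_log_set callback that forwards the library's own log messages into the common log sink. A stripped-down sketch of just that pattern (not a file from the repo; it only assumes the common log.h and llama.h headers from this change):

#include "log.h"
#include "llama.h"

int main() {
    // forward llama.cpp's internal log output into the common gpt_log sink,
    // gated by the gpt_log_verbosity_env threshold
    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
            gpt_log_add(gpt_log_main(), level, "%s", text);
        }
    }, NULL);

    LOG_INF("%s: informational message\n", __func__); // was LOG_TEE(...)
    LOG_ERR("%s: error message\n", __func__);         // was fprintf(stderr, ...)
    LOG("plain, unprefixed output\n");                // was LOG_TEE(...)

    return 0;
}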

examples/batched/batched.cpp

Lines changed: 28 additions & 26 deletions
@@ -1,5 +1,6 @@
 #include "arg.h"
 #include "common.h"
+#include "log.h"
 #include "llama.h"
 
 #include <algorithm>
@@ -8,12 +9,18 @@
 #include <vector>
 
 static void print_usage(int, char ** argv) {
-    LOG_TEE("\nexample usage:\n");
-    LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
-    LOG_TEE("\n");
+    LOG("\nexample usage:\n");
+    LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+    LOG("\n");
 }
 
 int main(int argc, char ** argv) {
+    llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
+        if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_env) {
+            gpt_log_add(gpt_log_main(), level, "%s", text);
+        }
+    }, NULL);
+
     gpt_params params;
 
     params.prompt = "Hello my name is";
@@ -42,7 +49,7 @@ int main(int argc, char ** argv) {
     llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
     if (model == NULL) {
-        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        LOG_ERR("%s: error: unable to load model\n" , __func__);
         return 1;
     }
 
@@ -72,31 +79,29 @@ int main(int argc, char ** argv) {
     llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
 
     if (ctx == NULL) {
-        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
         return 1;
     }
 
     const int n_ctx = llama_n_ctx(ctx);
 
-    LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+    LOG_INF("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
 
     // make sure the KV cache is big enough to hold all the prompt and generated tokens
     if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
-        LOG_TEE("%s: either reduce n_parallel or increase n_ctx\n", __func__);
+        LOG_ERR("%s: error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", __func__, n_kv_req);
+        LOG_ERR("%s: either reduce n_parallel or increase n_ctx\n", __func__);
         return 1;
     }
 
     // print the prompt token-by-token
 
-    fprintf(stderr, "\n");
+    LOG("\n");
 
     for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+        LOG("%s", llama_token_to_piece(ctx, id).c_str());
     }
 
-    fflush(stderr);
-
     // create a llama_batch
     // we use this object to submit token data for decoding
     llama_batch batch = llama_batch_init(std::max(tokens_list.size(), (size_t) n_parallel), 0, n_parallel);
@@ -114,7 +119,7 @@ int main(int argc, char ** argv) {
 
     if (llama_model_has_encoder(model)) {
         if (llama_encode(ctx, batch)) {
-            LOG_TEE("%s : failed to eval\n", __func__);
+            LOG_ERR("%s : failed to eval\n", __func__);
             return 1;
         }
 
@@ -131,7 +136,7 @@ int main(int argc, char ** argv) {
     batch.logits[batch.n_tokens - 1] = true;
 
     if (llama_decode(ctx, batch) != 0) {
-        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        LOG_ERR("%s: llama_decode() failed\n", __func__);
        return 1;
     }
 
@@ -142,7 +147,7 @@ int main(int argc, char ** argv) {
     //}
 
     if (n_parallel > 1) {
-        LOG_TEE("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
+        LOG("\n\n%s: generating %d sequences ...\n", __func__, n_parallel);
     }
 
     // main loop
@@ -175,18 +180,17 @@ int main(int argc, char ** argv) {
             // is it an end of generation? -> mark the stream as finished
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 i_batch[i] = -1;
-                LOG_TEE("\n");
+                LOG("\n");
                 if (n_parallel > 1) {
-                    LOG_TEE("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
+                    LOG_INF("%s: stream %d finished at n_cur = %d", __func__, i, n_cur);
                 }
 
                 continue;
             }
 
             // if there is only one stream, we print immediately to stdout
             if (n_parallel == 1) {
-                LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
-                fflush(stdout);
+                LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
             }
 
             streams[i] += llama_token_to_piece(ctx, new_token_id);
@@ -208,27 +212,25 @@ int main(int argc, char ** argv) {
 
         // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch)) {
-            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
            return 1;
        }
    }
 
-    LOG_TEE("\n");
-
     if (n_parallel > 1) {
-        LOG_TEE("\n");
+        LOG("\n");
 
         for (int32_t i = 0; i < n_parallel; ++i) {
-            LOG_TEE("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
+            LOG("sequence %d:\n\n%s%s\n\n", i, params.prompt.c_str(), streams[i].c_str());
         }
     }
 
     const auto t_main_end = ggml_time_us();
 
-    LOG_TEE("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
             __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
 
-    LOG_TEE("\n");
+    LOG_INF("\n");
     llama_perf_print(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
     llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT);
 
