
Commit c292bf1

Merge branch 'ggerganov:master' into qwen2-vl
2 parents: 9abb252 + 64ae065 · commit c292bf1


54 files changed: +1033 −910 lines

.github/workflows/server.yml

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ jobs:
       # Setup nodejs (to be used for verifying bundled index.html)
       - uses: actions/setup-node@v4
         with:
-          node-version: 22
+          node-version: '22.11.0'
 
       - name: Verify bundled index.html
         id: verify_server_index_html

CODEOWNERS

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
 # collaborators can optionally add themselves here to indicate their availability for reviewing related PRs
 
-ci/ @ggerganov
+/ci/ @ggerganov
+/.devops/ @ngxson
+/examples/server/ @ngxson

common/arg.cpp

Lines changed: 52 additions & 13 deletions
@@ -145,6 +145,35 @@ static void common_params_handle_model_default(common_params & params) {
     }
 }
 
+const std::vector<ggml_type> kv_cache_types = {
+    GGML_TYPE_F32,
+    GGML_TYPE_F16,
+    GGML_TYPE_BF16,
+    GGML_TYPE_Q8_0,
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
+    GGML_TYPE_IQ4_NL,
+    GGML_TYPE_Q5_0,
+    GGML_TYPE_Q5_1,
+};
+
+static ggml_type kv_cache_type_from_str(const std::string & s) {
+    for (const auto & type : kv_cache_types) {
+        if (ggml_type_name(type) == s) {
+            return type;
+        }
+    }
+    throw std::runtime_error("Unsupported cache type: " + s);
+}
+
+static std::string get_all_kv_cache_types() {
+    std::ostringstream msg;
+    for (const auto & type : kv_cache_types) {
+        msg << ggml_type_name(type) << (&type == &kv_cache_types.back() ? "" : ", ");
+    }
+    return msg.str();
+}
+
 //
 // CLI argument parsing functions
 //
@@ -1174,18 +1203,28 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
-        string_format("KV cache data type for K (default: %s)", params.cache_type_k.c_str()),
+        string_format(
+            "KV cache data type for K\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_k)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_k = value;
+            params.cache_type_k = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_K"));
     add_opt(common_arg(
         {"-ctv", "--cache-type-v"}, "TYPE",
-        string_format("KV cache data type for V (default: %s)", params.cache_type_v.c_str()),
+        string_format(
+            "KV cache data type for V\n"
+            "allowed values: %s\n"
+            "(default: %s)",
+            get_all_kv_cache_types().c_str(),
+            ggml_type_name(params.cache_type_v)
+        ),
         [](common_params & params, const std::string & value) {
-            // TODO: get the type right here
-            params.cache_type_v = value;
+            params.cache_type_v = kv_cache_type_from_str(value);
         }
     ).set_env("LLAMA_ARG_CACHE_TYPE_V"));
     add_opt(common_arg(
@@ -2083,35 +2122,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, int value) {
            params.speculative.n_max = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MAX"));
     add_opt(common_arg(
         {"--draft-min", "--draft-n-min"}, "N",
         string_format("minimum number of draft tokens to use for speculative decoding (default: %d)", params.speculative.n_min),
         [](common_params & params, int value) {
            params.speculative.n_min = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_MIN"));
     add_opt(common_arg(
         {"--draft-p-split"}, "P",
         string_format("speculative decoding split probability (default: %.1f)", (double)params.speculative.p_split),
         [](common_params & params, const std::string & value) {
            params.speculative.p_split = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE}).set_env("LLAMA_ARG_DRAFT_P_SPLIT"));
     add_opt(common_arg(
         {"--draft-p-min"}, "P",
         string_format("minimum speculative decoding probability (greedy) (default: %.1f)", (double)params.speculative.p_min),
         [](common_params & params, const std::string & value) {
            params.speculative.p_min = std::stof(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_DRAFT_P_MIN"));
     add_opt(common_arg(
         {"-cd", "--ctx-size-draft"}, "N",
         string_format("size of the prompt context for the draft model (default: %d, 0 = loaded from model)", params.speculative.n_ctx),
         [](common_params & params, int value) {
            params.speculative.n_ctx = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CTX_SIZE_DRAFT"));
     add_opt(common_arg(
         {"-devd", "--device-draft"}, "<dev1,dev2,..>",
         "comma-separated list of devices to use for offloading the draft model (none = don't offload)\n"
@@ -2131,14 +2170,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 fprintf(stderr, "warning: consult docs/build.md for compilation instructions\n");
             }
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_N_GPU_LAYERS_DRAFT"));
     add_opt(common_arg(
         {"-md", "--model-draft"}, "FNAME",
         "draft model for speculative decoding (default: unused)",
         [](common_params & params, const std::string & value) {
            params.speculative.model = value;
         }
-    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}));
+    ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT"));
 
     return ctx_arg;
 }
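The -ctk/-ctv options above now resolve their string argument through a single lookup table instead of an open-coded if-chain, and the help text lists every accepted name; the speculative-decoding options also gain LLAMA_ARG_* environment variables (e.g. LLAMA_ARG_DRAFT_MAX) via set_env(). A minimal standalone sketch of the same lookup pattern follows; it reuses ggml_type_name() from ggml.h, but the shortened type list and the main() driver are illustrative only, not part of the commit:

// sketch of the table-driven name -> ggml_type lookup used in common/arg.cpp
#include <cstdio>
#include <stdexcept>
#include <string>
#include <vector>

#include "ggml.h" // ggml_type, ggml_type_name()

static const std::vector<ggml_type> kv_cache_types = {
    GGML_TYPE_F16, GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, // subset, for illustration
};

// map a name such as "q8_0" back to its enum value, or throw if unknown
static ggml_type kv_cache_type_from_str(const std::string & s) {
    for (const auto & type : kv_cache_types) {
        if (ggml_type_name(type) == s) {
            return type;
        }
    }
    throw std::runtime_error("Unsupported cache type: " + s);
}

int main() {
    try {
        const ggml_type t = kv_cache_type_from_str("q8_0");
        std::printf("resolved to ggml_type %d (%s)\n", (int) t, ggml_type_name(t));
    } catch (const std::runtime_error & e) {
        std::fprintf(stderr, "%s\n", e.what());
    }
    return 0;
}

Adding a new cache type then only requires extending the table; the parser, the validation error, and the "allowed values" help text all pick it up automatically.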

common/common.cpp

Lines changed: 2 additions & 34 deletions
@@ -1015,38 +1015,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     return mparams;
 }
 
-static ggml_type kv_cache_type_from_str(const std::string & s) {
-    if (s == "f32") {
-        return GGML_TYPE_F32;
-    }
-    if (s == "f16") {
-        return GGML_TYPE_F16;
-    }
-    if (s == "bf16") {
-        return GGML_TYPE_BF16;
-    }
-    if (s == "q8_0") {
-        return GGML_TYPE_Q8_0;
-    }
-    if (s == "q4_0") {
-        return GGML_TYPE_Q4_0;
-    }
-    if (s == "q4_1") {
-        return GGML_TYPE_Q4_1;
-    }
-    if (s == "iq4_nl") {
-        return GGML_TYPE_IQ4_NL;
-    }
-    if (s == "q5_0") {
-        return GGML_TYPE_Q5_0;
-    }
-    if (s == "q5_1") {
-        return GGML_TYPE_Q5_1;
-    }
-
-    throw std::runtime_error("Unsupported cache type: " + s);
-}
-
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
@@ -1081,8 +1049,8 @@ struct llama_context_params common_context_params_to_llama(const common_params &
         cparams.pooling_type = LLAMA_POOLING_TYPE_RANK;
     }
 
-    cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
-    cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
+    cparams.type_k = params.cache_type_k;
+    cparams.type_v = params.cache_type_v;
 
     return cparams;
 }

common/common.h

Lines changed: 2 additions & 2 deletions
@@ -286,8 +286,8 @@ struct common_params {
     bool warmup = true;         // warmup run
     bool check_tensors = false; // validate tensor data
 
-    std::string cache_type_k = "f16"; // KV cache data type for the K
-    std::string cache_type_v = "f16"; // KV cache data type for the V
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
     // multimodal models (see examples/llava)
     std::string mmproj = ""; // path to multimodal projector // NOLINT
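With the fields changed from std::string to ggml_type, code that fills in common_params programmatically assigns the enum directly; the string parsing now happens only once, at CLI time, in common/arg.cpp. A minimal sketch, assuming the common and llama headers are on the include path (model loading and error handling omitted, names as declared in the tree):

// build llama_context_params with a quantized KV cache
#include "common.h" // common_params, common_context_params_to_llama()
#include "llama.h"  // llama_context_params

static llama_context_params make_cparams() {
    common_params params;

    // previously: params.cache_type_k = "q8_0"; parsed later by a string helper
    params.cache_type_k = GGML_TYPE_Q8_0;
    params.cache_type_v = GGML_TYPE_Q8_0;

    // the conversion now copies the enums straight into type_k / type_v
    return common_context_params_to_llama(params);
}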

examples/CMakeLists.txt

Lines changed: 11 additions & 2 deletions
@@ -20,7 +20,12 @@ else()
     add_subdirectory(batched)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
-    add_subdirectory(gbnf-validator)
+
+    if (NOT WIN32)
+        # disabled on Windows because it uses internal functions not exported with LLAMA_API
+        add_subdirectory(gbnf-validator)
+    endif()
+
     add_subdirectory(gguf-hash)
     add_subdirectory(gguf-split)
     add_subdirectory(gguf)
@@ -46,12 +51,16 @@ else()
     add_subdirectory(speculative)
     add_subdirectory(speculative-simple)
     add_subdirectory(tokenize)
+    add_subdirectory(gen-docs)
     if (NOT GGML_BACKEND_DL)
         # these examples use the backends directly and cannot be built with dynamic loading
         add_subdirectory(convert-llama2c-to-ggml)
         add_subdirectory(cvector-generator)
         add_subdirectory(export-lora)
-        add_subdirectory(quantize-stats)
+        if (NOT WIN32)
+            # disabled on Windows because it uses internal functions not exported with LLAMA_API
+            add_subdirectory(quantize-stats)
+        endif()
         add_subdirectory(llava)
         if (GGML_RPC)
             add_subdirectory(rpc)

examples/gguf-split/gguf-split.cpp

Lines changed: 2 additions & 2 deletions
@@ -287,7 +287,7 @@ struct split_strategy {
     }
 
     void print_info() {
-        printf("n_split: %ld\n", ctx_outs.size());
+        printf("n_split: %zu\n", ctx_outs.size());
         int i_split = 0;
         for (auto & ctx_out : ctx_outs) {
             // re-calculate the real gguf size for each split (= metadata size + total size of all tensors)
@@ -297,7 +297,7 @@ struct split_strategy {
                 total_size += ggml_nbytes(t);
             }
             total_size = total_size / 1000 / 1000; // convert to megabytes
-            printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
+            printf("split %05d: n_tensors = %d, total_size = %zuM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
             i_split++;
         }
     }
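This printf change, and the matching ones in llama-bench and retrieval below, fix the format specifier used for size_t values: std::vector::size() returns size_t, and %ld only happens to match where long and size_t have the same width, whereas on LLP64 platforms such as 64-bit Windows long is 32-bit. %zu is the standard-conforming specifier. A tiny standalone illustration (not from the tree):

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> chunks(42);
    // %zu matches size_t on every platform; %ld would warn or misbehave
    // where long is narrower than size_t (e.g. 64-bit Windows / MSVC)
    std::printf("Number of chunks: %zu\n", chunks.size());
    return 0;
}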

examples/llama-bench/llama-bench.cpp

Lines changed: 5 additions & 5 deletions
@@ -1521,7 +1521,7 @@ int main(int argc, char ** argv) {
     for (const auto & inst : params_instances) {
         params_idx++;
         if (params.progress) {
-            fprintf(stderr, "llama-bench: benchmark %d/%ld: starting\n", params_idx, params_count);
+            fprintf(stderr, "llama-bench: benchmark %d/%zu: starting\n", params_idx, params_count);
         }
         // keep the same model between tests when possible
         if (!lmodel || !prev_inst || !inst.equal_mparams(*prev_inst)) {
@@ -1573,14 +1573,14 @@ int main(int argc, char ** argv) {
         // warmup run
         if (t.n_prompt > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup prompt run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup prompt run\n", params_idx, params_count);
             }
             //test_prompt(ctx, std::min(t.n_batch, std::min(t.n_prompt, 32)), 0, t.n_batch, t.n_threads);
             test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
         }
         if (t.n_gen > 0) {
             if (params.progress) {
-                fprintf(stderr, "llama-bench: benchmark %d/%ld: warmup generation run\n", params_idx, params_count);
+                fprintf(stderr, "llama-bench: benchmark %d/%zu: warmup generation run\n", params_idx, params_count);
             }
             test_gen(ctx, 1, t.n_threads);
         }
@@ -1592,14 +1592,14 @@ int main(int argc, char ** argv) {
 
             if (t.n_prompt > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: prompt run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: prompt run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
                 test_prompt(ctx, t.n_prompt, t.n_batch, t.n_threads);
             }
             if (t.n_gen > 0) {
                 if (params.progress) {
-                    fprintf(stderr, "llama-bench: benchmark %d/%ld: generation run %d/%d\n", params_idx, params_count,
+                    fprintf(stderr, "llama-bench: benchmark %d/%zu: generation run %d/%d\n", params_idx, params_count,
                             i + 1, params.reps);
                 }
                 test_gen(ctx, t.n_gen, t.n_threads);

examples/quantize/README.md

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ Several quantization methods are supported. They differ in the resulting model d
 - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
 - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
 - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-- [#4996 - k-qunats tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
+- [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
 - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
 - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
 - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)

examples/retrieval/retrieval.cpp

Lines changed: 1 addition & 1 deletion
@@ -143,7 +143,7 @@ int main(int argc, char ** argv) {
         std::vector<chunk> file_chunk = chunk_file(context_file, params.chunk_size, params.chunk_separator);
         chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end());
     }
-    LOG_INF("Number of chunks: %ld\n", chunks.size());
+    LOG_INF("Number of chunks: %zu\n", chunks.size());
 
     llama_backend_init();
     llama_numa_init(params.numa);
