Skip to content

Commit 70b98fa

Browse files
authored
server : fix default draft model parameters (#10586)
* server : force F16 KV cache for the draft model (ggml-ci)
* server : fix draft params (ggml-ci)
* server : various params fixes (ggml-ci)
1 parent 642330a commit 70b98fa

File tree

1 file changed

+14
-3
lines changed

1 file changed

+14
-3
lines changed

examples/server/server.cpp

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -696,8 +696,9 @@ struct server_context {
696696

697697
params_dft.devices = params_base.speculative.devices;
698698
params_dft.model = params_base.speculative.model;
699-
params_dft.n_ctx = params_base.speculative.n_ctx;
699+
params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
700700
params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
701+
params_dft.n_parallel = 1;
701702

702703
common_init_result llama_init_dft = common_init_from_params(params_dft);
703704

@@ -717,8 +718,14 @@ struct server_context {
717718
return false;
718719
}
719720

720-
cparams_dft = common_context_params_to_llama(params_base);
721-
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
721+
const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
722+
723+
cparams_dft = common_context_params_to_llama(params_dft);
724+
cparams_dft.n_batch = n_ctx_dft;
725+
726+
// force F16 KV cache for the draft model for extra performance
727+
cparams_dft.type_k = GGML_TYPE_F16;
728+
cparams_dft.type_v = GGML_TYPE_F16;
722729

723730
// the context is not needed - we will create one for each slot
724731
llama_free(llama_init_dft.context);
@@ -2322,6 +2329,10 @@ struct server_context {
23222329
continue;
23232330
}
23242331

2332+
if (slot.state != SLOT_STATE_GENERATING) {
2333+
continue;
2334+
}
2335+
23252336
llama_token id = slot.sampled;
23262337

23272338
struct common_speculative_params params_spec;

0 commit comments

Comments (0)