@@ -696,8 +696,9 @@ struct server_context {
 
             params_dft.devices      = params_base.speculative.devices;
             params_dft.model        = params_base.speculative.model;
-            params_dft.n_ctx        = params_base.speculative.n_ctx;
+            params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
+            params_dft.n_parallel   = 1;
 
             common_init_result llama_init_dft = common_init_from_params(params_dft);
 
@@ -717,8 +718,14 @@ struct server_context {
                 return false;
             }
 
-            cparams_dft = common_context_params_to_llama(params_base);
-            cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
+            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context);
+
+            cparams_dft = common_context_params_to_llama(params_dft);
+            cparams_dft.n_batch = n_ctx_dft;
+
+            // force F16 KV cache for the draft model for extra performance
+            cparams_dft.type_k = GGML_TYPE_F16;
+            cparams_dft.type_v = GGML_TYPE_F16;
 
             // the context is not needed - we will create one for each slot
             llama_free(llama_init_dft.context);
@@ -2322,6 +2329,10 @@ struct server_context {
                 continue;
             }
 
+            if (slot.state != SLOT_STATE_GENERATING) {
+                continue;
+            }
+
             llama_token id = slot.sampled;
 
             struct common_speculative_params params_spec;