@@ -645,18 +645,16 @@ struct server_context {
645
645
646
646
// Clear any sampling context
647
647
for (server_slot & slot : slots) {
648
- if (slot.smpl != nullptr ) {
649
- llama_free (slot.ctx_dft );
650
- slot.ctx_dft = nullptr ;
648
+ common_sampler_free (slot.smpl );
649
+ slot.smpl = nullptr ;
651
650
652
- common_speculative_free (slot.spec );
653
- slot.spec = nullptr ;
651
+ llama_free (slot.ctx_dft );
652
+ slot.ctx_dft = nullptr ;
654
653
655
- common_sampler_free (slot.smpl );
656
- slot.smpl = nullptr ;
654
+ common_speculative_free (slot.spec );
655
+ slot.spec = nullptr ;
657
656
658
- llama_batch_free (slot.batch_spec );
659
- }
657
+ llama_batch_free (slot.batch_spec );
660
658
}
661
659
662
660
llama_batch_free (batch);
@@ -688,15 +686,9 @@ struct server_context {
688
686
689
687
auto params_dft = params;
690
688
691
- params_dft.model = params.model_draft ;
689
+ params_dft.model = params.model_draft ;
692
690
params_dft.n_gpu_layers = params.n_gpu_layers_draft ;
693
691
694
- if (params.draft_cpuparams .n_threads > 0 ) {
695
- params_dft.cpuparams .n_threads = params.draft_cpuparams .n_threads ;
696
- }
697
-
698
- params_dft.cpuparams_batch .n_threads = params.draft_cpuparams_batch .n_threads ;
699
-
700
692
common_init_result llama_init_dft = common_init_from_params (params_dft);
701
693
702
694
model_dft = llama_init_dft.model ;
@@ -708,10 +700,15 @@ struct server_context {
708
700
709
701
if (!common_speculative_are_compatible (ctx, llama_init_dft.context )) {
710
702
SRV_ERR (" the draft model '%s' is not compatible with the target model '%s'\n " , params.model_draft .c_str (), params.model .c_str ());
703
+
704
+ llama_free (llama_init_dft.context );
705
+ llama_free_model (llama_init_dft.model );
706
+
711
707
return false ;
712
708
}
713
709
714
710
cparams_dft = common_context_params_to_llama (params);
711
+ cparams_dft.n_batch = llama_n_ctx (llama_init_dft.context );
715
712
716
713
// the context is not needed - we will create one for each slot
717
714
llama_free (llama_init_dft.context );
0 commit comments