Skip to content

Commit 0c74590

Browse files
committed
server : fixes
ggml-ci
1 parent 7dc6ae5 commit 0c74590

File tree

1 file changed

+13
-16
lines changed

1 file changed

+13
-16
lines changed

examples/server/server.cpp

Lines changed: 13 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -645,18 +645,16 @@ struct server_context {
645 645

646 646
// Clear any sampling context
647 647
for (server_slot & slot : slots) {
648-
if (slot.smpl != nullptr) {
649-
llama_free(slot.ctx_dft);
650-
slot.ctx_dft = nullptr;
648+
common_sampler_free(slot.smpl);
649+
slot.smpl = nullptr;
651 650

652-
common_speculative_free(slot.spec);
653-
slot.spec = nullptr;
651+
llama_free(slot.ctx_dft);
652+
slot.ctx_dft = nullptr;
654 653

655-
common_sampler_free(slot.smpl);
656-
slot.smpl = nullptr;
654+
common_speculative_free(slot.spec);
655+
slot.spec = nullptr;
657 656

658-
llama_batch_free(slot.batch_spec);
659-
}
657+
llama_batch_free(slot.batch_spec);
660 658
}
661 659

662 660
llama_batch_free(batch);
@@ -688,15 +686,9 @@ struct server_context {
688 686

689 687
auto params_dft = params;
690 688

691-
params_dft.model = params.model_draft;
689+
params_dft.model = params.model_draft;
692 690
params_dft.n_gpu_layers = params.n_gpu_layers_draft;
693 691

694-
if (params.draft_cpuparams.n_threads > 0) {
695-
params_dft.cpuparams.n_threads = params.draft_cpuparams.n_threads;
696-
}
697-
698-
params_dft.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
699-
700 692
common_init_result llama_init_dft = common_init_from_params(params_dft);
701 693

702 694
model_dft = llama_init_dft.model;
@@ -708,10 +700,15 @@ struct server_context {
708 700

709 701
if (!common_speculative_are_compatible(ctx, llama_init_dft.context)) {
710 702
SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params.model_draft.c_str(), params.model.c_str());
703+
704+
llama_free (llama_init_dft.context);
705+
llama_free_model(llama_init_dft.model);
706+
711 707
return false;
712 708
}
713 709

714 710
cparams_dft = common_context_params_to_llama(params);
711+
cparams_dft.n_batch = llama_n_ctx(llama_init_dft.context);
715 712

716 713
// the context is not needed - we will create one for each slot
717 714
llama_free(llama_init_dft.context);

0 commit comments

Comments
 (0)