@@ -2267,50 +2267,49 @@ struct server_context {
2267
2267
continue ; // continue loop of slots
2268
2268
}
2269
2269
2270
- llama_token id;
2270
+ llama_token id = common_sampler_sample (slot. smpl , ctx, slot. i_batch - i) ;
2271
2271
2272
- {
2273
- completion_token_output result;
2274
-
2275
- id = common_sampler_sample (slot.smpl , ctx, slot.i_batch - i);
2272
+ slot.i_batch = -1 ;
2276
2273
2277
- slot.i_batch = - 1 ;
2274
+ common_sampler_accept ( slot.smpl , id, true ) ;
2278
2275
2279
- common_sampler_accept (slot.smpl , id, true );
2280
-
2281
- slot.n_decoded += 1 ;
2282
- if (slot.n_decoded == 1 ) {
2283
- slot.t_start_generation = ggml_time_us ();
2284
- slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt ) / 1e3 ;
2285
- metrics.on_prompt_eval (slot);
2286
- }
2276
+ slot.n_decoded += 1 ;
2277
+ if (slot.n_decoded == 1 ) {
2278
+ slot.t_start_generation = ggml_time_us ();
2279
+ slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt ) / 1e3 ;
2280
+ metrics.on_prompt_eval (slot);
2281
+ }
2287
2282
2288
- result.tok = id;
2283
+ completion_token_output result;
2284
+ result.tok = id;
2289
2285
2290
- const auto * cur_p = common_sampler_get_candidates (slot.smpl );
2286
+ const auto * cur_p = common_sampler_get_candidates (slot.smpl );
2291
2287
2292
- for (size_t i = 0 ; i < (size_t ) slot.params .sampling .n_probs ; ++i) {
2293
- result.probs .push_back ({
2294
- cur_p->data [i].id ,
2295
- i >= cur_p->size ? 0 .0f : cur_p->data [i].p ,
2296
- });
2297
- }
2288
+ for (size_t i = 0 ; i < (size_t ) slot.params .sampling .n_probs ; ++i) {
2289
+ result.probs .push_back ({
2290
+ cur_p->data [i].id ,
2291
+ i >= cur_p->size ? 0 .0f : cur_p->data [i].p ,
2292
+ });
2293
+ }
2298
2294
2299
- if (!process_token (result, slot)) {
2300
- // release slot because of stop condition
2301
- slot.release ();
2302
- slot.print_timings ();
2303
- send_final_response (slot);
2304
- metrics.on_prediction (slot);
2305
- continue ;
2306
- }
2295
+ if (!process_token (result, slot)) {
2296
+ // release slot because of stop condition
2297
+ slot.release ();
2298
+ slot.print_timings ();
2299
+ send_final_response (slot);
2300
+ metrics.on_prediction (slot);
2301
+ continue ;
2307
2302
}
2303
+ }
2308
2304
2309
- // check if the slot supports speculative decoding
2310
- if (!slot.can_speculate ()) {
2305
+ // do speculative decoding
2306
+ for (auto & slot : slots) {
2307
+ if (!slot.is_processing () || !slot.can_speculate ()) {
2311
2308
continue ;
2312
2309
}
2313
2310
2311
+ llama_token id = slot.sampled ;
2312
+
2314
2313
struct common_speculative_params params_spec;
2315
2314
params_spec.n_draft = slot.params .speculative .n_max ;
2316
2315
params_spec.n_reuse = llama_n_ctx (slot.ctx_dft ) - slot.params .speculative .n_max ;
0 commit comments