@@ -371,7 +371,7 @@ void llama_kv_cache_unified::commit() {
 bool llama_kv_cache_unified::update(llama_context & lctx) {
     bool need_reserve = false;

-    const auto & sched = lctx.get_sched();
+    auto * sched = lctx.get_sched();

     if (has_shift) {
         if (!get_can_shift()) {
@@ -382,13 +382,13 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {

         // apply K-shift if needed
         if (hparams.rope_type != LLAMA_ROPE_TYPE_NONE) {
-            ggml_backend_sched_reset(sched.get());
+            ggml_backend_sched_reset(sched);

             auto * gf = lctx.graph_init();

             auto res = build_graph_shift(lctx, gf);

-            ggml_backend_sched_alloc_graph(sched.get(), gf);
+            ggml_backend_sched_alloc_graph(sched, gf);

             res->set_inputs(nullptr);

@@ -410,13 +410,13 @@ bool llama_kv_cache_unified::update(llama_context & lctx) {
         LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__);

         if (defrag_prepare(lctx.graph_max_nodes())) {
-            ggml_backend_sched_reset(sched.get());
+            ggml_backend_sched_reset(sched);

             auto * gf = lctx.graph_init();

             auto res = build_graph_defrag(lctx, gf);

-            ggml_backend_sched_alloc_graph(sched.get(), gf);
+            ggml_backend_sched_alloc_graph(sched, gf);

             res->set_inputs(nullptr);

@@ -602,7 +602,8 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         ggml_backend_buffer * bbuf) const {
     const auto & cparams  = lctx.get_cparams();
     const auto & backends = lctx.get_backends();
-    const auto & sched    = lctx.get_sched();
+
+    auto * sched = lctx.get_sched();

     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

@@ -623,12 +624,12 @@ ggml_tensor * llama_kv_cache_unified::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx, cur, GGML_TYPE_F32);

-        // TODO: can we simplify/avoid this?
+        // TODO: can we simplify/avoid this? [TAG_BACKENDS]
         if (bbuf) {
             for (const auto & backend : backends) {
                 // Figure out which backend KV cache belongs to
                 if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
+                    ggml_backend_sched_set_tensor_backend(sched, tmp, backend.get());
                     break;
                 }
             }
@@ -680,7 +681,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_shift(
         ggml_cgraph * gf) const {
     auto res = std::make_unique<llm_graph_result>();

-    auto * ctx = lctx.get_ctx_compute().get();
+    auto * ctx = lctx.get_ctx_compute();

     const auto & cparams = lctx.get_cparams();

@@ -733,7 +734,7 @@ llm_graph_result_ptr llama_kv_cache_unified::build_graph_defrag(
         ggml_cgraph * gf) const {
     auto res = std::make_unique<llm_graph_result>();

-    auto * ctx = lctx.get_ctx_compute().get();
+    auto * ctx = lctx.get_ctx_compute();

     const auto & ids = defrag_info.ids;
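
The pattern across all of these hunks is the same: call sites stop unwrapping a smart pointer (`sched.get()`, `lctx.get_ctx_compute().get()`) and instead receive raw pointers directly from the `llama_context` accessors, which they pass straight to the `ggml_backend_sched_*` API. A minimal sketch of the accessor shape this implies is below; the struct name, member names, and the unique_ptr owners are illustrative assumptions, and only the raw-pointer return types are taken from the diff.

#include <memory>

#include "ggml.h"          // ggml_context, ggml_free
#include "ggml-backend.h"  // ggml_backend_sched_t, ggml_backend_sched_free

// Sketch only: assumed accessor shape implied by the call-site changes above.
struct llama_context_sketch {
    // getters hand out raw pointers, so callers no longer need .get()
    ggml_backend_sched_t get_sched()       const { return sched.get();       }
    ggml_context *       get_ctx_compute() const { return ctx_compute.get(); }

private:
    // assumed owners: unique_ptr with the matching ggml free function as deleter
    std::unique_ptr<ggml_backend_sched, decltype(&ggml_backend_sched_free)> sched       {nullptr, ggml_backend_sched_free};
    std::unique_ptr<ggml_context,       decltype(&ggml_free)>               ctx_compute {nullptr, ggml_free};
};

Returning the raw pointer keeps ownership inside the context object while sparing every caller the repeated .get() noise, which is what the diff removes at each site.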