
Commit 3197a6e

Revert "kv-cache : refactor the update/defrag mechanism (ggml-org#13988)"
This reverts commit 3e63a58.
1 parent e49a9ab · commit 3197a6e

11 files changed: +187 -336 lines

src/llama-context.cpp

Lines changed: 27 additions & 56 deletions
@@ -429,62 +429,30 @@ const llama_kv_cache * llama_context::get_kv_self() const {
     return kv_self;
 }

-void llama_context::kv_self_defrag_sched() {
-    if (!memory) {
-        return;
-    }
-
-    memory_force_optimize = true;
-}
-
-bool llama_context::kv_self_update(bool optimize) {
+bool llama_context::kv_self_update() {
     if (!memory) {
         return false;
     }

     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());

-    {
-        // TODO: remove in the future
-        optimize |= memory_force_optimize;
-        memory_force_optimize = false;
-
-        const auto kv_state = kv_self->init_update(this, optimize);
-        switch (kv_state->get_status()) {
-            case LLAMA_MEMORY_STATUS_SUCCESS:
-                {
-                    // noop
-                } break;
-            case LLAMA_MEMORY_STATUS_NO_UPDATE:
-                {
-                    // no updates need to be performed
-                    return false;
-                }
-            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
-            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
-                {
-                    LLAMA_LOG_ERROR("%s: failed to prepare memory update\n", __func__);
-                    return false;
-                }
-        }
-
-        if (!kv_state->apply()) {
-            LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
-        }
+    if (!kv_self->update(*this)) {
+        // no updates have been performed
+        return false;
     }

     // if the KV cache did any computation, we have to reserve a new worst-case graph
     const auto kv_state = kv_self->init_full();
     if (!kv_state) {
-        throw std::runtime_error("failed to initialize memory state");
+        throw std::runtime_error("failed to initialize KV cache");
     }

     const uint32_t n_seqs   = cparams.n_seq_max;
     const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

     auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
     if (!gf) {
-        LLAMA_LOG_ERROR("%s: failed to reserve graph after the memory update\n", __func__);
+        LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
     }

     return true;
@@ -972,13 +940,13 @@ int llama_context::decode(llama_batch & inp_batch) {
         n_outputs_all = 1;
     }

-    bool did_optimize = false;
-
     // handle any pending defrags/shifts
-    kv_self_update(false);
+    kv_self_update();

     llama_memory_state_ptr kv_state;

+    bool did_defrag = false;
+
     while (true) {
         kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
         if (!kv_state) {
@@ -989,32 +957,25 @@ int llama_context::decode(llama_batch & inp_batch) {
             case LLAMA_MEMORY_STATUS_SUCCESS:
                 {
                 } break;
-            case LLAMA_MEMORY_STATUS_NO_UPDATE:
-                {
-                    LLAMA_LOG_ERROR("%s: unexpected memory state status: %d\n", __func__, kv_state->get_status());
-
-                    return -2;
-                }
             case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
                 {
-                    if (!did_optimize) {
-                        did_optimize = true;
+                    if (!did_defrag) {
+                        did_defrag = true;

-                        if (kv_self_update(true)) {
-                            LLAMA_LOG_DEBUG("%s: retrying batch size %d after cache optimization\n", __func__, batch.n_tokens);
+                        kv_self->defrag_sched(-1.0f);
+                        if (kv_self_update()) {
+                            LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);

                             continue;
                         }
                     }

-                    LLAMA_LOG_WARN("%s: failed to find a memory slot for batch of size %d\n", __func__, batch.n_tokens);
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);

                     return 1;
                 }
             case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
                 {
-                    LLAMA_LOG_ERROR("%s: compute failed while preparing batch of size %d\n", __func__, batch.n_tokens);
-
                     return -2;
                 }
         }
@@ -1231,6 +1192,11 @@ int llama_context::decode(llama_batch & inp_batch) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();

+    // decide if we need to defrag the kv cache
+    if (cparams.defrag_thold > 0.0f) {
+        kv_self->defrag_sched(cparams.defrag_thold);
+    }
+
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.
     ggml_backend_sched_reset(sched.get());
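Note: the restored block above schedules a defrag after every decode whenever the configured threshold is positive. That threshold is the public defrag_thold field of llama_context_params. A minimal sketch of enabling it at context creation; the model pointer and the helper name are assumptions for illustration, not part of this commit:

    #include "llama.h"

    // sketch: enable automatic defrag scheduling via the public context params
    // (model is assumed to be a llama_model * loaded elsewhere)
    static llama_context * make_ctx_with_defrag(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.defrag_thold = 0.1f; // any positive value enables the scheduling shown above
        return llama_init_from_model(model, cparams);
    }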
@@ -2320,7 +2286,7 @@ llama_kv_cache * llama_get_kv_self(llama_context * ctx) {

 // deprecated
 void llama_kv_self_update(llama_context * ctx) {
-    ctx->kv_self_update(false);
+    ctx->kv_self_update();
 }

 enum llama_pooling_type llama_pooling_type(const llama_context * ctx) {
@@ -2575,8 +2541,13 @@ llama_pos llama_kv_self_seq_pos_max(llama_context * ctx, llama_seq_id seq_id) {

 // deprecated
 void llama_kv_self_defrag(llama_context * ctx) {
+    auto * kv = ctx->get_kv_self();
+    if (!kv) {
+        return;
+    }
+
     // force defrag
-    ctx->kv_self_defrag_sched();
+    kv->defrag_sched(-1.0f);
 }

 bool llama_kv_self_can_shift(const llama_context * ctx) {
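For context, the two deprecated C API entry points touched above can still be combined to force a defrag from user code. A minimal sketch, assuming a valid llama_context * ctx created elsewhere; the wrapper function name is illustrative only:

    #include "llama.h"

    // sketch: force a KV cache defrag through the deprecated public API
    static void force_kv_defrag(llama_context * ctx) {
        llama_kv_self_defrag(ctx); // schedules a forced defrag (thold = -1.0f) on the cache
        llama_kv_self_update(ctx); // applies pending shifts/defrags via llama_context::kv_self_update()
    }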

src/llama-context.h

Lines changed: 1 addition & 5 deletions
@@ -52,8 +52,7 @@ struct llama_context {

     // return true of the KV cache was updated
     // TODO: remove
-    bool kv_self_update(bool optimize);
-    void kv_self_defrag_sched();
+    bool kv_self_update();

     enum llama_pooling_type pooling_type() const;

@@ -232,9 +231,6 @@ struct llama_context {

     std::unique_ptr<llama_memory_i> memory;

-    // TODO: temporary, until the llama_kv_self_defrag() API is removed
-    bool memory_force_optimize = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t  logits_size = 0; // capacity (of floats) for logits
     float * logits      = nullptr;

src/llama-kv-cache-recurrent.cpp

Lines changed: 11 additions & 8 deletions
@@ -1,7 +1,6 @@
 #include "llama-kv-cache-recurrent.h"

 #include "llama-impl.h"
-#include "llama-io.h"
 #include "llama-batch.h"
 #include "llama-model.h"

@@ -387,13 +386,6 @@ llama_memory_state_ptr llama_kv_cache_recurrent::init_full() {
     return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
 }

-llama_memory_state_ptr llama_kv_cache_recurrent::init_update(llama_context * lctx, bool optimize) {
-    GGML_UNUSED(lctx);
-    GGML_UNUSED(optimize);
-
-    return std::make_unique<llama_kv_cache_recurrent_state>(LLAMA_MEMORY_STATUS_NO_UPDATE);
-}
-
 bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatches) {
     // simply remember the full state because it is very small for this type of cache
     // TODO: optimize
@@ -427,6 +419,17 @@ bool llama_kv_cache_recurrent::prepare(const std::vector<llama_ubatch> & ubatche
     return success;
 }

+bool llama_kv_cache_recurrent::update(llama_context & lctx) {
+    GGML_UNUSED(lctx);
+    // noop
+    return false;
+}
+
+void llama_kv_cache_recurrent::defrag_sched(float thold) {
+    GGML_UNUSED(thold);
+    // noop
+}
+
 bool llama_kv_cache_recurrent::find_slot(const llama_ubatch & ubatch) {
     const uint32_t n_tokens = ubatch.n_tokens;
     const uint32_t n_seqs   = ubatch.n_seqs;

src/llama-kv-cache-recurrent.h

Lines changed: 3 additions & 1 deletion
@@ -52,7 +52,9 @@ class llama_kv_cache_recurrent : public llama_kv_cache {

     llama_memory_state_ptr init_full() override;

-    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
+    bool update(llama_context & lctx) override;
+
+    void defrag_sched(float thold) override;

     bool prepare(const std::vector<llama_ubatch> & ubatches);

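Both cache implementations now override the same pair of virtuals on the shared base class. The base declaration lives in src/llama-kv-cache.h, which is not part of this excerpt; the following is only a sketch of its assumed shape, inferred from the override declarations in this header and in the iswa header further down:

    struct llama_context;

    // sketch (assumed shape of the restored base interface; the real declaration
    // is in src/llama-kv-cache.h and is not shown in this diff)
    struct llama_kv_cache {
        // apply any pending K-shift / defrag work; return true if computation was performed
        virtual bool update(llama_context & lctx) = 0;

        // schedule a defrag for the next update(); thold < 0.0f forces it
        virtual void defrag_sched(float thold) = 0;

        virtual ~llama_kv_cache() = default;
    };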

src/llama-kv-cache-unified-iswa.cpp

Lines changed: 28 additions & 31 deletions
@@ -123,16 +123,26 @@ llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch

     assert(heads_base.size() == heads_swa.size());

-    return std::make_unique<llama_kv_cache_unified_iswa_state>(
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_SUCCESS,
             this, std::move(sbatch), std::move(heads_base), std::move(heads_swa), std::move(ubatches));
 }

 llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
-    return std::make_unique<llama_kv_cache_unified_iswa_state>(this);
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_SUCCESS, this);
 }

-llama_memory_state_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
-    return std::make_unique<llama_kv_cache_unified_iswa_state>(this, lctx, optimize);
+bool llama_kv_cache_unified_iswa::update(llama_context & lctx) {
+    bool res = false;
+
+    res = res | kv_base->update(lctx);
+    res = res | kv_swa ->update(lctx);
+
+    return res;
+}
+
+void llama_kv_cache_unified_iswa::defrag_sched(float thold) {
+    kv_base->defrag_sched(thold);
+    kv_swa ->defrag_sched(thold);
 }

 bool llama_kv_cache_unified_iswa::get_can_shift() const {
@@ -164,38 +174,26 @@ llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
 llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(llama_memory_status status) : status(status) {}

 llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
-        llama_kv_cache_unified_iswa * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS) {
-    state_base = kv->get_base()->init_full();
-    state_swa  = kv->get_swa ()->init_full();
-
-    status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
-}
-
-llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
-        llama_kv_cache_unified_iswa * kv,
-        llama_context * lctx,
-        bool optimize) : status(LLAMA_MEMORY_STATUS_SUCCESS) {
-    state_base = kv->get_base()->init_update(lctx, optimize);
-    state_swa  = kv->get_swa ()->init_update(lctx, optimize);
-
-    status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
+        llama_memory_status status,
+        llama_kv_cache_unified_iswa * kv) : status(status) {
+    state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base()));
+    state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa ()));
 }

 llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
+        llama_memory_status status,
         llama_kv_cache_unified_iswa * kv,
         llama_sbatch sbatch,
         std::vector<uint32_t> heads_base,
         std::vector<uint32_t> heads_swa,
         std::vector<llama_ubatch> ubatches)
-    : status(LLAMA_MEMORY_STATUS_SUCCESS),
-    sbatch(std::move(sbatch)),
-    ubatches(std::move(ubatches)) {
-    // note: here we copy the ubatches. not sure if this is ideal
-    state_base.reset(new llama_kv_cache_unified_state(kv->get_base(), {}, std::move(heads_base), this->ubatches));
-    state_swa .reset(new llama_kv_cache_unified_state(kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
-
-    status = llama_memory_status_combine(state_base->get_status(), state_swa->get_status());
-}
+    : status(status),
+    sbatch(std::move(sbatch)),
+    ubatches(std::move(ubatches)) {
+    // note: here we copy the ubatches. not sure if this is ideal
+    state_base.reset(new llama_kv_cache_unified_state(status, kv->get_base(), {}, std::move(heads_base), this->ubatches));
+    state_swa .reset(new llama_kv_cache_unified_state(status, kv->get_swa (), {}, std::move(heads_swa), this->ubatches));
+}

 llama_kv_cache_unified_iswa_state:: ~llama_kv_cache_unified_iswa_state() = default;

@@ -235,18 +233,17 @@ llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const {

 const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
-
     return ubatches[i_next];
 }

 const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_base() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

-    return static_cast<const llama_kv_cache_unified_state *>(state_base.get());
+    return state_base.get();
 }

 const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_swa() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);

-    return static_cast<const llama_kv_cache_unified_state *>(state_swa.get());
+    return state_swa.get();
 }
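One detail worth noting in the restored llama_kv_cache_unified_iswa::update() above: the two results are accumulated with bitwise | rather than logical ||, so the SWA cache gets updated even when the base cache already reported work. A standalone illustration of the difference; this is not llama.cpp code and the function names are made up for the example:

    #include <cstdio>

    // stand-ins for kv_base->update(lctx) and kv_swa->update(lctx)
    static bool update_base() { std::puts("base updated"); return true; }
    static bool update_swa()  { std::puts("swa updated");  return true; }

    int main() {
        bool res = false;
        res = res | update_base(); // bitwise OR: both calls always run
        res = res | update_swa();

        // with `res = update_base() || update_swa();` the second call would be
        // skipped as soon as the first one returned true (short-circuit)
        std::printf("any update performed: %d\n", res);
        return 0;
    }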

src/llama-kv-cache-unified-iswa.h

Lines changed: 8 additions & 10 deletions
@@ -54,7 +54,9 @@ class llama_kv_cache_unified_iswa : public llama_kv_cache {

     llama_memory_state_ptr init_full() override;

-    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
+    bool update(llama_context & lctx) override;
+
+    void defrag_sched(float thold) override;

     bool get_can_shift() const override;

@@ -84,16 +86,12 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {

     // used to create a full-cache state
     llama_kv_cache_unified_iswa_state(
+            llama_memory_status status,
             llama_kv_cache_unified_iswa * kv);

-    // used to create an update state
-    llama_kv_cache_unified_iswa_state(
-            llama_kv_cache_unified_iswa * kv,
-            llama_context * lctx,
-            bool optimize);
-
     // used to create a state from a batch
     llama_kv_cache_unified_iswa_state(
+            llama_memory_status status,
             llama_kv_cache_unified_iswa * kv,
             llama_sbatch sbatch,
             std::vector<uint32_t> heads_base,
@@ -122,7 +120,7 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {
     const llama_kv_cache_unified_state * get_swa() const;

 private:
-    llama_memory_status status;
+    const llama_memory_status status;

     //llama_kv_cache_unified_iswa * kv;

@@ -133,6 +131,6 @@ class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {

     std::vector<llama_ubatch> ubatches;

-    llama_memory_state_ptr state_base;
-    llama_memory_state_ptr state_swa;
+    std::unique_ptr<llama_kv_cache_unified_state> state_base;
+    std::unique_ptr<llama_kv_cache_unified_state> state_swa;
 };
