Skip to content

Commit 61710fc

Browse files
committed
kv-cache : hide defrag logic in the implementation
ggml-ci
1 parent bb1c81c commit 61710fc

File tree

4 files changed

+21
-20
lines changed

4 files changed

+21
-20
lines changed

src/llama-context.cpp

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1426,19 +1426,8 @@ int llama_context::decode(llama_batch & inp_batch) {
14261426
//synchronize();
14271427

14281428
// decide if we need to defrag the kv cache
1429-
if (!llama_model_is_recurrent(&model) && cparams.causal_attn && cparams.defrag_thold > 0.0f) {
1430-
auto * kv = static_cast<llama_kv_cache_unified *>(kv_self);
1431-
1432-
// - do not defrag small contexts (i.e. < 2048 tokens)
1433-
// - count the padding towards the number of used tokens
1434-
const float fragmentation = kv->n >= 2048 ? std::max(0.0f, 1.0f - float(kv->used + kv->padding)/float(kv->n)) : 0.0f;
1435-
1436-
// queue defragmentation for next llama_kv_cache_update
1437-
if (fragmentation > cparams.defrag_thold) {
1438-
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
1439-
1440-
kv_self->defrag();
1441-
}
1429+
if (cparams.defrag_thold > 0.0f) {
1430+
kv_self->defrag(cparams.defrag_thold);
14421431
}
14431432

14441433
// Reset state for the next token before backend sync, to allow the CPU activities in the reset to
@@ -2588,7 +2577,8 @@ void llama_kv_self_defrag(llama_context * ctx) {
25882577
return;
25892578
}
25902579

2591-
return kv->defrag();
2580+
// force defrag
2581+
return kv->defrag(-1.0f);
25922582
}
25932583

25942584
// deprecated

src/llama-kv-cache.cpp

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -357,8 +357,17 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
357357
return result;
358358
}
359359

360-
void llama_kv_cache_unified::defrag() {
361-
do_defrag = true;
360+
void llama_kv_cache_unified::defrag(float thold) {
361+
// - do not defrag small contexts (i.e. < 2048 tokens)
362+
// - count the padding towards the number of used tokens
363+
const float fragmentation = n >= 2048 ? std::max(0.0f, 1.0f - float(used + padding)/float(n)) : 0.0f;
364+
365+
// queue defragmentation for next llama_kv_cache_update
366+
if (fragmentation > thold) {
367+
LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation);
368+
369+
do_defrag = true;
370+
}
362371
}
363372

364373
void llama_kv_cache_unified::restore() {
@@ -1358,7 +1367,8 @@ llama_pos llama_kv_cache_recurrent::seq_pos_max(llama_seq_id seq_id) const {
13581367
return result;
13591368
}
13601369

1361-
void llama_kv_cache_recurrent::defrag() {
1370+
void llama_kv_cache_recurrent::defrag(float thold) {
1371+
GGML_UNUSED(thold);
13621372
LLAMA_LOG_ERROR("%s: not supported\n", __func__);
13631373
}
13641374

src/llama-kv-cache.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ struct llama_kv_cache : public llama_memory_i {
3131
virtual void restore() = 0; // call if batch processing fails - restores the cache state
3232
virtual void commit() = 0; // call after successful batch processing - clears any pending state
3333

34+
virtual void defrag(float thold) = 0;
35+
3436
virtual int32_t get_n_tokens() const = 0;
3537
virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
3638

@@ -124,7 +126,7 @@ class llama_kv_cache_unified : public llama_kv_cache {
124126
llama_pos get_pos_max() const override;
125127

126128
void clear() override;
127-
void defrag() override;
129+
void defrag(float thold) override;
128130

129131
void restore() override;
130132
void commit() override;
@@ -252,7 +254,7 @@ class llama_kv_cache_recurrent : public llama_kv_cache {
252254
llama_pos get_pos_max() const override;
253255

254256
void clear() override;
255-
void defrag() override;
257+
void defrag(float thold) override;
256258

257259
void restore() override;
258260
void commit() override;

src/llama-memory.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ class llama_memory_i {
2323
virtual ~llama_memory_i() = default;
2424

2525
virtual void clear() = 0;
26-
virtual void defrag() = 0;
2726

2827
virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
2928
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;

0 commit comments

Comments (0)