Commit 8c3a6a2

context : move memory creation logic to model
ggml-ci
1 parent 3e8eceb

4 files changed: +23 −38 lines

src/llama-context.cpp (5 additions, 27 deletions)

@@ -178,34 +178,12 @@ llama_context::llama_context(
 
     // init the memory module
     if (!hparams.vocab_only) {
-        LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-
-        if (!llama_model_is_recurrent(&model)) {
-            cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_unified::get_padding(cparams));
-
-            LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
-
-            llama_memory_params params_mem = {
-                /*.type_k      =*/ params.type_k,
-                /*.type_v      =*/ params.type_v,
-                /*.v_trans     =*/ !cparams.flash_attn,
-                /*.offload_kqv =*/ cparams.offload_kqv,
-                /*.kv_size     =*/ cparams.n_ctx,
-            };
-
-            memory.reset(model.create_memory(params_mem));
-        } else {
-            llama_memory_params params_mem = {
-                /*.type_k      =*/ GGML_TYPE_F32, // required by ggml_ssm_conv for Mamba's conv_states
-                /*.type_v      =*/ GGML_TYPE_F32, // required by ggml_ssm_scan for Mamba's ssm_states
-                /*.v_trans     =*/ false, // unused
-                /*.offload_kqv =*/ cparams.offload_kqv,
-                /*.kv_size     =*/ std::max((uint32_t) 1, params.n_seq_max), // Mamba needs at least as many KV cells as there are sequences kept at any time
-            };
-
-            memory.reset(model.create_memory(params_mem));
-        }
+        llama_memory_params params_mem = {
+            /*.type_k =*/ params.type_k,
+            /*.type_v =*/ params.type_v,
+        };
 
+        memory.reset(model.create_memory(cparams, params_mem));
     }
 
     // init backends
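
The key detail in this hunk is that cparams is now passed to model.create_memory() by non-const reference: the factory may pad cparams.n_ctx in place, and the context then keeps using the padded value. Below is a minimal, self-contained sketch of that in/out pattern, with toy types and an arbitrary padding granularity of 256 (hypothetical names, not the llama.cpp API):

    #include <cstdint>
    #include <cstdio>
    #include <memory>

    // Toy stand-ins for llama_cparams / llama_memory_params (hypothetical).
    struct toy_cparams       { uint32_t n_ctx; bool flash_attn; };
    struct toy_memory_params { int type_k; int type_v; };

    struct toy_memory {
        uint32_t kv_size;
        bool     v_trans;
    };

    // Round x up to a multiple of pad, in the spirit of GGML_PAD(x, pad).
    static uint32_t pad_to(uint32_t x, uint32_t pad) {
        return ((x + pad - 1) / pad) * pad;
    }

    // Factory shaped like llama_model::create_memory() after this commit:
    // it derives kv_size / v_trans from cparams and may mutate cparams.n_ctx.
    static toy_memory * create_memory(toy_cparams & cparams, const toy_memory_params & /*params*/) {
        cparams.n_ctx = pad_to(cparams.n_ctx, 256); // padding is now decided by the factory
        return new toy_memory { cparams.n_ctx, !cparams.flash_attn };
    }

    int main() {
        toy_cparams       cparams    = { /*.n_ctx =*/ 1000, /*.flash_attn =*/ false };
        toy_memory_params params_mem = { /*.type_k =*/ 0, /*.type_v =*/ 0 };

        std::unique_ptr<toy_memory> memory(create_memory(cparams, params_mem));

        // The caller observes the padded context size chosen by the factory.
        std::printf("n_ctx = %u, kv_size = %u\n", cparams.n_ctx, memory->kv_size);
        return 0;
    }

Run as-is this prints "n_ctx = 1024, kv_size = 1024": the caller no longer needs to know the cache's padding rules.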

src/llama-memory.h (3 additions, 3 deletions)

@@ -7,10 +7,10 @@ struct llama_memory_params {
     ggml_type type_k;
     ggml_type type_v;
 
-    bool v_trans;
-    bool offload_kqv;
+    //bool v_trans;
+    //bool offload_kqv;
 
-    uint32_t kv_size;
+    //uint32_t kv_size;
 
     // other types of memory
     // ...

src/llama-model.cpp (13 additions, 7 deletions)

@@ -12764,10 +12764,10 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };
 
-llama_memory_i * llama_model::create_memory(const llama_memory_params & params) const {
+llama_memory_i * llama_model::create_memory(llama_cparams & cparams, const llama_memory_params & params) const {
     llama_memory_i * res;
 
-    const bool offload = params.offload_kqv;
+    const bool offload = cparams.offload_kqv;
 
     auto get_buft = [this, offload](int il) {
         const char * dev_name = "CPU";
@@ -12787,6 +12787,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params)
         return buft;
     };
 
+    LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
+
     switch (arch) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
@@ -12800,12 +12802,16 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params)
                     /*.get_rope_factors =*/ nullptr,
                     /*.get_buft         =*/ get_buft,
                 },
-                params.type_k,
-                params.type_v,
-                params.kv_size);
+                GGML_TYPE_F32,
+                GGML_TYPE_F32,
+                std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         default:
             {
+                cparams.n_ctx = GGML_PAD(cparams.n_ctx, llama_kv_cache_unified::get_padding(cparams));
+
+                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
                 res = new llama_kv_cache_unified(
                     hparams,
                     {
@@ -12825,8 +12831,8 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params)
                 },
                 params.type_k,
                 params.type_v,
-                params.v_trans,
-                params.kv_size);
+                !cparams.flash_attn,
+                cparams.n_ctx);
             }
     }
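
With the creation logic now in the model, the per-architecture decisions happen inside this switch: the recurrent branch (LLM_ARCH_MAMBA, LLM_ARCH_RWKV6, ...) forces F32 state tensors, as required by ggml_ssm_conv / ggml_ssm_scan, and sizes the cache by n_seq_max, while the default branch pads n_ctx via llama_kv_cache_unified::get_padding() and forwards the caller's type_k / type_v. A simplified, self-contained sketch of that dispatch, with toy types and a fixed toy padding in place of the real cache classes (hypothetical names, not the llama.cpp API):

    #include <algorithm>
    #include <cstdint>

    // Toy stand-ins (hypothetical) that mirror the shape of the real dispatch.
    enum toy_arch { ARCH_MAMBA, ARCH_RWKV6, ARCH_TRANSFORMER };
    enum toy_type { TYPE_F32, TYPE_F16 };

    struct toy_cparams       { uint32_t n_ctx; uint32_t n_seq_max; bool flash_attn; };
    struct toy_memory_params { toy_type type_k; toy_type type_v; };

    struct toy_memory {
        toy_type type_k, type_v;
        bool     v_trans;
        uint32_t kv_size;
    };

    // Stand-in for GGML_PAD + llama_kv_cache_unified::get_padding(cparams).
    static uint32_t pad_to(uint32_t x, uint32_t pad) { return ((x + pad - 1) / pad) * pad; }

    // Same shape as llama_model::create_memory() after this commit: recurrent
    // architectures get F32 state tensors sized by the sequence count, the
    // default branch pads n_ctx and uses the caller-provided cache types.
    static toy_memory * create_memory(toy_arch arch, toy_cparams & cparams, const toy_memory_params & params) {
        switch (arch) {
            case ARCH_MAMBA:
            case ARCH_RWKV6:
                return new toy_memory {
                    TYPE_F32, TYPE_F32,                        // recurrent states stay F32
                    false,                                     // v_trans is unused here
                    std::max((uint32_t) 1, cparams.n_seq_max), // at least one cell per sequence
                };
            default:
                cparams.n_ctx = pad_to(cparams.n_ctx, 256);    // toy padding, mutated in place
                return new toy_memory {
                    params.type_k, params.type_v,
                    !cparams.flash_attn,                       // matches !cparams.flash_attn in the hunk above
                    cparams.n_ctx,
                };
        }
    }

    int main() {
        toy_cparams       cparams    = { /*.n_ctx =*/ 1000, /*.n_seq_max =*/ 4, /*.flash_attn =*/ false };
        toy_memory_params params_mem = { TYPE_F16, TYPE_F16 };

        toy_memory * mem = create_memory(ARCH_TRANSFORMER, cparams, params_mem);
        // cparams.n_ctx and mem->kv_size are both padded to 1024 here.
        delete mem;
        return 0;
    }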

src/llama-model.h (2 additions, 1 deletion)

@@ -390,8 +390,9 @@ struct llama_model {
 
     const struct ggml_tensor * get_tensor(const char * name) const;
 
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory(const llama_memory_params & params) const;
+    llama_memory_i * create_memory(llama_cparams & cparams, const llama_memory_params & params) const;
 
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(
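
For reference, the caller side of this new declaration is the llama-context.cpp hunk at the top of this commit: only the cache tensor types are passed explicitly, and everything else is derived from cparams inside the model, which is why the declaration above takes cparams as a mutable reference.

    llama_memory_params params_mem = {
        /*.type_k =*/ params.type_k,
        /*.type_v =*/ params.type_v,
    };

    // cparams may come back with a padded n_ctx for the unified KV cache.
    memory.reset(model.create_memory(cparams, params_mem));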
