@@ -814,7 +814,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             int   il) const {
     const int64_t n_embd   = cur->ne[0];
     const int64_t n_tokens = cur->ne[1];
-    const bool scale_before_ffn = arch == LLM_ARCH_LLAMA4;
+    const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN

     ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
@@ -875,13 +875,16 @@ ggml_tensor * llm_graph_context::build_moe_ffn(

     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);

+    if (weight_before_ffn) {
+        ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
+        repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
+        cur = ggml_mul(ctx0, repeated, weights);
+        cb(cur, "ffn_moe_weighted", il);
+    }
+
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    if (scale_before_ffn) {
-        up = ggml_mul(ctx0, up, weights);
-    }
-
     ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(gate, "ffn_moe_gate", il);
@@ -906,7 +909,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

-    if (!scale_before_ffn) {
+    if (!weight_before_ffn) {
         experts = ggml_mul(ctx0, experts, weights);
     }
0 commit comments