Commit 2ed9fb4

fusion POC

1 parent 1534edd commit 2ed9fb4
File tree

4 files changed: +128 -34 lines changed

ggml/include/ggml.h

Lines changed: 5 additions & 1 deletion
@@ -614,7 +614,11 @@ extern "C" {
 
         void * extra; // extra things e.g. for ggml-cuda.cu
 
-        char padding[8];
+        // number of operations that use this tensor as a src
+        int32_t use_count;
+
+        // add padding if needed to make a multiple of GGML_MEM_ALIGN
+        char padding[4];
     };
 
     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
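
The struct edit is size-neutral: a 4-byte int32_t use_count plus char padding[4] occupies the same 8 bytes as the old char padding[8], so GGML_TENSOR_SIZE stays a multiple of GGML_MEM_ALIGN. A minimal sketch of that invariant on common 32/64-bit ABIs (the before/after struct names are illustrative, not from the commit):

    #include <cstdint>

    struct before { void * extra; char padding[8]; };
    struct after  { void * extra; int32_t use_count; char padding[4]; };

    // 8 bytes of padding swapped for a 4-byte counter plus 4 bytes of padding
    static_assert(sizeof(before) == sizeof(after), "tensor struct size unchanged");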

ggml/src/ggml-backend.cpp

Lines changed: 90 additions & 2 deletions
@@ -817,8 +817,8 @@ static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, str
         }
         if (sched->debug > 1) {
             ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
-            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
-                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+            GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s] use=%d:", i, ggml_op_name(node->op), node->name,
+                fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node), node->use_count);
             for (int j = 0; j < GGML_MAX_SRC; j++) {
                 struct ggml_tensor * src = node->src[j];
                 if (src == NULL) {
@@ -1562,11 +1562,99 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
     return true;
 }
 
+// check that `view` is a 2D view of `src` with the given shape, row stride and offset
+static bool is_view_2d(ggml_tensor * view,
+                       ggml_tensor * src,
+                       int64_t ne0,
+                       int64_t ne1,
+                       size_t nb1,
+                       size_t offset) {
+    if (view->op != GGML_OP_VIEW || view->view_src != src) {
+        return false;
+    }
+
+    if (view->nb[0] != src->nb[0] ||
+        view->nb[1] != nb1 ||
+        view->nb[2] != view->nb[1] * ne1 ||
+        view->nb[3] != view->nb[2]) {
+        return false;
+    }
+    if (view->ne[0] != ne0 ||
+        view->ne[1] != ne1 ||
+        view->ne[2] != 1 ||
+        view->ne[3] != 1) {
+        return false;
+    }
+    if (view->view_offs != view->view_src->view_offs + offset) {
+        return false;
+    }
+    return true;
+}
+
 bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
     GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + graph->n_leafs);
 
     ggml_backend_sched_split_graph(sched, graph);
 
+    for (int s = 0; s < sched->n_splits; s++) {
+        for (int i = sched->splits[s].graph.n_nodes - 1; i >= 0; i--) {
+            struct ggml_tensor * node = sched->splits[s].graph.nodes[i];
+
+            // peephole to find swiglu:
+            //   x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+            //   x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+            //   x0 = ggml_silu(ctx0, x0);
+            //   ggml_mul(ctx0, x0, x1);
+            do {
+                if (node->op != GGML_OP_MUL) {
+                    break;
+                }
+                ggml_tensor * src0 = node->src[0];
+                ggml_tensor * src1 = node->src[1];
+                if (src0->op != GGML_OP_UNARY || ggml_get_op_params_i32(src0, 0) != GGML_UNARY_OP_SILU) {
+                    break;
+                }
+                src0 = src0->src[0];
+                if (src0->op != GGML_OP_CONT || src1->op != GGML_OP_CONT) {
+                    break;
+                }
+                if (src0->use_count != 1 || src1->use_count != 1) {
+                    break;
+                }
+                src0 = src0->src[0];
+                src1 = src1->src[0];
+
+                if (src0->use_count != 1 || src1->use_count != 1) {
+                    break;
+                }
+
+                ggml_tensor * input = src0->src[0];
+                if (!input || input->use_count != 2) {
+                    break;
+                }
+
+                uint32_t split_point = input->ne[0] / 2;
+
+                // bail out of the peephole (do not fail the whole allocation)
+                // if the views do not match the expected split geometry
+                if (!is_view_2d(src0, input, split_point, input->ne[1], input->nb[1], 0)) {
+                    break;
+                }
+                if (!is_view_2d(src1, input, split_point, input->ne[1], input->nb[1], split_point * ggml_element_size(input))) {
+                    break;
+                }
+                //printf("detected swiglu\n");
+
+                // neutralize the matched nodes and rewrite the MUL into a fused GLU
+                node->src[0]->op = GGML_OP_NONE;
+                node->src[0]->src[0]->op = GGML_OP_NONE;
+                node->src[1]->op = GGML_OP_NONE;
+
+                node->op = GGML_OP_GLU;
+                node->src[0] = input;
+                node->src[1] = NULL;
+                ggml_set_op_params_i32(node, 0, (int32_t) GGML_GLU_OP_SWIGLU);
+                ggml_set_op_params_i32(node, 1, (int32_t) false); // swapped = false
+            } while (0);
+        }
+    }
+
     if (!ggml_backend_sched_alloc_splits(sched)) {
         return false;
     }
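
For reference, the subgraph shape this peephole matches can be built with the public ggml API as below (a hedged sketch, not part of the commit; the helper name unfused_swiglu is illustrative). Once ggml_backend_sched_alloc_graph runs, the final MUL node is rewritten in place into a single GGML_OP_GLU with op param GGML_GLU_OP_SWIGLU reading cur directly, and the matched CONT and SILU nodes are downgraded to GGML_OP_NONE no-ops:

    // mirrors the pattern in the peephole comment above
    static ggml_tensor * unfused_swiglu(ggml_context * ctx0, ggml_tensor * cur) {
        const int64_t split_point = cur->ne[0] / 2;
        // left and right halves of the doubled-width up projection, made contiguous
        ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
        ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
        x0 = ggml_silu(ctx0, x0);
        return ggml_mul(ctx0, x0, x1); // the node the peephole rewrites to GGML_OP_GLU
    }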

ggml/src/ggml.c

Lines changed: 2 additions & 0 deletions
@@ -1640,6 +1640,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
         /*.data      =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data,
         /*.name      =*/ { 0 },
         /*.extra     =*/ NULL,
+        /*.use_count =*/ 0,
         /*.padding   =*/ { 0 },
     };
 
@@ -5962,6 +5963,7 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
             /* unknown order, just fall back to using i*/ i;
         if (node->src[k]) {
             ggml_visit_parents(cgraph, node->src[k]);
+            node->src[k]->use_count++;
         }
     }
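
These two lines make use_count equal, after graph construction, to the number of nodes that read a tensor as a src. A small sketch of the resulting counts (assumes one ggml_build_forward_expand over a fresh graph; use_count_example is an illustrative name):

    static void use_count_example(ggml_context * ctx0, ggml_cgraph * gf) {
        ggml_tensor * a = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 16);
        ggml_tensor * b = ggml_silu(ctx0, a);   // reads a once...
        ggml_tensor * c = ggml_mul(ctx0, b, a); // ...and a second time here
        ggml_build_forward_expand(gf, c);
        // now: a->use_count == 2, b->use_count == 1, c->use_count == 0 (graph output)
    }

This is what lets the scheduler's peephole demand use_count == 1 on the intermediate CONT/VIEW tensors and use_count == 2 on the shared input: fusing is only safe when nothing else in the graph reads those intermediates.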

src/llama-graph.cpp

Lines changed: 31 additions & 31 deletions
@@ -554,20 +554,12 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     switch (type_op) {
         case LLM_FFN_SILU:
-            if (gate && type_gate == LLM_FFN_PAR) {
-                cur = ggml_swiglu_split(ctx0, cur, tmp);
-                cb(cur, "ffn_swiglu", il);
-                type_gate = LLM_FFN_SEQ;
-            } else {
+            {
                 cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_silu", il);
             } break;
         case LLM_FFN_GELU:
-            if (gate && type_gate == LLM_FFN_PAR) {
-                cur = ggml_geglu_split(ctx0, cur, tmp);
-                cb(cur, "ffn_geglu", il);
-                type_gate = LLM_FFN_SEQ;
-            } else {
+            {
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_gelu", il);
                 if (act_scales != NULL) {
@@ -576,11 +568,7 @@ ggml_tensor * llm_graph_context::build_ffn(
                 }
             } break;
         case LLM_FFN_RELU:
-            if (gate && type_gate == LLM_FFN_PAR) {
-                cur = ggml_reglu_split(ctx0, cur, tmp);
-                cb(cur, "ffn_reglu", il);
-                type_gate = LLM_FFN_SEQ;
-            } else {
+            {
                 cur = ggml_relu(ctx0, cur);
                 cb(cur, "ffn_relu", il);
             } break;
@@ -594,19 +582,32 @@ ggml_tensor * llm_graph_context::build_ffn(
             } break;
         case LLM_FFN_SWIGLU:
             {
-                cur = ggml_swiglu(ctx0, cur);
-                cb(cur, "ffn_swiglu", il);
+                // Project to 4h. If using swiglu, double the output width; see https://arxiv.org/pdf/2002.05202.pdf
+                int64_t split_point = cur->ne[0] / 2;
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
+                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                x0 = ggml_silu(ctx0, x0);
+                cb(x0, "ffn_silu", il);
+
+                cur = ggml_mul(ctx0, x0, x1);
+                cb(cur, "ffn_mul", il);
             } break;
         case LLM_FFN_GEGLU:
             {
-                cur = ggml_geglu(ctx0, cur);
+                // Split into two equal parts
+                int64_t split_point = cur->ne[0] / 2;
+                // TODO: these conts should not be needed, see https://github.com/ggml-org/llama.cpp/pull/14090#discussion_r2137437217
+                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                x0 = ggml_gelu(ctx0, x0);
+                cb(x0, "ffn_gelu", il);
+
+                cur = ggml_mul(ctx0, x0, x1);
                 cb(cur, "ffn_geglu", il);
             } break;
-        case LLM_FFN_REGLU:
-            {
-                cur = ggml_reglu(ctx0, cur);
-                cb(cur, "ffn_reglu", il);
-            } break;
     }
 
     if (gate && type_gate == LLM_FFN_PAR) {
@@ -736,25 +737,24 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
 
     switch (type_op) {
         case LLM_FFN_SILU:
-            if (gate_exps) {
-                cur = ggml_swiglu_split(ctx0, cur, up);
-                cb(cur, "ffn_moe_swiglu", il);
-            } else {
+            {
                 cur = ggml_silu(ctx0, cur);
                 cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
-            if (gate_exps) {
-                cur = ggml_geglu_split(ctx0, cur, up);
-                cb(cur, "ffn_moe_geglu", il);
-            } else {
+            {
                 cur = ggml_gelu(ctx0, cur);
                 cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }
 
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }
+
     experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
