Skip to content

Commit 0ffcd89

Browse files
committed
ggml : reduce memory buffer for F16 mul_mat when not using cuBLAS
1 parent 150e135 commit 0ffcd89

File tree

1 file changed

+7
-4
lines changed

1 file changed

+7
-4
lines changed

ggml.c

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11571,10 +11571,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11571 11571
if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11572 11572
node->n_tasks = 1; // TODO: this actually is doing nothing
11573 11573
// the threads are still spinning
11574-
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
11575-
//printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
11576-
//printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
11577-
//printf("cur = %zu\n", cur);
11574+
#if defined(GGML_USE_CUBLAS)
11575+
// with cuBLAS, we need memory for the full 3D / 4D data of src1
11576+
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
11577+
#else
11578+
// here we need memory just for single 2D matrix from src0
11579+
cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
11580+
#endif
11578 11581
} else {
11579 11582
cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
11580 11583
}

0 commit comments

Comments
 (0)