ggml : adjust mul_mat_f16 work memory (#1226)

ggerganov · web-flow · commit 214b6a35702a · 2023-04-29T18:43:28.000+03:00
* llama : minor - remove explicit int64_t cast

* ggml : reduce memory buffer for F16 mul_mat when not using cuBLAS

* ggml : add asserts to guard for incorrect wsize
diff --git a/Makefile b/Makefile
@@ -34,10 +34,15 @@ endif
 #
 
 # keep standard at C11 and C++11
-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =
 
+ifndef LLAMA_DEBUG
+	CFLAGS   += -DNDEBUG
+	CXXFLAGS += -DNDEBUG
+endif
+
 # warnings
 CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
diff --git a/ggml.c b/ggml.c
@@ -8245,8 +8245,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
         ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
         float       * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#else
-        float * const wdata = params->wdata;
 #endif
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8263,15 +8261,20 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                             wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
                         }
                     }
+
+                    assert(id*sizeof(ggml_fp16_t) <= params->wsize);
                 }
 #else
+                float * const wdata = params->wdata;
                 {
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne01; ++i01) {
                         for (int64_t i00 = 0; i00 < ne00; ++i00) {
                             wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                         }
                     }
+
+                    assert(id*sizeof(float) <= params->wsize);
                 }
 #endif
 
@@ -8537,7 +8540,10 @@ static void ggml_compute_forward_mul_mat_q_f32(
                         dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                         id += ne00;
                     }
+
+                    assert(id*sizeof(float) <= params->wsize);
                 }
+
                 const float * x = wdata;
 #endif
 
@@ -11571,10 +11577,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                             if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
-                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
-                                //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
-                                //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
-                                //printf("cur = %zu\n", cur);
+#if defined(GGML_USE_CUBLAS)
+                                // with cuBLAS, we need memory for the full 3D / 4D data of src1
+                                cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
+#else
+                                // here we need memory just for single 2D matrix from src0
+                                cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
+#endif
                             } else {
                                 cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
                             }
diff --git a/llama.cpp b/llama.cpp
@@ -780,7 +780,7 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

Original file line number	Diff line number	Diff line change
`@@ -8245,8 +8245,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(`
`8245`	`8245`	`ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);`
`8246`	`8246`	`ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);`
`8247`	`8247`	`float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);`
`8248`		`-#else`
`8249`		`- float * const wdata = params->wdata;`
`8250`	`8248`	`#endif`
`8251`	`8249`	`for (int64_t i03 = 0; i03 < ne03; i03++) {`
`8252`	`8250`	`for (int64_t i02 = 0; i02 < ne02; i02++) {`
`@@ -8263,15 +8261,20 @@ static void ggml_compute_forward_mul_mat_f16_f32(`
`8263`	`8261`	`wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));`
`8264`	`8262`	`}`
`8265`	`8263`	`}`
	`8264`	`+`
	`8265`	`+ assert(id*sizeof(ggml_fp16_t) <= params->wsize);`
`8266`	`8266`	`}`
`8267`	`8267`	`#else`
	`8268`	`+ float * const wdata = params->wdata;`
`8268`	`8269`	`{`
`8269`	`8270`	`size_t id = 0;`
`8270`	`8271`	`for (int64_t i01 = 0; i01 < ne01; ++i01) {`
`8271`	`8272`	`for (int64_t i00 = 0; i00 < ne00; ++i00) {`
`8272`	`8273`	`wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));`
`8273`	`8274`	`}`
`8274`	`8275`	`}`
	`8276`	`+`
	`8277`	`+ assert(id*sizeof(float) <= params->wsize);`
`8275`	`8278`	`}`
`8276`	`8279`	`#endif`
`8277`	`8280`
`@@ -8537,7 +8540,10 @@ static void ggml_compute_forward_mul_mat_q_f32(`
`8537`	`8540`	`dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);`
`8538`	`8541`	`id += ne00;`
`8539`	`8542`	`}`
	`8543`	`+`
	`8544`	`+ assert(id*sizeof(float) <= params->wsize);`
`8540`	`8545`	`}`
	`8546`	`+`
`8541`	`8547`	`const float * x = wdata;`
`8542`	`8548`	`#endif`
`8543`	`8549`
`@@ -11571,10 +11577,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)`
`11571`	`11577`	`if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {`
`11572`	`11578`	`node->n_tasks = 1; // TODO: this actually is doing nothing`
`11573`	`11579`	`// the threads are still spinning`
`11574`		`- cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));`
`11575`		`- //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);`
`11576`		`- //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);`
`11577`		`- //printf("cur = %zu\n", cur);`
	`11580`	`+#if defined(GGML_USE_CUBLAS)`
	`11581`	`+ // with cuBLAS, we need memory for the full 3D / 4D data of src1`
	`11582`	`+ cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);`
	`11583`	`+#else`
	`11584`	`+ // here we need memory just for single 2D matrix from src0`
	`11585`	`+ cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);`
	`11586`	`+#endif`
`11578`	`11587`	`} else {`
`11579`	`11588`	`cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);`
`11580`	`11589`	`}`