@@ -8245,8 +8245,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8245
8245
ggml_fp16_t * d_X = ggml_cuda_pool_malloc (sizeof (float ) * x_ne , & x_size );
8246
8246
ggml_fp16_t * d_Y = ggml_cuda_pool_malloc (sizeof (float ) * y_ne , & y_size );
8247
8247
float * d_D = ggml_cuda_pool_malloc (sizeof (float ) * d_ne , & d_size );
8248
- #else
8249
- float * const wdata = params -> wdata ;
8250
8248
#endif
8251
8249
for (int64_t i03 = 0 ; i03 < ne03 ; i03 ++ ) {
8252
8250
for (int64_t i02 = 0 ; i02 < ne02 ; i02 ++ ) {
@@ -8263,15 +8261,20 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8263
8261
wdata [id ++ ] = GGML_FP32_TO_FP16 (* (float * ) ((char * ) src1 -> data + i03 * nb13 + i02 * nb12 + i01 * nb11 + i00 * nb10 ));
8264
8262
}
8265
8263
}
8264
+
8265
+ assert (id * sizeof (ggml_fp16_t ) <= params -> wsize );
8266
8266
}
8267
8267
#else
8268
+ float * const wdata = params -> wdata ;
8268
8269
{
8269
8270
size_t id = 0 ;
8270
8271
for (int64_t i01 = 0 ; i01 < ne01 ; ++ i01 ) {
8271
8272
for (int64_t i00 = 0 ; i00 < ne00 ; ++ i00 ) {
8272
8273
wdata [id ++ ] = GGML_FP16_TO_FP32 (* (ggml_fp16_t * ) ((char * ) src0 -> data + i03 * nb03 + i02 * nb02 + i01 * nb01 + i00 * nb00 ));
8273
8274
}
8274
8275
}
8276
+
8277
+ assert (id * sizeof (float ) <= params -> wsize );
8275
8278
}
8276
8279
#endif
8277
8280
@@ -8537,7 +8540,10 @@ static void ggml_compute_forward_mul_mat_q_f32(
8537
8540
dequantize_row_q ((char * ) src0 -> data + i03 * nb03 + i02 * nb02 + i01 * nb01 , wdata + id , ne00 );
8538
8541
id += ne00 ;
8539
8542
}
8543
+
8544
+ assert (id * sizeof (float ) <= params -> wsize );
8540
8545
}
8546
+
8541
8547
const float * x = wdata ;
8542
8548
#endif
8543
8549
@@ -11571,10 +11577,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11571
11577
if (ggml_compute_forward_mul_mat_use_blas (node -> src0 , node -> src1 , node )) {
11572
11578
node -> n_tasks = 1 ; // TODO: this actually is doing nothing
11573
11579
// the threads are still spinning
11574
- cur = GGML_TYPE_SIZE [GGML_TYPE_F32 ]* MAX (ggml_nelements (node -> src1 ), ggml_nelements (node -> src0 ));
11575
- //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
11576
- //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
11577
- //printf("cur = %zu\n", cur);
11580
+ #if defined(GGML_USE_CUBLAS )
11581
+ // with cuBLAS, we need memory for the full 3D / 4D data of src1
11582
+ cur = GGML_TYPE_SIZE [GGML_TYPE_F16 ]* ggml_nelements (node -> src1 );
11583
+ #else
11584
+ // here we need memory just for a single 2D matrix from src0
11585
+ cur = GGML_TYPE_SIZE [GGML_TYPE_F32 ]* (node -> src0 -> ne [0 ]* node -> src0 -> ne [1 ]);
11586
+ #endif
11578
11587
} else {
11579
11588
cur = GGML_TYPE_SIZE [GGML_TYPE_F16 ]* ggml_nelements (node -> src1 );
11580
11589
}
0 commit comments