@@ -8242,15 +8242,18 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     const int d_ne = ne11 * ne01;
 
     size_t x_size, y_size, d_size;
-    float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-    float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-    float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+    ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+    ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+    float       * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
 #else
     float * const wdata = params->wdata;
 #endif
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
 #if defined(GGML_USE_CUBLAS)
+            // copy src0 while converting src1
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i02, i03, g_cudaStream));
+
             // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
             ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
             {
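Two things change in this hunk: d_X and d_Y become ggml_fp16_t * to match the data they now hold (the pool allocations still reserve sizeof(float) per element, so the fp16 buffers are over-allocated by 2x, wasteful but safe), and the upload of src0 moves to the top of the loop body, before the CPU converts src1 to fp16. Since an async copy on a stream returns immediately, queuing the upload first lets the DMA transfer overlap the conversion loop. A minimal sketch of that overlap pattern, assuming pinned host buffers and hypothetical names (this is not the ggml code):

```c
// Minimal sketch of the copy/convert overlap enabled by the reordering.
// Not ggml code: the function, buffer names, and pinned host memory are
// assumptions for illustration. Compile with nvcc.
#include <cuda_runtime.h>
#include <cuda_fp16.h>
#include <stddef.h>

void upload_x_while_converting_y(
        const __half * x_host, __half * d_X, size_t x_ne,   // fp16 weights (pinned)
        const float  * y_f32,                               // fp32 activations
        __half * y_host, __half * d_Y, size_t y_ne,         // pinned staging + device dst
        cudaStream_t stream) {
    // queue the weight upload first: cudaMemcpyAsync returns immediately,
    // so the DMA engine works while the CPU converts below
    cudaMemcpyAsync(d_X, x_host, x_ne * sizeof(__half),
                    cudaMemcpyHostToDevice, stream);

    // fp32 -> fp16 conversion overlaps the in-flight copy
    for (size_t i = 0; i < y_ne; i++) {
        y_host[i] = __float2half(y_f32[i]);
    }

    // the activations follow on the same stream; stream ordering means a
    // GEMM queued afterwards sees both buffers filled, with no explicit sync
    cudaMemcpyAsync(d_Y, y_host, y_ne * sizeof(__half),
                    cudaMemcpyHostToDevice, stream);
}
```

Because both copies go on the same stream, the cuBLAS GEMM issued later on that stream is guaranteed to run after both transfers complete, with no extra synchronization.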
@@ -8274,11 +8277,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
 #if defined(GGML_USE_CUBLAS)
             const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
-
             float * d = (float *) ((char *) dst->data + i02 * nb2 + i03 * nb3);
 
             // copy data to device
-            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
             CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
 
             // compute
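This second hunk is the other half of the move: the old upload of src0 is deleted here, where it ran only after the fp16 conversion had finished. Note the argument order also flips, from (i03, i02) in the removed call to (i02, i03) in the call added above; each index selects the slice through its own byte stride, so the order matters whenever those dimensions differ. The helper's body is not part of this diff; below is a hedged sketch of what a 2D host-to-device tensor copy like ggml_cuda_h2d_tensor_2d plausibly looks like, assuming contiguous rows of a non-quantized type:

```c
// Hedged sketch, not the real helper body (which this diff does not show):
// upload the (i02, i03) slice of a row-major ggml tensor with
// cudaMemcpy2DAsync, honoring the host row stride nb[1]. Assumes a
// non-quantized type, so a row is ne[0] dense elements.
#include <cuda_runtime.h>
#include "ggml.h" // struct ggml_tensor, ggml_type_size

static cudaError_t h2d_tensor_2d_sketch(
        void * dst, const struct ggml_tensor * src,
        int64_t i02, int64_t i03, cudaStream_t stream) {
    // i02/i03 pick the slice: each multiplies its own byte stride,
    // which is why swapping them would address the wrong data
    const char * x = (const char *) src->data + i02*src->nb[2] + i03*src->nb[3];
    const size_t row_bytes = src->ne[0] * ggml_type_size(src->type);
    // dpitch = row_bytes: rows land densely on the device;
    // spitch = nb[1]: skips any padding between rows on the host
    return cudaMemcpy2DAsync(dst, row_bytes, x, src->nb[1],
                             row_bytes, src->ne[1],
                             cudaMemcpyHostToDevice, stream);
}
```

cudaMemcpy2DAsync takes separate source and destination pitches, which is what lets a padded host layout (nb[1] wider than the dense row) land densely on the device in one call.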