@@ -8252,7 +8252,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8252
8252
for (int64_t i02 = 0 ; i02 < ne02 ; i02 ++ ) {
8253
8253
#if defined(GGML_USE_CUBLAS )
8254
8254
// copy src0 while converting src1
8255
- CUDA_CHECK (ggml_cuda_h2d_tensor_2d (d_X , src0 , i02 , i03 , g_cudaStream ));
8255
+ CUDA_CHECK (ggml_cuda_h2d_tensor_2d (d_X , src0 , i03 , i02 , g_cudaStream ));
8256
8256
8257
8257
// with cuBLAS, instead of converting src0 to fp32, we convert src1 to fp16
8258
8258
ggml_fp16_t * const wdata = (ggml_fp16_t * ) params -> wdata + (ne11 * ne10 ) * (i03 * ne02 + i02 );
@@ -8523,10 +8523,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
8523
8523
8524
8524
#if defined(GGML_USE_CUBLAS )
8525
8525
// copy and dequantize on device
8526
- CUDA_CHECK (ggml_cuda_h2d_tensor_2d (d_Q , src0 , i03 , i02 , g_cudaStream ));
8526
+ CUDA_CHECK (ggml_cuda_h2d_tensor_2d (d_Q , src0 , i03 , i02 , g_cudaStream2 ));
8527
8527
8528
8528
dequantize_row_q_cuda (d_Q , d_X , x_ne , g_cudaStream2 );
8529
8529
CUDA_CHECK (cudaGetLastError ());
8530
+ CUDA_CHECK (cudaEventRecord (g_cudaEvent , g_cudaStream2 ));
8530
8531
#elif defined(GGML_USE_CLBLAST )
8531
8532
const void * x = (char * ) src0 -> data + i03 * nb03 + i02 * nb02 ;
8532
8533
#else
0 commit comments