@@ -8033,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
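Note on the x_ne change, which recurs in the f16_f32 and q_f32 paths below: ggml only permits mul_mat when the inner dimensions of src0 and src1 agree, so ne01 * ne00 equals ne01 * ne10 numerically; the fix makes the extent of the dequantized X buffer read from src0's own dimensions. A minimal sketch of the invariant this relies on; the assert is added here for illustration and is not a line from this commit:

    GGML_ASSERT(ne00 == ne10);     // inner dims must match for mul_mat
    const int x_ne = ne01 * ne00;  // numerically equal to ne01 * ne10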
@@ -8239,7 +8239,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
@@ -8498,39 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
         size_t x_size, y_size, d_size, q_size;
-        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        float * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
+        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        void  * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
 
-        void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
-        if (type == GGML_TYPE_Q4_0) {
-            dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_1) {
-            dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_2) {
-            dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_0) {
-            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_1) {
-            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q8_0) {
-            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
-#elif !defined(GGML_USE_CLBLAST)
+        const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
+        GGML_ASSERT(dequantize_row_q_cuda != NULL);
+#else
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 #endif
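The definition of ggml_get_dequantize_row_q_cuda is not among the hunks shown. A plausible reconstruction from the if/else chain it replaces; the typedef mirrors the removed function-pointer declaration, while the switch shape and its placement in ggml-cuda.cu are assumptions — only the kernel names come from the old code:

    typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);

    static dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
        switch (type) {
            case GGML_TYPE_Q4_0: return dequantize_row_q4_0_cuda;
            case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda;
            case GGML_TYPE_Q4_2: return dequantize_row_q4_2_cuda;
            case GGML_TYPE_Q5_0: return dequantize_row_q5_0_cuda;
            case GGML_TYPE_Q5_1: return dequantize_row_q5_1_cuda;
            case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda;
            default:             return NULL; // caller asserts non-NULL
        }
    }

Returning NULL for unsupported types preserves the old GGML_ASSERT(false) behavior via the GGML_ASSERT(dequantize_row_q_cuda != NULL) at the call site.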
@@ -8545,7 +8525,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
             // copy and dequantize on device
             CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
 
-            dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
+            dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
             CUDA_CHECK(cudaGetLastError());
 #elif defined(GGML_USE_CLBLAST)
             const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
@@ -8565,6 +8545,9 @@ static void ggml_compute_forward_mul_mat_q_f32(
             // copy data to device
             CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
+            // wait for dequantization
+            CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
+
             // compute
             CUBLAS_CHECK(
                 cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
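The cudaEventRecord that pairs with this cudaStreamWaitEvent is not visible in the hunks above; presumably g_cudaEvent is recorded on g_cudaStream2 right after the dequantization kernel. A self-contained sketch of the two-stream pattern these changes set up, using hypothetical names rather than llama.cpp code:

    #include <cuda_runtime.h>

    // Dequantization of X runs on stream2 while the host->device copy of Y
    // runs on stream1; an event orders the GEMM after both have finished.
    void overlap_pattern(void) {
        cudaStream_t stream1, stream2;
        cudaEvent_t  event;
        cudaStreamCreate(&stream1);
        cudaStreamCreate(&stream2);
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming);

        // enqueue on stream2: copy quantized X, launch dequantize kernel
        // ...
        cudaEventRecord(event, stream2);         // marks "X is dequantized"

        // enqueue on stream1: copy Y (overlaps with stream2's work)
        // ...
        cudaStreamWaitEvent(stream1, event, 0);  // GEMM must not start before X is ready
        // enqueue on stream1: cublasSgemm(...)

        cudaEventDestroy(event);
        cudaStreamDestroy(stream2);
        cudaStreamDestroy(stream1);
    }

With cudaEventDisableTiming the event is cheaper to record, and since the wait is enqueued on the stream rather than the host, the CPU never blocks.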