@@ -5730,47 +5730,49 @@ inline void ggml_cuda_op_mul_mat_vec(
 #endif // GGML_CUDA_FORCE_DMMV
 
     if (use_mul_mat_vec_q) {
+        const int nchannels = buffers_contiguous ? 1 : ne02;
+
         const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
             ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
-        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne02*sizeof(block_q8_1)/QK8_1, &as);
-        const int64_t row_stride_q     = src1->backend == GGML_BACKEND_CPU ? ne10   : nb11 / sizeof(float);
-        const int64_t channel_stride_q = src1->backend == GGML_BACKEND_CPU ? ne10*1 : nb12 / sizeof(float);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, ne02, row_stride_q, channel_stride_q, cudaStream_main);
+        void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*nchannels*sizeof(block_q8_1)/QK8_1, &as);
+        const int64_t row_stride_q     = buffers_contiguous ? ne10   : nb11 / sizeof(float);
+        const int64_t channel_stride_q = buffers_contiguous ? ne10*1 : nb12 / sizeof(float);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, nchannels, row_stride_q, channel_stride_q, cudaStream_main);
 
-        const int row_stride_x     = nb01 / ggml_type_size(src0->type);
-        const int channel_stride_x = nb02 / ggml_type_size(src0->type);
+        const int row_stride_x     = buffers_contiguous ? ne00   / ggml_blck_size(src0->type) : nb01 / ggml_type_size(src0->type);
+        const int channel_stride_x = buffers_contiguous ? ne00*1 / ggml_blck_size(src0->type) : nb02 / ggml_type_size(src0->type);
         const int channel_stride_y = padded_row_size / QK8_1;
         switch (src0->type) {
             case GGML_TYPE_Q4_0:
-                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q4_1:
-                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q5_0:
-                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q5_1:
-                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q8_0:
-                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q2_K:
-                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q2_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q3_K:
-                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q4_K:
-                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q5_K:
-                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q5_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             case GGML_TYPE_Q6_K:
-                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, ne02, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
+                mul_mat_vec_q6_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, nchannels, row_stride_x, channel_stride_x, channel_stride_y, cudaStream_main);
                 break;
             default:
                 GGML_ASSERT(false);
@@ -5779,7 +5781,7 @@ inline void ggml_cuda_op_mul_mat_vec(
 
         ggml_cuda_pool_free(src1_q8_1, as);
     } else {
-        GGML_ASSERT(buffers_contiguous || ne02 == 1);
+        GGML_ASSERT(buffers_contiguous);
 
         // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_CUDA_F16
@@ -6548,7 +6550,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }
 
     // no quantized non-contiguous support for lower CC kernels implemented
-    const bool nc_okay = src0->type == GGML_TYPE_F16 || g_compute_capabilities[g_main_device] >= MIN_CC_DP4A;
+    // const bool nc_okay = src0->type == GGML_TYPE_F16 || g_compute_capabilities[g_main_device] >= MIN_CC_DP4A;
+    const bool nc_okay = false;
 
     if (all_on_device && nc_okay && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
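
The core idea of the diff above is the stride selection: when buffers_contiguous is true the kernel treats src0 and src1 as densely packed (strides derived from the logical row length ne00/ne10 and the quant block size) and collapses the channel dimension to 1, otherwise it uses the byte strides nb01/nb02 and nb11/nb12 stored in the tensors. Below is a minimal, self-contained sketch of that selection logic only; it is not the ggml implementation, and names such as fake_dims and pick_strides, along with the layout values in main, are made up for illustration.

// Standalone sketch (assumed, not ggml code): mirrors the stride selection in the hunk above.
#include <cstdint>
#include <cstdio>

struct fake_dims {
    int64_t ne00, ne02;   // logical sizes: elements per row, number of channels
    int64_t nb01, nb02;   // byte strides between rows / between channels
    int64_t type_size;    // bytes per quantized block (e.g. 18 for Q4_0)
    int64_t blck_size;    // elements per quantized block (e.g. 32 for Q4_0)
};

// Contiguous buffers: address through the logical shape, broadcast a single channel.
// Non-contiguous buffers: honor the byte strides recorded in the tensor metadata.
static void pick_strides(const fake_dims & d, bool buffers_contiguous,
                         int64_t & nchannels, int64_t & row_stride_x, int64_t & channel_stride_x) {
    nchannels        = buffers_contiguous ? 1                      : d.ne02;
    row_stride_x     = buffers_contiguous ? d.ne00   / d.blck_size : d.nb01 / d.type_size;
    channel_stride_x = buffers_contiguous ? d.ne00*1 / d.blck_size : d.nb02 / d.type_size;
}

int main() {
    // illustrative Q4_0-like layout: 4096-element rows, 32 channels, 4096 rows per channel
    fake_dims d;
    d.type_size = 18;
    d.blck_size = 32;
    d.ne00      = 4096;
    d.ne02      = 32;
    d.nb01      = d.ne00 / d.blck_size * d.type_size; // densely packed row stride in bytes
    d.nb02      = d.nb01 * 4096;                       // densely packed channel stride in bytes

    const bool modes[2] = {true, false};
    for (int i = 0; i < 2; ++i) {
        int64_t nch, rs, cs;
        pick_strides(d, modes[i], nch, rs, cs);
        printf("contiguous=%d nchannels=%lld row_stride_x=%lld channel_stride_x=%lld\n",
               modes[i] ? 1 : 0, (long long) nch, (long long) rs, (long long) cs);
    }
    return 0;
}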