@@ -5522,6 +5522,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
+template <bool buffers_contiguous>
 inline void ggml_cuda_op_mul_mat_q(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -5534,15 +5535,18 @@ inline void ggml_cuda_op_mul_mat_q(
     const int64_t ne00 = src0->ne[0];
     const int64_t ne02 = src0->ne[2];
 
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
     const int64_t ne0 = dst->ne[0];
 
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+
+    const size_t nb11 = src1->nb[1];
+    const size_t nb12 = src1->nb[2];
+
     const int64_t i01_diff = i01_high - i01_low;
 
     int id;
@@ -5552,19 +5556,19 @@ inline void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
     const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
 
-    const int nchannels = src0->backend == GGML_BACKEND_GPU && src1->backend == GGML_BACKEND_GPU &&
-        dst->backend == GGML_BACKEND_GPU && ggml_is_contiguous(src1) ? ne02 : 1;
+    const int nchannels = buffers_contiguous ? 1 : ne02;
 
     const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
         ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
     size_t as;
     void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*nchannels*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, nchannels, ne10, ne10*ne11, cudaStream_main);
+    const int64_t src1_row_stride = buffers_contiguous ? ne10 : nb11 / sizeof(float);
+    const int64_t src1_channel_stride = buffers_contiguous ? ne10*ne11 : nb12 / sizeof(float);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, nchannels,
+                           src1_row_stride, src1_channel_stride, cudaStream_main);
 
-    // const int row_stride = nb01 / ggml_type_size(src0->type);
-    const int row_stride = src0->backend == GGML_BACKEND_GPU && src1->backend == GGML_BACKEND_GPU &&
-        dst->backend == GGML_BACKEND_GPU && ggml_is_contiguous(src1) ? nb01 / ggml_type_size(src0->type) : ne10 / ggml_blck_size(src0->type);
-    const int channel_stride_x = nb02 / ggml_type_size(src0->type);
+    const int row_stride = buffers_contiguous ? ne10 / ggml_blck_size(src0->type) : nb01 / ggml_type_size(src0->type);
+    const int channel_stride_x = buffers_contiguous ? ne10*ne11 / ggml_blck_size(src0->type) : nb02 / ggml_type_size(src0->type);
     const int channel_stride_y = padded_row_size*ne11 / QK8_1;
 
     switch (src0->type) {
@@ -5681,6 +5685,9 @@ inline void ggml_cuda_op_mul_mat_vec(
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
     const int64_t nrows = i01_high - i01_low;
 
 #ifdef GGML_CUDA_FORCE_DMMV
@@ -5713,7 +5720,9 @@ inline void ggml_cuda_op_mul_mat_vec(
             ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne02*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, ne02, ne10, ne10*1, cudaStream_main);
+        const int64_t row_stride = src1->backend == GGML_BACKEND_CPU ? ne10 : nb11 / sizeof(float);
+        const int64_t channel_stride = src1->backend == GGML_BACKEND_CPU ? ne10*1 : nb12 / sizeof(float);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, ne02, row_stride, channel_stride, cudaStream_main);
 
         const int row_delta = nb01 / ggml_type_size(src0->type);
         const int channel_delta = nb02 / ggml_type_size(src0->type);
@@ -6433,7 +6442,7 @@ void ggml_cuda_mul_mat_nc(const ggml_tensor * src0, const ggml_tensor * src1, gg
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_cuda_op_mul_mat_q(src0, src1, dst, src0_ddq, nullptr, src1_ddf, dst_ddf, 0, 0, ne01, 0, cudaStream_main);
+    ggml_cuda_op_mul_mat_q<false>(src0, src1, dst, src0_ddq, nullptr, src1_ddf, dst_ddf, 0, 0, ne01, 0, cudaStream_main);
     CUDA_CHECK(cudaGetLastError());
 }
 
@@ -6534,10 +6543,10 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }
 
     if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-        if (all_on_device && src0->backend != GGML_BACKEND_GPU_SPLIT && ggml_is_contiguous(src1)) {
+        if (all_on_device && src0->backend != GGML_BACKEND_GPU_SPLIT) {
             ggml_cuda_mul_mat_nc(src0, src1, dst);
         } else {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q<true>, false, false);
         }
     } else {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
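
Note on the stride arguments introduced in this diff: quantize_row_q8_1_cuda takes its src1 strides in float elements, while ggml stores nb[1]/nb[2] as byte strides, hence the nb11 / sizeof(float) and nb12 / sizeof(float) conversions on the non-contiguous path. A minimal standalone sketch of that byte-to-element conversion follows; the sizes and the main() wrapper are hypothetical and not part of the diff.

// Sketch only: element strides derived from ggml-style byte strides for an
// f32 src1 tensor; the concrete numbers here are made up for illustration.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne10 = 4096; // row length in elements (hypothetical)
    const int64_t ne11 = 32;   // rows per channel (hypothetical)

    // For a contiguous f32 tensor ggml sets nb[1] = ne[0]*sizeof(float) and
    // nb[2] = ne[1]*nb[1]; a non-contiguous view may carry larger byte strides.
    const int64_t nb11 = ne10 * sizeof(float);
    const int64_t nb12 = ne11 * nb11;

    // The quantization kernel wants element strides, hence the division:
    const int64_t row_stride     = nb11 / sizeof(float); // == ne10 when contiguous
    const int64_t channel_stride = nb12 / sizeof(float); // == ne10*ne11 when contiguous

    printf("row_stride=%lld channel_stride=%lld\n",
           (long long) row_stride, (long long) channel_stride);
    return 0;
}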