@@ -5522,6 +5522,7 @@ inline void ggml_cuda_op_rms_norm(
     (void) i1;
 }
 
+template <bool buffers_contiguous>
 inline void ggml_cuda_op_mul_mat_q(
     const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, char * src0_ddq_i,
     float * src0_ddf_i, float * src1_ddf_i, float * dst_ddf_i, int64_t i02, int64_t i01_low, int64_t i01_high, int i1,
@@ -5534,15 +5535,18 @@ inline void ggml_cuda_op_mul_mat_q(
     const int64_t ne00 = src0->ne[0];
     const int64_t ne02 = src0->ne[2];
 
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    const size_t nb01 = src0->nb[1];
-    const size_t nb02 = src0->nb[2];
-
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
 
     const int64_t ne0 = dst->ne[0];
 
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    const size_t nb01 = src0->nb[1];
+    const size_t nb02 = src0->nb[2];
+
+    const size_t nb11 = src1->nb[1];
+    const size_t nb12 = src1->nb[2];
+
     const int64_t i01_diff = i01_high - i01_low;
 
     int id;
@@ -5552,19 +5556,19 @@ inline void ggml_cuda_op_mul_mat_q(
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
     const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : i01_diff;
 
-    const int nchannels = src0->backend == GGML_BACKEND_GPU && src1->backend == GGML_BACKEND_GPU &&
-        dst->backend == GGML_BACKEND_GPU && ggml_is_contiguous(src1) ? ne02 : 1;
+    const int nchannels = buffers_contiguous ? 1 : ne02;
 
     const int64_t padded_row_size = ne10 % MATRIX_ROW_PADDING == 0 ?
         ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
     size_t as;
     void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne11*nchannels*sizeof(block_q8_1)/QK8_1, &as);
-    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, nchannels, ne10, ne10*ne11, cudaStream_main);
+    const int64_t src1_row_stride = buffers_contiguous ? ne10 : nb11 / sizeof(float);
+    const int64_t src1_channel_stride = buffers_contiguous ? ne10*ne11 : nb12 / sizeof(float);
+    quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, ne11, padded_row_size, nchannels,
+                           src1_row_stride, src1_channel_stride, cudaStream_main);
 
-    // const int row_stride = nb01 / ggml_type_size(src0->type);
-    const int row_stride = src0->backend == GGML_BACKEND_GPU && src1->backend == GGML_BACKEND_GPU &&
-        dst->backend == GGML_BACKEND_GPU && ggml_is_contiguous(src1) ? nb01 / ggml_type_size(src0->type) : ne10 / ggml_blck_size(src0->type);
-    const int channel_stride_x = nb02 / ggml_type_size(src0->type);
+    const int row_stride = buffers_contiguous ? ne10 / ggml_blck_size(src0->type) : nb01 / ggml_type_size(src0->type);
+    const int channel_stride_x = buffers_contiguous ? ne10*ne11 / ggml_blck_size(src0->type) : nb02 / ggml_type_size(src0->type);
     const int channel_stride_y = padded_row_size*ne11 / QK8_1;
 
     switch (src0->type) {
@@ -5681,6 +5685,9 @@ inline void ggml_cuda_op_mul_mat_vec(
     const int64_t nb01 = src0->nb[1];
     const int64_t nb02 = src0->nb[2];
 
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
     const int64_t nrows = i01_high - i01_low;
 
 #ifdef GGML_CUDA_FORCE_DMMV
@@ -5713,7 +5720,9 @@ inline void ggml_cuda_op_mul_mat_vec(
             ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING;
         size_t as;
         void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*ne02*sizeof(block_q8_1)/QK8_1, &as);
-        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, ne02, ne10, ne10*1, cudaStream_main);
+        const int64_t row_stride = src1->backend == GGML_BACKEND_CPU ? ne10 : nb11 / sizeof(float);
+        const int64_t channel_stride = src1->backend == GGML_BACKEND_CPU ? ne10*1 : nb12 / sizeof(float);
+        quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne10, 1, padded_row_size, ne02, row_stride, channel_stride, cudaStream_main);
 
         const int row_delta = nb01 / ggml_type_size(src0->type);
         const int channel_delta = nb02 / ggml_type_size(src0->type);
@@ -6433,7 +6442,7 @@ void ggml_cuda_mul_mat_nc(const ggml_tensor * src0, const ggml_tensor * src1, gg
     struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
 
-    ggml_cuda_op_mul_mat_q(src0, src1, dst, src0_ddq, nullptr, src1_ddf, dst_ddf, 0, 0, ne01, 0, cudaStream_main);
+    ggml_cuda_op_mul_mat_q<false>(src0, src1, dst, src0_ddq, nullptr, src1_ddf, dst_ddf, 0, 0, ne01, 0, cudaStream_main);
     CUDA_CHECK(cudaGetLastError());
 }
 
@@ -6534,10 +6543,10 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     }
 
     if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) {
-        if (all_on_device && src0->backend != GGML_BACKEND_GPU_SPLIT && ggml_is_contiguous(src1)) {
+        if (all_on_device && src0->backend != GGML_BACKEND_GPU_SPLIT) {
             ggml_cuda_mul_mat_nc(src0, src1, dst);
         } else {
-            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
+            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q<true>, false, false);
         }
     } else {
         ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);
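
Note on the stride arguments introduced in this diff: quantize_row_q8_1_cuda takes its src1 strides in float elements, while ggml stores nb[1]/nb[2] as byte strides, hence the nb11 / sizeof(float) and nb12 / sizeof(float) conversions on the non-contiguous path. A minimal standalone sketch of that byte-to-element conversion follows; the sizes and the main() wrapper are hypothetical and not part of the diff.

// Sketch only: element strides derived from ggml-style byte strides for an
// f32 src1 tensor; the concrete numbers here are made up for illustration.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne10 = 4096; // row length in elements (hypothetical)
    const int64_t ne11 = 32;   // rows per channel (hypothetical)

    // For a contiguous f32 tensor ggml sets nb[1] = ne[0]*sizeof(float) and
    // nb[2] = ne[1]*nb[1]; a non-contiguous view may carry larger byte strides.
    const int64_t nb11 = ne10 * sizeof(float);
    const int64_t nb12 = ne11 * nb11;

    // The quantization kernel wants element strides, hence the division:
    const int64_t row_stride     = nb11 / sizeof(float); // == ne10 when contiguous
    const int64_t channel_stride = nb12 / sizeof(float); // == ne10*ne11 when contiguous

    printf("row_stride=%lld channel_stride=%lld\n",
           (long long) row_stride, (long long) channel_stride);
    return 0;
}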