@@ -87,8 +87,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0
 #define QR4_1 2
 #define QI4_1 (QK4_1 / (4 * QR4_1))
 typedef struct {
-    half    d;              // delta
-    half    m;              // min
+    half2   dm;             // dm.x = delta, dm.y = min
     uint8_t qs[QK4_1 / 2];  // nibbles / quants
 } block_q4_1;
 static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
@@ -133,6 +132,13 @@ typedef struct {
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
 
 typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int8_t ** x_sc);
+typedef void (*load_tiles_cuda_t)(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int8_t * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_cuda_t)(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int8_t * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
 
 // ================================= k-quants
 
@@ -380,8 +386,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].d;
-    const dfloat m = x[ib].m;
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;
 
     const int vui = x[ib].qs[iqs];
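
For reference, a minimal sketch (not part of the patch) of what a full q4_1 block dequantizes to with the new packed `half2`, assuming the `block_q4_1` layout from the first hunk (`dm.x` = delta, `dm.y` = min) and the `QK4_1` constant already defined in `ggml-cuda.cu`:

```cpp
// Illustrative helper only: dequantize one q4_1 block on the device.
// Each qs byte holds two 4-bit quants; value = quant * delta + min.
static __device__ void dequantize_block_q4_1_sketch(const block_q4_1 * b, float * out /* QK4_1 values */) {
    const float d = __half2float(b->dm.x);  // delta
    const float m = __half2float(b->dm.y);  // min

    for (int j = 0; j < QK4_1/2; ++j) {
        const int vui = b->qs[j];
        out[j]           = (vui & 0xF) * d + m;  // low nibble  -> first half of the block
        out[j + QK4_1/2] = (vui >>  4) * d + m;  // high nibble -> second half of the block
    }
}
```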
@@ -1313,33 +1319,111 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, bq4_0->d, bq8_1->ds);
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int8_t ** x_sc) {
 
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+    __shared__ int   tile_x_qs[(2*WARP_SIZE) * (WARP_SIZE + 1)];
+    __shared__ half2 tile_x_d[(2*WARP_SIZE) * (WARP_SIZE/QI4_0)];
 
-    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+    *x_ql = tile_x_qs;
+    *x_dm = tile_x_d;
+}
 
-    const float d = __half2float(bq4_1->d) * __half2float(bq8_1->ds.x);
-    const float m = bq4_1->m;
-    const float s = bq8_1->ds.y;
+static __device__ __forceinline__ void load_tiles_q4_0(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int8_t * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) {
+
+    const int kbx  = k / QI4_0;
+    const int kqsx = sizeof(int) * (k % QI4_0);
+
+    const block_q4_0 * bx = ((block_q4_0 *) vx) + i*blocks_per_row + kbx;
+
+    memcpy(&x_ql[i * WARP_SIZE + i + k], &bx->qs[kqsx], sizeof(int));
+    x_dm[i * (WARP_SIZE / QI4_0) + kbx].x = bx->d;
+}
 
+static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int8_t * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+
+    return vec_dot_q4_0_q8_1_impl(
+        x_ql[i * WARP_SIZE + i + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)],
+        x_dm[i * (WARP_SIZE/QI4_0) + k/QI4_0].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl(
+    const int & vi, const int & ui0, const int & ui1, const half2 & dm4, const half2 & ds8) {
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const int vi0 = (vi >> 0) & 0x0F0F0F0F;
     const int vi1 = (vi >> 4) & 0x0F0F0F0F;
 
     // SIMD dot product of quantized values
     int sumi = __dp4a(vi0, ui0, 0);
     sumi     = __dp4a(vi1, ui1, sumi);
 
-    return sumi*d + m*s / QI4_1; // scale sum by QI4_1 because there are QI4_1 threads working on this block
+#ifdef GGML_CUDA_DMMV_F16
+    const half2 tmp = __hmul2(dm4, ds8);
+    const float d4d8 = __half2float(tmp.x);
+    const float m4s8 = __half2float(tmp.y);
+#else
+    const float d4d8 = __half2float(dm4.x) * __half2float(ds8.x);
+    const float m4s8 = __half2float(dm4.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_DMMV_F16
+
+    // scale second part of sum by QI8_1/QR4_1 to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / QR4_1);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+
+    return vec_dot_q4_1_q8_1_impl(vi, ui0, ui1, bq4_1->dm, bq8_1->ds);
+}
+
+static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int8_t ** x_sc) {
+
+    __shared__ int   tile_x_qs[(2*WARP_SIZE) * (WARP_SIZE + 1)];
+    __shared__ half2 tile_x_dm[(2*WARP_SIZE) * (WARP_SIZE/QI4_1)];
+
+    *x_ql = tile_x_qs;
+    *x_dm = tile_x_dm;
+}
+
+static __device__ __forceinline__ void load_tiles_q4_1(
+    const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh,
+    int8_t * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) {
+
+    const int kbx  = k / QI4_1;
+    const int kqsx = sizeof(int) * (k % QI4_1);
+
+    const block_q4_1 * bx = ((block_q4_1 *) vx) + i*blocks_per_row + kbx;
+
+    x_ql[i * WARP_SIZE + i + k] = *((int *) &bx->qs[kqsx]);
+    x_dm[i * (WARP_SIZE / QI4_1) + kbx] = bx->dm;
+}
+
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat(
+    const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int8_t * __restrict__ x_sc,
+    const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) {
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+
+    return vec_dot_q4_1_q8_1_impl(
+        x_ql[i * WARP_SIZE + i + k], y_qs[j * (2*WARP_SIZE) + kyqs], y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)],
+        x_dm[i * (WARP_SIZE/QI4_1) + k/QI4_1], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
+}
+
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
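
For context on the scaling comment in `vec_dot_q4_1_q8_1_impl` above: a q4_1 value dequantizes to `d4*q + m4`, and the paired q8_1 block carries `ds8 = (d8, s8)`, where `s8` is the block sum stored at quantization time (equal to `d8 * sum(y_i)` up to rounding). For one block pair the dot product therefore factors as

$$\sum_i (d_4 q_i + m_4)(d_8 y_i) = d_4 d_8 \sum_i q_i y_i + m_4 s_8 = \text{d4d8} \cdot \text{sumi} + \text{m4s8}.$$

A single call to the impl only covers `QR4_1 * sizeof(int) = 8` of the `QK4_1 = 32` quant pairs, so `QI8_1/QR4_1 = 4` calls accumulate results for the same block: the `sumi` term splits across them naturally, while `m4s8` would otherwise be counted four times, hence the division in the return statement.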
@@ -1647,15 +1731,17 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+template <int qk, int qi, typename block_q_t,
+          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __global__ void mul_mat_q(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_dst) {
 
-    const block_q4_0 * x = (const block_q4_0 *) vx;
+    const block_q_t  * x = (const block_q_t  *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;
 
-    const int blocks_per_row = ncols_x / QK4_0;
-    const int blocks_per_warp = WARP_SIZE / QI4_0;
+    const int blocks_per_row = ncols_x / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;
 
     const int & ncols_dst = ncols_y;
 
@@ -1669,20 +1755,23 @@ static __global__ void mul_mat_q(
     const int col_dst_0 = blockIdx.y*WARP_SIZE;
     const int & col_y_0 = col_dst_0;
 
-    __shared__ int   tile_x_qs[2*WARP_SIZE][WARP_SIZE + 1];
-    __shared__ half  tile_x_d[2*WARP_SIZE][WARP_SIZE/QI4_0];
-    __shared__ int   tile_y_qs[WARP_SIZE][2*WARP_SIZE];
-    __shared__ half2 tile_y_ds[WARP_SIZE][2*WARP_SIZE/QI8_1];
+    int    * tile_x_ql = nullptr;
+    half2  * tile_x_dm = nullptr;
+    int    * tile_x_qh = nullptr;
+    int8_t * tile_x_sc = nullptr;
+
+    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    __shared__ int   tile_y_qs[(WARP_SIZE) * (2*WARP_SIZE)];
+    __shared__ half2 tile_y_ds[(WARP_SIZE) * (2*WARP_SIZE/QI8_1)];
+
     float sum[2][4] = {0.0f};
 
     for (int ib0 = 0; ib0 < blocks_per_row; ib0 += blocks_per_warp) {
-        const int ibx  = tid_x / QI4_0;
-        const int iqsx = sizeof(int) * (tid_x % QI4_0);
 
-        for (int j = 0; j < 2*WARP_SIZE; j += 8) {
-            const block_q4_0 * __restrict__ bx = &x[(row_x_0 + j + tid_y)*blocks_per_row + ib0 + ibx];
-            memcpy(&tile_x_qs[j + tid_y][tid_x], &bx->qs[iqsx], sizeof(int));
-            tile_x_d[j + tid_y][ibx] = bx->d;
+        for (int i = 0; i < 2*WARP_SIZE; i += 8) {
+            load_tiles(x + row_x_0*blocks_per_row + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                       i + tid_y, tid_x, blocks_per_row);
         }
 
         const int iby0 = tid_x / QI8_1;
@@ -1692,26 +1781,23 @@ static __global__ void mul_mat_q(
         for (int i = 0; i < WARP_SIZE; i += 8) {
             const block_q8_1 * __restrict__ by0 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby0];
 
-            tile_y_qs[tid_y + i][tid_x] = *((int *) &by0->qs[iqsy]);
-            tile_y_ds[tid_y + i][iby0] = by0->ds;
+            tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x] = *((int *) &by0->qs[iqsy]);
+            tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby0] = by0->ds;
 
             const block_q8_1 * __restrict__ by1 = &y[(col_y_0 + tid_y + i)*blocks_per_row + ib0 + iby1];
 
-            tile_y_qs[tid_y + i][tid_x + WARP_SIZE] = *((int *) &by1->qs[iqsy]);
-            tile_y_ds[tid_y + i][iby1] = by1->ds;
+            tile_y_qs[(tid_y + i) * (2*WARP_SIZE) + tid_x + WARP_SIZE] = *((int *) &by1->qs[iqsy]);
+            tile_y_ds[(tid_y + i) * (2*WARP_SIZE/QI8_1) + iby1] = by1->ds;
         }
 
         __syncthreads();
 
         for (int k = 0; k < WARP_SIZE; ++k) {
-            const int iqsy = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
 
             for (int j = 0; j < WARP_SIZE; j += 8) {
-                sum[0][j/8] += vec_dot_q4_0_q8_1_impl(
-                    tile_x_qs[tid_x][k], tile_y_qs[tid_y + j][iqsy + 0], tile_y_qs[tid_y + j][iqsy + (QI8_1/2)],
-                    tile_x_d[tid_x][k / QI4_0], tile_y_ds[tid_y + j][2 * k / QI8_1]);
-                sum[1][j/8] += vec_dot_q4_0_q8_1_impl(
-                    tile_x_qs[tid_x + WARP_SIZE][k], tile_y_qs[tid_y + j][iqsy + 0], tile_y_qs[tid_y + j][iqsy + (QI8_1/2)],
-                    tile_x_d[tid_x + WARP_SIZE][k / QI4_0], tile_y_ds[tid_y + j][2 * k / QI8_1]);
+                sum[0][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                                       tid_x,             tid_y + j, k);
+                sum[1][j/8] += vec_dot(tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds,
+                                       tid_x + WARP_SIZE, tid_y + j, k);
             }
         }
 
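
One detail of the templated kernel above: the per-type quant tiles are now flat 1D shared-memory arrays, and the x tile keeps the row padding of the old 2D version. A sketch of the indexing, assuming the tile sizes from `allocate_tiles_q4_0`/`allocate_tiles_q4_1` (the helper name here is hypothetical):

```cpp
// Sketch only: the flattened x-tile index used in load_tiles_* and vec_dot_*_mul_mat
// corresponds to the old 2D tile int tile_x_qs[2*WARP_SIZE][WARP_SIZE + 1] accessed as
// tile[i][k]. The extra "+ i" is the same +1 row padding as before, presumably kept so
// consecutive rows do not start in the same shared-memory bank.
static __device__ __forceinline__ int tile_x_index(const int i, const int k) {
    return i * WARP_SIZE + i + k;   // == i*(WARP_SIZE + 1) + k
}
```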
@@ -2425,7 +2511,15 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void * vx, const void * vy, float
     const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_nums(block_num_x, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
-    mul_mat_q<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_dst);
+    mul_mat_q<QK4_0, QI4_0, block_q4_0, allocate_tiles_q4_0, load_tiles_q4_0, vec_dot_q4_0_q8_1_mul_mat><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_dst);
+}
+
+static void ggml_mul_mat_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_dst, cudaStream_t stream){
+    const int block_num_x = (nrows_x + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
+    const int block_num_y = (ncols_y + WARP_SIZE - 1) / WARP_SIZE;
+    const dim3 block_nums(block_num_x, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, WARP_SIZE/4, 1);
+    mul_mat_q<QK4_1, QI4_1, block_q4_1, allocate_tiles_q4_1, load_tiles_q4_1, vec_dot_q4_1_q8_1_mul_mat><<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_dst);
 }
 
 static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) {
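
A worked example of the launch math in the two wrappers above (the sizes here are hypothetical, only to illustrate the arithmetic): each block computes a `(2*WARP_SIZE) x WARP_SIZE` tile of `dst` using `WARP_SIZE x WARP_SIZE/4` threads.

```cpp
// Hypothetical sizes, chosen only to show the grid/block arithmetic used above.
constexpr int kWarpSize  = 32;                                            // WARP_SIZE in ggml-cuda.cu
constexpr int kNRowsX    = 4096;                                          // example nrows_x
constexpr int kNColsY    = 512;                                           // example ncols_y
constexpr int kBlockNumX = (kNRowsX + 2*kWarpSize - 1) / (2*kWarpSize);   // 64 blocks over the rows
constexpr int kBlockNumY = (kNColsY + kWarpSize - 1) / kWarpSize;         // 16 blocks over the columns
static_assert(kBlockNumX == 64 && kBlockNumY == 16,
              "64 x 16 blocks, each dim3(32, 8, 1) = 256 threads, each computing a 64 x 32 tile of dst");
```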
@@ -2890,6 +2984,9 @@ inline void ggml_cuda_op_mul_mat_q(
         case GGML_TYPE_Q4_0:
             ggml_mul_mat_q4_0_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, nrows_dst, cudaStream_main);
             break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, i01_diff, ne11, nrows_dst, cudaStream_main);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -3639,7 +3736,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
             ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec, false, false);
         } else {
-            if (src0->type == GGML_TYPE_Q4_0) {
+            // if (src0->type == GGML_TYPE_Q4_0) {
+            if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) {
                 ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_q, false, false);
             } else {
                 ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, true, false);