@@ -486,7 +486,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
     GGML_UNUSED(x_qh); GGML_UNUSED(x_sc);
 
-    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y];
+    __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y*4];
     __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0];
 
     *x_ql = tile_x_qs;
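Context for the wider padding (an inference from the rest of the diff, not stated in it): the q8_0 x tile is now stored with a row stride of WARP_SIZE + 4 ints instead of WARP_SIZE + 1, which should keep the 4-consecutive-int fragment loads in the new mma path further down free of shared-memory bank conflicts. A minimal sketch of the resulting indexing, with tile_x_index as a hypothetical helper name:

    // Hypothetical helper (not part of the diff): one x-tile row now spans WARP_SIZE + 4 ints.
    // With a 36-int stride, neighbouring rows start 4 banks apart, so a warp in which groups of
    // 4 threads read 4 consecutive ints from 8 different rows touches all 32 banks exactly once.
    static __device__ __forceinline__ int tile_x_index(const int row, const int k) {
        return row*(WARP_SIZE + 4) + k;
    }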
@@ -519,7 +519,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
 
         const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
 
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
+        x_ql[i * (WARP_SIZE + 4) + k] = get_int_from_int8(bxi->qs, kqsx);
     }
 
     const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
@@ -547,9 +547,20 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat(
     const float * x_dmf = (const float *) x_dm;
     const float * y_df = (const float *) y_ds;
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
+    const int * v = &x_ql[i * (WARP_SIZE + 1) + k];
+    const int * u = &y_qs[j * WARP_SIZE + k];
+    const float d8_0 = x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0];
+    const float d8_1 = y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1];
+
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMQ; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+    return d8_0*d8_1 * sumi;
 }
 
 template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
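For reference, __dp4a treats each 32-bit operand as four signed 8-bit lanes, multiplies them lane-wise and adds the products to the accumulator, so the unrolled loop above accumulates VDR_Q8_0_Q8_1_MMQ*4 int8 products before the two block scales are applied. A scalar sketch of the same operation (dp4a_ref is a hypothetical name, not part of the diff):

    // Scalar equivalent of __dp4a(a, b, c) for signed 8-bit lanes (illustration only).
    static __device__ __forceinline__ int dp4a_ref(const int a, const int b, int c) {
        const int8_t * a8 = (const int8_t *) &a;
        const int8_t * b8 = (const int8_t *) &b;
        for (int l = 0; l < 4; ++l) {
            c += a8[l]*b8[l]; // multiply-accumulate one byte lane
        }
        return c;
    }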
@@ -1066,6 +1077,167 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat(
     return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
 }
 
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
+          allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
+static __device__ __forceinline__ void mul_mat_q_test(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+    const block_q_t * x = (const block_q_t *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    const int blocks_per_row_x = ncols_x / qk;
+    const int blocks_per_col_y = nrows_y / QK8_1;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    const int & ncols_dst = ncols_y;
+
+    const int row_dst_0 = blockIdx.x*mmq_y;
+    const int & row_x_0 = row_dst_0;
+
+    const int col_dst_0 = blockIdx.y*mmq_x;
+    const int & col_y_0 = col_dst_0;
+
+    int * tile_x_ql = nullptr;
+    half2 * tile_x_dm = nullptr;
+    int * tile_x_qh = nullptr;
+    int * tile_x_sc = nullptr;
+
+    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    __shared__ int tile_y_qs[mmq_x * WARP_SIZE + mmq_x*4];
+    __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1];
+
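+    // Accumulators for the mma path: each warp computes 16x8 output tiles, and every thread keeps
+    // 4 int32 values per tile (a 2x2 sub-block of the tile), hence the trailing [4]. mmq_x must be
+    // divisible by 8*nwarps because each of the nwarps warps covers 8 output columns per iteration.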
+    static_assert(mmq_x % (8*nwarps) == 0, "assert");
+    float sum[mmq_x/(8*nwarps)][mmq_y/16][4] = {{{0.0f}}};
+
+    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+        load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                   threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x);
+
+#pragma unroll
+        for (int ir = 0; ir < qr; ++ir) {
+            const int kqs = ir*WARP_SIZE + threadIdx.x;
+            const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses
+
+                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+                const int index_y = (threadIdx.y + i) * (WARP_SIZE + 4) + kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1);
+            }
+
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x;
+                const int kby = threadIdx.x % (WARP_SIZE/QI8_1);
+                const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds;
+                half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = __low2float(*dsi_src);
+                }
+            }
+
+            __syncthreads();
+
+            const float * x_dmf = (const float *) tile_x_dm;
+            const float * y_df = (const float *) tile_y_ds;
+
+            static_assert(!need_sum);
+            static_assert(vdr == 32/sizeof(int));
+#pragma unroll
+            for (int k0 = ir*WARP_SIZE/qr; k0 < (ir+1)*WARP_SIZE/qr; k0 += vdr) {
+#pragma unroll
+                for (int j00 = 0; j00 < mmq_x; j00 += 8*nwarps) {
+                    const int j0 = j00 + 8*threadIdx.y;
+#pragma unroll
+                    for (int i0 = 0; i0 < mmq_y; i0 += 16) {
+                        int v[4];
+#pragma unroll
+                        for (int l = 0; l < 4; ++l) {
+                            v[l] = tile_x_ql[(i0 + (l%2)*8 + threadIdx.x/4) * (WARP_SIZE + 4) + k0 + (l/2)*4 + threadIdx.x%4];
+                        }
+                        int u[2];
+#pragma unroll
+                        for (int l = 0; l < 2; ++l) {
+                            u[l] = tile_y_qs[(j0 + threadIdx.x/4) * (WARP_SIZE + 4) + k0 + 4*l + threadIdx.x%4];
+                        }
+
+                        int sumi[4] = {0};
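+                        // Warp-wide, v covers a 16 (rows of x) x 32 (k) int8 operand and u an
+                        // 8 (columns of y) x 32 (k) int8 operand; sumi accumulates this thread's
+                        // 2x2 piece of the 16x8 int32 result. Ampere can process k=32 with one mma
+                        // instruction, pre-Ampere tensor-core archs (e.g. Turing) need two k=16 steps.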
+#if __CUDA_ARCH__ >= CC_AMPERE
+                        asm("mma.sync.aligned.m16n8k32.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5, %6, %7}, {%8, %9}, {%0, %1, %2, %3};"
+                            : "+r"(sumi[0]), "+r"(sumi[1]), "+r"(sumi[2]), "+r"(sumi[3])
+                            : "r"(v[0]), "r"(v[1]), "r"(v[2]), "r"(v[3]), "r"(u[0]), "r"(u[1]));
+#else
+                        asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
+                            : "+r"(sumi[0]), "+r"(sumi[1]), "+r"(sumi[2]), "+r"(sumi[3])
+                            : "r"(v[0]), "r"(v[1]), "r"(u[0]));
+                        asm("mma.sync.aligned.m16n8k16.row.col.s32.s8.s8.s32 {%0, %1, %2, %3}, {%4, %5}, {%6}, {%0, %1, %2, %3};"
+                            : "+r"(sumi[0]), "+r"(sumi[1]), "+r"(sumi[2]), "+r"(sumi[3])
+                            : "r"(v[2]), "r"(v[3]), "r"(u[1]));
+#endif // __CUDA_ARCH__ >= CC_AMPERE
+
+                        float d8_0[2];
+#pragma unroll
+                        for (int l = 0; l < 2; ++l) {
+                            const int i = i0 + 8*l + threadIdx.x/4;
+                            d8_0[l] = x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k0/QI8_0];
+                        }
+                        float d8_1[2];
+#pragma unroll
+                        for (int l = 0; l < 2; ++l) {
+                            const int j = j0 + 2*(threadIdx.x%4) + l;
+                            d8_1[l] = y_df[j * (WARP_SIZE/QI8_1) + k0/QI8_1];
+                        }
+
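+                        // Dequantize: each int32 partial result is scaled by the q8_0 block scale of
+                        // its row (d8_0) and the q8_1 block scale of its column (d8_1).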
+#pragma unroll
+                        for (int l = 0; l < 4; ++l) {
+                            sum[j00/(8*nwarps)][i0/16][l] += d8_0[l/2]*d8_1[l%2] * sumi[l];
+                        }
+                    }
+                }
+            }
+
+            __syncthreads();
+        }
+    }
+
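+    // Write the accumulators back, undoing the mma fragment layout: within each 16x8 tile this thread
+    // owns rows i00 + threadIdx.x/4 + 8*(l/2) and columns j00 + 8*threadIdx.y + 2*(threadIdx.x%4) + l%2;
+    // out-of-range rows and columns are skipped.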
+#pragma unroll
+    for (int j00 = 0; j00 < mmq_x; j00 += 8*nwarps) {
+        const int j0 = j00 + 8*threadIdx.y + 2*(threadIdx.x%4);
+#pragma unroll
+        for (int i00 = 0; i00 < mmq_y; i00 += 16) {
+            const int i0 = i00 + threadIdx.x/4;
+
+#pragma unroll
+            for (int l = 0; l < 4; ++l) {
+                const int i = i0 + 8*(l/2);
+                const int j = j0 + (l%2);
+
+                const int row_dst = row_dst_0 + i;
+                const int col_dst = col_dst_0 + j;
+                if (row_dst >= nrows_dst) {
+                    continue;
+                }
+                if (col_dst >= ncols_dst) {
+                    continue;
+                }
+                dst[col_dst*nrows_dst + row_dst] = sum[j00/(8*nwarps)][i00/16][l];
+            }
+        }
+    }
+}
+
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x, int mmq_y, int nwarps,
           allocate_tiles_cuda_t allocate_tiles, load_tiles_cuda_t load_tiles, int vdr, vec_dot_q_mul_mat_cuda_t vec_dot>
 static __device__ __forceinline__ void mul_mat_q(
@@ -1304,7 +1476,7 @@ template <bool need_check> static __global__ void
 #if __CUDA_ARCH__ >= MIN_CC_DP4A
     constexpr mmq_arch_config_t arch_config = get_arch_config_device(MMQ_CONFIG_Q8_0);
 
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
+    mul_mat_q_test<QK8_0, QR8_0, QI8_0, false, block_q8_0, arch_config.x, arch_config.y, arch_config.nwarps, allocate_tiles_q8_0<arch_config.y>,
         load_tiles_q8_0<arch_config.y, arch_config.nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else