@@ -93,6 +93,7 @@ static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong

 #define QK5_0 32
 #define QR5_0 2
+#define QI5_0 4
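+// QI5_0: number of threads (vec_dot calls) that cooperate on one block in mul_mat_vec_q, i.e. QK5_0 / (2*sizeof(int))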
 typedef struct {
     half d;        // delta
     uint8_t qh[4]; // 5-th bit of quants
@@ -102,6 +103,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5

 #define QK5_1 32
 #define QR5_1 2
+#define QI5_1 4
 typedef struct {
     half d; // delta
     half m; // min
@@ -112,6 +114,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

 #define QK8_0 32
 #define QR8_0 1
+#define QI8_0 4
 typedef struct {
     half d;           // delta
     int8_t qs[QK8_0]; // quants
@@ -1273,6 +1276,36 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
     return sumi*d;
 }

+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
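+    // each call handles one int of packed nibbles (8 quants): 4 low-nibble quants dotted with ui0
+    // and 4 high-nibble quants (16 positions later) dotted with ui1; qh0/qh1 carry the matching 5th bits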
+    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
+    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
+    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
+    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
+
+    const float d = bq5_1->d * bq8_1->d;
+    const float m = bq5_1->m;
+    const float s = bq8_1->s;
+
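+    // low nibbles plus their 5th bits, scattered into bit 4 of each byte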
+    int vi0 = (qs >>  0) & 0x0F0F0F0F;
+    vi0    |= (qh0 <<  4) & 0x00000010;
+    vi0    |= (qh0 << 11) & 0x00001000;
+    vi0    |= (qh0 << 18) & 0x00100000;
+    vi0    |= (qh0 << 25) & 0x10000000;
+    int sumi = __dp4a(vi0, ui0, 0);
+
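+    // high nibbles plus their 5th bits, reassembled the same way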
+    int vi1 = (qs >>  4) & 0x0F0F0F0F;
+    vi1    |= (qh1 <<  4) & 0x00000010;
+    vi1    |= (qh1 << 11) & 0x00001000;
+    vi1    |= (qh1 << 18) & 0x00100000;
+    vi1    |= (qh1 << 25) & 0x10000000;
+    sumi = __dp4a(vi1, ui1, sumi);
+
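+    // m*s restores the q5_1 offset term; divided by QI5_1 because QI5_1 threads add it per block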
+    return sumi*d + m*s / QI5_1;
+}
+
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
 static __global__ void dequantize_block(const void * vx, float * y, const int k) {
     const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -1294,7 +1327,7 @@ static __global__ void dequantize_block(const void * vx, float * y, const int k)
     y[iybs + iqs + y_offset] = v.y;
 }

-template <int qk, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
+template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;

@@ -1304,7 +1337,6 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d

     const int blocks_per_row = ncols / qk;
     const int blocks_per_warp = WARP_SIZE * sizeof(int)*2 / qk;
-    const int ints_per_block = qk / (2 * sizeof(int));

     // partial sum for each thread
     float tmp = 0.0f;
@@ -1313,11 +1345,11 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
     const block_q8_1 * y = (const block_q8_1 *) vy;

     for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x/ints_per_block; // x block index
+        const int ibx = row*blocks_per_row + i + threadIdx.x/qi; // x block index

-        const int iby = i + threadIdx.x/ints_per_block;
+        const int iby = i + threadIdx.x/qi; // y block index

-        const int iqs = threadIdx.x % ints_per_block;
+        const int iqs = threadIdx.x % qi; // quant int index within the block

         tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
     }
@@ -1812,7 +1844,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
-    mul_mat_vec_q<QK4_0, block_q4_0, vec_dot_q4_0_q8_1>
+    mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, vec_dot_q4_0_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -1821,7 +1853,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
-    mul_mat_vec_q<QK4_0, block_q4_1, vec_dot_q4_1_q8_1>
+    mul_mat_vec_q<QK4_1, QI4_1, block_q4_1, vec_dot_q4_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -1830,7 +1862,16 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
-    mul_mat_vec_q<QK5_0, block_q5_0, vec_dot_q5_0_q8_1>
+    mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, vec_dot_q5_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
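+// launch configuration mirrors the other mul_mat_vec_*_cuda wrappers:
+// one WARP_SIZE x GGML_CUDA_DMMV_Y thread block per GGML_CUDA_DMMV_Y matrix rows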
+static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, vec_dot_q5_1_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -2360,6 +2401,9 @@ inline void ggml_cuda_op_mul_mat_vec_q(
         case GGML_TYPE_Q5_0:
             mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
+            break;
         default:
             GGML_ASSERT(false);
             break;
@@ -2916,7 +2960,8 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
             bool use_mul_mat_vec_q = false;
-            use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0;
+            use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1
+                || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1;
             if (use_mul_mat_vec_q) {
                 ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, false, false);
             } else {