@@ -124,8 +124,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5
 #define QR5_1 2
 #define QI5_1 (QK5_1 / (4 * QR5_1))
 typedef struct {
-    half d;                 // delta
-    half m;                 // min
+    half2 dm;               // dm.x = delta, dm.y = min
     uint8_t qh[4];          // 5-th bit of quants
     uint8_t qs[QK5_1 / 2];  // nibbles / quants
 } block_q5_1;
@@ -447,8 +446,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
 static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].d;
-    const dfloat m = x[ib].m;
+    const dfloat d = x[ib].dm.x;
+    const dfloat m = x[ib].dm.y;
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -1519,42 +1518,53 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat(
         y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx].x, y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }
 
-static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl(
+    const int & qs, const int & qh, const int & ui0, const int & ui1, const half2 & dm5, const half2 & ds8) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    const int qs  = *((int *) &bq5_1->qs[sizeof(int) * (iqs + 0)]);
-    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
-    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
-
-    const float d = __half2float(bq5_1->d) * __half2float(bq8_1->ds.x);
-    const float m = bq5_1->m;
-    const float s = bq8_1->ds.y;
-
-    int vi0 = (qs >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
-    vi0    |= (qh0 <<  4) & 0x00000010; // 1 ->  5
-    vi0    |= (qh0 << 11) & 0x00001000; // 2 -> 13
-    vi0    |= (qh0 << 18) & 0x00100000; // 3 -> 21
-    vi0    |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    int vi0 = (qs >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
+    vi0    |= (qh <<  4) & 0x00000010; // 0 ->  4
+    vi0    |= (qh << 11) & 0x00001000; // 1 -> 12
+    vi0    |= (qh << 18) & 0x00100000; // 2 -> 20
+    vi0    |= (qh << 25) & 0x10000000; // 3 -> 28
     int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
 
-    int vi1 = (qs >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
-    vi1    |= (qh1 <<  4) & 0x00000010; // 1 ->  5
-    vi1    |= (qh1 << 11) & 0x00001000; // 2 -> 13
-    vi1    |= (qh1 << 18) & 0x00100000; // 3 -> 21
-    vi1    |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    int vi1 = (qs >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
+    vi1    |= (qh >> 12) & 0x00000010; // 16 ->  4
+    vi1    |= (qh >>  5) & 0x00001000; // 17 -> 12
+    vi1    |= (qh <<  2) & 0x00100000; // 18 -> 20
+    vi1    |= (qh <<  9) & 0x10000000; // 19 -> 28
     sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
 
-    return sumi*d + m*s / QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+#ifdef GGML_CUDA_DMMV_F16
+    const half2 tmp = __hmul2(dm5, ds8);
+    const float d5d8 = __half2float(tmp.x);
+    const float m5s8 = __half2float(tmp.y);
+#else
+    const float d5d8 = __half2float(dm5.x) * __half2float(ds8.x);
+    const float m5s8 = __half2float(dm5.y) * __half2float(ds8.y);
+#endif // GGML_CUDA_DMMV_F16
+
+    return sumi*d5d8 + m5s8/QI5_1; // scale sum by QI5_1 because there are QI5_1 threads working on this block
+
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    const int qs  = get_int_from_uint8_aligned(bq5_1->qs, iqs);
+    const int qh  = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * iqs);
+    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_1);
+
+    return vec_dot_q5_1_q8_1_impl(qs, qh, ui0, ui1, bq5_1->dm, bq8_1->ds);
+}
+
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
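For reference, a standalone host-side sketch (not part of the commit) of the qh bit packing used by vec_dot_q5_1_q8_1_impl: the same shifts and masks move fifth bits 0-3 and 16-19 of qh into bit positions 4, 12, 20 and 28 of vi0/vi1, so each byte lane holds a full 5-bit quant for __dp4a. The qs/qh test values below are arbitrary illustration data, and the naive reference loop is only for checking the packing.

// sketch.cpp -- compile with any C++ compiler; no CUDA required
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t qs = 0x7C3B1A09u; // low/high nibbles of 8 quants (arbitrary test data)
    const uint32_t qh = 0x000A0005u; // fifth bits; this thread uses bits 0-3 and 16-19

    // Lower nibbles plus fifth bits 0..3 -> bytes of vi0, as in the kernel.
    uint32_t vi0 = (qs >> 0) & 0x0F0F0F0Fu;
    vi0 |= (qh <<  4) & 0x00000010u; // qh bit 0  -> bit  4 (byte 0)
    vi0 |= (qh << 11) & 0x00001000u; // qh bit 1  -> bit 12 (byte 1)
    vi0 |= (qh << 18) & 0x00100000u; // qh bit 2  -> bit 20 (byte 2)
    vi0 |= (qh << 25) & 0x10000000u; // qh bit 3  -> bit 28 (byte 3)

    // Upper nibbles plus fifth bits 16..19 -> bytes of vi1.
    uint32_t vi1 = (qs >> 4) & 0x0F0F0F0Fu;
    vi1 |= (qh >> 12) & 0x00000010u; // qh bit 16 -> bit  4
    vi1 |= (qh >>  5) & 0x00001000u; // qh bit 17 -> bit 12
    vi1 |= (qh <<  2) & 0x00100000u; // qh bit 18 -> bit 20
    vi1 |= (qh <<  9) & 0x10000000u; // qh bit 19 -> bit 28

    // Naive reference: assemble each 5-bit quant byte by byte and compare.
    for (int i = 0; i < 4; ++i) {
        const unsigned lo_ref = ((qs >> (8*i))     & 0x0F) | (((qh >> i)        & 1) << 4);
        const unsigned hi_ref = ((qs >> (8*i + 4)) & 0x0F) | (((qh >> (16 + i)) & 1) << 4);
        printf("byte %d: vi0=%2u (ref %2u)  vi1=%2u (ref %2u)\n", i,
               (unsigned)((vi0 >> (8*i)) & 0xFF), lo_ref,
               (unsigned)((vi1 >> (8*i)) & 0xFF), hi_ref);
    }
    return 0;
}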