// Partial dot product between 8 q5_0 quantized values and 8 q8_1 values,
// using the dp4a integer SIMD intrinsic. Requires __CUDA_ARCH__ >= MIN_CC_DP4A.
//
// qs:   packed 4-bit low quant bits; low nibbles feed vi0, high nibbles vi1
// qh:   the 5th (high) quant bits, pre-shifted by the caller so that bits
//       0..3 belong to vi0 and bits 16..19 to vi1
// ui0:  4 packed int8 values from the q8_1 block (first half)
// ui1:  4 packed int8 values from the q8_1 block (second half)
// d5:   q5_0 block scale
// ds8:  q8_1 block {d8, s8} pair; ds8.y is used to fold in the constant -16
//       offset of q5_0 values without a per-value __vsub4
//       (NOTE(review): presumably ds8.y == d8 * sum(q8) — confirm against the
//       q8_1 quantization code)
static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
    const int & qs, const int & qh, const int & ui0, const int & ui1, const half & d5, const half2 & ds8) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    int vi0 = (qs >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
    vi0    |= (qh <<  4) & 0x00000010; // 0 ->  4
    vi0    |= (qh << 11) & 0x00001000; // 1 -> 12
    vi0    |= (qh << 18) & 0x00100000; // 2 -> 20
    vi0    |= (qh << 25) & 0x10000000; // 3 -> 28

    int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values

    int vi1 = (qs >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
    vi1    |= (qh >> 12) & 0x00000010; // 16 ->  4
    vi1    |= (qh >>  5) & 0x00001000; // 17 -> 12
    vi1    |= (qh <<  2) & 0x00100000; // 18 -> 20
    vi1    |= (qh <<  9) & 0x10000000; // 19 -> 28

    sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values

    // d5 * (sumi*d8 - 16*sum(u)*d8): the -16 offset of the q5_0 values is
    // applied via ds8.y, split evenly over the QI5_0 calls that together
    // consume one q8_1 block (16/QI5_0 per call).
    return __half2float(d5) * (sumi*__half2float(ds8.x) - (16/QI5_0) * __half2float(ds8.y));
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
// Dot product of one thread's slice of a q5_0 block with the matching q8_1
// block, delegating the integer math to vec_dot_q5_0_q8_1_impl.
// NOTE(review): the parameter list is reconstructed from the hunk header and
// from how vbq/bq8_1/iqs are used in the body — confirm against the original
// declaration.
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
    const void * vbq, const block_q8_1 * bq8_1, const int & iqs) {

    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;

    const int qs  = get_int_from_uint8(bq5_0->qs, iqs);
    // All high (5th) bits are read as one 32-bit word; shifting by 4*iqs puts
    // the bits for this slice where vec_dot_q5_0_q8_1_impl expects them
    // (bits 0..3 for the low nibbles, 16..19 for the high nibbles).
    const int qh  = get_int_from_uint8(bq5_0->qh, 0) >> (4 * iqs);

    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_0);

    return vec_dot_q5_0_q8_1_impl(qs, qh, ui0, ui1, bq5_0->d, bq8_1->ds);
}
static __device__ __forceinline__ float vec_dot_q5_1_q8_1 (
0 commit comments