@@ -60,6 +60,16 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif // GGML_CUDA_DMMV_F16
 
+static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
+    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
     const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
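
For context: the second hunk below also calls get_int_from_int8_aligned, which is not shown in this diff. Presumably it is the 4-byte-aligned counterpart that can load the int in a single access instead of assembling it from two 16-bit halves; a minimal sketch, assuming that name and behavior:

    // Hypothetical aligned counterpart (not part of this diff): with at least
    // 4 byte alignment the 32-bit value can be read with one aligned load.
    static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
        return *((int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
    }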
@@ -1602,27 +1612,30 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
         y_qs[j * (2*WARP_SIZE) + kyqs + (QI8_1/2)], x_dm[index_bx], y_ds[j * (2*WARP_SIZE/QI8_1) + 2*k/QI8_1]);
 }
 
-static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
+    const int & vi, const int & ui, const half & d8_0, const half2 & ds8_1) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int vi;
-    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
-    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-
-    const float d = __half2float(bq8_0->d) * __half2float(bq8_1->ds.x);
-
     // SIMD dot product of quantized values
-    int sumi = __dp4a(vi, ui, 0);
+    const int sumi = __dp4a(vi, ui, 0);
 
-    return sumi*d;
+    return sumi * __half2float(d8_0) * __half2float(ds8_1.x);
 #else
     return 0.0f; // only to satisfy the compiler
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    const int vi = get_int_from_int8(bq8_0->qs, iqs);
+    const int ui = get_int_from_int8_aligned(bq8_1->qs, iqs);
+
+    return vec_dot_q8_0_q8_1_impl(vi, ui, bq8_0->d, bq8_1->ds);
+}
+
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
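
On devices with __CUDA_ARCH__ >= MIN_CC_DP4A, __dp4a(vi, ui, 0) treats each int as four packed signed 8-bit lanes and returns their dot product; the refactored impl then scales that sum by the two block scales (d8_0 from the q8_0 block, ds8_1.x from the q8_1 block). A host-side scalar reference for checking the lane math, assuming signed 8-bit lanes (a sketch, not code from this PR):

    #include <stdint.h>

    // Scalar equivalent of __dp4a(a, b, c): the sum of the four per-byte
    // products, each byte interpreted as signed 8-bit, plus the accumulator c.
    static int dp4a_ref(int a, int b, int c) {
        for (int lane = 0; lane < 4; ++lane) {
            const int8_t av = (int8_t) ((a >> 8*lane) & 0xff);
            const int8_t bv = (int8_t) ((b >> 8*lane) & 0xff);
            c += av * bv;
        }
        return c;
    }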