Performance optimization: 2 byte aligned reads

JohannesGaessler · JohannesGaessler · commit 89d2b3e2db03 · 2023-07-16T11:23:14.000+02:00
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
@@ -60,6 +60,24 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif //GGML_CUDA_DMMV_F16
 
+// Returns the i32-th 4 byte group of x8 as one int, assembled from two 16 bit loads (x16[0] is the low half).
+static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    const uint32_t lo = x16[0]; // becomes bits  0..15 of the result
+    const uint32_t hi = x16[1]; // becomes bits 16..31; widened to unsigned so the shift never touches a signed sign bit
+
+    return (int) (lo | (hi << 16));
+}
+
+static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // one 32 bit load of the i32-th int in x8; assume at least 4 byte alignment
+}
+
+static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // one 32 bit load of the i32-th int in x8; assume at least 4 byte alignment
+}
+
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*to_fp32_cuda_t)(const void * __restrict__ x, float * __restrict__ y, int k, cudaStream_t stream);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
@@ -1315,10 +1333,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
 
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
-    int vi;
-    memcpy(&vi,  &bq4_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+    const int vi  = get_int_from_uint8(bq4_0->qs, iqs);
+    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_0);
 
     return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, bq4_0->d, bq8_1->ds);
 }
@@ -1337,11 +1354,11 @@ static __device__ __forceinline__ void load_tiles_q4_0(
     int8_t * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) {
 
     const int kbx  = k / QI4_0;
-    const int kqsx = sizeof(int) * (k % QI4_0);
+    const int kqsx = k % QI4_0;
 
     const block_q4_0 * bx = ((block_q4_0 *) vx) + i*blocks_per_row + kbx;
 
-    memcpy(&x_ql[i * WARP_SIZE + i + k], &bx->qs[kqsx], sizeof(int));
+    x_ql[i * (WARP_SIZE + 1)     + k]     = get_int_from_uint8(bx->qs, kqsx);
     x_dm[i * (WARP_SIZE / QI4_0) + kbx].x = bx->d;
 }
 
@@ -1388,9 +1405,9 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
 
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
-    const int vi  = *((int *) &bq4_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_1)]);
+    const int vi = get_int_from_uint8_aligned(bq4_1->qs, iqs);
+    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI4_1);
 
     return vec_dot_q4_1_q8_1_impl(vi, ui0, ui1, bq4_1->dm, bq8_1->ds);
 }
@@ -1409,11 +1426,11 @@ static __device__ __forceinline__ void load_tiles_q4_1(
     int8_t * __restrict__ x_sc, const int & i, const int & k, const int & blocks_per_row) {
 
     const int kbx  = k / QI4_1;
-    const int kqsx = sizeof(int) * (k % QI4_1);
+    const int kqsx = k % QI4_1;
 
     const block_q4_1 * bx = ((block_q4_1 *) vx) + i*blocks_per_row + kbx;
 
-    x_ql[i * WARP_SIZE + i + k] = *((int *) &bx->qs[kqsx]);
+    x_ql[i * (WARP_SIZE + 1)     + k]   = get_int_from_uint8_aligned(bx->qs, kqsx);
     x_dm[i * (WARP_SIZE / QI4_1) + kbx] = bx->dm;
 }
 
@@ -1433,18 +1450,18 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl(
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int vi0 = (qs  >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh0 as 5th bits
-    vi0    |= (qh0 <<  4) & 0x00000010; // 1 ->  5
-    vi0    |= (qh0 << 11) & 0x00001000; // 2 -> 13
-    vi0    |= (qh0 << 18) & 0x00100000; // 3 -> 21
-    vi0    |= (qh0 << 25) & 0x10000000; // 4 -> 29
+    vi0    |= (qh0 <<  4) & 0x00000010; // 0 ->  4
+    vi0    |= (qh0 << 11) & 0x00001000; // 1 -> 12
+    vi0    |= (qh0 << 18) & 0x00100000; // 2 -> 20
+    vi0    |= (qh0 << 25) & 0x10000000; // 3 -> 28
     vi0     = __vsub4(vi0,  0x10101010); // subtract 16 from quantized values
     int sumi = __dp4a(vi0, ui0, 0); // SIMD dot product of quantized values
 
     int vi1 = (qs  >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh1 as 5th bits
-    vi1    |= (qh1 <<  4) & 0x00000010; // 1 ->  5
-    vi1    |= (qh1 << 11) & 0x00001000; // 2 -> 13
-    vi1    |= (qh1 << 18) & 0x00100000; // 3 -> 21
-    vi1    |= (qh1 << 25) & 0x10000000; // 4 -> 29
+    vi1    |= (qh1 <<  4) & 0x00000010; // 0 ->  4
+    vi1    |= (qh1 << 11) & 0x00001000; // 1 -> 12
+    vi1    |= (qh1 << 18) & 0x00100000; // 2 -> 20
+    vi1    |= (qh1 << 25) & 0x10000000; // 3 -> 28
     vi1     = __vsub4(vi1,  0x10101010); // subtract 16 from quantized values
     sumi = __dp4a(vi1, ui1, sumi); // SIMD dot product of quantized values
 
@@ -1459,12 +1476,11 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
-    int qs;
-    memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int qs = get_int_from_uint8(bq5_0->qs, iqs);
     const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
     const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
-    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
+    const int ui0 = get_int_from_int8_aligned(bq8_1->qs, iqs);
+    const int ui1 = get_int_from_int8_aligned(bq8_1->qs, iqs + QI5_0);
 
     return vec_dot_q5_0_q8_1_impl(qs, qh0, qh1, ui0, ui1, bq5_0->d, bq8_1->ds);
 }