
Commit 2f9cf97

ikawrakow and Kawrakow authored
Some more Q4_K and Q5_K speedup on CUDA (#2346)
* Faster Q5_K on CUDA

* Small Q5_K improvement on older GPUs

* Sped up Q4_K on CUDA

  GTX1660: 29.5 ms/t -> 25.6 ms/t
  RTX4080: 8.40 ms/t -> 8.25 ms/t

* Sped up Q4_K on CUDA

  GTX1660: 36.7 ms/t -> 35.6 ms/t
  RTX4080: 9.8 ms/t -> 9.5 ms/t

* Address PR comments

* Add some comments to satisfy PR reviewer

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 4f06592 commit 2f9cf97

File tree

1 file changed

ggml-cuda.cu

Lines changed: 84 additions & 30 deletions
@@ -1073,10 +1073,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;

+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
     for (int i = ix; i < num_blocks_per_row; i += 2) {

         const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * ql2 = ql1 + 64;
         const uint8_t * qh = x[i].qh + l0;
         const float * y1 = yy + i*QK_K + y_offset;
         const float * y2 = y1 + 128;
@@ -1092,15 +1094,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,

         float4 sum = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
         for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+            sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
             smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
         }
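Note on the hunk above: the gain in dequantize_mul_mat_vec_q5_k comes from hoisting the nibble extraction out of the inner loop. One 16-bit AND with 0x0f0f (or a 4-bit shift followed by the same mask) peels the low or high nibbles off two quant bytes at once, and the results are staged in q16 so the loop body reads plain bytes through q4 (which is also why the ql2 pointer becomes unnecessary). A minimal host-side sketch, not part of the commit, checking that the repacking matches the old per-byte extraction; it assumes little-endian byte order, as the kernel itself does:

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
    alignas(uint16_t) const uint8_t ql1[2] = {0x5A, 0xC3}; // two packed 4-bit quant pairs

    // Old path: mask/shift each byte inside the hot loop.
    const uint8_t lo0 = ql1[0] & 0xF, lo1 = ql1[1] & 0xF;
    const uint8_t hi0 = ql1[0] >> 4,  hi1 = ql1[1] >> 4;

    // New path: one 16-bit op extracts two nibbles at once, done before the loop.
    const uint16_t * q1 = (const uint16_t *)ql1;
    uint16_t q16[2];
    q16[0] = q1[0] & 0x0f0f;        // low nibbles of bytes 0 and 1
    q16[1] = (q1[0] >> 4) & 0x0f0f; // high nibbles of bytes 0 and 1
    const uint8_t * q4 = (const uint8_t *)q16;

    assert(q4[0] == lo0 && q4[1] == lo1);
    assert(q4[2] == hi0 && q4[3] == hi1);
    printf("repacking matches per-byte extraction\n");
    return 0;
}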
@@ -1554,15 +1566,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;

-    const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6
+    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));

     float sumf_d = 0.0f;
     float sumf_m = 0.0f;

     const float d = bq4_K->d;
     const float dmin = bq4_K->dmin;

-    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int v1 = q4[0];
+    const int v2 = q4[4];

     const uint16_t * scales = (const uint16_t *)bq4_K->scales;
     uint16_t aux[2];
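The new comment block is just the closed form of the pointer arithmetic that follows it. A quick host-side sketch, not part of the commit, that reproduces the q4_offset table (QR4_K = 2 and QI8_1 = 8 are the values these constants have in ggml's k-quant definitions):

#include <cstdio>

int main() {
    const int QR4_K = 2, QI8_1 = 8;
    for (int iqs = 0; iqs < 16; ++iqs) {
        const int bq8_offset = QR4_K * (iqs / (QI8_1/2)); // 0, 2, 4, 6
        const int q4_offset  = 16 * bq8_offset + 4 * (iqs % 4);
        printf("iqs=%2d -> bq8_offset=%d, q4_offset=%3d\n", iqs, bq8_offset, q4_offset);
    }
    return 0; // prints 0, 4, 8, 12, 32, 36, 40, 44, 64, ..., 96, 100, 104, 108
}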
@@ -1580,13 +1600,19 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     for (int i = 0; i < QR4_K; ++i) {

         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];

-        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+        const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+        const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;

-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }

     return d*sumf_d - dmin*sumf_m;
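The restructured loop leans on the fact that __dp4a(a, b, c) is a fused 4-way signed-byte dot product with accumulate, so nesting two calls yields an 8-way dot product under a single scale/min lookup. A host-side emulation, not part of the commit, that spells out the semantics (the input constants are arbitrary example values):

#include <cstdint>
#include <cstdio>

// Emulates CUDA's __dp4a: pairwise products of the four signed bytes of a and
// b, accumulated into c.
static int dp4a(int a, int b, int c) {
    for (int k = 0; k < 4; ++k) {
        c += int(int8_t(a >> 8*k)) * int(int8_t(b >> 8*k));
    }
    return c;
}

int main() {
    const int vi1 = 0x0F010203, vi2 = 0x00040506; // 4-bit quants, one per byte
    const int ui1 = 0x01FF0102, ui2 = 0x02020202; // q8 values; the 0xFF byte is -1
    const int dot = dp4a(vi2, ui2, dp4a(vi1, ui1, 0));
    printf("8-way dot = %d\n", dot); // prints 52
    return 0;
}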
@@ -1601,36 +1627,58 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;

-    const int bq8_offset = QR5_K * (iqs / QI8_1);
+    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));

     float sumf_d = 0.0f;
     float sumf_m = 0.0f;

     const float d = bq5_K->d;
     const float dmin = bq5_K->dmin;

-    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];

-    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+    const int vh1 = qh[0] >> bq8_offset;
+    const int vh2 = qh[4] >> bq8_offset;

-    for (int i = 0; i < QR5_K; ++i) {
-        const int isc = bq8_offset + i;
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m = sc + 2;

-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+    for (int i = 0; i < QR5_K; ++i) {

         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];

-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+        const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+        const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+        const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+        const int vi1 = vil1 | vih1;
+        const int vi2 = vil2 | vih2;

-        const int vih = ((vh >> i) << 4) & 0x10101010;
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

-        const int vi = vil | vih;
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]);

-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
     }

     return d*sumf_d - dmin*sumf_m;
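Q5_K follows the same structure with one extra step: the fifth bit of each quant lives in qh, and ((vh >> i) << 4) & 0x10101010 drops exactly that bit into bit 4 of every byte, so vil | vih reassembles four 5-bit values (0..31) per int before the dot products. A small host-side sketch, not part of the commit, with made-up inputs:

#include <cstdio>

int main() {
    const int vil = 0x0F0F0F0F; // low nibbles of four quants
    const int vh  = 0x00010001; // per-byte high bits (already shifted by bq8_offset)
    const int i   = 0;          // sub-block index, 0 or 1 since QR5_K == 2
    const int vih = ((vh >> i) << 4) & 0x10101010;
    const int vi  = vil | vih;
    printf("vi = 0x%08X\n", vi); // 0x0F1F0F1F -> quants 31, 15, 31, 15
    return 0;
}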
@@ -2306,7 +2354,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -2315,7 +2366,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
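The arithmetic behind the amortization note: halving the qi template parameter halves the number of vec_dot calls needed to cover a 256-quant superblock, while each call loads twice the quant data, so the per-call scale decoding is spread over twice as many dp4a groups. A back-of-the-envelope sketch, not part of the commit; QK_K = 256 and QI4_K = 32 are ggml's values, and "one call per iqs value" is our reading of the mul_mat_vec_q template:

#include <cstdio>
#include <initializer_list>

int main() {
    const int QK_K = 256, QI4_K = 32;
    for (int qi : {QI4_K, QI4_K/2}) {
        const int calls_per_block = qi;            // one vec_dot call per iqs value
        const int ints_per_call   = (QK_K/8) / qi; // 32 ints of packed nibbles per block
        printf("qi=%2d: %2d calls/superblock, %d int(s), %d dp4a groups per call\n",
               qi, calls_per_block, ints_per_call, 2*ints_per_call);
    }
    return 0; // qi=32: 2 dp4a groups per scale decode; qi=16: 4
}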
23212375
