@@ -1073,10 +1073,12 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,
     uint16_t aux[4];
     const uint8_t * sc = (const uint8_t *)aux;

+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
     for (int i = ix; i < num_blocks_per_row; i += 2) {

         const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * ql2 = ql1 + 64;
         const uint8_t * qh  = x[i].qh + l0;
         const float   * y1  = yy + i*QK_K + y_offset;
         const float   * y2  = y1 + 128;
@@ -1092,15 +1094,25 @@ static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx,

         float4 sum = {0.f, 0.f, 0.f, 0.f};
         float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
         for (int l = 0; l < n; ++l) {
-            sum.x += y1[l+ 0] * ((ql1[l+ 0] & 0xF) + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
-                   + y1[l+16] * ((ql1[l+16] & 0xF) + (qh[l+16] & (hm1 << 0) ? 16 : 0));
-            sum.y += y1[l+32] * ((ql1[l+ 0] >> 4) + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
-                   + y1[l+48] * ((ql1[l+16] >> 4) + (qh[l+16] & (hm1 << 1) ? 16 : 0));
-            sum.z += y2[l+ 0] * ((ql2[l+ 0] & 0xF) + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
-                   + y2[l+16] * ((ql2[l+16] & 0xF) + (qh[l+16] & (hm2 << 0) ? 16 : 0));
-            sum.w += y2[l+32] * ((ql2[l+ 0] >> 4) + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
-                   + y2[l+48] * ((ql2[l+16] >> 4) + (qh[l+16] & (hm2 << 1) ? 16 : 0));
+            sum.x += y1[l+ 0] * (q4[l + 0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0))
+                   + y1[l+16] * (q4[l + 2] + (qh[l+16] & (hm1 << 0) ? 16 : 0));
+            sum.y += y1[l+32] * (q4[l + 4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0))
+                   + y1[l+48] * (q4[l + 6] + (qh[l+16] & (hm1 << 1) ? 16 : 0));
+            sum.z += y2[l+ 0] * (q4[l + 8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0))
+                   + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0));
+            sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0))
+                   + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0));
             smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
                   + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
         }
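
The rewritten loop above no longer extracts nibbles byte by byte (ql1[l] & 0xF, ql1[l] >> 4); instead, the nibbles of whole 16-bit words are isolated once, before the loop, and read back through the byte pointer q4. Since q16 holds only 16 bytes and the loop reads up to q4[l+14], l effectively takes only the values 0 and 1 here. Below is a stand-alone, host-side sketch (not part of the patch; little-endian byte order is assumed, matching the GPU) of why a single & 0x0f0f yields the low nibbles of two adjacent bytes:

// Host-side sketch, not part of the patch. It assumes little-endian byte
// order (as on the GPU) and shows why one 16-bit mask isolates the low
// nibbles of two adjacent bytes, which is what the q16[]/q4[] repacking
// in the kernel relies on.
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
    const uint8_t ql1[2] = {0xAB, 0xCD};   // two bytes, each packing two 4-bit quants
    uint16_t q1;
    std::memcpy(&q1, ql1, sizeof(q1));     // same view as (const uint16_t *)ql1 in the kernel

    uint16_t q16[2];
    q16[0] = q1 & 0x0f0f;                  // low nibbles of both bytes
    q16[1] = (q1 >> 4) & 0x0f0f;           // high nibbles of both bytes

    const uint8_t * q4 = (const uint8_t *)q16;
    printf("%x %x %x %x\n", q4[0], q4[1], q4[2], q4[3]);  // prints: b d a c
    return 0;
}
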
@@ -1554,15 +1566,23 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;

-    const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6
+    // iqs is in 0...15. bq8_offset = 2 * (iqs/4) -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * (iqs / (QI8_1/2));

     float sumf_d = 0.0f;
     float sumf_m = 0.0f;

     const float    d = bq4_K->d;
     const float dmin = bq4_K->dmin;

-    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int v1 = q4[0];
+    const int v2 = q4[4];

     const uint16_t * scales = (const uint16_t *)bq4_K->scales;
     uint16_t aux[2];
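
The q4_offset table in the added comment follows directly from the index arithmetic. A stand-alone sketch (not part of the patch) that reproduces it, assuming QR4_K == 2 and QI8_1 == 8, their values in ggml-cuda.cu:

// Stand-alone sketch, not part of the patch: reproduces the q4_offset table
// from the comment above. QR4_K == 2 and QI8_1 == 8 are assumed, matching
// their definitions in ggml-cuda.cu.
#include <cstdio>

int main() {
    const int QR4_K = 2;
    const int QI8_1 = 8;
    for (int iqs = 0; iqs < 16; ++iqs) {
        const int bq8_offset = QR4_K * (iqs / (QI8_1/2));      // 0, 2, 4, 6
        const int q4_offset  = 16*bq8_offset + 4*(iqs % 4);    // byte offset into bq4_K->qs
        printf("iqs = %2d -> bq8_offset = %d, q4_offset = %3d\n", iqs, bq8_offset, q4_offset);
    }
    return 0;
}
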
@@ -1580,13 +1600,19 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
     for (int i = 0; i < QR4_K; ++i) {

         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];

-        const int vi = (v >> (4*i)) & 0x0F0F0F0F;
+        const int vi1 = (v1 >> (4*i)) & 0x0F0F0F0F;
+        const int vi2 = (v2 >> (4*i)) & 0x0F0F0F0F;

-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]); // multiply constant part of q4_K with sum of q8_1 values
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
     }

     return d*sumf_d - dmin*sumf_m;
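
With two 32-bit loads per q8_1 block (ui1, ui2), each scale index i now feeds eight 8-bit products into the accumulator through two chained __dp4a calls, whereas the old code issued a single __dp4a (four products) per scale. A stand-alone host sketch (not part of the patch) that emulates __dp4a to make the chaining explicit:

// Stand-alone sketch, not part of the patch: a host emulation of __dp4a used
// to show that __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)) accumulates one
// 8-element dot product, i.e. twice the work per scale of the old code.
#include <cstdint>
#include <cstdio>

// byte-wise signed dot product with accumulator, like the CUDA __dp4a intrinsic
static int dp4a_emu(int a, int b, int c) {
    for (int k = 0; k < 4; ++k) {
        c += (int)(int8_t)(a >> 8*k) * (int)(int8_t)(b >> 8*k);
    }
    return c;
}

int main() {
    const int vi1 = 0x01020304, ui1 = 0x01010101;   // arbitrary packed 8-bit lanes
    const int vi2 = 0x05060708, ui2 = 0x02020202;
    const int dot = dp4a_emu(vi2, ui2, dp4a_emu(vi1, ui1, 0));
    printf("dot = %d\n", dot);   // (4+3+2+1)*1 + (8+7+6+5)*2 = 62
    return 0;
}
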
@@ -1601,36 +1627,58 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;

-    const int bq8_offset = QR5_K * (iqs / QI8_1);
+    const int bq8_offset = QR5_K * (iqs / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * (iqs%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * (iqs%4));

     float sumf_d = 0.0f;
     float sumf_m = 0.0f;

     const float    d = bq5_K->d;
     const float dmin = bq5_K->dmin;

-    const int vl = *((int *) &bq5_K->qs[sizeof(int) * iqs]);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];

-    const int vh = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % (QI5_K/4))])) >> bq8_offset;
+    const int vh1 = qh[0] >> bq8_offset;
+    const int vh2 = qh[4] >> bq8_offset;

-    for (int i = 0; i < QR5_K; ++i) {
-        const int isc = bq8_offset + i;
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;

-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq5_K->scales, sc, m);
+    for (int i = 0; i < QR5_K; ++i) {

         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
         const float d8i = bq8i->d;
+        const int * q8 = (const int *)bq8i->qs + (iqs%4);
+        const int ui1 = q8[0];
+        const int ui2 = q8[4];

-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+        const int vil1 = (vl1 >> (4*i)) & 0x0F0F0F0F;
+        const int vil2 = (vl2 >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih1 = ((vh1 >> i) << 4) & 0x10101010;
+        const int vih2 = ((vh2 >> i) << 4) & 0x10101010;
+
+        const int vi1 = vil1 | vih1;
+        const int vi2 = vil2 | vih2;

-        const int vih = ((vh >> i) << 4) & 0x10101010;
+        const int dot1 = __dp4a(vi2, ui2, __dp4a(vi1, ui1, 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));

-        const int vi = vil | vih;
+        sumf_d += d8i * (dot1 * sc[i]);
+        sumf_m += d8i * (dot2 * m[i]);

-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q5_K with sum of q8_1 values
     }

     return d*sumf_d - dmin*sumf_m;
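
The q5_K variant uses the same structure, with one extra step: the fifth bit of each quant is pulled out of qh and placed at bit 4 of every byte lane before the __dp4a calls, so vi1/vi2 hold complete 5-bit values. A stand-alone host sketch (not part of the patch; QR5_K == 2 as in ggml-cuda.cu, and an arbitrary example bq8_offset) of that bit manipulation:

// Stand-alone sketch, not part of the patch: shows how the q5_K high bits are
// merged into the packed 4-bit lanes. Bit (bq8_offset + i) of each qh byte is
// moved to bit 4 (value 16) and OR-ed onto the corresponding low nibble.
#include <cstdio>

int main() {
    const int bq8_offset = 2;          // example value; the kernel derives it from iqs
    const int vl = 0x21436587;         // each byte packs two 4-bit quants
    const int qh = 0x0C0C0C0C;         // high-bit bytes: bits 2 and 3 set in every lane
    const int vh = qh >> bq8_offset;   // pre-shift by bq8_offset, as in the kernel

    for (int i = 0; i < 2; ++i) {      // QR5_K == 2
        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;     // nibble i of every byte lane
        const int vih = ((vh >> i) << 4) & 0x10101010;  // high bit -> value 16 per lane
        const int vi  = vil | vih;                      // complete 5-bit quants
        printf("i = %d: vi = 0x%08X\n", i, vi);         // every lane gains +16 here
    }
    return 0;
}
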
@@ -2306,7 +2354,10 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+    // Note: we use QI4_K/2 instead of QI4_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI4_K/2, block_q4_K, vec_dot_q4_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

@@ -2315,7 +2366,10 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const dim3 block_nums(1, block_num_y, 1);
     const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
-    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
+    // Note: we use QI5_K/2 instead of QI5_K to make the dot product template require 4 groups of quants to be processed per
+    // kernel call instead of 2. This results in better performance because the cost of computing the k-quant scales
+    // is better amortized.
+    mul_mat_vec_q<QK_K, QI5_K/2, block_q5_K, vec_dot_q5_K_q8_1>
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }

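
To see the effect of passing QI4_K/2 (and QI5_K/2) to mul_mat_vec_q, count the work per vec_dot call: a q4_K block holds 256 quants, i.e. 64 four-quant groups fed to __dp4a, so handling 4 groups per call instead of 2 halves the number of calls, and with them the number of times the k-quant scales have to be unpacked. A stand-alone sketch (not part of the patch; the relative costs are made-up illustrative numbers) of that trade-off:

// Stand-alone sketch, not part of the patch: a toy cost model for the
// QI4_K/2 comment above. A q4_K block has 256 quants = 64 four-quant groups
// fed to __dp4a; "scale_cost" and "group_cost" are made-up relative costs.
#include <cstdio>

int main() {
    const int groups_per_block = 64;   // 256 quants / 4 quants per __dp4a group
    const int scale_cost = 4;          // per-call cost of unpacking k-quant scales (illustrative)
    const int group_cost = 1;          // per-group __dp4a cost (illustrative)

    for (int groups_per_call = 2; groups_per_call <= 4; groups_per_call *= 2) {
        const int calls = groups_per_block / groups_per_call;
        const int cost  = calls * (scale_cost + groups_per_call * group_cost);
        printf("%d groups/call: %2d calls, relative cost %d\n", groups_per_call, calls, cost);
    }
    return 0;
}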