@@ -935,12 +935,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
    uint16_t aux[4];
    const uint8_t * sc = (const uint8_t *)aux;

+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
    float tmp = 0; // partial sum for thread in warp

    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {

-        const uint8_t * q1 = x[i].qs + q_offset;
-        const uint8_t * q2 = q1 + 64;
        const float   * y1 = yy + i*QK_K + y_offset;
        const float   * y2 = y1 + 128;

@@ -953,14 +959,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx,
        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);

+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
        float4 s = {0.f, 0.f, 0.f, 0.f};
        float smin = 0;
-        for (int l = 0; l < n; ++l) {
-            s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4);
-            s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4);
+        for (int l = 0; l < 4; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+4];
+            s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12];
            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin;
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif

    }
#else
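
Note on the change above: the new K_QUANTS_PER_ITERATION paths load 32 (or 16) bits of quants at once and split them into low and high nibbles with two masks instead of unpacking byte by byte. Because the 0xf0f0f0f0 mask leaves each high nibble in bits 4..7, the bytes read back through q4 are 16 times the quant value, which is why the high-nibble partial sums (s.y and s.w) are scaled by 1.f/16.f. The following host-side sketch is illustrative only (not part of the diff) and assumes little-endian byte order, as the device code itself does.

    // Illustrative sketch: mask-based nibble unpacking and the 1/16 correction factor.
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint32_t q = 0xC1057F3Au;            // four packed 4-bit pairs, as in block_q4_K::qs
        const uint8_t * qb = (const uint8_t *)&q;  // byte view of the original word

        uint32_t q32[2];
        q32[0] = q & 0x0f0f0f0f;                   // low nibbles, already in the range 0..15
        q32[1] = q & 0xf0f0f0f0;                   // high nibbles, each byte equals (qb[l] >> 4) * 16
        const uint8_t * q4 = (const uint8_t *)q32;

        for (int l = 0; l < 4; ++l) {
            printf("low %2d == %2d   high %2d == %2d\n",
                   qb[l] & 0xF, q4[l], qb[l] >> 4, q4[l + 4] / 16);
        }
        return 0;
    }
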
@@ -1521,7 +1554,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    const block_q4_K * bq4_K = (const block_q4_K *) vbq;

-    const int bq8_offset = QR4_K * (iqs / QI8_1);
+    const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6

    float sumf_d = 0.0f;
    float sumf_m = 0.0f;
@@ -1531,20 +1564,29 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(

    const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]);

-    for (int i = 0; i < QR4_K; ++i) {
-        const int isc = bq8_offset + i;
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;

-        uint8_t sc, m;
-        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+    for (int i = 0; i < QR4_K; ++i) {

        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        const int ui = *((int *) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]);
        const float d8i = bq8i->d;

        const int vi = (v >> (4*i)) & 0x0F0F0F0F;

-        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc);   // SIMD dot product
-        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m);   // multiply constant part of q4_K with sum of q8_1 values
+        sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]);   // SIMD dot product
+        sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]);   // multiply constant part of q4_K with sum of q8_1 values
    }

    return d*sumf_d - dmin*sumf_m;
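
Note on the change above: the per-iteration calls to get_scale_min_k4 are replaced by one up-front extraction. The 12-byte scales array is read as 16-bit words, and two 6-bit scales plus two 6-bit mins for the current bq8_offset are unpacked into aux, so the dp4a loop can simply index sc[i] and m[i]. The host-side sketch below compares the packed extraction against a scalar restatement of the same 12-byte scale/min layout; it is illustrative only, assumes little-endian byte order, and scale_min_ref is a hypothetical helper written for this note rather than code taken from the file.

    // Illustrative sketch: packed 16-bit extraction vs. scalar q4_K scale/min unpacking.
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    // Scalar restatement of the layout: 8 six-bit scales and 8 six-bit mins in 12 bytes.
    static void scale_min_ref(int is, const uint8_t * q, uint8_t * d, uint8_t * m) {
        if (is < 4) {
            *d = q[is] & 63;
            *m = q[is + 4] & 63;
        } else {
            *d = (q[is + 4] & 0xF) | ((q[is - 4] >> 6) << 4);
            *m = (q[is + 4] >>  4) | ((q[is - 0] >> 6) << 4);
        }
    }

    int main(void) {
        const uint8_t raw[12] = {0x5A, 0xA3, 0x17, 0xC8, 0x3E, 0x91, 0x70, 0x2B, 0x64, 0xD5, 0x0F, 0x88};
        uint16_t scales[6];
        memcpy(scales, raw, sizeof(scales));   // 16-bit view, mirroring the (const uint16_t *) cast

        for (int bq8_offset = 0; bq8_offset < 8; bq8_offset += 2) {
            const int j = bq8_offset/2;
            uint16_t aux[2];
            if (j < 2) {
                aux[0] = scales[j+0] & 0x3f3f;
                aux[1] = scales[j+2] & 0x3f3f;
            } else {
                aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
                aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
            }
            const uint8_t * sc = (const uint8_t *)aux;
            const uint8_t * m  = sc + 2;

            for (int i = 0; i < 2; ++i) {
                uint8_t d_ref, m_ref;
                scale_min_ref(bq8_offset + i, raw, &d_ref, &m_ref);
                printf("j=%d i=%d packed=(%2d,%2d) scalar=(%2d,%2d)\n", j, i, sc[i], m[i], d_ref, m_ref);
            }
        }
        return 0;
    }
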
@@ -2497,7 +2539,9 @@ static size_t g_scratch_offset = 0;

static int g_device_count = -1;
static int g_main_device = 0;
+#ifndef GGML_CUDA_FORCE_DMMV
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+#endif
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};

static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
@@ -2520,7 +2564,9 @@ void ggml_init_cublas() {
        g_tensor_split[id] = total_vram;
        total_vram += prop.totalGlobalMem;

+#ifndef GGML_CUDA_FORCE_DMMV
        g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
+#endif
    }
    for (int id = 0; id < g_device_count; ++id) {
        g_tensor_split[id] /= total_vram;
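
Note on the change above: g_compute_capabilities is only consulted when the runtime may pick the integer-intrinsic kernels, so when GGML_CUDA_FORCE_DMMV is defined (pinning the dequantize_mul_mat_vec path) the array and its initialization are compiled out. The self-contained sketch below shows the general pattern of a build flag overriding a runtime compute-capability check; it is not the file's actual dispatch code, and the sketch_ names and the 610 threshold (compute capability 6.1, the first to support __dp4a) are assumptions made for illustration.

    // Generic sketch: compile-time flag vs. runtime compute-capability dispatch.
    #include <stdio.h>

    #define SKETCH_MIN_CC_DP4A 610                         // hypothetical stand-in for MIN_CC_DP4A

    static int sketch_compute_capabilities[2] = {520, 700}; // e.g. one pre-dp4a GPU, one newer GPU

    static int use_dp4a_kernel(int device_id) {
    #ifdef GGML_CUDA_FORCE_DMMV
        (void) device_id;
        return 0;                                          // flag pins the dequantize path everywhere
    #else
        return sketch_compute_capabilities[device_id] >= SKETCH_MIN_CC_DP4A;
    #endif
    }

    int main(void) {
        for (int id = 0; id < 2; ++id) {
            printf("device %d -> %s\n", id, use_dp4a_kernel(id) ? "dp4a mat-vec" : "dequantize mat-vec");
        }
        return 0;
    }
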