@@ -167,6 +167,8 @@ typedef struct {
 } block_q3_K;
 //static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
 
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
 #ifdef GGML_QKK_64
 typedef struct {
     half d[2];             // super-block scales/mins
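
Note: the patch does not spell this out, but by the existing QR*/QI* convention in this file, QR4_K is the number of 4-bit quants packed per byte position and QI4_K is the number of 32-bit ints of quantized data per block_q4_K that the vectorized dot product iterates over. A quick sanity-check sketch under that reading (and assuming the default QK_K == 256 build, i.e. GGML_QKK_64 not defined):

    // Sketch only: my reading of the QR*/QI* convention, not part of the patch.
    // QI4_K = QK_K / (4 * QR4_K) = 256 / (4 * 2) = 32 ints per super-block,
    // and 32 ints * 4 bytes * 2 nibbles/byte = 256 quants = QK_K.
    static_assert(QI4_K * sizeof(int) * QR4_K == QK_K, "QI4_K should cover exactly one super-block");
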
@@ -1491,6 +1493,44 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 #endif // __CUDA_ARCH__ >= 610
 }
 
+static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+
+#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    const int bq8_offset = 2 * (iqs / 8);
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    const float    d = bq4_K->d;
+    const float dmin = bq4_K->dmin;
+
+    const int vi = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
+
+    for (int i = 0; i < 2; ++i) {
+        const int isc = bq8_offset + i;
+
+        uint8_t sc, m;
+        get_scale_min_k4(isc, bq4_K->scales, sc, m);
+
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        const int uii = *((int *) &bq8i->qs[sizeof(int) * (iqs % 8)]);
+        const float d8i = bq8i->d;
+
+        const int vii = (vi >> (4*i)) & 0x0F0F0F0F;
+
+        sumf_d += d8i * (__dp4a(vii, uii, 0) * sc);
+        sumf_m += d8i * (__dp4a(0x01010101, uii, 0) * m);
+    }
+
+    return d*sumf_d - dmin*sumf_m;
+#else
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= 610
+}
+
 template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
 static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
     const int row = blockIdx.y*blockDim.y + threadIdx.y;
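
For readers unfamiliar with the intrinsic: __dp4a(a, b, c) treats a and b as four packed 8-bit lanes, multiplies them lane-wise and adds the sum to c, so each loop iteration above accumulates four 4-bit by 8-bit products at once (the low nibbles of the four bytes at i = 0, the high nibbles at i = 1). A scalar stand-in for that one call, offered as a sketch rather than anything from the patch (the signed-lane reading is fine here because the masked nibbles are only 0..15):

    #include <cstdint>

    // Hypothetical host-side reference for __dp4a(a, b, c): per-byte products summed into c.
    static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
        for (int k = 0; k < 4; ++k) {
            const int8_t ak = (int8_t)(a >> (8 * k)); // nibble byte, 0..15 after the 0x0F0F0F0F mask
            const int8_t bk = (int8_t)(b >> (8 * k)); // signed q8_1 quant
            c += ak * bk;
        }
        return c;
    }

The net effect is that the q4_K reconstruction x = d*sc*q4 - dmin*m is folded directly into the dot product: sumf_d collects the sc-weighted q4*q8 terms and sumf_m the m-weighted sums of q8, each scaled by the q8_1 block scale d8i, instead of dequantizing to floats first.
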
@@ -2067,6 +2107,15 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
         <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
 }
 
+static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
+    mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
 static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
     dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
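
Launch-shape arithmetic for orientation, assuming this file's defaults WARP_SIZE == 32 and GGML_CUDA_MMV_Y == 1 (both configurable), with example sizes picked here purely for illustration:

    // Example: nrows = 4096, ncols = 4096
    // block_num_y = (4096 + 1 - 1) / 1 = 4096  ->  grid (1, 4096, 1)
    // block_dims  = (32, 1, 1)                 ->  one warp per output row
    // Each warp then reduces ncols / QK_K = 16 q4_K super-blocks for its row inside mul_mat_vec_q.
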
@@ -2531,8 +2580,8 @@ inline void ggml_cuda_op_mul_mat_vec(
         src0->type == GGML_TYPE_Q5_1 ||
         src0->type == GGML_TYPE_Q8_0 ||
         src0->type == GGML_TYPE_Q2_K ||
-        src0->type == GGML_TYPE_Q3_K;
-        // src0->type == GGML_TYPE_Q4_K ||
+        src0->type == GGML_TYPE_Q3_K ||
+        src0->type == GGML_TYPE_Q4_K;
         // src0->type == GGML_TYPE_Q5_K ||
         // src0->type == GGML_TYPE_Q5_K;
 
@@ -2568,6 +2617,9 @@ inline void ggml_cuda_op_mul_mat_vec(
         case GGML_TYPE_Q3_K:
             mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
             break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
+            break;
         default:
             GGML_ASSERT(false);
             break;