@@ -186,6 +186,8 @@ typedef struct {
186
186
static_assert (sizeof (block_q4_K) == 2*sizeof(ggml_fp16_t ) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
187
187
#endif
188
188
189
// q5_K quantization constants for the integer-intrinsic dot-product path:
// QR5_K — number of dequantized value groups packed per 32-bit quant word
//         (q5_K stores two 4-bit nibbles per byte; the 5th bit lives in qh).
// QI5_K — number of 32-bit ints of qs data per q5_K super-block:
//         QK_K values / (4 values per byte-lane group * QR5_K).
#define QR5_K 2
#define QI5_K (QK_K / (4*QR5_K))
189
191
#ifdef GGML_QKK_64
190
192
typedef struct {
191
193
half d; // super-block scale
@@ -1531,6 +1533,50 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1531
1533
#endif // __CUDA_ARCH__ >= 610
1532
1534
}
1533
1535
1536
// Dot product of one q5_K super-block slice against q8_1 quantized activations.
//
// vbq   - pointer to a block_q5_K super-block (low 4 bits in qs, 5th bit in qh)
// bq8_1 - array of q8_1 blocks holding the activation values
// iqs   - index (in ints) of the 32-bit word of qs this thread processes
//
// Returns d*sum(q5*q8*scale) - dmin*sum(q8*min), i.e. the super-block
// scale/min decomposition evaluated with the __dp4a SIMD intrinsic.
// Requires SM61+; on older architectures it compiles to a stub returning 0.
static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {

#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
    const block_q5_K * bq5_K = (const block_q5_K *) vbq;

    // Each qs word contributes to 2 consecutive q8_1 blocks / scale entries.
    const int bq8_offset = 2 * (iqs / 8);

    float sumf_d = 0.0f; // accumulates scale * (q5 . q8) terms
    float sumf_m = 0.0f; // accumulates min   * sum(q8)   terms
                         // (was `= 0`; use a float literal, consistent with sumf_d)

    const float d    = bq5_K->d;    // super-block scale for scales
    const float dmin = bq5_K->dmin; // super-block scale for mins

    // 32-bit word of low 4-bit quants for this thread.
    const int vil = *((int *) &bq5_K->qs[sizeof(int) * iqs]);

    // Matching word of high bits, pre-shifted so bit i selects iteration i below.
    const int vih = (*((int *) &bq5_K->qh[sizeof(int) * (iqs % 8)])) >> bq8_offset;

    for (int i = 0; i < 2; ++i) {
        const int isc = bq8_offset + i;

        // Unpack the 6-bit scale and min for this sub-block.
        uint8_t sc, m;
        get_scale_min_k4(isc, bq5_K->scales, sc, m);

        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
        const int   uii = *((int *) &bq8i->qs[sizeof(int) * (iqs % 8)]);
        const float d8i = bq8i->d;

        // Low nibbles of 4 packed quants (bits 0-3 of each 5-bit value).
        const int viil = (vil >> (4*i)) & 0x0F0F0F0F;

        // 5th bit of each quant, moved into bit position 4 of each byte lane.
        const int viih = ((vih >> i) << 4) & 0x10101010;

        // Reassembled 5-bit quants, one per byte lane.
        const int vii = viil | viih;

        sumf_d += d8i * (__dp4a(vii, uii, 0) * sc);        // SIMD dot product
        sumf_m += d8i * (__dp4a(0x01010101, uii, 0) * m);  // sum of q8 values
    }

    return d*sumf_d - dmin*sumf_m;
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= 610
}
1579
+
1534
1580
template <int qk, int qi, typename block_q_t , vec_dot_q_cuda_t vec_dot_q_cuda>
1535
1581
static __global__ void mul_mat_vec_q (const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
1536
1582
const int row = blockIdx .y *blockDim .y + threadIdx .y ;
@@ -2116,6 +2162,15 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
2116
2162
<<<block_nums, block_dims, 0 , stream>>> (vx, vy, dst, ncols, nrows);
2117
2163
}
2118
2164
2165
// Launch the q5_K x q8_1 matrix-vector product kernel on the given stream.
// ncols must be a multiple of the q5_K super-block size QK_K.
// Launch shape: one block column, ceil(nrows / GGML_CUDA_MMV_Y) block rows,
// each block WARP_SIZE x GGML_CUDA_MMV_Y threads (one warp per output row).
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % QK_K == 0);

    const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);

    const int nblocks_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; // ceil-div
    const dim3 block_nums(1, nblocks_y, 1);

    mul_mat_vec_q<QK_K, QI5_K, block_q5_K, vec_dot_q5_K_q8_1>
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
2173
+
2119
2174
static void convert_fp16_to_fp32_cuda (const void * vx, float * y, const int k, cudaStream_t stream) {
2120
2175
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1 ) / CUDA_DEQUANTIZE_BLOCK_SIZE;
2121
2176
dequantize_block<1 , 1 , convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0 , stream>>> (vx, y, k);
@@ -2581,8 +2636,8 @@ inline void ggml_cuda_op_mul_mat_vec(
2581
2636
src0->type == GGML_TYPE_Q8_0 ||
2582
2637
src0->type == GGML_TYPE_Q2_K ||
2583
2638
src0->type == GGML_TYPE_Q3_K ||
2584
- src0->type == GGML_TYPE_Q4_K;
2585
- // src0->type == GGML_TYPE_Q5_K ||
2639
+ src0->type == GGML_TYPE_Q4_K ||
2640
+ src0->type == GGML_TYPE_Q5_K;
2586
2641
// src0->type == GGML_TYPE_Q5_K;
2587
2642
2588
2643
const bool use_mul_mat_vec_q = g_compute_capabilities[id] >= 610 && mul_mat_vec_q_implemented;
@@ -2620,6 +2675,9 @@ inline void ggml_cuda_op_mul_mat_vec(
2620
2675
case GGML_TYPE_Q4_K:
2621
2676
mul_mat_vec_q4_K_q8_1_cuda (src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2622
2677
break ;
2678
+ case GGML_TYPE_Q5_K:
2679
+ mul_mat_vec_q5_K_q8_1_cuda (src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2680
+ break ;
2623
2681
default :
2624
2682
GGML_ASSERT (false );
2625
2683
break ;
0 commit comments