Skip to content

Commit 9d51118

Browse files
q4_k works
1 parent 5528a5f commit 9d51118

File tree

1 file changed

+54
-2
lines changed

1 file changed

+54
-2
lines changed

ggml-cuda.cu

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,8 @@ typedef struct {
167167
} block_q3_K;
168168
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
169169

170+
#define QR4_K 2
171+
#define QI4_K (QK_K / (4*QR4_K))
170172
#ifdef GGML_QKK_64
171173
typedef struct {
172174
half d[2]; // super-block scales/mins
@@ -1491,6 +1493,44 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
14911493
#endif // __CUDA_ARCH__ >= 610
14921494
}
14931495

1496+
static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
1497+
const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
1498+
1499+
#if __CUDA_ARCH__ >= 610 // lowest compute capability for integer intrinsics
1500+
const block_q4_K * bq4_K = (const block_q4_K *) vbq;
1501+
1502+
const int bq8_offset = 2 * (iqs / 8);
1503+
1504+
float sumf_d = 0.0f;
1505+
float sumf_m = 0;
1506+
1507+
const float d = bq4_K->d;
1508+
const float dmin = bq4_K->dmin;
1509+
1510+
const int vi = *((int *) &bq4_K->qs[sizeof(int) * iqs]);
1511+
1512+
for (int i = 0; i < 2; ++i) {
1513+
const int isc = bq8_offset + i;
1514+
1515+
uint8_t sc, m;
1516+
get_scale_min_k4(isc, bq4_K->scales, sc, m);
1517+
1518+
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
1519+
const int uii = *((int*) &bq8i->qs[sizeof(int) * (iqs%8)]);
1520+
const float d8i = bq8i->d;
1521+
1522+
const int vii = (vi >> (4*i)) & 0x0F0F0F0F;
1523+
1524+
sumf_d += d8i * (__dp4a(vii, uii, 0) * sc);
1525+
sumf_m += d8i * (__dp4a(0x01010101, uii, 0) * m);
1526+
}
1527+
1528+
return d*sumf_d - dmin*sumf_m;
1529+
#else
1530+
return 0.0f; // only to satisfy the compiler
1531+
#endif // __CUDA_ARCH__ >= 610
1532+
}
1533+
14941534
template <int qk, int qi, typename block_q_t, vec_dot_q_cuda_t vec_dot_q_cuda>
14951535
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
14961536
const int row = blockIdx.y*blockDim.y + threadIdx.y;
@@ -2067,6 +2107,15 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
20672107
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
20682108
}
20692109

2110+
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
2111+
GGML_ASSERT(ncols % QK_K == 0);
2112+
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
2113+
const dim3 block_nums(1, block_num_y, 1);
2114+
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
2115+
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, vec_dot_q4_K_q8_1>
2116+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
2117+
}
2118+
20702119
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
20712120
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
20722121
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2531,8 +2580,8 @@ inline void ggml_cuda_op_mul_mat_vec(
25312580
src0->type == GGML_TYPE_Q5_1 ||
25322581
src0->type == GGML_TYPE_Q8_0 ||
25332582
src0->type == GGML_TYPE_Q2_K ||
2534-
src0->type == GGML_TYPE_Q3_K;
2535-
// src0->type == GGML_TYPE_Q4_K ||
2583+
src0->type == GGML_TYPE_Q3_K ||
2584+
src0->type == GGML_TYPE_Q4_K;
25362585
// src0->type == GGML_TYPE_Q5_K ||
25372586
// src0->type == GGML_TYPE_Q5_K;
25382587

@@ -2568,6 +2617,9 @@ inline void ggml_cuda_op_mul_mat_vec(
25682617
case GGML_TYPE_Q3_K:
25692618
mul_mat_vec_q3_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
25702619
break;
2620+
case GGML_TYPE_Q4_K:
2621+
mul_mat_vec_q4_K_q8_1_cuda(src0_ddq_i, src1_q8_1, dst_ddf_i, ne00, nrows, cudaStream_main);
2622+
break;
25712623
default:
25722624
GGML_ASSERT(false);
25732625
break;

0 commit comments

Comments
 (0)