Commit 4fcd9ff

CUDA: faster q8_0 -> f16 dequantization
1 parent 326b418 commit 4fcd9ff

File tree

1 file changed: +49 -1 lines changed

ggml-cuda.cu

Lines changed: 49 additions & 1 deletion
@@ -519,6 +519,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 
+#define CUDA_Q8_0_NE_ALIGN 2048
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
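
CUDA_Q8_0_NE_ALIGN is the number of q8_0 values each block of the new dequantization kernel (added in the next hunk) handles. A hypothetical compile-time check, not part of the patch, makes the constraints that kernel relies on explicit, assuming QK8_0 == 32 and WARP_SIZE == 32, their usual values in ggml:

// Hypothetical sanity checks (not in the patch) for the assumptions behind CUDA_Q8_0_NE_ALIGN.
static_assert(CUDA_Q8_0_NE_ALIGN % QK8_0 == 0,
              "each CUDA block must process a whole number of q8_0 blocks");
static_assert(CUDA_Q8_0_NE_ALIGN % (2*WARP_SIZE) == 0,
              "the half2 write loop advances 2*WARP_SIZE values per iteration");
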
@@ -2327,6 +2329,41 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
     y[i] = x[i];
 }
 
+template <bool need_check>
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+    const int * x0 = ((int *) vx) + blockIdx.x * nint;
+    half2     * y2 = (half2 *) (y + i0);
+
+    __shared__ int vals[nint];
+
+#pragma unroll
+    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+            break;
+        }
+
+        const int ix = ix0 + threadIdx.x;
+        vals[ix] = x0[ix];
+    }
+
+#pragma unroll
+    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+            return;
+        }
+
+        const half * b0 = ((const half  *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+        const half    d = *b0;
+        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+    }
+
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
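
Each CUDA block of WARP_SIZE threads stages one slab of CUDA_Q8_0_NE_ALIGN = 2048 values in shared memory with coalesced int loads. Assuming q8_0's usual ggml layout (a 2-byte fp16 scale followed by 32 int8 quants, 34 bytes per block), 2048 values are 64 such blocks, i.e. 2176 bytes = 544 ints; that is where the "+ WARP_SIZE" in nint comes from, since the extra 32 ints hold the 64 fp16 scales. Each thread then emits two fp16 outputs per iteration with a single __hmul2. The per-element math is unchanged from plain q8_0 dequantization; a minimal scalar sketch of it, not the patch's kernel and using a hypothetical block_q8_0_sketch struct that mirrors the assumed layout, looks like this:

// Scalar sketch (not from the patch) of the per-element math the fast kernel vectorizes:
// every output value is its q8_0 block's fp16 scale times one int8 quant.
#include <cuda_fp16.h>
#include <stdint.h>

#define QK8_0_SKETCH 32

struct block_q8_0_sketch {
    half   d;                  // per-block scale
    int8_t qs[QK8_0_SKETCH];   // quantized values
};

static __global__ void dequantize_q8_0_f16_scalar(const block_q8_0_sketch * x, half * y, const int k) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i >= k) {
        return;
    }

    const int ib = i / QK8_0_SKETCH;   // which q8_0 block
    const int iq = i % QK8_0_SKETCH;   // which quant within that block

    y[i] = __float2half(__half2float(x[ib].d) * (float) x[ib].qs[iq]);   // y = d * q
}
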

@@ -6181,6 +6218,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+        const bool need_check = false;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    } else {
+        const bool need_check = true;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    }
+}
+
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
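
The launch wrapper picks the kernel instantiation at compile time: when k is a multiple of CUDA_Q8_0_NE_ALIGN every block is full and the need_check = false variant carries no bounds checks; otherwise the checked variant guards the partial tail block. A hypothetical call site, not part of the patch and assumed to live inside ggml-cuda.cu where the wrapper and half are visible, with the grid arithmetic spelled out:

// Hypothetical usage (not in the patch): dx_q8_0 holds k/QK8_0 q8_0 blocks on the
// device, dy_f16 has room for k half values.
static void example_q8_0_to_f16(const void * dx_q8_0, half * dy_f16, cudaStream_t stream) {
    const int k = 4992;   // 156 q8_0 blocks; 4992 % 2048 != 0, so the bounds-checked variant runs
    // grid size: (4992 + 2048 - 1) / 2048 = 3 blocks of WARP_SIZE threads,
    // the last of which covers only 4992 - 2*2048 = 896 values
    dequantize_block_q8_0_f16_cuda(dx_q8_0, dy_f16, k, stream);
}
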
@@ -6256,7 +6304,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_q8_0_f16_cuda;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;
         case GGML_TYPE_Q3_K:
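
With the dispatch table updated, any caller that converts Q8_0 data to fp16 through ggml_get_to_fp16_cuda picks up the new kernel without further changes. A hedged sketch of such a call site follows; the signature behind to_fp16_cuda_t is inferred from the wrapper above rather than quoted from the file, and the null check is merely defensive:

// Sketch (not from the patch) of how the dispatcher is typically consumed.
static void convert_src_to_f16(ggml_type type, const void * src, half * dst, int ne, cudaStream_t stream) {
    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(type);
    if (to_fp16_cuda != nullptr) {
        to_fp16_cuda(src, dst, ne, stream);   // for GGML_TYPE_Q8_0 this now runs the new kernel
    }
}
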
