@@ -519,6 +519,8 @@ static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16
 #define CUDA_ACC_BLOCK_SIZE 256
 #define CUDA_IM2COL_BLOCK_SIZE 256
 
+#define CUDA_Q8_0_NE_ALIGN 2048
+
 // dmmv = dequantize_mul_mat_vec
 #ifndef GGML_CUDA_DMMV_X
 #define GGML_CUDA_DMMV_X 32
@@ -2327,6 +2329,45 @@ static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __res
     y[i] = x[i];
 }
 
+template <bool need_check>
+static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) {
+#if __CUDA_ARCH__ >= CC_PASCAL
+    constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE;
+
+    const int   i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x;
+    const int * x0 = ((int *) vx) + blockIdx.x * nint;
+    half2     * y2 = (half2 *) (y + i0);
+
+    __shared__ int vals[nint];
+
+#pragma unroll
+    for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) {
+        if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) {
+            break;
+        }
+
+        const int ix = ix0 + threadIdx.x;
+        vals[ix] = x0[ix];
+    }
+
+#pragma unroll
+    for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) {
+        if (need_check && i0 + iy + 2*threadIdx.x >= k) {
+            return;
+        }
+
+        const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0);
+        const half    d = *b0;
+        const char2  qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)];
+
+        y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d));
+    }
+#else
+    (void) vx; (void) y; (void) k;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_PASCAL
+}
+
 // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
 // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
 
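For reference, the kernel added above performs the usual Q8_0 dequantization, just two values at a time with `half2` math after staging the raw block data in shared memory: each `block_q8_0` holds one fp16 scale `d` followed by `QK8_0 = 32` signed 8-bit quants, and every output element is `d * q`. A minimal scalar sketch of that computation follows; the struct is an illustrative stand-in for ggml's `block_q8_0`, not the real definition.

```cpp
// Scalar sketch of what dequantize_block_q8_0_f16 computes, assuming ggml's
// block_q8_0 layout: one fp16 scale d followed by QK8_0 = 32 int8 quants.
#include <cuda_fp16.h>
#include <stdint.h>

#define QK8_0 32

typedef struct {
    __half d;            // per-block scale (ggml_fp16_t in ggml proper)
    int8_t qs[QK8_0];    // 32 signed 8-bit quants
} block_q8_0_sketch;     // illustrative stand-in for ggml's block_q8_0

// y[i] = d(block) * q(i); the kernel does the same product two elements at a
// time with __hmul2 on half2 pairs loaded from shared memory.
static void dequantize_q8_0_scalar(const block_q8_0_sketch * x, __half * y, int k) {
    for (int i = 0; i < k; ++i) {
        const block_q8_0_sketch * b = &x[i / QK8_0];
        y[i] = __float2half(__half2float(b->d) * (float) b->qs[i % QK8_0]);
    }
}
```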
@@ -6181,6 +6222,17 @@ static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restri
     dequantize_block<qk, qr, dequantize_kernel><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
 }
 
+static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN;
+    if (k % CUDA_Q8_0_NE_ALIGN == 0) {
+        const bool need_check = false;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    } else {
+        const bool need_check = true;
+        dequantize_block_q8_0_f16<need_check><<<num_blocks, WARP_SIZE, 0, stream>>>(vx, y, k);
+    }
+}
+
 template<typename dst_t>
 static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) {
     const int nb = k / QK_K;
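The launcher chooses the template instantiation at runtime: when `k` is a multiple of `CUDA_Q8_0_NE_ALIGN` (2048) it takes the `need_check = false` path, so the per-iteration bounds checks in the kernel are compiled out entirely. Below is a hypothetical usage sketch, not part of the commit; it assumes it lives in the same translation unit as the launcher (which is `static`) and that the caller has the quantized data on the host.

```cpp
// Hypothetical usage sketch (not from the commit): dequantize k Q8_0 values
// to f16 on the device via the new launcher.
#include <cuda_runtime.h>
#include <cuda_fp16.h>

static void example_q8_0_to_f16(const void * h_q8_data, size_t q8_bytes, int k) {
    void * d_src = nullptr;
    half * d_dst = nullptr;
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cudaMalloc(&d_src, q8_bytes);                   // k/QK8_0 blocks of sizeof(block_q8_0) bytes
    cudaMalloc((void **) &d_dst, k*sizeof(half));
    cudaMemcpyAsync(d_src, h_q8_data, q8_bytes, cudaMemcpyHostToDevice, stream);

    // If k is a multiple of CUDA_Q8_0_NE_ALIGN (2048) this resolves to the
    // need_check = false instantiation and all bounds checks are elided.
    dequantize_block_q8_0_f16_cuda(d_src, d_dst, k, stream);

    cudaStreamSynchronize(stream);                  // d_dst now holds k half values
    cudaFree(d_src);
    cudaFree(d_dst);
    cudaStreamDestroy(stream);
}
```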
@@ -6256,7 +6308,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_q8_0_f16_cuda;
         case GGML_TYPE_Q2_K:
             return dequantize_row_q2_K_cuda;
         case GGML_TYPE_Q3_K: