@@ -114,7 +114,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +

#define QK8_0 32
#define QR8_0 1
-#define QI8_0 4
+#define QI8_0 8
typedef struct {
    half    d;              // delta
    int8_t  qs[QK8_0];      // quants
@@ -123,6 +123,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo

#define QK8_1 32
#define QR8_1 1
+#define QI8_1 8
typedef struct {
    half    d;              // delta
    half    s;              // unquantized sum
@@ -1253,7 +1254,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
    const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
    const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);

    const float d = bq5_0->d * bq8_1->d;

@@ -1283,7 +1284,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
    const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
    const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
    const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
-    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
+    const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);

    const float d = bq5_1->d * bq8_1->d;
    const float m = bq5_1->m;
@@ -1306,6 +1307,20 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
    return sumi*d + m*s / QI5_1;
}

+static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int vi;
+    memcpy(&vi, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
+    const int ui = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
+
+    const float d = bq8_0->d * bq8_1->d;
+
+    int sumi = __dp4a(vi, ui, 0);
+
+    return sumi*d;
+}
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_block(const void * vx, float * y, const int k) {
    const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -1336,7 +1351,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
    }

    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = WARP_SIZE * sizeof(int)*2 / qk;
+    const int blocks_per_warp = WARP_SIZE / qi;

    // partial sum for each thread
    float tmp = 0.0f;
@@ -1345,9 +1360,9 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
    const block_q8_1 * y = (const block_q8_1 *) vy;

    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
+        const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index

-        const int iby = i + threadIdx.x / qi;
+        const int iby = i + threadIdx.x / qi;

        const int iqs = threadIdx.x % qi;

@@ -1875,6 +1890,15 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}

+static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
+    const dim3 block_nums(1, block_num_y, 1);
+    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
+    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
+        <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
+}
+
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
    dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2404,6 +2428,9 @@ inline void ggml_cuda_op_mul_mat_vec_q(
        case GGML_TYPE_Q5_1:
            mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
+            break;
        default:
            GGML_ASSERT(false);
            break;
@@ -2961,7 +2988,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
    if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
        bool use_mul_mat_vec_q = false;
        use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1
-            || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1;
+            || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0;

        if (use_mul_mat_vec_q) {
            ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, false, false);
        } else {