Skip to content

Commit deae466

Browse files
q8_0
1 parent 08db199 commit deae466

File tree

1 file changed

+34
-7
lines changed

1 file changed

+34
-7
lines changed

ggml-cuda.cu

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) +
114114

115115
#define QK8_0 32
116116
#define QR8_0 1
117-
#define QI8_0 4
117+
#define QI8_0 8
118118
typedef struct {
119119
half d; // delta
120120
int8_t qs[QK8_0]; // quants
@@ -123,6 +123,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 blo
123123

124124
#define QK8_1 32
125125
#define QR8_1 1
126+
#define QI8_1 8
126127
typedef struct {
127128
half d; // delta
128129
half s; // unquantized sum
@@ -1253,7 +1254,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, cons
12531254
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
12541255
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
12551256
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1256-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1257+
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_0)]);
12571258

12581259
const float d = bq5_0->d * bq8_1->d;
12591260

@@ -1283,7 +1284,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
12831284
const int qh0 = bq5_1->qh[iqs/2 + 0] >> 4*(iqs%2);
12841285
const int qh1 = bq5_1->qh[iqs/2 + 2] >> 4*(iqs%2);
12851286
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1286-
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1287+
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI5_1)]);
12871288

12881289
const float d = bq5_1->d * bq8_1->d;
12891290
const float m = bq5_1->m;
@@ -1306,6 +1307,20 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(const void * vbq, cons
13061307
return sumi*d + m*s / QI5_1;
13071308
}
13081309

1310+
// Dot product of one int-sized group of q8_0 quants against the matching
// group of q8_1 quants. iqs selects the 4-byte group within the block
// (the caller derives it as threadIdx.x % QI8_0).
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    // block_q8_0 starts with a 2-byte half, so int-sized reads from qs can be
    // misaligned; go through memcpy instead of dereferencing an int pointer.
    int vx;
    memcpy(&vx, &bq8_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
    const int vy = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);

    // product of the two per-block scales (half*half, widened to float)
    const float d = bq8_0->d * bq8_1->d;

    // SIMD dot of 4 packed int8 pairs, accumulated into a 32-bit int
    const int sumi = __dp4a(vx, vy, 0);

    return sumi*d;
}
1323+
13091324
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
13101325
static __global__ void dequantize_block(const void * vx, float * y, const int k) {
13111326
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
@@ -1336,7 +1351,7 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
13361351
}
13371352

13381353
const int blocks_per_row = ncols / qk;
1339-
const int blocks_per_warp = WARP_SIZE * sizeof(int)*2/qk;
1354+
const int blocks_per_warp = WARP_SIZE / qi;
13401355

13411356
// partial sum for each thread
13421357
float tmp = 0.0f;
@@ -1345,9 +1360,9 @@ static __global__ void mul_mat_vec_q(const void * vx, const void * vy, float * d
13451360
const block_q8_1 * y = (const block_q8_1 *) vy;
13461361

13471362
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
1348-
const int ibx = row*blocks_per_row + i + threadIdx.x/qi; // x block index
1363+
const int ibx = row*blocks_per_row + i + threadIdx.x / qi; // x block index
13491364

1350-
const int iby = i + threadIdx.x/qi;
1365+
const int iby = i + threadIdx.x / qi;
13511366

13521367
const int iqs = threadIdx.x % qi;
13531368

@@ -1875,6 +1890,15 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
18751890
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
18761891
}
18771892

1893+
// Host wrapper launching the templated mat-vec kernel for q8_0 x q8_1.
// Grid: one y-block per GGML_CUDA_DMMV_Y rows; block: WARP_SIZE threads in x
// (one warp cooperating across the columns of each row).
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);

    // ceil-div so a trailing partial group of rows still gets a block
    const int ny = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
    const dim3 grid_dims(1, ny, 1);
    const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);

    mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, vec_dot_q8_0_q8_1>
        <<<grid_dims, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
}
1901+
18781902
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
18791903
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
18801904
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2404,6 +2428,9 @@ inline void ggml_cuda_op_mul_mat_vec_q(
24042428
case GGML_TYPE_Q5_1:
24052429
mul_mat_vec_q5_1_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
24062430
break;
2431+
case GGML_TYPE_Q8_0:
2432+
mul_mat_vec_q8_0_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
2433+
break;
24072434
default:
24082435
GGML_ASSERT(false);
24092436
break;
@@ -2961,7 +2988,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
29612988
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
29622989
bool use_mul_mat_vec_q = false;
29632990
use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1
2964-
|| src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1;
2991+
|| src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0;
29652992
if (use_mul_mat_vec_q) {
29662993
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, false, false);
29672994
} else {

0 commit comments

Comments
 (0)