Skip to content

Commit 4f7b612

Browse files
q5_0
1 parent 90b8539 commit 4f7b612

File tree

1 file changed

+43
-2
lines changed

1 file changed

+43
-2
lines changed

ggml-cuda.cu

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,7 +1220,6 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(const void * vbq, cons
12201220
sumi = __dp4a(vi1, ui1, sumi);
12211221

12221222
return sumi*d;
1223-
12241223
}
12251224

12261225
static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
@@ -1241,7 +1240,37 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * vbq, cons
12411240
sumi = __dp4a(vi1, ui1, sumi);
12421241

12431242
return sumi*d + m*s / QI4_1;
1243+
}
1244+
1245+
static __device__ __forceinline__ float vec_dot_q5_0_q8_1(const void * vbq, const block_q8_1 * bq8_1, const int iqs) {
1246+
const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
1247+
1248+
int qs;
1249+
memcpy(&qs, &bq5_0->qs[sizeof(int) * (iqs + 0)], sizeof(int));
1250+
const int qh0 = bq5_0->qh[iqs/2 + 0] >> 4*(iqs%2);
1251+
const int qh1 = bq5_0->qh[iqs/2 + 2] >> 4*(iqs%2);
1252+
const int ui0 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + 0)]);
1253+
const int ui1 = *((int *) &bq8_1->qs[sizeof(int) * (iqs + QI4_0)]);
1254+
1255+
const float d = bq5_0->d * bq8_1->d;
1256+
1257+
int vi0 = (qs >> 0) & 0x0F0F0F0F;
1258+
vi0 |= (qh0 << 4) & 0x00000010;
1259+
vi0 |= (qh0 << 11) & 0x00001000;
1260+
vi0 |= (qh0 << 18) & 0x00100000;
1261+
vi0 |= (qh0 << 25) & 0x10000000;
1262+
vi0 = __vsub4(vi0, 0x10101010);
1263+
int sumi = __dp4a(vi0, ui0, 0);
1264+
1265+
int vi1 = (qs >> 4) & 0x0F0F0F0F;
1266+
vi1 |= (qh1 << 4) & 0x00000010;
1267+
vi1 |= (qh1 << 11) & 0x00001000;
1268+
vi1 |= (qh1 << 18) & 0x00100000;
1269+
vi1 |= (qh1 << 25) & 0x10000000;
1270+
vi1 = __vsub4(vi1, 0x10101010);
1271+
sumi = __dp4a(vi1, ui1, sumi);
12441272

1273+
return sumi*d;
12451274
}
12461275

12471276
template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
@@ -1796,6 +1825,15 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
17961825
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
17971826
}
17981827

1828+
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
1829+
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
1830+
const int block_num_y = (nrows + GGML_CUDA_DMMV_Y - 1) / GGML_CUDA_DMMV_Y;
1831+
const dim3 block_nums(1, block_num_y, 1);
1832+
const dim3 block_dims(WARP_SIZE, GGML_CUDA_DMMV_Y, 1);
1833+
mul_mat_vec_q<QK5_0, block_q5_0, vec_dot_q5_0_q8_1>
1834+
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
1835+
}
1836+
17991837
static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) {
18001838
const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
18011839
dequantize_block<1, 1, convert_f16><<<num_blocks, CUDA_DEQUANTIZE_BLOCK_SIZE, 0, stream>>>(vx, y, k);
@@ -2319,6 +2357,9 @@ inline void ggml_cuda_op_mul_mat_vec_q(
23192357
case GGML_TYPE_Q4_1:
23202358
mul_mat_vec_q4_1_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
23212359
break;
2360+
case GGML_TYPE_Q5_0:
2361+
mul_mat_vec_q5_0_q8_1_cuda(src0_ddq_i, src1_q8_0, dst_ddf_i, ne00, nrows, cudaStream_main);
2362+
break;
23222363
default:
23232364
GGML_ASSERT(false);
23242365
break;
@@ -2875,7 +2916,7 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
28752916
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
28762917
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src0->ne[1] % GGML_CUDA_DMMV_Y == 0) {
28772918
bool use_mul_mat_vec_q = false;
2878-
use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1;
2919+
use_mul_mat_vec_q = src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0;
28792920
if (use_mul_mat_vec_q) {
28802921
ggml_cuda_op(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, false, false);
28812922
} else {

0 commit comments

Comments
 (0)