@@ -204,23 +204,31 @@ typedef void (*ggml_cuda_op_t)(
204
204
// QR = QK / number of values before dequantization
205
205
// QI = number of 32 bit integers before dequantization
206
206
207
+ #define Q4_0DM (1 .0f /8 .0f )
208
+ #define Q4_0D (x ) (((x)*Q4_0DM) / 127 .0f )
209
+
207
210
#define QK4_0 32
208
211
#define QR4_0 2
209
212
#define QI4_0 (QK4_0 / (4 * QR4_0))
210
213
typedef struct {
211
- half d; // delta
214
+ int8_t d; // delta
212
215
uint8_t qs[QK4_0 / 2 ]; // nibbles / quants
213
216
} block_q4_0;
214
- static_assert (sizeof (block_q4_0) == sizeof(ggml_fp16_t ) + QK4_0 / 2, "wrong q4_0 block size/padding");
217
+ static_assert (sizeof (block_q4_0) == sizeof(int8_t ) + QK4_0 / 2, "wrong q4_0 block size/padding");
218
+
219
+ #define Q4_1DM (2 .0f /15 .0f )
220
+ #define Q4_1MM (2 .0f )
221
+ #define Q4_1D (x ) ( (((x) & 0xFF )*Q4_1DM) / 255 .0f )
222
+ #define Q4_1M (x ) (-1 .0f + (((x) >> 8 )*Q4_1MM) / 255 .0f )
215
223
216
224
#define QK4_1 32
217
225
#define QR4_1 2
218
226
#define QI4_1 (QK4_1 / (4 * QR4_1))
219
227
typedef struct {
220
- half2 dm; // dm.x = delta, dm.y = min
221
- uint8_t qs[QK4_1 / 2 ]; // nibbles / quants
228
+ uint16_t dm; // 8-bit delta + 8-bit min (can be adjusted easily)
229
+ uint8_t qs[QK4_1 / 2 ]; // nibbles / quants
222
230
} block_q4_1;
223
- static_assert (sizeof (block_q4_1) == sizeof(ggml_fp16_t ) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
231
+ static_assert (sizeof (block_q4_1) == sizeof(uint16_t ) + QK4_1 / 2, "wrong q4_1 block size/padding");
224
232
225
233
#define QK5_0 32
226
234
#define QR5_0 2
@@ -232,15 +240,20 @@ typedef struct {
232
240
} block_q5_0;
233
241
static_assert (sizeof (block_q5_0) == sizeof(ggml_fp16_t ) + sizeof(uint32_t ) + QK5_0 / 2, "wrong q5_0 block size/padding");
234
242
243
+ #define Q5_1DM (2 .0f /31 .0f )
244
+ #define Q5_1MM (2 .0f )
245
+ #define Q5_1D (x ) ( (((x) & 0x0F )*Q5_1DM) / 15 .0f )
246
+ #define Q5_1M (x ) (-1 .0f + (((x) >> 4 )*Q5_1MM) / 15 .0f )
247
+
235
248
#define QK5_1 32
236
249
#define QR5_1 2
237
250
#define QI5_1 (QK5_1 / (4 * QR5_1))
238
251
typedef struct {
239
- half2 dm; // dm.x = delta, dm.y = min
252
+ uint8_t dm; // 4-bit delta + 4-bit min
240
253
uint8_t qh[4 ]; // 5-th bit of quants
241
254
uint8_t qs[QK5_1 / 2 ]; // nibbles / quants
242
255
} block_q5_1;
243
- static_assert (sizeof (block_q5_1) == 2 * sizeof(ggml_fp16_t ) + sizeof(uint32_t ) + QK5_1 / 2, "wrong q5_1 block size/padding");
256
+ static_assert (sizeof (block_q5_1) == sizeof(uint8_t ) + sizeof(uint32_t ) + QK5_1 / 2, "wrong q5_1 block size/padding");
244
257
245
258
#define QK8_0 32
246
259
#define QR8_0 1
@@ -506,7 +519,7 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
506
519
static __device__ __forceinline__ void dequantize_q4_0 (const void * vx, const int ib, const int iqs, dfloat2 & v){
507
520
const block_q4_0 * x = (const block_q4_0 *) vx;
508
521
509
- const dfloat d = x[ib].d ;
522
+ const dfloat d = Q4_0D ( x[ib].d ) ;
510
523
511
524
const int vui = x[ib].qs [iqs];
512
525
@@ -525,8 +538,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
525
538
static __device__ __forceinline__ void dequantize_q4_1 (const void * vx, const int ib, const int iqs, dfloat2 & v){
526
539
const block_q4_1 * x = (const block_q4_1 *) vx;
527
540
528
- const dfloat d = __low2half (x[ib].dm );
529
- const dfloat m = __high2half (x[ib].dm );
541
+ const dfloat d = Q4_1D (x[ib].dm );
542
+ const dfloat m = Q4_1M (x[ib].dm );
530
543
531
544
const int vui = x[ib].qs [iqs];
532
545
@@ -568,8 +581,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
568
581
static __device__ __forceinline__ void dequantize_q5_1 (const void * vx, const int ib, const int iqs, dfloat2 & v){
569
582
const block_q5_1 * x = (const block_q5_1 *) vx;
570
583
571
- const dfloat d = __low2half (x[ib].dm );
572
- const dfloat m = __high2half (x[ib].dm );
584
+ const dfloat d = Q5_1D (x[ib].dm );
585
+ const dfloat m = Q5_1M (x[ib].dm );
573
586
574
587
uint32_t qh;
575
588
memcpy (&qh, x[ib].qh , sizeof (qh));
@@ -2041,7 +2054,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
2041
2054
u[2 *i+1 ] = get_int_from_int8_aligned (bq8_1->qs , iqs + i + QI4_0);
2042
2055
}
2043
2056
2044
- return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d , bq8_1->ds );
2057
+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, Q4_0D ( bq4_0->d ) , bq8_1->ds );
2045
2058
}
2046
2059
2047
2060
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0 (int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2135,7 +2148,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
2135
2148
u[2 *i+1 ] = get_int_from_int8_aligned (bq8_1->qs , iqs + i + QI4_1);
2136
2149
}
2137
2150
2138
- return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm , bq8_1->ds );
2151
+ const float d = Q4_1D (bq4_1->dm );
2152
+ const float m = Q4_1M (bq4_1->dm );
2153
+
2154
+ const float2 dm = {d, m};
2155
+
2156
+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, dm, bq8_1->ds );
2139
2157
}
2140
2158
2141
2159
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1 (int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2341,7 +2359,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
2341
2359
u[2 *i+1 ] = get_int_from_int8_aligned (bq8_1->qs , iqs + i + QI5_1);
2342
2360
}
2343
2361
2344
- return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm , bq8_1->ds );
2362
+ const float d = Q5_1D (bq4_1->dm );
2363
+ const float m = Q5_1M (bq4_1->dm );
2364
+
2365
+ const float2 dm = {d, m};
2366
+
2367
+ return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, dm, bq8_1->ds );
2345
2368
}
2346
2369
2347
2370
template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1 (int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
0 commit comments