@@ -65,13 +65,8 @@ typedef sycl::half2 ggml_half2;
65
65
// QK = number of values after dequantization
66
66
// QK_K = super-block size
67
67
68
- #ifdef GGML_QKK_64
69
- #define QK_K 64
70
- #define K_SCALE_SIZE 4
71
- #else
72
68
#define QK_K 256
73
69
// K_SCALE_SIZE = byte count that enters the block_q4_K / block_q5_K size checks
// (presumably the length of their packed scales/mins array — confirm against the struct bodies)
#define K_SCALE_SIZE 12
74
- #endif // GGML_QKK_64
75
70
76
71
#if defined(GGML_COMMON_DECL_CUDA) || defined(GGML_COMMON_DECL_HIP) || defined(GGML_COMMON_DECL_SYCL)
77
72
// QR = QK / number of values before dequantization
@@ -131,13 +126,8 @@ typedef sycl::half2 ggml_half2;
131
126
// QI4_NL refers to QR4_NL, which is only defined on a following line; this is fine
// because a macro body is expanded where the macro is used, not where it is defined.
#define QI4_NL (QK4_NL / (4 *QR4_NL))
132
127
#define QR4_NL 2
133
128
134
- #if QK_K == 64
135
- #define QI4_XS QI4_NL
136
- #define QR4_XS QR4_NL
137
- #else
138
129
// IQ4_XS constants are derived from the super-block size QK_K.
// QI4_XS refers to QR4_XS, which is only defined on a following line; this is fine
// because a macro body is expanded where the macro is used, not where it is defined.
#define QI4_XS (QK_K / (4 *QR4_XS))
139
130
#define QR4_XS 8
140
- #endif
141
131
142
132
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP || GGML_COMMON_DECL_SYCL
143
133
@@ -228,36 +218,18 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wro
228
218
// weight is represented as x = a * q
229
219
// 16 blocks of 16 elements each
230
220
// Effectively 3.4375 bits per weight
231
- #ifdef GGML_QKK_64
232
- typedef struct {
233
- uint8_t hmask[QK_K/8 ]; // quants - high bit
234
- uint8_t qs[QK_K/4 ]; // quants - low 2 bits
235
- uint8_t scales[2 ];
236
- ggml_half d; // super-block scale
237
- } block_q3_K;
238
- static_assert (sizeof (block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
239
- #else
240
221
// Storage layout of one 3-bit K-quant super-block (QK_K values):
// each quant is split into a high bit (hmask, QK_K/8 bytes = 1 bit per value)
// and its low 2 bits (qs, QK_K/4 bytes = 2 bits per value), followed by a
// 12-byte scale area and a single ggml_half super-block scale.
typedef struct {
241
222
uint8_t hmask[QK_K/8 ]; // quants - high bit
242
223
uint8_t qs[QK_K/4 ]; // quants - low 2 bits
243
224
uint8_t scales[12 ]; // scales, quantized with 6 bits
244
225
ggml_half d; // super-block scale
245
226
} block_q3_K;
246
227
// The struct must contain no padding: its size has to equal the sum of its members' sizes.
static_assert (sizeof (block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
247
- #endif
248
228
249
229
// 4-bit quantization
250
230
// 8 blocks of 32 elements each
251
231
// weight is represented as x = a * q + b
252
232
// Effectively 4.5 bits per weight
253
- #ifdef GGML_QKK_64
254
- typedef struct {
255
- ggml_half d[2 ]; // super-block scales/mins
256
- uint8_t scales[2 ]; // 4-bit block scales/mins
257
- uint8_t qs[QK_K/2 ]; // 4--bit quants
258
- } block_q4_K;
259
- static_assert (sizeof (block_q4_K) == 2*sizeof(ggml_half) + QK_K/2 + 2, "wrong q4_K block size/padding");
260
- #else
261
233
typedef struct {
262
234
union {
263
235
struct {
@@ -270,21 +242,11 @@ typedef struct {
270
242
uint8_t qs[QK_K/2 ]; // 4-bit quants
271
243
} block_q4_K;
272
244
static_assert (sizeof (block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
273
- #endif
274
245
275
246
// 5-bit quantization
276
247
// 8 blocks of 32 elements each
277
248
// weight is represented as x = a * q + b
278
249
// Effectively 5.5 bits per weight
279
- #ifdef GGML_QKK_64
280
- typedef struct {
281
- ggml_half d; // super-block scale
282
- int8_t scales[QK_K/16 ]; // 8-bit block scales
283
- uint8_t qh[QK_K/8 ]; // quants, high bit
284
- uint8_t qs[QK_K/2 ]; // quants, low 4 bits
285
- } block_q5_K;
286
- static_assert (sizeof (block_q5_K) == sizeof(ggml_half) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
287
- #else
288
250
typedef struct {
289
251
union {
290
252
struct {
@@ -298,7 +260,6 @@ typedef struct {
298
260
uint8_t qs[QK_K/2 ]; // quants, low 4 bits
299
261
} block_q5_K;
300
262
static_assert (sizeof (block_q5_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
301
- #endif
302
263
303
264
// 6-bit quantization
304
265
// weight is represented as x = a * q
@@ -356,11 +317,7 @@ typedef struct {
356
317
static_assert (sizeof (block_iq3_xxs) == sizeof(ggml_half) + 3*(QK_K/8 ), "wrong iq3_xxs block size/padding");
357
318
358
319
// 3.4375 bpw
359
- #if QK_K == 64
360
- #define IQ3S_N_SCALE 2
361
- #else
362
320
// Parenthesized so the macro behaves as a single value inside larger expressions
// (e.g. `x / IQ3S_N_SCALE` or `x % IQ3S_N_SCALE`), independent of operator precedence.
#define IQ3S_N_SCALE (QK_K/64)
363
- #endif
364
321
typedef struct {
365
322
ggml_half d;
366
323
uint8_t qs[QK_K/4 ];
@@ -381,16 +338,9 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
381
338
// Storage layout of one IQ1_M super-block: grid indices are split into a low
// part (qs) and a high part plus shift bit (qh), followed by packed block scales.
typedef struct {
382
339
uint8_t qs[QK_K/8 ]; // grid index, low 8 bits
383
340
uint8_t qh[QK_K/16 ]; // grid index, high 3 bits + grid shift bit (for two groups of 8)
384
- #if QK_K == 64
385
- ggml_half d;
386
- #endif
387
341
uint8_t scales[QK_K/32 ]; // 3-bit block scales
388
342
} block_iq1_m;
389
- #if QK_K == 64
390
- static_assert (sizeof (block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding");
391
- #else
392
343
static_assert (sizeof (block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
393
- #endif
394
344
395
345
// Used by IQ1_M quants
396
346
typedef union {
@@ -406,17 +356,13 @@ typedef struct {
406
356
} block_iq4_nl;
407
357
static_assert (sizeof (block_iq4_nl) == sizeof(ggml_half) + QK4_NL/2, "wrong iq4_nl block size/padding");
408
358
409
- #if QK_K == 64
410
- #define block_iq4_xs block_iq4_nl
411
- #else
412
359
// Storage layout of one IQ4_XS super-block (QK_K values).
typedef struct {
413
360
ggml_half d; // NOTE(review): presumably the super-block scale, as in the other block types — confirm
414
361
uint16_t scales_h; // NOTE(review): presumably the high bits of the block scales — confirm against the (de)quantization code
415
362
uint8_t scales_l[QK_K/64 ]; // NOTE(review): presumably the low bits of the block scales — confirm against the (de)quantization code
416
363
uint8_t qs[QK_K/2 ]; // QK_K/2 bytes per super-block; presumably 4-bit quants, two per byte — confirm
417
364
} block_iq4_xs;
418
365
// The struct must contain no padding: its size has to equal the sum of its members' sizes.
static_assert (sizeof (block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t ) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
419
- #endif
420
366
421
367
#endif // GGML_COMMON_DECL
422
368
#endif // GGML_COMMON_DECL
0 commit comments