Skip to content

Commit 06e33d0

Browse files
committed
q4_0c continuous row layout
Introduce alternative quantized formats q4_0c and q8_0c, corresponding exactly to q4_0 and q8_0, except that quantized values and scales are laid out continuously in memory, and the nibbles in q4_0 are rearranged. This should simplify SIMD implementations, at the expense of slightly more complex scalar implementations.
1 parent e36189f commit 06e33d0

File tree

2 files changed

+198
-4
lines changed

2 files changed

+198
-4
lines changed

ggml.c

Lines changed: 196 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,13 @@ typedef struct {
648648
} block_q8_0;
649649
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
650650

651+
// Number of values in one q4_0c super-block (presumably four 32-value q4_0 blocks - TODO confirm)
#define QK4_0C (4*32)
652+
// Number of q4_0 blocks per q4_0c super-block
#define QK4_0C_MUL (QK4_0C / QK4_0)
653+
// TODO: nicer description - pseudostruct?
654+
// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
// i.e. for a row of k values: k/2 bytes of packed nibbles (QK4_0C/2 bytes per
// super-block), immediately followed by one float scale per QK4_0-value block.
655+
656+
// Number of values per q8_0c block
#define QK8_0C 32
657+
// q8_0c : uint8_t qs[n] || float d[n]
// i.e. for a row of k values: k one-byte quantized values, immediately
// followed by one float scale per QK8_0-value block.
651658

652659
// reference implementation for deterministic creation of model files
653660
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
@@ -937,6 +944,57 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
937944
#endif
938945
}
939946

947+
static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * restrict y, int k) {
948+
assert(k % QK4_0C == 0);
949+
const int nb = k / QK4_0;
950+
const int nsb = k / QK4_0C;
951+
952+
// Split y into nibbles section and scales section
953+
uint8_t * restrict qs = y;
954+
float * restrict ds = (float *) (y + QK4_0C/2 * nsb);
955+
956+
for (int i = 0; i < nb/2; i++) {
957+
// Interleave two output blocks in low and high nibbles
958+
const int src0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
959+
const int src1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
960+
const float * xb[2] = {
961+
x + QK4_0 * src0, // block in low nibbles
962+
x + QK4_0 * src1, // block in high nibbles
963+
};
964+
965+
// Find multiplier for each block
966+
float d[2];
967+
float id[2];
968+
for (int j = 0; j < 2; j++) {
969+
float amax = 0.0f; // absolute max
970+
971+
for (int l = 0; l < QK4_0; l++) {
972+
const float v = xb[j][l];
973+
amax = MAX(amax, fabsf(v));
974+
}
975+
976+
d[j] = amax / ((1 << 3) - 1);
977+
id[j] = d[j] ? 1.0f/d[j] : 0.0f;
978+
}
979+
980+
ds[src0] = d[0];
981+
ds[src1] = d[1];
982+
983+
for (int l = 0; l < QK4_0; l++) {
984+
const float v0 = xb[0][l]*id[0];
985+
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
986+
987+
const float v1 = xb[1][l]*id[1];
988+
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
989+
990+
assert(vi0 < 16);
991+
assert(vi1 < 16);
992+
993+
qs[i*QK4_0 + l] = vi0 | (vi1 << 4);
994+
}
995+
}
996+
}
997+
940998
static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
941999
assert(k % QK4_1 == 0);
9421000
const int nb = k / QK4_1;
@@ -1377,6 +1435,40 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
13771435
#endif
13781436
}
13791437

1438+
// reference implementation for deterministic creation of model files
1439+
static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
1440+
assert(k % QK8_0 == 0);
1441+
const int nb = k / QK8_0;
1442+
1443+
uint8_t * restrict qs = y;
1444+
float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
1445+
1446+
for (int i = 0; i < nb; i++) {
1447+
float amax = 0.0f; // absolute max
1448+
1449+
for (int l = 0; l < QK8_0; l++) {
1450+
const float v = x[i*QK8_0 + l];
1451+
amax = MAX(amax, fabsf(v));
1452+
}
1453+
1454+
const float d = amax / ((1 << 7) - 1);
1455+
const float id = d ? 1.0f/d : 0.0f;
1456+
1457+
ds[i] = d;
1458+
1459+
for (int l = 0; l < QK8_0; ++l) {
1460+
const float v = x[i*QK8_0 + l]*id;
1461+
qs[i*QK8_0 + l] = roundf(v);
1462+
}
1463+
}
1464+
}
1465+
1466+
// Quantize k floats from x into the q8_0c layout at vy.
// Currently this simply forwards to the scalar reference implementation.
// k must be a multiple of QK8_0.
static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK8_0 == 0);

    quantize_row_q8_0c_reference(x, vy, k);
}
1471+
13801472
static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
13811473
assert(k % QK4_0 == 0);
13821474
const int nb = k / QK4_0;
@@ -1495,6 +1587,41 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
14951587
#endif
14961588
}
14971589

1590+
static void dequantize_row_q4_0c(const void * restrict vx, float * restrict y, int k) {
1591+
assert(k % QK4_0C == 0);
1592+
const int nb = k / QK4_0;
1593+
const int nsb = k / QK4_0C;
1594+
1595+
// Split vx into nibbles section and scales section
1596+
const uint8_t * restrict qs = vx;
1597+
const float * restrict ds = (const float *) ((const uint8_t *) vx + QK4_0C/2 * nsb);
1598+
1599+
// scalar
1600+
for (int i = 0; i < nb/2; i++) {
1601+
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
1602+
const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
1603+
1604+
const float d0 = ds[dst0];
1605+
const float d1 = ds[dst1];
1606+
1607+
for (int l = 0; l < QK4_0; l++) {
1608+
const uint8_t vi = qs[i * QK4_0 + l];
1609+
1610+
const int8_t vi0 = vi & 0xf;
1611+
const int8_t vi1 = vi >> 4;
1612+
1613+
const float v0 = (vi0 - 8)*d0;
1614+
const float v1 = (vi1 - 8)*d1;
1615+
1616+
y[dst0*QK4_0 + l] = v0;
1617+
y[dst1*QK4_0 + l] = v1;
1618+
1619+
assert(!isnan(y[dst0*QK4_0 + l]));
1620+
assert(!isnan(y[dst1*QK4_0 + l]));
1621+
}
1622+
}
1623+
}
1624+
14981625
static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
14991626
assert(k % QK4_1 == 0);
15001627
const int nb = k / QK4_1;
@@ -1631,6 +1758,7 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
16311758
}
16321759

16331760
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1761+
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
16341762
static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
16351763
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
16361764

@@ -1642,6 +1770,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
16421770
.quantize_row_q_dot = quantize_row_q8_0,
16431771
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
16441772
},
1773+
[GGML_TYPE_Q4_0C] = {
1774+
.dequantize_row_q = dequantize_row_q4_0c,
1775+
//.quantize_row_q = quantize_row_q4_0c,
1776+
.quantize_row_q = (quantize_row_q_t) quantize_row_q4_0c_reference,
1777+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0c_reference,
1778+
.quantize_row_q_dot = quantize_row_q8_0c,
1779+
.vec_dot_q = ggml_vec_dot_q4_0c_q8_0c,
1780+
},
16451781
[GGML_TYPE_Q4_1] = {
16461782
.dequantize_row_q = dequantize_row_q4_1,
16471783
.quantize_row_q = quantize_row_q4_1,
@@ -2460,6 +2596,51 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
24602596
*s = sumf;
24612597
}
24622598

2599+
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2600+
const int nb = n / QK4_0;
2601+
const int nsb = n / QK4_0C;
2602+
2603+
assert(n % QK4_0C == 0);
2604+
2605+
// Split into nibbles and scales sections
2606+
const uint8_t * restrict xqs = vx;
2607+
const float * restrict xds = (const float *) ((const uint8_t *) vx + nsb*QK4_0C/2);
2608+
const int8_t * restrict yqs = vy;
2609+
const float * restrict yds = (const float *) ((const uint8_t *) vy + nb*QK8_0C);
2610+
2611+
float sumf = 0.0;
2612+
2613+
// scalar
2614+
for (int i = 0; i < nb/2; i++) {
2615+
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
2616+
const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
2617+
2618+
const float dx0 = xds[dst0];
2619+
const float dx1 = xds[dst1];
2620+
const float dy0 = yds[dst0];
2621+
const float dy1 = yds[dst1];
2622+
2623+
int sumi0 = 0;
2624+
int sumi1 = 0;
2625+
2626+
for (int l = 0; l < QK4_0; l++) {
2627+
const uint8_t v0 = xqs[i*QK4_0 + l];
2628+
2629+
const int i0 = (int8_t) (v0 & 0xf) - 8;
2630+
const int i1 = (int8_t) (v0 >> 4) - 8;
2631+
2632+
const int i2 = yqs[dst0*QK4_0 + l];
2633+
const int i3 = yqs[dst1*QK4_0 + l];
2634+
2635+
sumi0 += i0*i2;
2636+
sumi1 += i1*i3;
2637+
}
2638+
sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
2639+
}
2640+
2641+
*s = sumf;
2642+
}
2643+
24632644
static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
24642645
const int nb = n / QK8_0;
24652646

@@ -3004,54 +3185,61 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
30043185
[GGML_TYPE_F32] = 1,
30053186
[GGML_TYPE_F16] = 1,
30063187
[GGML_TYPE_Q4_0] = QK4_0,
3188+
[GGML_TYPE_Q4_0C] = QK4_0C,
30073189
[GGML_TYPE_Q4_1] = QK4_1,
30083190
[GGML_TYPE_Q4_2] = QK4_2,
30093191
[GGML_TYPE_Q8_0] = QK8_0,
3192+
[GGML_TYPE_Q8_0C] = QK8_0C,
30103193
[GGML_TYPE_I8] = 1,
30113194
[GGML_TYPE_I16] = 1,
30123195
[GGML_TYPE_I32] = 1,
30133196
};
3014-
static_assert(GGML_TYPE_COUNT == 9, "GGML_BLCK_SIZE is outdated");
3197+
static_assert(GGML_TYPE_COUNT == 11, "GGML_BLCK_SIZE is outdated");
30153198

30163199
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
30173200
[GGML_TYPE_F32] = sizeof(float),
30183201
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
30193202
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
3203+
[GGML_TYPE_Q4_0C] = 4*sizeof(block_q4_0),
30203204
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
30213205
[GGML_TYPE_Q4_2] = sizeof(block_q4_2),
30223206
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
3207+
[GGML_TYPE_Q8_0C] = sizeof(block_q8_0),
30233208
[GGML_TYPE_I8] = sizeof(int8_t),
30243209
[GGML_TYPE_I16] = sizeof(int16_t),
30253210
[GGML_TYPE_I32] = sizeof(int32_t),
30263211
};
3027-
static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_SIZE is outdated");
3212+
static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_SIZE is outdated");
30283213

30293214

30303215
// Short lowercase name string for each ggml_type, indexed by the enum value.
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
30313216
[GGML_TYPE_F32] = "f32",
30323217
[GGML_TYPE_F16] = "f16",
30333218
[GGML_TYPE_Q4_0] = "q4_0",
3219+
[GGML_TYPE_Q4_0C] = "q4_0c",
30343220
[GGML_TYPE_Q4_1] = "q4_1",
30353221
[GGML_TYPE_Q4_2] = "q4_2",
30363222
[GGML_TYPE_Q8_0] = "q8_0",
3223+
[GGML_TYPE_Q8_0C] = "q8_0c",
30373224
[GGML_TYPE_I8] = "i8",
30383225
[GGML_TYPE_I16] = "i16",
30393226
[GGML_TYPE_I32] = "i32",
30403227
};
3041-
static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_NAME is outdated");
3228+
static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_NAME is outdated");
30423229

30433230
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
30443231
[GGML_TYPE_F32] = false,
30453232
[GGML_TYPE_F16] = false,
30463233
[GGML_TYPE_Q4_0] = true,
3234+
[GGML_TYPE_Q4_0C] = true,
30473235
[GGML_TYPE_Q4_1] = true,
30483236
[GGML_TYPE_Q4_2] = true,
30493237
[GGML_TYPE_Q8_0] = true,
30503238
[GGML_TYPE_I8] = false,
30513239
[GGML_TYPE_I16] = false,
30523240
[GGML_TYPE_I32] = false,
30533241
};
3054-
static_assert(GGML_TYPE_COUNT == 9, "GGML_IS_QUANTIZED is outdated");
3242+
static_assert(GGML_TYPE_COUNT == 11, "GGML_IS_QUANTIZED is outdated");
30553243

30563244
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
30573245
"NONE",
@@ -7873,9 +8061,11 @@ static void ggml_compute_forward_mul_mat(
78738061
struct ggml_tensor * dst) {
78748062
switch (src0->type) {
78758063
case GGML_TYPE_Q4_0:
8064+
case GGML_TYPE_Q4_0C:
78768065
case GGML_TYPE_Q4_1:
78778066
case GGML_TYPE_Q4_2:
78788067
case GGML_TYPE_Q8_0:
8068+
case GGML_TYPE_Q8_0C:
78798069
{
78808070
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
78818071
} break;
@@ -8129,9 +8319,11 @@ static void ggml_compute_forward_get_rows(
81298319
struct ggml_tensor * dst) {
81308320
switch (src0->type) {
81318321
case GGML_TYPE_Q4_0:
8322+
case GGML_TYPE_Q4_0C:
81328323
case GGML_TYPE_Q4_1:
81338324
case GGML_TYPE_Q4_2:
81348325
case GGML_TYPE_Q8_0:
8326+
case GGML_TYPE_Q8_0C:
81358327
{
81368328
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
81378329
} break;

ggml.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,8 @@ enum ggml_type {
206206
GGML_TYPE_Q4_1 = 3,
207207
GGML_TYPE_Q4_2 = 4,
208208
GGML_TYPE_Q8_0 = 5,
209+
GGML_TYPE_Q4_0C = 6,
210+
GGML_TYPE_Q8_0C = 7,
209211
GGML_TYPE_I8,
210212
GGML_TYPE_I16,
211213
GGML_TYPE_I32,

0 commit comments

Comments
 (0)