Skip to content

Commit 5c55b33

Browse files
committed
q4_0c contiguous row layout
Introduce alternative quantized formats q4_0c and q8_0c, corresponding exactly to q4_0 and q8_0, except that the quantized values and the scales are each laid out contiguously in memory, and the nibbles in q4_0c are rearranged. This should simplify SIMD implementations, at the expense of slightly more complex scalar implementations.
1 parent a5c893a commit 5c55b33

File tree

2 files changed

+206
-4
lines changed

2 files changed

+206
-4
lines changed

ggml.c

Lines changed: 204 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,13 @@ typedef struct {
648648
} block_q8_0;
649649
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
650650

651+
// Number of values in one q4_0c super-block (4 groups of 32 values).
#define QK4_0C (4*32)
652+
// Number of QK4_0-sized blocks contained in one q4_0c super-block.
#define QK4_0C_MUL (QK4_0C / QK4_0)
653+
// TODO: nicer description - pseudostruct?
654+
// q4_0c row layout: all packed nibbles first (QK4_0C/2 bytes per super-block),
// followed by the float scales, one scale per QK4_0 values.
// q4_0c : (uint8_t[QK4_0C/2]) qs[nb] || float d[n]
655+
656+
// Number of values that share one scale in the q8_0c layout.
#define QK8_0C 32
657+
// q8_0c row layout: one quantized byte per value, followed by the float scales.
// NOTE(review): the dot-product code reads these bytes as int8_t, i.e. the quants are signed.
// q8_0c : uint8_t qs[n] || float d[n]
651658

652659
// reference implementation for deterministic creation of model files
653660
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
@@ -937,6 +944,57 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
937944
#endif
938945
}
939946

947+
static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * restrict y, int k) {
948+
assert(k % QK4_0C == 0);
949+
const int nb = k / QK4_0;
950+
const int nsb = k / QK4_0C;
951+
952+
// Split y into nibbles section and scales section
953+
uint8_t * restrict qs = y;
954+
float * restrict ds = (float *) (y + QK4_0C/2 * nsb);
955+
956+
for (int i = 0; i < nb/2; i++) {
957+
// Interleave two output blocks in low and high nibbles
958+
const int src0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
959+
const int src1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
960+
const float * xb[2] = {
961+
x + QK4_0 * src0, // block in low nibbles
962+
x + QK4_0 * src1, // block in high nibbles
963+
};
964+
965+
// Find multiplier for each block
966+
float d[2];
967+
float id[2];
968+
for (int j = 0; j < 2; j++) {
969+
float amax = 0.0f; // absolute max
970+
971+
for (int l = 0; l < QK4_0; l++) {
972+
const float v = xb[j][l];
973+
amax = MAX(amax, fabsf(v));
974+
}
975+
976+
d[j] = amax / ((1 << 3) - 1);
977+
id[j] = d[j] ? 1.0f/d[j] : 0.0f;
978+
}
979+
980+
ds[src0] = d[0];
981+
ds[src1] = d[1];
982+
983+
for (int l = 0; l < QK4_0; l++) {
984+
const float v0 = xb[0][l]*id[0];
985+
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
986+
987+
const float v1 = xb[1][l]*id[1];
988+
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
989+
990+
assert(vi0 < 16);
991+
assert(vi1 < 16);
992+
993+
qs[i*QK4_0 + l] = vi0 | (vi1 << 4);
994+
}
995+
}
996+
}
997+
940998
static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
941999
assert(k % QK4_1 == 0);
9421000
const int nb = k / QK4_1;
@@ -1377,6 +1435,40 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
13771435
#endif
13781436
}
13791437

1438+
// reference implementation for deterministic creation of model files
1439+
static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
1440+
assert(k % QK8_0 == 0);
1441+
const int nb = k / QK8_0;
1442+
1443+
uint8_t * restrict qs = y;
1444+
float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
1445+
1446+
for (int i = 0; i < nb; i++) {
1447+
float amax = 0.0f; // absolute max
1448+
1449+
for (int l = 0; l < QK8_0; l++) {
1450+
const float v = x[i*QK8_0 + l];
1451+
amax = MAX(amax, fabsf(v));
1452+
}
1453+
1454+
const float d = amax / ((1 << 7) - 1);
1455+
const float id = d ? 1.0f/d : 0.0f;
1456+
1457+
ds[i] = d;
1458+
1459+
for (int l = 0; l < QK8_0; ++l) {
1460+
const float v = x[i*QK8_0 + l]*id;
1461+
qs[i*QK8_0 + l] = roundf(v);
1462+
}
1463+
}
1464+
}
1465+
1466+
// Quantize a row of k floats from x into the q8_0c layout at vy.
// This simply forwards to the scalar reference implementation.
static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
    // The row must consist of whole blocks
    assert((k % QK8_0) == 0);

    quantize_row_q8_0c_reference(x, vy, k);
}
1471+
13801472
static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
13811473
assert(k % QK4_0 == 0);
13821474
const int nb = k / QK4_0;
@@ -1495,6 +1587,41 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
14951587
#endif
14961588
}
14971589

1590+
static void dequantize_row_q4_0c(const void * restrict vx, float * restrict y, int k) {
1591+
assert(k % QK4_0C == 0);
1592+
const int nb = k / QK4_0;
1593+
const int nsb = k / QK4_0C;
1594+
1595+
// Split vx into nibbles section and scales section
1596+
const uint8_t * restrict qs = vx;
1597+
const float * restrict ds = (const float *) ((const uint8_t *) vx + QK4_0C/2 * nsb);
1598+
1599+
// scalar
1600+
for (int i = 0; i < nb/2; i++) {
1601+
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
1602+
const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
1603+
1604+
const float d0 = ds[dst0];
1605+
const float d1 = ds[dst1];
1606+
1607+
for (int l = 0; l < QK4_0; l++) {
1608+
const uint8_t vi = qs[i * QK4_0 + l];
1609+
1610+
const int8_t vi0 = vi & 0xf;
1611+
const int8_t vi1 = vi >> 4;
1612+
1613+
const float v0 = (vi0 - 8)*d0;
1614+
const float v1 = (vi1 - 8)*d1;
1615+
1616+
y[dst0*QK4_0 + l] = v0;
1617+
y[dst1*QK4_0 + l] = v1;
1618+
1619+
assert(!isnan(y[dst0*QK4_0 + l]));
1620+
assert(!isnan(y[dst1*QK4_0 + l]));
1621+
}
1622+
}
1623+
}
1624+
14981625
static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
14991626
assert(k % QK4_1 == 0);
15001627
const int nb = k / QK4_1;
@@ -1631,6 +1758,7 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
16311758
}
16321759

16331760
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
1761+
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
16341762
static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
16351763
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
16361764

@@ -1642,6 +1770,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
16421770
.quantize_row_q_dot = quantize_row_q8_0,
16431771
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
16441772
},
1773+
[GGML_TYPE_Q4_0C] = {
1774+
.dequantize_row_q = dequantize_row_q4_0c,
1775+
//.quantize_row_q = quantize_row_q4_0c,
1776+
.quantize_row_q = (quantize_row_q_t) quantize_row_q4_0c_reference,
1777+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0c_reference,
1778+
.quantize_row_q_dot = quantize_row_q8_0c,
1779+
.vec_dot_q = ggml_vec_dot_q4_0c_q8_0c,
1780+
},
16451781
[GGML_TYPE_Q4_1] = {
16461782
.dequantize_row_q = dequantize_row_q4_1,
16471783
.quantize_row_q = quantize_row_q4_1,
@@ -1663,6 +1799,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
16631799
.quantize_row_q_dot = quantize_row_q8_0,
16641800
.vec_dot_q = NULL, // TODO
16651801
},
1802+
[GGML_TYPE_Q8_0C] = {
1803+
.dequantize_row_q = NULL,
1804+
.quantize_row_q = quantize_row_q8_0c,
1805+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0c_reference,
1806+
.quantize_row_q_dot = quantize_row_q8_0c,
1807+
.vec_dot_q = NULL,
1808+
},
16661809
};
16671810

16681811
// For internal test use
@@ -2460,6 +2603,51 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
24602603
*s = sumf;
24612604
}
24622605

2606+
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2607+
const int nb = n / QK4_0;
2608+
const int nsb = n / QK4_0C;
2609+
2610+
assert(n % QK4_0C == 0);
2611+
2612+
// Split into nibbles and scales sections
2613+
const uint8_t * restrict xqs = vx;
2614+
const float * restrict xds = (const float *) ((const uint8_t *) vx + nsb*QK4_0C/2);
2615+
const int8_t * restrict yqs = vy;
2616+
const float * restrict yds = (const float *) ((const uint8_t *) vy + nb*QK8_0C);
2617+
2618+
float sumf = 0.0;
2619+
2620+
// scalar
2621+
for (int i = 0; i < nb/2; i++) {
2622+
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
2623+
const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
2624+
2625+
const float dx0 = xds[dst0];
2626+
const float dx1 = xds[dst1];
2627+
const float dy0 = yds[dst0];
2628+
const float dy1 = yds[dst1];
2629+
2630+
int sumi0 = 0;
2631+
int sumi1 = 0;
2632+
2633+
for (int l = 0; l < QK4_0; l++) {
2634+
const uint8_t v0 = xqs[i*QK4_0 + l];
2635+
2636+
const int i0 = (int8_t) (v0 & 0xf) - 8;
2637+
const int i1 = (int8_t) (v0 >> 4) - 8;
2638+
2639+
const int i2 = yqs[dst0*QK4_0 + l];
2640+
const int i3 = yqs[dst1*QK4_0 + l];
2641+
2642+
sumi0 += i0*i2;
2643+
sumi1 += i1*i3;
2644+
}
2645+
sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
2646+
}
2647+
2648+
*s = sumf;
2649+
}
2650+
24632651
static void ggml_vec_dot_q4_1_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
24642652
const int nb = n / QK8_0;
24652653

@@ -3004,54 +3192,62 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
30043192
[GGML_TYPE_F32] = 1,
30053193
[GGML_TYPE_F16] = 1,
30063194
[GGML_TYPE_Q4_0] = QK4_0,
3195+
[GGML_TYPE_Q4_0C] = QK4_0C,
30073196
[GGML_TYPE_Q4_1] = QK4_1,
30083197
[GGML_TYPE_Q4_2] = QK4_2,
30093198
[GGML_TYPE_Q8_0] = QK8_0,
3199+
[GGML_TYPE_Q8_0C] = QK8_0C,
30103200
[GGML_TYPE_I8] = 1,
30113201
[GGML_TYPE_I16] = 1,
30123202
[GGML_TYPE_I32] = 1,
30133203
};
3014-
static_assert(GGML_TYPE_COUNT == 9, "GGML_BLCK_SIZE is outdated");
3204+
static_assert(GGML_TYPE_COUNT == 11, "GGML_BLCK_SIZE is outdated");
30153205

30163206
// Size in bytes of one block of each type, indexed by ggml_type.
// A "block" holds GGML_BLCK_SIZE[type] values.
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
30173207
[GGML_TYPE_F32] = sizeof(float),
30183208
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
30193209
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
3210+
// a q4_0c block covers QK4_0C = 4*32 values, so it takes the space of four q4_0 blocks
[GGML_TYPE_Q4_0C] = 4*sizeof(block_q4_0),
30203211
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
30213212
[GGML_TYPE_Q4_2] = sizeof(block_q4_2),
30223213
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
3214+
// q8_0c stores the same data as q8_0 (different arrangement), hence the same size
[GGML_TYPE_Q8_0C] = sizeof(block_q8_0),
30233215
[GGML_TYPE_I8] = sizeof(int8_t),
30243216
[GGML_TYPE_I16] = sizeof(int16_t),
30253217
[GGML_TYPE_I32] = sizeof(int32_t),
30263218
};
3027-
static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_SIZE is outdated");
3219+
static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_SIZE is outdated");
30283220

30293221

30303222
// Short printable name of each type, indexed by ggml_type.
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
30313223
[GGML_TYPE_F32] = "f32",
30323224
[GGML_TYPE_F16] = "f16",
30333225
[GGML_TYPE_Q4_0] = "q4_0",
3226+
[GGML_TYPE_Q4_0C] = "q4_0c",
30343227
[GGML_TYPE_Q4_1] = "q4_1",
30353228
[GGML_TYPE_Q4_2] = "q4_2",
30363229
[GGML_TYPE_Q8_0] = "q8_0",
3230+
[GGML_TYPE_Q8_0C] = "q8_0c",
30373231
[GGML_TYPE_I8] = "i8",
30383232
[GGML_TYPE_I16] = "i16",
30393233
[GGML_TYPE_I32] = "i32",
30403234
};
3041-
static_assert(GGML_TYPE_COUNT == 9, "GGML_TYPE_NAME is outdated");
3235+
static_assert(GGML_TYPE_COUNT == 11, "GGML_TYPE_NAME is outdated");
30423236

30433237
// Flags which types are quantized formats, indexed by ggml_type.
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
30443238
[GGML_TYPE_F32] = false,
30453239
[GGML_TYPE_F16] = false,
30463240
[GGML_TYPE_Q4_0] = true,
3241+
[GGML_TYPE_Q4_0C] = true,
30473242
[GGML_TYPE_Q4_1] = true,
30483243
[GGML_TYPE_Q4_2] = true,
30493244
[GGML_TYPE_Q8_0] = true,
3245+
[GGML_TYPE_Q8_0C] = true,
30503246
[GGML_TYPE_I8] = false,
30513247
[GGML_TYPE_I16] = false,
30523248
[GGML_TYPE_I32] = false,
30533249
};
3054-
static_assert(GGML_TYPE_COUNT == 9, "GGML_IS_QUANTIZED is outdated");
3250+
static_assert(GGML_TYPE_COUNT == 11, "GGML_IS_QUANTIZED is outdated");
30553251

30563252
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
30573253
"NONE",
@@ -7873,9 +8069,11 @@ static void ggml_compute_forward_mul_mat(
78738069
struct ggml_tensor * dst) {
78748070
switch (src0->type) {
78758071
case GGML_TYPE_Q4_0:
8072+
case GGML_TYPE_Q4_0C:
78768073
case GGML_TYPE_Q4_1:
78778074
case GGML_TYPE_Q4_2:
78788075
case GGML_TYPE_Q8_0:
8076+
case GGML_TYPE_Q8_0C:
78798077
{
78808078
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
78818079
} break;
@@ -8129,9 +8327,11 @@ static void ggml_compute_forward_get_rows(
81298327
struct ggml_tensor * dst) {
81308328
switch (src0->type) {
81318329
case GGML_TYPE_Q4_0:
8330+
case GGML_TYPE_Q4_0C:
81328331
case GGML_TYPE_Q4_1:
81338332
case GGML_TYPE_Q4_2:
81348333
case GGML_TYPE_Q8_0:
8334+
case GGML_TYPE_Q8_0C:
81358335
{
81368336
ggml_compute_forward_get_rows_q(params, src0, src1, dst);
81378337
} break;

ggml.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,8 @@ enum ggml_type {
206206
GGML_TYPE_Q4_1 = 3,
207207
GGML_TYPE_Q4_2 = 4,
208208
GGML_TYPE_Q8_0 = 5,
209+
GGML_TYPE_Q4_0C = 6,
210+
GGML_TYPE_Q8_0C = 7,
209211
GGML_TYPE_I8,
210212
GGML_TYPE_I16,
211213
GGML_TYPE_I32,

0 commit comments

Comments
 (0)