Skip to content

Commit a1e6fb9

Browse files
committed
q4_0c continuous row layout
Introduce alternative quantized formats q4_0c and q8_0c, corresponding exactly to q4_0 and q8_0, except that quantized values and scales are laid out contiguously in memory, and the nibbles in q4_0c are rearranged. This should simplify SIMD implementations, at the expense of slightly more complex scalar implementations.
1 parent 2219467 commit a1e6fb9

File tree

2 files changed

+207
-4
lines changed

2 files changed

+207
-4
lines changed

ggml.c

Lines changed: 205 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -772,6 +772,14 @@ typedef struct {
772772
} block_q8_1;
773773
static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
774774

775+
#define QK4_0C (4*32)
776+
#define QK4_0C_MUL (QK4_0C / QK4_0)
777+
// TODO: nicer description - pseudostruct?
778+
// q4_0c : uint8_t qs[nsb][QK4_0C/2] || float d[nb]   (nsb = n/QK4_0C super-blocks, nb = n/QK4_0 blocks)
779+
780+
#define QK8_0C 32
781+
// q8_0c : int8_t qs[n] || float d[n/QK8_0C]
782+
775783
// reference implementation for deterministic creation of model files
776784
static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) {
777785
assert(k % QK4_0 == 0);
@@ -1117,6 +1125,57 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
11171125
#endif
11181126
}
11191127

1128+
static void quantize_row_q4_0c_reference(const float * restrict x, uint8_t * restrict y, int k) {
1129+
assert(k % QK4_0C == 0);
1130+
const int nb = k / QK4_0;
1131+
const int nsb = k / QK4_0C;
1132+
1133+
// Split y into nibbles section and scales section
1134+
uint8_t * restrict qs = y;
1135+
float * restrict ds = (float *) (y + QK4_0C/2 * nsb);
1136+
1137+
for (int i = 0; i < nb/2; i++) {
1138+
// Interleave two output blocks in low and high nibbles
1139+
const int src0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
1140+
const int src1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
1141+
const float * xb[2] = {
1142+
x + QK4_0 * src0, // block in low nibbles
1143+
x + QK4_0 * src1, // block in high nibbles
1144+
};
1145+
1146+
// Find multiplier for each block
1147+
float d[2];
1148+
float id[2];
1149+
for (int j = 0; j < 2; j++) {
1150+
float amax = 0.0f; // absolute max
1151+
1152+
for (int l = 0; l < QK4_0; l++) {
1153+
const float v = xb[j][l];
1154+
amax = MAX(amax, fabsf(v));
1155+
}
1156+
1157+
d[j] = amax / ((1 << 3) - 1);
1158+
id[j] = d[j] ? 1.0f/d[j] : 0.0f;
1159+
}
1160+
1161+
ds[src0] = d[0];
1162+
ds[src1] = d[1];
1163+
1164+
for (int l = 0; l < QK4_0; l++) {
1165+
const float v0 = xb[0][l]*id[0];
1166+
const uint8_t vi0 = (int8_t)roundf(v0) + 8;
1167+
1168+
const float v1 = xb[1][l]*id[1];
1169+
const uint8_t vi1 = (int8_t)roundf(v1) + 8;
1170+
1171+
assert(vi0 < 16);
1172+
assert(vi1 < 16);
1173+
1174+
qs[i*QK4_0 + l] = vi0 | (vi1 << 4);
1175+
}
1176+
}
1177+
}
1178+
11201179
static void quantize_row_q4_1_reference(const float * restrict x, void * restrict vy, int k) {
11211180
assert(k % QK4_1 == 0);
11221181
const int nb = k / QK4_1;
@@ -1658,6 +1717,40 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
16581717
#endif
16591718
}
16601719

1720+
// reference implementation for deterministic creation of model files
1721+
static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
1722+
assert(k % QK8_0 == 0);
1723+
const int nb = k / QK8_0;
1724+
1725+
uint8_t * restrict qs = y;
1726+
float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
1727+
1728+
for (int i = 0; i < nb; i++) {
1729+
float amax = 0.0f; // absolute max
1730+
1731+
for (int l = 0; l < QK8_0; l++) {
1732+
const float v = x[i*QK8_0 + l];
1733+
amax = MAX(amax, fabsf(v));
1734+
}
1735+
1736+
const float d = amax / ((1 << 7) - 1);
1737+
const float id = d ? 1.0f/d : 0.0f;
1738+
1739+
ds[i] = d;
1740+
1741+
for (int l = 0; l < QK8_0; ++l) {
1742+
const float v = x[i*QK8_0 + l]*id;
1743+
qs[i*QK8_0 + l] = roundf(v);
1744+
}
1745+
}
1746+
}
1747+
1748+
// Quantize a row of k floats to q8_0c.
// Forwards directly to the scalar reference implementation.
static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
    assert(k % QK8_0 == 0);
    quantize_row_q8_0c_reference(x, vy, k);
}
1753+
16611754
static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
16621755
assert(k % QK4_0 == 0);
16631756
const int nb = k / QK4_0;
@@ -1776,6 +1869,41 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
17761869
#endif
17771870
}
17781871

1872+
static void dequantize_row_q4_0c(const void * restrict vx, float * restrict y, int k) {
1873+
assert(k % QK4_0C == 0);
1874+
const int nb = k / QK4_0;
1875+
const int nsb = k / QK4_0C;
1876+
1877+
// Split vx into nibbles section and scales section
1878+
const uint8_t * restrict qs = vx;
1879+
const float * restrict ds = (const float *) ((const uint8_t *) vx + QK4_0C/2 * nsb);
1880+
1881+
// scalar
1882+
for (int i = 0; i < nb/2; i++) {
1883+
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
1884+
const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
1885+
1886+
const float d0 = ds[dst0];
1887+
const float d1 = ds[dst1];
1888+
1889+
for (int l = 0; l < QK4_0; l++) {
1890+
const uint8_t vi = qs[i * QK4_0 + l];
1891+
1892+
const int8_t vi0 = vi & 0xf;
1893+
const int8_t vi1 = vi >> 4;
1894+
1895+
const float v0 = (vi0 - 8)*d0;
1896+
const float v1 = (vi1 - 8)*d1;
1897+
1898+
y[dst0*QK4_0 + l] = v0;
1899+
y[dst1*QK4_0 + l] = v1;
1900+
1901+
assert(!isnan(y[dst0*QK4_0 + l]));
1902+
assert(!isnan(y[dst1*QK4_0 + l]));
1903+
}
1904+
}
1905+
}
1906+
17791907
static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) {
17801908
assert(k % QK4_1 == 0);
17811909
const int nb = k / QK4_1;
@@ -2002,6 +2130,7 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
20022130
}
20032131

20042132
static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2133+
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
20052134
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
20062135
static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
20072136
static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2017,6 +2146,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
20172146
.vec_dot_q = ggml_vec_dot_q4_0_q8_0,
20182147
.vec_dot_type = GGML_TYPE_Q8_0,
20192148
},
2149+
[GGML_TYPE_Q4_0C] = {
2150+
.dequantize_row_q = dequantize_row_q4_0c,
2151+
//.quantize_row_q = quantize_row_q4_0c,
2152+
.quantize_row_q = (quantize_row_q_t) quantize_row_q4_0c_reference,
2153+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0c_reference,
2154+
.quantize_row_q_dot = quantize_row_q8_0c,
2155+
.vec_dot_q = ggml_vec_dot_q4_0c_q8_0c,
2156+
},
20202157
[GGML_TYPE_Q4_1] = {
20212158
.dequantize_row_q = dequantize_row_q4_1,
20222159
.quantize_row_q = quantize_row_q4_1,
@@ -2065,6 +2202,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
20652202
.vec_dot_q = NULL, // TODO
20662203
.vec_dot_type = GGML_TYPE_Q8_1,
20672204
},
2205+
[GGML_TYPE_Q8_0C] = {
2206+
.dequantize_row_q = NULL,
2207+
.quantize_row_q = quantize_row_q8_0c,
2208+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q8_0c_reference,
2209+
.quantize_row_q_dot = quantize_row_q8_0c,
2210+
.vec_dot_q = NULL,
2211+
},
20682212
};
20692213

20702214
// For internal test use
@@ -2835,6 +2979,51 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
28352979
#endif
28362980
}
28372981

2982+
static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
2983+
const int nb = n / QK4_0;
2984+
const int nsb = n / QK4_0C;
2985+
2986+
assert(n % QK4_0C == 0);
2987+
2988+
// Split into nibbles and scales sections
2989+
const uint8_t * restrict xqs = vx;
2990+
const float * restrict xds = (const float *) ((const uint8_t *) vx + nsb*QK4_0C/2);
2991+
const int8_t * restrict yqs = vy;
2992+
const float * restrict yds = (const float *) ((const uint8_t *) vy + nb*QK8_0C);
2993+
2994+
float sumf = 0.0;
2995+
2996+
// scalar
2997+
for (int i = 0; i < nb/2; i++) {
2998+
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
2999+
const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
3000+
3001+
const float dx0 = xds[dst0];
3002+
const float dx1 = xds[dst1];
3003+
const float dy0 = yds[dst0];
3004+
const float dy1 = yds[dst1];
3005+
3006+
int sumi0 = 0;
3007+
int sumi1 = 0;
3008+
3009+
for (int l = 0; l < QK4_0; l++) {
3010+
const uint8_t v0 = xqs[i*QK4_0 + l];
3011+
3012+
const int i0 = (int8_t) (v0 & 0xf) - 8;
3013+
const int i1 = (int8_t) (v0 >> 4) - 8;
3014+
3015+
const int i2 = yqs[dst0*QK4_0 + l];
3016+
const int i3 = yqs[dst1*QK4_0 + l];
3017+
3018+
sumi0 += i0*i2;
3019+
sumi1 += i1*i3;
3020+
}
3021+
sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
3022+
}
3023+
3024+
*s = sumf;
3025+
}
3026+
28383027
static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
28393028
const int nb = n / QK8_1;
28403029

@@ -3885,66 +4074,74 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
38854074
[GGML_TYPE_F32] = 1,
38864075
[GGML_TYPE_F16] = 1,
38874076
[GGML_TYPE_Q4_0] = QK4_0,
4077+
[GGML_TYPE_Q4_0C] = QK4_0C,
38884078
[GGML_TYPE_Q4_1] = QK4_1,
38894079
[GGML_TYPE_Q4_2] = QK4_2,
38904080
[GGML_TYPE_Q5_0] = QK5_0,
38914081
[GGML_TYPE_Q5_1] = QK5_1,
38924082
[GGML_TYPE_Q8_0] = QK8_0,
4083+
[GGML_TYPE_Q8_0C] = QK8_0C,
38934084
[GGML_TYPE_Q8_1] = QK8_1,
38944085
[GGML_TYPE_I8] = 1,
38954086
[GGML_TYPE_I16] = 1,
38964087
[GGML_TYPE_I32] = 1,
38974088
};
3898-
static_assert(GGML_TYPE_COUNT == 13, "GGML_BLCK_SIZE is outdated");
4089+
static_assert(GGML_TYPE_COUNT == 15, "GGML_BLCK_SIZE is outdated");
38994090

39004091
static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
39014092
[GGML_TYPE_F32] = sizeof(float),
39024093
[GGML_TYPE_F16] = sizeof(ggml_fp16_t),
39034094
[GGML_TYPE_Q4_0] = sizeof(block_q4_0),
4095+
[GGML_TYPE_Q4_0C] = 4*sizeof(block_q4_0),
39044096
[GGML_TYPE_Q4_1] = sizeof(block_q4_1),
39054097
[GGML_TYPE_Q4_2] = sizeof(block_q4_2),
39064098
[GGML_TYPE_Q5_0] = sizeof(block_q5_0),
39074099
[GGML_TYPE_Q5_1] = sizeof(block_q5_1),
39084100
[GGML_TYPE_Q8_0] = sizeof(block_q8_0),
4101+
[GGML_TYPE_Q8_0C] = sizeof(block_q8_0),
39094102
[GGML_TYPE_Q8_1] = sizeof(block_q8_1),
39104103
[GGML_TYPE_I8] = sizeof(int8_t),
39114104
[GGML_TYPE_I16] = sizeof(int16_t),
39124105
[GGML_TYPE_I32] = sizeof(int32_t),
39134106
};
3914-
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_SIZE is outdated");
4107+
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_SIZE is outdated");
39154108

39164109

39174110
static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
39184111
[GGML_TYPE_F32] = "f32",
39194112
[GGML_TYPE_F16] = "f16",
39204113
[GGML_TYPE_Q4_0] = "q4_0",
4114+
[GGML_TYPE_Q4_0C] = "q4_0c",
39214115
[GGML_TYPE_Q4_1] = "q4_1",
39224116
[GGML_TYPE_Q4_2] = "q4_2",
39234117
[GGML_TYPE_Q5_0] = "q5_0",
39244118
[GGML_TYPE_Q5_1] = "q5_1",
39254119
[GGML_TYPE_Q8_0] = "q8_0",
4120+
[GGML_TYPE_Q8_0C] = "q8_0c",
39264121
[GGML_TYPE_Q8_1] = "q8_1",
39274122
[GGML_TYPE_I8] = "i8",
39284123
[GGML_TYPE_I16] = "i16",
39294124
[GGML_TYPE_I32] = "i32",
39304125
};
3931-
static_assert(GGML_TYPE_COUNT == 13, "GGML_TYPE_NAME is outdated");
4126+
static_assert(GGML_TYPE_COUNT == 15, "GGML_TYPE_NAME is outdated");
39324127

39334128
static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
39344129
[GGML_TYPE_F32] = false,
39354130
[GGML_TYPE_F16] = false,
39364131
[GGML_TYPE_Q4_0] = true,
4132+
[GGML_TYPE_Q4_0C] = true,
39374133
[GGML_TYPE_Q4_1] = true,
39384134
[GGML_TYPE_Q4_2] = true,
39394135
[GGML_TYPE_Q5_0] = true,
39404136
[GGML_TYPE_Q5_1] = true,
39414137
[GGML_TYPE_Q8_0] = true,
4138+
[GGML_TYPE_Q8_0C] = true,
39424139
[GGML_TYPE_Q8_1] = true,
39434140
[GGML_TYPE_I8] = false,
39444141
[GGML_TYPE_I16] = false,
39454142
[GGML_TYPE_I32] = false,
39464143
};
3947-
static_assert(GGML_TYPE_COUNT == 13, "GGML_IS_QUANTIZED is outdated");
4144+
static_assert(GGML_TYPE_COUNT == 15, "GGML_IS_QUANTIZED is outdated");
39484145

39494146
static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
39504147
"NONE",
@@ -8763,11 +8960,13 @@ static void ggml_compute_forward_mul_mat(
87638960
struct ggml_tensor * dst) {
87648961
switch (src0->type) {
87658962
case GGML_TYPE_Q4_0:
8963+
case GGML_TYPE_Q4_0C:
87668964
case GGML_TYPE_Q4_1:
87678965
case GGML_TYPE_Q4_2:
87688966
case GGML_TYPE_Q5_0:
87698967
case GGML_TYPE_Q5_1:
87708968
case GGML_TYPE_Q8_0:
8969+
case GGML_TYPE_Q8_0C:
87718970
case GGML_TYPE_Q8_1:
87728971
{
87738972
ggml_compute_forward_mul_mat_q_f32(params, src0, src1, dst);
@@ -8994,11 +9193,13 @@ static void ggml_compute_forward_get_rows(
89949193
struct ggml_tensor * dst) {
89959194
switch (src0->type) {
89969195
case GGML_TYPE_Q4_0:
9196+
case GGML_TYPE_Q4_0C:
89979197
case GGML_TYPE_Q4_1:
89989198
case GGML_TYPE_Q4_2:
89999199
case GGML_TYPE_Q5_0:
90009200
case GGML_TYPE_Q5_1:
90019201
case GGML_TYPE_Q8_0:
9202+
case GGML_TYPE_Q8_0C:
90029203
case GGML_TYPE_Q8_1:
90039204
{
90049205
ggml_compute_forward_get_rows_q(params, src0, src1, dst);

ggml.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,8 @@ extern "C" {
237237
GGML_TYPE_Q5_1 = 7,
238238
GGML_TYPE_Q8_0 = 8,
239239
GGML_TYPE_Q8_1 = 9,
240+
GGML_TYPE_Q4_0C = 10,
241+
GGML_TYPE_Q8_0C = 11,
240242
GGML_TYPE_I8,
241243
GGML_TYPE_I16,
242244
GGML_TYPE_I32,

0 commit comments

Comments
 (0)