Skip to content

Commit f7d0509

Browse files
ikawrakowKawrakowggerganov
authored
Q4_2 quantization with rmse-optimized scale and quants (#1062)
* Q4_2 quantization with rmse-optimized scale and quants

  For quantize-stats we get q4_2: rmse 0.00159301, maxerr 0.17480469, 95pct<0.0030, median<0.0012

  For 7B perplexity with BLAS enabled we get 6.2038 after 655 chunks.

  Quantization is slow (~90 seconds on my Mac for 7B) because it is not multi-threaded as in PR #896.

* ggml : satisfy the sanitizer builds

  Not sure why this makes them fail

* Better follow ggml conventions for function names

* Fixed type as per reviewer comment

---------

Co-authored-by: Iwan Kawrakow <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 884e7d7 commit f7d0509

File tree

1 file changed

+87
-3
lines changed

1 file changed

+87
-3
lines changed

ggml.c

Lines changed: 87 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <inttypes.h>
2020
#include <stdio.h>
2121
#include <float.h>
22+
#include <limits.h>
2223

2324
// if C99 - static_assert is noop
2425
// ref: https://stackoverflow.com/a/53923785/4039976
@@ -1135,12 +1136,94 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
11351136
}
11361137
}
11371138

1139+
// Round fval to the nearest integer without a libm call.
//
// Adding 12582912 (1.5 * 2^23) moves the value into [2^23, 2^24), where consecutive
// floats are exactly 1 apart, so the addition itself performs the rounding (using the
// current floating-point rounding mode). The rounded result then sits in the low
// 23 mantissa bits, biased by 0x00400000 (2^22).
//
// Assumes float is IEEE-754 binary32. The trick only works while the sum stays inside
// [2^23, 2^24), i.e. for -4194304 <= fval <= 4194303; both ends are asserted, because
// a value outside that range would otherwise silently produce a wrong integer.
static inline int nearest_int(float fval) {
    assert(fval >= -4194304.f && fval <= 4194303.f);
    const float val = fval + 12582912.f;
    int32_t bits;
    memcpy(&bits, &val, sizeof(bits)); // type-pun through memcpy to stay clear of aliasing rules
    return (int)(bits & 0x007fffff) - 0x00400000;
}
1145+
1146+
// Quantize X[0..n-1] to integers L[i] clamped to [nmin, nmax] and return the scale d
// for which d*L[i] approximates X[i].
//
//   n, X         - number of input values and the values themselves
//   nmin, nmax   - inclusive quant range; must fit in int8_t (asserted)
//   nCandidates,
//   candidates   - trial multipliers; each one is divided by amax = max|X[i]| to form a
//                  trial inverse scale
//   L            - output quants, n entries
//
// For every candidate both the positive and the negated inverse scale are evaluated
// (the two differ only through the clamp to [nmin, nmax]). For fixed quants l the
// least-squares scale is sum(x*l)/sum(l*l) and the error shrinks as
// sum(x*l)^2/sum(l*l) grows, so the trial maximizing that ratio is kept.
//
// Returns 1 when all inputs are zero, and 0 when no trial scale was usable (L is then
// all zeros as well), so the caller never receives NaN.
static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates,
        const float * restrict candidates, int8_t * restrict L) {
    assert (nmin >= INT8_MIN);
    assert (nmax <= INT8_MAX);
    float amax = 0;
    for (int i=0; i<n; ++i) amax = MAX(amax, fabsf(X[i]));
    if (!amax) { // all zero
        for (int i=0; i<n; ++i) L[i] = 0;
        return 1.f;
    }
    float best = 0, bestScale = 0;
    for (int si=0; si<nCandidates; ++si) {
        float iscale = candidates[si]/amax;
        if (!isfinite(iscale)) {
            // amax is so small that the division overflowed; inf/NaN must not reach nearest_int
            continue;
        }
        float sumlxP = 0; int suml2P = 0; // accumulators for +iscale
        float sumlxM = 0; int suml2M = 0; // accumulators for -iscale
        for (int i=0; i<n; ++i) {
            int l = nearest_int(iscale*X[i]);
            int lp = MAX(nmin, MIN(nmax, +l));
            int lm = MAX(nmin, MIN(nmax, -l));
            sumlxP += X[i]*lp; suml2P += lp*lp;
            sumlxM += X[i]*lm; suml2M += lm*lm;
        }
        float sumlxP2 = sumlxP*sumlxP;
        float sumlxM2 = sumlxM*sumlxM;
        // compare sumlxP2/suml2P with sumlxM2/suml2M (and with best) by cross-multiplying,
        // which avoids dividing by a possibly zero suml2
        if (sumlxP2*suml2M > sumlxM2*suml2P) {
            if (sumlxP2 > best*suml2P) {
                best = sumlxP2/suml2P; bestScale = iscale;
            }
        } else {
            if (sumlxM2 > best*suml2M) {
                best = sumlxM2/suml2M; bestScale = -iscale;
            }
        }
    }
    // re-quantize with the winning inverse scale and compute the least-squares scale for it
    float sumlx = 0; int suml2 = 0;
    for (int i=0; i<n; ++i) {
        int l = nearest_int(bestScale*X[i]);
        l = MAX(nmin, MIN(nmax, l));
        sumlx += X[i]*l; suml2 += l*l;
        L[i] = (int8_t) l;
    }
    if (suml2 == 0) {
        // no trial scale was accepted (no candidates, or all of them overflowed):
        // every quant is 0, so sumlx/suml2 would be 0/0
        return 0.f;
    }
    return sumlx/suml2;
}
1190+
1191+
// Quantize k floats from x into k/QK4_2 consecutive q4_2 blocks at y.
//
// Each block's scale and quants come from kquantize_q4_with_bounds, called with the
// quant range [-8, 7] and the candidate table below. Quants are stored offset by +8
// (so they fit in 0..15), two per byte: the even-indexed quant in the low nibble and
// the odd-indexed one in the high nibble.
//
// k must be a multiple of QK4_2 (asserted).
static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restrict y, int k) {
#define CANDIDATE_COUNT 8
    static const float candidates[CANDIDATE_COUNT] = { +8.7f, +8.3f, +8.1f, +7.8f, +7.3f, +7.0f, +6.3f, +5.7f };
    assert(k % QK4_2 == 0);

    const int n_blocks = k / QK4_2;

    int8_t quants[QK4_2]; // scratch buffer, reused for every block

    for (int ib = 0; ib < n_blocks; ++ib, x += QK4_2) {
        const float d = kquantize_q4_with_bounds(QK4_2, -8, 7, x, CANDIDATE_COUNT, candidates, quants);
        y[ib].d = GGML_FP32_TO_FP16(d);

        // pack pairs of quants into single bytes
        for (int j = 0; 2*j < QK4_2; ++j) {
            const uint8_t lo = (uint8_t)(quants[2*j + 0] + 8);
            const uint8_t hi = (uint8_t)(quants[2*j + 1] + 8);

            assert(lo < 16);
            assert(hi < 16);

            y[ib].qs[j] = lo | (hi << 4);
        }
    }
}
1218+
11381219
static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) {
11391220
assert(k % QK4_2 == 0);
11401221

11411222
block_q4_2 * restrict y = vy;
11421223

1143-
quantize_row_q4_2_reference(x, y, k);
1224+
//quantize_row_q4_2_reference(x, y, k);
1225+
// This produces the exact same format, just better match to the input floats ("better" as measured by RMSE)
1226+
quantize_row_q4_2_rmse(x, y, k);
11441227
}
11451228

11461229
// reference implementation for deterministic creation of model files
@@ -1569,7 +1652,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
15691652
[GGML_TYPE_Q4_2] = {
15701653
.dequantize_row_q = dequantize_row_q4_2,
15711654
.quantize_row_q = quantize_row_q4_2,
1572-
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference,
1655+
.quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_rmse, //quantize_row_q4_2_reference,
15731656
.quantize_row_q_dot = quantize_row_q8_0,
15741657
.vec_dot_q = ggml_vec_dot_q4_2_q8_0,
15751658
},
@@ -11770,7 +11853,8 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
1177011853
for (int j = 0; j < n; j += k) {
1177111854
block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
1177211855

11773-
quantize_row_q4_2_reference(src + j, y, k);
11856+
//quantize_row_q4_2_reference(src + j, y, k);
11857+
quantize_row_q4_2_rmse(src + j, y, k);
1177411858

1177511859
for (int i = 0; i < nb; i++) {
1177611860
for (int l = 0; l < QK4_2; l += 2) {

0 commit comments

Comments
 (0)