|
19 | 19 | #include <inttypes.h>
|
20 | 20 | #include <stdio.h>
|
21 | 21 | #include <float.h>
|
| 22 | +#include <limits.h> |
22 | 23 |
|
23 | 24 | // if C99 - static_assert is noop
|
24 | 25 | // ref: https://stackoverflow.com/a/53923785/4039976
|
@@ -1135,12 +1136,94 @@ static void quantize_row_q4_2_reference(const float * restrict x, block_q4_2 * r
|
1135 | 1136 | }
|
1136 | 1137 | }
|
1137 | 1138 |
|
// Round fval to the nearest integer without calling into libm.
//
// Adding 12582912.f (1.5 * 2^23) moves the value into [2^23, 2^24), where
// consecutive floats are exactly 1 apart, so the FPU rounding of the addition
// performs the round-to-integer and the result sits in the low 23 mantissa
// bits, biased by 0x00400000. Ties follow the current floating-point rounding
// mode (round-half-to-even by default).
//
// Assumes float is a 32-bit IEEE-754 binary32 value.
// Valid only for -4194304 <= fval <= 4194303: outside that interval the sum
// leaves [2^23, 2^24) and the extracted mantissa bits are meaningless.
// NaN also fails the assertion, since every comparison with NaN is false.
static inline int nearest_int(float fval) {
    assert(fval >= -4194304.f && fval <= 4194303.f);
    const float shifted = fval + 12582912.f;
    int32_t bits;
    memcpy(&bits, &shifted, sizeof bits); // type-pun via memcpy, no aliasing UB
    return (int)(bits & 0x007fffff) - 0x00400000;
}
| 1145 | + |
// Quantize X[0..n-1] to integer levels L[i] in [nmin, nmax] and return the
// scale d for which d*L[i] approximates X[i].
//
// For each of the nCandidates entries of candidates[], the trial inverse scale
// is candidates[si]/amax (amax = largest |X[i]|). Both the trial inverse scale
// and its negation are evaluated; the negated variant can differ because the
// clamp range [nmin, nmax] need not be symmetric. For a fixed set of levels l
// the least-squares optimal scale is sum(x*l)/sum(l*l), and the remaining
// squared error is sum(x*x) - sum(x*l)^2/sum(l*l), so the candidate with the
// largest sum(x*l)^2/sum(l*l) is kept.
//
// Returns 1.f when every input is zero, or when no candidate produced a
// non-zero level (in both cases all L[i] are 0, so the scale has no effect on
// the reconstructed values).
//
// nmin/nmax must fit in int8_t. Every candidates[si]*|X[i]|/amax must be within
// the range accepted by nearest_int().
static float kquantize_q4_with_bounds(int n, int nmin, int nmax, const float * restrict X, int nCandidates,
        const float * restrict candidates, int8_t * restrict L) {
    assert (nmin >= INT8_MIN);
    assert (nmax <= INT8_MAX);
    float amax = 0;
    for (int i=0; i<n; ++i) amax = MAX(amax, fabsf(X[i]));
    if (!amax) { // all zero
        for (int i=0; i<n; ++i) L[i] = 0;
        return 1.f;
    }
    // best holds the largest sum(x*l)^2/sum(l*l) seen so far; bestScale is the
    // (signed) inverse scale that achieved it, 0 if nothing was selected.
    float best = 0, bestScale = 0;
    for (int si=0; si<nCandidates; ++si) {
        float iscale = candidates[si]/amax;
        float sumlxP = 0; int suml2P = 0; // accumulators for +iscale
        float sumlxM = 0; int suml2M = 0; // accumulators for -iscale
        for (int i=0; i<n; ++i) {
            int l = nearest_int(iscale*X[i]);
            int lp = MAX(nmin, MIN(nmax, +l));
            int lm = MAX(nmin, MIN(nmax, -l));
            sumlxP += X[i]*lp; suml2P += lp*lp;
            sumlxM += X[i]*lm; suml2M += lm*lm;
        }
        float sumlxP2 = sumlxP*sumlxP;
        float sumlxM2 = sumlxM*sumlxM;
        // Ratios are compared by cross-multiplication so a zero suml2 never
        // ends up in a denominator here.
        if (sumlxP2*suml2M > sumlxM2*suml2P) {
            if (sumlxP2 > best*suml2P) {
                best = sumlxP2/suml2P; bestScale = iscale;
            }
        } else {
            if (sumlxM2 > best*suml2M) {
                best = sumlxM2/suml2M; bestScale = -iscale;
            }
        }
    }
    // Re-quantize with the winning inverse scale and compute the
    // least-squares scale for the resulting levels.
    float sumlx = 0; int suml2 = 0;
    for (int i=0; i<n; ++i) {
        int l = nearest_int(bestScale*X[i]);
        l = MAX(nmin, MIN(nmax, l));
        sumlx += X[i]*l; suml2 += l*l;
        L[i] = l;
    }
    if (suml2 == 0) {
        // Every level is 0 (e.g. nCandidates <= 0 or no candidate was
        // selected). Avoid 0/0 = NaN and mirror the all-zero input case.
        return 1.f;
    }
    return sumlx/suml2;
}
| 1190 | + |
| 1191 | +static void quantize_row_q4_2_rmse(const float * restrict x, block_q4_2 * restrict y, int k) { |
| 1192 | +#define CANDIDATE_COUNT 8 |
| 1193 | + static const float candidates[CANDIDATE_COUNT] = { +8.7f, +8.3f, +8.1f, +7.8f, +7.3f, +7.0f, +6.3f, +5.7f }; |
| 1194 | + assert(k % QK4_2 == 0); |
| 1195 | + |
| 1196 | + int8_t L[QK4_2]; |
| 1197 | + |
| 1198 | + const int nb = k / QK4_2; |
| 1199 | + |
| 1200 | + for (int i = 0; i < nb; i++) { |
| 1201 | + |
| 1202 | + float scale = kquantize_q4_with_bounds(QK4_2, -8, 7, x, CANDIDATE_COUNT, candidates, L); |
| 1203 | + y[i].d = GGML_FP32_TO_FP16(scale); |
| 1204 | + |
| 1205 | + for (int l = 0; l < QK4_2; l += 2) { |
| 1206 | + const uint8_t vi0 = (uint8_t)(L[l+0] + 8); |
| 1207 | + const uint8_t vi1 = (uint8_t)(L[l+1] + 8); |
| 1208 | + |
| 1209 | + assert(vi0 < 16); |
| 1210 | + assert(vi1 < 16); |
| 1211 | + |
| 1212 | + y[i].qs[l/2] = vi0 | (vi1 << 4); |
| 1213 | + } |
| 1214 | + |
| 1215 | + x += QK4_2; |
| 1216 | + } |
| 1217 | +} |
| 1218 | + |
1138 | 1219 | static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int k) {
|
1139 | 1220 | assert(k % QK4_2 == 0);
|
1140 | 1221 |
|
1141 | 1222 | block_q4_2 * restrict y = vy;
|
1142 | 1223 |
|
1143 |
| - quantize_row_q4_2_reference(x, y, k); |
| 1224 | + //quantize_row_q4_2_reference(x, y, k); |
| 1225 | + // This produces the exact same format, just better match to the input floats ("better" as measured by RMSE) |
| 1226 | + quantize_row_q4_2_rmse(x, y, k); |
1144 | 1227 | }
|
1145 | 1228 |
|
1146 | 1229 | // reference implementation for deterministic creation of model files
|
@@ -1569,7 +1652,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
|
1569 | 1652 | [GGML_TYPE_Q4_2] = {
|
1570 | 1653 | .dequantize_row_q = dequantize_row_q4_2,
|
1571 | 1654 | .quantize_row_q = quantize_row_q4_2,
|
1572 |
| - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_reference, |
| 1655 | + .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_2_rmse, //quantize_row_q4_2_reference, |
1573 | 1656 | .quantize_row_q_dot = quantize_row_q8_0,
|
1574 | 1657 | .vec_dot_q = ggml_vec_dot_q4_2_q8_0,
|
1575 | 1658 | },
|
@@ -11770,7 +11853,8 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
|
11770 | 11853 | for (int j = 0; j < n; j += k) {
|
11771 | 11854 | block_q4_2 * restrict y = (block_q4_2 *)dst + j/QK4_2;
|
11772 | 11855 |
|
11773 |
| - quantize_row_q4_2_reference(src + j, y, k); |
| 11856 | + //quantize_row_q4_2_reference(src + j, y, k); |
| 11857 | + quantize_row_q4_2_rmse(src + j, y, k); |
11774 | 11858 |
|
11775 | 11859 | for (int i = 0; i < nb; i++) {
|
11776 | 11860 | for (int l = 0; l < QK4_2; l += 2) {
|
|
0 commit comments