Skip to content

Commit 4dc62e7

Browse files
unboundedsw
authored andcommitted
Really slow RMS "optimal" scaling for q4_0
Use a sweep line approach to scan all configurations of quantization, examining every changeover point where a quantize value changes, and find the optimal scaling for each configuration analytically.
1 parent 40ebf81 commit 4dc62e7

File tree

1 file changed

+133
-0
lines changed

1 file changed

+133
-0
lines changed

ggml.c

Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -646,6 +646,139 @@ static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restri
646646
}
647647
}
648648

649+
static int comparefloat(const void * f1p, const void * f2p) {
650+
float f1 = *(const float *) f1p;
651+
float f2 = *(const float *) f2p;
652+
return (f1 > f2) - (f1 < f2);
653+
}
654+
655+
// Find the optimal quantization scaling for a set of values using a sweep line approach
656+
// Returns the final scaling vale, and writes the quantized indices as bytes to y
657+
static float find_optimal_scale(const float * restrict x, uint8_t * restrict qi) {
658+
// The quantization shape is a set of values that will be scaled linearly with a value 'd' to produce a set of values to choose from.
659+
// The input values will then be rounded to the nearest of the scaled values.
660+
// The shape can contain any set of values, e.g. to fit a non-linear distribution, but must be in sorted order and have exactly one '0'
661+
const float shape[16] = {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7};
662+
// Precalculate the midpoint between adjacent values in the shape.
663+
float inv_midpoints[15] = {0};
664+
for (int i = 0; i < 15; i++) {
665+
inv_midpoints[i] = 2/(shape[i] + shape[i+1]);
666+
}
667+
int zero_i;
668+
for (zero_i = 0; shape[zero_i] != 0.0f; zero_i++) {
669+
// find zero index
670+
};
671+
672+
// Each event represents a value of d where one value in x changes its quantization
673+
struct event {
674+
float d;
675+
uint8_t x_i;
676+
uint8_t new_shape_i;
677+
};
678+
// Each input value will go through each of the 16 quantization values
679+
struct event events[16*QK];
680+
int nevents = 0;
681+
for (int i = 0; i < QK; i++) {
682+
if (x[i] == 0.0f) {
683+
// We ignore the scaling of zero valued elements
684+
continue;
685+
}
686+
for (int j = 0; j < 15; j++) {
687+
// Positive valued elements sweep backwards from zero, negative elements sweep forward from zero,
688+
// both will wrap around and end up back at zero
689+
int forwardi = (x[i] > 0) ? j : j+1;
690+
events[nevents++] = (struct event) {
691+
.d = x[i] * inv_midpoints[j],
692+
.x_i = i,
693+
.new_shape_i = forwardi,
694+
};
695+
}
696+
// Add a wrap-around event at 0
697+
events[nevents++] = (struct event) {
698+
.d = 0,
699+
.x_i = i,
700+
.new_shape_i = (x[i] > 0) ? 15 : 0
701+
};
702+
}
703+
704+
// Order the events in increasing order of scaling factor d
705+
qsort(events, nevents, sizeof(struct event), comparefloat);
706+
707+
// We will keep track of our sum-of-squared-error score as we loop through the scales, which is
708+
// sum(x_i^2) + d^2*sum(q_i^2) - 2*d*sum(x_i*q_i)
709+
// sum(q_i^2)
710+
float qv_sqr_sum = 0;
711+
// sum(x_i*q_i)
712+
float x_mul_qv_sum = 0;
713+
714+
// Start scaling at negative infinity
715+
float best_score = INFINITY;
716+
float best_d = 0;
717+
int best_i = 0;
718+
for (int i = 0; i < QK; i++) {
719+
qi[i] = zero_i;
720+
}
721+
722+
for (int i = 0; i < nevents; i++) {
723+
struct event ev = events[i];
724+
// Update loop values
725+
const int old_i = qi[ev.x_i];
726+
const float old_val = shape[old_i];
727+
const float new_val = shape[ev.new_shape_i];
728+
qv_sqr_sum -= old_val*old_val;
729+
qv_sqr_sum += new_val*new_val;
730+
x_mul_qv_sum -= x[ev.x_i] * old_val;
731+
x_mul_qv_sum += x[ev.x_i] * new_val;
732+
qi[ev.x_i] = ev.new_shape_i;
733+
734+
if (ev.d == 0.0f || qv_sqr_sum == 0.0f) {
735+
continue;
736+
}
737+
738+
// squared error score at best_d, ommitting the constant sum(x_i^2) factor
739+
const float local_score = -(x_mul_qv_sum * x_mul_qv_sum) / qv_sqr_sum;
740+
741+
if (local_score < best_score) {
742+
// find the optimal scaling factor d for the current quantization assignments,
743+
// solve for minima of d^2*sum(q_i^2) - 2*d*sum(x_i*q_i)
744+
best_d = x_mul_qv_sum / qv_sqr_sum;
745+
best_score = local_score;
746+
best_i = i;
747+
}
748+
}
749+
// restore qi values at position i
750+
for (int i = 0; i < 16; i++) {
751+
qi[i] = zero_i;
752+
}
753+
for (int i = 0; i <= best_i; i++) {
754+
qi[events[i].x_i] = events[i].new_shape_i;
755+
}
756+
757+
return best_d;
758+
}
759+
760+
// Slow implementation of q4_0 that optimizes for RMSE
761+
static void quantize_row_q4_0_slow(const float * restrict x, block_q4_0 * restrict y, int k) {
762+
assert(k % QK == 0);
763+
const int nb = k / QK;
764+
765+
uint8_t pp[QK/2];
766+
767+
for (int i = 0; i < nb; i++) {
768+
uint8_t qi[QK];
769+
y[i].d = find_optimal_scale(&x[i*QK], &qi[0]);
770+
771+
for (int l = 0; l < QK; l += 2) {
772+
assert(qi[l] < 16);
773+
assert(qi[l+1] < 16);
774+
775+
pp[l/2] = qi[l] | (qi[l+1] << 4);
776+
}
777+
778+
memcpy(y[i].qs, pp, sizeof(pp));
779+
}
780+
}
781+
649782
static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) {
650783
assert(k % QK == 0);
651784
const int nb = k / QK;

0 commit comments

Comments
 (0)