@@ -646,6 +646,139 @@ static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restrict y, int k)
     }
 }
 
+// qsort comparator that orders floats ascending; it is used to sort events,
+// which works because 'd' is the leading float member of struct event
+static int comparefloat(const void * f1p, const void * f2p) {
+    float f1 = *(const float *) f1p;
+    float f2 = *(const float *) f2p;
+    return (f1 > f2) - (f1 < f2);
+}
+
+// Find the optimal quantization scale for a set of QK values using a sweep-line approach.
+// Returns the final scale value, and writes the quantized indices as bytes to qi.
+static float find_optimal_scale(const float * restrict x, uint8_t * restrict qi) {
+    // The quantization shape is a set of values that are scaled linearly by a factor 'd'
+    // to produce the set of representable values; each input is then rounded to the
+    // nearest of the scaled values. The shape can contain any set of values, e.g. to fit
+    // a non-linear distribution, but it must be in sorted order and contain exactly one '0'.
+    const float shape[16] = {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7};
+    // Precalculate the reciprocal of the midpoint between each pair of adjacent shape values.
+    float inv_midpoints[15] = {0};
+    for (int i = 0; i < 15; i++) {
+        inv_midpoints[i] = 2/(shape[i] + shape[i + 1]);
+    }
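+    // For illustration: inv_midpoints[10] = 2/(shape[10] + shape[11]) = 2/(2 + 3) = 0.4,
+    // so an input x_i = 3.0 sits exactly on the boundary between quantizing to 2 and 3
+    // when x_i/d = 2.5, i.e. at d = x_i * inv_midpoints[10] = 1.2; these crossing points
+    // become the sweep events built below.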
+    // Find the index of zero in the shape
+    int zero_i;
+    for (zero_i = 0; shape[zero_i] != 0.0f; zero_i++) {
+    }
+
+    // Each event represents a value of d where one value in x changes its quantization
+    struct event {
+        float d;
+        uint8_t x_i;
+        uint8_t new_shape_i;
+    };
+    // Each input value will pass through each of the 16 quantization values
+    struct event events[16 * QK];
+    int nevents = 0;
+    for (int i = 0; i < QK; i++) {
+        if (x[i] == 0.0f) {
+            // We ignore the scaling of zero-valued elements
+            continue;
+        }
+        for (int j = 0; j < 15; j++) {
+            // Positive-valued elements sweep backwards from zero, negative elements sweep
+            // forward from zero; both wrap around and end up back at zero
+            int forwardi = (x[i] > 0) ? j : j + 1;
+            events[nevents++] = (struct event) {
+                .d = x[i] * inv_midpoints[j],
+                .x_i = i,
+                .new_shape_i = forwardi,
+            };
+        }
+        // Add a wrap-around event at d = 0
+        events[nevents++] = (struct event) {
+            .d = 0,
+            .x_i = i,
+            .new_shape_i = (x[i] > 0) ? 15 : 0,
+        };
+    }
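+    // For example, tracking x_i = 3.0 as d sweeps upward through positive values: just
+    // above d = 0 the ratio x_i/d is huge, so x_i maps to shape[15] = 7; each event then
+    // steps it down one value (crossing from 3 to 2 at d = 1.2, as above) until it lands
+    // back on 0 at d = 6.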
+
+    // Order the events by increasing scaling factor d
+    qsort(events, nevents, sizeof(struct event), comparefloat);
+
+    // We keep a running sum-of-squared-error score as we sweep through the scales:
+    //   SSE(d) = sum(x_i^2) + d^2*sum(q_i^2) - 2*d*sum(x_i*q_i)
+    // running sum(q_i^2)
+    float qv_sqr_sum = 0;
+    // running sum(x_i*q_i)
+    float x_mul_qv_sum = 0;
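+    // Derivation: SSE(d) = sum((x_i - d*q_i)^2) expands to the expression above. Setting
+    // its derivative 2*d*sum(q_i^2) - 2*sum(x_i*q_i) to zero gives the optimal scale
+    // d* = sum(x_i*q_i)/sum(q_i^2), where SSE(d*) = sum(x_i^2) - sum(x_i*q_i)^2/sum(q_i^2).
+    // As sum(x_i^2) is constant, assignments can be ranked by -sum(x_i*q_i)^2/sum(q_i^2) alone.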
+
+    // The sweep starts at d = -infinity, where every element quantizes to zero
+    float best_score = INFINITY;
+    float best_d = 0;
+    int best_i = 0;
+    for (int i = 0; i < QK; i++) {
+        qi[i] = zero_i;
+    }
+
+    for (int i = 0; i < nevents; i++) {
+        struct event ev = events[i];
+        // Update the running sums for the element that changed its quantization
+        const int old_i = qi[ev.x_i];
+        const float old_val = shape[old_i];
+        const float new_val = shape[ev.new_shape_i];
+        qv_sqr_sum -= old_val * old_val;
+        qv_sqr_sum += new_val * new_val;
+        x_mul_qv_sum -= x[ev.x_i] * old_val;
+        x_mul_qv_sum += x[ev.x_i] * new_val;
+        qi[ev.x_i] = ev.new_shape_i;
+
+        // d == 0 is not a usable scale, and qv_sqr_sum == 0 would divide by zero
+        if (ev.d == 0.0f || qv_sqr_sum == 0.0f) {
+            continue;
+        }
+
+        // squared error score at the optimal d for the current assignment,
+        // omitting the constant sum(x_i^2) term
+        const float local_score = -(x_mul_qv_sum * x_mul_qv_sum) / qv_sqr_sum;
+
+        if (local_score < best_score) {
+            // find the optimal scaling factor d for the current quantization assignment
+            // by solving for the minimum of d^2*sum(q_i^2) - 2*d*sum(x_i*q_i)
+            best_d = x_mul_qv_sum / qv_sqr_sum;
+            best_score = local_score;
+            best_i = i;
+        }
+    }
+    // Rebuild the qi assignment as it was after the best event: reset every index
+    // to zero, then replay the events up to and including best_i
+    for (int i = 0; i < QK; i++) {
+        qi[i] = zero_i;
+    }
+    for (int i = 0; i <= best_i; i++) {
+        qi[events[i].x_i] = events[i].new_shape_i;
+    }
+
+    return best_d;
+}
+
+// Slow implementation of q4_0 quantization that optimizes for RMSE
+static void quantize_row_q4_0_slow(const float * restrict x, block_q4_0 * restrict y, int k) {
+    assert(k % QK == 0);
+    const int nb = k / QK;
+
+    uint8_t pp[QK/2];
+
+    for (int i = 0; i < nb; i++) {
+        uint8_t qi[QK];
+        y[i].d = find_optimal_scale(&x[i*QK], &qi[0]);
+
+        // Pack two 4-bit indices per byte, with the even element in the low nibble
+        for (int l = 0; l < QK; l += 2) {
+            assert(qi[l] < 16);
+            assert(qi[l + 1] < 16);
+
+            pp[l/2] = qi[l] | (qi[l + 1] << 4);
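+            // e.g. qi[l] = 3 and qi[l + 1] = 12 pack to 0x3 | (0xC << 4) = 0xC3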
+        }
+
+        memcpy(y[i].qs, pp, sizeof(pp));
+    }
+}
+
 static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) {
     assert(k % QK == 0);
     const int nb = k / QK;