@@ -851,8 +851,7 @@ static_assert(sizeof(block_q8_1) == 3*sizeof(float) + QK8_1, "wrong q8_1 block s
851
851
static void quantize_row_q4_0_reference (const float * restrict x , block_q4_0 * restrict y , int k ) {
852
852
static const int qk = QK4_0 ;
853
853
854
- assert (qk / 16 == 0 );
855
- assert ( k % qk == 0 );
854
+ assert (k % qk == 0 );
856
855
857
856
const int nb = k / qk ;
858
857
@@ -873,20 +872,16 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
873
872
874
873
y [i ].d = d ;
875
874
876
- uint64_t qs [QK4_0 / 16 ] = {0 };
877
-
878
875
for (int l = 0 ; l < qk /2 ; ++ l ) {
879
876
const float x0 = x [i * qk + 0 + l ]* id ;
880
877
const float x1 = x [i * qk + qk /2 + l ]* id ;
881
878
882
- const uint64_t xi0 = MIN (15 , (int8_t )(x0 + 8.5f ));
883
- const uint64_t xi1 = MIN (15 , (int8_t )(x1 + 8.5f ));
879
+ const uint8_t xi0 = MIN (15 , (int8_t )(x0 + 8.5f ));
880
+ const uint8_t xi1 = MIN (15 , (int8_t )(x1 + 8.5f ));
884
881
885
- qs [l / 8 ] | = xi0 << ( 8 * ( l & 7 )) ;
886
- qs [l / 8 ] |= xi1 << ( 8 * ( l & 7 ) + 4 ) ;
882
+ y [ i ]. qs [l ] = xi0 ;
883
+ y [ i ]. qs [l ] |= xi1 << 4 ;
887
884
}
888
-
889
- memcpy (y [i ].qs , qs , qk /2 );
890
885
}
891
886
}
892
887
@@ -897,8 +892,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict y, int k
897
892
static void quantize_row_q4_1_reference (const float * restrict x , block_q4_1 * restrict y , int k ) {
898
893
const int qk = QK4_1 ;
899
894
900
- assert (qk / 16 == 0 );
901
- assert ( k % qk == 0 );
895
+ assert (k % qk == 0 );
902
896
903
897
const int nb = k / qk ;
904
898
@@ -919,20 +913,16 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
919
913
y [i ].d = d ;
920
914
y [i ].m = min ;
921
915
922
- uint64_t qs [QK4_1 / 16 ] = {0 };
923
-
924
916
for (int l = 0 ; l < qk /2 ; ++ l ) {
925
917
const float x0 = (x [0 + l ] - min )* id ;
926
918
const float x1 = (x [qk /2 + l ] - min )* id ;
927
919
928
- const uint64_t xi0 = MIN (15 , (int8_t )(x0 + 0.5f ));
929
- const uint64_t xi1 = MIN (15 , (int8_t )(x1 + 0.5f ));
920
+ const uint8_t xi0 = MIN (15 , (int8_t )(x0 + 0.5f ));
921
+ const uint8_t xi1 = MIN (15 , (int8_t )(x1 + 0.5f ));
930
922
931
- qs [l / 8 ] | = xi0 << ( 8 * ( l & 7 )) ;
932
- qs [l / 8 ] |= xi1 << ( 8 * ( l & 7 ) + 4 ) ;
923
+ y [ i ]. qs [l ] = xi0 ;
924
+ y [ i ]. qs [l ] |= xi1 << 4 ;
933
925
}
934
-
935
- memcpy (y [i ].qs , qs , qk /2 );
936
926
}
937
927
}
938
928
@@ -944,8 +934,7 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict y, int k
944
934
static void quantize_row_q4_2_reference (const float * restrict x , block_q4_2 * restrict y , int k ) {
945
935
static const int qk = QK4_2 ;
946
936
947
- assert (qk / 16 == 0 );
948
- assert ( k % qk == 0 );
937
+ assert (k % qk == 0 );
949
938
950
939
const int nb = k / qk ;
951
940
@@ -990,8 +979,7 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict y, int k
990
979
static void quantize_row_q5_0_reference (const float * restrict x , block_q5_0 * restrict y , int k ) {
991
980
static const int qk = QK5_0 ;
992
981
993
- assert (qk / 16 == 0 );
994
- assert ( k % qk == 0 );
982
+ assert (k % qk == 0 );
995
983
996
984
const int nb = k / qk ;
997
985
@@ -1013,24 +1001,21 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
1013
1001
y [i ].d = d ;
1014
1002
1015
1003
uint32_t qh = 0 ;
1016
- uint64_t qs [QK5_0 / 16 ] = {0 };
1017
1004
1018
1005
for (int l = 0 ; l < qk /2 ; ++ l ) {
1019
1006
const float x0 = x [i * qk + 0 + l ]* id ;
1020
1007
const float x1 = x [i * qk + qk /2 + l ]* id ;
1021
1008
1022
- const uint64_t xi0 = MIN (31 , (int8_t )(x0 + 16.5f ));
1023
- const uint64_t xi1 = MIN (31 , (int8_t )(x1 + 16.5f ));
1009
+ const uint8_t xi0 = MIN (31 , (int8_t )(x0 + 16.5f ));
1010
+ const uint8_t xi1 = MIN (31 , (int8_t )(x1 + 16.5f ));
1024
1011
1025
- qs [l /8 ] |= xi0 << (8 * (l & 7 ));
1026
- qs [l /8 ] |= xi1 << (8 * (l & 7 ) + 4 );
1012
+ y [i ].qs [l ] = (xi0 & 0x0F ) | ((xi1 & 0x0F ) << 4 );
1027
1013
1028
1014
// get the 5-th bit and store it in qh at the right position
1029
1015
qh |= ((xi0 & 0x10 ) >> 4 ) << (l + 0 );
1030
1016
qh |= ((xi1 & 0x10 ) >> 4 ) << (l + qk /2 );
1031
1017
}
1032
1018
1033
- memcpy ( y [i ].qs , qs , qk /2 );
1034
1019
memcpy (& y [i ].qh , & qh , sizeof (qh ));
1035
1020
}
1036
1021
}
@@ -1040,50 +1025,50 @@ static void quantize_row_q5_0(const float * restrict x, void * restrict y, int k
1040
1025
}
1041
1026
1042
1027
static void quantize_row_q5_1_reference (const float * restrict x , block_q5_1 * restrict y , int k ) {
1043
- assert (k % QK5_1 == 0 );
1044
- const int nb = k / QK5_1 ;
1028
+ const int qk = QK5_1 ;
1029
+
1030
+ assert (k % qk == 0 );
1031
+
1032
+ const int nb = k / qk ;
1045
1033
1046
1034
for (int i = 0 ; i < nb ; i ++ ) {
1047
1035
float min = FLT_MAX ;
1048
1036
float max = - FLT_MAX ;
1049
1037
1050
- for (int l = 0 ; l < QK5_1 ; l ++ ) {
1051
- const float v = x [i * QK5_1 + l ];
1038
+ for (int l = 0 ; l < qk ; l ++ ) {
1039
+ const float v = x [i * qk + l ];
1040
+
1052
1041
if (v < min ) min = v ;
1053
1042
if (v > max ) max = v ;
1054
1043
}
1055
1044
1056
- const float d = (max - min ) / ((1 << 5 ) - 1 );
1045
+ const float d = (max - min ) / ((1 << 5 ) - 1 );
1057
1046
const float id = d ? 1.0f /d : 0.0f ;
1058
1047
1059
1048
y [i ].d = GGML_FP32_TO_FP16 (d );
1060
1049
y [i ].m = GGML_FP32_TO_FP16 (min );
1061
1050
1062
1051
uint32_t qh = 0 ;
1063
1052
1064
- for (int l = 0 ; l < QK5_1 ; l += 2 ) {
1065
- const float v0 = (x [i * QK5_1 + l + 0 ] - min )* id ;
1066
- const float v1 = (x [i * QK5_1 + l + 1 ] - min )* id ;
1053
+ for (int l = 0 ; l < qk / 2 ; ++ l ) {
1054
+ const float x0 = (x [i * qk + 0 + l ] - min )* id ;
1055
+ const float x1 = (x [i * qk + qk / 2 + l ] - min )* id ;
1067
1056
1068
- const uint32_t vi0 = (int ) ( v0 + 0.5f );
1069
- const uint32_t vi1 = (int ) ( v1 + 0.5f );
1057
+ const uint8_t xi0 = (uint8_t )( x0 + 0.5f );
1058
+ const uint8_t xi1 = (uint8_t )( x1 + 0.5f );
1070
1059
1071
- y [i ].qs [l / 2 ] = (vi0 & 0x0F ) | ((vi1 & 0x0F ) << 4 );
1060
+ y [i ].qs [l ] = (xi0 & 0x0F ) | ((xi1 & 0x0F ) << 4 );
1072
1061
1073
1062
// get the 5-th bit and store it in qh at the right position
1074
- qh |= ((vi0 & 0x10 ) >> 4 ) << (l + 0 );
1075
- qh |= ((vi1 & 0x10 ) >> 4 ) << (l + 1 );
1063
+ qh |= ((xi0 & 0x10 ) >> 4 ) << (l + 0 );
1064
+ qh |= ((xi1 & 0x10 ) >> 4 ) << (l + qk / 2 );
1076
1065
}
1077
1066
1078
1067
memcpy (& y [i ].qh , & qh , sizeof (y [i ].qh ));
1079
1068
}
1080
1069
}
1081
1070
1082
- static void quantize_row_q5_1 (const float * restrict x , void * restrict vy , int k ) {
1083
- assert (k % QK5_1 == 0 );
1084
-
1085
- block_q5_1 * restrict y = vy ;
1086
-
1071
+ static void quantize_row_q5_1 (const float * restrict x , void * restrict y , int k ) {
1087
1072
quantize_row_q5_1_reference (x , y , k );
1088
1073
}
1089
1074
@@ -1443,8 +1428,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
1443
1428
static void dequantize_row_q4_0 (const block_q4_0 * restrict x , float * restrict y , int k ) {
1444
1429
static const int qk = QK4_0 ;
1445
1430
1446
- assert (qk / 16 == 0 );
1447
- assert ( k % qk == 0 );
1431
+ assert (k % qk == 0 );
1448
1432
1449
1433
const int nb = k / qk ;
1450
1434
@@ -1464,8 +1448,7 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
1464
1448
static void dequantize_row_q4_1 (const block_q4_1 * restrict x , float * restrict y , int k ) {
1465
1449
static const int qk = QK4_1 ;
1466
1450
1467
- assert (qk / 16 == 0 );
1468
- assert ( k % qk == 0 );
1451
+ assert (k % qk == 0 );
1469
1452
1470
1453
const int nb = k / qk ;
1471
1454
@@ -1487,8 +1470,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict
1487
1470
// BORKEN !!!
1488
1471
static const int qk = QK4_2 ;
1489
1472
1490
- assert (qk / 16 == 0 );
1491
- assert ( k % qk == 0 );
1473
+ assert (k % qk == 0 );
1492
1474
1493
1475
const int nb = k / qk ;
1494
1476
@@ -1508,8 +1490,7 @@ static void dequantize_row_q4_2(const block_q4_2 * restrict x, float * restrict
1508
1490
static void dequantize_row_q5_0 (const block_q5_0 * restrict x , float * restrict y , int k ) {
1509
1491
static const int qk = QK4_0 ;
1510
1492
1511
- assert (qk / 16 == 0 );
1512
- assert ( k % qk == 0 );
1493
+ assert (k % qk == 0 );
1513
1494
1514
1495
const int nb = k / qk ;
1515
1496
@@ -1532,39 +1513,29 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict
1532
1513
}
1533
1514
}
1534
1515
1535
- static void dequantize_row_q5_1 (const void * restrict vx , float * restrict y , int k ) {
1536
- assert (k % QK5_1 == 0 );
1537
- const int nb = k / QK5_1 ;
1516
+ static void dequantize_row_q5_1 (const block_q5_1 * restrict x , float * restrict y , int k ) {
1517
+ static const int qk = QK5_1 ;
1538
1518
1539
- const block_q5_1 * restrict x = vx ;
1519
+ assert (k % qk == 0 );
1520
+
1521
+ const int nb = k / qk ;
1540
1522
1541
1523
for (int i = 0 ; i < nb ; i ++ ) {
1542
1524
const float d = GGML_FP16_TO_FP32 (x [i ].d );
1543
1525
const float m = GGML_FP16_TO_FP32 (x [i ].m );
1544
1526
1545
- const uint8_t * restrict pp = x [i ].qs ;
1546
-
1547
1527
uint32_t qh ;
1548
1528
memcpy (& qh , x [i ].qh , sizeof (qh ));
1549
1529
1550
- for (int l = 0 ; l < QK5_1 ; l += 2 ) {
1551
- const uint8_t vi = pp [l /2 ];
1552
-
1553
- // extract the 5-th bit from qh
1554
- const uint8_t vh0 = ((qh & (1u << (l + 0 ))) >> (l + 0 )) << 4 ;
1555
- const uint8_t vh1 = ((qh & (1u << (l + 1 ))) >> (l + 1 )) << 4 ;
1556
-
1557
- const uint8_t vi0 = (vi & 0x0F ) | vh0 ;
1558
- const uint8_t vi1 = (vi >> 4 ) | vh1 ;
1559
-
1560
- const float v0 = vi0 * d + m ;
1561
- const float v1 = vi1 * d + m ;
1530
+ for (int j = 0 ; j < qk /2 ; ++ j ) {
1531
+ const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4 ;
1532
+ const uint8_t xh_1 = ((qh & (1u << (j + 16 ))) >> (j + 12 ));
1562
1533
1563
- y [ i * QK5_1 + l + 0 ] = v0 ;
1564
- y [ i * QK5_1 + l + 1 ] = v1 ;
1534
+ const int x0 = ( x [ i ]. qs [ j ] & 0xf ) | xh_0 ;
1535
+ const int x1 = ( x [ i ]. qs [ j ] >> 4 ) | xh_1 ;
1565
1536
1566
- assert (! isnan ( y [i * QK5_1 + l + 0 ])) ;
1567
- assert (! isnan ( y [i * QK5_1 + l + 1 ])) ;
1537
+ y [i * qk + j + 0 ] = x0 * d + m ;
1538
+ y [i * qk + j + qk / 2 ] = x1 * d + m ;
1568
1539
}
1569
1540
}
1570
1541
}
@@ -1627,7 +1598,7 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1627
1598
.vec_dot_type = GGML_TYPE_Q8_0 ,
1628
1599
},
1629
1600
[GGML_TYPE_Q5_1 ] = {
1630
- .dequantize_row_q = dequantize_row_q5_1 ,
1601
+ .dequantize_row_q = ( dequantize_row_q_t ) dequantize_row_q5_1 ,
1631
1602
.quantize_row_q = quantize_row_q5_1 ,
1632
1603
.quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q5_1_reference ,
1633
1604
.quantize_row_q_dot = quantize_row_q8_1 ,
@@ -2875,11 +2846,12 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2875
2846
}
2876
2847
2877
2848
static void ggml_vec_dot_q5_1_q8_1 (const int n , float * restrict s , const void * restrict vx , const void * restrict vy ) {
2878
- const int nb = n / QK8_1 ;
2849
+ const int qk = QK8_1 ;
2850
+ const int nb = n / qk ;
2879
2851
2880
- assert (n % QK8_1 == 0 );
2852
+ assert (n % qk == 0 );
2881
2853
assert (nb % 2 == 0 );
2882
- assert (QK8_1 == QK5_1 );
2854
+ assert (qk == QK5_1 );
2883
2855
2884
2856
const block_q5_1 * restrict x = vx ;
2885
2857
const block_q8_1 * restrict y = vy ;
@@ -2915,13 +2887,9 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
2915
2887
const int8x16_t v0l = vreinterpretq_s8_u8 (vandq_u8 (v0 , vdupq_n_u8 (0x0F )));
2916
2888
const int8x16_t v0h = vreinterpretq_s8_u8 (vshrq_n_u8 (v0 , 4 ));
2917
2889
2918
- // interleave
2919
- const int8x16_t v0lz = vzip1q_s8 (v0l , v0h );
2920
- const int8x16_t v0hz = vzip2q_s8 (v0l , v0h );
2921
-
2922
2890
// add
2923
- const int8x16_t v0lf = vorrq_s8 (v0lz , qhl );
2924
- const int8x16_t v0hf = vorrq_s8 (v0hz , qhh );
2891
+ const int8x16_t v0lf = vorrq_s8 (v0l , qhl );
2892
+ const int8x16_t v0hf = vorrq_s8 (v0h , qhh );
2925
2893
2926
2894
// load y
2927
2895
const int8x16_t v1l = vld1q_s8 (y0 -> qs );
@@ -3044,36 +3012,28 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3044
3012
3045
3013
* s = hsum_float_8 (acc ) + summs ;
3046
3014
#else
3015
+ // scalar
3047
3016
float sumf = 0.0 ;
3048
3017
3049
3018
for (int i = 0 ; i < nb ; i ++ ) {
3050
- const uint8_t * restrict x0 = x [i ].qs ;
3051
- const int8_t * restrict y0 = y [i ].qs ;
3019
+ const int8_t * py = y [i ].qs ;
3052
3020
3053
3021
uint32_t qh ;
3054
3022
memcpy (& qh , x [i ].qh , sizeof (qh ));
3055
3023
3056
- const float d = GGML_FP16_TO_FP32 (x [i ].d );
3057
- const float m = GGML_FP16_TO_FP32 (x [i ].m );
3058
-
3059
- int sxy = 0 ;
3060
-
3061
- for (int j = 0 ; j < QK8_1 /2 ; j ++ ) {
3062
- const uint8_t v0 = x0 [j ];
3063
-
3064
- const int x0_0h = ((qh & (1u << (2 * j + 0 ))) >> (2 * j + 0 )) << 4 ;
3065
- const int x1_0h = ((qh & (1u << (2 * j + 1 ))) >> (2 * j + 1 )) << 4 ;
3024
+ int sumi = 0 ;
3066
3025
3067
- const int x0_0 = (v0 & 0x0F ) | x0_0h ;
3068
- const int x1_0 = (v0 >> 4 ) | x1_0h ;
3026
+ for (int j = 0 ; j < qk /2 ; ++ j ) {
3027
+ const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4 ;
3028
+ const uint8_t xh_1 = ((qh & (1u << (j + 16 ))) >> (j + 12 ));
3069
3029
3070
- const int y0_0 = y0 [ 2 * j + 0 ] ;
3071
- const int y1_0 = y0 [ 2 * j + 1 ] ;
3030
+ const int32_t x0 = ( x [ i ]. qs [ j ] & 0xF ) | xh_0 ;
3031
+ const int32_t x1 = ( x [ i ]. qs [ j ] >> 4 ) | xh_1 ;
3072
3032
3073
- sxy += x0_0 * y0_0 + x1_0 * y1_0 ;
3033
+ sumi += ( x0 * py [ j ]) + ( x1 * py [ j + qk / 2 ]) ;
3074
3034
}
3075
3035
3076
- sumf += (d * sxy )* y [i ].d + m * (y [i ].s0 + y [i ].s1 );
3036
+ sumf += (GGML_FP16_TO_FP32 ( x [ i ]. d )* y [i ].d ) * sumi + GGML_FP16_TO_FP32 ( x [ i ]. m ) * (y [i ].s0 + y [i ].s1 );
3077
3037
}
3078
3038
3079
3039
* s = sumf ;
0 commit comments