@@ -648,6 +648,13 @@ typedef struct {
648
648
} block_q8_0 ;
649
649
static_assert (sizeof (block_q8_0 ) == sizeof (float ) + QK8_0 , "wrong q8_0 block size/padding" );
650
650
651
+ #define QK4_0C (4*32)
652
+ #define QK4_0C_MUL (QK4_0C / QK4_0)
653
+ // TODO: nicer description - pseudostruct?
654
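+ // q4_0c : uint8_t qs[k/QK4_0C][QK4_0C/2] || float d[k/QK4_0]   (k = number of values in the row; all packed nibbles first, then one scale per 32 values)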
655
+
656
+ #define QK8_0C 32
657
+ // q8_0c : int8_t qs[k] || float d[k/QK8_0C]   (k = number of values in the row; all quants first, then one scale per block)
651
658
652
659
// reference implementation for deterministic creation of model files
653
660
static void quantize_row_q4_0_reference (const float * restrict x , block_q4_0 * restrict y , int k ) {
@@ -937,6 +944,57 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
937
944
#endif
938
945
}
939
946
947
+ static void quantize_row_q4_0c_reference (const float * restrict x , uint8_t * restrict y , int k ) {
948
+ assert (k % QK4_0C == 0 );
949
+ const int nb = k / QK4_0 ;
950
+ const int nsb = k / QK4_0C ;
951
+
952
+ // Split y into nibbles section and scales section
953
+ uint8_t * restrict qs = y ;
954
+ float * restrict ds = (float * ) (y + QK4_0C /2 * nsb );
955
+
956
+ for (int i = 0 ; i < nb /2 ; i ++ ) {
957
+ // Interleave two output blocks in low and high nibbles
958
+ const int src0 = i + i /2 * 2 ; // 0, 1, 4, 5, 8, 9, ...
959
+ const int src1 = i + i /2 * 2 + 2 ; // 2, 3, 6, 7, 10, 11 ...
960
+ const float * xb [2 ] = {
961
+ x + QK4_0 * src0 , // block in low nibbles
962
+ x + QK4_0 * src1 , // block in high nibbles
963
+ };
964
+
965
+ // Find multiplier for each block
966
+ float d [2 ];
967
+ float id [2 ];
968
+ for (int j = 0 ; j < 2 ; j ++ ) {
969
+ float amax = 0.0f ; // absolute max
970
+
971
+ for (int l = 0 ; l < QK4_0 ; l ++ ) {
972
+ const float v = xb [j ][l ];
973
+ amax = MAX (amax , fabsf (v ));
974
+ }
975
+
976
+ d [j ] = amax / ((1 << 3 ) - 1 );
977
+ id [j ] = d [j ] ? 1.0f /d [j ] : 0.0f ;
978
+ }
979
+
980
+ ds [src0 ] = d [0 ];
981
+ ds [src1 ] = d [1 ];
982
+
983
+ for (int l = 0 ; l < QK4_0 ; l ++ ) {
984
+ const float v0 = xb [0 ][l ]* id [0 ];
985
+ const uint8_t vi0 = (int8_t )roundf (v0 ) + 8 ;
986
+
987
+ const float v1 = xb [1 ][l ]* id [1 ];
988
+ const uint8_t vi1 = (int8_t )roundf (v1 ) + 8 ;
989
+
990
+ assert (vi0 < 16 );
991
+ assert (vi1 < 16 );
992
+
993
+ qs [i * QK4_0 + l ] = vi0 | (vi1 << 4 );
994
+ }
995
+ }
996
+ }
997
+
940
998
static void quantize_row_q4_1_reference (const float * restrict x , void * restrict vy , int k ) {
941
999
assert (k % QK4_1 == 0 );
942
1000
const int nb = k / QK4_1 ;
@@ -1377,6 +1435,40 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
1377
1435
#endif
1378
1436
}
1379
1437
1438
+ // reference implementation for deterministic creation of model files
1439
+ static void quantize_row_q8_0c_reference (const float * restrict x , void * restrict y , int k ) {
1440
+ assert (k % QK8_0 == 0 );
1441
+ const int nb = k / QK8_0 ;
1442
+
1443
+ uint8_t * restrict qs = y ;
1444
+ float * restrict ds = (float * ) ((uint8_t * ) y + QK8_0C * nb );
1445
+
1446
+ for (int i = 0 ; i < nb ; i ++ ) {
1447
+ float amax = 0.0f ; // absolute max
1448
+
1449
+ for (int l = 0 ; l < QK8_0 ; l ++ ) {
1450
+ const float v = x [i * QK8_0 + l ];
1451
+ amax = MAX (amax , fabsf (v ));
1452
+ }
1453
+
1454
+ const float d = amax / ((1 << 7 ) - 1 );
1455
+ const float id = d ? 1.0f /d : 0.0f ;
1456
+
1457
+ ds [i ] = d ;
1458
+
1459
+ for (int l = 0 ; l < QK8_0 ; ++ l ) {
1460
+ const float v = x [i * QK8_0 + l ]* id ;
1461
+ qs [i * QK8_0 + l ] = roundf (v );
1462
+ }
1463
+ }
1464
+ }
1465
+
1466
+ static void quantize_row_q8_0c (const float * restrict x , void * restrict vy , int k ) {
1467
+ assert (k % QK8_0 == 0 );
1468
+
1469
+ quantize_row_q8_0c_reference (x , vy , k );
1470
+ }
1471
+
1380
1472
static void dequantize_row_q4_0 (const void * restrict vx , float * restrict y , int k ) {
1381
1473
assert (k % QK4_0 == 0 );
1382
1474
const int nb = k / QK4_0 ;
@@ -1495,6 +1587,41 @@ static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, in
1495
1587
#endif
1496
1588
}
1497
1589
1590
+ static void dequantize_row_q4_0c (const void * restrict vx , float * restrict y , int k ) {
1591
+ assert (k % QK4_0C == 0 );
1592
+ const int nb = k / QK4_0 ;
1593
+ const int nsb = k / QK4_0C ;
1594
+
1595
+ // Split vx into nibbles section and scales section
1596
+ const uint8_t * restrict qs = vx ;
1597
+ const float * restrict ds = (const float * ) ((const uint8_t * ) vx + QK4_0C /2 * nsb );
1598
+
1599
+ // scalar
1600
+ for (int i = 0 ; i < nb /2 ; i ++ ) {
1601
+ const int dst0 = i + i /2 * 2 ; // 0, 1, 4, 5, 8, 9, ...
1602
+ const int dst1 = i + i /2 * 2 + 2 ; // 2, 3, 6, 7, 10, 11 ...
1603
+
1604
+ const float d0 = ds [dst0 ];
1605
+ const float d1 = ds [dst1 ];
1606
+
1607
+ for (int l = 0 ; l < QK4_0 ; l ++ ) {
1608
+ const uint8_t vi = qs [i * QK4_0 + l ];
1609
+
1610
+ const int8_t vi0 = vi & 0xf ;
1611
+ const int8_t vi1 = vi >> 4 ;
1612
+
1613
+ const float v0 = (vi0 - 8 )* d0 ;
1614
+ const float v1 = (vi1 - 8 )* d1 ;
1615
+
1616
+ y [dst0 * QK4_0 + l ] = v0 ;
1617
+ y [dst1 * QK4_0 + l ] = v1 ;
1618
+
1619
+ assert (!isnan (y [dst0 * QK4_0 + l ]));
1620
+ assert (!isnan (y [dst1 * QK4_0 + l ]));
1621
+ }
1622
+ }
1623
+ }
1624
+
1498
1625
static void dequantize_row_q4_1 (const void * restrict vx , float * restrict y , int k ) {
1499
1626
assert (k % QK4_1 == 0 );
1500
1627
const int nb = k / QK4_1 ;
@@ -1631,6 +1758,7 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
1631
1758
}
1632
1759
1633
1760
static void ggml_vec_dot_q4_0_q8_0 (const int n , float * restrict s , const void * restrict vx , const void * restrict vy );
1761
+ static void ggml_vec_dot_q4_0c_q8_0c (const int n , float * restrict s , const void * restrict vx , const void * restrict vy );
1634
1762
static void ggml_vec_dot_q4_1_q8_0 (const int n , float * restrict s , const void * restrict vx , const void * restrict vy );
1635
1763
static void ggml_vec_dot_q4_2_q8_0 (const int n , float * restrict s , const void * restrict vx , const void * restrict vy );
1636
1764
@@ -1642,6 +1770,14 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1642
1770
.quantize_row_q_dot = quantize_row_q8_0 ,
1643
1771
.vec_dot_q = ggml_vec_dot_q4_0_q8_0 ,
1644
1772
},
1773
+ [GGML_TYPE_Q4_0C ] = {
1774
+ .dequantize_row_q = dequantize_row_q4_0c ,
1775
+ //.quantize_row_q = quantize_row_q4_0c,
1776
+ .quantize_row_q = (quantize_row_q_t ) quantize_row_q4_0c_reference ,
1777
+ .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q4_0c_reference ,
1778
+ .quantize_row_q_dot = quantize_row_q8_0c ,
1779
+ .vec_dot_q = ggml_vec_dot_q4_0c_q8_0c ,
1780
+ },
1645
1781
[GGML_TYPE_Q4_1 ] = {
1646
1782
.dequantize_row_q = dequantize_row_q4_1 ,
1647
1783
.quantize_row_q = quantize_row_q4_1 ,
@@ -1663,6 +1799,13 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
1663
1799
.quantize_row_q_dot = quantize_row_q8_0 ,
1664
1800
.vec_dot_q = NULL , // TODO
1665
1801
},
1802
+ [GGML_TYPE_Q8_0C ] = {
1803
+ .dequantize_row_q = NULL ,
1804
+ .quantize_row_q = quantize_row_q8_0c ,
1805
+ .quantize_row_q_reference = (quantize_row_q_t ) quantize_row_q8_0c_reference ,
1806
+ .quantize_row_q_dot = quantize_row_q8_0c ,
1807
+ .vec_dot_q = NULL ,
1808
+ },
1666
1809
};
1667
1810
1668
1811
// For internal test use
@@ -2460,6 +2603,51 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2460
2603
* s = sumf ;
2461
2604
}
2462
2605
2606
+ static void ggml_vec_dot_q4_0c_q8_0c (const int n , float * restrict s , const void * restrict vx , const void * restrict vy ) {
2607
+ const int nb = n / QK4_0 ;
2608
+ const int nsb = n / QK4_0C ;
2609
+
2610
+ assert (n % QK4_0C == 0 );
2611
+
2612
+ // Split into nibbles and scales sections
2613
+ const uint8_t * restrict xqs = vx ;
2614
+ const float * restrict xds = (const float * ) ((const uint8_t * ) vx + nsb * QK4_0C /2 );
2615
+ const int8_t * restrict yqs = vy ;
2616
+ const float * restrict yds = (const float * ) ((const uint8_t * ) vy + nb * QK8_0C );
2617
+
2618
+ float sumf = 0.0 ;
2619
+
2620
+ // scalar
2621
+ for (int i = 0 ; i < nb /2 ; i ++ ) {
2622
+ const int dst0 = i + i /2 * 2 ; // 0, 1, 4, 5, 8, 9, ...
2623
+ const int dst1 = i + i /2 * 2 + 2 ; // 2, 3, 6, 7, 10, 11 ...
2624
+
2625
+ const float dx0 = xds [dst0 ];
2626
+ const float dx1 = xds [dst1 ];
2627
+ const float dy0 = yds [dst0 ];
2628
+ const float dy1 = yds [dst1 ];
2629
+
2630
+ int sumi0 = 0 ;
2631
+ int sumi1 = 0 ;
2632
+
2633
+ for (int l = 0 ; l < QK4_0 ; l ++ ) {
2634
+ const uint8_t v0 = xqs [i * QK4_0 + l ];
2635
+
2636
+ const int i0 = (int8_t ) (v0 & 0xf ) - 8 ;
2637
+ const int i1 = (int8_t ) (v0 >> 4 ) - 8 ;
2638
+
2639
+ const int i2 = yqs [dst0 * QK4_0 + l ];
2640
+ const int i3 = yqs [dst1 * QK4_0 + l ];
2641
+
2642
+ sumi0 += i0 * i2 ;
2643
+ sumi1 += i1 * i3 ;
2644
+ }
2645
+ sumf += dx0 * dy0 * sumi0 + dx1 * dy1 * sumi1 ;
2646
+ }
2647
+
2648
+ * s = sumf ;
2649
+ }
2650
+
2463
2651
static void ggml_vec_dot_q4_1_q8_0 (const int n , float * restrict s , const void * restrict vx , const void * restrict vy ) {
2464
2652
const int nb = n / QK8_0 ;
2465
2653
@@ -3004,54 +3192,62 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
3004
3192
[GGML_TYPE_F32 ] = 1 ,
3005
3193
[GGML_TYPE_F16 ] = 1 ,
3006
3194
[GGML_TYPE_Q4_0 ] = QK4_0 ,
3195
+ [GGML_TYPE_Q4_0C ] = QK4_0C ,
3007
3196
[GGML_TYPE_Q4_1 ] = QK4_1 ,
3008
3197
[GGML_TYPE_Q4_2 ] = QK4_2 ,
3009
3198
[GGML_TYPE_Q8_0 ] = QK8_0 ,
3199
+ [GGML_TYPE_Q8_0C ] = QK8_0C ,
3010
3200
[GGML_TYPE_I8 ] = 1 ,
3011
3201
[GGML_TYPE_I16 ] = 1 ,
3012
3202
[GGML_TYPE_I32 ] = 1 ,
3013
3203
};
3014
- static_assert (GGML_TYPE_COUNT == 9 , "GGML_BLCK_SIZE is outdated" );
3204
+ static_assert (GGML_TYPE_COUNT == 11 , "GGML_BLCK_SIZE is outdated" );
3015
3205
3016
3206
// Storage size in bytes for each ggml type, indexed by the type enum value.
// NOTE(review): for the quantized types this looks like the byte size of one
// group of GGML_BLCK_SIZE elements - confirm against the users of this table.
static const size_t GGML_TYPE_SIZE [GGML_TYPE_COUNT ] = {
3017
3207
[GGML_TYPE_F32 ] = sizeof (float ),
3018
3208
[GGML_TYPE_F16 ] = sizeof (ggml_fp16_t ),
3019
3209
[GGML_TYPE_Q4_0 ] = sizeof (block_q4_0 ),
3210
// q4_0c has no struct of its own. QK4_0C is 4*32 values, for which the
// reference quantizer writes QK4_0C/2 nibble bytes plus one float scale per
// 32 values - presumably the same byte count as four block_q4_0; confirm
// against the block_q4_0 definition.
+ [GGML_TYPE_Q4_0C ] = 4 * sizeof (block_q4_0 ),
3020
3211
[GGML_TYPE_Q4_1 ] = sizeof (block_q4_1 ),
3021
3212
[GGML_TYPE_Q4_2 ] = sizeof (block_q4_2 ),
3022
3213
[GGML_TYPE_Q8_0 ] = sizeof (block_q8_0 ),
3214
// q8_0c stores QK8_0C one-byte quants plus one float scale per block, in
// separate sections. This matches sizeof(block_q8_0) as long as
// QK8_0 == QK8_0C - TODO confirm.
+ [GGML_TYPE_Q8_0C ] = sizeof (block_q8_0 ),
3023
3215
[GGML_TYPE_I8 ] = sizeof (int8_t ),
3024
3216
[GGML_TYPE_I16 ] = sizeof (int16_t ),
3025
3217
[GGML_TYPE_I32 ] = sizeof (int32_t ),
3026
3218
};
3027
- static_assert (GGML_TYPE_COUNT == 9 , "GGML_TYPE_SIZE is outdated" );
3219
+ static_assert (GGML_TYPE_COUNT == 11 , "GGML_TYPE_SIZE is outdated" );
3028
3220
3029
3221
3030
3222
// Short lower-case name string for each ggml type, indexed by the type enum value.
static const char * GGML_TYPE_NAME [GGML_TYPE_COUNT ] = {
3031
3223
[GGML_TYPE_F32 ] = "f32" ,
3032
3224
[GGML_TYPE_F16 ] = "f16" ,
3033
3225
[GGML_TYPE_Q4_0 ] = "q4_0" ,
3226
+ [GGML_TYPE_Q4_0C ] = "q4_0c" ,
3034
3227
[GGML_TYPE_Q4_1 ] = "q4_1" ,
3035
3228
[GGML_TYPE_Q4_2 ] = "q4_2" ,
3036
3229
[GGML_TYPE_Q8_0 ] = "q8_0" ,
3230
+ [GGML_TYPE_Q8_0C ] = "q8_0c" ,
3037
3231
[GGML_TYPE_I8 ] = "i8" ,
3038
3232
[GGML_TYPE_I16 ] = "i16" ,
3039
3233
[GGML_TYPE_I32 ] = "i32" ,
3040
3234
};
3041
- static_assert (GGML_TYPE_COUNT == 9 , "GGML_TYPE_NAME is outdated" );
3235
+ static_assert (GGML_TYPE_COUNT == 11 , "GGML_TYPE_NAME is outdated" );
3042
3236
3043
3237
// Per-type flag, indexed by the type enum value: true for the Q4_* / Q8_*
// quantized formats, false for the plain float and integer types.
static bool GGML_IS_QUANTIZED [GGML_TYPE_COUNT ] = {
3044
3238
[GGML_TYPE_F32 ] = false,
3045
3239
[GGML_TYPE_F16 ] = false,
3046
3240
[GGML_TYPE_Q4_0 ] = true,
3241
+ [GGML_TYPE_Q4_0C ] = true,
3047
3242
[GGML_TYPE_Q4_1 ] = true,
3048
3243
[GGML_TYPE_Q4_2 ] = true,
3049
3244
[GGML_TYPE_Q8_0 ] = true,
3245
+ [GGML_TYPE_Q8_0C ] = true,
3050
3246
[GGML_TYPE_I8 ] = false,
3051
3247
[GGML_TYPE_I16 ] = false,
3052
3248
[GGML_TYPE_I32 ] = false,
3053
3249
};
3054
- static_assert (GGML_TYPE_COUNT == 9 , "GGML_IS_QUANTIZED is outdated" );
3250
+ static_assert (GGML_TYPE_COUNT == 11 , "GGML_IS_QUANTIZED is outdated" );
3055
3251
3056
3252
static const char * GGML_OP_LABEL [GGML_OP_COUNT ] = {
3057
3253
"NONE" ,
@@ -7873,9 +8069,11 @@ static void ggml_compute_forward_mul_mat(
7873
8069
struct ggml_tensor * dst ) {
7874
8070
switch (src0 -> type ) {
7875
8071
case GGML_TYPE_Q4_0 :
8072
+ case GGML_TYPE_Q4_0C :
7876
8073
case GGML_TYPE_Q4_1 :
7877
8074
case GGML_TYPE_Q4_2 :
7878
8075
case GGML_TYPE_Q8_0 :
8076
+ case GGML_TYPE_Q8_0C :
7879
8077
{
7880
8078
ggml_compute_forward_mul_mat_q_f32 (params , src0 , src1 , dst );
7881
8079
} break ;
@@ -8129,9 +8327,11 @@ static void ggml_compute_forward_get_rows(
8129
8327
struct ggml_tensor * dst ) {
8130
8328
switch (src0 -> type ) {
8131
8329
case GGML_TYPE_Q4_0 :
8330
+ case GGML_TYPE_Q4_0C :
8132
8331
case GGML_TYPE_Q4_1 :
8133
8332
case GGML_TYPE_Q4_2 :
8134
8333
case GGML_TYPE_Q8_0 :
8334
+ case GGML_TYPE_Q8_0C :
8135
8335
{
8136
8336
ggml_compute_forward_get_rows_q (params , src0 , src1 , dst );
8137
8337
} break ;
0 commit comments