@@ -1725,17 +1725,17 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_0c_reference(const float * restrict x, void * restrict y, int k) {
-    assert(k % QK8_0 == 0);
-    const int nb = k / QK8_0;
+    assert(k % QK8_0C == 0);
+    const int nb = k / QK8_0C;
 
     uint8_t * restrict qs = y;
     float * restrict ds = (float *) ((uint8_t *) y + QK8_0C * nb);
 
     for (int i = 0; i < nb; i++) {
         float amax = 0.0f; // absolute max
 
-        for (int l = 0; l < QK8_0; l++) {
-            const float v = x[i*QK8_0 + l];
+        for (int l = 0; l < QK8_0C; l++) {
+            const float v = x[i*QK8_0C + l];
             amax = MAX(amax, fabsf(v));
         }
 
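Note on the layout this hunk's pointer math implies: q8_0c stores a row as all quantized bytes first, followed by all per-block scales, instead of q8_0's interleaved per-block structs. A minimal indexing sketch, assuming QK8_0C == 32 (suggested by the two 16-float halves in the SIMD path below); the helper names here are hypothetical, not from the PR:

```c
#include <stddef.h>
#include <stdint.h>

#define QK8_0C 32  // assumed block size

// All quants for the row come first...
static inline int8_t * q8_0c_quants(void * row) {
    return (int8_t *) row;
}

// ...followed by one float scale per block, as in `ds` above.
static inline float * q8_0c_scales(void * row, int nb) {
    return (float *) ((uint8_t *) row + (size_t) nb * QK8_0C);
}

// Dequantize element l of block i from a row of nb blocks.
static inline float q8_0c_get(void * row, int nb, int i, int l) {
    return q8_0c_scales(row, nb)[i] * (float) q8_0c_quants(row)[i*QK8_0C + l];
}
```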
@@ -1744,17 +1744,46 @@ static void quantize_row_q8_0c_reference(const float * restrict x, void * restri
 
         ds[i] = d;
 
-        for (int l = 0; l < QK8_0; ++l) {
-            const float v = x[i*QK8_0 + l]*id;
-            qs[i*QK8_0 + l] = roundf(v);
+        for (int l = 0; l < QK8_0C; ++l) {
+            const float v = x[i*QK8_0C + l]*id;
+            qs[i*QK8_0C + l] = roundf(v);
         }
     }
 }
 
 static void quantize_row_q8_0c(const float * restrict x, void * restrict vy, int k) {
-    assert(k % QK8_0 == 0);
+    assert(k % QK8_0C == 0);
+    const int nb = k / QK8_0C;
+
+    int8_t * restrict qs = vy;
+    float * restrict ds = (float *) ((uint8_t *) vy + nb*QK8_0C);
+
+#if __AVX512F__
+    for (int i = 0; i < nb; i++) {
+        const __m512 x0 = _mm512_loadu_ps( x + i*QK8_0C );
+        const __m512 x1 = _mm512_loadu_ps( x + i*QK8_0C + QK8_0C/2 );
+
+        // Find absolute max
+        const __m512 x0abs = _mm512_abs_ps(x0);
+        const __m512 x1abs = _mm512_abs_ps(x1);
+        const float amax = _mm512_reduce_max_ps(_mm512_max_ps(x0abs, x1abs));
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        ds[i] = d;
+
+        const __m512 mul = _mm512_set1_ps( id );
+        const __m512i x0q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x0, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        const __m512i x1q = _mm512_cvt_roundps_epi32(_mm512_mul_ps(x1, mul), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
 
+        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C, 0xffff, x0q);
+        _mm512_mask_cvtepi32_storeu_epi8(qs + i*QK8_0C + QK8_0C/2, 0xffff, x1q);
+    }
+#else
+    // scalar
     quantize_row_q8_0c_reference(x, vy, k);
+#endif
 }
 
 static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) {
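The AVX-512 path above mirrors the reference per block: take the absolute max over 32 floats (loaded as two 16-lane vectors), derive the scale `d = amax / 127`, and round each scaled value to the nearest int8. One observation worth noting: `_MM_FROUND_TO_NEAREST_INT` rounds ties to even, while the reference's `roundf()` rounds ties away from zero, so the two paths can differ on exact halfway values. A scalar sketch of what the vector path computes, assuming QK8_0C == 32:

```c
#include <math.h>
#include <stdint.h>

static void quantize_block_sketch(const float * x, int8_t * qs, float * d_out) {
    float amax = 0.0f;
    for (int l = 0; l < 32; l++) {
        amax = fmaxf(amax, fabsf(x[l]));   // absolute max over the block
    }
    const float d  = amax / 127.0f;        // (1 << 7) - 1 == 127
    const float id = d ? 1.0f/d : 0.0f;
    *d_out = d;
    for (int l = 0; l < 32; l++) {
        // nearbyintf() under the default FP environment rounds ties to even,
        // matching _MM_FROUND_TO_NEAREST_INT in the vector path.
        qs[l] = (int8_t) nearbyintf(x[l] * id);
    }
}
```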
@@ -2780,6 +2809,73 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float
     *s = sumf;
 }
 
+#if __AVX512F__ && QK4_0 == 32
+
+// Dot product of four blocks of q4_0c with four blocks of q8_0c
+static inline __m512 dot_q4_0c_fourblocks_avx512(
+    __m512 acc,
+    const uint8_t * restrict xqs,
+    const float * restrict xds,
+    const int8_t * restrict yqs,
+    const float * restrict yds
+) {
+    // load quantized bytes
+    // TODO: change back to aligned loads
+    const __m512i xqs0123 = _mm512_loadu_epi64( xqs );
+    const __m512i low_nibble_mask = _mm512_set1_epi8( 0xf );
+    const __m512i xqs01 = _mm512_and_si512( low_nibble_mask, xqs0123 );
+    // TODO: try srlv/i?
+    const __m512i xqs23 = _mm512_and_si512( low_nibble_mask, _mm512_srli_epi32( xqs0123, 4 ) );
+    const __m512i yqs01 = _mm512_loadu_epi64( yqs );
+    const __m512i yqs23 = _mm512_loadu_epi64( yqs + 2*QK8_0C );
+
+    // load scales
+    const __m512i scale_mask0 = _mm512_set_epi32(1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m512i scale_mask1 = _mm512_set_epi32(3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2);
+    const __m128 xyds = _mm_mul_ps(_mm_load_ps(xds), _mm_load_ps(yds));
+    const __m512 xyds0123 = _mm512_broadcast_f32x4(xyds);
+    const __m512 xyds01 = _mm512_permutevar_ps(xyds0123, scale_mask0);
+    const __m512 xyds23 = _mm512_permutevar_ps(xyds0123, scale_mask1);
+
+    // take dot product of x and y bytes
+    const __m512i plus_8 = _mm512_set1_epi8( 8 );
+#ifdef __AVX512VNNI__
+    // We have VPDPBUSDS in AVX512-VNNI, which does exactly what we want, but with a catch:
+    // the *left* operand is supposed to be unsigned, while Q4_0 quantization subtracts 8
+    // from each nibble, so they can be negative. So, instead of `(xqs01 - 8) * yqs01`,
+    // we compute `xqs01 * yqs01 - 8 * yqs01`.
+    const __m512i zero = _mm512_setzero_epi32();
+    const __m512i yqs01_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs01 );
+    const __m512i yqs23_mul8 = _mm512_dpbusds_epi32( zero, plus_8, yqs23 );
+    const __m512i xy01 = _mm512_dpbusds_epi32( zero, xqs01, yqs01 );
+    const __m512i xy23 = _mm512_dpbusds_epi32( zero, xqs23, yqs23 );
+    const __m512i res0_int = _mm512_sub_epi32( xy01, yqs01_mul8 );
+    const __m512i res1_int = _mm512_sub_epi32( xy23, yqs23_mul8 );
+#else
+    // As a fallback, we have VPMADDUBSW in AVX512-BW, which uses 16-bit products instead of 32-bit ones.
+    // It has the same catch as VPDPBUSDS: the left operand should be unsigned.
+    // This is essentially the AVX-512 version of the AVX-2 trick used by GH user Const-me
+    // ref: https://gist.github.com/Const-me/4d30e1fc767ab314596e16e90f53b6f4#file-matmultest-cpp-L119
+    const __m512i one = _mm512_set1_epi16( 1 );
+    const __m512i prod_0 = _mm512_maddubs_epi16( xqs01, yqs01 );
+    const __m512i prod_1 = _mm512_maddubs_epi16( plus_8, yqs01 );
+    const __m512i prod_2 = _mm512_maddubs_epi16( xqs23, yqs23 );
+    const __m512i prod_3 = _mm512_maddubs_epi16( plus_8, yqs23 );
+    const __m512i diff0 = _mm512_sub_epi16( prod_0, prod_1 );
+    const __m512i diff1 = _mm512_sub_epi16( prod_2, prod_3 );
+    const __m512i res0_int = _mm512_madd_epi16( diff0, one );
+    const __m512i res1_int = _mm512_madd_epi16( diff1, one );
+#endif
+
+    // Finally, we multiply the permuted scales and the 32-bit dot products, then accumulate.
+    const __m512 res0_float = _mm512_cvtepi32_ps( res0_int );
+    const __m512 res1_float = _mm512_cvtepi32_ps( res1_int );
+
+    return _mm512_fmadd_ps( xyds23, res1_float,
+                            _mm512_fmadd_ps( xyds01, res0_float, acc ));
+}
+#endif
+
 
 inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     ggml_float sumf = 0.0;
@@ -2999,6 +3095,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
 
     float sumf = 0.0;
 
+#if __AVX512F__
+    // Initialize accumulator with zeros
+    __m512 acc = _mm512_setzero_ps();
+    for (int i = 0; i < nb; i += 4) {
+        acc = dot_q4_0c_fourblocks_avx512(acc, xqs + i*QK4_0/2, xds + i, yqs + i*QK8_0, yds + i);
+    }
+    // Horizontal sum of all lanes of the accumulator
+    sumf = _mm512_reduce_add_ps( acc );
+#else
     // scalar
     for (int i = 0; i < nb/2; i++) {
         const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
@@ -3009,23 +3114,25 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
         const float dy0 = yds[dst0];
         const float dy1 = yds[dst1];
 
-        int sumi0 = 0;
-        int sumi1 = 0;
+        // NOTE: having these as plain int triggers a bug with AVX512 on GCC 12.2
+        int64_t sumi0 = 0;
+        int64_t sumi1 = 0;
 
         for (int l = 0; l < QK4_0; l++) {
-            const uint8_t v0 = xqs[i*QK4_0 + l];
+            const uint8_t v0 = xqs[i*QK4_0 + l];
 
-            const int i0 = (int8_t) (v0 & 0xf) - 8;
-            const int i1 = (int8_t) (v0 >> 4) - 8;
+            const int i0 = (int) (v0 & 0xf) - 8;
+            const int i1 = (int) (v0 >> 4) - 8;
 
-            const int i2 = yqs[dst0*QK4_0 + l];
-            const int i3 = yqs[dst1*QK4_0 + l];
+            const int i2 = yqs[dst0*QK4_0 + l];
+            const int i3 = yqs[dst1*QK4_0 + l];
 
-            sumi0 += i0*i2;
-            sumi1 += i1*i3;
+            sumi0 += i0*i2;
+            sumi1 += i1*i3;
         }
         sumf += dx0*dy0*sumi0 + dx1*dy1*sumi1;
     }
+#endif
 
     *s = sumf;
 }
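Two notes on the scalar fallback, for context. First, the AVX-512 driver consumes four Q4_0 blocks per call (`i += 4`), i.e. one 512-bit load of 128 nibbles, which is why `dot_q4_0c_fourblocks_avx512` works on block quads. Second, the `dst0` expression encodes the q4_0c nibble interleaving: x byte group `i` carries the low nibbles of one y block and the high nibbles of another (`dst1` is defined outside the hunks shown; `dst0 + 2` is assumed below from the access pattern). A throwaway sketch of the mapping:

```c
#include <stdio.h>

int main(void) {
    for (int i = 0; i < 8; i++) {
        const int dst0 = i + i/2*2;   // 0, 1, 4, 5, 8, 9, 12, 13, ...
        const int dst1 = dst0 + 2;    // 2, 3, 6, 7, ... (assumed; see note above)
        printf("x byte group %d -> low nibbles: block %2d, high nibbles: block %2d\n",
               i, dst0, dst1);
    }
    return 0;
}
```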