@@ -2848,10 +2848,17 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
2848
2848
float sumf = 0.0 ;
2849
2849
2850
2850
#if defined(__ARM_NEON )
2851
+ const int ahead = 80 ;
2851
2852
float32x4_t sumv0 = vdupq_n_f32 (0.0f );
2852
2853
float32x4_t sumv1 = vdupq_n_f32 (0.0f );
2853
2854
2854
2855
for (int i = 0 ; i < nb /2 ; i ++ ) {
2856
+ __builtin_prefetch (& xqs [i * QK4_0 + 64 * ahead ]);
2857
+ __builtin_prefetch (& yqs [2 * i * QK8_0C + 64 * ahead ]);
2858
+ __builtin_prefetch (& yqs [2 * i * QK8_0C + 64 * ahead + 64 ]);
2859
+ __builtin_prefetch (& xds [2 * i + 64 /4 * ahead ]);
2860
+ __builtin_prefetch (& yds [2 * i + 64 /4 * ahead ]);
2861
+
2855
2862
const int dst0 = i + i /2 * 2 ; // 0, 1, 4, 5, 8, 9, ...
2856
2863
const int dst1 = i + i /2 * 2 + 2 ; // 2, 3, 6, 7, 10, 11 ...
2857
2864
@@ -2910,9 +2917,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
2910
2917
sumf = vaddvq_f32 (sumv0 ) + vaddvq_f32 (sumv1 );
2911
2918
2912
2919
#elif defined(__AVX512F__ )
2920
+ const int ahead = 64 ;
2913
2921
// Initialize accumulator with zeros
2914
2922
__m512 acc = _mm512_setzero_ps ();
2915
2923
for (int i = 0 ; i < nb ; i += 4 ) {
2924
+ _mm_prefetch (xqs + i * QK4_0 /2 + 64 * ahead , _MM_HINT_T0 );
2925
+ _mm_prefetch (yqs + i * QK8_0 + 64 * ahead , _MM_HINT_T0 );
2926
+ _mm_prefetch (yqs + i * QK8_0 + 64 * ahead + 64 , _MM_HINT_T0 );
2927
+ _mm_prefetch (xds + i + 64 /4 * ahead , _MM_HINT_T0 );
2928
+ _mm_prefetch (yds + i + 64 /4 * ahead , _MM_HINT_T0 );
2916
2929
acc = dot_q4_0c_fourblocks_avx512 (acc , xqs + i * QK4_0 /2 , xds + i , yqs + i * QK8_0 , yds + i );
2917
2930
}
2918
2931
// Horizontal sum of all lanes of the accumulator
0 commit comments