@@ -3126,10 +3126,17 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
3126
3126
float sumf = 0.0 ;
3127
3127
3128
3128
#if defined(__ARM_NEON )
3129
+ const int ahead = 80 ;
3129
3130
float32x4_t sumv0 = vdupq_n_f32 (0.0f );
3130
3131
float32x4_t sumv1 = vdupq_n_f32 (0.0f );
3131
3132
3132
3133
for (int i = 0 ; i < nb /2 ; i ++ ) {
3134
+ __builtin_prefetch (& xqs [i * QK4_0 + 64 * ahead ]);
3135
+ __builtin_prefetch (& yqs [2 * i * QK8_0C + 64 * ahead ]);
3136
+ __builtin_prefetch (& yqs [2 * i * QK8_0C + 64 * ahead + 64 ]);
3137
+ __builtin_prefetch (& xds [2 * i + 64 /4 * ahead ]);
3138
+ __builtin_prefetch (& yds [2 * i + 64 /4 * ahead ]);
3139
+
3133
3140
const int dst0 = i + i /2 * 2 ; // 0, 1, 4, 5, 8, 9, ...
3134
3141
const int dst1 = i + i /2 * 2 + 2 ; // 2, 3, 6, 7, 10, 11 ...
3135
3142
@@ -3188,9 +3195,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
3188
3195
sumf = vaddvq_f32 (sumv0 ) + vaddvq_f32 (sumv1 );
3189
3196
3190
3197
#elif defined(__AVX512F__ )
3198
+ const int ahead = 64 ;
3191
3199
// Initialize accumulator with zeros
3192
3200
__m512 acc = _mm512_setzero_ps ();
3193
3201
for (int i = 0 ; i < nb ; i += 4 ) {
3202
+ _mm_prefetch (xqs + i * QK4_0 /2 + 64 * ahead , _MM_HINT_T0 );
3203
+ _mm_prefetch (yqs + i * QK8_0 + 64 * ahead , _MM_HINT_T0 );
3204
+ _mm_prefetch (yqs + i * QK8_0 + 64 * ahead + 64 , _MM_HINT_T0 );
3205
+ _mm_prefetch (xds + i + 64 /4 * ahead , _MM_HINT_T0 );
3206
+ _mm_prefetch (yds + i + 64 /4 * ahead , _MM_HINT_T0 );
3194
3207
acc = dot_q4_0c_fourblocks_avx512 (acc , xqs + i * QK4_0 /2 , xds + i , yqs + i * QK8_0 , yds + i );
3195
3208
}
3196
3209
// Horizontal sum of all lanes of the accumulator
0 commit comments