Skip to content

Commit 58e10f2

Browse files
committed
q4_0c: prefetch on AVX-512 and ARM
The effect seems significant, especially for evaluation time.
1 parent 64a6a29 commit 58e10f2

File tree

1 file changed

+13
-0
lines changed

1 file changed

+13
-0
lines changed

ggml.c

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2848,10 +2848,17 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
28482848
float sumf = 0.0;
28492849

28502850
#if defined(__ARM_NEON)
2851+
const int ahead=80;
28512852
float32x4_t sumv0 = vdupq_n_f32(0.0f);
28522853
float32x4_t sumv1 = vdupq_n_f32(0.0f);
28532854

28542855
for (int i = 0; i < nb/2; i++) {
2856+
__builtin_prefetch(&xqs[i*QK4_0 + 64*ahead]);
2857+
__builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead]);
2858+
__builtin_prefetch(&yqs[2*i*QK8_0C + 64*ahead + 64]);
2859+
__builtin_prefetch(&xds[2*i + 64/4*ahead]);
2860+
__builtin_prefetch(&yds[2*i + 64/4*ahead]);
2861+
28552862
const int dst0 = i + i/2*2; // 0, 1, 4, 5, 8, 9, ...
28562863
const int dst1 = i + i/2*2 + 2; // 2, 3, 6, 7, 10, 11 ...
28572864

@@ -2910,9 +2917,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
29102917
sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
29112918

29122919
#elif defined(__AVX512F__)
2920+
const int ahead = 64;
29132921
// Initialize accumulator with zeros
29142922
__m512 acc = _mm512_setzero_ps();
29152923
for (int i = 0; i < nb; i += 4) {
2924+
_mm_prefetch(xqs + i*QK4_0/2 + 64*ahead, _MM_HINT_T0);
2925+
_mm_prefetch(yqs + i*QK8_0 + 64*ahead, _MM_HINT_T0);
2926+
_mm_prefetch(yqs + i*QK8_0 + 64*ahead + 64, _MM_HINT_T0);
2927+
_mm_prefetch(xds + i + 64/4*ahead, _MM_HINT_T0);
2928+
_mm_prefetch(yds + i + 64/4*ahead, _MM_HINT_T0);
29162929
acc = dot_q4_0c_fourblocks_avx512(acc, xqs + i*QK4_0/2, xds + i, yqs + i*QK8_0, yds + i);
29172930
}
29182931
// Horizontal sum of all lanes of the accumulator

0 commit comments

Comments
 (0)