@@ -3917,8 +3917,8 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
         __m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0);
         __m256i xq8_1 = _mm256_packs_epi16(xq8l1, xq8h1);
 
-        __m256i yq8_0 = _mm256_lddqu_si256 ((const __m256i *) (y[i + 0].qs));
-        __m256i yq8_1 = _mm256_lddqu_si256 ((const __m256i *) (y[i + 1].qs));
+        __m256i yq8_0 = _mm256_loadu_si256 ((const __m256i *) (y[i + 0].qs));
+        __m256i yq8_1 = _mm256_loadu_si256 ((const __m256i *) (y[i + 1].qs));
 
         const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0);
         const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1);
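Note on this hunk: on any CPU with AVX2, `_mm256_lddqu_si256` and `_mm256_loadu_si256` perform the same unaligned 256-bit load (the `lddqu` variant only ever behaved differently on old pre-AVX hardware), so the swap is about using the conventional intrinsic rather than performance. For reference, a scalar sketch of what each vector block above computes, assuming `mul_sum_i8_pairs_float(x, y)` returns 8 floats whose total is the int8 dot product of its two 32-byte inputs:

    #include <stdint.h>

    // Scalar reference: dot product of one 32-element int8 block,
    // i.e. the horizontal sum of one mul_sum_i8_pairs_float() result.
    static float dot_i8_32(const int8_t * x, const int8_t * y) {
        int32_t sum = 0;
        for (int j = 0; j < 32; ++j) {
            sum += (int32_t) x[j] * (int32_t) y[j];
        }
        return (float) sum;
    }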
@@ -3951,7 +3951,7 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
         xq8h = _mm256_srai_epi16(xq8h, 14);
         xq8 = _mm256_packs_epi16(xq8l, xq8h);
 
-        __m256i yq8 = _mm256_lddqu_si256 ((const __m256i *) (y[i].qs));
+        __m256i yq8 = _mm256_loadu_si256 ((const __m256i *) (y[i].qs));
         const __m256 q = mul_sum_i8_pairs_float(xq8, yq8);
 
         acc = _mm256_fmadd_ps( d, q, acc );
@@ -11371,11 +11371,12 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
     __m256 accumf = _mm256_setzero_ps();
 
     for (int i = 0; i < nb; ++i) {
-        // __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
-        // __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
-        // WARNING: reading 3 bytes further than necessary. It's faster than the above on my CPU, though.
-        __m128i x12b = _mm_loadu_si128((const __m128i_u *) x[i].q);
-        __m256i x12 = MM256_SET_M128I(x12b, x12b);
+        // const __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
+        // const __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
+        // WARNING: reading 3 bytes further than necessary.
+        // It's measurably faster than a masked load on an Intel Core m3-8100Y
+        const __m128i x12b = _mm_loadu_si128((const __m128i_u *) (x[i].q));
+        const __m256i x12 = MM256_SET_M128I(x12b, x12b);
 
         {
             __m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
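Note on this hunk: a `q1_3` block appears to hold 13 payload bytes (12 in `x[i].q` plus one more), so the 16-byte `_mm_loadu_si128` reads 3 bytes past the block, as the warning says; the commented-out masked load and `_mm_insert_epi8` variants avoid that at some cost. A minimal overread-free alternative, assuming a hypothetical block layout of 12 bytes in `.q` followed by at least one byte in `.qs` (mirroring what the commented-out code implies):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    // Hypothetical layout inferred from the masked-load comment above.
    typedef struct { uint8_t q[12]; uint8_t qs[1]; } block_q1_3_sketch;

    // Load the 13 payload bytes without reading past the block.
    static __m128i load_q1_3_safe(const block_q1_3_sketch * b) {
        uint8_t tmp[16] = {0};
        memcpy(tmp, b->q, 12);  // the 12 packed ternary bytes
        tmp[12] = b->qs[0];     // the 13th byte, as in the _mm_insert_epi8 variant
        return _mm_loadu_si128((const __m128i *) tmp);
    }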
@@ -11406,6 +11407,7 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
                                                      1, 1, 1, 1,
                                                      3, 9, 27, 81,
                                                      3, 9, 27, 81);
+            // extract ternary values
             x0l = _mm256_mullo_epi16(x0l, shift0);
             x0h = _mm256_mullo_epi16(x0h, shift0);
             x1l = _mm256_mullo_epi16(x1l, shift1l);
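Note on the new comment: the "extract ternary values" step works digit by digit. `_mm256_mullo_epi16` scales each 16-bit lane by a power of 3 (1, 3, 9, 27, 81), and the later `_mm256_mulhi_epu16(v, 3)` computes `(v * 3) >> 16`, which discards everything below the targeted base-3 digit. A scalar transliteration of that lane-wise decode, assuming the `q1_3` bytes are packed so this fixed-point trick yields a digit in {0, 1, 2}:

    #include <stdint.h>

    // One lane of the vector decode above: scale by 3^k (mullo),
    // then take the unsigned high half of a multiply by 3 (mulhi_epu16).
    static uint16_t extract_trit(uint8_t byte, uint16_t pow3) {
        uint16_t v = (uint16_t) (byte * pow3);         // _mm256_mullo_epi16
        return (uint16_t) (((uint32_t) v * 3) >> 16);  // _mm256_mulhi_epu16(v, 3)
    }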
@@ -11414,22 +11416,22 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
             x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3));
             x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3));
             x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3));
-            x0l = _mm256_sub_epi16(x0l, _mm256_set1_epi16(1));
-            x0h = _mm256_sub_epi16(x0h, _mm256_set1_epi16(1));
-            x1l = _mm256_sub_epi16(x1l, _mm256_set1_epi16(1));
-            x1h = _mm256_sub_epi16(x1h, _mm256_set1_epi16(1));
 
             __m256i x0 = _mm256_packs_epi16(x0l, x0h);
             __m256i x1 = _mm256_packs_epi16(x1l, x1h);
 
-            __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 0].qs));
-            __m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs));
+            // 0, 1, 2 => -1, 0, 1
+            x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1));
+            x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1));
 
-            __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32( y[2*i].d ));
-            __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32( y[2*i + 1].d ));
+            const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) ( y[2*i + 0].qs ));
+            const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) ( y[2*i + 1].qs ));
 
-            __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
-            __m256 q1 = mul_sum_i8_pairs_float(x1, y1);
+            const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 0].d));
+            const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
+
+            const __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
+            const __m256 q1 = mul_sum_i8_pairs_float(x1, y1);
 
             accumf = _mm256_fmadd_ps(d0, q0, accumf);
             accumf = _mm256_fmadd_ps(d1, q1, accumf);
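Note on this hunk: besides the `lddqu` to `loadu` rename, the substantive change moves the `- 1` offset after the pack, turning four 16-bit `_mm256_sub_epi16` calls into two 8-bit `_mm256_sub_epi8` calls. This is safe because every extracted digit is in {0, 1, 2}, so the signed-saturating 16-to-8 pack is exact, and subtracting a broadcast constant commutes with the lane interleaving that `_mm256_packs_epi16` performs. A scalar sketch of the equivalence, assuming `d16` holds an extracted digit in {0, 1, 2}:

    #include <stdint.h>

    // Packing first and subtracting once on 8 bits gives the same result
    // as subtracting on 16 bits before the pack, for digits in {0, 1, 2}.
    static int8_t digit_to_signed(int16_t d16) {
        int8_t packed = (int8_t) d16;  // _mm256_packs_epi16, exact for 0..2
        return (int8_t) (packed - 1);  // _mm256_sub_epi8: {0,1,2} -> {-1,0,1}
    }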