@@ -3970,8 +3970,8 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
3970
3970
__m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0);
3971
3971
__m256i xq8_1 = _mm256_packs_epi16(xq8l1, xq8h1);
3972
3972
3973
- __m256i yq8_0 = _mm256_lddqu_si256 ((const __m256i *) (y[i + 0].qs));
3974
- __m256i yq8_1 = _mm256_lddqu_si256 ((const __m256i *) (y[i + 1].qs));
3973
+ __m256i yq8_0 = _mm256_loadu_si256 ((const __m256i *) (y[i + 0].qs));
3974
+ __m256i yq8_1 = _mm256_loadu_si256 ((const __m256i *) (y[i + 1].qs));
3975
3975
3976
3976
const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0);
3977
3977
const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1);
@@ -4004,7 +4004,7 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
4004
4004
xq8h = _mm256_srai_epi16(xq8h, 14);
4005
4005
xq8 = _mm256_packs_epi16(xq8l, xq8h);
4006
4006
4007
- __m256i yq8 = _mm256_lddqu_si256 ((const __m256i *) (y[i].qs));
4007
+ __m256i yq8 = _mm256_loadu_si256 ((const __m256i *) (y[i].qs));
4008
4008
const __m256 q = mul_sum_i8_pairs_float(xq8, yq8);
4009
4009
4010
4010
acc = _mm256_fmadd_ps( d, q, acc );
@@ -11009,11 +11009,12 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
11009
11009
__m256 accumf = _mm256_setzero_ps();
11010
11010
11011
11011
for (int i = 0; i < nb; ++i) {
11012
- // __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
11013
- // __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
11014
- // WARNING: reading 3 bytes further than necessary. It's faster than the above on my CPU, though.
11015
- __m128i x12b = _mm_loadu_si128((const __m128i_u *) x[i].q);
11016
- __m256i x12 = MM256_SET_M128I(x12b, x12b);
11012
+ // const __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
11013
+ // const __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
11014
+ // WARNING: reading 3 bytes further than necessary.
11015
+ // It's measurably faster than a masked load on an Intel Core m3-8100Y
11016
+ const __m128i x12b = _mm_loadu_si128((const __m128i_u *) (x[i].q));
11017
+ const __m256i x12 = MM256_SET_M128I(x12b, x12b);
11017
11018
11018
11019
{
11019
11020
__m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
@@ -11044,6 +11045,7 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
11044
11045
1, 1, 1, 1,
11045
11046
3, 9, 27, 81,
11046
11047
3, 9, 27, 81);
11048
+ // extract ternary values
11047
11049
x0l = _mm256_mullo_epi16(x0l, shift0);
11048
11050
x0h = _mm256_mullo_epi16(x0h, shift0);
11049
11051
x1l = _mm256_mullo_epi16(x1l, shift1l);
@@ -11052,22 +11054,22 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
11052
11054
x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3));
11053
11055
x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3));
11054
11056
x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3));
11055
- x0l = _mm256_sub_epi16(x0l, _mm256_set1_epi16(1));
11056
- x0h = _mm256_sub_epi16(x0h, _mm256_set1_epi16(1));
11057
- x1l = _mm256_sub_epi16(x1l, _mm256_set1_epi16(1));
11058
- x1h = _mm256_sub_epi16(x1h, _mm256_set1_epi16(1));
11059
11057
11060
11058
__m256i x0 = _mm256_packs_epi16(x0l, x0h);
11061
11059
__m256i x1 = _mm256_packs_epi16(x1l, x1h);
11062
11060
11063
- __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 0].qs));
11064
- __m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs));
11061
+ // 0, 1, 2 => -1, 0, 1
11062
+ x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1));
11063
+ x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1));
11065
11064
11066
- __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32( y[2*i].d ));
11067
- __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32( y[2*i + 1].d ));
11065
+ const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) ( y[2*i + 0].qs ));
11066
+ const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) ( y[2*i + 1].qs ));
11068
11067
11069
- __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
11070
- __m256 q1 = mul_sum_i8_pairs_float(x1, y1);
11068
+ const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 0].d));
11069
+ const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
11070
+
11071
+ const __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
11072
+ const __m256 q1 = mul_sum_i8_pairs_float(x1, y1);
11071
11073
11072
11074
accumf = _mm256_fmadd_ps(d0, q0, accumf);
11073
11075
accumf = _mm256_fmadd_ps(d1, q1, accumf);
0 commit comments