Skip to content

Commit 92709cc

Browse files
committed
wip : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
1 parent d6fd678 commit 92709cc

File tree

1 file changed

+20
-18
lines changed

1 file changed

+20
-18
lines changed

ggml-quants.c

Lines changed: 20 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -3970,8 +3970,8 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
39703970
__m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0);
39713971
__m256i xq8_1 = _mm256_packs_epi16(xq8l1, xq8h1);
39723972

3973-
__m256i yq8_0 = _mm256_lddqu_si256((const __m256i *) (y[i + 0].qs));
3974-
__m256i yq8_1 = _mm256_lddqu_si256((const __m256i *) (y[i + 1].qs));
3973+
__m256i yq8_0 = _mm256_loadu_si256((const __m256i *) (y[i + 0].qs));
3974+
__m256i yq8_1 = _mm256_loadu_si256((const __m256i *) (y[i + 1].qs));
39753975

39763976
const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0);
39773977
const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1);
@@ -4004,7 +4004,7 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
40044004
xq8h = _mm256_srai_epi16(xq8h, 14);
40054005
xq8 = _mm256_packs_epi16(xq8l, xq8h);
40064006

4007-
__m256i yq8 = _mm256_lddqu_si256((const __m256i *) (y[i].qs));
4007+
__m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs));
40084008
const __m256 q = mul_sum_i8_pairs_float(xq8, yq8);
40094009

40104010
acc = _mm256_fmadd_ps( d, q, acc );
@@ -11009,11 +11009,12 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
1100911009
__m256 accumf = _mm256_setzero_ps();
1101011010

1101111011
for (int i = 0; i < nb; ++i) {
11012-
// __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
11013-
// __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
11014-
// WARNING: reading 3 bytes further than necessary. It's faster than the above on my CPU, though.
11015-
__m128i x12b = _mm_loadu_si128((const __m128i_u *) x[i].q);
11016-
__m256i x12 = MM256_SET_M128I(x12b, x12b);
11012+
// const __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
11013+
// const __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
11014+
// WARNING: reading 3 bytes further than necessary.
11015+
// It's measurably faster than a masked load on an Intel Core m3-8100Y
11016+
const __m128i x12b = _mm_loadu_si128((const __m128i_u *) (x[i].q));
11017+
const __m256i x12 = MM256_SET_M128I(x12b, x12b);
1101711018

1101811019
{
1101911020
__m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
@@ -11044,6 +11045,7 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
1104411045
1, 1, 1, 1,
1104511046
3, 9, 27, 81,
1104611047
3, 9, 27, 81);
11048+
// extract ternary values
1104711049
x0l = _mm256_mullo_epi16(x0l, shift0);
1104811050
x0h = _mm256_mullo_epi16(x0h, shift0);
1104911051
x1l = _mm256_mullo_epi16(x1l, shift1l);
@@ -11052,22 +11054,22 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
1105211054
x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3));
1105311055
x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3));
1105411056
x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3));
11055-
x0l = _mm256_sub_epi16(x0l, _mm256_set1_epi16(1));
11056-
x0h = _mm256_sub_epi16(x0h, _mm256_set1_epi16(1));
11057-
x1l = _mm256_sub_epi16(x1l, _mm256_set1_epi16(1));
11058-
x1h = _mm256_sub_epi16(x1h, _mm256_set1_epi16(1));
1105911057

1106011058
__m256i x0 = _mm256_packs_epi16(x0l, x0h);
1106111059
__m256i x1 = _mm256_packs_epi16(x1l, x1h);
1106211060

11063-
__m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 0].qs));
11064-
__m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs));
11061+
// 0, 1, 2 => -1, 0, 1
11062+
x0 = _mm256_sub_epi8(x0, _mm256_set1_epi8(1));
11063+
x1 = _mm256_sub_epi8(x1, _mm256_set1_epi8(1));
1106511064

11066-
__m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d));
11067-
__m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
11065+
const __m256i y0 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 0].qs));
11066+
const __m256i y1 = _mm256_loadu_si256((const __m256i_u *) (y[2*i + 1].qs));
1106811067

11069-
__m256 q0 = mul_sum_i8_pairs_float(x0, y0);
11070-
__m256 q1 = mul_sum_i8_pairs_float(x1, y1);
11068+
const __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 0].d));
11069+
const __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
11070+
11071+
const __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
11072+
const __m256 q1 = mul_sum_i8_pairs_float(x1, y1);
1107111073

1107211074
accumf = _mm256_fmadd_ps(d0, q0, accumf);
1107311075
accumf = _mm256_fmadd_ps(d1, q1, accumf);

0 commit comments

Comments
 (0)