@@ -11009,40 +11009,68 @@ void ggml_vec_dot_q1_3_q8_0(int n, float * restrict s, size_t bs, const void * r
__m256 accumf = _mm256_setzero_ps();

for (int i = 0; i < nb; ++i) {
- {
- __m256i x0 = _mm256_set_epi32(q1_3_grid[x[i].q[7]], q1_3_grid[x[i].q[6]],
- q1_3_grid[x[i].q[5]], q1_3_grid[x[i].q[4]],
- q1_3_grid[x[i].q[3]], q1_3_grid[x[i].q[2]],
- q1_3_grid[x[i].q[1]], q1_3_grid[x[i].q[0]]);
- __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i].qs));
-
- __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d));
-
- __m256 q = mul_sum_i8_pairs_float(x0, y0);
-
- accumf = _mm256_fmadd_ps(d, q, accumf);
- }
+ // __m128i x12b = _mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1));
+ // __m128i x12b = _mm_insert_epi8(x12a, x[i].qs[0], 12);
+ // WARNING: reading 3 bytes further than necessary. It's faster than the above on my CPU, though.
+ __m128i x12b = _mm_loadu_si128((const __m128i_u *) x[i].q);
+ __m256i x12 = MM256_SET_M128I(x12b, x12b);
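+ // x12 now has the same 13 data bytes in both 128-bit halves, so the per-128-bit-lane
+ // byte shuffles below can place any of them into any 16-bit lane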

{
- __m256i x1 = _mm256_castsi128_si256(_mm_set_epi32(q1_3_grid[x[i].q[11]], q1_3_grid[x[i].q[10]],
- q1_3_grid[x[i].q[9]], q1_3_grid[x[i].q[8]]));
- __m256i x2 = _mm256_cvtepu8_epi16(_mm_maskload_epi32((const int32_t *) x[i].q, _mm_set_epi32(0, -1, -1, -1)));
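+ // Spread the packed bytes into 16-bit lanes: each selected byte lands in the high byte
+ // of its lane (the -1 entries zero the low byte), so a lane holds (byte << 8).
+ // Bytes 0..7 each occupy 4 lanes of x0l/x0h, bytes 8..12 occupy most of x1l/x1h,
+ // and bytes 0..11 also get one lane each (the shift-1 lanes below) for their remaining value.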
+ __m256i x0l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
+ 4, -1, 4, -1, 4, -1, 4, -1,
+ 1, -1, 1, -1, 1, -1, 1, -1,
+ 0, -1, 0, -1, 0, -1, 0, -1));
+ __m256i x0h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1,
+ 6, -1, 6, -1, 6, -1, 6, -1,
+ 3, -1, 3, -1, 3, -1, 3, -1,
+ 2, -1, 2, -1, 2, -1, 2, -1));
+ __m256i x1l = _mm256_shuffle_epi8(x12, _mm256_set_epi8(7, -1, 6, -1, 5, -1, 4, -1,
+ 3, -1, 2, -1, 1, -1, 0, -1,
+ 9, -1, 9, -1, 9, -1, 9, -1,
+ 8, -1, 8, -1, 8, -1, 8, -1));
+ __m256i x1h = _mm256_shuffle_epi8(x12, _mm256_set_epi8(12, -1, 12, -1, 12, -1, 12, -1,
+ 11, -1, 10, -1, 9, -1, 8, -1,
+ 11, -1, 11, -1, 11, -1, 11, -1,
+ 10, -1, 10, -1, 10, -1, 10, -1));
+ const __m256i shift0 = _mm256_set_epi16(3, 9, 27, 81,
+ 3, 9, 27, 81,
+ 3, 9, 27, 81,
+ 3, 9, 27, 81);
+ const __m256i shift1l = _mm256_set_epi16(1, 1, 1, 1,
+ 1, 1, 1, 1,
+ 3, 9, 27, 81,
+ 3, 9, 27, 81);
+ const __m256i shift1h = _mm256_set_epi16(3, 9, 27, 81,
+ 1, 1, 1, 1,
+ 3, 9, 27, 81,
+ 3, 9, 27, 81);
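+ // For each 16-bit lane holding q << 8, the mullo/mulhi/sub sequence below computes
+ // the scalar equivalent of:
+ //     trit = (((q * pow3) & 0xFF) * 3) >> 8;  // pow3 = this lane's entry in shift0/shift1l/shift1h
+ //     val  = trit - 1;                        // map {0, 1, 2} to {-1, 0, 1}
+ // i.e. it extracts one base-3 digit per lane, assuming each byte packs its trits as a
+ // fixed-point base-3 fraction (roughly q = packed trits * 256/243, rounded up).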
+ x0l = _mm256_mullo_epi16(x0l, shift0);
+ x0h = _mm256_mullo_epi16(x0h, shift0);
+ x1l = _mm256_mullo_epi16(x1l, shift1l);
+ x1h = _mm256_mullo_epi16(x1h, shift1h);
+ x0l = _mm256_mulhi_epu16(x0l, _mm256_set1_epi16(3));
+ x0h = _mm256_mulhi_epu16(x0h, _mm256_set1_epi16(3));
+ x1l = _mm256_mulhi_epu16(x1l, _mm256_set1_epi16(3));
+ x1h = _mm256_mulhi_epu16(x1h, _mm256_set1_epi16(3));
+ x0l = _mm256_sub_epi16(x0l, _mm256_set1_epi16(1));
+ x0h = _mm256_sub_epi16(x0h, _mm256_set1_epi16(1));
+ x1l = _mm256_sub_epi16(x1l, _mm256_set1_epi16(1));
+ x1h = _mm256_sub_epi16(x1h, _mm256_set1_epi16(1));
+
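+ // _mm256_packs_epi16 narrows to int8 and, within each 128-bit half, puts all of its first
+ // argument's values before the second's; the byte groupings in the shuffle masks above
+ // are arranged with that ordering in mind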
+ __m256i x0 = _mm256_packs_epi16(x0l, x0h);
+ __m256i x1 = _mm256_packs_epi16(x1l, x1h);
+
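+ // each unpacked block of x (64 values in x0 and x1) covers two q8_0 blocks of 32 values,
+ // hence y[2*i] and y[2*i + 1] below, each with its own scale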
+ __m256i y0 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 0].qs));
__m256i y1 = _mm256_lddqu_si256((const __m256i_u *) (y[2*i + 1].qs));

- x2 = _mm256_mulhi_epu16(x2, _mm256_set1_epi16(3 << 8));
- x2 = _mm256_sub_epi16(x2, _mm256_set1_epi16(1));
-
- // TODO: reduce shuffling
- x2 = _mm256_packs_epi16(x2, _mm256_setzero_si256());
- x2 = _mm256_permute4x64_epi64(x2, _MM_SHUFFLE(3, 1, 2, 0));
- __m128i x2_l = _mm_insert_epi32(_mm256_castsi256_si128(x2), q1_3_grid[x[i].qs[0]], 3);
- x1 = _mm256_inserti128_si256(x1, x2_l, 1);
-
- __m256 d = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));
+ __m256 d0 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i].d));
+ __m256 d1 = _mm256_set1_ps(GGML_FP16_TO_FP32(y[2*i + 1].d));

- __m256 q = mul_sum_i8_pairs_float(x1, y1);
+ __m256 q0 = mul_sum_i8_pairs_float(x0, y0);
+ __m256 q1 = mul_sum_i8_pairs_float(x1, y1);

- accumf = _mm256_fmadd_ps(d, q, accumf);
+ accumf = _mm256_fmadd_ps(d0, q0, accumf);
+ accumf = _mm256_fmadd_ps(d1, q1, accumf);
}
}