Skip to content

Commit 569a03e

Browse files
finish i2_s/i8_s vec_dot x86 simd
1 parent 95dced0 commit 569a03e

File tree

3 files changed

+120
-121
lines changed

3 files changed

+120
-121
lines changed

ggml-common.h

Lines changed: 64 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,70 +1023,70 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
10231023
GGML_TABLE_END()
10241024

10251025
GGML_TABLE_BEGIN(uint32_t, i2s_i8s, 256)
1026-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1027-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1028-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1029-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1030-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1031-
0x00010100, 0x01010100, 0x00010100, 0xff010100,
1032-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1033-
0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100,
1034-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1035-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1036-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1037-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1038-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1039-
0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00,
1040-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1041-
0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00,
1042-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1043-
0x00010001, 0x01010001, 0x00010001, 0xff010001,
1044-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1045-
0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001,
1046-
0x00000101, 0x01000101, 0x00000101, 0xff000101,
1047-
0x00010101, 0x01010101, 0x00010101, 0xff010101,
1048-
0x00000101, 0x01000101, 0x00000101, 0xff000101,
1049-
0x00ff0101, 0x01ff0101, 0x00ff0101, 0xffff0101,
1050-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1051-
0x00010001, 0x01010001, 0x00010001, 0xff010001,
1052-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1053-
0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001,
1054-
0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01,
1055-
0x0001ff01, 0x0101ff01, 0x0001ff01, 0xff01ff01,
1056-
0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01,
1057-
0x00ffff01, 0x01ffff01, 0x00ffff01, 0xffffff01,
1058-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1059-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1060-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1061-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1062-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1063-
0x00010100, 0x01010100, 0x00010100, 0xff010100,
1064-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1065-
0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100,
1066-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1067-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1068-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1069-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1070-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1071-
0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00,
1072-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1073-
0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00,
1074-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1075-
0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff,
1076-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1077-
0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff,
1078-
0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff,
1079-
0x000101ff, 0x010101ff, 0x000101ff, 0xff0101ff,
1080-
0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff,
1081-
0x00ff01ff, 0x01ff01ff, 0x00ff01ff, 0xffff01ff,
1082-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1083-
0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff,
1084-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1085-
0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff,
1086-
0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff,
1087-
0x0001ffff, 0x0101ffff, 0x0001ffff, 0xff01ffff,
1088-
0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff,
1089-
0x00ffffff, 0x01ffffff, 0x00ffffff, 0xffffffff,
1026+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1027+
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1028+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1029+
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1030+
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1031+
0x00010100, 0x01010100, 0x00010100, 0xff010100,
1032+
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1033+
0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100,
1034+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1035+
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1036+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1037+
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1038+
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1039+
0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00,
1040+
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1041+
0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00,
1042+
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1043+
0x00010001, 0x01010001, 0x00010001, 0xff010001,
1044+
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1045+
0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001,
1046+
0x00000101, 0x01000101, 0x00000101, 0xff000101,
1047+
0x00010101, 0x01010101, 0x00010101, 0xff010101,
1048+
0x00000101, 0x01000101, 0x00000101, 0xff000101,
1049+
0x00ff0101, 0x01ff0101, 0x00ff0101, 0xffff0101,
1050+
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1051+
0x00010001, 0x01010001, 0x00010001, 0xff010001,
1052+
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1053+
0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001,
1054+
0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01,
1055+
0x0001ff01, 0x0101ff01, 0x0001ff01, 0xff01ff01,
1056+
0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01,
1057+
0x00ffff01, 0x01ffff01, 0x00ffff01, 0xffffff01,
1058+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1059+
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1060+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1061+
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1062+
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1063+
0x00010100, 0x01010100, 0x00010100, 0xff010100,
1064+
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1065+
0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100,
1066+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1067+
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1068+
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1069+
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1070+
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1071+
0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00,
1072+
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1073+
0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00,
1074+
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1075+
0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff,
1076+
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1077+
0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff,
1078+
0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff,
1079+
0x000101ff, 0x010101ff, 0x000101ff, 0xff0101ff,
1080+
0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff,
1081+
0x00ff01ff, 0x01ff01ff, 0x00ff01ff, 0xffff01ff,
1082+
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1083+
0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff,
1084+
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1085+
0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff,
1086+
0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff,
1087+
0x0001ffff, 0x0101ffff, 0x0001ffff, 0xff01ffff,
1088+
0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff,
1089+
0x00ffffff, 0x01ffffff, 0x00ffffff, 0xffffffff,
10901090
GGML_TABLE_END()
10911091

10921092
#define NGRID_IQ1S 2048

ggml-quants.c

Lines changed: 56 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -3799,60 +3799,61 @@ void ggml_vec_dot_i2_i8_s(int n, float * restrict s, size_t bs, const void * res
37993799
UNUSED(by);
38003800
UNUSED(nrc);
38013801

3802-
// TODO
3803-
// #if defined(__AVX2__)
3804-
// __m256i accu = _mm256_setzero_si256();
3805-
3806-
// for (int i=0; i<n/32; i++) {
3807-
// const int8_t* w0 = (const int8_t *)(i2s_i8s + x[i*8 + 0]);
3808-
// const int8_t* w1 = (const int8_t *)(i2s_i8s + x[i*8 + 1]);
3809-
// const int8_t* w2 = (const int8_t *)(i2s_i8s + x[i*8 + 2]);
3810-
// const int8_t* w3 = (const int8_t *)(i2s_i8s + x[i*8 + 3]);
3811-
// const int8_t* w4 = (const int8_t *)(i2s_i8s + x[i*8 + 4]);
3812-
// const int8_t* w5 = (const int8_t *)(i2s_i8s + x[i*8 + 5]);
3813-
// const int8_t* w6 = (const int8_t *)(i2s_i8s + x[i*8 + 6]);
3814-
// const int8_t* w7 = (const int8_t *)(i2s_i8s + x[i*8 + 7]);
3815-
3816-
// __m256i xq8 = _mm256_set_epi8(
3817-
// w0[0], w0[1], w0[2], w0[3],
3818-
// w1[0], w1[1], w1[2], w1[3],
3819-
// w2[0], w2[1], w2[2], w2[3],
3820-
// w3[0], w3[1], w3[2], w3[3],
3821-
// w4[0], w4[1], w4[2], w4[3],
3822-
// w5[0], w5[1], w5[2], w5[3],
3823-
// w6[0], w6[1], w6[2], w6[3],
3824-
// w7[0], w7[1], w7[2], w7[3]
3825-
// );
3826-
3827-
// __m256i yq8 = _mm256_loadu_si256((const __m256i*)(y + i*32));
3828-
3829-
// __m128i hxq8 = _mm256_castsi256_si128(xq8);
3830-
// __m128i lxq8 = _mm256_extractf128_si256(xq8, 1);
3831-
// __m128i hyq8 = _mm256_castsi256_si128(yq8);
3832-
// __m128i lyq8 = _mm256_extractf128_si256(yq8, 1);
3833-
3834-
// __m256i hxq16 = _mm256_cvtepi8_epi16(hxq8);
3835-
// __m256i lxq16 = _mm256_cvtepi8_epi16(lxq8);
3836-
// __m256i hyq16 = _mm256_cvtepi8_epi16(hyq8);
3837-
// __m256i lyq16 = _mm256_cvtepi8_epi16(lyq8);
3838-
3839-
// __m256i hzq16 = _mm256_sign_epi16(hyq16, hxq16);
3840-
// __m256i lzq16 = _mm256_sign_epi16(lyq16, lxq16);
3841-
3842-
// __m256i hhzq32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(hzq16));
3843-
// __m256i hlzq32 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(hzq16, 1));
3844-
// __m256i llzq32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(lzq16));
3845-
// __m256i lhzq32 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(lzq16, 1));
3846-
3847-
// accu = _mm256_add_epi32(accu, hhzq32);
3848-
// accu = _mm256_add_epi32(accu, hlzq32);
3849-
// accu = _mm256_add_epi32(accu, llzq32);
3850-
// accu = _mm256_add_epi32(accu, lhzq32);
3851-
// }
3852-
3853-
// int sumi = hsum_i32_8(accu);
3854-
// *s = (float)sumi;
3855-
// #else
3802+
#if defined(__AVX2__)
3803+
__m256i accu = _mm256_setzero_si256();
3804+
3805+
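// group_num is bounded by the int16 accumulators: each term lies in [-128, 128], so at most 255 terms fit per lane before widening to int32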
3806+
// kept at 2 so the vector length splits evenly into groups of group_num * 32, e.g. 8640 % (2 * 32) == 0 (8640 is presumably a model dimension)
3807+
int group_num = 2;
3808+
3809+
for (int i=0; i < n / (group_num * 32); i++){
3810+
__m256i laccu = _mm256_setzero_si256();
3811+
__m256i haccu = _mm256_setzero_si256();
3812+
3813+
for (int j=0; j < group_num; j++) {
3814+
__m256i xq8 = _mm256_set_epi32(
3815+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 7]],
3816+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 6]],
3817+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 5]],
3818+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 4]],
3819+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 3]],
3820+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 2]],
3821+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 1]],
3822+
(int)i2s_i8s[x[i * group_num * 8 + j * 8 + 0]]
3823+
);
3824+
3825+
__m256i yq8 = _mm256_loadu_si256((const __m256i*)(y + i * group_num * 32 + j * 32));
3826+
3827+
__m128i hxq8 = _mm256_castsi256_si128(xq8);
3828+
__m128i lxq8 = _mm256_extractf128_si256(xq8, 1);
3829+
__m128i hyq8 = _mm256_castsi256_si128(yq8);
3830+
__m128i lyq8 = _mm256_extractf128_si256(yq8, 1);
3831+
3832+
__m256i hxq16 = _mm256_cvtepi8_epi16(hxq8);
3833+
__m256i lxq16 = _mm256_cvtepi8_epi16(lxq8);
3834+
__m256i hyq16 = _mm256_cvtepi8_epi16(hyq8);
3835+
__m256i lyq16 = _mm256_cvtepi8_epi16(lyq8);
3836+
3837+
__m256i hzq16 = _mm256_sign_epi16(hyq16, hxq16);
3838+
__m256i lzq16 = _mm256_sign_epi16(lyq16, lxq16);
3839+
3840+
haccu = _mm256_add_epi16(haccu, hzq16);
3841+
laccu = _mm256_add_epi16(laccu, lzq16);
3842+
}
3843+
3844+
__m256i hhzq32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(haccu));
3845+
__m256i hlzq32 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(haccu, 1));
3846+
__m256i llzq32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(laccu));
3847+
__m256i lhzq32 = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(laccu, 1));
3848+
3849+
accu = _mm256_add_epi32(accu, hhzq32);
3850+
accu = _mm256_add_epi32(accu, hlzq32);
3851+
accu = _mm256_add_epi32(accu, llzq32);
3852+
accu = _mm256_add_epi32(accu, lhzq32);
3853+
}
3854+
int sumi = hsum_i32_8(accu);
3855+
*s = (float)sumi;
3856+
#else
38563857

38573858
int sumi = 0;
38583859

@@ -3864,7 +3865,7 @@ void ggml_vec_dot_i2_i8_s(int n, float * restrict s, size_t bs, const void * res
38643865
sumi += (int)y[i*4+3] * weight[3];
38653866
}
38663867
*s = (float)sumi;
3867-
// #endif
3868+
#endif
38683869
}
38693870

38703871
void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {

llama.cpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11702,7 +11702,6 @@ struct llm_build_context {
1170211702

1170311703
cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
1170411704
cb(cur, "ffn_down", il);
11705-
1170611705
}
1170711706
cur = ggml_add(ctx0, cur, ffn_inp);
1170811707
cb(cur, "l_out", il);
@@ -11723,7 +11722,6 @@ struct llm_build_context {
1172311722
cb(cur, "result_output", -1);
1172411723

1172511724
ggml_build_forward_expand(gf, cur);
11726-
1172711725
return gf;
1172811726
}
1172911727

0 commit comments

Comments
 (0)