Skip to content

Commit c6d9b11

Browse files
committed
wip : Q2_2 now faster than Q4_K with AVX2
1 parent 92709cc commit c6d9b11

File tree

3 files changed

+33
-162
lines changed

3 files changed

+33
-162
lines changed

convert-hf-to-gguf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,8 @@ def write_tensors(self):
294294
))
295295

296296
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
297+
# TODO: cleaner model-specific per-tensor types
298+
# NOTE: Q1_3 is only relevant for BitNet 1.58b
297299
if self.ftype == gguf.LlamaFileType.MOSTLY_Q1_3 and not any(
298300
self.match_model_tensor_name(new_name, key, None)
299301
for key in [

ggml-common.h

Lines changed: 0 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1037,73 +1037,6 @@ GGML_TABLE_BEGIN(uint32_t, iq3s_grid, 512)
10371037
0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101,
10381038
GGML_TABLE_END()
10391039

1040-
GGML_TABLE_BEGIN(uint32_t, q22_grid, 256)
1041-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1042-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1043-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1044-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1045-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1046-
0x00010100, 0x01010100, 0x00010100, 0xff010100,
1047-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1048-
0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100,
1049-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1050-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1051-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1052-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1053-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1054-
0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00,
1055-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1056-
0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00,
1057-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1058-
0x00010001, 0x01010001, 0x00010001, 0xff010001,
1059-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1060-
0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001,
1061-
0x00000101, 0x01000101, 0x00000101, 0xff000101,
1062-
0x00010101, 0x01010101, 0x00010101, 0xff010101,
1063-
0x00000101, 0x01000101, 0x00000101, 0xff000101,
1064-
0x00ff0101, 0x01ff0101, 0x00ff0101, 0xffff0101,
1065-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1066-
0x00010001, 0x01010001, 0x00010001, 0xff010001,
1067-
0x00000001, 0x01000001, 0x00000001, 0xff000001,
1068-
0x00ff0001, 0x01ff0001, 0x00ff0001, 0xffff0001,
1069-
0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01,
1070-
0x0001ff01, 0x0101ff01, 0x0001ff01, 0xff01ff01,
1071-
0x0000ff01, 0x0100ff01, 0x0000ff01, 0xff00ff01,
1072-
0x00ffff01, 0x01ffff01, 0x00ffff01, 0xffffff01,
1073-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1074-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1075-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1076-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1077-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1078-
0x00010100, 0x01010100, 0x00010100, 0xff010100,
1079-
0x00000100, 0x01000100, 0x00000100, 0xff000100,
1080-
0x00ff0100, 0x01ff0100, 0x00ff0100, 0xffff0100,
1081-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1082-
0x00010000, 0x01010000, 0x00010000, 0xff010000,
1083-
0x00000000, 0x01000000, 0x00000000, 0xff000000,
1084-
0x00ff0000, 0x01ff0000, 0x00ff0000, 0xffff0000,
1085-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1086-
0x0001ff00, 0x0101ff00, 0x0001ff00, 0xff01ff00,
1087-
0x0000ff00, 0x0100ff00, 0x0000ff00, 0xff00ff00,
1088-
0x00ffff00, 0x01ffff00, 0x00ffff00, 0xffffff00,
1089-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1090-
0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff,
1091-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1092-
0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff,
1093-
0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff,
1094-
0x000101ff, 0x010101ff, 0x000101ff, 0xff0101ff,
1095-
0x000001ff, 0x010001ff, 0x000001ff, 0xff0001ff,
1096-
0x00ff01ff, 0x01ff01ff, 0x00ff01ff, 0xffff01ff,
1097-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1098-
0x000100ff, 0x010100ff, 0x000100ff, 0xff0100ff,
1099-
0x000000ff, 0x010000ff, 0x000000ff, 0xff0000ff,
1100-
0x00ff00ff, 0x01ff00ff, 0x00ff00ff, 0xffff00ff,
1101-
0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff,
1102-
0x0001ffff, 0x0101ffff, 0x0001ffff, 0xff01ffff,
1103-
0x0000ffff, 0x0100ffff, 0x0000ffff, 0xff00ffff,
1104-
0x00ffffff, 0x01ffffff, 0x00ffffff, 0xffffffff,
1105-
GGML_TABLE_END()
1106-
11071040
GGML_TABLE_BEGIN(uint32_t, q1_3_grid, 256)
11081041
0xffffffff, 0xffffffff, 0xffffff00, 0xffffff01, 0xffff00ff, 0xffff0000, 0xffff0001, 0xffff01ff,
11091042
0xffff0100, 0xffff0101, 0xff00ffff, 0xff00ff00, 0xff00ff01, 0xff0000ff, 0xff000000, 0xff000001,

ggml-quants.c

Lines changed: 31 additions & 95 deletions
Original file line numberDiff line numberDiff line change
@@ -669,21 +669,21 @@ void quantize_row_q2_2_reference(const float * restrict x, block_q2_2 * restrict
669669
for (int i = 0; i < nb; i++) {
670670

671671
for (int j = 0; j < qk/4; ++j) {
672-
int8_t x0 = (int8_t)x[i*qk + j*4 + 0];
673-
int8_t x1 = (int8_t)x[i*qk + j*4 + 1];
674-
int8_t x2 = (int8_t)x[i*qk + j*4 + 2];
675-
int8_t x3 = (int8_t)x[i*qk + j*4 + 3];
672+
int8_t x0 = (int8_t)x[i*qk + 0 + j];
673+
int8_t x1 = (int8_t)x[i*qk + 1*qk/4 + j];
674+
int8_t x2 = (int8_t)x[i*qk + 2*qk/4 + j];
675+
int8_t x3 = (int8_t)x[i*qk + 3*qk/4 + j];
676676

677-
const uint8_t xi0 = x0 >= 0 ? x0 : 3;
678-
const uint8_t xi1 = x1 >= 0 ? x1 : 3;
679-
const uint8_t xi2 = x2 >= 0 ? x2 : 3;
680-
const uint8_t xi3 = x3 >= 0 ? x3 : 3;
677+
const uint8_t xi0 = x0 < 0 ? 1 : x0 == 0 ? 2 : 3;
678+
const uint8_t xi1 = x1 < 0 ? 1 : x1 == 0 ? 2 : 3;
679+
const uint8_t xi2 = x2 < 0 ? 1 : x2 == 0 ? 2 : 3;
680+
const uint8_t xi3 = x3 < 0 ? 1 : x3 == 0 ? 2 : 3;
681681

682682
y[i].qs[j] = 0;
683-
y[i].qs[j] |= (xi0 << 6);
684-
y[i].qs[j] |= (xi1 << 4);
685-
y[i].qs[j] |= (xi2 << 2);
686-
y[i].qs[j] |= (xi3 << 0);
683+
y[i].qs[j] |= (xi0 << 0);
684+
y[i].qs[j] |= (xi1 << 2);
685+
y[i].qs[j] |= (xi2 << 4);
686+
y[i].qs[j] |= (xi3 << 6);
687687
}
688688
}
689689
}
@@ -1555,12 +1555,12 @@ void dequantize_row_q2_2(const block_q2_2 * restrict x, float * restrict y, int6
15551555
for (int i = 0; i < nb; i++) {
15561556

15571557
for (int j = 0; j < qk/4; ++j) {
1558-
const int8_t * q = (const int8_t *) (q22_grid + x[i].qs[j]);
1558+
const int8_t q = x[i].qs[j];
15591559

1560-
*y++ = (float) q[0];
1561-
*y++ = (float) q[1];
1562-
*y++ = (float) q[2];
1563-
*y++ = (float) q[3];
1560+
y[i*qk + j + 0 ] = (float) (((q >> 0) & 3) - 2);
1561+
y[i*qk + j + 1*qk/4] = (float) (((q >> 2) & 3) - 2);
1562+
y[i*qk + j + 2*qk/4] = (float) (((q >> 4) & 3) - 2);
1563+
y[i*qk + j + 3*qk/4] = (float) (((q >> 6) & 3) - 2);
15641564
}
15651565
}
15661566
}
@@ -3929,82 +3929,18 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
39293929
#if defined(__AVX2__)
39303930
__m256 acc = _mm256_setzero_ps();
39313931

3932-
int leftovers = nb % 2;
3933-
3934-
for (int i = 0; i < nb - leftovers; i += 2) {
3935-
3936-
const __m256 d0 = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 0].d) );
3937-
const __m256 d1 = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i + 1].d) );
3938-
3939-
// assuming two consecutive blocks are contiguous AND aligned
3940-
__m128i xq16b = _mm_load_si128((const __m128i *) (x[i].qs));
3941-
__m256i xq16 = MM256_SET_M128I(xq16b, xq16b);
3942-
__m256i xq8l0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
3943-
4, -1, 4, -1, 4, -1, 4, -1,
3944-
1, -1, 1, -1, 1, -1, 1, -1,
3945-
0, -1, 0, -1, 0, -1, 0, -1));
3946-
__m256i xq8h0 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1,
3947-
6, -1, 6, -1, 6, -1, 6, -1,
3948-
3, -1, 3, -1, 3, -1, 3, -1,
3949-
2, -1, 2, -1, 2, -1, 2, -1));
3950-
__m256i xq8l1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(13, -1, 13, -1, 13, -1, 13, -1,
3951-
12, -1, 12, -1, 12, -1, 12, -1,
3952-
9, -1, 9, -1, 9, -1, 9, -1,
3953-
8, -1, 8, -1, 8, -1, 8, -1));
3954-
__m256i xq8h1 = _mm256_shuffle_epi8(xq16, _mm256_set_epi8(15, -1, 15, -1, 15, -1, 15, -1,
3955-
14, -1, 14, -1, 14, -1, 14, -1,
3956-
11, -1, 11, -1, 11, -1, 11, -1,
3957-
10, -1, 10, -1, 10, -1, 10, -1));
3958-
__m256i shift = _mm256_set_epi16(64, 16, 4, 1,
3959-
64, 16, 4, 1,
3960-
64, 16, 4, 1,
3961-
64, 16, 4, 1);
3962-
xq8l0 = _mm256_mullo_epi16(xq8l0, shift);
3963-
xq8h0 = _mm256_mullo_epi16(xq8h0, shift);
3964-
xq8l1 = _mm256_mullo_epi16(xq8l1, shift);
3965-
xq8h1 = _mm256_mullo_epi16(xq8h1, shift);
3966-
xq8l0 = _mm256_srai_epi16(xq8l0, 14);
3967-
xq8h0 = _mm256_srai_epi16(xq8h0, 14);
3968-
xq8l1 = _mm256_srai_epi16(xq8l1, 14);
3969-
xq8h1 = _mm256_srai_epi16(xq8h1, 14);
3970-
__m256i xq8_0 = _mm256_packs_epi16(xq8l0, xq8h0);
3971-
__m256i xq8_1 = _mm256_packs_epi16(xq8l1, xq8h1);
3972-
3973-
__m256i yq8_0 = _mm256_loadu_si256((const __m256i *) (y[i + 0].qs));
3974-
__m256i yq8_1 = _mm256_loadu_si256((const __m256i *) (y[i + 1].qs));
3975-
3976-
const __m256 q0 = mul_sum_i8_pairs_float(xq8_0, yq8_0);
3977-
const __m256 q1 = mul_sum_i8_pairs_float(xq8_1, yq8_1);
3978-
3979-
acc = _mm256_fmadd_ps( d0, q0, acc );
3980-
acc = _mm256_fmadd_ps( d1, q1, acc );
3981-
}
3982-
3983-
for (int i = nb - leftovers; i < nb; ++i) {
3932+
for (int i = 0; i < nb; ++i) {
39843933

39853934
const __m256 d = _mm256_set1_ps( GGML_FP16_TO_FP32(y[i].d) );
39863935

3987-
__m128i xq8b = _mm_loadu_si64(x[i].qs);
3988-
__m256i xq8 = MM256_SET_M128I(xq8b, xq8b);
3989-
__m256i xq8l = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(5, -1, 5, -1, 5, -1, 5, -1,
3990-
4, -1, 4, -1, 4, -1, 4, -1,
3991-
1, -1, 1, -1, 1, -1, 1, -1,
3992-
0, -1, 0, -1, 0, -1, 0, -1));
3993-
__m256i xq8h = _mm256_shuffle_epi8(xq8, _mm256_set_epi8(7, -1, 7, -1, 7, -1, 7, -1,
3994-
6, -1, 6, -1, 6, -1, 6, -1,
3995-
3, -1, 3, -1, 3, -1, 3, -1,
3996-
2, -1, 2, -1, 2, -1, 2, -1));
3997-
__m256i shift = _mm256_set_epi16(64, 16, 4, 1,
3998-
64, 16, 4, 1,
3999-
64, 16, 4, 1,
4000-
64, 16, 4, 1);
4001-
xq8l = _mm256_mullo_epi16(xq8l, shift);
4002-
xq8h = _mm256_mullo_epi16(xq8h, shift);
4003-
xq8l = _mm256_srai_epi16(xq8l, 14);
4004-
xq8h = _mm256_srai_epi16(xq8h, 14);
4005-
xq8 = _mm256_packs_epi16(xq8l, xq8h);
4006-
4007-
__m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs));
3936+
// assuming this is always aligned
3937+
__m256i xq8 = _mm256_set1_epi64x(*(const int64_t *) x[i].qs);
3938+
xq8 = _mm256_srlv_epi64(xq8, _mm256_set_epi64x(6, 4, 2, 0));
3939+
xq8 = _mm256_and_si256(xq8, _mm256_set1_epi8(0x03));
3940+
// strangely enough, this is much slower with 1 instead of 2
3941+
xq8 = _mm256_sub_epi8(xq8, _mm256_set1_epi8(2));
3942+
3943+
const __m256i yq8 = _mm256_loadu_si256((const __m256i *) (y[i].qs));
40083944
const __m256 q = mul_sum_i8_pairs_float(xq8, yq8);
40093945

40103946
acc = _mm256_fmadd_ps( d, q, acc );
@@ -4017,11 +3953,11 @@ void ggml_vec_dot_q2_2_q8_0(int n, float * restrict s, size_t bs, const void * r
40173953
for (int i = 0; i < nb; i++) {
40183954
int sumi = 0;
40193955
for (int j = 0; j < qk / 4; j++) {
4020-
const int8_t* weight = (const int8_t *)(q22_grid + x[i].qs[j]);
4021-
sumi += (int)y[i].qs[4*j+0] * weight[0];
4022-
sumi += (int)y[i].qs[4*j+1] * weight[1];
4023-
sumi += (int)y[i].qs[4*j+2] * weight[2];
4024-
sumi += (int)y[i].qs[4*j+3] * weight[3];
3956+
const uint8_t weight = x[i].qs[j];
3957+
sumi += (int)y[i].qs[j + 0*qk/4] * ((weight >> 0) & 3) - 2;
3958+
sumi += (int)y[i].qs[j + 1*qk/4] * ((weight >> 2) & 3) - 2;
3959+
sumi += (int)y[i].qs[j + 2*qk/4] * ((weight >> 4) & 3) - 2;
3960+
sumi += (int)y[i].qs[j + 3*qk/4] * ((weight >> 6) & 3) - 2;
40253961
}
40263962
sumf += (float)(sumi)*(GGML_FP16_TO_FP32(y[i].d));
40273963
}

0 commit comments

Comments
 (0)