
Commit 5cf167a

remyoudompheng authored and mglambda committed
ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions (ggml-org#12154)
* ggml-cpu: Faster IQ1 mul_mat_vec on AVX2 using BMI2 instructions
* cmake: Add GGML_BMI2 build option
* ggml: enable BMI2 on relevant CPU variants
* ggml-cpu: include BMI2 in backend score
* ggml-cpu: register BMI2 in ggml_backend_cpu_get_features
* ggml-cpu: add __BMI2__ define when using MSVC
1 parent b9467dd commit 5cf167a
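
The heart of the change is the IQ1 index construction: each 11-bit iq1s_grid lookup index is one qs byte plus three bits carved out of qh, and instead of building the four indices of a block with separate shift/mask/or chains, one _pdep_u64 scatters the four qs bytes into the low byte of each 16-bit lane, a second scatters the four 3-bit high groups into bits 8..10, and a single OR yields all four indices at once. The following standalone sketch (not part of the commit; sample values are made up, and it assumes a little-endian x86-64 compiler with BMI2 enabled, e.g. -mbmi2) checks that the two forms agree:

#include <assert.h>
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    const uint8_t  qs[4] = { 0x12, 0x34, 0x56, 0x78 }; // example packed 8-bit index parts
    const uint16_t qh    = 0x0ABC;                      // example packed 3-bit high parts

    // Scalar form (the #else branch in the kernel): each 11-bit grid index is
    // one qs byte plus three bits taken from qh.
    const uint16_t ref[4] = {
        (uint16_t)(qs[0] | ((qh << 8) & 0x700)),
        (uint16_t)(qs[1] | ((qh << 5) & 0x700)),
        (uint16_t)(qs[2] | ((qh << 2) & 0x700)),
        (uint16_t)(qs[3] | ((qh >> 1) & 0x700)),
    };

    // BMI2 form: one PDEP deposits the four qs bytes into the low byte of each
    // 16-bit lane, another deposits the four 3-bit groups of qh into bits 8..10,
    // and an OR produces all four indices in a single 64-bit value.
    uint32_t qs32;
    memcpy(&qs32, qs, sizeof(qs32));
    const uint64_t packed = _pdep_u64(qs32, 0x00ff00ff00ff00ffULL)
                          | _pdep_u64(qh,   0x0700070007000700ULL);
    uint16_t idx[4];
    memcpy(idx, &packed, sizeof(idx));

    for (int i = 0; i < 4; ++i) {
        assert(ref[i] == idx[i]);
        printf("index %d: scalar 0x%03x  pdep 0x%03x\n", i, ref[i], idx[i]);
    }
    return 0;
}

When BMI2 is unavailable or disabled (GGML_BMI2=OFF), the #else branches in the diff below keep the previous scalar index construction.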

File tree

8 files changed: +68, -18 lines


ggml/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -106,6 +106,7 @@ option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable
 option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
 option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
 option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
+option(GGML_BMI2 "ggml: enable BMI2" ${INS_ENB})
 option(GGML_AVX512 "ggml: enable AVX512F" OFF)
 option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
 option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)

ggml/include/ggml-cpu.h

Lines changed: 1 addition & 0 deletions
@@ -80,6 +80,7 @@ extern "C" {
 GGML_BACKEND_API int ggml_cpu_has_avx (void);
 GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
 GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
+GGML_BACKEND_API int ggml_cpu_has_bmi2 (void);
 GGML_BACKEND_API int ggml_cpu_has_f16c (void);
 GGML_BACKEND_API int ggml_cpu_has_fma (void);
 GGML_BACKEND_API int ggml_cpu_has_avx512 (void);

ggml/src/CMakeLists.txt

Lines changed: 6 additions & 6 deletions
@@ -289,7 +289,7 @@ function(ggml_add_cpu_backend_variant tag_name)
 set(GGML_CPU_TAG_NAME ${tag_name})
 # other: OPENMP LLAMAFILE CPU_HBM
 foreach (feat NATIVE
-    AVX AVX2 AVX_VNNI FMA F16C
+    AVX AVX2 BMI2 AVX_VNNI FMA F16C
     AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16
     AMX_TILE AMX_INT8 AMX_BF16)
 set(GGML_${feat} OFF)
@@ -309,13 +309,13 @@ if (GGML_CPU_ALL_VARIANTS)
 message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL")
 endif()
 ggml_add_cpu_backend_variant(sandybridge AVX)
-ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 FMA)
-ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 FMA AVX512)
-ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
-ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 FMA AVX_VNNI)
+ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA)
+ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512)
+ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI)
+ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI)
 if (NOT MSVC)
 # MSVC doesn't support AMX
-ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
+ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8)
 endif()
 elseif (GGML_CPU)
 ggml_add_cpu_backend_variant_impl("")

ggml/src/ggml-cpu/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
@@ -219,6 +219,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 if (GGML_AVX_VNNI)
 list(APPEND ARCH_DEFINITIONS __AVXVNNI__ GGML_AVX_VNNI)
 endif()
+if (GGML_BMI2)
+    # MSVC does not define macro __BMI2__
+    list(APPEND ARCH_DEFINITIONS __BMI2__ GGML_BMI2)
+endif()
 else ()
 if (GGML_NATIVE)
 list(APPEND ARCH_FLAGS -march=native)
@@ -233,6 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
 list(APPEND ARCH_FLAGS -mfma)
 list(APPEND ARCH_DEFINITIONS GGML_FMA)
 endif()
+if (GGML_BMI2)
+    list(APPEND ARCH_FLAGS -mbmi2)
+    list(APPEND ARCH_DEFINITIONS GGML_BMI2)
+endif()
 if (GGML_AVX)
 list(APPEND ARCH_FLAGS -mavx)
 list(APPEND ARCH_DEFINITIONS GGML_AVX)

ggml/src/ggml-cpu/cpu-feats-x86.cpp

Lines changed: 4 additions & 0 deletions
@@ -278,6 +278,10 @@ static int ggml_backend_cpu_x86_score() {
 if (!is.SSE42()) { return 0; }
 score += 1<<2;
 #endif
+#ifdef GGML_BMI2
+    if (!is.BMI2()) { return 0; }
+    score += 1<<3;
+#endif
 #ifdef GGML_AVX
 if (!is.AVX()) { return 0; }
 score += 1<<4;
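
The score function returns 0 as soon as the host lacks any feature the variant was built with, and otherwise adds a distinct power of two per feature (BMI2 now contributes 1<<3), so the most capable compatible variant wins the backend selection. A simplified standalone sketch of that selection logic (not the project's code; the struct, base score, and sample values are made up):

#include <stdio.h>

struct cpu_caps { int sse42, bmi2, avx; };

// Sketch: a variant compiled with a feature the host lacks is unusable (score 0);
// otherwise every feature it carries adds a distinct bit to its score.
static int variant_score(struct cpu_caps have, struct cpu_caps need) {
    int score = 1; // base score so a usable variant never returns 0 (sketch simplification)
    if (need.sse42) { if (!have.sse42) return 0; score += 1 << 2; }
    if (need.bmi2)  { if (!have.bmi2)  return 0; score += 1 << 3; }
    if (need.avx)   { if (!have.avx)   return 0; score += 1 << 4; }
    return score;
}

int main(void) {
    struct cpu_caps host  = { .sse42 = 1, .bmi2 = 1, .avx = 1 };
    struct cpu_caps plain = { .sse42 = 1 };
    struct cpu_caps rich  = { .sse42 = 1, .bmi2 = 1, .avx = 1 };
    printf("plain variant: %d, bmi2+avx variant: %d\n",
           variant_score(host, plain), variant_score(host, rich));
    return 0;
}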

ggml/src/ggml-cpu/ggml-cpu-quants.c

Lines changed: 37 additions & 12 deletions
@@ -11362,10 +11362,19 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 __m256i sumi = _mm256_setzero_si256();
 int sumi1 = 0;
 for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib], 0x700070007000700ULL);
+const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL) | _pdep_u64(qh[ib + 1], 0x700070007000700ULL);
+const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+#else
 const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[qs[3] | ((qh[ib+0] >> 1) & 0x700)], iq1s_grid[qs[2] | ((qh[ib+0] << 2) & 0x700)],
 iq1s_grid[qs[1] | ((qh[ib+0] << 5) & 0x700)], iq1s_grid[qs[0] | ((qh[ib+0] << 8) & 0x700)]);
 const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[qs[7] | ((qh[ib+1] >> 1) & 0x700)], iq1s_grid[qs[6] | ((qh[ib+1] << 2) & 0x700)],
 iq1s_grid[qs[5] | ((qh[ib+1] << 5) & 0x700)], iq1s_grid[qs[4] | ((qh[ib+1] << 8) & 0x700)]);
+#endif
 qs += 8;
 const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
 const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
@@ -11709,8 +11718,9 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 
 #elif defined __AVX2__
 
-const __m256i mask = _mm256_set1_epi16(0x7);
+const __m256i mask = _mm256_set1_epi16(2 * 0x7);
 const __m256i mone = _mm256_set1_epi16(1);
+const __m256i mone8 = _mm256_set1_epi8(1);
 
 __m256 accum1 = _mm256_setzero_ps();
 __m256 accum2 = _mm256_setzero_ps();
@@ -11726,6 +11736,21 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 __m256i sumi1 = _mm256_setzero_si256();
 __m256i sumi2 = _mm256_setzero_si256();
 for (int ib = 0; ib < QK_K/32; ib += 2) {
+#ifdef __BMI2__
+const uint64_t packed_idx1 = _pdep_u64(*(const uint32_t *)qs, 0x00ff00ff00ff00ffULL)
+    | _pdep_u64(*(const uint16_t*)(qh) & 0x7777, 0xf000f000f000f00ULL);
+const uint64_t packed_idx2 = _pdep_u64(*(const uint32_t *)(qs + 4), 0x00ff00ff00ff00ffULL)
+    | _pdep_u64(*(const uint16_t*)(qh + 2) & 0x7777, 0xf000f000f000f00ULL);
+const uint16_t *idx1 = (const uint16_t *)(&packed_idx1);
+const uint16_t *idx2 = (const uint16_t *)(&packed_idx2);
+const __m256i q1b_1 = _mm256_set_epi64x(iq1s_grid[idx1[3]], iq1s_grid[idx1[2]], iq1s_grid[idx1[1]], iq1s_grid[idx1[0]]);
+const __m256i q1b_2 = _mm256_set_epi64x(iq1s_grid[idx2[3]], iq1s_grid[idx2[2]], iq1s_grid[idx2[1]], iq1s_grid[idx2[0]]);
+
+// Convert signs to bytes 0x81 (negative) or 0x01 (positive)
+const uint64_t delta_sign = _pdep_u64(*(const uint32_t*)(qh) & 0x88888888, 0xf0f0f0f0f0f0f0f0ULL);
+const __m256i delta1 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign)));
+const __m256i delta2 = _mm256_or_si256(mone8, _mm256_cvtepi8_epi64(_mm_set1_epi32(delta_sign >> 32)));
+#else
 const __m256i q1b_1 = _mm256_set_epi64x(
 iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)],
 iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)]
@@ -11734,11 +11759,6 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)],
 iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)]
 );
-const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
-
-const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
-const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
 
 const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
 qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
@@ -11748,15 +11768,20 @@ void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const
 qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101,
 qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101,
 qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101);
+#endif
+const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
+const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
 
-const __m256i dot3 = mul_add_epi8(delta1, q8b_1);
-const __m256i dot4 = mul_add_epi8(delta2, q8b_2);
+const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1);
+const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2);
+const __m256i dot3 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_1, delta1));
+const __m256i dot4 = _mm256_maddubs_epi16(mone8, _mm256_sign_epi8(q8b_2, delta2));
 
-__m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0));
-__m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6));
+__m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 2), _mm_set1_epi16(sc[ib/2] << 1));
+__m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 8), _mm_set1_epi16(sc[ib/2] >> 5));
 
-scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone);
-scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone);
+scale1 = _mm256_add_epi16(_mm256_and_si256(scale1, mask), mone);
+scale2 = _mm256_add_epi16(_mm256_and_si256(scale2, mask), mone);
 const __m256i p1 = _mm256_madd_epi16(dot1, scale1);
 const __m256i p2 = _mm256_madd_epi16(dot2, scale2);
 const __m256i p3 = _mm256_madd_epi16(dot3, scale1);
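
Two of the IQ1_M changes above are plain refactorings that can be sanity-checked in isolation: the scale computation folds the former <<1 into the pre-shift and the widened mask (2 * 0x7), and the per-group delta signs are gathered by depositing the 0x08/0x80 bits of each qh byte into bit 7 of consecutive bytes. A small standalone check (not part of the commit; assumes a little-endian x86-64 CPU with BMI2, sample qh values are arbitrary):

#include <assert.h>
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
    // 1) Scale folding: the old path computed ((sc >> s) & 0x7) * 2 + 1 with an
    //    explicit shift-left; the new path pre-shifts sc and masks with 0xE instead.
    for (uint32_t sc = 0; sc < 0x10000; ++sc) {
        assert(((((sc >> 0) & 0x7) << 1) + 1) == (((sc << 1) & 0xE) + 1));
        assert(((((sc >> 3) & 0x7) << 1) + 1) == (((sc >> 2) & 0xE) + 1));
        assert(((((sc >> 6) & 0x7) << 1) + 1) == (((sc >> 5) & 0xE) + 1));
        assert(((((sc >> 9) & 0x7) << 1) + 1) == (((sc >> 8) & 0xE) + 1));
    }

    // 2) Delta signs: _pdep_u64 moves the 0x08/0x80 sign bits of each qh byte
    //    into bit 7 of consecutive output bytes, so byte k of delta_sign is 0x80
    //    exactly when the k-th group of 8 weights uses the negative delta.
    const uint8_t qh[4] = { 0x9C, 0x03, 0x88, 0x70 }; // arbitrary example values
    uint32_t qh32;
    memcpy(&qh32, qh, sizeof(qh32));
    const uint64_t delta_sign = _pdep_u64(qh32 & 0x88888888u, 0xf0f0f0f0f0f0f0f0ULL);
    for (int k = 0; k < 8; ++k) {
        const int neg_ref  = (qh[k / 2] & ((k % 2) ? 0x80 : 0x08)) != 0;
        const int neg_pdep = ((delta_sign >> (8 * k)) & 0x80) != 0;
        assert(neg_ref == neg_pdep);
    }
    puts("scale folding and delta-sign extraction match the scalar forms");
    return 0;
}

In the kernel these identities let the inner loop drop one shift per scale and build the delta vectors from a single deposit instead of eight conditional 64-bit selects.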

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 8 additions & 0 deletions
@@ -15440,6 +15440,14 @@ int ggml_cpu_has_amx_int8(void) {
 #endif
 }
 
+int ggml_cpu_has_bmi2(void) {
+#if defined(__BMI2__)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_fma(void) {
 #if defined(__FMA__)
 return 1;

ggml/src/ggml-cpu/ggml-cpu.cpp

Lines changed: 3 additions & 0 deletions
@@ -511,6 +511,9 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
 if (ggml_cpu_has_fma()) {
 features.push_back({ "FMA", "1" });
 }
+if (ggml_cpu_has_bmi2()) {
+    features.push_back({ "BMI2", "1" });
+}
 if (ggml_cpu_has_avx512()) {
 features.push_back({ "AVX512", "1" });
 }

0 commit comments