Skip to content

Commit 44e706b

Browse files
committed
fix(avx): workaround for missing _mm256_set_m128i in GCC < 9
1 parent 2483676 commit 44e706b

File tree

1 file changed

+12
-7
lines changed

1 file changed

+12
-7
lines changed

ggml.c

Lines changed: 12 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -188,6 +188,11 @@ typedef double ggml_float;
188188
#else
189189
#if !defined(__riscv)
190190
#include <immintrin.h>
191+
#if (defined(__GNUC__) && __GNUC__ >= 9) || defined(__INTEL_COMPILER)
192+
#define MM256_SET_M128I(a, b) _mm256_set_m128i((a), (b))
193+
#else
194+
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
195+
#endif
191196
#endif
192197
#endif
193198
#endif
@@ -533,7 +538,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
533538
static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
534539
{
535540
const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
536-
const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
541+
const __m256i bytes = MM256_SET_M128I(_mm_srli_epi16(tmp, 4), tmp);
537542
const __m256i lowMask = _mm256_set1_epi8( 0xF );
538543
return _mm256_and_si256(lowMask, bytes);
539544
}
@@ -606,7 +611,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
606611
bytesh = _mm_or_si128(bytesh, bit_mask);
607612
bytesl = _mm_cmpeq_epi8(bytesl, _mm_set1_epi64x(-1));
608613
bytesh = _mm_cmpeq_epi8(bytesh, _mm_set1_epi64x(-1));
609-
return _mm256_set_m128i(bytesh, bytesl);
614+
return MM256_SET_M128I(bytesh, bytesl);
610615
}
611616

612617
// Unpack 32 4-bit fields into 32 bytes
@@ -619,15 +624,15 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
619624
const __m128i lowMask = _mm_set1_epi8(0xF);
620625
tmpl = _mm_and_si128(lowMask, tmpl);
621626
tmph = _mm_and_si128(lowMask, tmph);
622-
return _mm256_set_m128i(tmph, tmpl);
627+
return MM256_SET_M128I(tmph, tmpl);
623628
}
624629

625630
// add int16_t pairwise and return as float vector
626631
static inline __m256 sum_i16_pairs_float(const __m128i xh, const __m128i xl) {
627632
const __m128i ones = _mm_set1_epi16(1);
628633
const __m128i summed_pairsl = _mm_madd_epi16(ones, xl);
629634
const __m128i summed_pairsh = _mm_madd_epi16(ones, xh);
630-
const __m256i summed_pairs = _mm256_set_m128i(summed_pairsh, summed_pairsl);
635+
const __m256i summed_pairs = MM256_SET_M128I(summed_pairsh, summed_pairsl);
631636
return _mm256_cvtepi32_ps(summed_pairs);
632637
}
633638

@@ -2290,7 +2295,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
22902295
const __m128i i32_1 = mul_sum_i8_pairs(bx, by);
22912296

22922297
// Convert int32_t to float
2293-
__m256 p = _mm256_cvtepi32_ps(_mm256_set_m128i(i32_0, i32_1));
2298+
__m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
22942299

22952300
// Apply the scale, and accumulate
22962301
acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
@@ -2766,7 +2771,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
27662771
__m128i bxh = _mm256_extractf128_si256(bx, 1);
27672772
bxl = _mm_or_si128(bxl, bxhil);
27682773
bxh = _mm_or_si128(bxh, bxhih);
2769-
bx = _mm256_set_m128i(bxh, bxl);
2774+
bx = MM256_SET_M128I(bxh, bxl);
27702775

27712776
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
27722777

@@ -3022,7 +3027,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
30223027
__m128i bxh = _mm256_extractf128_si256(bx, 1);
30233028
bxl = _mm_or_si128(bxl, bxhil);
30243029
bxh = _mm_or_si128(bxh, bxhih);
3025-
bx = _mm256_set_m128i(bxh, bxl);
3030+
bx = MM256_SET_M128I(bxh, bxl);
30263031

30273032
const __m256 dy = _mm256_set1_ps(y[i].d);
30283033
const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);

0 commit comments

Comments
 (0)