@@ -188,6 +188,11 @@ typedef double ggml_float;
188
188
#else
189
189
#if !defined(__riscv )
190
190
#include <immintrin.h>
191
+ #if (defined(__GNUC__ ) && __GNUC__ >= 9 ) || defined(__INTEL_COMPILER )
192
+ #define MM256_SET_M128I(a, b) _mm256_set_m128i((a), (b))
193
+ #else
194
+ #define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
195
+ #endif
191
196
#endif
192
197
#endif
193
198
#endif
@@ -533,7 +538,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
533
538
static inline __m256i bytes_from_nibbles_32 (const uint8_t * rsi )
534
539
{
535
540
const __m128i tmp = _mm_loadu_si128 ((const __m128i * )rsi );
536
- const __m256i bytes = _mm256_set_m128i (_mm_srli_epi16 (tmp , 4 ), tmp );
541
+ const __m256i bytes = MM256_SET_M128I (_mm_srli_epi16 (tmp , 4 ), tmp );
537
542
const __m256i lowMask = _mm256_set1_epi8 ( 0xF );
538
543
return _mm256_and_si256 (lowMask , bytes );
539
544
}
@@ -606,7 +611,7 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
606
611
bytesh = _mm_or_si128 (bytesh , bit_mask );
607
612
bytesl = _mm_cmpeq_epi8 (bytesl , _mm_set1_epi64x (-1 ));
608
613
bytesh = _mm_cmpeq_epi8 (bytesh , _mm_set1_epi64x (-1 ));
609
- return _mm256_set_m128i (bytesh , bytesl );
614
+ return MM256_SET_M128I (bytesh , bytesl );
610
615
}
611
616
612
617
// Unpack 32 4-bit fields into 32 bytes
@@ -619,15 +624,15 @@ static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi)
619
624
const __m128i lowMask = _mm_set1_epi8 (0xF );
620
625
tmpl = _mm_and_si128 (lowMask , tmpl );
621
626
tmph = _mm_and_si128 (lowMask , tmph );
622
- return _mm256_set_m128i (tmph , tmpl );
627
+ return MM256_SET_M128I (tmph , tmpl );
623
628
}
624
629
625
630
// add int16_t pairwise and return as float vector
626
631
static inline __m256 sum_i16_pairs_float (const __m128i xh , const __m128i xl ) {
627
632
const __m128i ones = _mm_set1_epi16 (1 );
628
633
const __m128i summed_pairsl = _mm_madd_epi16 (ones , xl );
629
634
const __m128i summed_pairsh = _mm_madd_epi16 (ones , xh );
630
- const __m256i summed_pairs = _mm256_set_m128i (summed_pairsh , summed_pairsl );
635
+ const __m256i summed_pairs = MM256_SET_M128I (summed_pairsh , summed_pairsl );
631
636
return _mm256_cvtepi32_ps (summed_pairs );
632
637
}
633
638
@@ -2290,7 +2295,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2290
2295
const __m128i i32_1 = mul_sum_i8_pairs (bx , by );
2291
2296
2292
2297
// Convert int32_t to float
2293
- __m256 p = _mm256_cvtepi32_ps (_mm256_set_m128i (i32_0 , i32_1 ));
2298
+ __m256 p = _mm256_cvtepi32_ps (MM256_SET_M128I (i32_0 , i32_1 ));
2294
2299
2295
2300
// Apply the scale, and accumulate
2296
2301
acc = _mm256_add_ps (_mm256_mul_ps ( d , p ), acc );
@@ -2766,7 +2771,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
2766
2771
__m128i bxh = _mm256_extractf128_si256 (bx , 1 );
2767
2772
bxl = _mm_or_si128 (bxl , bxhil );
2768
2773
bxh = _mm_or_si128 (bxh , bxhih );
2769
- bx = _mm256_set_m128i (bxh , bxl );
2774
+ bx = MM256_SET_M128I (bxh , bxl );
2770
2775
2771
2776
const __m256i by = _mm256_loadu_si256 ((const __m256i * )y [i ].qs );
2772
2777
@@ -3022,7 +3027,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3022
3027
__m128i bxh = _mm256_extractf128_si256 (bx , 1 );
3023
3028
bxl = _mm_or_si128 (bxl , bxhil );
3024
3029
bxh = _mm_or_si128 (bxh , bxhih );
3025
- bx = _mm256_set_m128i (bxh , bxl );
3030
+ bx = MM256_SET_M128I (bxh , bxl );
3026
3031
3027
3032
const __m256 dy = _mm256_set1_ps (y [i ].d );
3028
3033
const __m256i by = _mm256_loadu_si256 ((const __m256i * )y [i ].qs );
0 commit comments