Skip to content

Commit ea685e1

Browse files
committed
[X86][AVX] Update _mm256_loadu2_m128* intrinsics to use _mm256_set_m128* (PR51796)
As reported on PR51796, the _mm256_loadu2_m128i intrinsic in particular was inserting bitcasts and shuffles with different types, making it trickier for some combines and preventing the value tracker from identifying the shuffle sequences as a single insert_subvector-style concat_vectors pattern. This patch instead concatenates the 128-bit unaligned loads with _mm256_set_m128*, which was written to avoid the unnecessary bitcasts and emits only a single shuffle. Differential Revision: https://reviews.llvm.org/D109497
1 parent dd662f0 commit ea685e1

File tree

2 files changed

+6
-15
lines changed

2 files changed

+6
-15
lines changed

clang/lib/Headers/avxintrin.h

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4902,8 +4902,7 @@ _mm256_setr_m128i (__m128i __lo, __m128i __hi)
49024902
static __inline __m256 __DEFAULT_FN_ATTRS
49034903
_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
49044904
{
4905-
__m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
4906-
return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
4905+
return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
49074906
}
49084907

49094908
/// Loads two 128-bit floating-point vectors of [2 x double] from
@@ -4930,8 +4929,7 @@ _mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
49304929
static __inline __m256d __DEFAULT_FN_ATTRS
49314930
_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
49324931
{
4933-
__m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
4934-
return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
4932+
return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
49354933
}
49364934

49374935
/// Loads two 128-bit integer vectors from unaligned memory locations and
@@ -4955,8 +4953,7 @@ _mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
49554953
static __inline __m256i __DEFAULT_FN_ATTRS
49564954
_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
49574955
{
4958-
__m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
4959-
return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
4956+
return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
49604957
}
49614958

49624959
/* SIMD store ops (unaligned) */

clang/test/CodeGen/X86/avx-builtins.c

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1233,30 +1233,24 @@ __m256i test_mm256_loadu_si256(__m256i* A) {
12331233
__m256 test_mm256_loadu2_m128(float* A, float* B) {
12341234
// CHECK-LABEL: test_mm256_loadu2_m128
12351235
// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
1236-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
12371236
// CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 1{{$}}
1238-
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1239-
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1237+
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
12401238
return _mm256_loadu2_m128(A, B);
12411239
}
12421240

12431241
__m256d test_mm256_loadu2_m128d(double* A, double* B) {
12441242
// CHECK-LABEL: test_mm256_loadu2_m128d
12451243
// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
1246-
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
12471244
// CHECK: load <2 x double>, <2 x double>* %{{.*}}, align 1{{$}}
1248-
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1249-
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
1245+
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
12501246
return _mm256_loadu2_m128d(A, B);
12511247
}
12521248

12531249
__m256i test_mm256_loadu2_m128i(__m128i* A, __m128i* B) {
12541250
// CHECK-LABEL: test_mm256_loadu2_m128i
12551251
// CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
1256-
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
12571252
// CHECK: load <2 x i64>, <2 x i64>* %{{.*}}, align 1{{$}}
1258-
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1259-
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
1253+
// CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
12601254
return _mm256_loadu2_m128i(A, B);
12611255
}
12621256

0 commit comments

Comments
 (0)