Skip to content

Commit 6727526

Browse files
authored
[X86] X86DAGToDAGISel - attempt to merge XMM/YMM loads with YMM/ZMM loads of the same ptr (#73126)
If we are loading the same ptr at different vector widths, then reuse the larger load and just extract the low subvector. Unlike the equivalent VBROADCAST_LOAD/SUBV_BROADCAST_LOAD folds which can occur in DAG, we have to wait until DAGISel otherwise we can hit infinite loops if constant folding recreates the original constant value. This is mainly useful for better constant sharing.
1 parent aaae104 commit 6727526

37 files changed

+6886
-6858
lines changed

llvm/lib/Target/X86/X86ISelDAGToDAG.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,6 +1036,43 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
10361036

10371037
break;
10381038
}
1039+
case ISD::LOAD: {
1040+
// If this is a XMM/YMM load of the same lower bits as another YMM/ZMM
1041+
// load, then just extract the lower subvector and avoid the second load.
1042+
auto *Ld = cast<LoadSDNode>(N);
1043+
MVT VT = N->getSimpleValueType(0);
1044+
if (!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
1045+
!(VT.is128BitVector() || VT.is256BitVector()))
1046+
break;
1047+
1048+
SDValue Ptr = Ld->getBasePtr();
1049+
SDValue Chain = Ld->getChain();
1050+
for (SDNode *User : Ptr->uses()) {
1051+
auto *UserLd = dyn_cast<LoadSDNode>(User);
1052+
MVT UserVT = User->getSimpleValueType(0);
1053+
if (User != N && UserLd && ISD::isNormalLoad(User) &&
1054+
UserLd->getBasePtr() == Ptr && UserLd->getChain() == Chain &&
1055+
!User->hasAnyUseOfValue(1) &&
1056+
(UserVT.is256BitVector() || UserVT.is512BitVector()) &&
1057+
UserVT.getSizeInBits() > VT.getSizeInBits()) {
1058+
SDLoc dl(N);
1059+
unsigned NumSubElts =
1060+
VT.getSizeInBits() / UserVT.getScalarSizeInBits();
1061+
MVT SubVT = MVT::getVectorVT(UserVT.getScalarType(), NumSubElts);
1062+
SDValue Extract = CurDAG->getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT,
1063+
SDValue(User, 0),
1064+
CurDAG->getIntPtrConstant(0, dl));
1065+
SDValue Res = CurDAG->getBitcast(VT, Extract);
1066+
--I;
1067+
SDValue To[] = {Res, SDValue(UserLd, 1)};
1068+
CurDAG->ReplaceAllUsesWith(N, To);
1069+
++I;
1070+
MadeChange = true;
1071+
continue;
1072+
}
1073+
}
1074+
break;
1075+
}
10391076
case ISD::VSELECT: {
10401077
// Replace VSELECT with non-mask conditions with BLENDV/VPTERNLOG.
10411078
EVT EleVT = N->getOperand(0).getValueType().getVectorElementType();

llvm/test/CodeGen/X86/avx512-regcall-Mask.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -98,10 +98,8 @@ define dso_local i64 @caller_argv64i1() #0 {
9898
; X32: # %bb.0: # %entry
9999
; X32-NEXT: pushl %edi
100100
; X32-NEXT: subl $88, %esp
101-
; X32-NEXT: vmovddup {{.*#+}} xmm0 = [2,1,2,1]
102-
; X32-NEXT: # xmm0 = mem[0,0]
103-
; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
104101
; X32-NEXT: vbroadcastsd {{.*#+}} zmm0 = [2,1,2,1,2,1,2,1,2,1,2,1,2,1,2,1]
102+
; X32-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp)
105103
; X32-NEXT: vmovups %zmm0, (%esp)
106104
; X32-NEXT: movl $1, {{[0-9]+}}(%esp)
107105
; X32-NEXT: movl $2, {{[0-9]+}}(%esp)

llvm/test/CodeGen/X86/bfloat.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1401,9 +1401,9 @@ define <32 x bfloat> @pr63017_2() nounwind {
14011401
; AVXNC-NEXT: jne .LBB12_2
14021402
; AVXNC-NEXT: # %bb.1: # %cond.load
14031403
; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm1 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
1404-
; AVXNC-NEXT: vpbroadcastw {{.*#+}} xmm0 = [49024,49024,49024,49024,49024,49024,49024,49024]
1405-
; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0
1406-
; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7]
1404+
; AVXNC-NEXT: vpbroadcastw {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024]
1405+
; AVXNC-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm2
1406+
; AVXNC-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
14071407
; AVXNC-NEXT: .LBB12_2: # %else
14081408
; AVXNC-NEXT: xorl %eax, %eax
14091409
; AVXNC-NEXT: testb %al, %al

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -209,8 +209,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
209209
; AVX1-NEXT: vmovd %edi, %xmm0
210210
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
211211
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
212-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
213-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
212+
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
213+
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
214+
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
214215
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
215216
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
216217
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -255,8 +256,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
255256
; AVX1-NEXT: vmovd %edi, %xmm0
256257
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
257258
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
258-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
259-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
259+
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
260+
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
261+
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
260262
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
261263
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
262264
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -303,8 +305,9 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
303305
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
304306
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
305307
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
306-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
307-
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
308+
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
309+
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
310+
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
308311
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
309312
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
310313
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -421,13 +424,15 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
421424
; AVX1-NEXT: vmovd %edi, %xmm0
422425
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
423426
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
424-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
425-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
426-
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
427-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
428-
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
429-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
430-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
427+
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
428+
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
429+
; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0
430+
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
431+
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
432+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
433+
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128]
434+
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
435+
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
431436
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
432437
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
433438
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
@@ -482,13 +487,15 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
482487
; AVX1-NEXT: vmovd %edi, %xmm0
483488
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
484489
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
485-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
486-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
487-
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
488-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
489-
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
490-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
491-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
490+
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
491+
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
492+
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
493+
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
494+
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
495+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
496+
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
497+
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
498+
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
492499
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
493500
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
494501
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
@@ -549,17 +556,16 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
549556
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
550557
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768]
551558
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
552-
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128]
553-
; AVX1-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0
559+
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
554560
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
555561
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
556562
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
557563
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
558564
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
559-
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
560-
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm2, %xmm2
561-
; AVX1-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1
562-
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
565+
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
566+
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm3, %xmm3
567+
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
568+
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
563569
; AVX1-NEXT: retq
564570
;
565571
; AVX2-LABEL: ext_i32_32i16:

llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll

Lines changed: 36 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,9 @@ define <4 x i64> @ext_i4_4i64(i4 %a0) {
265265
; AVX1-NEXT: vmovd %edi, %xmm0
266266
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
267267
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
268-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
269-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
268+
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8]
269+
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
270+
; AVX1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm1
270271
; AVX1-NEXT: vpsrlq $63, %xmm1, %xmm1
271272
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
272273
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -324,8 +325,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
324325
; AVX1-NEXT: vmovd %edi, %xmm0
325326
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
326327
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
327-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
328-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
328+
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
329+
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
330+
; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
329331
; AVX1-NEXT: vpsrld $31, %xmm1, %xmm1
330332
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
331333
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -385,8 +387,9 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) {
385387
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
386388
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
387389
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
388-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
389-
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
390+
; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
391+
; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
392+
; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm1
390393
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
391394
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
392395
; AVX1-NEXT: vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -544,15 +547,17 @@ define <8 x i64> @ext_i8_8i64(i8 %a0) {
544547
; AVX1-NEXT: vmovd %edi, %xmm0
545548
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
546549
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
547-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
548-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
549-
; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
550-
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
551-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
550+
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8]
551+
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
552+
; AVX1-NEXT: vpcmpeqq %xmm0, %xmm2, %xmm0
552553
; AVX1-NEXT: vpsrlq $63, %xmm0, %xmm0
553-
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
554-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
555-
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
554+
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
555+
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
556+
; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
557+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
558+
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [16,32,64,128]
559+
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
560+
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm2
556561
; AVX1-NEXT: vpsrlq $63, %xmm2, %xmm2
557562
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
558563
; AVX1-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -623,15 +628,17 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
623628
; AVX1-NEXT: vmovd %edi, %xmm0
624629
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
625630
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
626-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
627-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
628-
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
629-
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
630-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
631+
; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
632+
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
633+
; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
631634
; AVX1-NEXT: vpsrld $31, %xmm0, %xmm0
632-
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
633-
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
634-
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
635+
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
636+
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
637+
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
638+
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
639+
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
640+
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
641+
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
635642
; AVX1-NEXT: vpsrld $31, %xmm2, %xmm2
636643
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
637644
; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -705,22 +712,21 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) {
705712
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
706713
; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
707714
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
708-
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128]
709-
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm0, %xmm4
710-
; AVX1-NEXT: vpsrlw $15, %xmm4, %xmm4
715+
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3
716+
; AVX1-NEXT: vpsrlw $15, %xmm3, %xmm3
711717
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
712-
; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [256,512,1024,2048,4096,8192,16384,32768]
713-
; AVX1-NEXT: vpcmpeqw %xmm5, %xmm0, %xmm0
718+
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [256,512,1024,2048,4096,8192,16384,32768]
719+
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0
714720
; AVX1-NEXT: vpsrlw $15, %xmm0, %xmm0
715-
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
721+
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
716722
; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
717723
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
718724
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
719725
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
720-
; AVX1-NEXT: vpcmpeqw %xmm3, %xmm1, %xmm2
726+
; AVX1-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
721727
; AVX1-NEXT: vpsrlw $15, %xmm2, %xmm2
722728
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
723-
; AVX1-NEXT: vpcmpeqw %xmm5, %xmm1, %xmm1
729+
; AVX1-NEXT: vpcmpeqw %xmm4, %xmm1, %xmm1
724730
; AVX1-NEXT: vpsrlw $15, %xmm1, %xmm1
725731
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
726732
; AVX1-NEXT: retq

0 commit comments

Comments
 (0)