
Commit c92980c

[X86] Fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
If we don't demand the same element from both single-source shuffles (permutes), then attempt to blend the two sources together first and then perform a merged permute.

We don't attempt this for vXi16 blends, as these are much more likely to involve byte/word vector shuffles that will result in the creation of many more variable-mask shuffles (PSHUFB etc.).

This fold might be worth it for VSELECT with constant masks on AVX512 targets, but I haven't investigated that yet.

The PR34592 -O0 regression is an unfortunate failure to clean up with a later SimplifyDemandedElts pass, as -O3 does - I'm not sure how worried we should be tbh.
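To make the fold concrete, here is a scalar model of a made-up 4 x i32 case (an illustrative sketch only, not code from this commit; the real combine works on X86ISD::BLENDI and shuffle DAG nodes, and the values and masks below are invented):

// Hypothetical 4-element model: BLEND(PERMUTE(X),PERMUTE(Y)) vs PERMUTE(BLEND(X,Y)).
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> X{10, 11, 12, 13}, Y{20, 21, 22, 23};
  std::array<int, 4> M0{2, 2, 3, 3}, M1{0, 0, 1, 1}; // the two permute masks

  // Before: permute each source, then blend (lanes 1,3 taken from the Y permute).
  std::array<int, 4> P0, P1, Before;
  for (int I = 0; I != 4; ++I) {
    P0[I] = X[M0[I]];
    P1[I] = Y[M1[I]];
  }
  for (int I = 0; I != 4; ++I)
    Before[I] = ((0b1010 >> I) & 1) ? P1[I] : P0[I];

  // The demanded indices of X ({2,3}, read in lanes 0,2) and of Y ({0,1},
  // read in lanes 1,3) don't collide, so the fold applies.
  // After: blend the sources on the indices demanded from Y first...
  std::array<int, 4> B, After;
  unsigned NewBlendMask = 0b0011; // the indices read from Y
  for (int I = 0; I != 4; ++I)
    B[I] = ((NewBlendMask >> I) & 1) ? Y[I] : X[I];

  // ...then apply one merged permute (M0 in blend lanes 0,2; M1 in lanes 1,3).
  std::array<int, 4> NewPermuteMask{2, 0, 3, 1};
  for (int I = 0; I != 4; ++I)
    After[I] = B[NewPermuteMask[I]];

  assert(Before == After); // {12, 20, 13, 21} both ways
  return 0;
}

Both orderings produce {12, 20, 13, 21}; the fold is only legal because the two permutes read disjoint element indices, which is exactly the !DemandedLHS0.intersects(DemandedLHS1) check in the patch below.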
1 parent 6e47330

13 files changed (+10045 / -11368 lines)

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 55 additions & 0 deletions
@@ -41754,6 +41754,61 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     KnownUndef = SrcUndef.zextOrTrunc(NumElts);
     break;
   }
+  case X86ISD::BLENDI: {
+    SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
+    if (VT.getScalarSizeInBits() < 32 || !N0.hasOneUse() || !N1.hasOneUse())
+      break;
+
+    // Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
+    // iff we don't demand the same element index for both X and Y.
+    SDValue BC0 = peekThroughOneUseBitcasts(N0);
+    SDValue BC1 = peekThroughOneUseBitcasts(N1);
+    SmallVector<SDValue, 2> Ops, Ops0, Ops1;
+    SmallVector<int, 32> Mask, Mask0, Mask1, ScaledMask0, ScaledMask1;
+    if (!getTargetShuffleMask(Op, false, Ops, Mask) ||
+        !getTargetShuffleMask(BC0, false, Ops0, Mask0) ||
+        !getTargetShuffleMask(BC1, false, Ops1, Mask1) ||
+        !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
+        !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
+      break;
+
+    // Determine the demanded elts from both permutes, confirm that we only use
+    // a single operand and that we don't demand the same index from both.
+    APInt Demanded0, DemandedLHS0, DemandedRHS0;
+    APInt Demanded1, DemandedLHS1, DemandedRHS1;
+    if (getShuffleDemandedElts(NumElts, Mask, DemandedElts, Demanded0,
+                               Demanded1, /*AllowUndefElts=*/true) &&
+        getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
+                               DemandedRHS0, /*AllowUndefElts=*/true) &&
+        getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
+                               DemandedRHS1, /*AllowUndefElts=*/true) &&
+        DemandedRHS0.isZero() && DemandedRHS1.isZero() &&
+        !DemandedLHS0.intersects(DemandedLHS1)) {
+      // Use the permute demanded elts masks as the new blend mask.
+      uint64_t NewBlendMask = DemandedLHS1.getZExtValue();
+
+      // Create the new permute mask as a blend of the 2 original permute masks.
+      SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
+      for (int I = 0; I != NumElts; ++I) {
+        if (Demanded0[I])
+          NewPermuteMask[I] = ScaledMask0[I];
+        else if (Demanded1[I])
+          NewPermuteMask[I] = ScaledMask1[I];
+      }
+      assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
+
+      SDLoc DL(Op);
+      SDValue NewBlend = TLO.DAG.getNode(
+          X86ISD::BLENDI, DL, VT, TLO.DAG.getBitcast(VT, Ops0[0]),
+          TLO.DAG.getBitcast(VT, Ops1[0]),
+          TLO.DAG.getTargetConstant(NewBlendMask, DL, MVT::i8));
+      return TLO.CombineTo(Op, TLO.DAG.getVectorShuffle(VT, DL, NewBlend,
+                                                        TLO.DAG.getUNDEF(VT),
+                                                        NewPermuteMask));
+    }
+    break;
+  }
   case X86ISD::BLENDV: {
     APInt SelUndef, SelZero;
     if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
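For intuition, the demanded-elts bookkeeping that gates the combine can be modeled standalone as below. This is a simplified sketch using uint64_t bitmasks in place of APInt; splitDemanded is a hypothetical stand-in, not LLVM's getShuffleDemandedElts:

// Sketch: which input lanes does a shuffle actually read, given the demanded
// output lanes?
#include <cstdint>
#include <vector>

struct DemandedSplit {
  uint64_t LHS = 0, RHS = 0; // bit I set => input lane I is read
};

DemandedSplit splitDemanded(const std::vector<int> &Mask,
                            uint64_t DemandedElts, int NumElts) {
  DemandedSplit D;
  for (int I = 0; I != NumElts; ++I) {
    if (!((DemandedElts >> I) & 1))
      continue; // output lane I isn't demanded
    int M = Mask[I];
    if (M < 0)
      continue; // undef lane demands nothing
    if (M < NumElts)
      D.LHS |= uint64_t(1) << M; // reads lane M of the first operand
    else
      D.RHS |= uint64_t(1) << (M - NumElts); // reads a second-operand lane
  }
  return D;
}

// The combine above fires only if, for both permutes, RHS == 0 (truly single
// source) and the two LHS sets are disjoint: (LHS0 & LHS1) == 0. LHS1 then
// becomes the immediate of the new BLENDI.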

llvm/test/CodeGen/X86/horizontal-sum.ll

Lines changed: 8 additions & 12 deletions
@@ -679,9 +679,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -704,9 +703,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
 ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
@@ -727,9 +725,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
 ; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -752,9 +749,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
 ; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
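The change above trades two vpshufd plus a vpblendw for a vpunpckhqdq plus a single vpshufd. The two sequences differ in lanes 2 and 3, but only lanes 0 and 1 are demanded by the rest of the function, which is what lets the fold rewrite the shuffle. A minimal SSE4.1 intrinsics check with made-up inputs (my reconstruction, not part of the commit):

// Old vs new AVX1-SLOW tail for sequential_sum_v4i32_v4i32.
#include <immintrin.h>
#include <cassert>

int main() {
  __m128i x = _mm_setr_epi32(0, 1, 2, 3);
  __m128i y = _mm_setr_epi32(4, 5, 6, 7);

  // Old: xmm1 = y[2,3,2,3]; xmm0 = x[3,3,3,3]; pblendw takes dword 1 from xmm1.
  __m128i y0 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 2, 3, 2));
  __m128i x0 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 3, 3));
  __m128i oldv = _mm_blend_epi16(x0, y0, 0x0C); // words 2,3 = dword lane 1

  // New: vpunpckhqdq merges the high qwords, then one vpshufd finishes.
  __m128i t = _mm_unpackhi_epi64(x, y); // {x[2], x[3], y[2], y[3]}
  __m128i newv = _mm_shuffle_epi32(t, _MM_SHUFFLE(3, 2, 3, 1)); // t[1,3,2,3]

  // Only lanes 0 and 1 are demanded downstream, and they agree.
  assert(_mm_extract_epi32(oldv, 0) == _mm_extract_epi32(newv, 0)); // x[3]
  assert(_mm_extract_epi32(oldv, 1) == _mm_extract_epi32(newv, 1)); // y[3]
  return 0;
}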

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 28 additions & 48 deletions
@@ -1450,19 +1450,17 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
 ; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3]
 ; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2]
-; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,2]
 ; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
 ; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3]
 ; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7]
 ; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3]
 ; SSE42-NEXT: movups %xmm5, 16(%rsi)
 ; SSE42-NEXT: movups %xmm4, (%rsi)
-; SSE42-NEXT: movdqu %xmm10, 16(%rdx)
+; SSE42-NEXT: movdqu %xmm8, 16(%rdx)
 ; SSE42-NEXT: movdqu %xmm6, (%rdx)
 ; SSE42-NEXT: movups %xmm9, 16(%rcx)
 ; SSE42-NEXT: movups %xmm7, (%rcx)
@@ -1504,19 +1502,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
 ; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -1534,26 +1527,18 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1
 ; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
-; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
 ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
 ; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rsi)
 ; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdx)
 ; AVX2-FAST-ALL-NEXT: vmovups %ymm0, (%rcx)
@@ -1565,19 +1550,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
 ; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0
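The interleave_24i32_out diffs show the fold's intended effect: blend the three source vectors first, then run one full-width vpermps, rather than permuting each source and blending the partial results. A short AVX2 intrinsics sketch of the new shape for the first output vector, using made-up inputs (a reconstruction, not code from the commit):

// Deinterleave elements 0,3,6,...,21 out of 24 floats held in three ymms.
#include <immintrin.h>
#include <cassert>

int main() {
  float in[24];
  for (int i = 0; i != 24; ++i)
    in[i] = (float)i;
  __m256 v0 = _mm256_loadu_ps(in + 0);
  __m256 v1 = _mm256_loadu_ps(in + 8);
  __m256 v2 = _mm256_loadu_ps(in + 16);

  // BLEND(BLEND(v0, v1), v2): each output lane picks the source that holds
  // the stride-3 element destined for that lane.
  __m256 b = _mm256_blend_ps(v0, v1, 0x92); // lanes 1,4,7 from v1
  b = _mm256_blend_ps(b, v2, 0x24);         // lanes 2,5 from v2
  // One full 8-element cross-lane permute finishes the job.
  __m256i idx = _mm256_setr_epi32(0, 3, 6, 1, 4, 7, 2, 5);
  __m256 r = _mm256_permutevar8x32_ps(b, idx);

  float out[8];
  _mm256_storeu_ps(out, r);
  for (int i = 0; i != 8; ++i)
    assert(out[i] == (float)(3 * i)); // {0,3,6,9,12,15,18,21}
  return 0;
}

The same reshaping is what removes the [2,5,2,5,...] broadcasts and the trailing lane-combining blends in each of the AVX2 variants above.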

llvm/test/CodeGen/X86/pr34592.ll

Lines changed: 24 additions & 22 deletions
@@ -8,38 +8,40 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
 ; CHECK-O0-NEXT: pushq %rbp
 ; CHECK-O0-NEXT: movq %rsp, %rbp
 ; CHECK-O0-NEXT: andq $-32, %rsp
-; CHECK-O0-NEXT: subq $32, %rsp
+; CHECK-O0-NEXT: subq $64, %rsp
 ; CHECK-O0-NEXT: vmovaps %ymm4, %ymm10
 ; CHECK-O0-NEXT: vmovaps %ymm3, %ymm9
+; CHECK-O0-NEXT: vmovaps %ymm2, (%rsp) # 32-byte Spill
 ; CHECK-O0-NEXT: vmovaps %ymm1, %ymm8
+; CHECK-O0-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-O0-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
 ; CHECK-O0-NEXT: vmovaps 240(%rbp), %ymm4
-; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm3
-; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm1
-; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm1
+; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm1
+; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm2
+; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm2
 ; CHECK-O0-NEXT: vmovaps 112(%rbp), %ymm11
 ; CHECK-O0-NEXT: vmovaps 80(%rbp), %ymm11
 ; CHECK-O0-NEXT: vmovaps 48(%rbp), %ymm11
 ; CHECK-O0-NEXT: vmovaps 16(%rbp), %ymm11
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
 ; CHECK-O0-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
-; CHECK-O0-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1]
+; CHECK-O0-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; CHECK-O0-NEXT: vmovaps %xmm1, %xmm3
+; CHECK-O0-NEXT: vmovaps %xmm7, %xmm1
+; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; CHECK-O0-NEXT: # implicit-def: $ymm1
+; CHECK-O0-NEXT: vmovaps %xmm3, %xmm1
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,3]
+; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5],ymm1[6,7]
 ; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
-; CHECK-O0-NEXT: vmovaps %xmm3, %xmm4
-; CHECK-O0-NEXT: vmovaps %xmm7, %xmm3
-; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
-; CHECK-O0-NEXT: # implicit-def: $ymm3
-; CHECK-O0-NEXT: vmovaps %xmm4, %xmm3
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,0,1,3]
-; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5],ymm3[6,7]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
 ; CHECK-O0-NEXT: movq %rbp, %rsp
 ; CHECK-O0-NEXT: popq %rbp
 ; CHECK-O0-NEXT: retq
