[X86] Fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y)) #90219
Conversation
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

If we don't demand the same element from both single source shuffles (permutes), then attempt to blend the sources together first and then perform a merged permute.

We don't attempt this for vXi16 blends as these are much more likely to involve byte/word vector shuffles that will result in the creation of a lot more variable-mask shuffles (PSHUFB etc.).

This fold might be worth it for VSELECT with constant masks on AVX512 targets, but I haven't investigated this yet.

The PR34592 -O0 regression is an unfortunate failure to cleanup with a later SimplifyDemandedElts pass like the -O3 does - I'm not sure how worried we should be tbh.

Patch is 1.66 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/90219.diff

13 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f0cec6224e84e4..6080e950d169d5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41753,6 +41753,61 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownUndef = SrcUndef.zextOrTrunc(NumElts);
break;
}
+ case X86ISD::BLENDI: {
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ if (VT.getScalarSizeInBits() < 32 || !N0.hasOneUse() || !N1.hasOneUse())
+ break;
+
+ // Attempt to fold BLEND(PERMUTE(X),PERMUTE(Y)) -> PERMUTE(BLEND(X,Y))
+ // iff we don't demand the same element index for both X and Y.
+ SDValue BC0 = peekThroughOneUseBitcasts(N0);
+ SDValue BC1 = peekThroughOneUseBitcasts(N1);
+ SmallVector<SDValue, 2> Ops, Ops0, Ops1;
+ SmallVector<int, 32> Mask, Mask0, Mask1, ScaledMask0, ScaledMask1;
+ if (!getTargetShuffleMask(Op, false, Ops, Mask) ||
+ !getTargetShuffleMask(BC0, false, Ops0, Mask0) ||
+ !getTargetShuffleMask(BC1, false, Ops1, Mask1) ||
+ !scaleShuffleElements(Mask0, NumElts, ScaledMask0) ||
+ !scaleShuffleElements(Mask1, NumElts, ScaledMask1))
+ break;
+
+ // Determine the demanded elts from both permutes, confirm that we only use
+ // a single operand and that we don't demand the same index from both.
+ APInt Demanded0, DemandedLHS0, DemandedRHS0;
+ APInt Demanded1, DemandedLHS1, DemandedRHS1;
+ if (getShuffleDemandedElts(NumElts, Mask, DemandedElts, Demanded0,
+ Demanded1, /*AllowUndefElts=*/true) &&
+ getShuffleDemandedElts(NumElts, ScaledMask0, Demanded0, DemandedLHS0,
+ DemandedRHS0, /*AllowUndefElts=*/true) &&
+ getShuffleDemandedElts(NumElts, ScaledMask1, Demanded1, DemandedLHS1,
+ DemandedRHS1, /*AllowUndefElts=*/true) &&
+ DemandedRHS0.isZero() && DemandedRHS1.isZero() &&
+ !DemandedLHS0.intersects(DemandedLHS1)) {
+ // Use the permute demanded elts masks as the new blend mask.
+ uint64_t NewBlendMask = DemandedLHS1.getZExtValue();
+
+ // Create the new permute mask as a blend of the 2 original permute masks.
+ SmallVector<int, 32> NewPermuteMask(NumElts, SM_SentinelUndef);
+ for (int I = 0; I != NumElts; ++I) {
+ if (Demanded0[I])
+ NewPermuteMask[I] = ScaledMask0[I];
+ else if (Demanded1[I])
+ NewPermuteMask[I] = ScaledMask1[I];
+ }
+ assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
+
+ SDLoc DL(Op);
+ SDValue NewBlend = TLO.DAG.getNode(
+ X86ISD::BLENDI, DL, VT, TLO.DAG.getBitcast(VT, Ops0[0]),
+ TLO.DAG.getBitcast(VT, Ops1[0]),
+ TLO.DAG.getTargetConstant(NewBlendMask, DL, MVT::i8));
+ return TLO.CombineTo(Op, TLO.DAG.getVectorShuffle(VT, DL, NewBlend,
+ TLO.DAG.getUNDEF(VT),
+ NewPermuteMask));
+ }
+ break;
+ }
case X86ISD::BLENDV: {
APInt SelUndef, SelZero;
if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef,
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index a026757a0264d6..5fe1e2996ee9b0 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -679,9 +679,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -704,9 +703,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
+; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
@@ -727,9 +725,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-SLOW-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
@@ -752,9 +749,8 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm4
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
+; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 01056a8b2c24a9..44f965d1c6e84e 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1450,19 +1450,17 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; SSE42-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,3,2,3]
; SSE42-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm1[2,3]
; SSE42-NEXT: insertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm0[1]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm3[2,2,2,2]
-; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,3]
-; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5],xmm10[6,7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,3]
-; SSE42-NEXT: pshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
-; SSE42-NEXT: pblendw {{.*#+}} xmm10 = xmm8[0,1,2,3,4,5],xmm10[6,7]
+; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,0,3,2]
+; SSE42-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5],xmm8[6,7]
+; SSE42-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,0,3,2]
; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm2[2,3],xmm7[4,5,6,7]
; SSE42-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm3[0,3]
; SSE42-NEXT: pblendw {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3],xmm9[4,5,6,7]
; SSE42-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,3]
; SSE42-NEXT: movups %xmm5, 16(%rsi)
; SSE42-NEXT: movups %xmm4, (%rsi)
-; SSE42-NEXT: movdqu %xmm10, 16(%rdx)
+; SSE42-NEXT: movdqu %xmm8, 16(%rdx)
; SSE42-NEXT: movdqu %xmm6, (%rdx)
; SSE42-NEXT: movups %xmm9, 16(%rcx)
; SSE42-NEXT: movups %xmm7, (%rcx)
@@ -1504,19 +1502,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-SLOW-NEXT: vmovups (%rdi), %ymm0
; AVX2-SLOW-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-SLOW-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -1534,26 +1527,18 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-ALL-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-ALL-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-ALL-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-ALL-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
-; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7]
-; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2
; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
+; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7]
+; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7]
; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rsi)
; AVX2-FAST-ALL-NEXT: vmovups %ymm4, (%rdx)
; AVX2-FAST-ALL-NEXT: vmovups %ymm0, (%rcx)
@@ -1565,19 +1550,14 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
; AVX2-FAST-PERLANE-NEXT: vmovups (%rdi), %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovups 32(%rdi), %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovups 64(%rdi), %ymm2
-; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7]
; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
index 23de746ecb3539..bce060c4c6f56a 100644
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -8,38 +8,40 @@ define <16 x i64> @pluto(<16 x i64> %arg, <16 x i64> %arg1, <16 x i64> %arg2, <1
; CHECK-O0-NEXT: pushq %rbp
; CHECK-O0-NEXT: movq %rsp, %rbp
; CHECK-O0-NEXT: andq $-32, %rsp
-; CHECK-O0-NEXT: subq $32, %rsp
+; CHECK-O0-NEXT: subq $64, %rsp
; CHECK-O0-NEXT: vmovaps %ymm4, %ymm10
; CHECK-O0-NEXT: vmovaps %ymm3, %ymm9
+; CHECK-O0-NEXT: vmovaps %ymm2, (%rsp) # 32-byte Spill
; CHECK-O0-NEXT: vmovaps %ymm1, %ymm8
+; CHECK-O0-NEXT: vmovaps %ymm0, %ymm3
+; CHECK-O0-NEXT: vmovaps (%rsp), %ymm0 # 32-byte Reload
; CHECK-O0-NEXT: vmovaps 240(%rbp), %ymm4
-; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm3
-; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm1
-; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm1
+; CHECK-O0-NEXT: vmovaps 208(%rbp), %ymm1
+; CHECK-O0-NEXT: vmovaps 176(%rbp), %ymm2
+; CHECK-O0-NEXT: vmovaps 144(%rbp), %ymm2
; CHECK-O0-NEXT: vmovaps 112(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 80(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 48(%rbp), %ymm11
; CHECK-O0-NEXT: vmovaps 16(%rbp), %ymm11
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[2],ymm1[2]
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; CHECK-O0-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,1,2,1]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
-; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm7[2,3],ymm6[0,1]
-; CHECK-O0-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5],ymm0[6,7]
+; CHECK-O0-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm7[2,3],ymm6[0,1]
+; CHECK-O0-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5,6,7]
+; CHECK-O0-NEXT: vmovaps %xmm1, %xmm3
+; CHECK-O0-NEXT: vmovaps %xmm7, %xmm1
+; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
+; CHECK-O0-NEXT: # implicit-def: $ymm1
+; CHECK-O0-NEXT: vmovaps %xmm3, %xmm1
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,0,1,3]
+; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1],ymm3[2,3,4,5],ymm1[6,7]
; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3]
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm4 = ymm4[1,1,1,1]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3,4,5],ymm1[6,7]
-; CHECK-O0-NEXT: vmovaps %xmm3, %xmm4
-; CHECK-O0-NEXT: vmovaps %xmm7, %xmm3
-; CHECK-O0-NEXT: vpblendd {{.*#+}} xmm4 = xmm3[0,1],xmm4[2,3]
-; CHECK-O0-NEXT: # implicit-def: $ymm3
-; CHECK-O0-NEXT: vmovaps %xmm4, %xmm3
-; CHECK-O0-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,0,1,3]
-; CHECK-O0-NEXT: vpslldq {{.*#+}} ymm3 = zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23]
-; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3,4,5],ymm3[6,7]
+; CHECK-O0-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm4[2,3],ymm1[4,5,6,7]
+; CHECK-O0-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
; CHECK-O0-NEXT: movq %rbp, %rsp
; CHECK-O0-NEXT: popq %rbp
; CHECK-O0-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
index a0ea6ddeca7dfd..afdeebc45ed0ab 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll
@@ -519,19 +519,14 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-NEXT: vmovaps (%rdi), %ymm0
; AVX2-NEXT: vmovaps 32(%rdi), %ymm1
; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5]
-; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm3
-; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5]
+; AVX2-NEXT: vpermps %ymm3, %ymm4, %ymm3
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
+; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7]
+; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6]
; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
-; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6]
-; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-NEXT: vpermps %ymm2, %ymm4, %ymm4
-; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7]
-; AVX2-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,...
[truncated]
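As a standalone illustration of the fold (not part of the patch; the function names @src/@tgt and the particular masks are invented for this example, and the real combine operates on X86ISD::BLENDI target nodes in SimplifyDemandedVectorEltsForTargetNode rather than on IR shuffles), the before/after can be sketched at the IR level. Both permutes are single-source shuffles, the blend takes lanes 0-1 from PERMUTE(X) and lanes 2-3 from PERMUTE(Y), and no element index is demanded from both %x and %y:

define <4 x i32> @src(<4 x i32> %x, <4 x i32> %y) {
  ; PERMUTE(X) and PERMUTE(Y): unary shuffles of each source
  %px = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 2, i32 3>
  %py = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ; BLEND: lanes 0-1 from %px, lanes 2-3 from %py
  %blend = shufflevector <4 x i32> %px, <4 x i32> %py, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %blend
}

define <4 x i32> @tgt(<4 x i32> %x, <4 x i32> %y) {
  ; BLEND(X,Y) first: lanes 0-1 from %y, lanes 2-3 from %x
  %blend = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ; then a single merged unary permute
  %perm = shufflevector <4 x i32> %blend, <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x i32> %perm
}

Both functions return <x2, x3, y0, y1>; the rewrite replaces two permutes and a blend with one blend and one permute, and is only legal because the two permutes never demand the same element index.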
Is it due to FastISel falling back to SelectionDAG? I don't think it's a problem here, but I'm concerned about debuggability in general if we do such optimizations under -O0.
664da5e to c92980c
ping?
case X86ISD::BLENDI: {
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  if (VT.getScalarSizeInBits() < 32 || !N0.hasOneUse() || !N1.hasOneUse())
why not for something like blendvb?
Yes, I think eventually I'd like to get BLENDV/VSELECT to work with this as well. This first version is trying very hard not to introduce extra variable blend masks, hence why it's just for BLENDI, but with a bit of massaging I think we can get it beneficially working for BLENDV and predicate masks as well.
afaict there is nothing about the impl that precludes them.
if (Demanded0[I])
  NewPermuteMask[I] = ScaledMask0[I];
else if (Demanded1[I])
  NewPermuteMask[I] = ScaledMask1[I];
Need + NumElts here?
No - by blending first we've merged the 2 inputs into a single vector, so the new mask is now unary. The trick is that we never demand the same element from both sources we've blended together.
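As a concrete trace of that (using the same hypothetical v4i32 example as the IR sketch above): if the blend takes lanes 0-1 from PERMUTE(X) with mask [2,3,2,3] and lanes 2-3 from PERMUTE(Y) with mask [0,1,0,1], then DemandedLHS0 = {2,3} (elements of X) and DemandedLHS1 = {0,1} (elements of Y), which don't intersect. NewBlendMask = DemandedLHS1 = 0b0011, so the new BLENDI produces <y0, y1, x2, x3>, and NewPermuteMask is built lane by lane from the two scaled permute masks as [2,3,0,1], which reshuffles that single vector back into the original result <x2, x3, y0, y1>.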
v16i16 is awkward as both the new blend and permute can be higher in complexity than the original blend(permute,permute) trio.
@goldsteinn I've pulled the combine out into its own combineBlendOfPermutes(). I've managed to get vXi16 VPBLENDW support working, with a few restrictions to avoid infinite combine loops due to creating more shuffles than we started with... It should be a lot easier to add BLENDV/VSELECT support in a later PR now.
✅ With the latest revision this PR passed the C/C++ code formatter.
LGTM. Wait for 1 more.
LGTM.
LGTM