Commit f4d7f0d

[X86] combineBlendOfPermutes - add vXi16 (V)PBLENDW handling
v16i16 is awkward, as both the new blend and the new permute can end up more complex than the original blend(permute,permute) trio.
1 parent: c79a150

7 files changed (+2572, -2636 lines)
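To make the commit message concrete: combineBlendOfPermutes rewrites blend(permute(A), permute(B)) into permute(blend(A, B)) by composing the shuffle masks, so for vXi16 the rewrite is only profitable when the composed masks stay cheap. Below is a minimal standalone sketch of that mask composition under hypothetical names (composeBlendOfPermutes is illustrative, not the LLVM implementation); -1 marks an undef lane, and composition fails when two output lanes would need different sources at the same blend position.

#include <vector>

// Sketch only: build NewPermuteMask/NewBlendMask so that
//   result[I] == blend(A, B)[NewPermuteMask[I]]
// where blend(A, B)[J] picks B[J] when NewBlendMask[J] == 1, else A[J].
static bool composeBlendOfPermutes(const std::vector<int> &Perm0,
                                   const std::vector<int> &Perm1,
                                   const std::vector<bool> &BlendFromOp1,
                                   std::vector<int> &NewPermuteMask,
                                   std::vector<int> &NewBlendMask) {
  size_t NumElts = Perm0.size();
  NewPermuteMask.assign(NumElts, -1);
  NewBlendMask.assign(NumElts, -1); // -1 = blend lane not yet claimed
  for (size_t I = 0; I != NumElts; ++I) {
    // Which source element does output lane I read after the original trio?
    int SrcIdx = BlendFromOp1[I] ? Perm1[I] : Perm0[I];
    if (SrcIdx < 0)
      continue; // undef lane, nothing to place
    int Want = BlendFromOp1[I] ? 1 : 0;
    if (NewBlendMask[SrcIdx] != -1 && NewBlendMask[SrcIdx] != Want)
      return false; // A and B would both need to occupy blend lane SrcIdx
    NewBlendMask[SrcIdx] = Want;
    NewPermuteMask[I] = SrcIdx;
  }
  return true;
}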

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 18 additions & 8 deletions
@@ -40094,8 +40094,20 @@ static SDValue combineBlendOfPermutes(MVT VT, SDValue N0, SDValue N1,
       }
     }
   }
+  assert(isBlendOrUndef(NewBlendMask) && "Bad blend");
   assert(isUndefOrInRange(NewPermuteMask, 0, NumElts) && "Bad permute");
 
+  // v16i16 shuffles can explode in complexity very easily, only accept them if
+  // the blend mask is the same in the 128-bit subvectors (or can widen to
+  // v8i32) and the permute can be widened as well.
+  if (VT == MVT::v16i16) {
+    if (!is128BitLaneRepeatedShuffleMask(VT, NewBlendMask) &&
+        !canWidenShuffleElements(NewBlendMask))
+      return SDValue();
+    if (!canWidenShuffleElements(NewPermuteMask))
+      return SDValue();
+  }
+
   SDValue NewBlend =
       DAG.getVectorShuffle(VT, DL, DAG.getBitcast(VT, Ops0[0]),
                            DAG.getBitcast(VT, Ops1[0]), NewBlendMask);
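The guard above leans on whether a mask can be widened to half as many i32 lanes. As a reference for what that test means, here is a standalone sketch of the pair-widening rule under the same -1-is-undef convention (canWidenMask is a hypothetical name, not the canWidenShuffleElements implementation, which also handles zero sentinels):

#include <vector>

// Sketch only: a v16i16 mask can be retargeted to v8i32 iff every adjacent
// pair of lanes reads an aligned (even, even + 1) pair from the source,
// treating undef lanes as wildcards.
static bool canWidenMask(const std::vector<int> &Mask) {
  for (size_t I = 0; I + 1 < Mask.size(); I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 < 0 && M1 < 0)
      continue; // fully undef pair widens to an undef wide lane
    if (M0 >= 0 && M1 >= 0 && (M0 % 2) == 0 && M1 == M0 + 1)
      continue; // aligned pair, e.g. lanes (4,5) become wide lane 2
    if (M0 < 0 && (M1 % 2) == 1)
      continue; // low half undef, high half correctly odd
    if (M1 < 0 && (M0 % 2) == 0)
      continue; // high half undef, low half correctly even
    return false;
  }
  return true;
}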
@@ -41856,14 +41868,12 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     break;
   }
   case X86ISD::BLENDI: {
-    if (VT.getScalarSizeInBits() >= 32) {
-      SmallVector<int, 16> BlendMask;
-      DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
-      if (SDValue R = combineBlendOfPermutes(VT.getSimpleVT(), Op.getOperand(0),
-                                             Op.getOperand(1), BlendMask,
-                                             DemandedElts, TLO.DAG, SDLoc(Op)))
-        return TLO.CombineTo(Op, R);
-    }
+    SmallVector<int, 16> BlendMask;
+    DecodeBLENDMask(NumElts, Op.getConstantOperandVal(2), BlendMask);
+    if (SDValue R = combineBlendOfPermutes(VT.getSimpleVT(), Op.getOperand(0),
+                                           Op.getOperand(1), BlendMask,
+                                           DemandedElts, TLO.DAG, SDLoc(Op)))
+      return TLO.CombineTo(Op, R);
     break;
   }
   case X86ISD::BLENDV: {
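With the scalar-size guard removed, v8i16/v16i16 BLENDI nodes now reach combineBlendOfPermutes too. The DecodeBLENDMask step expands the blend immediate into a shuffle mask; as a reference, here is a standalone sketch of those semantics (decodeBlendImm is a hypothetical name): bit I of the imm8 selects operand 1 for element I, and the byte repeats for vectors with more than 8 elements, matching how v16i16 VPBLENDW applies its immediate per 128-bit lane.

#include <cstdint>
#include <vector>

// Sketch only: expand a BLENDI-style 8-bit immediate into a shuffle mask
// over the concatenation of both operands (0..N-1 = op0, N..2N-1 = op1).
static std::vector<int> decodeBlendImm(unsigned NumElts, uint8_t Imm) {
  std::vector<int> Mask;
  Mask.reserve(NumElts);
  for (unsigned I = 0; I != NumElts; ++I)
    Mask.push_back(((Imm >> (I % 8)) & 1) ? int(NumElts + I) : int(I));
  return Mask;
}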

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 25 additions & 35 deletions
@@ -975,18 +975,15 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3],xmm1[4],xmm4[5,6],xmm1[7]
 ; SSE42-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
-; SSE42-NEXT:    movdqa %xmm2, %xmm3
-; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
-; SSE42-NEXT:    movdqa %xmm0, %xmm5
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
+; SSE42-NEXT:    movdqa %xmm0, %xmm3
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1,2],xmm2[3],xmm3[4,5],xmm2[6],xmm3[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
 ; SSE42-NEXT:    movdqu %xmm4, (%rsi)
-; SSE42-NEXT:    movdqu %xmm5, (%rdx)
+; SSE42-NEXT:    movdqu %xmm3, (%rdx)
 ; SSE42-NEXT:    movdqu %xmm1, (%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -1000,14 +997,12 @@ define void @interleave_24i16_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind {
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3,4,5],xmm3[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[u,u,u,u,u,u,u,u,u,u,0,1,6,7,12,13]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,2,3,8,9,14,15]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11,0,1,6,7,12,13]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3],xmm2[4],xmm0[5,6],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15]
 ; AVX1-NEXT:    vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm4, (%rdx)
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rcx)
@@ -1125,18 +1120,15 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0],xmm2[1],xmm4[2,3],xmm2[4],xmm4[5,6],xmm2[7]
 ; SSE42-NEXT:    pshufb {{.*#+}} xmm4 = xmm4[14,15,8,9,2,3,12,13,6,7,0,1,u,u,u,u]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5],xmm3[6,7]
-; SSE42-NEXT:    movdqa %xmm0, %xmm3
-; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u,u,u,u,u,u,14,15,8,9,2,3]
-; SSE42-NEXT:    movdqa %xmm2, %xmm5
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3,4],xmm1[5],xmm5[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm5 = xmm5[12,13,6,7,0,1,10,11,4,5,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4],xmm3[5,6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,12,13,6,7,0,1]
+; SSE42-NEXT:    movdqa %xmm2, %xmm3
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4],xmm1[5],xmm3[6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0],xmm0[1],xmm3[2,3],xmm0[4],xmm3[5,6],xmm0[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm3 = xmm3[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7]
-; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[10,11,4,5,14,15,8,9,2,3,u,u,u,u,u,u]
-; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm0[5,6,7]
+; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[10,11,4,5,14,15,8,9,2,3,12,13,6,7,0,1]
 ; SSE42-NEXT:    movdqu %xmm4, (%rsi)
-; SSE42-NEXT:    movdqu %xmm5, (%rdx)
+; SSE42-NEXT:    movdqu %xmm3, (%rdx)
 ; SSE42-NEXT:    movdqu %xmm1, (%rcx)
 ; SSE42-NEXT:    retq
 ;
@@ -1145,14 +1137,12 @@ define void @interleave_24i16_out_reverse(ptr %p, ptr %q1, ptr %q2, ptr %q3) nou
 ; AVX1-NEXT:    vmovdqu (%rdi), %xmm0
 ; AVX1-NEXT:    vmovdqu 16(%rdi), %xmm1
 ; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm2
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm2[14,15,8,9,2,3,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,12,13,6,7,0,1,10,11,4,5]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3,4,5,6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm2[12,13,6,7,0,1,u,u,u,u,u,u,u,u,u,u]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u,u,u,10,11,4,5,14,15,8,9,2,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm5[3,4,5,6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3],xmm2[4],xmm3[5,6],xmm2[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[14,15,8,9,2,3,12,13,6,7,0,1,10,11,4,5]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm2[0],xmm4[1,2],xmm2[3],xmm4[4,5],xmm2[6],xmm4[7]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[12,13,6,7,0,1,10,11,4,5,14,15,8,9,2,3]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
