Commit 3521ecf
[X86] Add vector support to targetShrinkDemandedConstant for OR/XOR opcodes

If a constant is only allsignbits in the demanded/active bits, then sign extend it to an allsignbits bool pattern for OR/XOR ops.

This also requires SimplifyDemandedBits XOR handling to be modified to call ShrinkDemandedConstant on any (non-NOT) XOR pattern, to account for non-splat cases.

Next step towards fixing PR45808 - with this patch we now get a <-1,-1,0,0> v4i64 constant instead of <1,1,0,0>.

Differential Revision: https://reviews.llvm.org/D82257

1 parent 7f3d9cc commit 3521ecf
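To make the headline change concrete, here is a minimal standalone sketch of the per-element effect; it is not code from the patch and assumes only LLVM's APInt from llvm/ADT/APInt.h. When just the low "bool" bit of each lane is demanded, truncating to the active bits and sign extending back turns the v4i64 constant <1,1,0,0> into <-1,-1,0,0>.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

using namespace llvm;

int main() {
  const unsigned EltBits = 64;   // v4i64 element width
  const unsigned ActiveBits = 1; // only the boolean bit is demanded
  uint64_t Elts[4] = {1, 1, 0, 0};
  for (uint64_t E : Elts) {
    APInt Val(EltBits, E);
    // Keep the demanded low bits, then sign extend across the element.
    APInt Ext = Val.trunc(ActiveBits).sext(EltBits);
    outs() << Val << " -> " << Ext.getSExtValue() << "\n"; // 1 -> -1, 0 -> 0
  }
  return 0;
}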

File tree

4 files changed: +69 -38 lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp (12 additions & 10 deletions)

@@ -1330,7 +1330,8 @@ bool TargetLowering::SimplifyDemandedBits(
     if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
       return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1));
 
-    if (ConstantSDNode *C = isConstOrConstSplat(Op1)) {
+    ConstantSDNode* C = isConstOrConstSplat(Op1, DemandedElts);
+    if (C) {
       // If one side is a constant, and all of the known set bits on the other
       // side are also set in the constant, turn this into an AND, as we know
       // the bits will be cleared.
@@ -1345,18 +1346,19 @@ bool TargetLowering::SimplifyDemandedBits(
       // If the RHS is a constant, see if we can change it. Don't alter a -1
       // constant because that's a 'not' op, and that is better for combining
       // and codegen.
-      if (!C->isAllOnesValue()) {
-        if (DemandedBits.isSubsetOf(C->getAPIntValue())) {
-          // We're flipping all demanded bits. Flip the undemanded bits too.
-          SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
-          return TLO.CombineTo(Op, New);
-        }
-        // If we can't turn this into a 'not', try to shrink the constant.
-        if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
-          return true;
+      if (!C->isAllOnesValue() &&
+          DemandedBits.isSubsetOf(C->getAPIntValue())) {
+        // We're flipping all demanded bits. Flip the undemanded bits too.
+        SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
+        return TLO.CombineTo(Op, New);
       }
     }
 
+    // If we can't turn this into a 'not', try to shrink the constant.
+    if (!C || !C->isAllOnesValue())
+      if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
+        return true;
+
     Known ^= Known2;
     break;
   }
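As a quick illustration of the two paths the reordered hunk chooses between, here is a sketch using LLVM's APInt (again, not code from the patch): if every demanded bit is set in the XOR constant, the op flips all demanded bits and can become a 'not'; otherwise ShrinkDemandedConstant gets a chance to simplify the constant, which now also covers non-splat vector constants.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // Demand only the low nibble of an i8 value.
  APInt DemandedBits = APInt::getLowBitsSet(8, 4); // 0b00001111
  APInt C1(8, 0x0F), C2(8, 0x05);
  // All demanded bits are set in C1: xor x, 0x0F flips the whole demanded
  // region, so the node can be rewritten as a 'not' of x.
  outs() << (DemandedBits.isSubsetOf(C1) ? "not\n" : "shrink\n");   // not
  // Only some demanded bits are set in C2: fall through to shrinking.
  outs() << (DemandedBits.isSubsetOf(C2) ? "not\n" : "shrink\n");   // shrink
  return 0;
}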

llvm/lib/Target/X86/X86ISelLowering.cpp (41 additions & 10 deletions)

@@ -33221,18 +33221,49 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
                                                 const APInt &DemandedBits,
                                                 const APInt &DemandedElts,
                                                 TargetLoweringOpt &TLO) const {
-  // Only optimize Ands to prevent shrinking a constant that could be
-  // matched by movzx.
-  if (Op.getOpcode() != ISD::AND)
-    return false;
-
   EVT VT = Op.getValueType();
+  unsigned Opcode = Op.getOpcode();
+  unsigned EltSize = VT.getScalarSizeInBits();
 
-  // Ignore vectors.
-  if (VT.isVector())
+  if (VT.isVector()) {
+    // If the constant is only all signbits in the active bits, then we should
+    // extend it to the entire constant to allow it act as a boolean constant
+    // vector.
+    auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
+      if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+        return false;
+      for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
+        if (!DemandedElts[i] || V.getOperand(i).isUndef())
+          continue;
+        const APInt &Val = V.getConstantOperandAPInt(i);
+        if (Val.getBitWidth() > Val.getNumSignBits() &&
+            Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
+          return true;
+      }
+      return false;
+    };
+    // For vectors - if we have a constant, then try to sign extend.
+    // TODO: Handle AND/ANDN cases.
+    unsigned ActiveBits = DemandedBits.getActiveBits();
+    if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
+        (Opcode == ISD::OR || Opcode == ISD::XOR) &&
+        NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
+      EVT BoolVT = EVT::getVectorVT(*TLO.DAG.getContext(), MVT::i1,
+                                    VT.getVectorNumElements());
+      SDValue NewC =
+          TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
+                          Op.getOperand(1), TLO.DAG.getValueType(BoolVT));
+      SDValue NewOp =
+          TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
+      return TLO.CombineTo(Op, NewOp);
+    }
     return false;
+  }
 
-  unsigned Size = VT.getSizeInBits();
+  // Only optimize Ands to prevent shrinking a constant that could be
+  // matched by movzx.
+  if (Opcode != ISD::AND)
+    return false;
 
   // Make sure the RHS really is a constant.
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
@@ -33254,10 +33285,10 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
   // Find the next power of 2 width, rounding up to a byte.
   Width = PowerOf2Ceil(std::max(Width, 8U));
   // Truncate the width to size to handle illegal types.
-  Width = std::min(Width, Size);
+  Width = std::min(Width, EltSize);
 
   // Calculate a possible zero extend mask for this constant.
-  APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+  APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
 
   // If we aren't changing the mask, just return true to keep it and prevent
   // the caller from optimizing.
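The core of the new vector path is the NeedsSignExtension predicate. Below is a standalone restatement of just its per-element test using APInt (it mirrors, but is not, the patch code, and assumes LLVM headers are available): an element qualifies when it is all sign bits within the demanded active region but not across its full width, so sign extending it actually changes the constant.

#include "llvm/ADT/APInt.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Per-element test mirrored from the lambda in the hunk above: true if Val
// is all sign bits in the low ActiveBits but not across its full width.
static bool needsSignExtension(const APInt &Val, unsigned ActiveBits) {
  return Val.getBitWidth() > Val.getNumSignBits() &&
         Val.trunc(ActiveBits).getNumSignBits() == ActiveBits;
}

int main() {
  unsigned ActiveBits = 1; // only the boolean bit of each i64 lane demanded
  // 1 qualifies: its single active bit is a "sign bit" in the active region,
  // but the full 64-bit value is not all sign bits, so extend it to -1.
  outs() << (needsSignExtension(APInt(64, 1), ActiveBits) ? "yes" : "no") << "\n";
  // 0 and -1 are already all sign bits across the full width: no change.
  outs() << (needsSignExtension(APInt(64, 0), ActiveBits) ? "yes" : "no") << "\n";
  outs() << (needsSignExtension(APInt::getAllOnesValue(64), ActiveBits) ? "yes" : "no") << "\n";
  return 0;
}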

llvm/test/CodeGen/X86/promote-cmp.ll (16 additions & 16 deletions)

@@ -30,19 +30,20 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3]
 ; SSE2-NEXT:    por %xmm4, %xmm5
 ; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE2-NEXT:    movaps {{.*#+}} xmm4 = <1,1,u,u>
-; SSE2-NEXT:    xorps %xmm5, %xmm4
-; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[2,1,3,3]
+; SSE2-NEXT:    pcmpeqd %xmm4, %xmm4
+; SSE2-NEXT:    movaps %xmm5, %xmm6
+; SSE2-NEXT:    shufps {{.*#+}} xmm6 = xmm6[2,1],xmm5[3,3]
+; SSE2-NEXT:    psllq $63, %xmm6
+; SSE2-NEXT:    psrad $31, %xmm6
+; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; SSE2-NEXT:    pand %xmm6, %xmm1
+; SSE2-NEXT:    pandn %xmm3, %xmm6
+; SSE2-NEXT:    por %xmm6, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,1,1,3]
+; SSE2-NEXT:    xorps %xmm4, %xmm5
 ; SSE2-NEXT:    psllq $63, %xmm5
 ; SSE2-NEXT:    psrad $31, %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
-; SSE2-NEXT:    pand %xmm5, %xmm1
-; SSE2-NEXT:    pandn %xmm3, %xmm5
-; SSE2-NEXT:    por %xmm5, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,1,1,3]
-; SSE2-NEXT:    psllq $63, %xmm3
-; SSE2-NEXT:    psrad $31, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
 ; SSE2-NEXT:    pand %xmm3, %xmm0
 ; SSE2-NEXT:    pandn %xmm2, %xmm3
 ; SSE2-NEXT:    por %xmm3, %xmm0
@@ -56,10 +57,11 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
 ; SSE4-NEXT:    movdqa %xmm4, %xmm5
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm5
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
-; SSE4-NEXT:    pxor {{.*}}(%rip), %xmm5
+; SSE4-NEXT:    pcmpeqd %xmm6, %xmm6
+; SSE4-NEXT:    pxor %xmm5, %xmm6
 ; SSE4-NEXT:    psllq $63, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
-; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero
+; SSE4-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero
 ; SSE4-NEXT:    psllq $63, %xmm0
 ; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
@@ -72,17 +74,15 @@ define <4 x i64> @PR45808(<4 x i64> %0, <4 x i64> %1) {
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
-; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vxorpd {{.*}}(%rip), %ymm2, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR45808:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm2
 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %ymm2, %ymm2
-; AVX2-NEXT:    vpsllq $63, %ymm2, %ymm2
 ; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %3 = icmp sgt <4 x i64> %0, %1

llvm/test/CodeGen/X86/setcc-lowering.ll (0 additions & 2 deletions)

@@ -16,8 +16,6 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
