Skip to content

Commit 743d45e

Browse files
committed
[TargetLowering] Add SimplifyMultipleUseDemandedBits
This patch introduces the DAG version of SimplifyMultipleUseDemandedBits, which attempts to peek through ops (mainly and/or/xor so far) that don't contribute to the demanded bits/elts of a node - which means we can do this even in cases where we have multiple uses of an op, which normally requires us to demand all bits/elts. The intention is to remove a similar function - SelectionDAG::GetDemandedBits - once SimplifyMultipleUseDemandedBits has matured. The InstCombine version of SimplifyMultipleUseDemandedBits can constant fold, which I haven't added here yet, and so far I've only wired this up to some basic binops (and/or/xor/add/sub/mul) to demonstrate its use. We do see a couple of regressions that need to be addressed: (1) AMDGPU unsigned dot product codegen retains an AND mask (for ZERO_EXTEND) that it previously removed (but otherwise the dot product codegen is a lot better); (2) X86/AVX2 has poor handling of vector ANY_EXTEND/ANY_EXTEND_VECTOR_INREG - it prematurely gets converted to ZERO_EXTEND_VECTOR_INREG. The code owners have confirmed it's ok for these cases to be fixed up in future patches. Differential Revision: https://reviews.llvm.org/D63281 llvm-svn: 366799
1 parent 7c35db0 commit 743d45e

File tree

14 files changed

+1062
-1222
lines changed

14 files changed

+1062
-1222
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3065,6 +3065,14 @@ class TargetLowering : public TargetLoweringBase {
30653065
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
30663066
DAGCombinerInfo &DCI) const;
30673067

3068+
/// More limited version of SimplifyDemandedBits that can be used to "look
3069+
/// through" ops that don't contribute to the DemandedBits/DemandedElts -
3070+
/// bitwise ops etc.
3071+
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits,
3072+
const APInt &DemandedElts,
3073+
SelectionDAG &DAG,
3074+
unsigned Depth) const;
3075+
30683076
/// Look at Vector Op. At this point, we know that only the DemandedElts
30693077
/// elements of the result of Op are ever used downstream. If we can use
30703078
/// this information to simplify Op, create a new simplified DAG node and
@@ -3139,6 +3147,13 @@ class TargetLowering : public TargetLoweringBase {
31393147
TargetLoweringOpt &TLO,
31403148
unsigned Depth = 0) const;
31413149

3150+
/// More limited version of SimplifyDemandedBits that can be used to "look
3151+
/// through" ops that don't contribute to the DemandedBits/DemandedElts -
3152+
/// bitwise ops etc.
3153+
virtual SDValue SimplifyMultipleUseDemandedBitsForTargetNode(
3154+
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
3155+
SelectionDAG &DAG, unsigned Depth) const;
3156+
31423157
/// This method returns the constant pool value that will be loaded by LD.
31433158
/// NOTE: You must check for implicit extensions of the constant by LD.
31443159
virtual const Constant *getTargetConstantFromLoad(LoadSDNode *LD) const;

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,61 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
564564
AssumeSingleUse);
565565
}
566566

567+
// TODO: Can we merge SelectionDAG::GetDemandedBits into this?
568+
// TODO: Under what circumstances can we create nodes? BITCAST? Constant?
569+
SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
570+
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
571+
SelectionDAG &DAG, unsigned Depth) const {
572+
KnownBits LHSKnown, RHSKnown;
573+
switch (Op.getOpcode()) {
574+
case ISD::AND: {
575+
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
576+
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
577+
578+
// If all of the demanded bits are known 1 on one side, return the other.
579+
// These bits cannot contribute to the result of the 'and' in this
580+
// context.
581+
if (DemandedBits.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
582+
return Op.getOperand(0);
583+
if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
584+
return Op.getOperand(1);
585+
break;
586+
}
587+
case ISD::OR: {
588+
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
589+
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
590+
591+
// If all of the demanded bits are known zero on one side, return the
592+
// other. These bits cannot contribute to the result of the 'or' in this
593+
// context.
594+
if (DemandedBits.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
595+
return Op.getOperand(0);
596+
if (DemandedBits.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
597+
return Op.getOperand(1);
598+
break;
599+
}
600+
case ISD::XOR: {
601+
LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
602+
RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
603+
604+
// If all of the demanded bits are known zero on one side, return the
605+
// other.
606+
if (DemandedBits.isSubsetOf(RHSKnown.Zero))
607+
return Op.getOperand(0);
608+
if (DemandedBits.isSubsetOf(LHSKnown.Zero))
609+
return Op.getOperand(1);
610+
break;
611+
}
612+
default:
613+
if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
614+
if (SDValue V = SimplifyMultipleUseDemandedBitsForTargetNode(
615+
Op, DemandedBits, DemandedElts, DAG, Depth))
616+
return V;
617+
break;
618+
}
619+
return SDValue();
620+
}
621+
567622
/// Look at Op. At this point, we know that only the OriginalDemandedBits of the
568623
/// result of Op are ever used downstream. If we can use this information to
569624
/// simplify Op, create a new simplified DAG node and return true, returning the
@@ -834,6 +889,20 @@ bool TargetLowering::SimplifyDemandedBits(
834889
return true;
835890
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
836891

892+
// Attempt to avoid multi-use ops if we don't need anything from them.
893+
if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
894+
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
895+
Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
896+
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
897+
Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
898+
if (DemandedOp0 || DemandedOp1) {
899+
Op0 = DemandedOp0 ? DemandedOp0 : Op0;
900+
Op1 = DemandedOp1 ? DemandedOp1 : Op1;
901+
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
902+
return TLO.CombineTo(Op, NewOp);
903+
}
904+
}
905+
837906
// If all of the demanded bits are known one on one side, return the other.
838907
// These bits cannot contribute to the result of the 'and'.
839908
if (DemandedBits.isSubsetOf(Known2.Zero | Known.One))
@@ -869,6 +938,20 @@ bool TargetLowering::SimplifyDemandedBits(
869938
return true;
870939
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
871940

941+
// Attempt to avoid multi-use ops if we don't need anything from them.
942+
if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
943+
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
944+
Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
945+
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
946+
Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
947+
if (DemandedOp0 || DemandedOp1) {
948+
Op0 = DemandedOp0 ? DemandedOp0 : Op0;
949+
Op1 = DemandedOp1 ? DemandedOp1 : Op1;
950+
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
951+
return TLO.CombineTo(Op, NewOp);
952+
}
953+
}
954+
872955
// If all of the demanded bits are known zero on one side, return the other.
873956
// These bits cannot contribute to the result of the 'or'.
874957
if (DemandedBits.isSubsetOf(Known2.One | Known.Zero))
@@ -901,6 +984,20 @@ bool TargetLowering::SimplifyDemandedBits(
901984
return true;
902985
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
903986

987+
// Attempt to avoid multi-use ops if we don't need anything from them.
988+
if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
989+
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
990+
Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
991+
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
992+
Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
993+
if (DemandedOp0 || DemandedOp1) {
994+
Op0 = DemandedOp0 ? DemandedOp0 : Op0;
995+
Op1 = DemandedOp1 ? DemandedOp1 : Op1;
996+
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
997+
return TLO.CombineTo(Op, NewOp);
998+
}
999+
}
1000+
9041001
// If all of the demanded bits are known zero on one side, return the other.
9051002
// These bits cannot contribute to the result of the 'xor'.
9061003
if (DemandedBits.isSubsetOf(Known.Zero))
@@ -1663,6 +1760,7 @@ bool TargetLowering::SimplifyDemandedBits(
16631760
// Add, Sub, and Mul don't demand any bits in positions beyond that
16641761
// of the highest bit demanded of them.
16651762
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
1763+
SDNodeFlags Flags = Op.getNode()->getFlags();
16661764
unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros();
16671765
APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
16681766
if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO,
@@ -1671,7 +1769,6 @@ bool TargetLowering::SimplifyDemandedBits(
16711769
Depth + 1) ||
16721770
// See if the operation should be performed at a smaller bit width.
16731771
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
1674-
SDNodeFlags Flags = Op.getNode()->getFlags();
16751772
if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
16761773
// Disable the nsw and nuw flags. We can no longer guarantee that we
16771774
// won't wrap after simplification.
@@ -1684,6 +1781,23 @@ bool TargetLowering::SimplifyDemandedBits(
16841781
return true;
16851782
}
16861783

1784+
// Attempt to avoid multi-use ops if we don't need anything from them.
1785+
if (!LoMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
1786+
SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
1787+
Op0, LoMask, DemandedElts, TLO.DAG, Depth + 1);
1788+
SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
1789+
Op1, LoMask, DemandedElts, TLO.DAG, Depth + 1);
1790+
if (DemandedOp0 || DemandedOp1) {
1791+
Flags.setNoSignedWrap(false);
1792+
Flags.setNoUnsignedWrap(false);
1793+
Op0 = DemandedOp0 ? DemandedOp0 : Op0;
1794+
Op1 = DemandedOp1 ? DemandedOp1 : Op1;
1795+
SDValue NewOp =
1796+
TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags);
1797+
return TLO.CombineTo(Op, NewOp);
1798+
}
1799+
}
1800+
16871801
// If we have a constant operand, we may be able to turn it into -1 if we
16881802
// do not demand the high bits. This can make the constant smaller to
16891803
// encode, allow more general folding, or match specialized instruction
@@ -2357,6 +2471,19 @@ bool TargetLowering::SimplifyDemandedBitsForTargetNode(
23572471
return false;
23582472
}
23592473

2474+
SDValue TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
2475+
SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
2476+
SelectionDAG &DAG, unsigned Depth) const {
2477+
assert(
2478+
(Op.getOpcode() >= ISD::BUILTIN_OP_END ||
2479+
Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
2480+
Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
2481+
Op.getOpcode() == ISD::INTRINSIC_VOID) &&
2482+
"Should use SimplifyMultipleUseDemandedBits if you don't know whether Op"
2483+
" is a target node!");
2484+
return SDValue();
2485+
}
2486+
23602487
const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode*) const {
23612488
return nullptr;
23622489
}

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4308,6 +4308,7 @@ bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
43084308
resetSchedule();
43094309
initialFillReadyList(ReadyInsts);
43104310
}
4311+
assert(Bundle && "Failed to find schedule bundle");
43114312

43124313
LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
43134314
<< BB->getName() << "\n");

llvm/test/CodeGen/AArch64/bitfield-insert.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,7 @@ define void @test_32bit_opnd1_better(i32* %existing, i32* %new) {
265265
define i32 @test_nouseful_bits(i8 %a, i32 %b) {
266266
; CHECK-LABEL: test_nouseful_bits:
267267
; CHECK: // %bb.0:
268-
; CHECK-NEXT: mov w8, w0
269-
; CHECK-NEXT: bfi w8, w8, #8, #24
268+
; CHECK-NEXT: orr w8, w0, w8, lsl #8
270269
; CHECK-NEXT: mov w9, w0
271270
; CHECK-NEXT: bfi w9, w8, #8, #24
272271
; CHECK-NEXT: bfi w0, w9, #8, #24

llvm/test/CodeGen/AMDGPU/idot4s.ll

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -899,41 +899,28 @@ define amdgpu_kernel void @idot4_acc16_vecMul(<4 x i8> addrspace(1)* %src1,
899899
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
900900
; GFX7-NEXT: s_mov_b32 s3, 0xf000
901901
; GFX7-NEXT: s_mov_b32 s2, -1
902-
; GFX7-NEXT: s_mov_b32 s8, 0xffff
903902
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
904903
; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0
905904
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
906905
; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0
907906
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
908-
; GFX7-NEXT: s_sext_i32_i8 s6, s4
909-
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80008
910-
; GFX7-NEXT: s_sext_i32_i8 s10, s5
907+
; GFX7-NEXT: s_ashr_i32 s6, s4, 24
908+
; GFX7-NEXT: s_bfe_i32 s7, s4, 0x80010
909+
; GFX7-NEXT: s_bfe_i32 s10, s5, 0x80010
911910
; GFX7-NEXT: s_bfe_i32 s11, s5, 0x80008
912-
; GFX7-NEXT: s_bfe_i32 s12, s5, 0x80010
913-
; GFX7-NEXT: s_ashr_i32 s5, s5, 24
914-
; GFX7-NEXT: v_mov_b32_e32 v3, s11
915-
; GFX7-NEXT: v_mov_b32_e32 v4, s10
916-
; GFX7-NEXT: s_bfe_i32 s9, s4, 0x80010
917-
; GFX7-NEXT: v_mov_b32_e32 v2, s12
918-
; GFX7-NEXT: s_ashr_i32 s4, s4, 24
911+
; GFX7-NEXT: s_ashr_i32 s9, s5, 24
912+
; GFX7-NEXT: s_sext_i32_i8 s5, s5
913+
; GFX7-NEXT: s_bfe_i32 s8, s4, 0x80008
914+
; GFX7-NEXT: s_sext_i32_i8 s4, s4
919915
; GFX7-NEXT: v_mov_b32_e32 v1, s5
920-
; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1
921-
; GFX7-NEXT: v_mul_i32_i24_e32 v2, s9, v2
922-
; GFX7-NEXT: v_mul_i32_i24_e32 v3, s7, v3
923-
; GFX7-NEXT: v_mul_i32_i24_e32 v4, s6, v4
924-
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
925-
; GFX7-NEXT: v_and_b32_e32 v2, s8, v2
926-
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
927-
; GFX7-NEXT: v_and_b32_e32 v4, s8, v4
928-
; GFX7-NEXT: v_or_b32_e32 v1, v2, v1
929-
; GFX7-NEXT: v_or_b32_e32 v2, v4, v3
930-
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v2
931-
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1
916+
; GFX7-NEXT: v_mov_b32_e32 v2, s11
917+
; GFX7-NEXT: v_mov_b32_e32 v3, s10
932918
; GFX7-NEXT: s_waitcnt vmcnt(0)
933-
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2
934-
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0
935-
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
936-
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0
919+
; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0
920+
; GFX7-NEXT: v_mad_i32_i24 v0, s8, v2, v0
921+
; GFX7-NEXT: v_mad_i32_i24 v0, s7, v3, v0
922+
; GFX7-NEXT: v_mov_b32_e32 v1, s9
923+
; GFX7-NEXT: v_mad_i32_i24 v0, s6, v1, v0
937924
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
938925
; GFX7-NEXT: s_endpgm
939926
;

0 commit comments

Comments
 (0)