
Commit fc170d8

[CodeGenPrepare] Sink and duplicate more 'and' instructions.
Summary:
Rework the code that was sinking/duplicating (icmp and, 0) sequences into blocks where they were being used by conditional branches to form more tbz instructions on AArch64. The new code is more general in that it just looks for 'and's that have all icmp 0's as users, with a target hook used to select which subset of 'and' instructions to consider. This change also enables 'and' sinking for X86, where it is more widely beneficial than on AArch64.

The 'and' sinking/duplicating code is moved into the optimizeInst phase of CodeGenPrepare, where it can take advantage of the fact that OptimizeCmpExpression has already sunk/duplicated any icmps into the blocks where they are used. One minor complication from this change is that optimizeLoadExt needed to be updated to always mark 'and's it has determined should be in the same block as their feeding load in the InsertedInsts set, to avoid an infinite loop of hoisting and sinking the same 'and'.

This change fixes a regression on X86 in the tsan runtime caused by moving GVNHoist to a later place in the optimization pipeline (see PR31382).

Reviewers: t.p.northover, qcolombet, MatzeB

Subscribers: aemerson, mcrosier, sebpop, llvm-commits

Differential Revision: https://reviews.llvm.org/D28813

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295746 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent: 8d22e6f
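To make the transformation concrete, here is a minimal before/after sketch in LLVM IR (hand-written for this summary; the function and value names are invented and do not come from the patch or its tests). Before CodeGenPrepare runs, a single 'and' feeds icmp-with-zero users in two other blocks:

    define i32 @example(i32 %x, i1 %p) {
    entry:
      %a = and i32 %x, 16              ; all users of %a are (icmp ?, 0)
      br i1 %p, label %bb1, label %bb2

    bb1:
      %c1 = icmp eq i32 %a, 0
      br i1 %c1, label %yes, label %no

    bb2:
      %c2 = icmp ne i32 %a, 0
      br i1 %c2, label %yes, label %no

    yes:
      ret i32 1

    no:
      ret i32 0
    }

Assuming the target hook approves the mask, sinkAndCmp0Expression duplicates the 'and' next to each compare and erases the original, so every block carries a complete and/icmp/br chain for instruction selection to fold:

    bb1:
      %a1 = and i32 %x, 16
      %c1 = icmp eq i32 %a1, 0
      br i1 %c1, label %yes, label %no

    bb2:
      %a2 = and i32 %x, 16
      %c2 = icmp ne i32 %a2, 0
      br i1 %c2, label %yes, label %no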

File tree: 10 files changed, +403 −91 lines

include/llvm/Target/TargetLowering.h

Lines changed: 4 additions & 9 deletions
@@ -395,16 +395,15 @@ class TargetLoweringBase {
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
-  ///   %andResult = and %val1, #imm-with-one-bit-set;
+  ///   %andResult = and %val1, #mask
   ///   %icmpResult = icmp %andResult, 0
-  ///   br i1 %icmpResult, label %dest1, label %dest2
   /// \endcode
   /// into a single machine instruction of a form like:
   /// \code
-  ///   brOnBitSet %register, #bitNumber, dest
+  ///   cc = test %register, #mask
   /// \endcode
-  bool isMaskAndBranchFoldingLegal() const {
-    return MaskAndBranchFoldingIsLegal;
+  virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
+    return false;
   }
 
   /// Return true if the target should transform:
@@ -2202,10 +2201,6 @@ class TargetLoweringBase {
   /// the branch is usually predicted right.
   bool PredictableSelectIsExpensive;
 
-  /// MaskAndBranchFoldingIsLegal - Indicates if the target supports folding
-  /// a mask of a single bit, a compare, and a branch into a single instruction.
-  bool MaskAndBranchFoldingIsLegal;
-
   /// \see enableExtLdPromotion.
   bool EnableExtLdPromotion;
 

lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 89 additions & 78 deletions
@@ -77,7 +77,6 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
 STATISTIC(NumRetsDup, "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
 static cl::opt<bool> DisableBranchOpts(
@@ -217,7 +216,6 @@ class TypePromotionTransaction;
   bool optimizeExtractElementInst(Instruction *Inst);
   bool dupRetToEnableTailCallOpts(BasicBlock *BB);
   bool placeDbgValues(Function &F);
-  bool sinkAndCmp(Function &F);
   bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
                       Instruction *&Inst,
                       const SmallVectorImpl<Instruction *> &Exts,
@@ -295,14 +293,8 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   // find a node corresponding to the value.
   EverMadeChange |= placeDbgValues(F);
 
-  // If there is a mask, compare against zero, and branch that can be combined
-  // into a single target instruction, push the mask and compare into branch
-  // users. Do this before OptimizeBlock -> OptimizeInst ->
-  // OptimizeCmpExpression, which perturbs the pattern being searched for.
-  if (!DisableBranchOpts) {
-    EverMadeChange |= sinkAndCmp(F);
+  if (!DisableBranchOpts)
     EverMadeChange |= splitBranchCondition(F);
-  }
 
   bool MadeChange = true;
   while (MadeChange) {
@@ -1095,6 +1087,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
   return false;
 }
 
+/// Duplicate and sink the given 'and' instruction into user blocks where it is
+/// used in a compare to allow isel to generate better code for targets where
+/// this operation can be combined.
+///
+/// Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI,
+                                  const TargetLowering &TLI,
+                                  SetOfInstrs &InsertedInsts) {
+  // Double-check that we're not trying to optimize an instruction that was
+  // already optimized by some other part of this pass.
+  assert(!InsertedInsts.count(AndI) &&
+         "Attempting to optimize already optimized and instruction");
+  (void) InsertedInsts;
+
+  // Nothing to do for single use in same basic block.
+  if (AndI->hasOneUse() &&
+      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+    return false;
+
+  // Try to avoid cases where sinking/duplicating is likely to increase register
+  // pressure.
+  if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+      !isa<ConstantInt>(AndI->getOperand(1)) &&
+      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+    return false;
+
+  for (auto *U : AndI->users()) {
+    Instruction *User = cast<Instruction>(U);
+
+    // Only sink for and mask feeding icmp with 0.
+    if (!isa<ICmpInst>(User))
+      return false;
+
+    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+    if (!CmpC || !CmpC->isZero())
+      return false;
+  }
+
+  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+    return false;
+
+  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+  DEBUG(AndI->getParent()->dump());
+
+  // Push the 'and' into the same block as the icmp 0. There should only be
+  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+  // others, so we don't need to keep track of which BBs we insert into.
+  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+    // Keep the 'and' in the same place if the use is already in the same block.
+    Instruction *InsertPt =
+        User->getParent() == AndI->getParent() ? AndI : User;
+    Instruction *InsertedAnd =
+        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+                               AndI->getOperand(1), "", InsertPt);
+    // Propagate the debug info.
+    InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+    // Replace a use of the 'and' with a use of the new 'and'.
+    TheUse = InsertedAnd;
+    ++NumAndUses;
+    DEBUG(User->getParent()->dump());
+  }
+
+  // We removed all uses, nuke the and.
+  AndI->eraseFromParent();
+  return true;
+}
+
 /// Check if the candidates could be combined with a shift instruction, which
 /// includes:
 /// 1. Truncate instruction
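As an illustration of the register-pressure bail-out above (again a hand-written sketch with invented names): neither operand of the 'and' below is a ConstantInt and each has exactly one use, so duplicating the 'and' into both successor blocks would extend the live ranges of %x and %y across the branch; the pass therefore leaves this shape alone.

    define i1 @no_sink(i32 %x, i32 %y, i1 %p) {
    entry:
      %a = and i32 %x, %y              ; no constant operand, both single-use
      br i1 %p, label %bb1, label %bb2

    bb1:
      %c1 = icmp eq i32 %a, 0
      ret i1 %c1

    bb2:
      %c2 = icmp ne i32 %a, 0
      ret i1 %c2
    }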
@@ -4544,13 +4613,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
       !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
     return false;
 
-  // Skip loads we've already transformed or have no reason to transform.
-  if (Load->hasOneUse()) {
-    User *LoadUser = *Load->user_begin();
-    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
-        !dyn_cast<PHINode>(LoadUser))
-      return false;
-  }
+  // Skip loads we've already transformed.
+  if (Load->hasOneUse() &&
+      InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+    return false;
 
   // Look at all uses of Load, looking through phis, to determine how many bits
   // of the loaded value are needed.
@@ -4646,6 +4712,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
   IRBuilder<> Builder(Load->getNextNode());
   auto *NewAnd = dyn_cast<Instruction>(
       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+  // Mark this instruction as "inserted by CGP", so that other
+  // optimizations don't touch it.
+  InsertedInsts.insert(NewAnd);
 
   // Replace all uses of load with new and (except for the use of load in the
   // new and itself).
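The infinite loop mentioned in the commit message can be pictured on a shape roughly like this (hand-written sketch; not one of the patch's tests). optimizeLoadExt wants the demanded-bits 'and' it builds to sit next to the load, while sinkAndCmp0Expression wants an 'and' whose only user is an icmp-with-zero to sit next to that compare; without the InsertedInsts marking, each pass would keep undoing the other's placement on every CodeGenPrepare iteration.

    define i1 @pingpong(i32* %p, i1 %q) {
    entry:
      %v = load i32, i32* %p           ; optimizeLoadExt inserts (and %v, 255) here
      br i1 %q, label %test, label %other

    test:
      %m = and i32 %v, 255             ; only the low byte is demanded...
      %c = icmp eq i32 %m, 0           ; ...but sinkAndCmp0Expression wants the
      ret i1 %c                        ; 'and' down here, next to the icmp

    other:
      ret i1 false
    }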
@@ -5560,6 +5629,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
 
   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
 
+  if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
+      EnableAndCmpSinking && TLI)
+    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
+
   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                 BinOp->getOpcode() == Instruction::LShr)) {
     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
@@ -5689,68 +5762,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
   return MadeChange;
 }
 
-// If there is a sequence that branches based on comparing a single bit
-// against zero that can be combined into a single instruction, and the
-// target supports folding these into a single instruction, sink the
-// mask and compare into the branch uses. Do this before OptimizeBlock ->
-// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
-// searched for.
-bool CodeGenPrepare::sinkAndCmp(Function &F) {
-  if (!EnableAndCmpSinking)
-    return false;
-  if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
-    return false;
-  bool MadeChange = false;
-  for (BasicBlock &BB : F) {
-    // Does this BB end with the following?
-    //   %andVal = and %val, #single-bit-set
-    //   %icmpVal = icmp %andVal, 0
-    //   br i1 %icmpVal, label %dest1, label %dest2
-    BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
-    if (!Brcc || !Brcc->isConditional())
-      continue;
-    ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
-    if (!Cmp || Cmp->getParent() != &BB)
-      continue;
-    ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
-    if (!Zero || !Zero->isZero())
-      continue;
-    Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
-    if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
-      continue;
-    ConstantInt *Mask = dyn_cast<ConstantInt>(And->getOperand(1));
-    if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
-      continue;
-    DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
-
-    // Push the "and; icmp" for any users that are conditional branches.
-    // Since there can only be one branch use per BB, we don't need to keep
-    // track of which BBs we insert into.
-    for (Use &TheUse : Cmp->uses()) {
-      // Find brcc use.
-      BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
-      if (!BrccUser || !BrccUser->isConditional())
-        continue;
-      BasicBlock *UserBB = BrccUser->getParent();
-      if (UserBB == &BB) continue;
-      DEBUG(dbgs() << "found Brcc use\n");
-
-      // Sink the "and; icmp" to the use.
-      MadeChange = true;
-      BinaryOperator *NewAnd =
-        BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
-                                  BrccUser);
-      CmpInst *NewCmp =
-        CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
-                        "", BrccUser);
-      TheUse = NewCmp;
-      ++NumAndCmpsMoved;
-      DEBUG(BrccUser->getParent()->dump());
-    }
-  }
-  return MadeChange;
-}
-
 /// \brief Scale down both weights to fit into uint32_t.
 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;

lib/CodeGen/TargetLoweringBase.cpp

Lines changed: 0 additions & 1 deletion
@@ -838,7 +838,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
-  MaskAndBranchFoldingIsLegal = false;
   EnableExtLdPromotion = false;
   HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;

lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 13 additions & 2 deletions
@@ -555,8 +555,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setSchedulingPreference(Sched::Hybrid);
 
-  // Enable TBZ/TBNZ
-  MaskAndBranchFoldingIsLegal = true;
   EnableExtLdPromotion = true;
 
   // Set required alignment.
@@ -10644,6 +10642,19 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
 }
 
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  // Only sink the 'and' mask to the cmp's block if it is masking a single bit,
+  // since that likely allows the and/cmp/br to be folded into a single tbz
+  // instruction. It may be beneficial to sink in other cases, but we would
+  // have to check that the cmp would not get folded into the br to form a cbz
+  // for these to be beneficial.
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask)
+    return false;
+  return Mask->getUniqueInteger().isPowerOf2();
+}
+
 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
   // Update IsSplitCSR in AArch64FunctionInfo.
   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
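For example (hand-written sketch, not from the patch): the mask below is a power of two, so the hook returns true; once the 'and' has been sunk next to the compare, AArch64 instruction selection can fold the whole and/icmp/br chain into a single test-bit-and-branch instruction, something like 'tbz w0, #4, .LBB0_1'.

    define void @bit4(i32 %x) {
    entry:
      %bit = and i32 %x, 16            ; 16 == 1 << 4, a single-bit mask
      %c = icmp eq i32 %bit, 0
      br i1 %c, label %clear, label %set

    set:
      ret void

    clear:
      ret void
    }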

lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
@@ -412,6 +412,8 @@ class AArch64TargetLowering : public TargetLowering {
     return true;
   }
 
+  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
   bool hasAndNotCompare(SDValue) const override {
     // 'bics'
     return true;

lib/Target/X86/X86ISelLowering.cpp

Lines changed: 5 additions & 0 deletions
@@ -4455,6 +4455,11 @@ bool X86TargetLowering::isCtlzFast() const {
   return Subtarget.hasFastLZCNT();
 }
 
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  return true;
+}
+
 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   if (!Subtarget.hasBMI())
     return false;
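On X86 the hook simply returns true: an 'and' and an icmp-with-zero that end up in the same block typically lower to a single 'test' instruction (followed by a setcc or a conditional jump), and that holds for arbitrary masks, not just single-bit ones. A hand-written sketch, with an intentionally multi-bit mask:

    define i1 @anymask(i32 %x) {
    entry:
      %m = and i32 %x, 768             ; multi-bit mask: expect 'testl $768, %edi'
      %c = icmp eq i32 %m, 0           ; ...plus a 'sete' for the return value
      ret i1 %c
    }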

lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
@@ -808,6 +808,8 @@ namespace llvm {
       return false;
     }
 
+    bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
     bool hasAndNotCompare(SDValue Y) const override;
 
     /// Return the value type to use for ISD::SETCC.
