
Commit 1645f39

geoffberry authored and adrian-prantl committed
[CodeGenPrepare] Sink and duplicate more 'and' instructions.
Summary:
Rework the code that was sinking/duplicating (icmp and, 0) sequences into blocks where they were being used by conditional branches to form more tbz instructions on AArch64. The new code is more general in that it just looks for 'and's that have all icmp 0's as users, with a target hook used to select which subset of 'and' instructions to consider. This change also enables 'and' sinking for X86, where it is more widely beneficial than on AArch64.

The 'and' sinking/duplicating code is moved into the optimizeInst phase of CodeGenPrepare, where it can take advantage of the fact that OptimizeCmpExpression has already sunk/duplicated any icmps into the blocks where they are used. One minor complication from this change is that optimizeLoadExt needed to be updated to always mark, in the InsertedInsts set, any 'and' it has determined should stay in the same block as its feeding load, to avoid an infinite loop of hoisting and sinking the same 'and'.

This change fixes a regression on X86 in the tsan runtime caused by moving GVNHoist to a later place in the optimization pipeline (see PR31382).

Reviewers: t.p.northover, qcolombet, MatzeB

Subscribers: aemerson, mcrosier, sebpop, llvm-commits

Differential Revision: https://reviews.llvm.org/D28813

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295746 91177308-0d34-0410-b5e6-96231b3b80d8

(cherry picked from commit fc170d8)
1 parent c9ed111 commit 1645f39
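To make the transformation concrete, here is a minimal before/after sketch in LLVM IR. This is hypothetical code written for this note, not taken from the commit or its tests; the function, value, and label names are invented.

; Before: one 'and' whose only users are icmp-0 instructions that
; OptimizeCmpExpression has already sunk into the branching blocks.
define void @before(i64 %val, i1 %c) {
entry:
  %and = and i64 %val, 8                ; single-bit mask
  br i1 %c, label %bb1, label %bb2
bb1:
  %cmp1 = icmp eq i64 %and, 0
  br i1 %cmp1, label %exit, label %bb2
bb2:
  %cmp2 = icmp eq i64 %and, 0
  br i1 %cmp2, label %exit, label %side
side:
  ret void
exit:
  ret void
}

; After sinkAndCmp0Expression: the 'and' is duplicated next to each compare,
; so instruction selection sees the whole and/icmp-0/br chain in one block
; and can emit a single tbz per block on AArch64 (or fold it into test+jcc
; on X86).
define void @after(i64 %val, i1 %c) {
entry:
  br i1 %c, label %bb1, label %bb2
bb1:
  %and1 = and i64 %val, 8
  %cmp1 = icmp eq i64 %and1, 0
  br i1 %cmp1, label %exit, label %bb2
bb2:
  %and2 = and i64 %val, 8
  %cmp2 = icmp eq i64 %and2, 0
  br i1 %cmp2, label %exit, label %side
side:
  ret void
exit:
  ret void
}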

File tree

10 files changed, +403 −91 lines changed

include/llvm/Target/TargetLowering.h

Lines changed: 4 additions & 9 deletions
@@ -395,16 +395,15 @@ class TargetLoweringBase {
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
-  ///   %andResult = and %val1, #imm-with-one-bit-set;
+  ///   %andResult = and %val1, #mask
   ///   %icmpResult = icmp %andResult, 0
-  ///   br i1 %icmpResult, label %dest1, label %dest2
   /// \endcode
   /// into a single machine instruction of a form like:
   /// \code
-  ///   brOnBitSet %register, #bitNumber, dest
+  ///   cc = test %register, #mask
   /// \endcode
-  bool isMaskAndBranchFoldingLegal() const {
-    return MaskAndBranchFoldingIsLegal;
+  virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
+    return false;
   }

   /// Return true if the target should transform:
@@ -2203,10 +2202,6 @@ class TargetLoweringBase {
   /// the branch is usually predicted right.
   bool PredictableSelectIsExpensive;

-  /// MaskAndBranchFoldingIsLegal - Indicates if the target supports folding
-  /// a mask of a single bit, a compare, and a branch into a single instruction.
-  bool MaskAndBranchFoldingIsLegal;
-
   /// \see enableExtLdPromotion.
   bool EnableExtLdPromotion;

lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 89 additions & 78 deletions
@@ -77,7 +77,6 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
 STATISTIC(NumRetsDup, "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");

 static cl::opt<bool> DisableBranchOpts(
@@ -215,7 +214,6 @@ class TypePromotionTransaction;
   bool optimizeExtractElementInst(Instruction *Inst);
   bool dupRetToEnableTailCallOpts(BasicBlock *BB);
   bool placeDbgValues(Function &F);
-  bool sinkAndCmp(Function &F);
   bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
                       Instruction *&Inst,
                       const SmallVectorImpl<Instruction *> &Exts,
@@ -290,14 +288,8 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   // find a node corresponding to the value.
   EverMadeChange |= placeDbgValues(F);

-  // If there is a mask, compare against zero, and branch that can be combined
-  // into a single target instruction, push the mask and compare into branch
-  // users. Do this before OptimizeBlock -> OptimizeInst ->
-  // OptimizeCmpExpression, which perturbs the pattern being searched for.
-  if (!DisableBranchOpts) {
-    EverMadeChange |= sinkAndCmp(F);
+  if (!DisableBranchOpts)
     EverMadeChange |= splitBranchCondition(F);
-  }

   bool MadeChange = true;
   while (MadeChange) {
@@ -1090,6 +1082,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
   return false;
 }

+/// Duplicate and sink the given 'and' instruction into user blocks where it is
+/// used in a compare to allow isel to generate better code for targets where
+/// this operation can be combined.
+///
+/// Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI,
+                                  const TargetLowering &TLI,
+                                  SetOfInstrs &InsertedInsts) {
+  // Double-check that we're not trying to optimize an instruction that was
+  // already optimized by some other part of this pass.
+  assert(!InsertedInsts.count(AndI) &&
+         "Attempting to optimize already optimized and instruction");
+  (void) InsertedInsts;
+
+  // Nothing to do for single use in same basic block.
+  if (AndI->hasOneUse() &&
+      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+    return false;
+
+  // Try to avoid cases where sinking/duplicating is likely to increase
+  // register pressure.
+  if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+      !isa<ConstantInt>(AndI->getOperand(1)) &&
+      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+    return false;
+
+  for (auto *U : AndI->users()) {
+    Instruction *User = cast<Instruction>(U);
+
+    // Only sink for and mask feeding icmp with 0.
+    if (!isa<ICmpInst>(User))
+      return false;
+
+    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+    if (!CmpC || !CmpC->isZero())
+      return false;
+  }
+
+  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+    return false;
+
+  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+  DEBUG(AndI->getParent()->dump());
+
+  // Push the 'and' into the same block as the icmp 0. There should only be
+  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+  // others, so we don't need to keep track of which BBs we insert into.
+  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+    // Keep the 'and' in the same place if the use is already in the same block.
+    Instruction *InsertPt =
+        User->getParent() == AndI->getParent() ? AndI : User;
+    Instruction *InsertedAnd =
+        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+                               AndI->getOperand(1), "", InsertPt);
+    // Propagate the debug info.
+    InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+    // Replace a use of the 'and' with a use of the new 'and'.
+    TheUse = InsertedAnd;
+    ++NumAndUses;
+    DEBUG(User->getParent()->dump());
+  }
+
+  // We removed all uses, nuke the and.
+  AndI->eraseFromParent();
+  return true;
+}
+
 /// Check if the candidates could be combined with a shift instruction, which
 /// includes:
 /// 1. Truncate instruction
@@ -4534,13 +4603,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
       !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
     return false;

-  // Skip loads we've already transformed or have no reason to transform.
-  if (Load->hasOneUse()) {
-    User *LoadUser = *Load->user_begin();
-    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
-        !dyn_cast<PHINode>(LoadUser))
-      return false;
-  }
+  // Skip loads we've already transformed.
+  if (Load->hasOneUse() &&
+      InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+    return false;

   // Look at all uses of Load, looking through phis, to determine how many bits
   // of the loaded value are needed.
@@ -4636,6 +4702,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
   IRBuilder<> Builder(Load->getNextNode());
   auto *NewAnd = dyn_cast<Instruction>(
       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+  // Mark this instruction as "inserted by CGP", so that other
+  // optimizations don't touch it.
+  InsertedInsts.insert(NewAnd);

   // Replace all uses of load with new and (except for the use of load in the
   // new and itself).
@@ -5550,6 +5619,10 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {

   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);

+  if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
+      EnableAndCmpSinking && TLI)
+    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
+
   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                 BinOp->getOpcode() == Instruction::LShr)) {
     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
@@ -5679,68 +5752,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
   return MadeChange;
 }

-// If there is a sequence that branches based on comparing a single bit
-// against zero that can be combined into a single instruction, and the
-// target supports folding these into a single instruction, sink the
-// mask and compare into the branch uses. Do this before OptimizeBlock ->
-// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
-// searched for.
-bool CodeGenPrepare::sinkAndCmp(Function &F) {
-  if (!EnableAndCmpSinking)
-    return false;
-  if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
-    return false;
-  bool MadeChange = false;
-  for (BasicBlock &BB : F) {
-    // Does this BB end with the following?
-    //   %andVal = and %val, #single-bit-set
-    //   %icmpVal = icmp %andResult, 0
-    //   br i1 %cmpVal label %dest1, label %dest2"
-    BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
-    if (!Brcc || !Brcc->isConditional())
-      continue;
-    ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
-    if (!Cmp || Cmp->getParent() != &BB)
-      continue;
-    ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
-    if (!Zero || !Zero->isZero())
-      continue;
-    Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
-    if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
-      continue;
-    ConstantInt *Mask = dyn_cast<ConstantInt>(And->getOperand(1));
-    if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
-      continue;
-    DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
-
-    // Push the "and; icmp" for any users that are conditional branches.
-    // Since there can only be one branch use per BB, we don't need to keep
-    // track of which BBs we insert into.
-    for (Use &TheUse : Cmp->uses()) {
-      // Find brcc use.
-      BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
-      if (!BrccUser || !BrccUser->isConditional())
-        continue;
-      BasicBlock *UserBB = BrccUser->getParent();
-      if (UserBB == &BB) continue;
-      DEBUG(dbgs() << "found Brcc use\n");
-
-      // Sink the "and; icmp" to use.
-      MadeChange = true;
-      BinaryOperator *NewAnd =
-        BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
-                                  BrccUser);
-      CmpInst *NewCmp =
-        CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
-                        "", BrccUser);
-      TheUse = NewCmp;
-      ++NumAndCmpsMoved;
-      DEBUG(BrccUser->getParent()->dump());
-    }
-  }
-  return MadeChange;
-}
-
 /// \brief Scale down both weights to fit into uint32_t.
 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;

lib/CodeGen/TargetLoweringBase.cpp

Lines changed: 0 additions & 1 deletion
@@ -838,7 +838,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
-  MaskAndBranchFoldingIsLegal = false;
   EnableExtLdPromotion = false;
   HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;

lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 13 additions & 2 deletions
@@ -554,8 +554,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,

   setSchedulingPreference(Sched::Hybrid);

-  // Enable TBZ/TBNZ
-  MaskAndBranchFoldingIsLegal = true;
   EnableExtLdPromotion = true;

   // Set required alignment.
@@ -10655,6 +10653,19 @@ Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const
       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
 }

+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  // Only sink the 'and' mask to the cmp use block if it is masking a single
+  // bit, since this is likely to fold the and/cmp/br into a single tbz
+  // instruction. It may be beneficial to sink in other cases, but we would
+  // have to check that the cmp would not get folded into the br to form a
+  // cbz for these to be beneficial.
+  ConstantInt *Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask)
+    return false;
+  return Mask->getUniqueInteger().isPowerOf2();
+}
+
 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
   // Update IsSplitCSR in AArch64FunctionInfo.
   AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
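To illustrate the single-bit restriction the AArch64 hook enforces, here is a small hypothetical LLVM IR sketch (invented names, not part of the commit): a power-of-two mask qualifies because a local and/icmp-0/br chain can lower to a single tbz/tbnz, while a multi-bit mask is declined because sinking it would not save any instructions.

; Power-of-two mask: the hook returns true; when the compare feeds a branch,
; isel can emit a single "tbz x0, #4, <target>" (mask 16 tests bit 4).
define i1 @tbz_candidate(i64 %x) {
  %bit = and i64 %x, 16
  %c = icmp eq i64 %bit, 0
  ret i1 %c
}

; Multi-bit mask: the hook returns false, since the branch would still need
; separate and + cmp/cbz instructions, so sinking buys nothing.
define i1 @not_a_candidate(i64 %x) {
  %bits = and i64 %x, 6
  %c = icmp eq i64 %bits, 0
  ret i1 %c
}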

lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
@@ -412,6 +412,8 @@ class AArch64TargetLowering : public TargetLowering {
     return true;
   }

+  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
   bool hasAndNotCompare(SDValue) const override {
     // 'bics'
     return true;

lib/Target/X86/X86ISelLowering.cpp

Lines changed: 5 additions & 0 deletions
@@ -4448,6 +4448,11 @@ bool X86TargetLowering::isCtlzFast() const {
   return Subtarget.hasFastLZCNT();
 }

+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  return true;
+}
+
 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   if (!Subtarget.hasBMI())
     return false;
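For contrast, the X86 override accepts any mask: test can encode arbitrary immediates, so an and/icmp-0 chain folds into a single test regardless of how many bits are set, which is why sinking is broadly profitable there. A hypothetical sketch (invented names, not from the commit or its tests):

; Arbitrary multi-bit mask still qualifies on X86; when the compare feeds a
; branch, the and/icmp-0/br chain lowers to a test plus a conditional jump.
define i1 @x86_any_mask(i32 %x) {
  %m = and i32 %x, 1234
  %c = icmp eq i32 %m, 0
  ret i1 %c
}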

lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
@@ -806,6 +806,8 @@ namespace llvm {
     return false;
   }

+  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
   bool hasAndNotCompare(SDValue Y) const override;

   /// Return the value type to use for ISD::SETCC.
