Skip to content

Commit 81d6b53

Browse files
committed
LoopIdiomRecognize: detect and convert powi idiom
The following code, when compiled under -ffast-math, produces bad codegen due to LoopVectorize:

    float powi(float base, int exp) {
      float result = 1.0;
      for (int i = 0; i < exp; ++i)
        result *= base;
      return result;
    }

It can easily be replaced with the llvm.powi intrinsic when the exponent is a C int type. This is the job of LoopIdiomRecognize, and has been marked as a TODO item for years. Fulfill this wish, and replace computations of this form with the llvm.powi intrinsic.
1 parent 392e99c commit 81d6b53

File tree

3 files changed

+179
-112
lines changed

3 files changed

+179
-112
lines changed

llvm/include/llvm/Transforms/Scalar/LoopIdiomRecognize.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ struct DisableLIRP {
3434

3535
/// When true, Memcpy is disabled.
3636
static bool Memcpy;
37+
38+
/// When true, Powi is disabled.
39+
static bool Powi;
3740
};
3841

3942
/// Performs Loop Idiom Recognize Pass.

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Lines changed: 147 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@
2222
//
2323
// Future loop memory idioms to recognize:
2424
// memcmp, strlen, etc.
25-
// Future floating point idioms to recognize in -ffast-math mode:
26-
// fpowi
2725
//
2826
// This could recognize common matrix multiplies and dot product idioms and
2927
// replace them with calls to BLAS (if linked in??).
@@ -94,6 +92,7 @@
9492
#include <vector>
9593

9694
using namespace llvm;
95+
using namespace llvm::PatternMatch;
9796

9897
#define DEBUG_TYPE "loop-idiom"
9998

@@ -129,6 +128,14 @@ static cl::opt<bool, true>
129128
cl::location(DisableLIRP::Memcpy), cl::init(false),
130129
cl::ReallyHidden);
131130

131+
bool DisableLIRP::Powi;
132+
static cl::opt<bool, true>
133+
DisableLIRPPowi("disable-" DEBUG_TYPE "-powi",
134+
cl::desc("Proceed with loop idiom recognize pass, but do "
135+
"not convert the powi idiom."),
136+
cl::location(DisableLIRP::Powi), cl::init(false),
137+
cl::ReallyHidden);
138+
132139
static cl::opt<bool> UseLIRCodeSizeHeurs(
133140
"use-lir-code-size-heurs",
134141
cl::desc("Use loop idiom recognition code size heuristics when compiling"
@@ -206,6 +213,7 @@ class LoopIdiomRecognize {
206213
const SCEV *BECount);
207214
bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
208215
bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
216+
bool processLoopPowi(const SCEV *BECount);
209217

210218
bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
211219
MaybeAlign StoreAlignment, Value *StoredVal,
@@ -298,13 +306,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
298306
ApplyCodeSizeHeuristics =
299307
L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
300308

301-
HasMemset = TLI->has(LibFunc_memset);
302-
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
303-
HasMemcpy = TLI->has(LibFunc_memcpy);
304-
305-
if (HasMemset || HasMemsetPattern || HasMemcpy)
306-
if (SE->hasLoopInvariantBackedgeTakenCount(L))
307-
return runOnCountableLoop();
309+
if (SE->hasLoopInvariantBackedgeTakenCount(L))
310+
return runOnCountableLoop();
308311

309312
return runOnNoncountableLoop();
310313
}
@@ -549,33 +552,44 @@ bool LoopIdiomRecognize::runOnLoopBlock(
549552
BasicBlock *BB, const SCEV *BECount,
550553
SmallVectorImpl<BasicBlock *> &ExitBlocks) {
551554
// We can only promote stores in this block if they are unconditionally
552-
// executed in the loop. For a block to be unconditionally executed, it has
553-
// to dominate all the exit blocks of the loop. Verify this now.
555+
// executed in the loop. The powi idiom also requires the block to be
556+
// unconditionally executed. For a block to be unconditionally executed, it
557+
// has to dominate all the exit blocks of the loop.
554558
for (BasicBlock *ExitBlock : ExitBlocks)
555559
if (!DT->dominates(BB, ExitBlock))
556560
return false;
557561

558562
bool MadeChange = false;
559-
// Look for store instructions, which may be optimized to memset/memcpy.
560-
collectStores(BB);
561563

562-
// Look for a single store or sets of stores with a common base, which can be
563-
// optimized into a memset (memset_pattern). The latter most commonly happens
564-
// with structs and handunrolled loops.
565-
for (auto &SL : StoreRefsForMemset)
566-
MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
564+
HasMemset = TLI->has(LibFunc_memset);
565+
HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
566+
HasMemcpy = TLI->has(LibFunc_memcpy);
567+
568+
if (HasMemset || HasMemsetPattern || HasMemcpy) {
569+
// Look for store instructions, which may be optimized to memset/memcpy.
570+
collectStores(BB);
567571

568-
for (auto &SL : StoreRefsForMemsetPattern)
569-
MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
572+
// Look for a single store or sets of stores with a common base, which can
573+
// be optimized into a memset (memset_pattern). The latter most commonly
574+
// happens with structs and handunrolled loops.
575+
for (auto &SL : StoreRefsForMemset)
576+
MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
570577

571-
// Optimize the store into a memcpy, if it feeds an similarly strided load.
572-
for (auto &SI : StoreRefsForMemcpy)
573-
MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
578+
for (auto &SL : StoreRefsForMemsetPattern)
579+
MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
580+
581+
// Optimize the store into a memcpy, if it feeds a similarly strided load.
582+
for (auto &SI : StoreRefsForMemcpy)
583+
MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
584+
585+
MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
586+
BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
587+
MadeChange |= processLoopMemIntrinsic<MemSetInst>(
588+
BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
589+
}
574590

575-
MadeChange |= processLoopMemIntrinsic<MemCpyInst>(
576-
BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
577-
MadeChange |= processLoopMemIntrinsic<MemSetInst>(
578-
BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
591+
if (!DisableLIRP::Powi)
592+
MadeChange |= processLoopPowi(BECount);
579593

580594
return MadeChange;
581595
}
@@ -925,6 +939,112 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
925939
BECount, IsNegStride, /*IsLoopMemset=*/true);
926940
}
927941

942+
/// Emit a call to the llvm.powi intrinsic at the insertion point of
/// \p IRBuilder, computing \p Base raised to \p Exp.
///
/// The intrinsic declaration is looked up (or created) in the module that
/// owns the builder's current insertion block, using the types of \p Base and
/// \p Exp as the overload types. The resulting call is tagged with \p DL.
///
/// \returns the newly created call instruction.
static CallInst *createPowiIntrinsic(IRBuilder<> &IRBuilder, Value *Base,
                                     Value *Exp, const DebugLoc &DL) {
  Module *TheModule = IRBuilder.GetInsertBlock()->getModule();
  Function *PowiDecl = Intrinsic::getDeclaration(
      TheModule, Intrinsic::powi, {Base->getType(), Exp->getType()});

  CallInst *PowiCall = IRBuilder.CreateCall(PowiDecl, {Base, Exp});
  PowiCall->setDebugLoc(DL);
  return PowiCall;
}
953+
954+
// Checks that the Phi is an fmul fast with a loop-invariant operand, and
955+
// returns the the fmul instruction.
956+
static Instruction *detectPowiIdiom(PHINode *Phi, BasicBlock *PH,
957+
BasicBlock *Latch, Loop *CurLoop) {
958+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Performing powi idiom detection\n");
959+
960+
// The phi must have two incoming values (one from the preheader, and another
961+
// from the latch), it must have one use (which we will subsequently check is
962+
// an fmul fast instruction), and it must be a floating-point type.
963+
if (Phi->getNumIncomingValues() != 2 || !Phi->hasOneUse() ||
964+
Phi->getBasicBlockIndex(PH) < 0 || Phi->getBasicBlockIndex(Latch) < 0 ||
965+
!Phi->getType()->isFloatingPointTy()) {
966+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Unable to operate on this PHI node\n");
967+
return nullptr;
968+
}
969+
970+
// Further, check that the incoming value from the preheader is 1.0.
971+
auto *ConstFP = dyn_cast<ConstantFP>(Phi->getIncomingValueForBlock(PH));
972+
if (!ConstFP || !ConstFP->isExactlyValue(1.0)) {
973+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Initial value comparison failed\n");
974+
return nullptr;
975+
}
976+
977+
auto *I = cast<Instruction>(Phi->use_begin()->getUser());
978+
Value *Op1, *Op2;
979+
if (!match(I, m_FMul(m_Value(Op1), m_Value(Op2))) || !I->hasApproxFunc()) {
980+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " fmul-afn test failed\n");
981+
return nullptr;
982+
}
983+
for (Use &U : I->uses()) {
984+
if (isa<PHINode>(U.getUser()))
985+
continue;
986+
if (U->isUsedInBasicBlock(Latch)) {
987+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " FMul used inside loop\n");
988+
return nullptr;
989+
}
990+
}
991+
Value *Base = Op1 == Phi ? Op2 : Op1;
992+
if (CurLoop->isLoopInvariant(Base))
993+
return I;
994+
else
995+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Base is not loop-invariant\n");
996+
return nullptr;
997+
}
998+
999+
/// Detect the powi idiom, and convert it to an intrinsic.
1000+
bool LoopIdiomRecognize::processLoopPowi(const SCEV *BECount) {
1001+
// We only process loops where the IV is found, and at most i32.
1002+
PHINode *IV = CurLoop->getInductionVariable(*SE);
1003+
if (!IV || IV->getType()->getScalarSizeInBits() > 32)
1004+
return false;
1005+
1006+
// If the loop doesn't have a valid preheader and latch, give up now.
1007+
BasicBlock *PH = CurLoop->getLoopPreheader();
1008+
BasicBlock *Latch = CurLoop->getLoopLatch();
1009+
if (!PH || !Latch)
1010+
return false;
1011+
1012+
// Find the Phi corresponding to the powi idiom, amongst all phis except the
1013+
// induction phi.
1014+
for (PHINode &Phi : Latch->phis()) {
1015+
if (&Phi == IV)
1016+
continue;
1017+
if (Instruction *FMul = detectPowiIdiom(&Phi, PH, Latch, CurLoop)) {
1018+
// Find the trip count, and expand the SCEV to find the exponent of the
1019+
// powi.
1020+
IRBuilder<> Builder(PH->getTerminator());
1021+
SCEVExpander Expander(*SE, *DL, "loop-idiom");
1022+
SCEVExpanderCleaner ExpCleaner(Expander);
1023+
Type *ExpTy = Builder.getInt32Ty();
1024+
const SCEV *TripCount =
1025+
SE->getTripCountFromExitCount(BECount, ExpTy, CurLoop);
1026+
if (!Expander.isSafeToExpand(TripCount)) {
1027+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " Trip count not safe to expand\n");
1028+
return false;
1029+
}
1030+
Value *Exp =
1031+
Expander.expandCodeFor(TripCount, ExpTy, PH->getTerminator());
1032+
1033+
// Insert the powi intrinsic, and replace its uses outside the block.
1034+
const DebugLoc &Loc = FMul->getDebugLoc();
1035+
Value *Base = isa<PHINode>(FMul->getOperand(0)) ? FMul->getOperand(1)
1036+
: FMul->getOperand(0);
1037+
CallInst *Powi = createPowiIntrinsic(Builder, Base, Exp, Loc);
1038+
FMul->replaceUsesOutsideBlock(Powi, Latch);
1039+
ExpCleaner.markResultUsed();
1040+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " powi idiom optimized!\n");
1041+
return true;
1042+
}
1043+
}
1044+
LLVM_DEBUG(dbgs() << DEBUG_TYPE " powi idiom detection failed\n");
1045+
return false;
1046+
}
1047+
9281048
/// mayLoopAccessLocation - Return true if the specified loop might access the
9291049
/// specified pointer location, which is a loop-strided access. The 'Access'
9301050
/// argument specifies what the verboten forms of access are (read or write).
@@ -2216,8 +2336,6 @@ static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
22162336
BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
22172337
assert(LoopPreheaderBB && "There is always a loop preheader.");
22182338

2219-
using namespace PatternMatch;
2220-
22212339
// Step 1: Check if the loop backedge is in desirable form.
22222340

22232341
ICmpInst::Predicate Pred;

0 commit comments

Comments
 (0)