|
22 | 22 | //
|
23 | 23 | // Future loop memory idioms to recognize:
|
24 | 24 | // memcmp, strlen, etc.
|
25 |
| -// Future floating point idioms to recognize in -ffast-math mode: |
26 |
| -// fpowi |
27 | 25 | //
|
28 | 26 | // This could recognize common matrix multiplies and dot product idioms and
|
29 | 27 | // replace them with calls to BLAS (if linked in??).
|
|
94 | 92 | #include <vector>
|
95 | 93 |
|
96 | 94 | using namespace llvm;
|
| 95 | +using namespace llvm::PatternMatch; |
97 | 96 |
|
98 | 97 | #define DEBUG_TYPE "loop-idiom"
|
99 | 98 |
|
@@ -129,6 +128,14 @@ static cl::opt<bool, true>
|
129 | 128 | cl::location(DisableLIRP::Memcpy), cl::init(false),
|
130 | 129 | cl::ReallyHidden);
|
131 | 130 |
|
| 131 | +bool DisableLIRP::Powi; |
| 132 | +static cl::opt<bool, true> |
| 133 | + DisableLIRPPowi("disable-" DEBUG_TYPE "-powi", |
| 134 | + cl::desc("Proceed with loop idiom recognize pass, but do " |
| 135 | + "not convert the powi idiom."), |
| 136 | + cl::location(DisableLIRP::Powi), cl::init(false), |
| 137 | + cl::ReallyHidden); |
| 138 | + |
132 | 139 | static cl::opt<bool> UseLIRCodeSizeHeurs(
|
133 | 140 | "use-lir-code-size-heurs",
|
134 | 141 | cl::desc("Use loop idiom recognition code size heuristics when compiling"
|
@@ -206,6 +213,7 @@ class LoopIdiomRecognize {
|
206 | 213 | const SCEV *BECount);
|
207 | 214 | bool processLoopMemCpy(MemCpyInst *MCI, const SCEV *BECount);
|
208 | 215 | bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
|
| 216 | + bool processLoopPowi(const SCEV *BECount); |
209 | 217 |
|
210 | 218 | bool processLoopStridedStore(Value *DestPtr, const SCEV *StoreSizeSCEV,
|
211 | 219 | MaybeAlign StoreAlignment, Value *StoredVal,
|
@@ -298,13 +306,8 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) {
|
298 | 306 | ApplyCodeSizeHeuristics =
|
299 | 307 | L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
|
300 | 308 |
|
301 |
| - HasMemset = TLI->has(LibFunc_memset); |
302 |
| - HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); |
303 |
| - HasMemcpy = TLI->has(LibFunc_memcpy); |
304 |
| - |
305 |
| - if (HasMemset || HasMemsetPattern || HasMemcpy) |
306 |
| - if (SE->hasLoopInvariantBackedgeTakenCount(L)) |
307 |
| - return runOnCountableLoop(); |
| 309 | + if (SE->hasLoopInvariantBackedgeTakenCount(L)) |
| 310 | + return runOnCountableLoop(); |
308 | 311 |
|
309 | 312 | return runOnNoncountableLoop();
|
310 | 313 | }
|
@@ -549,33 +552,44 @@ bool LoopIdiomRecognize::runOnLoopBlock(
|
549 | 552 | BasicBlock *BB, const SCEV *BECount,
|
550 | 553 | SmallVectorImpl<BasicBlock *> &ExitBlocks) {
|
551 | 554 | // We can only promote stores in this block if they are unconditionally
|
552 |
| - // executed in the loop. For a block to be unconditionally executed, it has |
553 |
| - // to dominate all the exit blocks of the loop. Verify this now. |
| 555 | + // executed in the loop. The powi idiom also requires the block to be |
| 556 | + // unconditionally executed. For a block to be unconditionally executed, it |
| 557 | + // has to dominate all the exit blocks of the loop. |
554 | 558 | for (BasicBlock *ExitBlock : ExitBlocks)
|
555 | 559 | if (!DT->dominates(BB, ExitBlock))
|
556 | 560 | return false;
|
557 | 561 |
|
558 | 562 | bool MadeChange = false;
|
559 |
| - // Look for store instructions, which may be optimized to memset/memcpy. |
560 |
| - collectStores(BB); |
561 | 563 |
|
562 |
| - // Look for a single store or sets of stores with a common base, which can be |
563 |
| - // optimized into a memset (memset_pattern). The latter most commonly happens |
564 |
| - // with structs and handunrolled loops. |
565 |
| - for (auto &SL : StoreRefsForMemset) |
566 |
| - MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes); |
| 564 | + HasMemset = TLI->has(LibFunc_memset); |
| 565 | + HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); |
| 566 | + HasMemcpy = TLI->has(LibFunc_memcpy); |
| 567 | + |
| 568 | + if (HasMemset || HasMemsetPattern || HasMemcpy) { |
| 569 | + // Look for store instructions, which may be optimized to memset/memcpy. |
| 570 | + collectStores(BB); |
567 | 571 |
|
568 |
| - for (auto &SL : StoreRefsForMemsetPattern) |
569 |
| - MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No); |
| 572 | + // Look for a single store or sets of stores with a common base, which can |
| 573 | + // be optimized into a memset (memset_pattern). The latter most commonly |
| 574 | + // happens with structs and handunrolled loops. |
| 575 | + for (auto &SL : StoreRefsForMemset) |
| 576 | + MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes); |
570 | 577 |
|
571 |
| - // Optimize the store into a memcpy, if it feeds an similarly strided load. |
572 |
| - for (auto &SI : StoreRefsForMemcpy) |
573 |
| - MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); |
| 578 | + for (auto &SL : StoreRefsForMemsetPattern) |
| 579 | + MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No); |
| 580 | + |
| 581 | + // Optimize the store into a memcpy, if it feeds a similarly strided load. |
| 582 | + for (auto &SI : StoreRefsForMemcpy) |
| 583 | + MadeChange |= processLoopStoreOfLoopLoad(SI, BECount); |
| 584 | + |
| 585 | + MadeChange |= processLoopMemIntrinsic<MemCpyInst>( |
| 586 | + BB, &LoopIdiomRecognize::processLoopMemCpy, BECount); |
| 587 | + MadeChange |= processLoopMemIntrinsic<MemSetInst>( |
| 588 | + BB, &LoopIdiomRecognize::processLoopMemSet, BECount); |
| 589 | + } |
574 | 590 |
|
575 |
| - MadeChange |= processLoopMemIntrinsic<MemCpyInst>( |
576 |
| - BB, &LoopIdiomRecognize::processLoopMemCpy, BECount); |
577 |
| - MadeChange |= processLoopMemIntrinsic<MemSetInst>( |
578 |
| - BB, &LoopIdiomRecognize::processLoopMemSet, BECount); |
| 591 | + if (!DisableLIRP::Powi) |
| 592 | + MadeChange |= processLoopPowi(BECount); |
579 | 593 |
|
580 | 594 | return MadeChange;
|
581 | 595 | }
|
@@ -925,6 +939,112 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
|
925 | 939 | BECount, IsNegStride, /*IsLoopMemset=*/true);
|
926 | 940 | }
|
927 | 941 |
|
| 942 | +static CallInst *createPowiIntrinsic(IRBuilder<> &IRBuilder, Value *Base, |
| 943 | + Value *Exp, const DebugLoc &DL) { |
| 944 | + Value *Ops[] = {Base, Exp}; |
| 945 | + Type *Tys[] = {Base->getType(), Exp->getType()}; |
| 946 | + |
| 947 | + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); |
| 948 | + Function *Func = Intrinsic::getDeclaration(M, Intrinsic::powi, Tys); |
| 949 | + CallInst *CI = IRBuilder.CreateCall(Func, Ops); |
| 950 | + CI->setDebugLoc(DL); |
| 951 | + return CI; |
| 952 | +} |
| 953 | + |
| 954 | +// Checks that the Phi is an fmul fast with a loop-invariant operand, and |
| 955 | +// returns the the fmul instruction. |
| 956 | +static Instruction *detectPowiIdiom(PHINode *Phi, BasicBlock *PH, |
| 957 | + BasicBlock *Latch, Loop *CurLoop) { |
| 958 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Performing powi idiom detection\n"); |
| 959 | + |
| 960 | + // The phi must have two incoming values (one from the preheader, and another |
| 961 | + // from the latch), it must have one use (which we will subsequently check is |
| 962 | + // an fmul fast instruction), and it must be a floating-point type. |
| 963 | + if (Phi->getNumIncomingValues() != 2 || !Phi->hasOneUse() || |
| 964 | + Phi->getBasicBlockIndex(PH) < 0 || Phi->getBasicBlockIndex(Latch) < 0 || |
| 965 | + !Phi->getType()->isFloatingPointTy()) { |
| 966 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Unable to operate on this PHI node\n"); |
| 967 | + return nullptr; |
| 968 | + } |
| 969 | + |
| 970 | + // Further, check that the incoming value from the preheader is 1.0. |
| 971 | + auto *ConstFP = dyn_cast<ConstantFP>(Phi->getIncomingValueForBlock(PH)); |
| 972 | + if (!ConstFP || !ConstFP->isExactlyValue(1.0)) { |
| 973 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Initial value comparison failed\n"); |
| 974 | + return nullptr; |
| 975 | + } |
| 976 | + |
| 977 | + auto *I = cast<Instruction>(Phi->use_begin()->getUser()); |
| 978 | + Value *Op1, *Op2; |
| 979 | + if (!match(I, m_FMul(m_Value(Op1), m_Value(Op2))) || !I->hasApproxFunc()) { |
| 980 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " fmul-afn test failed\n"); |
| 981 | + return nullptr; |
| 982 | + } |
| 983 | + for (Use &U : I->uses()) { |
| 984 | + if (isa<PHINode>(U.getUser())) |
| 985 | + continue; |
| 986 | + if (U->isUsedInBasicBlock(Latch)) { |
| 987 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " FMul used inside loop\n"); |
| 988 | + return nullptr; |
| 989 | + } |
| 990 | + } |
| 991 | + Value *Base = Op1 == Phi ? Op2 : Op1; |
| 992 | + if (CurLoop->isLoopInvariant(Base)) |
| 993 | + return I; |
| 994 | + else |
| 995 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Base is not loop-invariant\n"); |
| 996 | + return nullptr; |
| 997 | +} |
| 998 | + |
| 999 | +/// Detect the powi idiom, and convert it to an intrinsic. |
| 1000 | +bool LoopIdiomRecognize::processLoopPowi(const SCEV *BECount) { |
| 1001 | + // We only process loops where the IV is found, and at most i32. |
| 1002 | + PHINode *IV = CurLoop->getInductionVariable(*SE); |
| 1003 | + if (!IV || IV->getType()->getScalarSizeInBits() > 32) |
| 1004 | + return false; |
| 1005 | + |
| 1006 | + // If the loop doesn't have a valid preheader and latch, give up now. |
| 1007 | + BasicBlock *PH = CurLoop->getLoopPreheader(); |
| 1008 | + BasicBlock *Latch = CurLoop->getLoopLatch(); |
| 1009 | + if (!PH || !Latch) |
| 1010 | + return false; |
| 1011 | + |
| 1012 | + // Find the Phi corresponding to the powi idiom, amongst all phis except the |
| 1013 | + // induction phi. |
| 1014 | + for (PHINode &Phi : Latch->phis()) { |
| 1015 | + if (&Phi == IV) |
| 1016 | + continue; |
| 1017 | + if (Instruction *FMul = detectPowiIdiom(&Phi, PH, Latch, CurLoop)) { |
| 1018 | + // Find the trip count, and expand the SCEV to find the exponent of the |
| 1019 | + // powi. |
| 1020 | + IRBuilder<> Builder(PH->getTerminator()); |
| 1021 | + SCEVExpander Expander(*SE, *DL, "loop-idiom"); |
| 1022 | + SCEVExpanderCleaner ExpCleaner(Expander); |
| 1023 | + Type *ExpTy = Builder.getInt32Ty(); |
| 1024 | + const SCEV *TripCount = |
| 1025 | + SE->getTripCountFromExitCount(BECount, ExpTy, CurLoop); |
| 1026 | + if (!Expander.isSafeToExpand(TripCount)) { |
| 1027 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " Trip count not safe to expand\n"); |
| 1028 | + return false; |
| 1029 | + } |
| 1030 | + Value *Exp = |
| 1031 | + Expander.expandCodeFor(TripCount, ExpTy, PH->getTerminator()); |
| 1032 | + |
| 1033 | + // Insert the powi intrinsic, and replace its uses outside the block. |
| 1034 | + const DebugLoc &Loc = FMul->getDebugLoc(); |
| 1035 | + Value *Base = isa<PHINode>(FMul->getOperand(0)) ? FMul->getOperand(1) |
| 1036 | + : FMul->getOperand(0); |
| 1037 | + CallInst *Powi = createPowiIntrinsic(Builder, Base, Exp, Loc); |
| 1038 | + FMul->replaceUsesOutsideBlock(Powi, Latch); |
| 1039 | + ExpCleaner.markResultUsed(); |
| 1040 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " powi idiom optimized!\n"); |
| 1041 | + return true; |
| 1042 | + } |
| 1043 | + } |
| 1044 | + LLVM_DEBUG(dbgs() << DEBUG_TYPE " powi idiom detection failed\n"); |
| 1045 | + return false; |
| 1046 | +} |
| 1047 | + |
928 | 1048 | /// mayLoopAccessLocation - Return true if the specified loop might access the
|
929 | 1049 | /// specified pointer location, which is a loop-strided access. The 'Access'
|
930 | 1050 | /// argument specifies what the verboten forms of access are (read or write).
|
@@ -2216,8 +2336,6 @@ static bool detectShiftUntilBitTestIdiom(Loop *CurLoop, Value *&BaseX,
|
2216 | 2336 | BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
|
2217 | 2337 | assert(LoopPreheaderBB && "There is always a loop preheader.");
|
2218 | 2338 |
|
2219 |
| - using namespace PatternMatch; |
2220 |
| - |
2221 | 2339 | // Step 1: Check if the loop backedge is in desirable form.
|
2222 | 2340 |
|
2223 | 2341 | ICmpInst::Predicate Pred;
|
|
0 commit comments