@@ -112,10 +112,12 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
@@ -396,6 +398,19 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
     cl::Hidden,
     cl::desc("Try wider VFs if they enable the use of vector variants"));

+// Likelihood of bypassing the vectorized loop because assumptions about SCEV
+// variables not overflowing do not hold. See `emitSCEVChecks`.
+static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
+// Likelihood of bypassing the vectorized loop because pointers overlap. See
+// `emitMemRuntimeChecks`.
+static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
+// Likelihood of bypassing the vectorized loop because there are zero trips
+// left after the prologue. See `emitIterationCountCheck`.
+static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
+// Likelihood of bypassing the vectorized loop because no epilogue iterations
+// are necessary. See `emitMinimumVectorEpilogueIterCountCheck`.
+static constexpr uint32_t EpilogueMinItersBypassWeights[] = {1, 127};
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type.
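These weight pairs encode an assumed probability rather than measured profile data: with branch_weights {1, 127}, later passes treat the bypass edge as taken once for every 127 times the vector code is entered, i.e. with probability 1/128. The standalone snippet below (illustrative values only, not part of the patch) just spells out that arithmetic.

#include <cstdint>
#include <cstdio>

int main() {
  // Same shape as the bypass weight arrays in the patch: {bypass, no-bypass}.
  const uint32_t Weights[] = {1, 127};
  // The branch probability implied by "branch_weights" metadata is
  // weight(edge) / sum(all edge weights).
  const double BypassProb =
      static_cast<double>(Weights[0]) / (Weights[0] + Weights[1]);
  std::printf("assumed bypass probability: %f\n", BypassProb); // 0.0078125
  return 0;
}

All four checks therefore start from the same optimistic 1-in-128 guess that the runtime check fails.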
@@ -1962,12 +1977,14 @@ class GeneratedRTChecks {
   SCEVExpander MemCheckExp;

   bool CostTooHigh = false;
+  const bool AddBranchWeights;

 public:
   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
-                    TargetTransformInfo *TTI, const DataLayout &DL)
+                    TargetTransformInfo *TTI, const DataLayout &DL,
+                    bool AddBranchWeights)
       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
-        MemCheckExp(SE, DL, "scev.check") {}
+        MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}

   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
@@ -2160,8 +2177,10 @@ class GeneratedRTChecks {
     DT->addNewBlock(SCEVCheckBlock, Pred);
     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

-    ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
-                        BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
+    BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
+    if (AddBranchWeights)
+      setBranchWeights(BI, SCEVCheckBypassWeights);
+    ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
     return SCEVCheckBlock;
   }

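The setBranchWeights call used above and in the following hunks is a helper whose definition is not part of this excerpt. As a rough sketch only, such a helper can be built on MDBuilder::createBranchWeights, which is why the patch adds the MDBuilder.h include; the name setBranchWeightsSketch and the exact signature here are assumptions, not the patch's actual code.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

// Sketch: attach "branch_weights" !prof metadata with the given edge weights
// to a terminator instruction (the uint32_t arrays above convert implicitly
// to ArrayRef<uint32_t>).
static void setBranchWeightsSketch(Instruction &I, ArrayRef<uint32_t> Weights) {
  MDBuilder MDB(I.getContext());
  MDNode *BranchWeights = MDB.createBranchWeights(Weights);
  I.setMetadata(LLVMContext::MD_prof, BranchWeights);
}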
@@ -2185,9 +2204,12 @@ class GeneratedRTChecks {
     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
       PL->addBasicBlockToLoop(MemCheckBlock, *LI);

-    ReplaceInstWithInst(
-        MemCheckBlock->getTerminator(),
-        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
+    BranchInst &BI =
+        *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
+    if (AddBranchWeights) {
+      setBranchWeights(BI, MemCheckBypassWeights);
+    }
+    ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
     MemCheckBlock->getTerminator()->setDebugLoc(
         Pred->getTerminator()->getDebugLoc());

@@ -2900,9 +2922,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   // dominator of the exit blocks.
   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

-  ReplaceInstWithInst(
-      TCCheckBlock->getTerminator(),
-      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+  BranchInst &BI =
+      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+    setBranchWeights(BI, MinItersBypassWeights);
+  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
   LoopBypassBlocks.push_back(TCCheckBlock);
 }

@@ -3133,7 +3157,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
     IRBuilder<> B(LoopMiddleBlock->getTerminator());
     B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
     Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
-    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
+    BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
+    BI.setCondition(CmpN);
+    if (hasBranchWeightMD(*ScalarLatchTerm)) {
+      // Assume that `Count % VectorTripCount` is equally distributed.
+      unsigned TripCount = UF * VF.getKnownMinValue();
+      assert(TripCount > 0 && "trip count should not be zero");
+      MDBuilder MDB(ScalarLatchTerm->getContext());
+      MDNode *BranchWeights = MDB.createBranchWeights(1, TripCount - 1);
+      BI.setMetadata(LLVMContext::MD_prof, BranchWeights);
+    }
   }

 #ifdef EXPENSIVE_CHECKS
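Unlike the fixed {1, 127} pairs, the weights here are derived: the middle block leaves the vector loop for the scalar remainder unless Count is an exact multiple of the vector step UF * VF, and if that remainder is assumed uniformly distributed, the no-remainder edge fires in 1 of UF * VF cases, hence the 1 : TripCount - 1 ratio. A tiny standalone illustration with made-up UF and VF values (not taken from the patch):

#include <cstdio>

int main() {
  // Illustrative values only; the real UF and VF come from the cost model.
  const unsigned UF = 2;
  const unsigned KnownMinVF = 4;            // VF.getKnownMinValue()
  const unsigned TripCount = UF * KnownMinVF; // 8 iterations per vector trip

  // If Count % TripCount is uniform over 0..TripCount-1, exactly one of those
  // TripCount outcomes (remainder 0) skips the scalar remainder loop.
  std::printf("branch weights: %u : %u\n", 1u, TripCount - 1); // 1 : 7
  return 0;
}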
@@ -7896,9 +7929,11 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
     EPI.TripCount = Count;
   }

-  ReplaceInstWithInst(
-      TCCheckBlock->getTerminator(),
-      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+  BranchInst &BI =
+      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+    setBranchWeights(BI, MinItersBypassWeights);
+  ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);

   return TCCheckBlock;
 }
@@ -8042,9 +8077,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
                                       EPI.EpilogueVF, EPI.EpilogueUF),
                       "min.epilog.iters.check");

-  ReplaceInstWithInst(
-      Insert->getTerminator(),
-      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+  BranchInst &BI =
+      *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
+  if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
+    setBranchWeights(BI, EpilogueMinItersBypassWeights);
+  ReplaceInstWithInst(Insert->getTerminator(), &BI);

   LoopBypassBlocks.push_back(Insert);
   return Insert;
@@ -9731,8 +9768,10 @@ static bool processLoopInVPlanNativePath(
   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

   {
+    bool AddBranchWeights =
+        hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                             F->getParent()->getDataLayout());
+                             F->getParent()->getDataLayout(), AddBranchWeights);
     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
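Both call sites compute AddBranchWeights the same way: synthetic weights are only added when the original loop latch terminator already carries branch_weights profile metadata, so unprofiled code is left untouched. A minimal standalone sketch of that gate follows; the wrapper name shouldAddBranchWeights is an illustration, not part of the patch.

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/ProfDataUtils.h"

using namespace llvm;

// Only synthesize branch weights when the original loop's latch terminator
// already carries "branch_weights" profile metadata.
static bool shouldAddBranchWeights(const Loop &L) {
  if (const BasicBlock *Latch = L.getLoopLatch())
    return hasBranchWeightMD(*Latch->getTerminator());
  return false;
}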
@@ -10077,8 +10116,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   VectorizationFactor VF = VectorizationFactor::Disabled();
   unsigned IC = 1;

+  bool AddBranchWeights =
+      hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
-                           F->getParent()->getDataLayout());
+                           F->getParent()->getDataLayout(), AddBranchWeights);
   if (MaybeVF) {
     VF = *MaybeVF;
     // Select the interleave count.