Skip to content

Commit 2bc02d7

Browse files
MatzeB authored and sr-tream committed
LoopVectorize: Set branch_weight for conditional branches (llvm#72450)
Consistently add `branch_weights` metadata in any conditional branch created by `LoopVectorize.cpp`:
- Will only add metadata if the original loop-latch branch had metadata assigned.
- Most checks should rarely trigger, so I am using a 127:1 ratio.
- For the middle block we assume an equal distribution of modulo results.
1 parent 11442f8 commit 2bc02d7

File tree

3 files changed

+171
-49
lines changed

3 files changed

+171
-49
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -112,10 +112,12 @@
112112
#include "llvm/IR/Instructions.h"
113113
#include "llvm/IR/IntrinsicInst.h"
114114
#include "llvm/IR/Intrinsics.h"
115+
#include "llvm/IR/MDBuilder.h"
115116
#include "llvm/IR/Metadata.h"
116117
#include "llvm/IR/Module.h"
117118
#include "llvm/IR/Operator.h"
118119
#include "llvm/IR/PatternMatch.h"
120+
#include "llvm/IR/ProfDataUtils.h"
119121
#include "llvm/IR/Type.h"
120122
#include "llvm/IR/Use.h"
121123
#include "llvm/IR/User.h"
@@ -396,6 +398,19 @@ static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
396398
cl::Hidden,
397399
cl::desc("Try wider VFs if they enable the use of vector variants"));
398400

401+
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
402+
// variables not overflowing do not hold. See `emitSCEVChecks`.
403+
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
404+
// Likelihood of bypassing the vectorized loop because pointers overlap. See
405+
// `emitMemRuntimeChecks`.
406+
static constexpr uint32_t MemCheckBypassWeights[] = {1, 127};
407+
// Likelihood of bypassing the vectorized loop because there are zero trips left
408+
// after prolog. See `emitIterationCountCheck`.
409+
static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
410+
// Likelihood of bypassing the vectorized loop because of zero trips necessary.
411+
// See `emitMinimumVectorEpilogueIterCountCheck`.
412+
static constexpr uint32_t EpilogueMinItersBypassWeights[] = {1, 127};
413+
399414
/// A helper function that returns true if the given type is irregular. The
400415
/// type is irregular if its allocated size doesn't equal the store size of an
401416
/// element of the corresponding vector type.
@@ -1962,12 +1977,14 @@ class GeneratedRTChecks {
19621977
SCEVExpander MemCheckExp;
19631978

19641979
bool CostTooHigh = false;
1980+
const bool AddBranchWeights;
19651981

19661982
public:
19671983
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1968-
TargetTransformInfo *TTI, const DataLayout &DL)
1984+
TargetTransformInfo *TTI, const DataLayout &DL,
1985+
bool AddBranchWeights)
19691986
: DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1970-
MemCheckExp(SE, DL, "scev.check") {}
1987+
MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {}
19711988

19721989
/// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
19731990
/// accurately estimate the cost of the runtime checks. The blocks are
@@ -2160,8 +2177,10 @@ class GeneratedRTChecks {
21602177
DT->addNewBlock(SCEVCheckBlock, Pred);
21612178
DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
21622179

2163-
ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2164-
BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2180+
BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
2181+
if (AddBranchWeights)
2182+
setBranchWeights(BI, SCEVCheckBypassWeights);
2183+
ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
21652184
return SCEVCheckBlock;
21662185
}
21672186

@@ -2185,9 +2204,12 @@ class GeneratedRTChecks {
21852204
if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
21862205
PL->addBasicBlockToLoop(MemCheckBlock, *LI);
21872206

2188-
ReplaceInstWithInst(
2189-
MemCheckBlock->getTerminator(),
2190-
BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2207+
BranchInst &BI =
2208+
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2209+
if (AddBranchWeights) {
2210+
setBranchWeights(BI, MemCheckBypassWeights);
2211+
}
2212+
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
21912213
MemCheckBlock->getTerminator()->setDebugLoc(
21922214
Pred->getTerminator()->getDebugLoc());
21932215

@@ -2900,9 +2922,11 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
29002922
// dominator of the exit blocks.
29012923
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
29022924

2903-
ReplaceInstWithInst(
2904-
TCCheckBlock->getTerminator(),
2905-
BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2925+
BranchInst &BI =
2926+
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
2927+
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
2928+
setBranchWeights(BI, MinItersBypassWeights);
2929+
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
29062930
LoopBypassBlocks.push_back(TCCheckBlock);
29072931
}
29082932

@@ -3133,7 +3157,16 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
31333157
IRBuilder<> B(LoopMiddleBlock->getTerminator());
31343158
B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
31353159
Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
3136-
cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3160+
BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
3161+
BI.setCondition(CmpN);
3162+
if (hasBranchWeightMD(*ScalarLatchTerm)) {
3163+
// Assume that `Count % (UF * VF)` is equally distributed.
3164+
unsigned TripCount = UF * VF.getKnownMinValue();
3165+
assert(TripCount > 0 && "trip count should not be zero");
3166+
MDBuilder MDB(ScalarLatchTerm->getContext());
3167+
MDNode *BranchWeights = MDB.createBranchWeights(1, TripCount - 1);
3168+
BI.setMetadata(LLVMContext::MD_prof, BranchWeights);
3169+
}
31373170
}
31383171

31393172
#ifdef EXPENSIVE_CHECKS
@@ -7896,9 +7929,11 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
78967929
EPI.TripCount = Count;
78977930
}
78987931

7899-
ReplaceInstWithInst(
7900-
TCCheckBlock->getTerminator(),
7901-
BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7932+
BranchInst &BI =
7933+
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7934+
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
7935+
setBranchWeights(BI, MinItersBypassWeights);
7936+
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
79027937

79037938
return TCCheckBlock;
79047939
}
@@ -8042,9 +8077,11 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
80428077
EPI.EpilogueVF, EPI.EpilogueUF),
80438078
"min.epilog.iters.check");
80448079

8045-
ReplaceInstWithInst(
8046-
Insert->getTerminator(),
8047-
BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8080+
BranchInst &BI =
8081+
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
8082+
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
8083+
setBranchWeights(BI, EpilogueMinItersBypassWeights);
8084+
ReplaceInstWithInst(Insert->getTerminator(), &BI);
80488085

80498086
LoopBypassBlocks.push_back(Insert);
80508087
return Insert;
@@ -9731,8 +9768,10 @@ static bool processLoopInVPlanNativePath(
97319768
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
97329769

97339770
{
9771+
bool AddBranchWeights =
9772+
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
97349773
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9735-
F->getParent()->getDataLayout());
9774+
F->getParent()->getDataLayout(), AddBranchWeights);
97369775
InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
97379776
VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
97389777
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
@@ -10077,8 +10116,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1007710116
VectorizationFactor VF = VectorizationFactor::Disabled();
1007810117
unsigned IC = 1;
1007910118

10119+
bool AddBranchWeights =
10120+
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
1008010121
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10081-
F->getParent()->getDataLayout());
10122+
F->getParent()->getDataLayout(), AddBranchWeights);
1008210123
if (MaybeVF) {
1008310124
VF = *MaybeVF;
1008410125
// Select the interleave count.
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
; RUN: opt < %s -S -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 | FileCheck %s
2+
3+
; CHECK-LABEL: @f0(
4+
;
5+
; CHECK: entry:
6+
; CHECK: br i1 %cmp.entry, label %iter.check, label %exit, !prof [[PROF_F0_ENTRY:![0-9]+]]
7+
;
8+
; CHECK: iter.check:
9+
; CHECK: br i1 %min.iters.check, label %vec.epilog.scalar.ph, label %vector.scevcheck, !prof [[PROF_F0_UNLIKELY:![0-9]+]]
10+
;
11+
; CHECK: vector.scevcheck:
12+
; CHECK: br i1 %4, label %vec.epilog.scalar.ph, label %vector.main.loop.iter.check, !prof [[PROF_F0_UNLIKELY]]
13+
;
14+
; CHECK: vector.main.loop.iter.check:
15+
; CHECK: br i1 %min.iters.check1, label %vec.epilog.ph, label %vector.ph, !prof [[PROF_F0_UNLIKELY]]
16+
;
17+
; CHECK: vector.ph:
18+
; CHECK: br label %vector.body
19+
;
20+
; CHECK: vector.body:
21+
; CHECK: br i1 %8, label %middle.block, label %vector.body, !prof [[PROF_F0_VECTOR_BODY:![0-9]+]]
22+
;
23+
; CHECK: middle.block:
24+
; CHECK: br i1 %cmp.n, label %exit.loopexit, label %vec.epilog.iter.check, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
25+
;
26+
; CHECK: vec.epilog.iter.check:
27+
; CHECK: br i1 %min.epilog.iters.check, label %vec.epilog.scalar.ph, label %vec.epilog.ph, !prof [[PROF_F0_UNLIKELY]]
28+
;
29+
; CHECK: vec.epilog.ph:
30+
; CHECK: br label %vec.epilog.vector.body
31+
;
32+
; CHECK: vec.epilog.vector.body:
33+
; CHECK: br i1 %12, label %vec.epilog.middle.block, label %vec.epilog.vector.body, !prof [[PROF_F0_VEC_EPILOG_VECTOR_BODY:![0-9]+]]
34+
;
35+
; CHECK: vec.epilog.middle.block:
36+
; CHECK: br i1 %cmp.n7, label %exit.loopexit, label %vec.epilog.scalar.ph, !prof [[PROF_F0_MIDDLE_BLOCKS:![0-9]+]]
37+
;
38+
; CHECK: vec.epilog.scalar.ph:
39+
; CHECK: br label %loop
40+
;
41+
; CHECK: loop:
42+
; CHECK: br i1 %cmp.loop, label %loop, label %exit.loopexit, !prof [[PROF_F0_LOOP:![0-9]+]]
43+
;
44+
; CHECK: exit.loopexit:
45+
; CHECK: br label %exit
46+
;
47+
; CHECK: exit:
48+
; CHECK: ret void
49+
50+
define void @f0(i8 %n, i32 %len, ptr %p) !prof !0 {
51+
entry:
52+
%cmp.entry = icmp sgt i32 %len, 0
53+
br i1 %cmp.entry, label %loop, label %exit, !prof !1
54+
55+
loop:
56+
%i8 = phi i8 [0, %entry], [%i8.inc, %loop]
57+
%i32 = phi i32 [0, %entry], [%i32.inc, %loop]
58+
59+
%ptr = getelementptr inbounds i32, ptr %p, i8 %i8
60+
store i32 %i32, ptr %ptr
61+
62+
%i8.inc = add i8 %i8, 1
63+
%i32.inc = add i32 %i32, 1
64+
65+
%cmp.loop = icmp ult i32 %i32, %len
66+
br i1 %cmp.loop, label %loop, label %exit, !prof !2
67+
68+
exit:
69+
ret void
70+
}
71+
72+
!0 = !{!"function_entry_count", i64 13}
73+
!1 = !{!"branch_weights", i32 12, i32 1}
74+
!2 = !{!"branch_weights", i32 1234, i32 1}
75+
76+
; CHECK: [[PROF_F0_ENTRY]] = !{!"branch_weights", i32 12, i32 1}
77+
; CHECK: [[PROF_F0_UNLIKELY]] = !{!"branch_weights", i32 1, i32 127}
78+
; CEHCK: [[PROF_F0_VECTOR_BODY]] = !{!"branch_weights", i32 1, i32 307}
79+
; CHECK: [[PROF_F0_MIDDLE_BLOCKS]] = !{!"branch_weights", i32 1, i32 3}
80+
; CHECK: [[PROF_F0_VEC_EPILOG_VECTOR_BODY]] = !{!"branch_weights", i32 0, i32 0}
81+
; CEHCK: [[PROF_F0_LOOP]] = !{!"branch_weights", i32 2, i32 1}

0 commit comments

Comments
 (0)