Skip to content

Commit c373a1f

Browse files
authored
LoopRotationUtils: Special case zero-branch weight cases (#66681)
The formula I added to LoopRotationUtils does not produce reasonable results if some of the branch weights are zero. Add special case handling for this. This fixes #66675.
1 parent a5a008f commit c373a1f

File tree

2 files changed

+149
-23
lines changed

2 files changed

+149
-23
lines changed

llvm/lib/Transforms/Utils/LoopRotationUtils.cpp

Lines changed: 52 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -295,33 +295,62 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
295295
// We cannot generally deduce how often we had a zero-trip count loop so we
296296
// have to make a guess for how to distribute x among the new x0 and x1.
297297

298-
uint32_t ExitWeight0 = 0; // aka x0
299-
if (HasConditionalPreHeader) {
300-
// Here we cannot know how many 0-trip count loops we have, so we guess:
301-
if (OrigLoopBackedgeWeight > OrigLoopExitWeight) {
302-
// If the loop count is bigger than the exit count then we set
303-
// probabilities as if 0-trip count nearly never happens.
304-
ExitWeight0 = ZeroTripCountWeights[0];
305-
// Scale up counts if necessary so we can match `ZeroTripCountWeights` for
306-
// the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1` ratio`) ratio.
307-
while (OrigLoopExitWeight < ZeroTripCountWeights[1] + ExitWeight0) {
308-
// ... but don't overflow.
309-
uint32_t const HighBit = uint32_t{1} << (sizeof(uint32_t) * 8 - 1);
310-
if ((OrigLoopBackedgeWeight & HighBit) != 0 ||
311-
(OrigLoopExitWeight & HighBit) != 0)
312-
break;
313-
OrigLoopBackedgeWeight <<= 1;
314-
OrigLoopExitWeight <<= 1;
298+
uint32_t ExitWeight0; // aka x0
299+
uint32_t ExitWeight1; // aka x1
300+
uint32_t EnterWeight; // aka y0
301+
uint32_t LoopBackWeight; // aka y1
302+
if (OrigLoopExitWeight > 0 && OrigLoopBackedgeWeight > 0) {
303+
ExitWeight0 = 0;
304+
if (HasConditionalPreHeader) {
305+
// Here we cannot know how many 0-trip count loops we have, so we guess:
306+
if (OrigLoopBackedgeWeight >= OrigLoopExitWeight) {
307+
// If the loop count is at least as big as the exit count then we set
308+
// probabilities as if 0-trip count nearly never happens.
309+
ExitWeight0 = ZeroTripCountWeights[0];
310+
// Scale up counts if necessary so we can match `ZeroTripCountWeights`
311+
// for the `ExitWeight0`:`ExitWeight1` (aka `x0`:`x1`) ratio.
312+
while (OrigLoopExitWeight < ZeroTripCountWeights[1] + ExitWeight0) {
313+
// ... but don't overflow.
314+
uint32_t const HighBit = uint32_t{1} << (sizeof(uint32_t) * 8 - 1);
315+
if ((OrigLoopBackedgeWeight & HighBit) != 0 ||
316+
(OrigLoopExitWeight & HighBit) != 0)
317+
break;
318+
OrigLoopBackedgeWeight <<= 1;
319+
OrigLoopExitWeight <<= 1;
320+
}
321+
} else {
322+
// If there's a higher exit-count than backedge-count then we set
323+
// probabilities as if there are only 0-trip and 1-trip cases.
324+
ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight;
315325
}
326+
}
327+
ExitWeight1 = OrigLoopExitWeight - ExitWeight0;
328+
EnterWeight = ExitWeight1;
329+
LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight;
330+
} else if (OrigLoopExitWeight == 0) {
331+
if (OrigLoopBackedgeWeight == 0) {
332+
// degenerate case... keep everything zero...
333+
ExitWeight0 = 0;
334+
ExitWeight1 = 0;
335+
EnterWeight = 0;
336+
LoopBackWeight = 0;
316337
} else {
317-
// If there's a higher exit-count than backedge-count then we set
318-
// probabilities as if there are only 0-trip and 1-trip cases.
319-
ExitWeight0 = OrigLoopExitWeight - OrigLoopBackedgeWeight;
338+
// Special case "LoopExitWeight == 0" weights, which behave like an
339+
// endless loop where we don't want loop-entry (y0) to be the same as
340+
// loop-exit (x1).
341+
ExitWeight0 = 0;
342+
ExitWeight1 = 0;
343+
EnterWeight = 1;
344+
LoopBackWeight = OrigLoopBackedgeWeight;
320345
}
346+
} else {
347+
// loop is never entered.
348+
assert(OrigLoopBackedgeWeight == 0 && "remaining case is backedge zero");
349+
ExitWeight0 = 1;
350+
ExitWeight1 = 1;
351+
EnterWeight = 0;
352+
LoopBackWeight = 0;
321353
}
322-
uint32_t ExitWeight1 = OrigLoopExitWeight - ExitWeight0; // aka x1
323-
uint32_t EnterWeight = ExitWeight1; // aka y0
324-
uint32_t LoopBackWeight = OrigLoopBackedgeWeight - EnterWeight; // aka y1
325354

326355
MDBuilder MDB(LoopBI.getContext());
327356
MDNode *LoopWeightMD =

llvm/test/Transforms/LoopRotate/update-branch-weights.ll

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
; BFI_AFTER: - inner_loop_exit: {{.*}} count = 1000
2424
; BFI_AFTER: - outer_loop_exit: {{.*}} count = 1
2525

26+
; IR-LABEL: define void @func0
2627
; IR: inner_loop_body:
2728
; IR: br i1 %cmp1, label %inner_loop_body, label %inner_loop_exit, !prof [[PROF_FUNC0_0:![0-9]+]]
2829
; IR: inner_loop_exit:
@@ -74,6 +75,7 @@ outer_loop_exit:
7475
; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
7576
; BFI_AFTER: - loop_exit: {{.*}} count = 1024
7677

78+
; IR-LABEL: define void @func1
7779
; IR: entry:
7880
; IR: br i1 %cmp1, label %loop_body.lr.ph, label %loop_exit, !prof [[PROF_FUNC1_0:![0-9]+]]
7981

@@ -114,6 +116,7 @@ loop_exit:
114116
; - loop_header.loop_exit_crit_edge: {{.*}} count = 32
115117
; - loop_exit: {{.*}} count = 1024
116118

119+
; IR-LABEL: define void @func2
117120
; IR: entry:
118121
; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC2_0:![0-9]+]]
119122

@@ -141,16 +144,110 @@ loop_exit:
141144
ret void
142145
}
143146

147+
; BFI_BEFORE-LABEL: block-frequency-info: func3_zero_branch_weight
148+
; BFI_BEFORE: - entry: {{.*}} count = 1024
149+
; BFI_BEFORE: - loop_header: {{.*}} count = 2199023255296
150+
; BFI_BEFORE: - loop_body: {{.*}} count = 2199023254272
151+
; BFI_BEFORE: - loop_exit: {{.*}} count = 1024
152+
153+
; BFI_AFTER-LABEL: block-frequency-info: func3_zero_branch_weight
154+
; BFI_AFTER: - entry: {{.*}} count = 1024
155+
; BFI_AFTER: - loop_body.lr.ph: {{.*}} count = 1024
156+
; BFI_AFTER: - loop_body: {{.*}} count = 2199023255296
157+
; BFI_AFTER: - loop_header.loop_exit_crit_edge: {{.*}} count = 1024
158+
; BFI_AFTER: - loop_exit: {{.*}} count = 1024
159+
160+
; IR-LABEL: define void @func3_zero_branch_weight
161+
; IR: entry:
162+
; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC3_0:![0-9]+]]
163+
164+
; IR: loop_body:
165+
; IR: br i1 %cmp, label %loop_header.loop_exit_crit_edge, label %loop_body, !prof [[PROF_FUNC3_0]]
166+
167+
define void @func3_zero_branch_weight(i32 %n) !prof !3 {
168+
entry:
169+
br label %loop_header
170+
171+
loop_header:
172+
%i = phi i32 [0, %entry], [%i_inc, %loop_body]
173+
%cmp = icmp slt i32 %i, %n
174+
br i1 %cmp, label %loop_exit, label %loop_body, !prof !6
175+
176+
loop_body:
177+
store volatile i32 %i, ptr @g, align 4
178+
%i_inc = add i32 %i, 1
179+
br label %loop_header
180+
181+
loop_exit:
182+
ret void
183+
}
184+
185+
; IR-LABEL: define void @func4_zero_branch_weight
186+
; IR: entry:
187+
; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC4_0:![0-9]+]]
188+
189+
; IR: loop_body:
190+
; IR: br i1 %cmp, label %loop_header.loop_exit_crit_edge, label %loop_body, !prof [[PROF_FUNC4_0]]
191+
192+
define void @func4_zero_branch_weight(i32 %n) !prof !3 {
193+
entry:
194+
br label %loop_header
195+
196+
loop_header:
197+
%i = phi i32 [0, %entry], [%i_inc, %loop_body]
198+
%cmp = icmp slt i32 %i, %n
199+
br i1 %cmp, label %loop_exit, label %loop_body, !prof !7
200+
201+
loop_body:
202+
store volatile i32 %i, ptr @g, align 4
203+
%i_inc = add i32 %i, 1
204+
br label %loop_header
205+
206+
loop_exit:
207+
ret void
208+
}
209+
210+
; IR-LABEL: define void @func5_zero_branch_weight
211+
; IR: entry:
212+
; IR: br i1 %cmp1, label %loop_exit, label %loop_body.lr.ph, !prof [[PROF_FUNC5_0:![0-9]+]]
213+
214+
; IR: loop_body:
215+
; IR: br i1 %cmp, label %loop_header.loop_exit_crit_edge, label %loop_body, !prof [[PROF_FUNC5_0]]
216+
217+
define void @func5_zero_branch_weight(i32 %n) !prof !3 {
218+
entry:
219+
br label %loop_header
220+
221+
loop_header:
222+
%i = phi i32 [0, %entry], [%i_inc, %loop_body]
223+
%cmp = icmp slt i32 %i, %n
224+
br i1 %cmp, label %loop_exit, label %loop_body, !prof !8
225+
226+
loop_body:
227+
store volatile i32 %i, ptr @g, align 4
228+
%i_inc = add i32 %i, 1
229+
br label %loop_header
230+
231+
loop_exit:
232+
ret void
233+
}
234+
144235
!0 = !{!"function_entry_count", i64 1}
145236
!1 = !{!"branch_weights", i32 1000, i32 1}
146237
!2 = !{!"branch_weights", i32 3000, i32 1000}
147238
!3 = !{!"function_entry_count", i64 1024}
148239
!4 = !{!"branch_weights", i32 40, i32 2}
149240
!5 = !{!"branch_weights", i32 10240, i32 320}
241+
!6 = !{!"branch_weights", i32 0, i32 1}
242+
!7 = !{!"branch_weights", i32 1, i32 0}
243+
!8 = !{!"branch_weights", i32 0, i32 0}
150244

151245
; IR: [[PROF_FUNC0_0]] = !{!"branch_weights", i32 2000, i32 1000}
152246
; IR: [[PROF_FUNC0_1]] = !{!"branch_weights", i32 999, i32 1}
153247
; IR: [[PROF_FUNC1_0]] = !{!"branch_weights", i32 127, i32 1}
154248
; IR: [[PROF_FUNC1_1]] = !{!"branch_weights", i32 2433, i32 127}
155249
; IR: [[PROF_FUNC2_0]] = !{!"branch_weights", i32 9920, i32 320}
156250
; IR: [[PROF_FUNC2_1]] = !{!"branch_weights", i32 320, i32 0}
251+
; IR: [[PROF_FUNC3_0]] = !{!"branch_weights", i32 0, i32 1}
252+
; IR: [[PROF_FUNC4_0]] = !{!"branch_weights", i32 1, i32 0}
253+
; IR: [[PROF_FUNC5_0]] = !{!"branch_weights", i32 0, i32 0}

0 commit comments

Comments
 (0)