Skip to content

Commit b636e7d

Browse files
committed
[NFC][PhaseOrdering] Add a test demonstrating pitfalls of common code hoisting on loop rotation
Depending on the value of -rotation-max-header-size, hoisting common code early can make loop rotation impossible.
1 parent 5408024 commit b636e7d

File tree

1 file changed

+224
-0
lines changed

1 file changed

+224
-0
lines changed
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2+
; RUN: opt -O3 -rotation-max-header-size=0 -S < %s | FileCheck %s --check-prefixes=HOIST,THR0,FALLBACK0
3+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=0 -S < %s | FileCheck %s --check-prefixes=HOIST,THR0,FALLBACK1
4+
5+
; RUN: opt -O3 -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK2
6+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=1 -S < %s | FileCheck %s --check-prefixes=HOIST,THR1,FALLBACK3
7+
8+
; RUN: opt -O3 -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK4
9+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=2 -S < %s | FileCheck %s --check-prefixes=HOIST,THR2,FALLBACK5
10+
11+
; RUN: opt -O3 -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_OLDPM,FALLBACK6
12+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=3 -S < %s | FileCheck %s --check-prefixes=ROTATED_LATER,ROTATED_LATER_NEWPM,FALLBACK7
13+
14+
; RUN: opt -O3 -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_OLDPM,FALLBACK8
15+
; RUN: opt -passes='default<O3>' -rotation-max-header-size=4 -S < %s | FileCheck %s --check-prefixes=ROTATE,ROTATE_NEWPM,FALLBACK9
16+
17+
; This example is produced from very basic C code:
18+
;
19+
; void f0();
20+
; void f1();
21+
; void f2();
22+
;
23+
; void loop(int width) {
24+
; if(width < 1)
25+
; return;
26+
; for(int i = 0; i < width - 1; ++i) {
27+
; f0();
28+
; f1();
29+
; }
30+
; f0();
31+
; f2();
32+
; }
33+
34+
; We have a choice here. We can either
35+
; * hoist the f0() call into the loop header,
36+
; * which potentially makes loop rotation unprofitable since the loop header might
37+
; have grown above a certain threshold, and such unrotated loops will be
38+
; ignored by LoopVectorizer, preventing vectorization
39+
; * or loop rotation will succeed, resulting in some weird PHIs that will also
40+
; harm vectorization
41+
; * or not hoist the f0() call before performing loop rotation,
42+
; at the cost of potential code bloat and/or potentially successfully rotating
43+
; the loops, vectorizing them at the cost of compile time.
44+
45+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
46+
47+
; External functions called by @_Z4loopi; only declared here.
declare void @f0()
48+
declare void @f1()
49+
declare void @f2()
50+
51+
; Lifetime intrinsics that @_Z4loopi uses to bracket its 4-byte stack slot %i.
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
52+
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
53+
54+
; Test input: the unoptimized IR of the C function `loop(int width)` quoted in
; the comment near the top of this file.
; Per the RUN lines, the HOIST checks cover -rotation-max-header-size=0..2,
; the ROTATED_LATER_* checks cover size 3, and the ROTATE_* checks cover size 4.
; The *_OLDPM / *_NEWPM variants belong to the `-O3` and
; `-passes='default<O3>'` invocations respectively.
; The CHECK lines are autogenerated by utils/update_test_checks.py; regenerate
; them with that script rather than editing them by hand.
define void @_Z4loopi(i32 %width) {
55+
; HOIST-LABEL: @_Z4loopi(
56+
; HOIST-NEXT: entry:
57+
; HOIST-NEXT: [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
58+
; HOIST-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
59+
; HOIST: for.cond.preheader:
60+
; HOIST-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
61+
; HOIST-NEXT: br label [[FOR_COND:%.*]]
62+
; HOIST: for.cond:
63+
; HOIST-NEXT: [[I_0:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY:%.*]] ], [ 0, [[FOR_COND_PREHEADER]] ]
64+
; HOIST-NEXT: tail call void @f0()
65+
; HOIST-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[I_0]], [[TMP0]]
66+
; HOIST-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
67+
; HOIST: for.cond.cleanup:
68+
; HOIST-NEXT: tail call void @f2()
69+
; HOIST-NEXT: br label [[RETURN]]
70+
; HOIST: for.body:
71+
; HOIST-NEXT: tail call void @f1()
72+
; HOIST-NEXT: [[INC]] = add nuw i32 [[I_0]], 1
73+
; HOIST-NEXT: br label [[FOR_COND]]
74+
; HOIST: return:
75+
; HOIST-NEXT: ret void
76+
;
77+
; ROTATED_LATER_OLDPM-LABEL: @_Z4loopi(
78+
; ROTATED_LATER_OLDPM-NEXT: entry:
79+
; ROTATED_LATER_OLDPM-NEXT: [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
80+
; ROTATED_LATER_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
81+
; ROTATED_LATER_OLDPM: for.cond.preheader:
82+
; ROTATED_LATER_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
83+
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
84+
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
85+
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]]
86+
; ROTATED_LATER_OLDPM: for.cond.cleanup:
87+
; ROTATED_LATER_OLDPM-NEXT: tail call void @f2()
88+
; ROTATED_LATER_OLDPM-NEXT: br label [[RETURN]]
89+
; ROTATED_LATER_OLDPM: for.body:
90+
; ROTATED_LATER_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_COND_PREHEADER]] ]
91+
; ROTATED_LATER_OLDPM-NEXT: tail call void @f1()
92+
; ROTATED_LATER_OLDPM-NEXT: [[INC]] = add nuw i32 [[I_04]], 1
93+
; ROTATED_LATER_OLDPM-NEXT: tail call void @f0()
94+
; ROTATED_LATER_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
95+
; ROTATED_LATER_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
96+
; ROTATED_LATER_OLDPM: return:
97+
; ROTATED_LATER_OLDPM-NEXT: ret void
98+
;
99+
; ROTATED_LATER_NEWPM-LABEL: @_Z4loopi(
100+
; ROTATED_LATER_NEWPM-NEXT: entry:
101+
; ROTATED_LATER_NEWPM-NEXT: [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
102+
; ROTATED_LATER_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
103+
; ROTATED_LATER_NEWPM: for.cond.preheader:
104+
; ROTATED_LATER_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
105+
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
106+
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT3:%.*]] = icmp eq i32 [[TMP0]], 0
107+
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT3]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE:%.*]]
108+
; ROTATED_LATER_NEWPM: for.cond.preheader.for.body_crit_edge:
109+
; ROTATED_LATER_NEWPM-NEXT: [[INC_1:%.*]] = add nuw i32 0, 1
110+
; ROTATED_LATER_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
111+
; ROTATED_LATER_NEWPM: for.cond.cleanup:
112+
; ROTATED_LATER_NEWPM-NEXT: tail call void @f2()
113+
; ROTATED_LATER_NEWPM-NEXT: br label [[RETURN]]
114+
; ROTATED_LATER_NEWPM: for.body:
115+
; ROTATED_LATER_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_COND_PREHEADER_FOR_BODY_CRIT_EDGE]] ]
116+
; ROTATED_LATER_NEWPM-NEXT: tail call void @f1()
117+
; ROTATED_LATER_NEWPM-NEXT: tail call void @f0()
118+
; ROTATED_LATER_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
119+
; ROTATED_LATER_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
120+
; ROTATED_LATER_NEWPM: for.body.for.body_crit_edge:
121+
; ROTATED_LATER_NEWPM-NEXT: [[INC_0]] = add nuw i32 [[INC_PHI]], 1
122+
; ROTATED_LATER_NEWPM-NEXT: br label [[FOR_BODY]]
123+
; ROTATED_LATER_NEWPM: return:
124+
; ROTATED_LATER_NEWPM-NEXT: ret void
125+
;
126+
; ROTATE_OLDPM-LABEL: @_Z4loopi(
127+
; ROTATE_OLDPM-NEXT: entry:
128+
; ROTATE_OLDPM-NEXT: [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
129+
; ROTATE_OLDPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
130+
; ROTATE_OLDPM: for.cond.preheader:
131+
; ROTATE_OLDPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
132+
; ROTATE_OLDPM-NEXT: tail call void @f0()
133+
; ROTATE_OLDPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
134+
; ROTATE_OLDPM: for.body.preheader:
135+
; ROTATE_OLDPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
136+
; ROTATE_OLDPM-NEXT: br label [[FOR_BODY:%.*]]
137+
; ROTATE_OLDPM: for.cond.cleanup:
138+
; ROTATE_OLDPM-NEXT: tail call void @f2()
139+
; ROTATE_OLDPM-NEXT: br label [[RETURN]]
140+
; ROTATE_OLDPM: for.body:
141+
; ROTATE_OLDPM-NEXT: [[I_04:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
142+
; ROTATE_OLDPM-NEXT: tail call void @f1()
143+
; ROTATE_OLDPM-NEXT: [[INC]] = add nuw nsw i32 [[I_04]], 1
144+
; ROTATE_OLDPM-NEXT: tail call void @f0()
145+
; ROTATE_OLDPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[TMP0]]
146+
; ROTATE_OLDPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]]
147+
; ROTATE_OLDPM: return:
148+
; ROTATE_OLDPM-NEXT: ret void
149+
;
150+
; ROTATE_NEWPM-LABEL: @_Z4loopi(
151+
; ROTATE_NEWPM-NEXT: entry:
152+
; ROTATE_NEWPM-NEXT: [[CMP:%.*]] = icmp slt i32 [[WIDTH:%.*]], 1
153+
; ROTATE_NEWPM-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[FOR_COND_PREHEADER:%.*]]
154+
; ROTATE_NEWPM: for.cond.preheader:
155+
; ROTATE_NEWPM-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[WIDTH]], 1
156+
; ROTATE_NEWPM-NEXT: tail call void @f0()
157+
; ROTATE_NEWPM-NEXT: br i1 [[CMP13_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
158+
; ROTATE_NEWPM: for.body.preheader:
159+
; ROTATE_NEWPM-NEXT: [[TMP0:%.*]] = add nsw i32 [[WIDTH]], -1
160+
; ROTATE_NEWPM-NEXT: [[INC_1:%.*]] = add nuw nsw i32 0, 1
161+
; ROTATE_NEWPM-NEXT: br label [[FOR_BODY:%.*]]
162+
; ROTATE_NEWPM: for.cond.cleanup:
163+
; ROTATE_NEWPM-NEXT: tail call void @f2()
164+
; ROTATE_NEWPM-NEXT: br label [[RETURN]]
165+
; ROTATE_NEWPM: for.body:
166+
; ROTATE_NEWPM-NEXT: [[INC_PHI:%.*]] = phi i32 [ [[INC_0:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ], [ [[INC_1]], [[FOR_BODY_PREHEADER]] ]
167+
; ROTATE_NEWPM-NEXT: tail call void @f1()
168+
; ROTATE_NEWPM-NEXT: tail call void @f0()
169+
; ROTATE_NEWPM-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_PHI]], [[TMP0]]
170+
; ROTATE_NEWPM-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]]
171+
; ROTATE_NEWPM: for.body.for.body_crit_edge:
172+
; ROTATE_NEWPM-NEXT: [[INC_0]] = add nuw nsw i32 [[INC_PHI]], 1
173+
; ROTATE_NEWPM-NEXT: br label [[FOR_BODY]]
174+
; ROTATE_NEWPM: return:
175+
; ROTATE_NEWPM-NEXT: ret void
176+
;
177+
entry:
; Spill %width to a stack slot, reload it, and branch to if.then (which
; goes straight to return) when width < 1; otherwise continue at if.end.
178+
%width.addr = alloca i32, align 4
179+
%i = alloca i32, align 4
180+
store i32 %width, i32* %width.addr, align 4
181+
%i1 = load i32, i32* %width.addr, align 4
182+
%cmp = icmp slt i32 %i1, 1
183+
br i1 %cmp, label %if.then, label %if.end
184+
185+
if.then:
; width < 1: nothing to do, return.
186+
br label %return
187+
188+
if.end:
; Begin the lifetime of the 4-byte slot %i and initialise i = 0.
189+
%i2 = bitcast i32* %i to i8*
190+
call void @llvm.lifetime.start.p0i8(i64 4, i8* %i2)
191+
store i32 0, i32* %i, align 4
192+
br label %for.cond
193+
194+
for.cond:
; Loop header: reload i and width, and enter the body while i < width - 1.
195+
%i3 = load i32, i32* %i, align 4
196+
%i4 = load i32, i32* %width.addr, align 4
197+
%sub = sub nsw i32 %i4, 1
198+
%cmp1 = icmp slt i32 %i3, %sub
199+
br i1 %cmp1, label %for.body, label %for.cond.cleanup
200+
201+
for.cond.cleanup:
; Loop exit: end the lifetime of %i, then continue to for.end.
202+
%i5 = bitcast i32* %i to i8*
203+
call void @llvm.lifetime.end.p0i8(i64 4, i8* %i5)
204+
br label %for.end
205+
206+
for.body:
; Loop body: f0() followed by f1().
207+
call void @f0()
208+
call void @f1()
209+
br label %for.inc
210+
211+
for.inc:
; Latch: i = i + 1, then back to the loop header.
212+
%i6 = load i32, i32* %i, align 4
213+
%inc = add nsw i32 %i6, 1
214+
store i32 %inc, i32* %i, align 4
215+
br label %for.cond
216+
217+
for.end:
; After the loop: f0() followed by f2(). Both this block and for.body start
; with a call to @f0; the HOIST checks above show that shared call placed in
; for.cond instead.
218+
call void @f0()
219+
call void @f2()
220+
br label %return
221+
222+
return:
223+
ret void
224+
}

0 commit comments

Comments
 (0)