Skip to content

Commit 1db8591

Browse files
committed
[LV] Adding/modifying pre-commit tests for changing loop interleaving count computation
Added/modified tests for evaluating changes to loop interleaving count computation in (llvm#73766). The new set of tests addresses the change in IC computation, which minimizes the remainder TC of the vectorized loop while maximizing the IC when the remainder TC is the same.
1 parent 6a4489a commit 1db8591

File tree

1 file changed

+179
-10
lines changed

1 file changed

+179
-10
lines changed

llvm/test/Transforms/LoopVectorize/AArch64/interleave_count.ll

Lines changed: 179 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
1-
; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
1+
; RUN: opt < %s -tiny-trip-count-interleave-threshold=16 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
22
; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
33

44
target triple = "aarch64-linux-gnu"
55

66
%pair = type { i8, i8 }
77

88
; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
9-
; IC 2 since there is no remainder loop run needed when the vector loop runs.
9+
; IC 2 since there is no remainder loop run needed after the vector loop runs.
1010
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
1111
define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
1212
entry:
@@ -29,8 +29,8 @@ for.end:
2929
ret void
3030
}
3131

32-
; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
33-
; IC 1 since there may be a remainder loop that needs to run after the vector loop.
32+
; For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
33+
; IC 2 since only a small remainder loop (TC of 1) needs to run after the vector loop.
3434
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
3535
define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
3636
entry:
@@ -53,9 +53,104 @@ for.end:
5353
ret void
5454
}
5555

56-
; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
57-
; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
58-
; won't need to run
56+
; For this loop with known TC of 39, when the auto-vectorizer chooses VF 16, it should choose
57+
; IC 2 since there is a small remainder loop that needs to run after the vector loop.
58+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
59+
define void @loop_with_tc_39(ptr noalias %p, ptr noalias %q) { ; q[i] = p[i].0 + p[i].1 for i in [0, 39)
60+
entry:
61+
br label %for.body
62+
63+
for.body:
64+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
65+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the i-th %pair in %p
66+
%tmp1 = load i8, ptr %tmp0, align 1
67+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
68+
%tmp3 = load i8, ptr %tmp2, align 1
69+
%add = add i8 %tmp1, %tmp3 ; sum of the two i8 fields
70+
%qi = getelementptr i8, ptr %q, i64 %i ; address of the i-th byte of %q
71+
store i8 %add, ptr %qi, align 1
72+
%i.next = add nuw nsw i64 %i, 1
73+
%cond = icmp eq i64 %i.next, 39 ; constant bound: the body executes exactly 39 times
74+
br i1 %cond, label %for.end, label %for.body
75+
76+
for.end:
77+
ret void
78+
}
79+
80+
; TODO: For this loop with known TC of 48, when the auto-vectorizer chooses VF 16, it should choose
81+
; IC 1 since there will be no remainder loop that needs to run after the vector loop.
82+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
83+
define void @loop_with_tc_48(ptr noalias %p, ptr noalias %q) { ; q[i] = p[i].0 + p[i].1 for i in [0, 48)
84+
entry:
85+
br label %for.body
86+
87+
for.body:
88+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
89+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the i-th %pair in %p
90+
%tmp1 = load i8, ptr %tmp0, align 1
91+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
92+
%tmp3 = load i8, ptr %tmp2, align 1
93+
%add = add i8 %tmp1, %tmp3 ; sum of the two i8 fields
94+
%qi = getelementptr i8, ptr %q, i64 %i ; address of the i-th byte of %q
95+
store i8 %add, ptr %qi, align 1
96+
%i.next = add nuw nsw i64 %i, 1
97+
%cond = icmp eq i64 %i.next, 48 ; constant bound: the body executes exactly 48 times
98+
br i1 %cond, label %for.end, label %for.body
99+
100+
for.end:
101+
ret void
102+
}
103+
104+
; TODO: For this loop with known TC of 49, when the auto-vectorizer chooses VF 16, it should choose
105+
; IC 1 since a remainder loop TC of 1 is more efficient than remainder loop TC of 17 with IC 2
106+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
107+
define void @loop_with_tc_49(ptr noalias %p, ptr noalias %q) { ; q[i] = p[i].0 + p[i].1 for i in [0, 49)
108+
entry:
109+
br label %for.body
110+
111+
for.body:
112+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
113+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the i-th %pair in %p
114+
%tmp1 = load i8, ptr %tmp0, align 1
115+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
116+
%tmp3 = load i8, ptr %tmp2, align 1
117+
%add = add i8 %tmp1, %tmp3 ; sum of the two i8 fields
118+
%qi = getelementptr i8, ptr %q, i64 %i ; address of the i-th byte of %q
119+
store i8 %add, ptr %qi, align 1
120+
%i.next = add nuw nsw i64 %i, 1
121+
%cond = icmp eq i64 %i.next, 49 ; constant bound: the body executes exactly 49 times
122+
br i1 %cond, label %for.end, label %for.body
123+
124+
for.end:
125+
ret void
126+
}
127+
128+
; TODO: For this loop with known TC of 55, when the auto-vectorizer chooses VF 16, it should choose
129+
; IC 1 since a remainder loop TC of 7 is more efficient than remainder loop TC of 23 with IC 2
130+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
131+
define void @loop_with_tc_55(ptr noalias %p, ptr noalias %q) { ; q[i] = p[i].0 + p[i].1 for i in [0, 55)
132+
entry:
133+
br label %for.body
134+
135+
for.body:
136+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
137+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the i-th %pair in %p
138+
%tmp1 = load i8, ptr %tmp0, align 1
139+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
140+
%tmp3 = load i8, ptr %tmp2, align 1
141+
%add = add i8 %tmp1, %tmp3 ; sum of the two i8 fields
142+
%qi = getelementptr i8, ptr %q, i64 %i ; address of the i-th byte of %q
143+
store i8 %add, ptr %qi, align 1
144+
%i.next = add nuw nsw i64 %i, 1
145+
%cond = icmp eq i64 %i.next, 55 ; constant bound: the body executes exactly 55 times
146+
br i1 %cond, label %for.end, label %for.body
147+
148+
for.end:
149+
ret void
150+
}
151+
152+
; TODO: For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
153+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
59154
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
60155
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
61156
entry:
@@ -78,9 +173,8 @@ for.end:
78173
ret void
79174
}
80175

81-
; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33,
82-
; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the
83-
; remainder loop will need to run
176+
; TODO: For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
177+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
84178
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
85179
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
86180
entry:
@@ -103,5 +197,80 @@ for.end:
103197
ret void
104198
}
105199

200+
; TODO: For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
201+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
202+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
203+
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) { ; q[i] = p[i].0 + p[i].1, bound %n known only at run time
204+
entry:
205+
br label %for.body
206+
207+
for.body:
208+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
209+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the i-th %pair in %p
210+
%tmp1 = load i8, ptr %tmp0, align 1
211+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
212+
%tmp3 = load i8, ptr %tmp2, align 1
213+
%add = add i8 %tmp1, %tmp3 ; sum of the two i8 fields
214+
%qi = getelementptr i8, ptr %q, i64 %i ; address of the i-th byte of %q
215+
store i8 %add, ptr %qi, align 1
216+
%i.next = add nuw nsw i64 %i, 1
217+
%cond = icmp eq i64 %i.next, %n ; exit once the incremented index equals %n
218+
br i1 %cond, label %for.end, label %for.body, !prof !2 ; !2 weights: 1 for the exit edge, 47 for the back edge
219+
220+
for.end:
221+
ret void
222+
}
223+
224+
; TODO: For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
225+
; it should conservatively choose IC 1 so that the vector loop runs at least twice
226+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
227+
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) { ; q[i] = p[i].0 + p[i].1, bound %n known only at run time
228+
entry:
229+
br label %for.body
230+
231+
for.body:
232+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
233+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the i-th %pair in %p
234+
%tmp1 = load i8, ptr %tmp0, align 1
235+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
236+
%tmp3 = load i8, ptr %tmp2, align 1
237+
%add = add i8 %tmp1, %tmp3 ; sum of the two i8 fields
238+
%qi = getelementptr i8, ptr %q, i64 %i ; address of the i-th byte of %q
239+
store i8 %add, ptr %qi, align 1
240+
%i.next = add nuw nsw i64 %i, 1
241+
%cond = icmp eq i64 %i.next, %n ; exit once the incremented index equals %n
242+
br i1 %cond, label %for.end, label %for.body, !prof !3 ; !3 weights: 1 for the exit edge, 62 for the back edge
243+
244+
for.end:
245+
ret void
246+
}
247+
248+
; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
249+
; it should conservatively choose IC 2 so that the vector loop runs at least twice
250+
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
251+
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) { ; q[i] = p[i].0 + p[i].1, bound %n known only at run time
252+
entry:
253+
br label %for.body
254+
255+
for.body:
256+
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] ; induction variable, starts at 0
257+
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0 ; address of field 0 of the i-th %pair in %p
258+
%tmp1 = load i8, ptr %tmp0, align 1
259+
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1 ; address of field 1 of the same %pair
260+
%tmp3 = load i8, ptr %tmp2, align 1
261+
%add = add i8 %tmp1, %tmp3 ; sum of the two i8 fields
262+
%qi = getelementptr i8, ptr %q, i64 %i ; address of the i-th byte of %q
263+
store i8 %add, ptr %qi, align 1
264+
%i.next = add nuw nsw i64 %i, 1
265+
%cond = icmp eq i64 %i.next, %n ; exit once the incremented index equals %n
266+
br i1 %cond, label %for.end, label %for.body, !prof !4 ; !4 weights: 1 for the exit edge, 63 for the back edge
267+
268+
for.end:
269+
ret void
270+
}
271+
106272
!0 = !{!"branch_weights", i32 1, i32 31}
107273
!1 = !{!"branch_weights", i32 1, i32 32}
274+
!2 = !{!"branch_weights", i32 1, i32 47} ; attached to the loop branch of @loop_with_profile_tc_48
275+
!3 = !{!"branch_weights", i32 1, i32 62} ; attached to the loop branch of @loop_with_profile_tc_63
276+
!4 = !{!"branch_weights", i32 1, i32 63} ; attached to the loop branch of @loop_with_profile_tc_64

0 commit comments

Comments
 (0)