Skip to content

Commit bae6f8f

Browse files
committed
[ARM] Add some tests for non-zero VCTP generation. NFC
See D146517.
1 parent 69a0924 commit bae6f8f

File tree

1 file changed

+291
-0
lines changed

1 file changed

+291
-0
lines changed
Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
3+
4+
; start12: masked vector loop whose induction variable starts at the constant 12.
;
; Each iteration handles four floats: z[index..] = x[index..] * y[index..] + a,
; with both loads and the store predicated by
; @llvm.get.active.lane.mask(%index, %n). The loop is entered only when %n > 0
; and exits when %index + 4 equals %n rounded up to a multiple of 4.
;
; The autogenerated assertions below expect the lane mask to be built with
; vqadd.u32 + vptt.u32 (no vctp instruction appears) and the loop to be closed
; with `le lr`, where lr is set up as 1 + ((%n.vec - 16) >> 2).
define arm_aapcs_vfpcc void @start12(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
5+
; CHECK-LABEL: start12:
6+
; CHECK: @ %bb.0: @ %entry
7+
; CHECK-NEXT: .save {r4, r5, r7, lr}
8+
; CHECK-NEXT: push {r4, r5, r7, lr}
9+
; CHECK-NEXT: cmp r3, #1
10+
; CHECK-NEXT: blt .LBB0_3
11+
; CHECK-NEXT: @ %bb.1: @ %vector.ph
12+
; CHECK-NEXT: vmov r12, s0
13+
; CHECK-NEXT: adds r4, r3, #3
14+
; CHECK-NEXT: bic r4, r4, #3
15+
; CHECK-NEXT: adr r5, .LCPI0_0
16+
; CHECK-NEXT: sub.w lr, r4, #16
17+
; CHECK-NEXT: movs r4, #1
18+
; CHECK-NEXT: adds r0, #48
19+
; CHECK-NEXT: adds r1, #48
20+
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
21+
; CHECK-NEXT: adds r2, #48
22+
; CHECK-NEXT: vldrw.u32 q0, [r5]
23+
; CHECK-NEXT: movs r4, #12
24+
; CHECK-NEXT: vdup.32 q1, r3
25+
; CHECK-NEXT: .LBB0_2: @ %vector.body
26+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
27+
; CHECK-NEXT: vqadd.u32 q2, q0, r4
28+
; CHECK-NEXT: adds r4, #4
29+
; CHECK-NEXT: vptt.u32 hi, q1, q2
30+
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
31+
; CHECK-NEXT: vldrwt.u32 q3, [r0], #16
32+
; CHECK-NEXT: vfmas.f32 q3, q2, r12
33+
; CHECK-NEXT: vpst
34+
; CHECK-NEXT: vstrwt.32 q3, [r2], #16
35+
; CHECK-NEXT: le lr, .LBB0_2
36+
; CHECK-NEXT: .LBB0_3: @ %for.cond.cleanup
37+
; CHECK-NEXT: pop {r4, r5, r7, pc}
38+
; CHECK-NEXT: .p2align 4
39+
; CHECK-NEXT: @ %bb.4:
40+
; CHECK-NEXT: .LCPI0_0:
41+
; CHECK-NEXT: .long 0 @ 0x0
42+
; CHECK-NEXT: .long 1 @ 0x1
43+
; CHECK-NEXT: .long 2 @ 0x2
44+
; CHECK-NEXT: .long 3 @ 0x3
45+
entry:
46+
; Skip the loop entirely unless %n is strictly positive (signed compare).
%cmp8 = icmp sgt i32 %n, 0
47+
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
48+
49+
vector.ph: ; preds = %entry
50+
; %n.vec = %n rounded up to the next multiple of 4 (the vector width).
%n.rnd.up = add i32 %n, 3
51+
%n.vec = and i32 %n.rnd.up, -4
52+
; Splat the scalar %a across all four lanes.
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
53+
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
54+
br label %vector.body
55+
56+
vector.body: ; preds = %vector.body, %vector.ph
57+
; The induction variable starts at 12 rather than 0; this is the point of the test.
%index = phi i32 [ 12, %vector.ph ], [ %index.next, %vector.body ]
58+
%0 = getelementptr inbounds float, ptr %x, i32 %index
59+
; Lane i of the mask is active while %index + i < %n (unsigned).
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
60+
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
61+
%2 = getelementptr inbounds float, ptr %y, i32 %index
62+
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
63+
; Per lane: x * y + a.
%3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
64+
%4 = getelementptr inbounds float, ptr %z, i32 %index
65+
call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
66+
%index.next = add i32 %index, 4
67+
; Leave the loop once the next index equals the rounded-up bound.
%5 = icmp eq i32 %index.next, %n.vec
68+
br i1 %5, label %for.cond.cleanup, label %vector.body
69+
70+
for.cond.cleanup: ; preds = %vector.body, %entry
71+
ret void
72+
}
73+
74+
75+
; start11: same masked x * y + a loop as the other tests in this file, but the
; induction variable starts at the constant 11, which is not a multiple of the
; vector width (4).
;
; Loads and the store are predicated by
; @llvm.get.active.lane.mask(%index, %n); the loop is entered only when %n > 0
; and its exit test compares %index + 4 against %n rounded up to a multiple of 4.
;
; The autogenerated assertions below expect the lane mask to be built with
; vqadd.u32 + vptt.u32 (no vctp instruction appears) and the loop to be closed
; with an ordinary `cmp lr, r4` / `bne` pair rather than `le lr`.
;
; NOTE(review): %index.next takes the values 15, 19, ... (always 3 mod 4) while
; %n.vec is a multiple of 4, so the equality exit test cannot be true as
; written. Presumably deliberate for exercising codegen; confirm.
define arm_aapcs_vfpcc void @start11(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
76+
; CHECK-LABEL: start11:
77+
; CHECK: @ %bb.0: @ %entry
78+
; CHECK-NEXT: .save {r4, r5, r7, lr}
79+
; CHECK-NEXT: push {r4, r5, r7, lr}
80+
; CHECK-NEXT: cmp r3, #1
81+
; CHECK-NEXT: blt .LBB1_3
82+
; CHECK-NEXT: @ %bb.1: @ %vector.ph
83+
; CHECK-NEXT: vmov r12, s0
84+
; CHECK-NEXT: adds r4, r3, #3
85+
; CHECK-NEXT: adr r5, .LCPI1_0
86+
; CHECK-NEXT: bic lr, r4, #3
87+
; CHECK-NEXT: adds r0, #44
88+
; CHECK-NEXT: adds r1, #44
89+
; CHECK-NEXT: adds r2, #44
90+
; CHECK-NEXT: vldrw.u32 q0, [r5]
91+
; CHECK-NEXT: movs r4, #11
92+
; CHECK-NEXT: vdup.32 q1, r3
93+
; CHECK-NEXT: .LBB1_2: @ %vector.body
94+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
95+
; CHECK-NEXT: vqadd.u32 q2, q0, r4
96+
; CHECK-NEXT: adds r4, #4
97+
; CHECK-NEXT: cmp lr, r4
98+
; CHECK-NEXT: vptt.u32 hi, q1, q2
99+
; CHECK-NEXT: vldrwt.u32 q2, [r1], #16
100+
; CHECK-NEXT: vldrwt.u32 q3, [r0], #16
101+
; CHECK-NEXT: vfmas.f32 q3, q2, r12
102+
; CHECK-NEXT: vpst
103+
; CHECK-NEXT: vstrwt.32 q3, [r2], #16
104+
; CHECK-NEXT: bne .LBB1_2
105+
; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup
106+
; CHECK-NEXT: pop {r4, r5, r7, pc}
107+
; CHECK-NEXT: .p2align 4
108+
; CHECK-NEXT: @ %bb.4:
109+
; CHECK-NEXT: .LCPI1_0:
110+
; CHECK-NEXT: .long 0 @ 0x0
111+
; CHECK-NEXT: .long 1 @ 0x1
112+
; CHECK-NEXT: .long 2 @ 0x2
113+
; CHECK-NEXT: .long 3 @ 0x3
114+
entry:
115+
; Skip the loop entirely unless %n is strictly positive (signed compare).
%cmp8 = icmp sgt i32 %n, 0
116+
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
117+
118+
vector.ph: ; preds = %entry
119+
; %n.vec = %n rounded up to the next multiple of 4 (the vector width).
%n.rnd.up = add i32 %n, 3
120+
%n.vec = and i32 %n.rnd.up, -4
121+
; Splat the scalar %a across all four lanes.
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
122+
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
123+
br label %vector.body
124+
125+
vector.body: ; preds = %vector.body, %vector.ph
126+
; The induction variable starts at 11: non-zero and not a multiple of 4.
%index = phi i32 [ 11, %vector.ph ], [ %index.next, %vector.body ]
127+
%0 = getelementptr inbounds float, ptr %x, i32 %index
128+
; Lane i of the mask is active while %index + i < %n (unsigned).
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
129+
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
130+
%2 = getelementptr inbounds float, ptr %y, i32 %index
131+
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
132+
; Per lane: x * y + a.
%3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
133+
%4 = getelementptr inbounds float, ptr %z, i32 %index
134+
call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
135+
%index.next = add i32 %index, 4
136+
; Exit test against the rounded-up bound (see the review note in the header).
%5 = icmp eq i32 %index.next, %n.vec
137+
br i1 %5, label %for.cond.cleanup, label %vector.body
138+
139+
for.cond.cleanup: ; preds = %vector.body, %entry
140+
ret void
141+
}
142+
143+
; startS: masked x * y + a loop whose induction variable starts at the runtime
; argument %S, so nothing is known about the start value at compile time.
;
; Loads and the store are predicated by
; @llvm.get.active.lane.mask(%index, %n); the loop is entered only when %n > 0
; and exits when %index + 4 equals %n rounded up to a multiple of 4.
;
; The autogenerated assertions below expect %n to be loaded from the stack
; (`ldr r5, [sp, #16]`), the three pointers to be advanced by %S * 4 bytes
; before the loop, the lane mask to be built with vqadd.u32 + vptt.u32 (no vctp
; instruction appears), and the loop to be closed with `cmp lr, r0` / `bne`.
define arm_aapcs_vfpcc void @startS(i32 %S, ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
144+
; CHECK-LABEL: startS:
145+
; CHECK: @ %bb.0: @ %entry
146+
; CHECK-NEXT: .save {r4, r5, r7, lr}
147+
; CHECK-NEXT: push {r4, r5, r7, lr}
148+
; CHECK-NEXT: ldr r5, [sp, #16]
149+
; CHECK-NEXT: cmp r5, #1
150+
; CHECK-NEXT: blt .LBB2_3
151+
; CHECK-NEXT: @ %bb.1: @ %vector.ph
152+
; CHECK-NEXT: vmov r12, s0
153+
; CHECK-NEXT: adds r4, r5, #3
154+
; CHECK-NEXT: bic lr, r4, #3
155+
; CHECK-NEXT: adr r4, .LCPI2_0
156+
; CHECK-NEXT: add.w r1, r1, r0, lsl #2
157+
; CHECK-NEXT: add.w r2, r2, r0, lsl #2
158+
; CHECK-NEXT: add.w r3, r3, r0, lsl #2
159+
; CHECK-NEXT: vldrw.u32 q0, [r4]
160+
; CHECK-NEXT: vdup.32 q1, r5
161+
; CHECK-NEXT: .LBB2_2: @ %vector.body
162+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
163+
; CHECK-NEXT: vqadd.u32 q2, q0, r0
164+
; CHECK-NEXT: adds r0, #4
165+
; CHECK-NEXT: cmp lr, r0
166+
; CHECK-NEXT: vptt.u32 hi, q1, q2
167+
; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
168+
; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
169+
; CHECK-NEXT: vfmas.f32 q3, q2, r12
170+
; CHECK-NEXT: vpst
171+
; CHECK-NEXT: vstrwt.32 q3, [r3], #16
172+
; CHECK-NEXT: bne .LBB2_2
173+
; CHECK-NEXT: .LBB2_3: @ %for.cond.cleanup
174+
; CHECK-NEXT: pop {r4, r5, r7, pc}
175+
; CHECK-NEXT: .p2align 4
176+
; CHECK-NEXT: @ %bb.4:
177+
; CHECK-NEXT: .LCPI2_0:
178+
; CHECK-NEXT: .long 0 @ 0x0
179+
; CHECK-NEXT: .long 1 @ 0x1
180+
; CHECK-NEXT: .long 2 @ 0x2
181+
; CHECK-NEXT: .long 3 @ 0x3
182+
entry:
183+
; Skip the loop entirely unless %n is strictly positive (signed compare).
%cmp8 = icmp sgt i32 %n, 0
184+
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
185+
186+
vector.ph: ; preds = %entry
187+
; %n.vec = %n rounded up to the next multiple of 4 (the vector width).
%n.rnd.up = add i32 %n, 3
188+
%n.vec = and i32 %n.rnd.up, -4
189+
; Splat the scalar %a across all four lanes.
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
190+
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
191+
br label %vector.body
192+
193+
vector.body: ; preds = %vector.body, %vector.ph
194+
; The induction variable starts at the unconstrained runtime value %S.
%index = phi i32 [ %S, %vector.ph ], [ %index.next, %vector.body ]
195+
%0 = getelementptr inbounds float, ptr %x, i32 %index
196+
; Lane i of the mask is active while %index + i < %n (unsigned).
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
197+
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
198+
%2 = getelementptr inbounds float, ptr %y, i32 %index
199+
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
200+
; Per lane: x * y + a.
%3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
201+
%4 = getelementptr inbounds float, ptr %z, i32 %index
202+
call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
203+
%index.next = add i32 %index, 4
204+
; Leave the loop once the next index equals the rounded-up bound.
%5 = icmp eq i32 %index.next, %n.vec
205+
br i1 %5, label %for.cond.cleanup, label %vector.body
206+
207+
for.cond.cleanup: ; preds = %vector.body, %entry
208+
ret void
209+
}
210+
211+
; startSmod4: masked x * y + a loop whose induction variable starts at
; %S & -4, i.e. a runtime value that is known to be a multiple of the vector
; width (4).
;
; Loads and the store are predicated by
; @llvm.get.active.lane.mask(%index, %n); the loop is entered only when %n > 0
; and exits when %index + 4 equals %n rounded up to a multiple of 4.
;
; The autogenerated assertions below expect the loop to be closed with `le lr`,
; with lr set up as 1 + ((%n.vec - %Sm - 4) >> 2), and the lane mask to be
; built with vqadd.u32 + vptt.u32 (no vctp instruction appears).
define arm_aapcs_vfpcc void @startSmod4(i32 %S, ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
212+
; CHECK-LABEL: startSmod4:
213+
; CHECK: @ %bb.0: @ %entry
214+
; CHECK-NEXT: .save {r4, r5, r6, lr}
215+
; CHECK-NEXT: push {r4, r5, r6, lr}
216+
; CHECK-NEXT: ldr r6, [sp, #16]
217+
; CHECK-NEXT: cmp r6, #1
218+
; CHECK-NEXT: blt .LBB3_3
219+
; CHECK-NEXT: @ %bb.1: @ %vector.ph
220+
; CHECK-NEXT: vmov r12, s0
221+
; CHECK-NEXT: mvn r4, #12
222+
; CHECK-NEXT: and.w r4, r4, r0, lsl #2
223+
; CHECK-NEXT: bic r0, r0, #3
224+
; CHECK-NEXT: add r1, r4
225+
; CHECK-NEXT: add r2, r4
226+
; CHECK-NEXT: add r3, r4
227+
; CHECK-NEXT: adds r4, r6, #3
228+
; CHECK-NEXT: bic r4, r4, #3
229+
; CHECK-NEXT: movs r5, #1
230+
; CHECK-NEXT: subs r4, r4, r0
231+
; CHECK-NEXT: vdup.32 q1, r6
232+
; CHECK-NEXT: subs r4, #4
233+
; CHECK-NEXT: add.w lr, r5, r4, lsr #2
234+
; CHECK-NEXT: adr r4, .LCPI3_0
235+
; CHECK-NEXT: vldrw.u32 q0, [r4]
236+
; CHECK-NEXT: .LBB3_2: @ %vector.body
237+
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
238+
; CHECK-NEXT: vqadd.u32 q2, q0, r0
239+
; CHECK-NEXT: adds r0, #4
240+
; CHECK-NEXT: vptt.u32 hi, q1, q2
241+
; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
242+
; CHECK-NEXT: vldrwt.u32 q3, [r1], #16
243+
; CHECK-NEXT: vfmas.f32 q3, q2, r12
244+
; CHECK-NEXT: vpst
245+
; CHECK-NEXT: vstrwt.32 q3, [r3], #16
246+
; CHECK-NEXT: le lr, .LBB3_2
247+
; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup
248+
; CHECK-NEXT: pop {r4, r5, r6, pc}
249+
; CHECK-NEXT: .p2align 4
250+
; CHECK-NEXT: @ %bb.4:
251+
; CHECK-NEXT: .LCPI3_0:
252+
; CHECK-NEXT: .long 0 @ 0x0
253+
; CHECK-NEXT: .long 1 @ 0x1
254+
; CHECK-NEXT: .long 2 @ 0x2
255+
; CHECK-NEXT: .long 3 @ 0x3
256+
entry:
257+
; Skip the loop entirely unless %n is strictly positive (signed compare).
%cmp8 = icmp sgt i32 %n, 0
258+
br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
259+
260+
vector.ph: ; preds = %entry
261+
; %Sm = %S with its low two bits cleared, so the start index is a multiple of 4.
%Sm = and i32 %S, -4
262+
; %n.vec = %n rounded up to the next multiple of 4 (the vector width).
%n.rnd.up = add i32 %n, 3
263+
%n.vec = and i32 %n.rnd.up, -4
264+
; Splat the scalar %a across all four lanes.
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
265+
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
266+
br label %vector.body
267+
268+
vector.body: ; preds = %vector.body, %vector.ph
269+
; The induction variable starts at the 4-aligned runtime value %Sm.
%index = phi i32 [ %Sm, %vector.ph ], [ %index.next, %vector.body ]
270+
%0 = getelementptr inbounds float, ptr %x, i32 %index
271+
; Lane i of the mask is active while %index + i < %n (unsigned).
%1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
272+
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
273+
%2 = getelementptr inbounds float, ptr %y, i32 %index
274+
%wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
275+
; Per lane: x * y + a.
%3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
276+
%4 = getelementptr inbounds float, ptr %z, i32 %index
277+
call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
278+
%index.next = add i32 %index, 4
279+
; Leave the loop once the next index equals the rounded-up bound.
%5 = icmp eq i32 %index.next, %n.vec
280+
br i1 %5, label %for.cond.cleanup, label %vector.body
281+
282+
for.cond.cleanup: ; preds = %vector.body, %entry
283+
ret void
284+
}
285+
286+
287+
; Intrinsic declarations used by every test function in this file.
; Masked vector load: (pointer, alignment, lane mask, pass-through value).
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
288+
; Fused multiply-add: first operand * second operand + third operand, per lane.
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
289+
; Masked vector store: (value, pointer, alignment, lane mask).
declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
290+
; Lane mask generator: lane i is set while (first operand + i) < second operand, unsigned.
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
291+

0 commit comments

Comments
 (0)