Skip to content

Commit 3016fd1

Browse files
davemgreen authored and fhahn committed
[AArch64] Extra testing for sinking splats to various instructions. NFC
(cherry-picked from 2b9c411)
1 parent 1378fdc commit 3016fd1

File tree

1 file changed

+381
-0
lines changed

1 file changed

+381
-0
lines changed
Lines changed: 381 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,381 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s
3+
4+
; Baseline: a lane-3 splat of %x feeding @llvm.aarch64.neon.smull inside a loop.
; The splat is defined in %entry and only used in %l1; the widened product is
; added into the loop-carried accumulator %q. The assertions below expect the
; splat as a `dup` ahead of the loop label and a vector-form `smlal` in the loop.
define <4 x i32> @smull(<4 x i16> %x, <4 x i16> *%y) {
5+
; CHECK-LABEL: smull:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: fmov d1, d0
8+
; CHECK-NEXT: mov w8, #1
9+
; CHECK-NEXT: movi v0.2d, #0000000000000000
10+
; CHECK-NEXT: dup v1.4h, v1.h[3]
11+
; CHECK-NEXT: .LBB0_1: // %l1
12+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
13+
; CHECK-NEXT: ldr d2, [x0]
14+
; CHECK-NEXT: subs w8, w8, #1
15+
; CHECK-NEXT: smlal v0.4s, v2.4h, v1.4h
16+
; CHECK-NEXT: b.eq .LBB0_1
17+
; CHECK-NEXT: // %bb.2: // %l2
18+
; CHECK-NEXT: ret
19+
entry:
20+
; Broadcast element 3 of %x to every lane (built once, outside the loop).
%a = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
21+
br label %l1
22+
23+
l1:
24+
; %p is the trip counter, %q the running sum (starts at zero).
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
25+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
26+
%l = load <4 x i16>, <4 x i16> *%y
27+
; The only use of the splat %a.
%b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %l, <4 x i16> %a)
28+
%c = add nsw <4 x i32> %q, %b
29+
%pa = add i32 %p, 1
30+
; %p is 0 on the first pass and 1 on the second, so the body runs exactly twice.
%c1 = icmp eq i32 %p, 0
31+
br i1 %c1, label %l1, label %l2
32+
33+
l2:
34+
ret <4 x i32> %c
35+
}
36+
37+
; Baseline: a lane-3 splat of %x feeding @llvm.aarch64.neon.umull inside a loop.
; The splat lives in %entry, its single use is in %l1, and the widened product is
; added into the accumulator %q. The assertions below expect a `dup` before the
; loop label and a vector-form `umlal` in the loop body.
define <4 x i32> @umull(<4 x i16> %x, <4 x i16> *%y) {
38+
; CHECK-LABEL: umull:
39+
; CHECK: // %bb.0: // %entry
40+
; CHECK-NEXT: fmov d1, d0
41+
; CHECK-NEXT: mov w8, #1
42+
; CHECK-NEXT: movi v0.2d, #0000000000000000
43+
; CHECK-NEXT: dup v1.4h, v1.h[3]
44+
; CHECK-NEXT: .LBB1_1: // %l1
45+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
46+
; CHECK-NEXT: ldr d2, [x0]
47+
; CHECK-NEXT: subs w8, w8, #1
48+
; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
49+
; CHECK-NEXT: b.eq .LBB1_1
50+
; CHECK-NEXT: // %bb.2: // %l2
51+
; CHECK-NEXT: ret
52+
entry:
53+
; Splat of lane 3 of %x, created outside the loop.
%a = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
54+
br label %l1
55+
56+
l1:
57+
; Loop-carried state: iteration counter %p and vector accumulator %q.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
58+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
59+
%l = load <4 x i16>, <4 x i16> *%y
60+
; The splat's only user.
%b = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %l, <4 x i16> %a)
61+
%c = add nsw <4 x i32> %q, %b
62+
%pa = add i32 %p, 1
63+
; The back-edge is taken only while %p == 0, i.e. the body executes twice.
%c1 = icmp eq i32 %p, 0
64+
br i1 %c1, label %l1, label %l2
65+
66+
l2:
67+
ret <4 x i32> %c
68+
}
69+
70+
; Baseline: a lane-3 splat of %x feeding @llvm.aarch64.neon.sqrdmulh in a loop,
; with the result accumulated through @llvm.aarch64.neon.sqadd. The assertions
; below expect a `dup` before the loop label, then vector-form `sqrdmulh`
; followed by `sqadd` inside the loop.
define <4 x i32> @sqadd(<4 x i32> %x, <4 x i32> *%y) {
71+
; CHECK-LABEL: sqadd:
72+
; CHECK: // %bb.0: // %entry
73+
; CHECK-NEXT: mov v1.16b, v0.16b
74+
; CHECK-NEXT: mov w8, #1
75+
; CHECK-NEXT: movi v0.2d, #0000000000000000
76+
; CHECK-NEXT: dup v1.4s, v1.s[3]
77+
; CHECK-NEXT: .LBB2_1: // %l1
78+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
79+
; CHECK-NEXT: ldr q2, [x0]
80+
; CHECK-NEXT: subs w8, w8, #1
81+
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.4s
82+
; CHECK-NEXT: sqadd v0.4s, v0.4s, v2.4s
83+
; CHECK-NEXT: b.eq .LBB2_1
84+
; CHECK-NEXT: // %bb.2: // %l2
85+
; CHECK-NEXT: ret
86+
entry:
87+
; Broadcast element 3 of %x; defined ahead of the loop.
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
88+
br label %l1
89+
90+
l1:
91+
; %p counts iterations; %q is the accumulator, initially zero.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
92+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
93+
%l = load <4 x i32>, <4 x i32> *%y
94+
; The splat is consumed by the multiply, not by the sqadd that names this test.
%b = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %l, <4 x i32> %a)
95+
%c = tail call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %q, <4 x i32> %b)
96+
%pa = add i32 %p, 1
97+
; Loop again only when %p == 0, giving exactly two executions of the body.
%c1 = icmp eq i32 %p, 0
98+
br i1 %c1, label %l1, label %l2
99+
100+
l2:
101+
ret <4 x i32> %c
102+
}
103+
104+
; Baseline: a lane-3 splat of %x feeding @llvm.aarch64.neon.sqrdmulh in a loop,
; with the result folded into the accumulator via @llvm.aarch64.neon.sqsub.
; The assertions below expect a `dup` before the loop label, then vector-form
; `sqrdmulh` followed by `sqsub` inside the loop.
define <4 x i32> @sqsub(<4 x i32> %x, <4 x i32> *%y) {
105+
; CHECK-LABEL: sqsub:
106+
; CHECK: // %bb.0: // %entry
107+
; CHECK-NEXT: mov v1.16b, v0.16b
108+
; CHECK-NEXT: mov w8, #1
109+
; CHECK-NEXT: movi v0.2d, #0000000000000000
110+
; CHECK-NEXT: dup v1.4s, v1.s[3]
111+
; CHECK-NEXT: .LBB3_1: // %l1
112+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
113+
; CHECK-NEXT: ldr q2, [x0]
114+
; CHECK-NEXT: subs w8, w8, #1
115+
; CHECK-NEXT: sqrdmulh v2.4s, v2.4s, v1.4s
116+
; CHECK-NEXT: sqsub v0.4s, v0.4s, v2.4s
117+
; CHECK-NEXT: b.eq .LBB3_1
118+
; CHECK-NEXT: // %bb.2: // %l2
119+
; CHECK-NEXT: ret
120+
entry:
121+
; Lane-3 splat of %x, built outside the loop.
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
122+
br label %l1
123+
124+
l1:
125+
; Iteration counter and zero-initialised accumulator.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
126+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
127+
%l = load <4 x i32>, <4 x i32> *%y
128+
; The splat's single use is this multiply; sqsub only sees its result.
%b = tail call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %l, <4 x i32> %a)
129+
%c = tail call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %q, <4 x i32> %b)
130+
%pa = add i32 %p, 1
131+
; Branch back only while %p == 0: two passes through the body in total.
%c1 = icmp eq i32 %p, 0
132+
br i1 %c1, label %l1, label %l2
133+
134+
l2:
135+
ret <4 x i32> %c
136+
}
137+
138+
; Baseline: a lane-3 splat of %x feeding @llvm.aarch64.neon.sqdmulh in a loop,
; with the result accumulated by a plain vector add. The assertions below
; expect a `dup` before the loop label, then vector-form `sqdmulh` and a
; separate `add` inside the loop.
define <4 x i32> @sqdmulh(<4 x i32> %x, <4 x i32> *%y) {
139+
; CHECK-LABEL: sqdmulh:
140+
; CHECK: // %bb.0: // %entry
141+
; CHECK-NEXT: mov v1.16b, v0.16b
142+
; CHECK-NEXT: mov w8, #1
143+
; CHECK-NEXT: movi v0.2d, #0000000000000000
144+
; CHECK-NEXT: dup v1.4s, v1.s[3]
145+
; CHECK-NEXT: .LBB4_1: // %l1
146+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
147+
; CHECK-NEXT: ldr q2, [x0]
148+
; CHECK-NEXT: subs w8, w8, #1
149+
; CHECK-NEXT: sqdmulh v2.4s, v2.4s, v1.4s
150+
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
151+
; CHECK-NEXT: b.eq .LBB4_1
152+
; CHECK-NEXT: // %bb.2: // %l2
153+
; CHECK-NEXT: ret
154+
entry:
155+
; Every lane of %a is element 3 of %x; created before entering the loop.
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
156+
br label %l1
157+
158+
l1:
159+
; %p: trip counter. %q: running sum carried around the loop.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
160+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
161+
%l = load <4 x i32>, <4 x i32> *%y
162+
; Sole consumer of the splat.
%b = tail call <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32> %l, <4 x i32> %a)
163+
%c = add nsw <4 x i32> %q, %b
164+
%pa = add i32 %p, 1
165+
; True only on the first pass (%p == 0), so the body runs twice.
%c1 = icmp eq i32 %p, 0
166+
br i1 %c1, label %l1, label %l2
167+
168+
l2:
169+
ret <4 x i32> %c
170+
}
171+
172+
; Baseline: a lane-3 splat of %x feeding @llvm.aarch64.neon.sqdmull in a loop,
; with the widened result accumulated by a plain vector add. The assertions
; below expect a `dup` before the loop label, then vector-form `sqdmull` and a
; separate `add` inside the loop.
define <4 x i32> @sqdmull(<4 x i16> %x, <4 x i16> *%y) {
173+
; CHECK-LABEL: sqdmull:
174+
; CHECK: // %bb.0: // %entry
175+
; CHECK-NEXT: fmov d1, d0
176+
; CHECK-NEXT: mov w8, #1
177+
; CHECK-NEXT: movi v0.2d, #0000000000000000
178+
; CHECK-NEXT: dup v1.4h, v1.h[3]
179+
; CHECK-NEXT: .LBB5_1: // %l1
180+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
181+
; CHECK-NEXT: ldr d2, [x0]
182+
; CHECK-NEXT: subs w8, w8, #1
183+
; CHECK-NEXT: sqdmull v2.4s, v2.4h, v1.4h
184+
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
185+
; CHECK-NEXT: b.eq .LBB5_1
186+
; CHECK-NEXT: // %bb.2: // %l2
187+
; CHECK-NEXT: ret
188+
entry:
189+
; Replicate element 3 of %x across the vector, outside the loop.
%a = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
190+
br label %l1
191+
192+
l1:
193+
; Counter %p and accumulator %q are the only loop-carried values.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
194+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
195+
%l = load <4 x i16>, <4 x i16> *%y
196+
; The splat is used here and nowhere else.
%b = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %l, <4 x i16> %a)
197+
%c = add nsw <4 x i32> %q, %b
198+
%pa = add i32 %p, 1
199+
; Continue looping only when %p == 0; the body therefore executes twice.
%c1 = icmp eq i32 %p, 0
200+
br i1 %c1, label %l1, label %l2
201+
202+
l2:
203+
ret <4 x i32> %c
204+
}
205+
206+
; Baseline: a lane-3 splat of %x feeding a plain `mul` whose result is added
; into the accumulator, all on <4 x i32> (no intrinsic, no widening). The
; assertions below expect a `dup` before the loop label and a vector-form
; `mla` in the loop.
define <4 x i32> @mlal(<4 x i32> %x, <4 x i32> *%y) {
207+
; CHECK-LABEL: mlal:
208+
; CHECK: // %bb.0: // %entry
209+
; CHECK-NEXT: mov v1.16b, v0.16b
210+
; CHECK-NEXT: mov w8, #1
211+
; CHECK-NEXT: movi v0.2d, #0000000000000000
212+
; CHECK-NEXT: dup v1.4s, v1.s[3]
213+
; CHECK-NEXT: .LBB6_1: // %l1
214+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
215+
; CHECK-NEXT: ldr q2, [x0]
216+
; CHECK-NEXT: subs w8, w8, #1
217+
; CHECK-NEXT: mla v0.4s, v2.4s, v1.4s
218+
; CHECK-NEXT: b.eq .LBB6_1
219+
; CHECK-NEXT: // %bb.2: // %l2
220+
; CHECK-NEXT: ret
221+
entry:
222+
; Lane-3 broadcast of %x, computed before the loop.
%a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
223+
br label %l1
224+
225+
l1:
226+
; %p counts passes; %q accumulates the products.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
227+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
228+
%l = load <4 x i32>, <4 x i32> *%y
229+
; Multiply by the splat, then accumulate.
%b = mul <4 x i32> %l, %a
230+
%c = add <4 x i32> %q, %b
231+
%pa = add i32 %p, 1
232+
; The loop repeats only when %p == 0, so there are exactly two passes.
%c1 = icmp eq i32 %p, 0
233+
br i1 %c1, label %l1, label %l2
234+
235+
l2:
236+
ret <4 x i32> %c
237+
}
238+
239+
; Baseline: a lane-3 splat of %x feeding an `fmul` (no fast-math flags) whose
; result is added to the accumulator with a separate `fadd`. Unlike the other
; tests here, the assertions below contain no `dup`: the loop uses the
; lane-indexed form `fmul ..., v1.s[3]`, followed by a standalone `fadd`.
define <4 x float> @fmul(<4 x float> %x, <4 x float> *%y) {
240+
; CHECK-LABEL: fmul:
241+
; CHECK: // %bb.0: // %entry
242+
; CHECK-NEXT: mov v1.16b, v0.16b
243+
; CHECK-NEXT: mov w8, #1
244+
; CHECK-NEXT: movi v0.2d, #0000000000000000
245+
; CHECK-NEXT: .LBB7_1: // %l1
246+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
247+
; CHECK-NEXT: ldr q2, [x0]
248+
; CHECK-NEXT: subs w8, w8, #1
249+
; CHECK-NEXT: fmul v2.4s, v2.4s, v1.s[3]
250+
; CHECK-NEXT: fadd v0.4s, v2.4s, v0.4s
251+
; CHECK-NEXT: b.eq .LBB7_1
252+
; CHECK-NEXT: // %bb.2: // %l2
253+
; CHECK-NEXT: ret
254+
entry:
255+
; Broadcast element 3 of %x; the IR defines it outside the loop.
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
256+
br label %l1
257+
258+
l1:
259+
; Trip counter %p and floating-point accumulator %q (starts at zero).
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
260+
%q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
261+
%l = load <4 x float>, <4 x float> *%y
262+
; Neither operation carries fast-math flags.
%b = fmul <4 x float> %l, %a
263+
%c = fadd <4 x float> %b, %q
264+
%pa = add i32 %p, 1
265+
; Taken back-edge requires %p == 0, which holds on the first pass only: two passes.
%c1 = icmp eq i32 %p, 0
266+
br i1 %c1, label %l1, label %l2
267+
268+
l2:
269+
ret <4 x float> %c
270+
}
271+
272+
; Baseline: a lane-3 splat of %x feeding `fmul fast` followed by `fadd fast`
; into the accumulator. Note the IR uses separate fast-math instructions, not
; an fmuladd intrinsic call. The assertions below expect a `dup` before the
; loop label and a single vector-form `fmla` in the loop.
define <4 x float> @fmuladd(<4 x float> %x, <4 x float> *%y) {
273+
; CHECK-LABEL: fmuladd:
274+
; CHECK: // %bb.0: // %entry
275+
; CHECK-NEXT: mov v1.16b, v0.16b
276+
; CHECK-NEXT: mov w8, #1
277+
; CHECK-NEXT: movi v0.2d, #0000000000000000
278+
; CHECK-NEXT: dup v1.4s, v1.s[3]
279+
; CHECK-NEXT: .LBB8_1: // %l1
280+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
281+
; CHECK-NEXT: ldr q2, [x0]
282+
; CHECK-NEXT: subs w8, w8, #1
283+
; CHECK-NEXT: fmla v0.4s, v1.4s, v2.4s
284+
; CHECK-NEXT: b.eq .LBB8_1
285+
; CHECK-NEXT: // %bb.2: // %l2
286+
; CHECK-NEXT: ret
287+
entry:
288+
; Lane-3 splat of %x, defined ahead of the loop.
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
289+
br label %l1
290+
291+
l1:
292+
; Loop-carried counter and accumulator.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
293+
%q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
294+
%l = load <4 x float>, <4 x float> *%y
295+
; Both operations are marked `fast`.
%b = fmul fast <4 x float> %l, %a
296+
%c = fadd fast <4 x float> %b, %q
297+
%pa = add i32 %p, 1
298+
; %p == 0 holds only on the first pass, so the body executes twice.
%c1 = icmp eq i32 %p, 0
299+
br i1 %c1, label %l1, label %l2
300+
301+
l2:
302+
ret <4 x float> %c
303+
}
304+
305+
; Baseline: a lane-3 splat of %x passed to @llvm.fma.v4f32 inside a loop.
; The call is fma(%l, %q, %a): the splat is the third operand, while the
; loop-carried value %q is the second. The assertions below expect a `dup`
; before the loop label; inside the loop two register copies (`mov`) precede a
; vector-form `fmla`.
define <4 x float> @fma(<4 x float> %x, <4 x float> *%y) {
306+
; CHECK-LABEL: fma:
307+
; CHECK: // %bb.0: // %entry
308+
; CHECK-NEXT: mov v1.16b, v0.16b
309+
; CHECK-NEXT: mov w8, #1
310+
; CHECK-NEXT: movi v0.2d, #0000000000000000
311+
; CHECK-NEXT: dup v1.4s, v1.s[3]
312+
; CHECK-NEXT: .LBB9_1: // %l1
313+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
314+
; CHECK-NEXT: mov v2.16b, v0.16b
315+
; CHECK-NEXT: ldr q3, [x0]
316+
; CHECK-NEXT: mov v0.16b, v1.16b
317+
; CHECK-NEXT: subs w8, w8, #1
318+
; CHECK-NEXT: fmla v0.4s, v2.4s, v3.4s
319+
; CHECK-NEXT: b.eq .LBB9_1
320+
; CHECK-NEXT: // %bb.2: // %l2
321+
; CHECK-NEXT: ret
322+
entry:
323+
; Broadcast element 3 of %x, outside the loop.
%a = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
324+
br label %l1
325+
326+
l1:
327+
; %p counts passes; %q carries the previous fma result (zero on entry).
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
328+
%q = phi <4 x float> [ zeroinitializer, %entry ], [ %c, %l1 ]
329+
%l = load <4 x float>, <4 x float> *%y
330+
; Operand order matters here: (%l, %q, %a), with the splat last.
%c = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %l, <4 x float> %q, <4 x float> %a)
331+
%pa = add i32 %p, 1
332+
; Loop back only when %p == 0: the body runs exactly twice.
%c1 = icmp eq i32 %p, 0
333+
br i1 %c1, label %l1, label %l2
334+
335+
l2:
336+
ret <4 x float> %c
337+
}
338+
339+
; Negative counterpart of the splat tests: the shuffle mask is <3, 2, 3, 3>, so
; %a is NOT a splat (lane 1 takes element 2 of %x). It still feeds
; @llvm.aarch64.neon.smull in the loop. The assertions below expect the shuffle
; as `dup` + two `ext` instructions before the loop label and a vector-form
; `smlal` in the loop.
define <4 x i32> @smull_nonsplat(<4 x i16> %x, <4 x i16> *%y) {
340+
; CHECK-LABEL: smull_nonsplat:
341+
; CHECK: // %bb.0: // %entry
342+
; CHECK-NEXT: fmov d1, d0
343+
; CHECK-NEXT: mov w8, #1
344+
; CHECK-NEXT: movi v0.2d, #0000000000000000
345+
; CHECK-NEXT: dup v2.4h, v1.h[3]
346+
; CHECK-NEXT: ext v2.8b, v1.8b, v2.8b, #4
347+
; CHECK-NEXT: ext v1.8b, v1.8b, v2.8b, #6
348+
; CHECK-NEXT: .LBB10_1: // %l1
349+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
350+
; CHECK-NEXT: ldr d2, [x0]
351+
; CHECK-NEXT: subs w8, w8, #1
352+
; CHECK-NEXT: smlal v0.4s, v2.4h, v1.4h
353+
; CHECK-NEXT: b.eq .LBB10_1
354+
; CHECK-NEXT: // %bb.2: // %l2
355+
; CHECK-NEXT: ret
356+
entry:
357+
; Mask <3, 2, 3, 3>: three lanes take element 3, lane 1 takes element 2.
%a = shufflevector <4 x i16> %x, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 3>
358+
br label %l1
359+
360+
l1:
361+
; Trip counter %p and accumulator %q.
%p = phi i32 [ 0, %entry ], [ %pa, %l1 ]
362+
%q = phi <4 x i32> [ zeroinitializer, %entry ], [ %c, %l1 ]
363+
%l = load <4 x i16>, <4 x i16> *%y
364+
; Only use of the shuffled vector.
%b = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %l, <4 x i16> %a)
365+
%c = add nsw <4 x i32> %q, %b
366+
%pa = add i32 %p, 1
367+
; The back-edge is taken only while %p == 0, so the body executes twice.
%c1 = icmp eq i32 %p, 0
368+
br i1 %c1, label %l1, label %l2
369+
370+
l2:
371+
ret <4 x i32> %c
372+
}
373+
374+
; Declarations of the intrinsics called by the test functions.
declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)
375+
declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)
376+
declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
377+
declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
378+
declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
379+
declare <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16>, <4 x i16>)
380+
declare <4 x i32> @llvm.aarch64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>)
381+
; Operands are left unnamed, like the other declarations in this list. The
; previous names (%l, %a, %q) did not match the order used at the call site,
; which passes (%l, %q, %a).
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)

0 commit comments

Comments
 (0)