Skip to content

Commit e449634

Browse files
committed
Add tests
1 parent 0010a3c commit e449634

File tree

1 file changed

+235
-3
lines changed

1 file changed

+235
-3
lines changed

llvm/test/CodeGen/AArch64/neon-partial-reduce-dot-product.ll

Lines changed: 235 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,66 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
2626
ret <4 x i32> %partial.reduce
2727
}
2828

29+
; udot_in_loop: unsigned dot-product partial reduction whose accumulator is
; carried through a loop phi instead of arriving as a function argument.
; Each trip loads <16 x i8> from %p1+%index and %p2+%index, zero-extends both
; to <16 x i32>, multiplies them, and folds the product into a <4 x i32>
; accumulator via llvm.experimental.vector.partial.reduce.add.
; The DOT-prefixed expectations contain a single udot inside the loop; the
; NODOT-prefixed expectations use umull/umull2 plus widening adds instead.
; NOTE(review): the llc flags that select each prefix live in RUN lines that
; are not visible in this view.
define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
30+
; CHECK-DOT-LABEL: udot_in_loop:
31+
; CHECK-DOT: // %bb.0: // %entry
32+
; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
33+
; CHECK-DOT-NEXT: mov x8, xzr
34+
; CHECK-DOT-NEXT: .LBB1_1: // %vector.body
35+
; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
36+
; CHECK-DOT-NEXT: ldr q2, [x0, x8]
37+
; CHECK-DOT-NEXT: ldr q3, [x1, x8]
38+
; CHECK-DOT-NEXT: mov v0.16b, v1.16b
39+
; CHECK-DOT-NEXT: add x8, x8, #16
40+
; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b
41+
; CHECK-DOT-NEXT: cmp x8, #16
42+
; CHECK-DOT-NEXT: b.ne .LBB1_1
43+
; CHECK-DOT-NEXT: // %bb.2: // %end
44+
; CHECK-DOT-NEXT: ret
45+
;
46+
; CHECK-NODOT-LABEL: udot_in_loop:
47+
; CHECK-NODOT: // %bb.0: // %entry
48+
; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
49+
; CHECK-NODOT-NEXT: mov x8, xzr
50+
; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
51+
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
52+
; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
53+
; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
54+
; CHECK-NODOT-NEXT: add x8, x8, #16
55+
; CHECK-NODOT-NEXT: cmp x8, #16
56+
; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
57+
; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
58+
; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
59+
; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
60+
; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
61+
; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
62+
; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
63+
; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
64+
; CHECK-NODOT-NEXT: b.ne .LBB1_1
65+
; CHECK-NODOT-NEXT: // %bb.2: // %end
66+
; CHECK-NODOT-NEXT: ret
67+
entry:
68+
br label %vector.body
69+
70+
vector.body:
71+
; Byte offset applied to both input pointers; 0 on entry, +16 per trip.
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
72+
; Loop-carried accumulator: zero on entry, %partial.reduce on the back edge.
%acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
73+
%gep1 = getelementptr i8, ptr %p1, i64 %index
74+
%load1 = load <16 x i8>, ptr %gep1, align 16
75+
%load1.wide = zext <16 x i8> %load1 to <16 x i32>
76+
%gep2 = getelementptr i8, ptr %p2, i64 %index
77+
%load2 = load <16 x i8>, ptr %gep2, align 16
78+
%load2.wide = zext <16 x i8> %load2 to <16 x i32>
79+
%mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide
80+
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
81+
%index.next = add nuw i64 %index, 16
82+
; Exit as soon as %index.next equals 16, i.e. after the first trip.
%cmp = icmp eq i64 %index.next, 16
83+
br i1 %cmp, label %end, label %vector.body
84+
85+
end:
86+
; Returns the accumulator phi, not %partial.reduce; the expected assembly
; above mirrors this by copying v1 into v0 before v1 is updated.
; NOTE(review): confirm returning %acc is the intended shape of this test.
ret <4 x i32> %acc
87+
}
88+
2989
define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
3090
; CHECK-DOT-LABEL: udot_narrow:
3191
; CHECK-DOT: // %bb.0:
@@ -128,6 +188,68 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
128188
ret <4 x i32> %partial.reduce
129189
}
130190

191+
; usdot_in_loop: mixed-sign dot-product partial reduction with the
; accumulator carried through a loop phi.
; The vector loaded through %p1 is sign-extended and the vector loaded through
; %p2 is zero-extended before the multiply feeds
; llvm.experimental.vector.partial.reduce.add.
; The I8MM-prefixed expectations contain a single usdot whose first source is
; the register loaded via x1 (the zero-extended input) and whose second is the
; register loaded via x0 (the sign-extended input). The NOI8MM-prefixed
; expectations widen with sshll/ushll and accumulate with smull/smlal.
; NOTE(review): the llc flags that select each prefix live in RUN lines that
; are not visible in this view.
define <4 x i32> @usdot_in_loop(ptr %p1, ptr %p2){
192+
; CHECK-NOI8MM-LABEL: usdot_in_loop:
193+
; CHECK-NOI8MM: // %bb.0: // %entry
194+
; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
195+
; CHECK-NOI8MM-NEXT: mov x8, xzr
196+
; CHECK-NOI8MM-NEXT: .LBB6_1: // %vector.body
197+
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
198+
; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
199+
; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
200+
; CHECK-NOI8MM-NEXT: add x8, x8, #16
201+
; CHECK-NOI8MM-NEXT: cmp x8, #16
202+
; CHECK-NOI8MM-NEXT: sshll v3.8h, v0.8b, #0
203+
; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v0.16b, #0
204+
; CHECK-NOI8MM-NEXT: ushll v5.8h, v2.8b, #0
205+
; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
206+
; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
207+
; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
208+
; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
209+
; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
210+
; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
211+
; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
212+
; CHECK-NOI8MM-NEXT: b.ne .LBB6_1
213+
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
214+
; CHECK-NOI8MM-NEXT: ret
215+
;
216+
; CHECK-I8MM-LABEL: usdot_in_loop:
217+
; CHECK-I8MM: // %bb.0: // %entry
218+
; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
219+
; CHECK-I8MM-NEXT: mov x8, xzr
220+
; CHECK-I8MM-NEXT: .LBB6_1: // %vector.body
221+
; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
222+
; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
223+
; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
224+
; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
225+
; CHECK-I8MM-NEXT: add x8, x8, #16
226+
; CHECK-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b
227+
; CHECK-I8MM-NEXT: cmp x8, #16
228+
; CHECK-I8MM-NEXT: b.ne .LBB6_1
229+
; CHECK-I8MM-NEXT: // %bb.2: // %end
230+
; CHECK-I8MM-NEXT: ret
231+
entry:
232+
br label %vector.body
233+
234+
vector.body:
235+
; Byte offset applied to both input pointers; 0 on entry, +16 per trip.
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
236+
; Loop-carried accumulator: zero on entry, %partial.reduce on the back edge.
%acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
237+
%gep1 = getelementptr i8, ptr %p1, i64 %index
238+
%load1 = load <16 x i8>, ptr %gep1, align 16
239+
; Signed operand of the mixed-sign product.
%load1.wide = sext <16 x i8> %load1 to <16 x i32>
240+
%gep2 = getelementptr i8, ptr %p2, i64 %index
241+
%load2 = load <16 x i8>, ptr %gep2, align 16
242+
; Unsigned operand of the mixed-sign product.
%load2.wide = zext <16 x i8> %load2 to <16 x i32>
243+
%mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide
244+
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
245+
%index.next = add nuw i64 %index, 16
246+
; Exit as soon as %index.next equals 16, i.e. after the first trip.
%cmp = icmp eq i64 %index.next, 16
247+
br i1 %cmp, label %end, label %vector.body
248+
249+
end:
250+
; Returns the accumulator phi, not %partial.reduce; the expected assembly
; above mirrors this by copying v1 into v0 before v1 is updated.
; NOTE(review): confirm returning %acc is the intended shape of this test.
ret <4 x i32> %acc
251+
}
252+
131253
define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
132254
; CHECK-NOI8MM-LABEL: usdot_narrow:
133255
; CHECK-NOI8MM: // %bb.0:
@@ -175,13 +297,75 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
175297
; CHECK-I8MM: // %bb.0:
176298
; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b
177299
; CHECK-I8MM-NEXT: ret
178-
%u.wide = sext <16 x i8> %u to <16 x i32>
179-
%s.wide = zext <16 x i8> %s to <16 x i32>
180-
%mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
300+
%s.wide = sext <16 x i8> %u to <16 x i32>
301+
%u.wide = zext <16 x i8> %s to <16 x i32>
302+
%mult = mul nuw nsw <16 x i32> %u.wide, %s.wide
181303
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
182304
ret <4 x i32> %partial.reduce
183305
}
184306

307+
; sudot_in_loop: mixed-sign dot-product partial reduction with the extension
; kinds swapped relative to a sext(%p1 data) * zext(%p2 data) product: here
; the vector loaded through %p1 is zero-extended and the vector loaded through
; %p2 is sign-extended. The accumulator is carried through a loop phi.
; The I8MM-prefixed expectations contain a single usdot taking the x0-loaded
; register first and the x1-loaded register second. The NOI8MM-prefixed
; expectations widen with ushll/sshll and accumulate with smull/smlal.
; NOTE(review): the llc flags that select each prefix live in RUN lines that
; are not visible in this view.
define <4 x i32> @sudot_in_loop(ptr %p1, ptr %p2){
308+
; CHECK-NOI8MM-LABEL: sudot_in_loop:
309+
; CHECK-NOI8MM: // %bb.0: // %entry
310+
; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
311+
; CHECK-NOI8MM-NEXT: mov x8, xzr
312+
; CHECK-NOI8MM-NEXT: .LBB9_1: // %vector.body
313+
; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
314+
; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
315+
; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
316+
; CHECK-NOI8MM-NEXT: add x8, x8, #16
317+
; CHECK-NOI8MM-NEXT: cmp x8, #16
318+
; CHECK-NOI8MM-NEXT: ushll v3.8h, v0.8b, #0
319+
; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v0.16b, #0
320+
; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0
321+
; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
322+
; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
323+
; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
324+
; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
325+
; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
326+
; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
327+
; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
328+
; CHECK-NOI8MM-NEXT: b.ne .LBB9_1
329+
; CHECK-NOI8MM-NEXT: // %bb.2: // %end
330+
; CHECK-NOI8MM-NEXT: ret
331+
;
332+
; CHECK-I8MM-LABEL: sudot_in_loop:
333+
; CHECK-I8MM: // %bb.0: // %entry
334+
; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
335+
; CHECK-I8MM-NEXT: mov x8, xzr
336+
; CHECK-I8MM-NEXT: .LBB9_1: // %vector.body
337+
; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
338+
; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
339+
; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
340+
; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
341+
; CHECK-I8MM-NEXT: add x8, x8, #16
342+
; CHECK-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b
343+
; CHECK-I8MM-NEXT: cmp x8, #16
344+
; CHECK-I8MM-NEXT: b.ne .LBB9_1
345+
; CHECK-I8MM-NEXT: // %bb.2: // %end
346+
; CHECK-I8MM-NEXT: ret
347+
entry:
348+
br label %vector.body
349+
350+
vector.body:
351+
; Byte offset applied to both input pointers; 0 on entry, +16 per trip.
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
352+
; Loop-carried accumulator: zero on entry, %partial.reduce on the back edge.
%acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
353+
%gep1 = getelementptr i8, ptr %p1, i64 %index
354+
%load1 = load <16 x i8>, ptr %gep1, align 16
355+
; Unsigned operand of the mixed-sign product.
%load1.wide = zext <16 x i8> %load1 to <16 x i32>
356+
%gep2 = getelementptr i8, ptr %p2, i64 %index
357+
%load2 = load <16 x i8>, ptr %gep2, align 16
358+
; Signed operand of the mixed-sign product.
%load2.wide = sext <16 x i8> %load2 to <16 x i32>
359+
%mul = mul nuw nsw <16 x i32> %load1.wide, %load2.wide
360+
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mul)
361+
%index.next = add nuw i64 %index, 16
362+
; Exit as soon as %index.next equals 16, i.e. after the first trip.
%cmp = icmp eq i64 %index.next, 16
363+
br i1 %cmp, label %end, label %vector.body
364+
365+
end:
366+
; Returns the accumulator phi, not %partial.reduce; the expected assembly
; above mirrors this by copying v1 into v0 before v1 is updated.
; NOTE(review): confirm returning %acc is the intended shape of this test.
ret <4 x i32> %acc
367+
}
368+
185369
define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0{
186370
; CHECK-NOI8MM-LABEL: sudot_narrow:
187371
; CHECK-NOI8MM: // %bb.0:
@@ -389,6 +573,54 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
389573
ret <4 x i32> %partial.reduce
390574
}
391575

576+
; udot_no_bin_op_in_loop: partial reduction of a zero-extended <16 x i8> load
; with no multiply in front of it, and with the accumulator carried through a
; loop phi.
; The expected assembly contains no dot-product instruction: four index
; vectors are loaded from the constant pool before the loop, and each trip
; applies four tbl lookups to the loaded data followed by vector adds.
; A single shared prefix is used, so the same output is expected from every
; RUN configuration that enables that prefix.
; NOTE(review): the RUN lines themselves are not visible in this view.
define <4 x i32> @udot_no_bin_op_in_loop(ptr %p){
577+
; CHECK-LABEL: udot_no_bin_op_in_loop:
578+
; CHECK: // %bb.0: // %entry
579+
; CHECK-NEXT: adrp x8, .LCPI16_0
580+
; CHECK-NEXT: movi v4.2d, #0000000000000000
581+
; CHECK-NEXT: adrp x9, .LCPI16_2
582+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
583+
; CHECK-NEXT: adrp x8, .LCPI16_1
584+
; CHECK-NEXT: adrp x10, .LCPI16_3
585+
; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
586+
; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI16_2]
587+
; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI16_3]
588+
; CHECK-NEXT: mov x8, xzr
589+
; CHECK-NEXT: .LBB16_1: // %vector.body
590+
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
591+
; CHECK-NEXT: ldr q6, [x0, x8]
592+
; CHECK-NEXT: mov v0.16b, v4.16b
593+
; CHECK-NEXT: add x8, x8, #16
594+
; CHECK-NEXT: cmp x8, #16
595+
; CHECK-NEXT: tbl v7.16b, { v6.16b }, v2.16b
596+
; CHECK-NEXT: tbl v4.16b, { v6.16b }, v1.16b
597+
; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
598+
; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
599+
; CHECK-NEXT: add v7.4s, v0.4s, v7.4s
600+
; CHECK-NEXT: add v6.4s, v6.4s, v16.4s
601+
; CHECK-NEXT: add v4.4s, v4.4s, v7.4s
602+
; CHECK-NEXT: add v4.4s, v6.4s, v4.4s
603+
; CHECK-NEXT: b.ne .LBB16_1
604+
; CHECK-NEXT: // %bb.2: // %end
605+
; CHECK-NEXT: ret
606+
entry:
607+
br label %vector.body
608+
609+
vector.body:
610+
; Byte offset applied to the input pointer; 0 on entry, +16 per trip.
%index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
611+
; Loop-carried accumulator: zero on entry, %partial.reduce on the back edge.
%acc = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce, %vector.body ]
612+
%gep = getelementptr i8, ptr %p, i64 %index
613+
%load = load <16 x i8>, ptr %gep, align 16
614+
%load.wide = zext <16 x i8> %load to <16 x i32>
615+
; The widened load is reduced directly; there is no binary op between the
; extend and the partial reduction.
%partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %load.wide)
616+
%index.next = add nuw i64 %index, 16
617+
; Exit as soon as %index.next equals 16, i.e. after the first trip.
%cmp = icmp eq i64 %index.next, 16
618+
br i1 %cmp, label %end, label %vector.body
619+
620+
end:
621+
; Returns the accumulator phi, not %partial.reduce; the expected assembly
; above mirrors this by copying v4 into v0 before v4 is updated.
; NOTE(review): confirm returning %acc is the intended shape of this test.
ret <4 x i32> %acc
622+
}
623+
392624
define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
393625
; CHECK-DOT-LABEL: sdot_no_bin_op:
394626
; CHECK-DOT: // %bb.0:

0 commit comments

Comments
 (0)