@@ -26,6 +26,66 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
26
26
ret <4 x i32 > %partial.reduce
27
27
}
28
28
29
; udot_in_loop: codegen test for an unsigned dot-product style partial
; reduction that lives inside a loop.
; Per iteration the IR loads <16 x i8> from %p1 and %p2 at byte offset %index,
; zero-extends both to <16 x i32>, multiplies them, and adds the product into
; the <4 x i32> accumulator phi %acc through
; llvm.experimental.vector.partial.reduce.add.
; %index starts at 0, steps by 16, and the loop exits when %index.next == 16.
; Expected output: CHECK-DOT has a single udot in the loop body; CHECK-NODOT
; has the umull/umull2 + uaddw/uaddw2 expansion instead.
; NOTE(review): the RUN lines that select these prefixes are outside this hunk;
; presumably they differ in whether the dot-product feature is enabled.
+ define <4 x i32 > @udot_in_loop (ptr %p1 , ptr %p2 ){
30
+ ; CHECK-DOT-LABEL: udot_in_loop:
31
+ ; CHECK-DOT: // %bb.0: // %entry
32
+ ; CHECK-DOT-NEXT: movi v1.2d, #0000000000000000
33
+ ; CHECK-DOT-NEXT: mov x8, xzr
34
+ ; CHECK-DOT-NEXT: .LBB1_1: // %vector.body
35
+ ; CHECK-DOT-NEXT: // =>This Inner Loop Header: Depth=1
36
+ ; CHECK-DOT-NEXT: ldr q2, [x0, x8]
37
+ ; CHECK-DOT-NEXT: ldr q3, [x1, x8]
38
+ ; CHECK-DOT-NEXT: mov v0.16b, v1.16b
39
+ ; CHECK-DOT-NEXT: add x8, x8, #16
40
+ ; CHECK-DOT-NEXT: udot v1.4s, v2.16b, v3.16b
41
+ ; CHECK-DOT-NEXT: cmp x8, #16
42
+ ; CHECK-DOT-NEXT: b.ne .LBB1_1
43
+ ; CHECK-DOT-NEXT: // %bb.2: // %end
44
+ ; CHECK-DOT-NEXT: ret
45
+ ;
46
+ ; CHECK-NODOT-LABEL: udot_in_loop:
47
+ ; CHECK-NODOT: // %bb.0: // %entry
48
+ ; CHECK-NODOT-NEXT: movi v1.2d, #0000000000000000
49
+ ; CHECK-NODOT-NEXT: mov x8, xzr
50
+ ; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
51
+ ; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
52
+ ; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
53
+ ; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
54
+ ; CHECK-NODOT-NEXT: add x8, x8, #16
55
+ ; CHECK-NODOT-NEXT: cmp x8, #16
56
+ ; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
57
+ ; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
58
+ ; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
59
+ ; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
60
+ ; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
61
+ ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
62
+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
63
+ ; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
64
+ ; CHECK-NODOT-NEXT: b.ne .LBB1_1
65
+ ; CHECK-NODOT-NEXT: // %bb.2: // %end
66
+ ; CHECK-NODOT-NEXT: ret
67
+ entry:
68
+ br label %vector.body
69
+
70
+ vector.body:
71
+ %index = phi i64 [ 0 , %entry ], [ %index.next , %vector.body ]
72
+ %acc = phi <4 x i32 > [ zeroinitializer , %entry ], [ %partial.reduce , %vector.body ]
73
+ %gep1 = getelementptr i8 , ptr %p1 , i64 %index
74
+ %load1 = load <16 x i8 >, ptr %gep1 , align 16
75
+ %load1.wide = zext <16 x i8 > %load1 to <16 x i32 >
76
+ %gep2 = getelementptr i8 , ptr %p2 , i64 %index
77
+ %load2 = load <16 x i8 >, ptr %gep2 , align 16
78
+ %load2.wide = zext <16 x i8 > %load2 to <16 x i32 >
79
+ %mul = mul nuw nsw <16 x i32 > %load1.wide , %load2.wide
80
+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %mul )
81
+ %index.next = add nuw i64 %index , 16
82
+ %cmp = icmp eq i64 %index.next , 16
83
+ br i1 %cmp , label %end , label %vector.body
84
+
85
+ end:
86
; Returns the phi %acc, not %partial.reduce; the checks copy v1 into v0 before
; v1 is overwritten. NOTE(review): presumably intentional -- confirm.
+ ret <4 x i32 > %acc
87
+ }
88
+
29
89
define <2 x i32 > @udot_narrow (<2 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) {
30
90
; CHECK-DOT-LABEL: udot_narrow:
31
91
; CHECK-DOT: // %bb.0:
@@ -128,6 +188,68 @@ define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
128
188
ret <4 x i32 > %partial.reduce
129
189
}
130
190
191
; usdot_in_loop: codegen test for a mixed-sign dot-product style partial
; reduction that lives inside a loop.
; Per iteration the IR loads <16 x i8> from %p1 and %p2 at byte offset %index,
; sign-extends the %p1 data and zero-extends the %p2 data to <16 x i32>,
; multiplies them, and adds the product into the <4 x i32> accumulator phi %acc
; through llvm.experimental.vector.partial.reduce.add.
; %index starts at 0, steps by 16, and the loop exits when %index.next == 16.
; Expected output: CHECK-I8MM has a single usdot whose first source is the
; vector loaded via x1 and whose second is the one loaded via x0; CHECK-NOI8MM
; has the sshll/ushll + smlal/smull expansion instead.
; NOTE(review): the RUN lines that select these prefixes are outside this hunk;
; presumably they differ in whether the i8mm feature is enabled.
+ define <4 x i32 > @usdot_in_loop (ptr %p1 , ptr %p2 ){
192
+ ; CHECK-NOI8MM-LABEL: usdot_in_loop:
193
+ ; CHECK-NOI8MM: // %bb.0: // %entry
194
+ ; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
195
+ ; CHECK-NOI8MM-NEXT: mov x8, xzr
196
+ ; CHECK-NOI8MM-NEXT: .LBB6_1: // %vector.body
197
+ ; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
198
+ ; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
199
+ ; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
200
+ ; CHECK-NOI8MM-NEXT: add x8, x8, #16
201
+ ; CHECK-NOI8MM-NEXT: cmp x8, #16
202
+ ; CHECK-NOI8MM-NEXT: sshll v3.8h, v0.8b, #0
203
+ ; CHECK-NOI8MM-NEXT: sshll2 v4.8h, v0.16b, #0
204
+ ; CHECK-NOI8MM-NEXT: ushll v5.8h, v2.8b, #0
205
+ ; CHECK-NOI8MM-NEXT: ushll2 v2.8h, v2.16b, #0
206
+ ; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
207
+ ; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
208
+ ; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
209
+ ; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
210
+ ; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
211
+ ; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
212
+ ; CHECK-NOI8MM-NEXT: b.ne .LBB6_1
213
+ ; CHECK-NOI8MM-NEXT: // %bb.2: // %end
214
+ ; CHECK-NOI8MM-NEXT: ret
215
+ ;
216
+ ; CHECK-I8MM-LABEL: usdot_in_loop:
217
+ ; CHECK-I8MM: // %bb.0: // %entry
218
+ ; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
219
+ ; CHECK-I8MM-NEXT: mov x8, xzr
220
+ ; CHECK-I8MM-NEXT: .LBB6_1: // %vector.body
221
+ ; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
222
+ ; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
223
+ ; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
224
+ ; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
225
+ ; CHECK-I8MM-NEXT: add x8, x8, #16
226
+ ; CHECK-I8MM-NEXT: usdot v1.4s, v3.16b, v2.16b
227
+ ; CHECK-I8MM-NEXT: cmp x8, #16
228
+ ; CHECK-I8MM-NEXT: b.ne .LBB6_1
229
+ ; CHECK-I8MM-NEXT: // %bb.2: // %end
230
+ ; CHECK-I8MM-NEXT: ret
231
+ entry:
232
+ br label %vector.body
233
+
234
+ vector.body:
235
+ %index = phi i64 [ 0 , %entry ], [ %index.next , %vector.body ]
236
+ %acc = phi <4 x i32 > [ zeroinitializer , %entry ], [ %partial.reduce , %vector.body ]
237
+ %gep1 = getelementptr i8 , ptr %p1 , i64 %index
238
+ %load1 = load <16 x i8 >, ptr %gep1 , align 16
239
+ %load1.wide = sext <16 x i8 > %load1 to <16 x i32 >
240
+ %gep2 = getelementptr i8 , ptr %p2 , i64 %index
241
+ %load2 = load <16 x i8 >, ptr %gep2 , align 16
242
+ %load2.wide = zext <16 x i8 > %load2 to <16 x i32 >
243
+ %mul = mul nuw nsw <16 x i32 > %load1.wide , %load2.wide
244
+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %mul )
245
+ %index.next = add nuw i64 %index , 16
246
+ %cmp = icmp eq i64 %index.next , 16
247
+ br i1 %cmp , label %end , label %vector.body
248
+
249
+ end:
250
; Returns the phi %acc, not %partial.reduce; the checks copy v1 into v0 before
; v1 is overwritten. NOTE(review): presumably intentional -- confirm.
+ ret <4 x i32 > %acc
251
+ }
252
+
131
253
define <2 x i32 > @usdot_narrow (<2 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
132
254
; CHECK-NOI8MM-LABEL: usdot_narrow:
133
255
; CHECK-NOI8MM: // %bb.0:
@@ -175,13 +297,75 @@ define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0{
175
297
; CHECK-I8MM: // %bb.0:
176
298
; CHECK-I8MM-NEXT: usdot v0.4s, v2.16b, v1.16b
177
299
; CHECK-I8MM-NEXT: ret
178
- %u .wide = sext <16 x i8 > %u to <16 x i32 >
179
- %s .wide = zext <16 x i8 > %s to <16 x i32 >
180
- %mult = mul nuw nsw <16 x i32 > %s .wide , %u .wide
300
+ %s .wide = sext <16 x i8 > %u to <16 x i32 >
301
+ %u .wide = zext <16 x i8 > %s to <16 x i32 >
302
+ %mult = mul nuw nsw <16 x i32 > %u .wide , %s .wide
181
303
%partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %mult )
182
304
ret <4 x i32 > %partial.reduce
183
305
}
184
306
307
; sudot_in_loop: codegen test for a mixed-sign dot-product style partial
; reduction that lives inside a loop, with the extension kinds swapped relative
; to a sext(%p1) * zext(%p2) form.
; Per iteration the IR loads <16 x i8> from %p1 and %p2 at byte offset %index,
; zero-extends the %p1 data and sign-extends the %p2 data to <16 x i32>,
; multiplies them, and adds the product into the <4 x i32> accumulator phi %acc
; through llvm.experimental.vector.partial.reduce.add.
; %index starts at 0, steps by 16, and the loop exits when %index.next == 16.
; Expected output: CHECK-I8MM has a single usdot whose first source is the
; vector loaded via x0 and whose second is the one loaded via x1; CHECK-NOI8MM
; has the ushll/sshll + smlal/smull expansion instead.
; NOTE(review): the RUN lines that select these prefixes are outside this hunk;
; presumably they differ in whether the i8mm feature is enabled.
+ define <4 x i32 > @sudot_in_loop (ptr %p1 , ptr %p2 ){
308
+ ; CHECK-NOI8MM-LABEL: sudot_in_loop:
309
+ ; CHECK-NOI8MM: // %bb.0: // %entry
310
+ ; CHECK-NOI8MM-NEXT: movi v1.2d, #0000000000000000
311
+ ; CHECK-NOI8MM-NEXT: mov x8, xzr
312
+ ; CHECK-NOI8MM-NEXT: .LBB9_1: // %vector.body
313
+ ; CHECK-NOI8MM-NEXT: // =>This Inner Loop Header: Depth=1
314
+ ; CHECK-NOI8MM-NEXT: ldr q0, [x0, x8]
315
+ ; CHECK-NOI8MM-NEXT: ldr q2, [x1, x8]
316
+ ; CHECK-NOI8MM-NEXT: add x8, x8, #16
317
+ ; CHECK-NOI8MM-NEXT: cmp x8, #16
318
+ ; CHECK-NOI8MM-NEXT: ushll v3.8h, v0.8b, #0
319
+ ; CHECK-NOI8MM-NEXT: ushll2 v4.8h, v0.16b, #0
320
+ ; CHECK-NOI8MM-NEXT: sshll v5.8h, v2.8b, #0
321
+ ; CHECK-NOI8MM-NEXT: sshll2 v2.8h, v2.16b, #0
322
+ ; CHECK-NOI8MM-NEXT: mov v0.16b, v1.16b
323
+ ; CHECK-NOI8MM-NEXT: smlal v1.4s, v3.4h, v5.4h
324
+ ; CHECK-NOI8MM-NEXT: smull v6.4s, v4.4h, v2.4h
325
+ ; CHECK-NOI8MM-NEXT: smlal2 v1.4s, v4.8h, v2.8h
326
+ ; CHECK-NOI8MM-NEXT: smlal2 v6.4s, v3.8h, v5.8h
327
+ ; CHECK-NOI8MM-NEXT: add v1.4s, v6.4s, v1.4s
328
+ ; CHECK-NOI8MM-NEXT: b.ne .LBB9_1
329
+ ; CHECK-NOI8MM-NEXT: // %bb.2: // %end
330
+ ; CHECK-NOI8MM-NEXT: ret
331
+ ;
332
+ ; CHECK-I8MM-LABEL: sudot_in_loop:
333
+ ; CHECK-I8MM: // %bb.0: // %entry
334
+ ; CHECK-I8MM-NEXT: movi v1.2d, #0000000000000000
335
+ ; CHECK-I8MM-NEXT: mov x8, xzr
336
+ ; CHECK-I8MM-NEXT: .LBB9_1: // %vector.body
337
+ ; CHECK-I8MM-NEXT: // =>This Inner Loop Header: Depth=1
338
+ ; CHECK-I8MM-NEXT: ldr q2, [x0, x8]
339
+ ; CHECK-I8MM-NEXT: ldr q3, [x1, x8]
340
+ ; CHECK-I8MM-NEXT: mov v0.16b, v1.16b
341
+ ; CHECK-I8MM-NEXT: add x8, x8, #16
342
+ ; CHECK-I8MM-NEXT: usdot v1.4s, v2.16b, v3.16b
343
+ ; CHECK-I8MM-NEXT: cmp x8, #16
344
+ ; CHECK-I8MM-NEXT: b.ne .LBB9_1
345
+ ; CHECK-I8MM-NEXT: // %bb.2: // %end
346
+ ; CHECK-I8MM-NEXT: ret
347
+ entry:
348
+ br label %vector.body
349
+
350
+ vector.body:
351
+ %index = phi i64 [ 0 , %entry ], [ %index.next , %vector.body ]
352
+ %acc = phi <4 x i32 > [ zeroinitializer , %entry ], [ %partial.reduce , %vector.body ]
353
+ %gep1 = getelementptr i8 , ptr %p1 , i64 %index
354
+ %load1 = load <16 x i8 >, ptr %gep1 , align 16
355
+ %load1.wide = zext <16 x i8 > %load1 to <16 x i32 >
356
+ %gep2 = getelementptr i8 , ptr %p2 , i64 %index
357
+ %load2 = load <16 x i8 >, ptr %gep2 , align 16
358
+ %load2.wide = sext <16 x i8 > %load2 to <16 x i32 >
359
+ %mul = mul nuw nsw <16 x i32 > %load1.wide , %load2.wide
360
+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %mul )
361
+ %index.next = add nuw i64 %index , 16
362
+ %cmp = icmp eq i64 %index.next , 16
363
+ br i1 %cmp , label %end , label %vector.body
364
+
365
+ end:
366
; Returns the phi %acc, not %partial.reduce; the checks copy v1 into v0 before
; v1 is overwritten. NOTE(review): presumably intentional -- confirm.
+ ret <4 x i32 > %acc
367
+ }
368
+
185
369
define <2 x i32 > @sudot_narrow (<2 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
186
370
; CHECK-NOI8MM-LABEL: sudot_narrow:
187
371
; CHECK-NOI8MM: // %bb.0:
@@ -389,6 +573,54 @@ define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a){
389
573
ret <4 x i32 > %partial.reduce
390
574
}
391
575
576
; udot_no_bin_op_in_loop: codegen test for a partial reduction inside a loop
; whose input is a plain zero-extension, with no multiply feeding it.
; Per iteration the IR loads <16 x i8> from %p at byte offset %index,
; zero-extends it to <16 x i32>, and adds it into the <4 x i32> accumulator phi
; %acc through llvm.experimental.vector.partial.reduce.add.
; %index starts at 0, steps by 16, and the loop exits when %index.next == 16.
; Expected output (single CHECK prefix): four constant-pool vectors are loaded
; before the loop, and the loop body uses four tbl lookups followed by vector
; adds; no dot-product instruction appears in the checks.
+ define <4 x i32 > @udot_no_bin_op_in_loop (ptr %p ){
577
+ ; CHECK-LABEL: udot_no_bin_op_in_loop:
578
+ ; CHECK: // %bb.0: // %entry
579
+ ; CHECK-NEXT: adrp x8, .LCPI16_0
580
+ ; CHECK-NEXT: movi v4.2d, #0000000000000000
581
+ ; CHECK-NEXT: adrp x9, .LCPI16_2
582
+ ; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI16_0]
583
+ ; CHECK-NEXT: adrp x8, .LCPI16_1
584
+ ; CHECK-NEXT: adrp x10, .LCPI16_3
585
+ ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI16_1]
586
+ ; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI16_2]
587
+ ; CHECK-NEXT: ldr q5, [x10, :lo12:.LCPI16_3]
588
+ ; CHECK-NEXT: mov x8, xzr
589
+ ; CHECK-NEXT: .LBB16_1: // %vector.body
590
+ ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
591
+ ; CHECK-NEXT: ldr q6, [x0, x8]
592
+ ; CHECK-NEXT: mov v0.16b, v4.16b
593
+ ; CHECK-NEXT: add x8, x8, #16
594
+ ; CHECK-NEXT: cmp x8, #16
595
+ ; CHECK-NEXT: tbl v7.16b, { v6.16b }, v2.16b
596
+ ; CHECK-NEXT: tbl v4.16b, { v6.16b }, v1.16b
597
+ ; CHECK-NEXT: tbl v16.16b, { v6.16b }, v3.16b
598
+ ; CHECK-NEXT: tbl v6.16b, { v6.16b }, v5.16b
599
+ ; CHECK-NEXT: add v7.4s, v0.4s, v7.4s
600
+ ; CHECK-NEXT: add v6.4s, v6.4s, v16.4s
601
+ ; CHECK-NEXT: add v4.4s, v4.4s, v7.4s
602
+ ; CHECK-NEXT: add v4.4s, v6.4s, v4.4s
603
+ ; CHECK-NEXT: b.ne .LBB16_1
604
+ ; CHECK-NEXT: // %bb.2: // %end
605
+ ; CHECK-NEXT: ret
606
+ entry:
607
+ br label %vector.body
608
+
609
+ vector.body:
610
+ %index = phi i64 [ 0 , %entry ], [ %index.next , %vector.body ]
611
+ %acc = phi <4 x i32 > [ zeroinitializer , %entry ], [ %partial.reduce , %vector.body ]
612
+ %gep = getelementptr i8 , ptr %p , i64 %index
613
+ %load = load <16 x i8 >, ptr %gep , align 16
614
+ %load.wide = zext <16 x i8 > %load to <16 x i32 >
615
+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %load.wide )
616
+ %index.next = add nuw i64 %index , 16
617
+ %cmp = icmp eq i64 %index.next , 16
618
+ br i1 %cmp , label %end , label %vector.body
619
+
620
+ end:
621
; Returns the phi %acc, not %partial.reduce; the checks copy v4 into v0 before
; v4 is overwritten. NOTE(review): presumably intentional -- confirm.
+ ret <4 x i32 > %acc
622
+ }
623
+
392
624
define <4 x i32 > @sdot_no_bin_op (<4 x i32 > %acc , <16 x i8 > %a ){
393
625
; CHECK-DOT-LABEL: sdot_no_bin_op:
394
626
; CHECK-DOT: // %bb.0:
0 commit comments