[AArch64][SVE] Add dot product codegen for partial reductions with no binary operation on input
Add codegen for partial reductions where the input type has four times as many elements as the output type and the input to the partial reduction does not have a binary operation performed on it.
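In effect, the lowering treats the bare extend as a multiply by one: a splat of 1 is materialised as the second multiplicand so the reduction can still be selected to the UDOT/SDOT instructions. As a rough sketch of the pattern (mirroring the udot_no_bin_op test below), IR of the form

  ; partial reduction whose input is only extended, with no multiply
  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %r = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)

now selects to a single dot product against a constant-1 vector (register assignments as in the test below, where z0 holds %acc and z1 holds %a):

  mov  z2.b, #1         // splat of 1 stands in for the missing multiplicand
  udot z0.s, z1.b, z2.b // each 32-bit lane of z0 accumulates four byte products

For the i8-to-i64 tests there is no single dot-product instruction, so the dot product is performed into a zeroed 32-bit accumulator and the 32-bit partial sums are then unpacked to 64 bits and added to the accumulator registers.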
llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll: 78 additions & 0 deletions
@@ -316,6 +316,84 @@ entry:
   ret <vscale x 4 x i64> %partial.reduce
 }
 
+define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: udot_no_bin_op:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-NEXT:    udot z0.s, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: sdot_no_bin_op:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z2.b, #1 // =0x1
+; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
+; CHECK-NEXT:    ret
+  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
+  ret <vscale x 4 x i32> %partial.reduce
+}
+
+define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
+; CHECK-LABEL: udot_no_bin_op_wide:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-NEXT:    udot z0.d, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
+; CHECK-LABEL: sdot_no_bin_op_wide:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z2.h, #1 // =0x1
+; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
+; CHECK-NEXT:    ret
+entry:
+  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
+  ret <vscale x 2 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: udot_no_bin_op_8to64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-NEXT:    udot z4.s, z2.b, z3.b
+; CHECK-NEXT:    sunpklo z2.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z4.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    ret
+  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
+  ret <vscale x 4 x i64> %partial.reduce
+}
+
+define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
+; CHECK-LABEL: sdot_no_bin_op_8to64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.b, #1 // =0x1
+; CHECK-NEXT:    mov z4.s, #0 // =0x0
+; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
+; CHECK-NEXT:    sunpklo z2.d, z4.s
+; CHECK-NEXT:    sunpkhi z3.d, z4.s
+; CHECK-NEXT:    add z0.d, z0.d, z2.d
+; CHECK-NEXT:    add z1.d, z1.d, z3.d
+; CHECK-NEXT:    ret
+  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
+  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
+  ret <vscale x 4 x i64> %partial.reduce
+}
+
 define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {