@@ -367,6 +367,166 @@ entry:
367
367
ret <4 x i64 > %partial.reduce
368
368
}
369
369
370
; Partial-reduction add with no multiply feeding it: %a is zero-extended from
; <16 x i8> to <16 x i32> and passed, together with %acc, directly to the
; partial.reduce.add intrinsic.
; The DOT expectations are a single udot of %a against an all-ones byte vector
; (movi #1); the NODOT expectations are a ushll/uaddw widening-add sequence.
; NOTE(review): the RUN lines that define the two check prefixes are outside
; this view - confirm which target features each prefix corresponds to.
+ define <4 x i32 > @udot_no_bin_op (<4 x i32 > %acc , <16 x i8 > %a ){
371
+ ; CHECK-DOT-LABEL: udot_no_bin_op:
372
+ ; CHECK-DOT: // %bb.0:
373
+ ; CHECK-DOT-NEXT: movi v2.16b, #1
374
+ ; CHECK-DOT-NEXT: udot v0.4s, v1.16b, v2.16b
375
+ ; CHECK-DOT-NEXT: ret
376
+ ;
377
+ ; CHECK-NODOT-LABEL: udot_no_bin_op:
378
+ ; CHECK-NODOT: // %bb.0:
379
+ ; CHECK-NODOT-NEXT: ushll v2.8h, v1.8b, #0
380
+ ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
381
+ ; CHECK-NODOT-NEXT: ushll v3.4s, v1.4h, #0
382
+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v2.4h
383
+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v3.4s, v2.8h
384
+ ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
385
+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
386
+ ; CHECK-NODOT-NEXT: ret
387
+ %a.wide = zext <16 x i8 > %a to <16 x i32 >
388
+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %a.wide )
389
+ ret <4 x i32 > %partial.reduce
390
+ }
391
+
392
; Signed counterpart of the no-multiply partial reduction: %a is sign-extended
; from <16 x i8> to <16 x i32> and passed, together with %acc, directly to the
; partial.reduce.add intrinsic.
; The DOT expectations are a single sdot of %a against an all-ones byte vector
; (movi #1); the NODOT expectations are an sshll/saddw widening-add sequence.
; NOTE(review): the RUN lines that define the two check prefixes are outside
; this view - confirm which target features each prefix corresponds to.
+ define <4 x i32 > @sdot_no_bin_op (<4 x i32 > %acc , <16 x i8 > %a ){
393
+ ; CHECK-DOT-LABEL: sdot_no_bin_op:
394
+ ; CHECK-DOT: // %bb.0:
395
+ ; CHECK-DOT-NEXT: movi v2.16b, #1
396
+ ; CHECK-DOT-NEXT: sdot v0.4s, v1.16b, v2.16b
397
+ ; CHECK-DOT-NEXT: ret
398
+ ;
399
+ ; CHECK-NODOT-LABEL: sdot_no_bin_op:
400
+ ; CHECK-NODOT: // %bb.0:
401
+ ; CHECK-NODOT-NEXT: sshll v2.8h, v1.8b, #0
402
+ ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
403
+ ; CHECK-NODOT-NEXT: sshll v3.4s, v1.4h, #0
404
+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v2.4h
405
+ ; CHECK-NODOT-NEXT: saddw2 v2.4s, v3.4s, v2.8h
406
+ ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
407
+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
408
+ ; CHECK-NODOT-NEXT: ret
409
+ %a.wide = sext <16 x i8 > %a to <16 x i32 >
410
+ %partial.reduce = tail call <4 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<4 x i32 > %acc , <16 x i32 > %a.wide )
411
+ ret <4 x i32 > %partial.reduce
412
+ }
413
+
414
; Narrow (64-bit vector) variant of the unsigned no-multiply partial reduction:
; %a is zero-extended from <8 x i8> to <8 x i32> and reduced into the
; <2 x i32> accumulator %acc.
; The DOT expectations use the 8-byte forms (movi v2.8b / udot v0.2s); the
; NODOT expectations widen with ushll/ushll2, shuffle halves with ext, and
; finish with 2s adds.
+ define <2 x i32 > @udot_no_bin_op_narrow (<2 x i32 > %acc , <8 x i8 > %a ){
415
+ ; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
416
+ ; CHECK-DOT: // %bb.0:
417
+ ; CHECK-DOT-NEXT: movi v2.8b, #1
418
+ ; CHECK-DOT-NEXT: udot v0.2s, v1.8b, v2.8b
419
+ ; CHECK-DOT-NEXT: ret
420
+ ;
421
+ ; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
422
+ ; CHECK-NODOT: // %bb.0:
423
+ ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
424
+ ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
425
+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
426
+ ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
427
+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
428
+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
429
+ ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
430
+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
431
+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
432
+ ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
433
+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
434
+ ; CHECK-NODOT-NEXT: ret
435
+ %a.wide = zext <8 x i8 > %a to <8 x i32 >
436
+ %partial.reduce = tail call <2 x i32 > @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32 (<2 x i32 > %acc , <8 x i32 > %a.wide )
437
+ ret <2 x i32 > %partial.reduce
438
+ }
439
+
440
; Narrow (64-bit vector) variant of the signed no-multiply partial reduction:
; %a is sign-extended from <8 x i8> to <8 x i32> and reduced into the
; <2 x i32> accumulator %acc.
; The DOT expectations use the 8-byte forms (movi v2.8b / sdot v0.2s); the
; NODOT expectations widen with sshll/sshll2, shuffle halves with ext, and
; finish with 2s adds.
+ define <2 x i32 > @sdot_no_bin_op_narrow (<2 x i32 > %acc , <8 x i8 > %a ){
441
+ ; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
442
+ ; CHECK-DOT: // %bb.0:
443
+ ; CHECK-DOT-NEXT: movi v2.8b, #1
444
+ ; CHECK-DOT-NEXT: sdot v0.2s, v1.8b, v2.8b
445
+ ; CHECK-DOT-NEXT: ret
446
+ ;
447
+ ; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
448
+ ; CHECK-NODOT: // %bb.0:
449
+ ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
450
+ ; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
451
+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
452
+ ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
453
+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
454
+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
455
+ ; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
456
+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
457
+ ; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
458
+ ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
459
+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
460
+ ; CHECK-NODOT-NEXT: ret
461
+ %a.wide = sext <8 x i8 > %a to <8 x i32 >
462
+ %partial.reduce = tail call <2 x i32 > @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32 (<2 x i32 > %acc , <8 x i32 > %a.wide )
463
+ ret <2 x i32 > %partial.reduce
464
+ }
465
+
466
; Unsigned no-multiply partial reduction from i8 all the way to i64: %a is
; zero-extended from <16 x i8> to <16 x i64> and reduced into the <4 x i64>
; accumulator %acc (held in two q registers, v0 and v1).
; The DOT expectations run udot into a zeroed 4s temporary (v4) against an
; all-ones byte vector, then widen that temporary into the two accumulator
; halves. The NODOT expectations widen step by step with ushll/uaddw/uaddl.
; NOTE(review): the DOT sequence widens the udot result with the signed
; saddw/saddw2 forms. With a zeroed temporary and an all-ones multiplier each
; 32-bit lane should be at most 4*255, so sign- and zero-extension would
; agree - confirm against the Arm UDOT description.
+ define <4 x i64 > @udot_no_bin_op_8to64 (<4 x i64 > %acc , <16 x i8 > %a ){
467
+ ; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
468
+ ; CHECK-DOT: // %bb.0:
469
+ ; CHECK-DOT-NEXT: movi v3.16b, #1
470
+ ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
471
+ ; CHECK-DOT-NEXT: udot v4.4s, v2.16b, v3.16b
472
+ ; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
473
+ ; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
474
+ ; CHECK-DOT-NEXT: ret
475
+ ;
476
+ ; CHECK-NODOT-LABEL: udot_no_bin_op_8to64:
477
+ ; CHECK-NODOT: // %bb.0:
478
+ ; CHECK-NODOT-NEXT: ushll v3.8h, v2.8b, #0
479
+ ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
480
+ ; CHECK-NODOT-NEXT: ushll v4.4s, v3.4h, #0
481
+ ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
482
+ ; CHECK-NODOT-NEXT: ushll2 v3.4s, v3.8h, #0
483
+ ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
484
+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v4.4s
485
+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v4.2s
486
+ ; CHECK-NODOT-NEXT: uaddl2 v4.2d, v3.4s, v5.4s
487
+ ; CHECK-NODOT-NEXT: uaddl v3.2d, v3.2s, v5.2s
488
+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
489
+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
490
+ ; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
491
+ ; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
492
+ ; CHECK-NODOT-NEXT: ret
493
+ %a.wide = zext <16 x i8 > %a to <16 x i64 >
494
+ %partial.reduce = tail call <4 x i64 > @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64 (<4 x i64 > %acc , <16 x i64 > %a.wide )
495
+ ret <4 x i64 > %partial.reduce
496
+ }
497
+
498
; Signed no-multiply partial reduction from i8 all the way to i64: %a is
; sign-extended from <16 x i8> to <16 x i64> and reduced into the <4 x i64>
; accumulator %acc (held in two q registers, v0 and v1).
; The DOT expectations run sdot into a zeroed 4s temporary (v4) against an
; all-ones byte vector, then widen that temporary into the two accumulator
; halves with saddw/saddw2. The NODOT expectations widen step by step with
; sshll/saddw/saddl.
+ define <4 x i64 > @sdot_no_bin_op_8to64 (<4 x i64 > %acc , <16 x i8 > %a ){
499
+ ; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
500
+ ; CHECK-DOT: // %bb.0:
501
+ ; CHECK-DOT-NEXT: movi v3.16b, #1
502
+ ; CHECK-DOT-NEXT: movi v4.2d, #0000000000000000
503
+ ; CHECK-DOT-NEXT: sdot v4.4s, v2.16b, v3.16b
504
+ ; CHECK-DOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
505
+ ; CHECK-DOT-NEXT: saddw v0.2d, v0.2d, v4.2s
506
+ ; CHECK-DOT-NEXT: ret
507
+ ;
508
+ ; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64:
509
+ ; CHECK-NODOT: // %bb.0:
510
+ ; CHECK-NODOT-NEXT: sshll v3.8h, v2.8b, #0
511
+ ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
512
+ ; CHECK-NODOT-NEXT: sshll v4.4s, v3.4h, #0
513
+ ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
514
+ ; CHECK-NODOT-NEXT: sshll2 v3.4s, v3.8h, #0
515
+ ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
516
+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v4.4s
517
+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v4.2s
518
+ ; CHECK-NODOT-NEXT: saddl2 v4.2d, v3.4s, v5.4s
519
+ ; CHECK-NODOT-NEXT: saddl v3.2d, v3.2s, v5.2s
520
+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
521
+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
522
+ ; CHECK-NODOT-NEXT: add v1.2d, v4.2d, v1.2d
523
+ ; CHECK-NODOT-NEXT: add v0.2d, v3.2d, v0.2d
524
+ ; CHECK-NODOT-NEXT: ret
525
+ %a.wide = sext <16 x i8 > %a to <16 x i64 >
526
+ %partial.reduce = tail call <4 x i64 > @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64 (<4 x i64 > %acc , <16 x i64 > %a.wide )
527
+ ret <4 x i64 > %partial.reduce
528
+ }
529
+
370
530
define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
371
531
; CHECK-LABEL: not_udot:
372
532
; CHECK: // %bb.0:
@@ -398,3 +558,91 @@ define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
398
558
%partial.reduce = tail call <2 x i32 > @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32 (<2 x i32 > %acc , <4 x i32 > %mult )
399
559
ret <2 x i32 > %partial.reduce
400
560
}
561
+
562
; Multiply-then-partial-reduce where the two multiplicands start at different
; element widths: %a is zero-extended from i16 and %b from i8, both to i64.
; The expectations use the shared check prefix and contain no dot-product
; instruction: the inputs are widened with ushll/ushll2 and combined with
; umull/umlal/umlal2 before a final 2d add.
+ define <2 x i64 > @udot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
563
+ ; CHECK-LABEL: udot_different_types:
564
+ ; CHECK: // %bb.0: // %entry
565
+ ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
566
+ ; CHECK-NEXT: ushll v3.4s, v1.4h, #0
567
+ ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
568
+ ; CHECK-NEXT: ushll v4.4s, v2.4h, #0
569
+ ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
570
+ ; CHECK-NEXT: umull v5.2d, v1.2s, v2.2s
571
+ ; CHECK-NEXT: umlal v0.2d, v3.2s, v4.2s
572
+ ; CHECK-NEXT: umlal2 v0.2d, v1.4s, v2.4s
573
+ ; CHECK-NEXT: umlal2 v5.2d, v3.4s, v4.4s
574
+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
575
+ ; CHECK-NEXT: ret
576
+ entry:
577
+ %a.wide = zext <8 x i16 > %a to <8 x i64 >
578
+ %b.wide = zext <8 x i8 > %b to <8 x i64 >
579
+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
580
+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
581
+ ret <2 x i64 > %partial.reduce
582
+ }
583
+
584
; Signed multiply-then-partial-reduce where the two multiplicands start at
; different element widths: %a is sign-extended from i16 and %b from i8, both
; to i64.
; The expectations use the shared check prefix and contain no dot-product
; instruction: the inputs are widened with sshll/sshll2 and combined with
; smull/smlal/smlal2 before a final 2d add.
; NOTE(review): the multiply carries nuw although both operands are
; sign-extended; per the LangRef a negative operand would presumably make the
; product poison under nuw - confirm the flags are intended.
+ define <2 x i64 > @sdot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
585
+ ; CHECK-LABEL: sdot_different_types:
586
+ ; CHECK: // %bb.0: // %entry
587
+ ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
588
+ ; CHECK-NEXT: sshll v3.4s, v1.4h, #0
589
+ ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
590
+ ; CHECK-NEXT: sshll v4.4s, v2.4h, #0
591
+ ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
592
+ ; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
593
+ ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
594
+ ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
595
+ ; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
596
+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
597
+ ; CHECK-NEXT: ret
598
+ entry:
599
+ %a.wide = sext <8 x i16 > %a to <8 x i64 >
600
+ %b.wide = sext <8 x i8 > %b to <8 x i64 >
601
+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
602
+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
603
+ ret <2 x i64 > %partial.reduce
604
+ }
605
+
606
; Mixed-sign multiply-then-partial-reduce with different source widths: %a is
; zero-extended from i16 and %b is sign-extended from i8, both to i64.
; The expectations use the shared check prefix and contain no dot-product
; instruction: %a is widened with ushll/ushll2, %b with sshll/sshll2, and the
; products are formed with the signed smull/smlal/smlal2 forms.
; NOTE(review): the multiply carries nuw although %b is sign-extended; per the
; LangRef a negative %b element would presumably make the product poison
; under nuw - confirm the flags are intended.
+ define <2 x i64 > @usdot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
607
+ ; CHECK-LABEL: usdot_different_types:
608
+ ; CHECK: // %bb.0: // %entry
609
+ ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
610
+ ; CHECK-NEXT: ushll v3.4s, v1.4h, #0
611
+ ; CHECK-NEXT: ushll2 v1.4s, v1.8h, #0
612
+ ; CHECK-NEXT: sshll v4.4s, v2.4h, #0
613
+ ; CHECK-NEXT: sshll2 v2.4s, v2.8h, #0
614
+ ; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
615
+ ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
616
+ ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
617
+ ; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
618
+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
619
+ ; CHECK-NEXT: ret
620
+ entry:
621
+ %a.wide = zext <8 x i16 > %a to <8 x i64 >
622
+ %b.wide = sext <8 x i8 > %b to <8 x i64 >
623
+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
624
+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
625
+ ret <2 x i64 > %partial.reduce
626
+ }
627
+
628
; Mixed-sign multiply-then-partial-reduce with different source widths: %a is
; sign-extended from i16 and %b is zero-extended from i8, both to i64.
; The expectations use the shared check prefix and contain no dot-product
; instruction: %a is widened with sshll/sshll2, %b with ushll/ushll2, and the
; products are formed with the signed smull/smlal/smlal2 forms.
; NOTE(review): the multiply carries nuw although %a is sign-extended; per the
; LangRef a negative %a element would presumably make the product poison
; under nuw - confirm the flags are intended.
+ define <2 x i64 > @sudot_different_types (<2 x i64 > %acc , <8 x i16 > %a , <8 x i8 > %b ){
629
+ ; CHECK-LABEL: sudot_different_types:
630
+ ; CHECK: // %bb.0: // %entry
631
+ ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
632
+ ; CHECK-NEXT: sshll v3.4s, v1.4h, #0
633
+ ; CHECK-NEXT: sshll2 v1.4s, v1.8h, #0
634
+ ; CHECK-NEXT: ushll v4.4s, v2.4h, #0
635
+ ; CHECK-NEXT: ushll2 v2.4s, v2.8h, #0
636
+ ; CHECK-NEXT: smull v5.2d, v1.2s, v2.2s
637
+ ; CHECK-NEXT: smlal v0.2d, v3.2s, v4.2s
638
+ ; CHECK-NEXT: smlal2 v0.2d, v1.4s, v2.4s
639
+ ; CHECK-NEXT: smlal2 v5.2d, v3.4s, v4.4s
640
+ ; CHECK-NEXT: add v0.2d, v5.2d, v0.2d
641
+ ; CHECK-NEXT: ret
642
+ entry:
643
+ %a.wide = sext <8 x i16 > %a to <8 x i64 >
644
+ %b.wide = zext <8 x i8 > %b to <8 x i64 >
645
+ %mult = mul nuw nsw <8 x i64 > %a.wide , %b.wide
646
+ %partial.reduce = tail call <2 x i64 > @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64 (<2 x i64 > %acc , <8 x i64 > %mult )
647
+ ret <2 x i64 > %partial.reduce
648
+ }
0 commit comments