@@ -530,3 +530,76 @@ entry:
   %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
   ret i32 %sum
 }
+
+
+define <4 x i32> @vqdot_vv_partial_reduce(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vqdot_vv_partial_reduce:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v14, v9
+; CHECK-NEXT:    vwmul.vv v8, v12, v14
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v12, v8, 12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v16, v12, v8
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v12, v8, 8
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v8, 4
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v8, v12
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> zeroinitializer, <16 x i32> %mul)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @vqdot_vv_partial_reduce2(<16 x i8> %a, <16 x i8> %b, <4 x i32> %accum) {
+; CHECK-LABEL: vqdot_vv_partial_reduce2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v16, v8
+; CHECK-NEXT:    vsext.vf2 v18, v9
+; CHECK-NEXT:    vwmul.vv v12, v16, v18
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v16, v10, v12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v12, 12
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v16, v8, v16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT:    vslidedown.vi v8, v12, 8
+; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v12, 4
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v8
+; CHECK-NEXT:    vadd.vv v8, v8, v16
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %res = call <4 x i32> @llvm.experimental.vector.partial.reduce.add(<4 x i32> %accum, <16 x i32> %mul)
+  ret <4 x i32> %res
+}
+
+define <16 x i32> @vqdot_vv_partial_reduce3(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: vqdot_vv_partial_reduce3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v14, v9
+; CHECK-NEXT:    vwmul.vv v8, v12, v14
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %res = call <16 x i32> @llvm.experimental.vector.partial.reduce.add.nvx8i32.nvx16i32.nvx16i32(<16 x i32> %mul, <16 x i32> zeroinitializer)
+  ret <16 x i32> %res
+}