@@ -2540,6 +2540,221 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg
2540
2540
ret <4 x float > %result
2541
2541
}
2542
2542
2543
+ ; --------------------------------------------------------------------
2544
+ ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8
2545
+ ; --------------------------------------------------------------------
2546
+
2547
+ ; Signature: returns <4 x float>; takes <4 x i32>, <8 x i32>, <4 x float>, i32,
+ ; followed by two i32 immarg operands. In the expected assembly of the tests in
+ ; this section the two immediates are printed as cbsz: and abid: respectively
+ ; (no modifier is printed when both are 0).
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2548
+
2549
+ ; Kernel variant: the <4 x float> third operand is loaded from %arg at the
+ ; element selected by workitem.id.x, and the intrinsic result is stored back
+ ; to the base of %arg. The immediates 1 and 2 are expected to be printed as
+ ; cbsz:1 abid:2 by both SDAG and GISEL.
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_bf8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2550
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2551
+ ; SDAG: ; %bb.0: ; %bb
2552
+ ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2553
+ ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2554
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2555
+ ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2556
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2557
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2558
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2559
+ ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2560
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2561
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s8
2562
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s9
2563
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s10
2564
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s11
2565
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s12
2566
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s13
2567
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s14
2568
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s15
2569
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2570
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s0
2571
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s1
2572
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s2
2573
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s3
2574
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2575
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2576
+ ; SDAG-NEXT: s_nop 0
2577
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2578
+ ; SDAG-NEXT: s_nop 6
2579
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2580
+ ; SDAG-NEXT: s_endpgm
2581
+ ;
2582
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__vgpr:
2583
+ ; GISEL: ; %bb.0: ; %bb
2584
+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2585
+ ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2586
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2587
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2588
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2589
+ ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2590
+ ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2591
+ ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2592
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2593
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2594
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2595
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2596
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2597
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2598
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2599
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s2
2600
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2601
+ ; GISEL-NEXT: s_nop 0
2602
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2603
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2604
+ ; GISEL-NEXT: s_nop 5
2605
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2606
+ ; GISEL-NEXT: s_endpgm
2607
+ bb:
2608
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2609
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2610
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2611
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2612
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2613
+ ret void
2614
+ }
2615
+
2616
+ ; Non-kernel variant with both immediates set to 0: the expected smfmac lines
+ ; carry no cbsz/abid modifiers. The SDAG checks route the <4 x float> operand
+ ; through a[0:3] (v_accvgpr_write before, v_accvgpr_read after), while the
+ ; GISEL checks operate on v[12:15] and copy the result to v0-v3.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2617
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2618
+ ; SDAG: ; %bb.0:
2619
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2620
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2621
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2622
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2623
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2624
+ ; SDAG-NEXT: s_nop 1
2625
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16
2626
+ ; SDAG-NEXT: s_nop 6
2627
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2628
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2629
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2630
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2631
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2632
+ ;
2633
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8:
2634
+ ; GISEL: ; %bb.0:
2635
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2636
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16
2637
+ ; GISEL-NEXT: s_nop 6
2638
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2639
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2640
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2641
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2642
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2643
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2644
+ ret <4 x float > %result
2645
+ }
2646
+
2647
+ ; Same shape as the variant with zero immediates, but the call passes the
+ ; immediates 1 and 3; both selectors are expected to print them as
+ ; cbsz:1 abid:3 on the smfmac instruction.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2648
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2649
+ ; SDAG: ; %bb.0:
2650
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2651
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2652
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2653
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2654
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2655
+ ; SDAG-NEXT: s_nop 1
2656
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2657
+ ; SDAG-NEXT: s_nop 6
2658
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2659
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2660
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2661
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2662
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2663
+ ;
2664
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags0:
2665
+ ; GISEL: ; %bb.0:
2666
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2667
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2668
+ ; GISEL-NEXT: s_nop 6
2669
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2670
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2671
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2672
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2673
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2674
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2675
+ ret <4 x float > %result
2676
+ }
2677
+
2678
+ ; Counterpart of the __flags0 test with the two immediates swapped: the call
+ ; passes 3 and 1, and the smfmac instruction is expected to print
+ ; cbsz:3 abid:1 for both selectors.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2679
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2680
+ ; SDAG: ; %bb.0:
2681
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2682
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2683
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2684
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2685
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2686
+ ; SDAG-NEXT: s_nop 1
2687
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2688
+ ; SDAG-NEXT: s_nop 6
2689
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2690
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2691
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2692
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2693
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2694
+ ;
2695
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__flags1:
2696
+ ; GISEL: ; %bb.0:
2697
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2698
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2699
+ ; GISEL-NEXT: s_nop 6
2700
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2701
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2702
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2703
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2704
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2705
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2706
+ ret <4 x float > %result
2707
+ }
2708
+
2709
+ ; Variant where every argument is marked inreg and both immediates are 0.
+ ; The checks expect the s-register inputs to be copied into v registers
+ ; (and, for SDAG, s24-s27 into a0-a3 via v_accvgpr_write) before the smfmac
+ ; instruction, which is printed without cbsz/abid modifiers.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_bf8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2710
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2711
+ ; SDAG: ; %bb.0:
2712
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2713
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2714
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2715
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2716
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2717
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2718
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2719
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2720
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2721
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2722
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2723
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2724
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2725
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2726
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2727
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2728
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2729
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2730
+ ; SDAG-NEXT: s_nop 1
2731
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 a[0:3], v[8:11], v[0:7], v12
2732
+ ; SDAG-NEXT: s_nop 6
2733
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2734
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2735
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2736
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2737
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2738
+ ;
2739
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_bf8__sgpr:
2740
+ ; GISEL: ; %bb.0:
2741
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2742
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2743
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2744
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2745
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2746
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2747
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2748
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2749
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2750
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2751
+ ; GISEL-NEXT: s_nop 1
2752
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_bf8 v[0:3], v[12:15], v[4:11], v16
2753
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2754
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.bf8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2755
+ ret <4 x float > %result
2756
+ }
2757
+
2543
2758
; Attribute group #0, referenced by the amdgpu_kernel test
; @test_smfmac_f32_16x16x128_fp8_bf8__vgpr in this file; it sets the
; "amdgpu-flat-work-group-size" string attribute to "1,256".
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
2544
2759
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2545
2760
; GCN: {{.*}}
0 commit comments