@@ -2755,6 +2755,221 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
2755
2755
ret <4 x float > %result
2756
2756
}
2757
2757
2758
+ ; --------------------------------------------------------------------
2759
+ ; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
2760
+ ; --------------------------------------------------------------------
2761
+
2762
; Declaration of the intrinsic exercised by the tests that follow. It returns
; <4 x float> and takes <4 x i32>, <8 x i32>, <4 x float> and i32 operands plus
; two trailing i32 operands marked immarg (they must be constants at each call).
+ declare <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 >, <8 x i32 >, <4 x float >, i32 , i32 immarg, i32 immarg)
2763
+
2764
; Kernel variant. Loads a <4 x float> from %arg at the workitem.id.x index,
; passes it as the third intrinsic operand together with %a, %b and %idx, and
; stores the result to the base of %arg (not to the indexed address).
; The immediate operands are 1 and 2; the expected instruction prints them as
; "cbsz:1 abid:2".
; NOTE(review): the assertion lines look autogenerated (presumably by
; update_llc_test_checks.py) - regenerate them rather than editing by hand.
+ define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr (ptr addrspace (1 ) %arg , <4 x i32 > %a , <8 x i32 > %b , i32 %idx ) #0 {
2765
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2766
+ ; SDAG: ; %bb.0: ; %bb
2767
+ ; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2768
+ ; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2769
+ ; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2770
+ ; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2771
+ ; SDAG-NEXT: v_mov_b32_e32 v16, 0
2772
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2773
+ ; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2774
+ ; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2775
+ ; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2776
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s8
2777
+ ; SDAG-NEXT: v_mov_b32_e32 v13, s9
2778
+ ; SDAG-NEXT: v_mov_b32_e32 v14, s10
2779
+ ; SDAG-NEXT: v_mov_b32_e32 v15, s11
2780
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s12
2781
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s13
2782
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s14
2783
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s15
2784
+ ; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2785
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s0
2786
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s1
2787
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s2
2788
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s3
2789
+ ; SDAG-NEXT: v_mov_b32_e32 v17, s16
2790
+ ; SDAG-NEXT: s_waitcnt vmcnt(0)
2791
+ ; SDAG-NEXT: s_nop 0
2792
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2793
+ ; SDAG-NEXT: s_nop 6
2794
+ ; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2795
+ ; SDAG-NEXT: s_endpgm
2796
+ ;
2797
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2798
+ ; GISEL: ; %bb.0: ; %bb
2799
+ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2800
+ ; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2801
+ ; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2802
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2803
+ ; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2804
+ ; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2805
+ ; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2806
+ ; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2807
+ ; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2808
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2809
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2810
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2811
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2812
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2813
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2814
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s2
2815
+ ; GISEL-NEXT: s_waitcnt vmcnt(0)
2816
+ ; GISEL-NEXT: s_nop 0
2817
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2818
+ ; GISEL-NEXT: v_mov_b32_e32 v0, 0
2819
+ ; GISEL-NEXT: s_nop 5
2820
+ ; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2821
+ ; GISEL-NEXT: s_endpgm
2822
+ bb:
2823
+ %id = call i32 @llvm.amdgcn.workitem.id.x ()
2824
+ %gep = getelementptr <4 x float >, ptr addrspace (1 ) %arg , i32 %id
2825
+ %in.1 = load <4 x float >, ptr addrspace (1 ) %gep
2826
+ %mai.1 = tail call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %a , <8 x i32 > %b , <4 x float > %in.1 , i32 %idx , i32 1 , i32 2 )
2827
+ store <4 x float > %mai.1 , ptr addrspace (1 ) %arg
2828
+ ret void
2829
+ }
2830
+
2831
; Baseline non-kernel variant: all four operands arrive as ordinary (non-inreg)
; function arguments and both immediate operands are 0. The expected
; instruction carries no cbsz/abid modifiers.
; NOTE(review): the assertion lines look autogenerated (presumably by
; update_llc_test_checks.py) - regenerate them rather than editing by hand.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2832
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2833
+ ; SDAG: ; %bb.0:
2834
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2835
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2836
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2837
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2838
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2839
+ ; SDAG-NEXT: s_nop 1
2840
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
2841
+ ; SDAG-NEXT: s_nop 6
2842
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2843
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2844
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2845
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2846
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2847
+ ;
2848
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2849
+ ; GISEL: ; %bb.0:
2850
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2851
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
2852
+ ; GISEL-NEXT: s_nop 6
2853
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2854
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2855
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2856
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2857
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2858
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2859
+ ret <4 x float > %result
2860
+ }
2861
+
2862
; Same argument shape as the baseline variant, but with non-zero immediates:
; the call passes 1 and 3, and the expected instruction prints them as
; "cbsz:1 abid:3".
; NOTE(review): the assertion lines look autogenerated (presumably by
; update_llc_test_checks.py) - regenerate them rather than editing by hand.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__flags0 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2863
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2864
+ ; SDAG: ; %bb.0:
2865
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2866
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2867
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2868
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2869
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2870
+ ; SDAG-NEXT: s_nop 1
2871
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2872
+ ; SDAG-NEXT: s_nop 6
2873
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2874
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2875
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2876
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2877
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2878
+ ;
2879
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2880
+ ; GISEL: ; %bb.0:
2881
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2882
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2883
+ ; GISEL-NEXT: s_nop 6
2884
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2885
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2886
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2887
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2888
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2889
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 1 , i32 immarg 3 )
2890
+ ret <4 x float > %result
2891
+ }
2892
+
2893
; Second non-zero-immediate variant with the values swapped relative to
; __flags0: the call passes 3 and 1, and the expected instruction prints them
; as "cbsz:3 abid:1".
; NOTE(review): the assertion lines look autogenerated (presumably by
; update_llc_test_checks.py) - regenerate them rather than editing by hand.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__flags1 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 ) {
2894
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2895
+ ; SDAG: ; %bb.0:
2896
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2897
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2898
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2899
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2900
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2901
+ ; SDAG-NEXT: s_nop 1
2902
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2903
+ ; SDAG-NEXT: s_nop 6
2904
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2905
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2906
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2907
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2908
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2909
+ ;
2910
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2911
+ ; GISEL: ; %bb.0:
2912
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2913
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2914
+ ; GISEL-NEXT: s_nop 6
2915
+ ; GISEL-NEXT: v_mov_b32_e32 v0, v12
2916
+ ; GISEL-NEXT: v_mov_b32_e32 v1, v13
2917
+ ; GISEL-NEXT: v_mov_b32_e32 v2, v14
2918
+ ; GISEL-NEXT: v_mov_b32_e32 v3, v15
2919
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2920
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 3 , i32 immarg 1 )
2921
+ ret <4 x float > %result
2922
+ }
2923
+
2924
; Variant with every argument marked inreg and both immediate operands 0.
; The expected output first copies the incoming s-register values into v (and,
; for one check prefix, a) registers, then issues the instruction on those
; copies with no cbsz/abid modifiers.
; NOTE(review): the assertion lines look autogenerated (presumably by
; update_llc_test_checks.py) - regenerate them rather than editing by hand.
+ define <4 x float > @test_smfmac_f32_16x16x128_fp8_fp8__sgpr (<4 x i32 > inreg %arg0 , <8 x i32 > inreg %arg1 , <4 x float > inreg %arg2 , i32 inreg %arg3 ) {
2925
+ ; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2926
+ ; SDAG: ; %bb.0:
2927
+ ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2928
+ ; SDAG-NEXT: v_mov_b32_e32 v8, s0
2929
+ ; SDAG-NEXT: v_mov_b32_e32 v9, s1
2930
+ ; SDAG-NEXT: v_mov_b32_e32 v10, s2
2931
+ ; SDAG-NEXT: v_mov_b32_e32 v11, s3
2932
+ ; SDAG-NEXT: v_mov_b32_e32 v0, s16
2933
+ ; SDAG-NEXT: v_mov_b32_e32 v1, s17
2934
+ ; SDAG-NEXT: v_mov_b32_e32 v2, s18
2935
+ ; SDAG-NEXT: v_mov_b32_e32 v3, s19
2936
+ ; SDAG-NEXT: v_mov_b32_e32 v4, s20
2937
+ ; SDAG-NEXT: v_mov_b32_e32 v5, s21
2938
+ ; SDAG-NEXT: v_mov_b32_e32 v6, s22
2939
+ ; SDAG-NEXT: v_mov_b32_e32 v7, s23
2940
+ ; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2941
+ ; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2942
+ ; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2943
+ ; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2944
+ ; SDAG-NEXT: v_mov_b32_e32 v12, s28
2945
+ ; SDAG-NEXT: s_nop 1
2946
+ ; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
2947
+ ; SDAG-NEXT: s_nop 6
2948
+ ; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2949
+ ; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2950
+ ; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2951
+ ; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2952
+ ; SDAG-NEXT: s_setpc_b64 s[30:31]
2953
+ ;
2954
+ ; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2955
+ ; GISEL: ; %bb.0:
2956
+ ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2957
+ ; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2958
+ ; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2959
+ ; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2960
+ ; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2961
+ ; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2962
+ ; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2963
+ ; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2964
+ ; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2965
+ ; GISEL-NEXT: v_mov_b32_e32 v16, s28
2966
+ ; GISEL-NEXT: s_nop 1
2967
+ ; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
2968
+ ; GISEL-NEXT: s_setpc_b64 s[30:31]
2969
+ %result = call <4 x float > @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8 (<4 x i32 > %arg0 , <8 x i32 > %arg1 , <4 x float > %arg2 , i32 %arg3 , i32 immarg 0 , i32 immarg 0 )
2970
+ ret <4 x float > %result
2971
+ }
2972
+
2758
2973
attributes #0 = { "amdgpu-flat-work-group-size" ="1,256" }
2759
2974
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
2760
2975
; GCN: {{.*}}
0 commit comments