Skip to content

Commit 836d2dc

Browse files
authored
AMDGPU: Add v_smfmac_f32_16x16x128_fp8_fp8 for gfx950 (#117235)
1 parent 3312491 commit 836d2dc

File tree

12 files changed

+306
-1
lines changed

12 files changed

+306
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -453,6 +453,7 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x64_i8, "V16iV4iV8iV16iiIiIi", "
453453
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
454454
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
455455
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
456+
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
456457

457458
//===----------------------------------------------------------------------===//
458459
// GFX12+ only builtins.

clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -531,4 +531,11 @@ void test_smfmac_f32_16x16x128_fp8_bf8(global v4f* out, v4i a, v8i b, v4f c, int
531531
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, 0, 0);
532532
}
533533

534+
// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_fp8_fp8
535+
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
536+
void test_smfmac_f32_16x16x128_fp8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
537+
{
538+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, 0, 0);
539+
}
540+
534541
#endif

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -118,3 +118,9 @@ void test_smfmac_f32_16x16x128_fp8_bf8(global float4* out, int4 a, int8 b, float
118118
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8' must be a constant integer}}
119119
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8' must be a constant integer}}
120120
}
121+
122+
void test_smfmac_f32_16x16x128_fp8_fp8(global float4* out, int4 a, int8 b, float4 c, int idx, int d)
123+
{
124+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8' must be a constant integer}}
125+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8' must be a constant integer}}
126+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
4343
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' needs target feature gfx950-insts}}
4444
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' needs target feature gfx950-insts}}
4545
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_fp8_bf8' needs target feature gfx950-insts}}
46+
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_fp8_fp8' needs target feature gfx950-insts}}
4647
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
4748
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
4849
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3161,6 +3161,7 @@ def int_amdgcn_smfmac_i32_32x32x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, l
31613161
def int_amdgcn_smfmac_f32_16x16x128_bf8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31623162
def int_amdgcn_smfmac_f32_16x16x128_bf8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31633163
def int_amdgcn_smfmac_f32_16x16x128_fp8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
3164+
def int_amdgcn_smfmac_f32_16x16x128_fp8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31643165
}
31653166

31663167
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1099,6 +1099,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
10991099
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
11001100
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
11011101
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1102+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
11021103
return selectSMFMACIntrin(I);
11031104
default:
11041105
return selectImpl(I, *CoverageInfo);
@@ -3518,6 +3519,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
35183519
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
35193520
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
35203521
break;
3522+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3523+
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3524+
break;
35213525
default:
35223526
llvm_unreachable("unhandled smfmac intrinsic");
35233527
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4813,7 +4813,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48134813
case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
48144814
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
48154815
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
4816-
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8: {
4816+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
4817+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8: {
48174818
// vdst, srcA, srcB, srcC, idx
48184819
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
48194820
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1060,6 +1060,7 @@ defm V_SMFMAC_I32_32X32X64_I8 : SMFMACInst<"v_smfmac_i32_32x32x64_i8",
10601060
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_bf8>;
10611061
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_fp8>;
10621062
defm V_SMFMAC_F32_16X16X128_FP8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_fp8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_fp8_bf8>;
1063+
defm V_SMFMAC_F32_16X16X128_FP8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_fp8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_fp8_fp8>;
10631064
}
10641065

10651066
def MAIInstInfoTable : GenericTable {
@@ -2161,6 +2162,7 @@ defm V_SMFMAC_I32_32X32X64_I8 : VOP3P_Real_SMFMAC <0x47, "v_smfmac_i32_32x3
21612162
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : VOP3P_Real_SMFMAC <0x3b, "v_smfmac_f32_16x16x128bf8bf8">;
21622163
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : VOP3P_Real_SMFMAC <0x3c, "v_smfmac_f32_16x16x128bf8fp8">;
21632164
defm V_SMFMAC_F32_16X16X128_FP8_BF8 : VOP3P_Real_SMFMAC <0x3d, "v_smfmac_f32_16x16x128fp8bf8">;
2165+
defm V_SMFMAC_F32_16X16X128_FP8_FP8 : VOP3P_Real_SMFMAC <0x43, "v_smfmac_f32_16x16x128fp8fp8">;
21642166

21652167
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
21662168
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -395,6 +395,15 @@ define amdgpu_kernel void @smfmac_f32_16x16x128_fp8_bf8(<4 x i32> %arg0, <8 x i3
395395
ret void
396396
}
397397

398+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
399+
400+
; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
401+
define amdgpu_kernel void @smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
402+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
403+
store <4 x float> %result, ptr addrspace(1) %out
404+
ret void
405+
}
406+
398407
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
399408
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
400409
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll

Lines changed: 215 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2755,6 +2755,221 @@ define <4 x float> @test_smfmac_f32_16x16x128_fp8_bf8__sgpr(<4 x i32> inreg %arg
27552755
ret <4 x float> %result
27562756
}
27572757

2758+
; --------------------------------------------------------------------
2759+
; llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8
2760+
; --------------------------------------------------------------------
2761+
2762+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
2763+
2764+
define amdgpu_kernel void @test_smfmac_f32_16x16x128_fp8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2765+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2766+
; SDAG: ; %bb.0: ; %bb
2767+
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2768+
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2769+
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2770+
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2771+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2772+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2773+
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2774+
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2775+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2776+
; SDAG-NEXT: v_mov_b32_e32 v12, s8
2777+
; SDAG-NEXT: v_mov_b32_e32 v13, s9
2778+
; SDAG-NEXT: v_mov_b32_e32 v14, s10
2779+
; SDAG-NEXT: v_mov_b32_e32 v15, s11
2780+
; SDAG-NEXT: v_mov_b32_e32 v0, s12
2781+
; SDAG-NEXT: v_mov_b32_e32 v1, s13
2782+
; SDAG-NEXT: v_mov_b32_e32 v2, s14
2783+
; SDAG-NEXT: v_mov_b32_e32 v3, s15
2784+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2785+
; SDAG-NEXT: v_mov_b32_e32 v4, s0
2786+
; SDAG-NEXT: v_mov_b32_e32 v5, s1
2787+
; SDAG-NEXT: v_mov_b32_e32 v6, s2
2788+
; SDAG-NEXT: v_mov_b32_e32 v7, s3
2789+
; SDAG-NEXT: v_mov_b32_e32 v17, s16
2790+
; SDAG-NEXT: s_waitcnt vmcnt(0)
2791+
; SDAG-NEXT: s_nop 0
2792+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2793+
; SDAG-NEXT: s_nop 6
2794+
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2795+
; SDAG-NEXT: s_endpgm
2796+
;
2797+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__vgpr:
2798+
; GISEL: ; %bb.0: ; %bb
2799+
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2800+
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2801+
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2802+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2803+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2804+
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2805+
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2806+
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2807+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2808+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2809+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2810+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2811+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2812+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2813+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2814+
; GISEL-NEXT: v_mov_b32_e32 v16, s2
2815+
; GISEL-NEXT: s_waitcnt vmcnt(0)
2816+
; GISEL-NEXT: s_nop 0
2817+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2818+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2819+
; GISEL-NEXT: s_nop 5
2820+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2821+
; GISEL-NEXT: s_endpgm
2822+
bb:
2823+
%id = call i32 @llvm.amdgcn.workitem.id.x()
2824+
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
2825+
%in.1 = load <4 x float>, ptr addrspace(1) %gep
2826+
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
2827+
store <4 x float> %mai.1, ptr addrspace(1) %arg
2828+
ret void
2829+
}
2830+
2831+
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2832+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2833+
; SDAG: ; %bb.0:
2834+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2835+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2836+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2837+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2838+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2839+
; SDAG-NEXT: s_nop 1
2840+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16
2841+
; SDAG-NEXT: s_nop 6
2842+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2843+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2844+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2845+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2846+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2847+
;
2848+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8:
2849+
; GISEL: ; %bb.0:
2850+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2851+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16
2852+
; GISEL-NEXT: s_nop 6
2853+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2854+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2855+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2856+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2857+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2858+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2859+
ret <4 x float> %result
2860+
}
2861+
2862+
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2863+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2864+
; SDAG: ; %bb.0:
2865+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2866+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2867+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2868+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2869+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2870+
; SDAG-NEXT: s_nop 1
2871+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2872+
; SDAG-NEXT: s_nop 6
2873+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2874+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2875+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2876+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2877+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2878+
;
2879+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags0:
2880+
; GISEL: ; %bb.0:
2881+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2882+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2883+
; GISEL-NEXT: s_nop 6
2884+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2885+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2886+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2887+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2888+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2889+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
2890+
ret <4 x float> %result
2891+
}
2892+
2893+
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2894+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2895+
; SDAG: ; %bb.0:
2896+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2897+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2898+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2899+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2900+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2901+
; SDAG-NEXT: s_nop 1
2902+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2903+
; SDAG-NEXT: s_nop 6
2904+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2905+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2906+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2907+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2908+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2909+
;
2910+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__flags1:
2911+
; GISEL: ; %bb.0:
2912+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2913+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2914+
; GISEL-NEXT: s_nop 6
2915+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2916+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2917+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2918+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2919+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2920+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2921+
ret <4 x float> %result
2922+
}
2923+
2924+
define <4 x float> @test_smfmac_f32_16x16x128_fp8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
2925+
; SDAG-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2926+
; SDAG: ; %bb.0:
2927+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2928+
; SDAG-NEXT: v_mov_b32_e32 v8, s0
2929+
; SDAG-NEXT: v_mov_b32_e32 v9, s1
2930+
; SDAG-NEXT: v_mov_b32_e32 v10, s2
2931+
; SDAG-NEXT: v_mov_b32_e32 v11, s3
2932+
; SDAG-NEXT: v_mov_b32_e32 v0, s16
2933+
; SDAG-NEXT: v_mov_b32_e32 v1, s17
2934+
; SDAG-NEXT: v_mov_b32_e32 v2, s18
2935+
; SDAG-NEXT: v_mov_b32_e32 v3, s19
2936+
; SDAG-NEXT: v_mov_b32_e32 v4, s20
2937+
; SDAG-NEXT: v_mov_b32_e32 v5, s21
2938+
; SDAG-NEXT: v_mov_b32_e32 v6, s22
2939+
; SDAG-NEXT: v_mov_b32_e32 v7, s23
2940+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2941+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2942+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2943+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2944+
; SDAG-NEXT: v_mov_b32_e32 v12, s28
2945+
; SDAG-NEXT: s_nop 1
2946+
; SDAG-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 a[0:3], v[8:11], v[0:7], v12
2947+
; SDAG-NEXT: s_nop 6
2948+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2949+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2950+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2951+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2952+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2953+
;
2954+
; GISEL-LABEL: test_smfmac_f32_16x16x128_fp8_fp8__sgpr:
2955+
; GISEL: ; %bb.0:
2956+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2957+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2958+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2959+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2960+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2961+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2962+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2963+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2964+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2965+
; GISEL-NEXT: v_mov_b32_e32 v16, s28
2966+
; GISEL-NEXT: s_nop 1
2967+
; GISEL-NEXT: v_smfmac_f32_16x16x128_fp8_fp8 v[0:3], v[12:15], v[4:11], v16
2968+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2969+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.fp8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2970+
ret <4 x float> %result
2971+
}
2972+
27582973
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
27592974
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
27602975
; GCN: {{.*}}

0 commit comments

Comments
 (0)