Skip to content

Commit 3678f8a

Browse files
authored
AMDGPU: Add v_smfmac_f32_16x16x128_bf8_fp8 for gfx950 (#117233)
1 parent 7baadb2 commit 3678f8a

File tree

12 files changed

+307
-1
lines changed

12 files changed

+307
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,7 @@ TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_32x32x32_bf16, "V16fV8yV16yV16fiIiIi"
451451
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_16x16x128_i8, "V4iV4iV8iV4iiIiIi", "nc", "gfx950-insts")
452452
TARGET_BUILTIN(__builtin_amdgcn_smfmac_i32_32x32x64_i8, "V16iV4iV8iV16iiIiIi", "nc", "gfx950-insts")
453453
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
454+
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8, "V4fV4iV8iV4fiIiIi", "nc", "gfx950-insts")
454455

455456
//===----------------------------------------------------------------------===//
456457
// GFX12+ only builtins.

clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,4 +517,11 @@ void test_smfmac_f32_16x16x128_bf8_bf8(global v4f* out, v4i a, v8i b, v4f c, int
517517
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, 0, 0);
518518
}
519519

520+
// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x128_bf8_fp8
521+
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
522+
void test_smfmac_f32_16x16x128_bf8_fp8(global v4f* out, v4i a, v8i b, v4f c, int idx)
523+
{
524+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, 0, 0);
525+
}
526+
520527
#endif

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,9 @@ void test_smfmac_f32_16x16x128_bf8_bf8(global float4* out, int4 a, int8 b, float
106106
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' must be a constant integer}}
107107
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' must be a constant integer}}
108108
}
109+
110+
void test_smfmac_f32_16x16x128_bf8_fp8(global float4* out, int4 a, int8 b, float4 c, int idx, int d)
111+
{
112+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' must be a constant integer}}
113+
*out = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' must be a constant integer}}
114+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
4141
*out10 = __builtin_amdgcn_smfmac_i32_16x16x128_i8(a10, b10, c10, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_i32_16x16x128_i8' needs target feature gfx950-insts}}
4242
*out11 = __builtin_amdgcn_smfmac_i32_32x32x64_i8(a11, b11, c11, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_i32_32x32x64_i8' needs target feature gfx950-insts}}
4343
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_bf8' needs target feature gfx950-insts}}
44+
*out12 = __builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8(a12, b12, c12, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x128_bf8_fp8' needs target feature gfx950-insts}}
4445
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
4546
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
4647
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3159,6 +3159,7 @@ def int_amdgcn_smfmac_f32_32x32x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty,
31593159
def int_amdgcn_smfmac_i32_16x16x128_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31603160
def int_amdgcn_smfmac_i32_32x32x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31613161
def int_amdgcn_smfmac_f32_16x16x128_bf8_bf8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
3162+
def int_amdgcn_smfmac_f32_16x16x128_bf8_fp8 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i32_ty, llvm_v8i32_ty>;
31623163
}
31633164

31643165
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1097,6 +1097,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
10971097
case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
10981098
case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
10991099
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1100+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
11001101
return selectSMFMACIntrin(I);
11011102
default:
11021103
return selectImpl(I, *CoverageInfo);
@@ -3510,6 +3511,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
35103511
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
35113512
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
35123513
break;
3514+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3515+
Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3516+
break;
35133517
default:
35143518
llvm_unreachable("unhandled smfmac intrinsic");
35153519
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4811,7 +4811,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
48114811
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
48124812
case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
48134813
case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
4814-
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8: {
4814+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
4815+
case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8: {
48154816
// vdst, srcA, srcB, srcC, idx
48164817
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
48174818
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,6 +1058,7 @@ defm V_SMFMAC_F32_32X32X32_BF16 : SMFMACInst<"v_smfmac_f32_32x32x32_bf16",
10581058
defm V_SMFMAC_I32_16X16X128_I8 : SMFMACInst<"v_smfmac_i32_16x16x128_i8", "I32_16X16X128_I8", int_amdgcn_smfmac_i32_16x16x128_i8>;
10591059
defm V_SMFMAC_I32_32X32X64_I8 : SMFMACInst<"v_smfmac_i32_32x32x64_i8", "I32_32X32X64_I8", int_amdgcn_smfmac_i32_32x32x64_i8>;
10601060
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_bf8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_bf8>;
1061+
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : SMFMACInst<"v_smfmac_f32_16x16x128_bf8_fp8", "F32_16X16X128_F8", int_amdgcn_smfmac_f32_16x16x128_bf8_fp8>;
10611062
}
10621063

10631064
def MAIInstInfoTable : GenericTable {
@@ -2157,6 +2158,7 @@ defm V_SMFMAC_I32_16X16X128_I8 : VOP3P_Real_SMFMAC <0x3a, "v_smfmac_i32_16x1
21572158
defm V_SMFMAC_I32_32X32X64_I8 : VOP3P_Real_SMFMAC <0x47, "v_smfmac_i32_32x32x64i8">;
21582159

21592160
defm V_SMFMAC_F32_16X16X128_BF8_BF8 : VOP3P_Real_SMFMAC <0x3b, "v_smfmac_f32_16x16x128bf8bf8">;
2161+
defm V_SMFMAC_F32_16X16X128_BF8_FP8 : VOP3P_Real_SMFMAC <0x3c, "v_smfmac_f32_16x16x128bf8fp8">;
21602162

21612163
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
21622164
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -377,6 +377,15 @@ define amdgpu_kernel void @smfmac_f32_16x16x128_bf8_bf8(<4 x i32> %arg0, <8 x i3
377377
ret void
378378
}
379379

380+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32, i32)
381+
382+
; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
383+
define amdgpu_kernel void @smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
384+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 1, i32 2)
385+
store <4 x float> %result, ptr addrspace(1) %out
386+
ret void
387+
}
388+
380389
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
381390
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
382391
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2325,6 +2325,221 @@ define <4 x float> @test_smfmac_f32_16x16x128_bf8_bf8__sgpr(<4 x i32> inreg %arg
23252325
ret <4 x float> %result
23262326
}
23272327

2328+
; --------------------------------------------------------------------
2329+
; llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8
2330+
; --------------------------------------------------------------------
2331+
2332+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32>, <8 x i32>, <4 x float>, i32, i32 immarg, i32 immarg)
2333+
2334+
define amdgpu_kernel void @test_smfmac_f32_16x16x128_bf8_fp8__vgpr(ptr addrspace(1) %arg, <4 x i32> %a, <8 x i32> %b, i32 %idx) #0 {
2335+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2336+
; SDAG: ; %bb.0: ; %bb
2337+
; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
2338+
; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2339+
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2340+
; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2341+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
2342+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2343+
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7]
2344+
; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64
2345+
; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54
2346+
; SDAG-NEXT: v_mov_b32_e32 v12, s8
2347+
; SDAG-NEXT: v_mov_b32_e32 v13, s9
2348+
; SDAG-NEXT: v_mov_b32_e32 v14, s10
2349+
; SDAG-NEXT: v_mov_b32_e32 v15, s11
2350+
; SDAG-NEXT: v_mov_b32_e32 v0, s12
2351+
; SDAG-NEXT: v_mov_b32_e32 v1, s13
2352+
; SDAG-NEXT: v_mov_b32_e32 v2, s14
2353+
; SDAG-NEXT: v_mov_b32_e32 v3, s15
2354+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2355+
; SDAG-NEXT: v_mov_b32_e32 v4, s0
2356+
; SDAG-NEXT: v_mov_b32_e32 v5, s1
2357+
; SDAG-NEXT: v_mov_b32_e32 v6, s2
2358+
; SDAG-NEXT: v_mov_b32_e32 v7, s3
2359+
; SDAG-NEXT: v_mov_b32_e32 v17, s16
2360+
; SDAG-NEXT: s_waitcnt vmcnt(0)
2361+
; SDAG-NEXT: s_nop 0
2362+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
2363+
; SDAG-NEXT: s_nop 6
2364+
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7]
2365+
; SDAG-NEXT: s_endpgm
2366+
;
2367+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__vgpr:
2368+
; GISEL: ; %bb.0: ; %bb
2369+
; GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
2370+
; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
2371+
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
2372+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2373+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[0:1]
2374+
; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
2375+
; GISEL-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x54
2376+
; GISEL-NEXT: s_load_dword s2, s[4:5], 0x64
2377+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
2378+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[10:11]
2379+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[8:9]
2380+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
2381+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
2382+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2383+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2384+
; GISEL-NEXT: v_mov_b32_e32 v16, s2
2385+
; GISEL-NEXT: s_waitcnt vmcnt(0)
2386+
; GISEL-NEXT: s_nop 0
2387+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
2388+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
2389+
; GISEL-NEXT: s_nop 5
2390+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[0:1]
2391+
; GISEL-NEXT: s_endpgm
2392+
bb:
2393+
%id = call i32 @llvm.amdgcn.workitem.id.x()
2394+
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
2395+
%in.1 = load <4 x float>, ptr addrspace(1) %gep
2396+
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %a, <8 x i32> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
2397+
store <4 x float> %mai.1, ptr addrspace(1) %arg
2398+
ret void
2399+
}
2400+
2401+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2402+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2403+
; SDAG: ; %bb.0:
2404+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2405+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2406+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2407+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2408+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2409+
; SDAG-NEXT: s_nop 1
2410+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16
2411+
; SDAG-NEXT: s_nop 6
2412+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2413+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2414+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2415+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2416+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2417+
;
2418+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8:
2419+
; GISEL: ; %bb.0:
2420+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2421+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16
2422+
; GISEL-NEXT: s_nop 6
2423+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2424+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2425+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2426+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2427+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2428+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2429+
ret <4 x float> %result
2430+
}
2431+
2432+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags0(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2433+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2434+
; SDAG: ; %bb.0:
2435+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2436+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2437+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2438+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2439+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2440+
; SDAG-NEXT: s_nop 1
2441+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
2442+
; SDAG-NEXT: s_nop 6
2443+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2444+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2445+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2446+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2447+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2448+
;
2449+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags0:
2450+
; GISEL: ; %bb.0:
2451+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2452+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
2453+
; GISEL-NEXT: s_nop 6
2454+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2455+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2456+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2457+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2458+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2459+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
2460+
ret <4 x float> %result
2461+
}
2462+
2463+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__flags1(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3) {
2464+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2465+
; SDAG: ; %bb.0:
2466+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2467+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
2468+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
2469+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
2470+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
2471+
; SDAG-NEXT: s_nop 1
2472+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
2473+
; SDAG-NEXT: s_nop 6
2474+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2475+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2476+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2477+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2478+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2479+
;
2480+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__flags1:
2481+
; GISEL: ; %bb.0:
2482+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2483+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
2484+
; GISEL-NEXT: s_nop 6
2485+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
2486+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
2487+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
2488+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
2489+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2490+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
2491+
ret <4 x float> %result
2492+
}
2493+
2494+
define <4 x float> @test_smfmac_f32_16x16x128_bf8_fp8__sgpr(<4 x i32> inreg %arg0, <8 x i32> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
2495+
; SDAG-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2496+
; SDAG: ; %bb.0:
2497+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2498+
; SDAG-NEXT: v_mov_b32_e32 v8, s0
2499+
; SDAG-NEXT: v_mov_b32_e32 v9, s1
2500+
; SDAG-NEXT: v_mov_b32_e32 v10, s2
2501+
; SDAG-NEXT: v_mov_b32_e32 v11, s3
2502+
; SDAG-NEXT: v_mov_b32_e32 v0, s16
2503+
; SDAG-NEXT: v_mov_b32_e32 v1, s17
2504+
; SDAG-NEXT: v_mov_b32_e32 v2, s18
2505+
; SDAG-NEXT: v_mov_b32_e32 v3, s19
2506+
; SDAG-NEXT: v_mov_b32_e32 v4, s20
2507+
; SDAG-NEXT: v_mov_b32_e32 v5, s21
2508+
; SDAG-NEXT: v_mov_b32_e32 v6, s22
2509+
; SDAG-NEXT: v_mov_b32_e32 v7, s23
2510+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s24
2511+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s25
2512+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s26
2513+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s27
2514+
; SDAG-NEXT: v_mov_b32_e32 v12, s28
2515+
; SDAG-NEXT: s_nop 1
2516+
; SDAG-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 a[0:3], v[8:11], v[0:7], v12
2517+
; SDAG-NEXT: s_nop 6
2518+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
2519+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
2520+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
2521+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
2522+
; SDAG-NEXT: s_setpc_b64 s[30:31]
2523+
;
2524+
; GISEL-LABEL: test_smfmac_f32_16x16x128_bf8_fp8__sgpr:
2525+
; GISEL: ; %bb.0:
2526+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2527+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
2528+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
2529+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[16:17]
2530+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25]
2531+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[18:19]
2532+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[20:21]
2533+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[22:23]
2534+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27]
2535+
; GISEL-NEXT: v_mov_b32_e32 v16, s28
2536+
; GISEL-NEXT: s_nop 1
2537+
; GISEL-NEXT: v_smfmac_f32_16x16x128_bf8_fp8 v[0:3], v[12:15], v[4:11], v16
2538+
; GISEL-NEXT: s_setpc_b64 s[30:31]
2539+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x128.bf8.fp8(<4 x i32> %arg0, <8 x i32> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
2540+
ret <4 x float> %result
2541+
}
2542+
23282543
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
23292544
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
23302545
; GCN: {{.*}}

0 commit comments

Comments
 (0)