Skip to content

Commit 5717806

Browse files
arsenm authored and pravinjagtap committed
AMDGPU: Add v_smfmac_f32_16x16x64_f16 for gfx950 (llvm#117202)
1 parent 3f9f64a commit 5717806

File tree

13 files changed

+322
-1
lines changed

13 files changed

+322
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,7 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x16_bf16, "V16fV8yV8yV16fIiIiIi",
444444
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x64_i8, "V4iV4iV4iV4iIiIiIi", "nc", "gfx950-insts")
445445
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x32_i8, "V16iV4iV4iV16iIiIiIi", "nc", "gfx950-insts")
446446

447+
TARGET_BUILTIN(__builtin_amdgcn_smfmac_f32_16x16x64_f16, "V4fV8hV16hV4fiIiIi", "nc", "gfx950-insts")
447448
//===----------------------------------------------------------------------===//
448449
// GFX12+ only builtins.
449450
//===----------------------------------------------------------------------===//

clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,4 +467,11 @@ v4f test_mfma_f32_16x16x32_bf16(v8bf16 a, v8bf16 b, v4f c)
467467
return __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 1, 2, 3);
468468
}
469469

470+
// CHECK-GFX950-LABEL: @test_smfmac_f32_16x16x64_f16
471+
// CHECK-GFX950: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %a, <16 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0)
472+
void test_smfmac_f32_16x16x64_f16(global v4f* out, v8h a, v16h b, v4f c, int idx)
473+
{
474+
*out = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a, b, c, idx, 0, 0);
475+
}
476+
470477
#endif

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950-param.cl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
typedef float float4 __attribute__((ext_vector_type(4)));
55
typedef float float16 __attribute__((ext_vector_type(16)));
66
typedef half half8 __attribute__((ext_vector_type(8)));
7+
typedef half half16 __attribute__((ext_vector_type(16)));
78
typedef __bf16 bfloat8 __attribute__((ext_vector_type(8)));
89
typedef int int4 __attribute__((ext_vector_type(4)));
910
typedef int int8 __attribute__((ext_vector_type(8)));
@@ -62,3 +63,9 @@ void test_mfma_f32_16x16x32_bf16(__global float4* out, bfloat8 a, bfloat8 b, flo
6263
*out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, X, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
6364
*out = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a, b, c, 0, 0, X); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x32_bf16' must be a constant integer}}
6465
}
66+
67+
void test_smfmac_f32_16x16x64_f16(global float4* out, half8 a, half16 b, float4 c, int idx, int d)
68+
{
69+
*out = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a, b, c, idx, d, 0); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_f16' must be a constant integer}}
70+
*out = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a, b, c, idx, 0, d); // expected-error{{argument to '__builtin_amdgcn_smfmac_f32_16x16x64_f16' must be a constant integer}}
71+
}

clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ void test(__global float4* out0, half8 a0, half8 b0, float4 c0,
3434
*out3 = __builtin_amdgcn_mfma_i32_16x16x64_i8(a3, b3, c3, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_16x16x64_i8' needs target feature gfx950-insts}}
3535
*out4 = __builtin_amdgcn_mfma_i32_32x32x32_i8(a4, b4, c4, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_i32_32x32x32_i8' needs target feature gfx950-insts}}
3636
*out5 = __builtin_amdgcn_mfma_f32_16x16x32_bf16(a5, b5, c5, 0, 0, 0); // expected-error{{'__builtin_amdgcn_mfma_f32_16x16x32_bf16' needs target feature gfx950-insts}}
37+
*out6 = __builtin_amdgcn_smfmac_f32_16x16x64_f16(a6, b6, c6, 0, 0, 0); // expected-error{{'__builtin_amdgcn_smfmac_f32_16x16x64_f16' needs target feature gfx950-insts}}
3738
*out14 = __builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4(a14, b14, c14, 0, 0, 0, d14, 0, e14); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4' needs target feature gfx950-insts}}
3839
*out15 = __builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4(a15, b15, c15, 0, 0, 0, d15, 0, e15); // expected-error{{'__builtin_amdgcn_mfma_scale_f32_32x32x64_f8f6f4' needs target feature gfx950-insts}}
3940
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3179,6 +3179,7 @@ def int_amdgcn_mfma_f32_16x16x32_bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_
31793179
def int_amdgcn_mfma_f32_32x32x16_bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v8bf16_ty>;
31803180
def int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v4f32_ty>;
31813181
def int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4 : AMDGPUMfmaScaleIntrinsic<llvm_v16f32_ty>;
3182+
def int_amdgcn_smfmac_f32_16x16x64_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v8f16_ty, llvm_v16f16_ty>;
31823183
}
31833184

31843185
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1075,6 +1075,7 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
10751075
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
10761076
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
10771077
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1078+
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
10781079
return selectSMFMACIntrin(I);
10791080
default:
10801081
return selectImpl(I, *CoverageInfo);
@@ -3446,6 +3447,8 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
34463447
return true;
34473448
}
34483449

3450+
// FIXME: This should be removed and let the patterns select. We just need the
3451+
// AGPR/VGPR combination versions.
34493452
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
34503453
unsigned Opc;
34513454
switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
@@ -3491,6 +3494,9 @@ bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
34913494
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
34923495
Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
34933496
break;
3497+
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3498+
Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3499+
break;
34943500
default:
34953501
llvm_unreachable("unhandled smfmac intrinsic");
34963502
}

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4781,7 +4781,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
47814781
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
47824782
case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
47834783
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
4784-
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8: {
4784+
case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
4785+
case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16: {
47854786
// vdst, srcA, srcB, srcC, idx
47864787
OpdsMapping[0] = getAGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
47874788
OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2677,6 +2677,7 @@ def VOP_V4F32_I64_I64_V4F32 : VOPProfile <[v4f32, i64, i64, v4f32]>;
26772677
def VOP_V16F32_I64_I64_V16F32 : VOPProfile <[v16f32, i64, i64, v16f32]>;
26782678

26792679
def VOP_V4F32_V4F16_V8F16_I32 : VOPProfile <[v4f32, v4f16, v8f16, i32]>;
2680+
def VOP_V4F32_V8F16_V16F16_I32 : VOPProfile <[v4f32, v8f16, v16f16, i32]>;
26802681
def VOP_V16F32_V4F16_V8F16_I32 : VOPProfile <[v16f32, v4f16, v8f16, i32]>;
26812682
def VOP_V4F32_V4I16_V8I16_I32 : VOPProfile <[v4f32, v4i16, v8i16, i32]>;
26822683
def VOP_V16F32_V4I16_V8I16_I32 : VOPProfile <[v16f32, v4i16, v8i16, i32]>;

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,7 @@ def VOPProfileMAI_F32_I64_X32_VCD : VOPProfileMAI<VOP_V4F32_I64_I64_V4F32,
633633
def VOPProfileMAI_F32_I64_X16_VCD : VOPProfileMAI<VOP_V16F32_I64_I64_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
634634

635635
def VOPProfileSMFMAC_F32_16X16X32_F16 : VOPProfileSMFMAC<VOP_V4F32_V4F16_V8F16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
636+
def VOPProfileSMFMAC_F32_16X16X64_F16 : VOPProfileSMFMAC<VOP_V4F32_V8F16_V16F16_I32, AVDst_128, AVSrc_128, AVSrc_256>;
636637
def VOPProfileSMFMAC_F32_32X32X16_F16 : VOPProfileSMFMAC<VOP_V16F32_V4F16_V8F16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
637638
def VOPProfileSMFMAC_F32_16X16X32_I16 : VOPProfileSMFMAC<VOP_V4F32_V4I16_V8I16_I32, AVDst_128, AVSrc_64, AVSrc_128>;
638639
def VOPProfileSMFMAC_F32_32X32X16_I16 : VOPProfileSMFMAC<VOP_V16F32_V4I16_V8I16_I32, AVDst_512, AVSrc_64, AVSrc_128>;
@@ -1050,6 +1051,10 @@ defm V_SMFMAC_F32_32X32X32_FP8_BF8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_bf8",
10501051
defm V_SMFMAC_F32_32X32X32_FP8_FP8 : SMFMACInst<"v_smfmac_f32_32x32x32_fp8_fp8", "F32_32X32X32_F8", int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
10511052
} // End SubtargetPredicate = HasFP8Insts, is_gfx940_xdl = 1
10521053

1054+
let SubtargetPredicate = HasGFX950Insts in {
1055+
defm V_SMFMAC_F32_16X16X64_F16 : SMFMACInst<"v_smfmac_f32_16x16x64_f16", "F32_16X16X64_F16", int_amdgcn_smfmac_f32_16x16x64_f16>;
1056+
}
1057+
10531058
def MAIInstInfoTable : GenericTable {
10541059
let FilterClass = "MAIInst";
10551060
let CppTypeName = "MAIInstInfo";
@@ -2146,6 +2151,8 @@ defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x3
21462151
defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;
21472152
} // End SubtargetPredicate = HasFP8Insts
21482153

2154+
defm V_SMFMAC_F32_16X16X64_F16 : VOP3P_Real_SMFMAC <0x5a, "v_smfmac_f32_16x16x64f16">;
2155+
21492156
defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
21502157
defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
21512158
defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;

llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,15 @@ define amdgpu_kernel void @mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloa
332332
ret void
333333
}
334334

335+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg)
336+
337+
; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
338+
define amdgpu_kernel void @smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, ptr addrspace(1) %out) {
339+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
340+
store <4 x float> %result, ptr addrspace(1) %out
341+
ret void
342+
}
343+
335344
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1
336345
declare i32 @llvm.amdgcn.permlane16.i32(i32, i32, i32, i32, i1, i1) #1
337346
declare i32 @llvm.amdgcn.permlanex16.i32(i32, i32, i32, i32, i1, i1) #1
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
4+
5+
declare i32 @llvm.amdgcn.workitem.id.x()
6+
7+
; --------------------------------------------------------------------
8+
; llvm.amdgcn.smfmac.f32.16x16x64.f16
9+
; --------------------------------------------------------------------
10+
11+
declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half>, <16 x half>, <4 x float>, i32, i32 immarg, i32 immarg)
12+
13+
define amdgpu_kernel void @test_smfmac_f32_16x16x64_f16__vgpr(ptr addrspace(1) %arg, <8 x half> %a, <16 x half> %b, i32 %idx) #0 {
14+
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
15+
; SDAG: ; %bb.0: ; %bb
16+
; SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
17+
; SDAG-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
18+
; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
19+
; SDAG-NEXT: v_mov_b32_e32 v16, 0
20+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
21+
; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
22+
; SDAG-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
23+
; SDAG-NEXT: s_load_dword s16, s[0:1], 0x64
24+
; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
25+
; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
26+
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
27+
; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
28+
; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
29+
; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
30+
; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
31+
; SDAG-NEXT: v_mov_b32_e32 v17, s16
32+
; SDAG-NEXT: s_waitcnt vmcnt(0)
33+
; SDAG-NEXT: s_nop 0
34+
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2
35+
; SDAG-NEXT: s_nop 6
36+
; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[2:3]
37+
; SDAG-NEXT: s_endpgm
38+
;
39+
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__vgpr:
40+
; GISEL: ; %bb.0: ; %bb
41+
; GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
42+
; GISEL-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x34
43+
; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0
44+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
45+
; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[2:3]
46+
; GISEL-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
47+
; GISEL-NEXT: s_load_dword s16, s[0:1], 0x64
48+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[12:13]
49+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[14:15]
50+
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
51+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
52+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
53+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[8:9]
54+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[10:11]
55+
; GISEL-NEXT: v_mov_b32_e32 v16, s16
56+
; GISEL-NEXT: s_waitcnt vmcnt(0)
57+
; GISEL-NEXT: s_nop 0
58+
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2
59+
; GISEL-NEXT: v_mov_b32_e32 v0, 0
60+
; GISEL-NEXT: s_nop 5
61+
; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[2:3]
62+
; GISEL-NEXT: s_endpgm
63+
bb:
64+
%id = call i32 @llvm.amdgcn.workitem.id.x()
65+
%gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id
66+
%in.1 = load <4 x float>, ptr addrspace(1) %gep
67+
%mai.1 = tail call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %a, <16 x half> %b, <4 x float> %in.1, i32 %idx, i32 1, i32 2)
68+
store <4 x float> %mai.1, ptr addrspace(1) %arg
69+
ret void
70+
}
71+
72+
define <4 x float> @test_smfmac_f32_16x16x64_f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
73+
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16:
74+
; SDAG: ; %bb.0:
75+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
76+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
77+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
78+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
79+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
80+
; SDAG-NEXT: s_nop 1
81+
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16
82+
; SDAG-NEXT: s_nop 6
83+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
84+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
85+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
86+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
87+
; SDAG-NEXT: s_setpc_b64 s[30:31]
88+
;
89+
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16:
90+
; GISEL: ; %bb.0:
91+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
92+
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16
93+
; GISEL-NEXT: s_nop 6
94+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
95+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
96+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
97+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
98+
; GISEL-NEXT: s_setpc_b64 s[30:31]
99+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
100+
ret <4 x float> %result
101+
}
102+
103+
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags0(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
104+
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
105+
; SDAG: ; %bb.0:
106+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
108+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
109+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
110+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
111+
; SDAG-NEXT: s_nop 1
112+
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3
113+
; SDAG-NEXT: s_nop 6
114+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
115+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
116+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
117+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
118+
; SDAG-NEXT: s_setpc_b64 s[30:31]
119+
;
120+
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags0:
121+
; GISEL: ; %bb.0:
122+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
123+
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3
124+
; GISEL-NEXT: s_nop 6
125+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
126+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
127+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
128+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
129+
; GISEL-NEXT: s_setpc_b64 s[30:31]
130+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3)
131+
ret <4 x float> %result
132+
}
133+
134+
define <4 x float> @test_smfmac_f32_16x16x64_f16__flags1(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3) {
135+
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
136+
; SDAG: ; %bb.0:
137+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
138+
; SDAG-NEXT: v_accvgpr_write_b32 a0, v12
139+
; SDAG-NEXT: v_accvgpr_write_b32 a1, v13
140+
; SDAG-NEXT: v_accvgpr_write_b32 a2, v14
141+
; SDAG-NEXT: v_accvgpr_write_b32 a3, v15
142+
; SDAG-NEXT: s_nop 1
143+
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1
144+
; SDAG-NEXT: s_nop 6
145+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
146+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
147+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
148+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
149+
; SDAG-NEXT: s_setpc_b64 s[30:31]
150+
;
151+
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__flags1:
152+
; GISEL: ; %bb.0:
153+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
154+
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1
155+
; GISEL-NEXT: s_nop 6
156+
; GISEL-NEXT: v_mov_b32_e32 v0, v12
157+
; GISEL-NEXT: v_mov_b32_e32 v1, v13
158+
; GISEL-NEXT: v_mov_b32_e32 v2, v14
159+
; GISEL-NEXT: v_mov_b32_e32 v3, v15
160+
; GISEL-NEXT: s_setpc_b64 s[30:31]
161+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1)
162+
ret <4 x float> %result
163+
}
164+
165+
define <4 x float> @test_smfmac_f32_16x16x64_f16__sgpr(<8 x half> inreg %arg0, <16 x half> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) {
166+
; SDAG-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
167+
; SDAG: ; %bb.0:
168+
; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
169+
; SDAG-NEXT: v_mov_b32_e32 v8, s0
170+
; SDAG-NEXT: v_mov_b32_e32 v9, s1
171+
; SDAG-NEXT: v_mov_b32_e32 v10, s2
172+
; SDAG-NEXT: v_mov_b32_e32 v11, s3
173+
; SDAG-NEXT: v_mov_b32_e32 v0, s4
174+
; SDAG-NEXT: v_mov_b32_e32 v1, s5
175+
; SDAG-NEXT: v_mov_b32_e32 v2, s6
176+
; SDAG-NEXT: v_mov_b32_e32 v3, s7
177+
; SDAG-NEXT: v_mov_b32_e32 v4, s8
178+
; SDAG-NEXT: v_mov_b32_e32 v5, s9
179+
; SDAG-NEXT: v_mov_b32_e32 v6, s10
180+
; SDAG-NEXT: v_mov_b32_e32 v7, s11
181+
; SDAG-NEXT: v_accvgpr_write_b32 a0, s12
182+
; SDAG-NEXT: v_accvgpr_write_b32 a1, s13
183+
; SDAG-NEXT: v_accvgpr_write_b32 a2, s14
184+
; SDAG-NEXT: v_accvgpr_write_b32 a3, s15
185+
; SDAG-NEXT: v_mov_b32_e32 v12, s16
186+
; SDAG-NEXT: s_nop 1
187+
; SDAG-NEXT: v_smfmac_f32_16x16x64_f16 a[0:3], v[8:11], v[0:7], v12
188+
; SDAG-NEXT: s_nop 6
189+
; SDAG-NEXT: v_accvgpr_read_b32 v0, a0
190+
; SDAG-NEXT: v_accvgpr_read_b32 v1, a1
191+
; SDAG-NEXT: v_accvgpr_read_b32 v2, a2
192+
; SDAG-NEXT: v_accvgpr_read_b32 v3, a3
193+
; SDAG-NEXT: s_setpc_b64 s[30:31]
194+
;
195+
; GISEL-LABEL: test_smfmac_f32_16x16x64_f16__sgpr:
196+
; GISEL: ; %bb.0:
197+
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198+
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3]
199+
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1]
200+
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5]
201+
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
202+
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7]
203+
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9]
204+
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11]
205+
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
206+
; GISEL-NEXT: v_mov_b32_e32 v16, s16
207+
; GISEL-NEXT: s_nop 1
208+
; GISEL-NEXT: v_smfmac_f32_16x16x64_f16 v[0:3], v[12:15], v[4:11], v16
209+
; GISEL-NEXT: s_setpc_b64 s[30:31]
210+
%result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.f16(<8 x half> %arg0, <16 x half> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0)
211+
ret <4 x float> %result
212+
}
213+
214+
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
215+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
216+
; GCN: {{.*}}

0 commit comments

Comments
 (0)