Skip to content

Commit 27439a7

Browse files
committed
[AMDGPU] New gfx940 mfma instructions
Differential Revision: https://reviews.llvm.org/D122044
1 parent 1e3713f commit 27439a7

File tree

14 files changed

+474
-1
lines changed

clang/include/clang/Basic/BuiltinsAMDGPU.def

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,5 +305,10 @@ TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x16bf16_1k, "V4fV4sV4sV4fIiIiIi",
305305
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_16x16x4f64, "V4dddV4dIiIiIi", "nc", "mai-insts")
306306
TARGET_BUILTIN(__builtin_amdgcn_mfma_f64_4x4x4f64, "ddddIiIiIi", "nc", "mai-insts")
307307

308+
// MFMA builtins added for gfx940. The i8 variants take two 64-bit integer
// sources and an int vector accumulator; the xf32 variants take two 2-element
// float vectors and a float vector accumulator. In every variant the last
// three arguments must be integer constants.
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_16x16x32_i8, "V4iWiWiV4iIiIiIi", "nc", "mai-insts")
309+
TARGET_BUILTIN(__builtin_amdgcn_mfma_i32_32x32x16_i8, "V16iWiWiV16iIiIiIi", "nc", "mai-insts")
310+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_16x16x8_xf32, "V4fV2fV2fV4fIiIiIi", "nc", "mai-insts")
311+
TARGET_BUILTIN(__builtin_amdgcn_mfma_f32_32x32x4_xf32, "V16fV2fV2fV16fIiIiIi", "nc", "mai-insts")
312+
308313
#undef BUILTIN
309314
#undef TARGET_BUILTIN

clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
// REQUIRES: amdgpu-registered-target
22
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908
33
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A
4+
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
45

56
#pragma OPENCL EXTENSION cl_khr_fp64:enable
67

8+
// Float vector aliases with 2, 4, 16 and 32 elements. v2f is the type of the
// a and b operands in the xf32 MFMA tests further down.
typedef float v2f __attribute__((ext_vector_type(2)));
79
typedef float v4f __attribute__((ext_vector_type(4)));
810
typedef float v16f __attribute__((ext_vector_type(16)));
911
typedef float v32f __attribute__((ext_vector_type(32)));
@@ -216,3 +218,33 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c)
216218
}
217219

218220
#endif // MFMA_GFX90A_TESTS
221+
222+
#ifdef MFMA_GFX940_TESTS
223+
// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8
224+
// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0)
225+
// Calls the builtin with a, b, c and all three constant arguments set to 0,
// then stores the <4 x i32> result through 'out'. The FileCheck pattern above
// matches the parameters by name (%a, %b, %c), so the names must stay in sync.
void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c)
226+
{
227+
*out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0);
228+
}
229+
230+
// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8
231+
// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0)
232+
// Calls the builtin with a, b, c and all three constant arguments set to 0,
// then stores the <16 x i32> result through 'out'. The FileCheck pattern above
// matches the parameters by name (%a, %b, %c), so the names must stay in sync.
void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c)
233+
{
234+
*out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0);
235+
}
236+
237+
// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32
238+
// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0)
239+
// Calls the builtin with two v2f sources, a v4f accumulator and all three
// constant arguments set to 0, then stores the result through 'out'. The
// FileCheck pattern above matches the parameters by name (%a, %b, %c).
void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c)
240+
{
241+
*out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0);
242+
}
243+
244+
// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32
245+
// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0)
246+
// Calls the builtin with two v2f sources, a v16f accumulator and all three
// constant arguments set to 0, then stores the result through 'out'. The
// FileCheck pattern above matches the parameters by name (%a, %b, %c).
void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c)
247+
{
248+
*out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0);
249+
}
250+
#endif // MFMA_GFX940_TESTS
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s
2+
3+
// Vector aliases for the operand and result types of the builtin calls in
// the functions below.
typedef float v2f __attribute__((ext_vector_type(2)));
4+
typedef float v4f __attribute__((ext_vector_type(4)));
5+
typedef float v16f __attribute__((ext_vector_type(16)));
6+
typedef int v4i __attribute__((ext_vector_type(4)));
7+
typedef int v16i __attribute__((ext_vector_type(16)));
8+
9+
// Passes the non-constant 'd' in each of the last three argument positions in
// turn. The verify annotation on each call requires the "must be a constant
// integer" diagnostic for that call.
void test_mfma_i32_16x16x32i8(global v4i* out, long a, long b, v4i c, int d)
10+
{
11+
*out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}}
12+
*out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}}
13+
*out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_16x16x32_i8' must be a constant integer}}
14+
}
15+
16+
// Passes the non-constant 'd' in each of the last three argument positions in
// turn. The verify annotation on each call requires the "must be a constant
// integer" diagnostic for that call.
void test_mfma_i32_32x32x16i8(global v16i* out, long a, long b, v16i c, int d)
17+
{
18+
*out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}}
19+
*out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}}
20+
*out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_i32_32x32x16_i8' must be a constant integer}}
21+
}
22+
23+
// Passes the non-constant 'd' in each of the last three argument positions in
// turn. The verify annotation on each call requires the "must be a constant
// integer" diagnostic for that call.
void test_mfma_f32_16x16x8xf32(global v4f* out, v2f a, v2f b, v4f c, int d)
24+
{
25+
*out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}}
26+
*out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}}
27+
*out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_16x16x8_xf32' must be a constant integer}}
28+
}
29+
30+
// Passes the non-constant 'd' in each of the last three argument positions in
// turn. The verify annotation on each call requires the "must be a constant
// integer" diagnostic for that call.
void test_mfma_f32_32x32x4xf32(global v16f* out, v2f a, v2f b, v16f c, int d)
31+
{
32+
*out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, d, 0, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}}
33+
*out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, d, 0); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}}
34+
*out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, d); // expected-error{{argument to '__builtin_amdgcn_mfma_f32_32x32x4_xf32' must be a constant integer}}
35+
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,6 +1999,11 @@ def int_amdgcn_ds_fadd_v2bf16 : Intrinsic<
19991999
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>]>,
20002000
GCCBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
20012001

2002+
// gfx940 MFMA intrinsics. NOTE(review): the first template argument appears
// to be the result/accumulator vector type and the second the type of the
// two source operands -- AMDGPUMfmaIntrinsic is defined outside this view,
// confirm against its definition.
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
2003+
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
2004+
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
2005+
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
2006+
20022007
//===----------------------------------------------------------------------===//
20032008
// Special Intrinsics for backend internal use only. No frontend
20042009
// should emit calls to these.

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4243,7 +4243,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
42434243
case Intrinsic::amdgcn_mfma_f32_32x32x8bf16_1k:
42444244
case Intrinsic::amdgcn_mfma_f32_16x16x16bf16_1k:
42454245
case Intrinsic::amdgcn_mfma_f64_16x16x4f64:
4246-
case Intrinsic::amdgcn_mfma_f64_4x4x4f64: {
4246+
case Intrinsic::amdgcn_mfma_f64_4x4x4f64:
4247+
case Intrinsic::amdgcn_mfma_i32_16x16x32_i8:
4248+
case Intrinsic::amdgcn_mfma_i32_32x32x16_i8:
4249+
case Intrinsic::amdgcn_mfma_f32_16x16x8_xf32:
4250+
case Intrinsic::amdgcn_mfma_f32_32x32x4_xf32: {
42474251
// Default for MAI intrinsics.
42484252
// srcC can also be an immediate which can be folded later.
42494253
// FIXME: Should we eventually add an alternative mapping with AGPR src

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,10 @@ def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
295295
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
296296
def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
297297
def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
298+
// The gfx940 MFMA intrinsics are registered as sources of divergence, like
// the other MFMA intrinsics listed immediately above.
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
299+
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
300+
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
301+
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
298302

299303
// The dummy boolean output is divergent from the IR's perspective,
300304
// but the mask results are uniform. These produce a divergent and

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2407,6 +2407,11 @@ def VOP_V4F32_V4I16_V4I16_V4F32 : VOPProfile <[v4f32, v4i16, v4i16, v4f32]>;
24072407
def VOP_V16F32_V4I16_V4I16_V16F32 : VOPProfile <[v16f32, v4i16, v4i16, v16f32]>;
24082408
def VOP_V32F32_V4I16_V4I16_V32F32 : VOPProfile <[v32f32, v4i16, v4i16, v32f32]>;
24092409

2410+
// Type lists for the gfx940 MFMA profiles. Each def name spells out the same
// four types, in the same order, as its list argument.
def VOP_V4I32_I64_I64_V4I32 : VOPProfile <[v4i32, i64, i64, v4i32]>;
2411+
def VOP_V16I32_I64_I64_V16I32 : VOPProfile <[v16i32, i64, i64, v16i32]>;
2412+
def VOP_V4F32_V2F32_V2F32_V4F32 : VOPProfile <[v4f32, v2f32, v2f32, v4f32]>;
2413+
def VOP_V16F32_V2F32_V2F32_V16F32 : VOPProfile <[v16f32, v2f32, v2f32, v16f32]>;
2414+
24102415
class Commutable_REV <string revOp, bit isOrig> {
24112416
string RevOp = revOp;
24122417
bit IsOrig = isOrig;

llvm/lib/Target/AMDGPU/SISchedule.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,10 +264,14 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
264264
def : InstRW<[Write64Bit], (instregex "^V_ACCVGPR_WRITE_B32_e64$")>;
265265
def : InstRW<[Write2PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_4X4X")>;
266266

267+
// 16x16 MFMA names starting 16X16X8X, 16X16X16 or 16X16X32 all map to
// Write4PassMAI. The gfx940 names V_MFMA_F32_16X16X8XF32 and
// V_MFMA_I32_16X16X32I8 get their own patterns; none of the pre-existing
// patterns in this group (including the [14][FBI] one that follows) match them.
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X8X")>;
267268
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X16")>;
269+
def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X32")>;
268270
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_16X16X[14][FBI]")>;
269271

272+
// 32x32 MFMA names starting 32X32X4XF, 32X32X8 or 32X32X16 all map to
// Write8PassMAI. The gfx940 names V_MFMA_F32_32X32X4XF32 and
// V_MFMA_I32_32X32X16I8 get their own patterns; the [124][FBI] pattern that
// follows does not match them.
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X4XF")>;
270273
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X8")>;
274+
def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X16")>;
271275
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_.32_32X32X[124][FBI]")>;
272276

273277
def : InstRW<[Write4PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_4X4X")>;

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,10 @@ def VOPProfileMAI_F32_V4I16_X16 : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F32, A
412412
def VOPProfileMAI_F32_V4I16_X32 : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
413413
def VOPProfileMAI_F64_16X16X4F64 : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, AISrc_256_f64, ADst_256, AVSrc_64>;
414414
def VOPProfileMAI_F64_4X4X4F64 : VOPProfileMAI<VOP_F64_F64_F64_F64, AISrc_64_f64, ADst_64, AVSrc_64>;
415+
// MAI profiles for the gfx940 i8 and xf32 MFMA instructions, using the
// AISrc_*/ADst_* operand classes. The _VCD defs further down repeat these
// with VISrc_*/VDst_* classes instead.
def VOPProfileMAI_I32_I64_X16 : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, AISrc_128_b32, ADst_128, AVSrc_64>;
416+
def VOPProfileMAI_I32_I64_X32 : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, AISrc_512_b32, ADst_512, AVSrc_64>;
417+
def VOPProfileMAI_F32_V2F32_X16 : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, AISrc_128_b32, ADst_128, AVSrc_64>;
418+
def VOPProfileMAI_F32_V2F32_X32 : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, AISrc_512_b32, ADst_512, AVSrc_64>;
415419

416420
def VOPProfileMAI_F32_F32_X4_VCD : VOPProfileMAI<VOP_V4F32_F32_F32_V4F32, VISrc_128_f32, VDst_128>;
417421
def VOPProfileMAI_F32_F32_X16_VCD : VOPProfileMAI<VOP_V16F32_F32_F32_V16F32, VISrc_512_f32, VDst_512>;
@@ -430,6 +434,10 @@ def VOPProfileMAI_F32_V4I16_X16_VCD : VOPProfileMAI<VOP_V16F32_V4I16_V4I16_V16F
430434
def VOPProfileMAI_F32_V4I16_X32_VCD : VOPProfileMAI<VOP_V32F32_V4I16_V4I16_V32F32, VISrc_1024_b32, VDst_1024, AVSrc_64>;
431435
def VOPProfileMAI_F64_16X16X4F64_VCD : VOPProfileMAI<VOP_V4F64_F64_F64_V4F64, VISrc_256_f64, VDst_256, AVSrc_64>;
432436
def VOPProfileMAI_F64_4X4X4F64_VCD : VOPProfileMAI<VOP_F64_F64_F64_F64, VISrc_64_f64, VDst_64, AVSrc_64>;
437+
// _VCD counterparts of the gfx940 i8 and xf32 MAI profiles: same type lists
// and AVSrc_64 argument, but with VISrc_*/VDst_* operand classes.
def VOPProfileMAI_I32_I64_X16_VCD : VOPProfileMAI<VOP_V4I32_I64_I64_V4I32, VISrc_128_b32, VDst_128, AVSrc_64>;
438+
def VOPProfileMAI_I32_I64_X32_VCD : VOPProfileMAI<VOP_V16I32_I64_I64_V16I32, VISrc_512_b32, VDst_512, AVSrc_64>;
439+
def VOPProfileMAI_F32_V2F32_X16_VCD : VOPProfileMAI<VOP_V4F32_V2F32_V2F32_V4F32, VISrc_128_b32, VDst_128, AVSrc_64>;
440+
def VOPProfileMAI_F32_V2F32_X32_VCD : VOPProfileMAI<VOP_V16F32_V2F32_V2F32_V16F32, VISrc_512_b32, VDst_512, AVSrc_64>;
433441

434442
class MFMATable <bit is_mac, string Name> {
435443
bit IsMac = is_mac;
@@ -527,6 +535,13 @@ let Predicates = [isGFX90APlus] in {
527535
defm V_MFMA_F64_4X4X4F64 : MAIInst<"v_mfma_f64_4x4x4f64", "F64_4X4X4F64", int_amdgcn_mfma_f64_4x4x4f64>;
528536
} // End Predicates = [isGFX90APlus]
529537

538+
// MFMA instructions guarded by the isGFX940Plus predicate. Each defm passes a
// mnemonic, a profile-name suffix (matching the VOPProfileMAI_* defs earlier
// in this file) and an intrinsic. NOTE(review): MAIInst is defined outside
// this view; presumably it selects the instruction from that intrinsic.
let Predicates = [isGFX940Plus] in {
539+
defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>;
540+
defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
541+
defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
542+
defm V_MFMA_F32_32X32X4XF32 : MAIInst<"v_mfma_f32_32x32x4xf32", "F32_V2F32_X32", int_amdgcn_mfma_f32_32x32x4_xf32>;
543+
} // End Predicates = [isGFX940Plus]
544+
530545
let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1 in {
531546
defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
532547
defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
@@ -727,6 +742,11 @@ defm V_MFMA_F32_16X16X16BF16_1K : VOP3P_Real_MFMA_gfx90a <0x67>;
727742
defm V_MFMA_F64_16X16X4F64 : VOP3P_Real_MFMA_gfx90a <0x6e>;
728743
defm V_MFMA_F64_4X4X4F64 : VOP3P_Real_MFMA_gfx90a <0x6f>;
729744

745+
// gfx940 real definitions for the four new MFMA instructions. The string
// argument spells the type suffix with an extra underscore ("..._i8",
// "..._xf32") compared with the mnemonics given to MAIInst. NOTE(review): the
// first argument is presumably the opcode; VOP3P_Real_MFMA_gfx940 is defined
// outside this view.
defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
746+
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
747+
defm V_MFMA_F32_16X16X8XF32 : VOP3P_Real_MFMA_gfx940 <0x3e, "v_mfma_f32_16x16x8_xf32">;
748+
defm V_MFMA_F32_32X32X4XF32 : VOP3P_Real_MFMA_gfx940 <0x3f, "v_mfma_f32_32x32x4_xf32">;
749+
730750
defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5d, "v_mfma_f32_32x32x4_2b_bf16">;
731751
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5e, "v_mfma_f32_16x16x4_4b_bf16">;
732752
defm V_MFMA_F32_4X4X4BF16_1K : VOP3P_Real_MFMA_gfx940 <0x5f, "v_mfma_f32_4x4x4_16b_bf16">;

0 commit comments

Comments
 (0)