Skip to content

Commit adfe14c

Browse files
arsenm authored and chencha3 committed
AMDGPU: Use defset to cleanup marking MFMA intrinsics as divergent (llvm#85915)
1 parent cadade8 commit adfe14c

File tree

2 files changed

+59
-114
lines changed

2 files changed

+59
-114
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 48 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
26532653
// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
26542654
// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
26552655
// The content of the other 16-bit half is preserved from the input.
2656+
2657+
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX11 = {
26562658
def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
26572659
def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
26582660

@@ -2668,6 +2670,7 @@ def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, l
26682670
// GFX12: The op_sel bit must be 0.
26692671
def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
26702672
def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
2673+
}
26712674

26722675
//===----------------------------------------------------------------------===//
26732676
// GFX12 Intrinsics
@@ -2687,20 +2690,6 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
26872690
[IntrNoMem, IntrConvergent, IntrWillReturn,
26882691
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
26892692

2690-
2691-
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2692-
//
2693-
// These operations perform a matrix multiplication and accumulation of
2694-
// the form: D = A * B + C .
2695-
2696-
// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
2697-
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2698-
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2699-
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2700-
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2701-
// A and B are <16 x iu4>.
2702-
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
2703-
27042693
// SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics
27052694
//
27062695
// These operations perform a sparse matrix multiplication and accumulation of
@@ -2734,6 +2723,20 @@ class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType I
27342723
[IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
27352724
>;
27362725

2726+
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX12 = {
2727+
// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
2728+
//
2729+
// These operations perform a matrix multiplication and accumulation of
2730+
// the form: D = A * B + C .
2731+
2732+
// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
2733+
def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2734+
def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2735+
def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2736+
def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
2737+
// A and B are <16 x iu4>.
2738+
def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
2739+
27372740
def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
27382741
def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
27392742
def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
@@ -2745,6 +2748,7 @@ def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyin
27452748
def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
27462749
def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
27472750
def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
2751+
}
27482752

27492753
def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
27502754

@@ -3012,6 +3016,7 @@ class AMDGPUMfmaIntrinsic<LLVMType DestTy, LLVMType SrcABTy> :
30123016
[IntrConvergent, IntrNoMem,
30133017
ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
30143018

3019+
defset list<Intrinsic> AMDGPUMFMAIntrinsics908 = {
30153020
def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_float_ty>;
30163021
def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_float_ty>;
30173022
def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_float_ty>;
@@ -3032,6 +3037,7 @@ def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v
30323037
def int_amdgcn_mfma_f32_4x4x2bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
30333038
def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2i16_ty>;
30343039
def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2i16_ty>;
3040+
}
30353041

30363042
//===----------------------------------------------------------------------===//
30373043
// gfx90a intrinsics
@@ -3043,6 +3049,7 @@ def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30433049
def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30443050
def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
30453051

3052+
defset list<Intrinsic> AMDGPUMFMAIntrinsics90A = {
30463053
def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v32f32_ty, llvm_v4i16_ty>;
30473054
def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty>;
30483055
def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty>;
@@ -3054,25 +3061,12 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, ll
30543061
// source operand.
30553062
def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic<llvm_v4f64_ty, llvm_double_ty>;
30563063
def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic<llvm_double_ty, llvm_double_ty>;
3064+
}
30573065

30583066
//===----------------------------------------------------------------------===//
30593067
// gfx940 intrinsics
30603068
// ===----------------------------------------------------------------------===//
30613069

3062-
// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
3063-
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
3064-
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
3065-
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
3066-
[llvm_v2i16_ty],
3067-
[LLVMQualPointerType<3>, llvm_v2i16_ty],
3068-
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
3069-
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
3070-
3071-
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
3072-
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
3073-
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
3074-
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
3075-
30763070
class AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> :
30773071
AMDGPUMfmaIntrinsic<DestTy, llvm_i64_ty>;
30783072

@@ -3081,9 +3075,6 @@ multiclass AMDGPUMFp8MfmaIntrinsic<LLVMType DestTy> {
30813075
def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic<DestTy>;
30823076
}
30833077

3084-
defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
3085-
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
3086-
30873078
// llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid
30883079
class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
30893080
ClangBuiltin<!subst("int", "__builtin", NAME)>,
@@ -3093,13 +3084,6 @@ class AMDGPUMSmfmacIntrinsic<LLVMType DestTy, LLVMType SrcA, LLVMType SrcB> :
30933084
[IntrConvergent, IntrNoMem,
30943085
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>]>;
30953086

3096-
def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
3097-
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
3098-
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
3099-
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
3100-
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
3101-
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
3102-
31033087
class AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> :
31043088
AMDGPUMSmfmacIntrinsic<DestTy, llvm_v2i32_ty, llvm_v4i32_ty>;
31053089

@@ -3108,8 +3092,34 @@ multiclass AMDGPUMFp8SmfmacIntrinsic<LLVMType DestTy> {
31083092
def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic<DestTy>;
31093093
}
31103094

3095+
// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm.
3096+
def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
3097+
def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
3098+
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
3099+
[llvm_v2i16_ty],
3100+
[LLVMQualPointerType<3>, llvm_v2i16_ty],
3101+
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
3102+
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
3103+
3104+
defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
3105+
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
3106+
def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic<llvm_v16i32_ty, llvm_i64_ty>;
3107+
def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic<llvm_v4f32_ty, llvm_v2f32_ty>;
3108+
def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic<llvm_v16f32_ty, llvm_v2f32_ty>;
3109+
3110+
defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic<llvm_v4f32_ty>;
3111+
defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic<llvm_v16f32_ty>;
3112+
3113+
def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
3114+
def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4f16_ty, llvm_v8f16_ty>;
3115+
def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v4f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
3116+
def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic<llvm_v16f32_ty, llvm_v4i16_ty, llvm_v8i16_ty>;
3117+
def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic<llvm_v4i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
3118+
def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic<llvm_v16i32_ty, llvm_v2i32_ty, llvm_v4i32_ty>;
3119+
31113120
defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic<llvm_v4f32_ty>;
31123121
defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic<llvm_v16f32_ty>;
3122+
}
31133123

31143124
// llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3]
31153125
// byte_sel selects byte from srcA.

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 11 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -354,82 +354,17 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
354354
def : SourceOfDivergence<int_amdgcn_update_dpp>;
355355
def : SourceOfDivergence<int_amdgcn_writelane>;
356356

357-
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
358-
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
359-
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4f16>;
360-
def : SourceOfDivergence<int_amdgcn_mfma_i32_4x4x4i8>;
361-
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x2bf16>;
362-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x1f32>;
363-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f32>;
364-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4f16>;
365-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16f16>;
366-
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x4i8>;
367-
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x16i8>;
368-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x2bf16>;
369-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8bf16>;
370-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x1f32>;
371-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2f32>;
372-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4f16>;
373-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8f16>;
374-
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x4i8>;
375-
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
376-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
377-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
378-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16_1k>;
379-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x4bf16_1k>;
380-
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x4bf16_1k>;
381-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x8bf16_1k>;
382-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x16bf16_1k>;
383-
def : SourceOfDivergence<int_amdgcn_mfma_f64_16x16x4f64>;
384-
def : SourceOfDivergence<int_amdgcn_mfma_f64_4x4x4f64>;
385-
def : SourceOfDivergence<int_amdgcn_mfma_i32_16x16x32_i8>;
386-
def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x16_i8>;
387-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x8_xf32>;
388-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4_xf32>;
389-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_bf8>;
390-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_bf8_fp8>;
391-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_bf8>;
392-
def : SourceOfDivergence<int_amdgcn_mfma_f32_16x16x32_fp8_fp8>;
393-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_bf8>;
394-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
395-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
396-
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
397-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_f16>;
398-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_f16>;
399-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x32_bf16>;
400-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x16_bf16>;
401-
def : SourceOfDivergence<int_amdgcn_smfmac_i32_16x16x64_i8>;
402-
def : SourceOfDivergence<int_amdgcn_smfmac_i32_32x32x32_i8>;
403-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_bf8>;
404-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_bf8_fp8>;
405-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_bf8>;
406-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_16x16x64_fp8_fp8>;
407-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_bf8>;
408-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_bf8_fp8>;
409-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_bf8>;
410-
def : SourceOfDivergence<int_amdgcn_smfmac_f32_32x32x32_fp8_fp8>;
411-
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_f16>;
412-
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf16>;
413-
def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
414-
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
415-
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
416-
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
417-
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_fp8>;
418-
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_fp8_bf8>;
419-
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_fp8>;
420-
def : SourceOfDivergence<int_amdgcn_wmma_f32_16x16x16_bf8_bf8>;
421-
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x32_iu4>;
422-
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_f16>;
423-
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf16>;
424-
def : SourceOfDivergence<int_amdgcn_swmmac_f16_16x16x32_f16>;
425-
def : SourceOfDivergence<int_amdgcn_swmmac_bf16_16x16x32_bf16>;
426-
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu8>;
427-
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x32_iu4>;
428-
def : SourceOfDivergence<int_amdgcn_swmmac_i32_16x16x64_iu4>;
429-
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_fp8>;
430-
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_fp8_bf8>;
431-
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_fp8>;
432-
def : SourceOfDivergence<int_amdgcn_swmmac_f32_16x16x32_bf8_bf8>;
357+
foreach intr = AMDGPUMFMAIntrinsics908 in
358+
def : SourceOfDivergence<intr>;
359+
foreach intr = AMDGPUMFMAIntrinsics90A in
360+
def : SourceOfDivergence<intr>;
361+
foreach intr = AMDGPUMFMAIntrinsics940 in
362+
def : SourceOfDivergence<intr>;
363+
foreach intr = AMDGPUWMMAIntrinsicsGFX11 in
364+
def : SourceOfDivergence<intr>;
365+
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
366+
def : SourceOfDivergence<intr>;
367+
433368
def : SourceOfDivergence<int_amdgcn_global_load_tr>;
434369

435370
// The dummy boolean output is divergent from the IR's perspective,

0 commit comments

Comments
 (0)