Skip to content

Commit 6a71e67

Browse files
committed
AMDGPU: Optimize mfma_scale intrinsics with 0 scale inputs
We can use the unscaled form of the instruction if both scale factors are known to be 0.
1 parent 1160fea commit 6a71e67

File tree

4 files changed

+53
-40
lines changed

4 files changed

+53
-40
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,19 @@ def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
304304
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
305305
>;
306306

307+
308+
// Match v_mfma_scale* intrinsic calls whose scale operands are both the
309+
// constant 0, so that the unscaled MFMA instruction can be selected instead.
310+
class UnscaledMFMAOptimizationPat<SDPatternOperator intrin> : PatFrag<
311+
(ops node:$srca, node:$srcb, node:$srcc,
312+
node:$cbsz, node:$abid, node:$blgp),
313+
(intrin $srca, $srcb, $srcc, $cbsz, $abid, $blgp,
314+
srcvalue, 0, srcvalue, 0) // NOTE(review): presumably (op_sel, scale) pairs, op_sel ignored and scale required to be 0 - confirm against the intrinsic signature
315+
>;
316+
317+
def mfma_f32_16x16x128_f8f6f4 : UnscaledMFMAOptimizationPat<int_amdgcn_mfma_scale_f32_16x16x128_f8f6f4>;
318+
def mfma_f32_32x32x64_f8f6f4 : UnscaledMFMAOptimizationPat<int_amdgcn_mfma_scale_f32_32x32x64_f8f6f4>;
319+
307320
//===----------------------------------------------------------------------===//
308321
// ValueType helpers
309322
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -818,9 +818,9 @@ defm V_MFMA_F32_32X32X16_F16 : MAIInst<"v_mfma_f32_32x32x16f16", "F32_V8F16
818818
defm V_MFMA_F32_32X32X16_BF16 : MAIInst<"v_mfma_f32_32x32x16bf16", "F32_V8BF16_X16", int_amdgcn_mfma_f32_32x32x16_bf16>;
819819

820820
defm V_MFMA_F32_16X16X128_F8F6F4 : MAIInst<"v_mfma_f32_16x16x128f8f6f4",
821-
"F32_V8I32_X128">;
821+
"F32_V8I32_X128", mfma_f32_16x16x128_f8f6f4>;
822822
defm V_MFMA_F32_32X32X64_F8F6F4 : MAIInst<"v_mfma_f32_32x32x64f8f6f4",
823-
"F32_V8I32_X512">;
823+
"F32_V8I32_X512", mfma_f32_32x32x64_f8f6f4>;
824824

825825
defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : ScaledMAIInst_mc<
826826
"v_mfma_scale_f32_16x16x128_f8f6f4", "V_MFMA_F32_16X16X128_F8F6F4",

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -880,7 +880,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_a(
880880
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
881881
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
882882
; GCN-NEXT: s_nop 1
883-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0]
883+
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
884884
; GCN-NEXT: s_nop 3
885885
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
886886
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
@@ -903,7 +903,7 @@ define <4 x float> @test_mfma_scale_f32_16x16x128_f8f6f4___constant_scale_0_0_b(
903903
; GCN-NEXT: v_accvgpr_write_b32 a2, v18
904904
; GCN-NEXT: v_accvgpr_write_b32 a3, v19
905905
; GCN-NEXT: s_nop 1
906-
; GCN-NEXT: v_mfma_scale_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3], 0, 0 op_sel_hi:[0,0,0]
906+
; GCN-NEXT: v_mfma_f32_16x16x128_f8f6f4 a[0:3], v[0:7], v[8:15], a[0:3]
907907
; GCN-NEXT: s_nop 3
908908
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
909909
; GCN-NEXT: v_accvgpr_read_b32 v1, a1

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2591,24 +2591,24 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
25912591
; SDAG-NEXT: v_mov_b32_e32 v14, s26
25922592
; SDAG-NEXT: v_mov_b32_e32 v15, s27
25932593
; SDAG-NEXT: s_waitcnt lgkmcnt(0)
2594-
; SDAG-NEXT: v_accvgpr_write_b32 a0, s8
2595-
; SDAG-NEXT: v_accvgpr_write_b32 a1, s9
2596-
; SDAG-NEXT: v_accvgpr_write_b32 a2, s10
2597-
; SDAG-NEXT: v_accvgpr_write_b32 a3, s11
2598-
; SDAG-NEXT: v_accvgpr_write_b32 a4, s12
2599-
; SDAG-NEXT: v_accvgpr_write_b32 a5, s13
2600-
; SDAG-NEXT: v_accvgpr_write_b32 a6, s14
2601-
; SDAG-NEXT: v_accvgpr_write_b32 a7, s15
2602-
; SDAG-NEXT: v_accvgpr_write_b32 a8, s16
2603-
; SDAG-NEXT: v_accvgpr_write_b32 a9, s17
2604-
; SDAG-NEXT: v_accvgpr_write_b32 a10, s18
2605-
; SDAG-NEXT: v_accvgpr_write_b32 a11, s19
2606-
; SDAG-NEXT: v_accvgpr_write_b32 a12, s20
2607-
; SDAG-NEXT: v_accvgpr_write_b32 a13, s21
2608-
; SDAG-NEXT: v_accvgpr_write_b32 a14, s22
2609-
; SDAG-NEXT: v_accvgpr_write_b32 a15, s23
2594+
; SDAG-NEXT: v_accvgpr_write_b32 a31, s23
2595+
; SDAG-NEXT: v_accvgpr_write_b32 a30, s22
2596+
; SDAG-NEXT: v_accvgpr_write_b32 a29, s21
2597+
; SDAG-NEXT: v_accvgpr_write_b32 a28, s20
2598+
; SDAG-NEXT: v_accvgpr_write_b32 a27, s19
2599+
; SDAG-NEXT: v_accvgpr_write_b32 a26, s18
2600+
; SDAG-NEXT: v_accvgpr_write_b32 a25, s17
2601+
; SDAG-NEXT: v_accvgpr_write_b32 a24, s16
2602+
; SDAG-NEXT: v_accvgpr_write_b32 a23, s15
2603+
; SDAG-NEXT: v_accvgpr_write_b32 a22, s14
2604+
; SDAG-NEXT: v_accvgpr_write_b32 a21, s13
2605+
; SDAG-NEXT: v_accvgpr_write_b32 a20, s12
2606+
; SDAG-NEXT: v_accvgpr_write_b32 a19, s11
2607+
; SDAG-NEXT: v_accvgpr_write_b32 a18, s10
2608+
; SDAG-NEXT: v_accvgpr_write_b32 a17, s9
2609+
; SDAG-NEXT: v_accvgpr_write_b32 a16, s8
26102610
; SDAG-NEXT: s_nop 1
2611-
; SDAG-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 abid:2 blgp:3
2611+
; SDAG-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] cbsz:1 abid:2 blgp:3
26122612
; SDAG-NEXT: v_mov_b32_e32 v0, s20
26132613
; SDAG-NEXT: v_mov_b32_e32 v1, s21
26142614
; SDAG-NEXT: v_mov_b32_e32 v2, s22
@@ -2655,31 +2655,31 @@ define amdgpu_kernel void @test_mfma_scale_f32_32x32x64_f8f6f4_0_0__vgprcd_nonma
26552655
; GISEL-NEXT: s_waitcnt lgkmcnt(0)
26562656
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[36:37]
26572657
; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[44:45]
2658-
; GISEL-NEXT: v_accvgpr_write_b32 a0, s8
2658+
; GISEL-NEXT: v_accvgpr_write_b32 a31, s23
26592659
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[38:39]
26602660
; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[40:41]
26612661
; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[42:43]
26622662
; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[46:47]
26632663
; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[48:49]
26642664
; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[50:51]
2665-
; GISEL-NEXT: v_accvgpr_write_b32 a1, s9
2666-
; GISEL-NEXT: v_accvgpr_write_b32 a2, s10
2667-
; GISEL-NEXT: v_accvgpr_write_b32 a3, s11
2668-
; GISEL-NEXT: v_accvgpr_write_b32 a4, s12
2669-
; GISEL-NEXT: v_accvgpr_write_b32 a5, s13
2670-
; GISEL-NEXT: v_accvgpr_write_b32 a6, s14
2671-
; GISEL-NEXT: v_accvgpr_write_b32 a7, s15
2672-
; GISEL-NEXT: v_accvgpr_write_b32 a8, s16
2673-
; GISEL-NEXT: v_accvgpr_write_b32 a9, s17
2674-
; GISEL-NEXT: v_accvgpr_write_b32 a10, s18
2675-
; GISEL-NEXT: v_accvgpr_write_b32 a11, s19
2676-
; GISEL-NEXT: v_accvgpr_write_b32 a12, s20
2677-
; GISEL-NEXT: v_accvgpr_write_b32 a13, s21
2678-
; GISEL-NEXT: v_accvgpr_write_b32 a14, s22
2679-
; GISEL-NEXT: v_accvgpr_write_b32 a15, s23
2665+
; GISEL-NEXT: v_accvgpr_write_b32 a30, s22
2666+
; GISEL-NEXT: v_accvgpr_write_b32 a29, s21
2667+
; GISEL-NEXT: v_accvgpr_write_b32 a28, s20
2668+
; GISEL-NEXT: v_accvgpr_write_b32 a27, s19
2669+
; GISEL-NEXT: v_accvgpr_write_b32 a26, s18
2670+
; GISEL-NEXT: v_accvgpr_write_b32 a25, s17
2671+
; GISEL-NEXT: v_accvgpr_write_b32 a24, s16
2672+
; GISEL-NEXT: v_accvgpr_write_b32 a23, s15
2673+
; GISEL-NEXT: v_accvgpr_write_b32 a22, s14
2674+
; GISEL-NEXT: v_accvgpr_write_b32 a21, s13
2675+
; GISEL-NEXT: v_accvgpr_write_b32 a20, s12
2676+
; GISEL-NEXT: v_accvgpr_write_b32 a19, s11
2677+
; GISEL-NEXT: v_accvgpr_write_b32 a18, s10
2678+
; GISEL-NEXT: v_accvgpr_write_b32 a17, s9
2679+
; GISEL-NEXT: v_accvgpr_write_b32 a16, s8
26802680
; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11]
26812681
; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9]
2682-
; GISEL-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0] cbsz:1 abid:2 blgp:3
2682+
; GISEL-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[16:31] cbsz:1 abid:2 blgp:3
26832683
; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
26842684
; GISEL-NEXT: v_mov_b64_e32 v[4:5], 0
26852685
; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
@@ -2887,7 +2887,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_a(
28872887
; GCN-NEXT: v_accvgpr_write_b32 a14, v30
28882888
; GCN-NEXT: s_waitcnt vmcnt(0)
28892889
; GCN-NEXT: s_nop 0
2890-
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0]
2890+
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
28912891
; GCN-NEXT: s_nop 3
28922892
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
28932893
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
@@ -2935,7 +2935,7 @@ define <16 x float> @test_mfma_scale_f32_32x32x64_f8f6f4___constant_scale_0_0_b(
29352935
; GCN-NEXT: v_accvgpr_write_b32 a14, v30
29362936
; GCN-NEXT: s_waitcnt vmcnt(0)
29372937
; GCN-NEXT: s_nop 0
2938-
; GCN-NEXT: v_mfma_scale_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15], 0, 0 op_sel_hi:[0,0,0]
2938+
; GCN-NEXT: v_mfma_f32_32x32x64_f8f6f4 a[0:15], v[0:7], v[8:15], a[0:15]
29392939
; GCN-NEXT: s_nop 3
29402940
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
29412941
; GCN-NEXT: v_accvgpr_read_b32 v1, a1

0 commit comments

Comments
 (0)