Skip to content

Commit 0f3628a

Browse files
authored
AMDGPU: Correct cycle counts for f64 mfma on gfx940 (#83782)
1 parent a30233f commit 0f3628a

File tree

4 files changed

+79
-36
lines changed

4 files changed

+79
-36
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 29 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -2538,23 +2538,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
25382538
break;
25392539
case 8:
25402540
NeedWaitStates =
2541-
ST.hasGFX940Insts()
2542-
? isXDL(ST, *MFMA)
2543-
? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2544-
: GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2545-
: SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2541+
isDGEMM(MFMA->getOpcode())
2542+
? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2543+
: DMFMA16x16WriteVgprVALUReadWaitStates
2544+
: ST.hasGFX940Insts()
2545+
? isXDL(ST, *MFMA)
2546+
? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2547+
: GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2548+
: SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
25462549
break;
25472550
case 16: [[fallthrough]];
25482551
default:
2552+
assert(!isDGEMM(MFMA->getOpcode()));
25492553
NeedWaitStates =
2550-
isDGEMM(MFMA->getOpcode())
2551-
? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2552-
: DMFMA16x16WriteVgprVALUReadWaitStates
2553-
: ST.hasGFX940Insts()
2554-
? isXDL(ST, *MFMA)
2555-
? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2556-
: GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2557-
: SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2554+
ST.hasGFX940Insts()
2555+
? isXDL(ST, *MFMA)
2556+
? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2557+
: GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2558+
: SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
25582559
break;
25592560
}
25602561

@@ -2633,21 +2634,24 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
26332634
: GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
26342635
break;
26352636
case 8:
2636-
NeedWaitStates = ST.hasGFX940Insts()
2637-
? isXDL(ST, *MFMA)
2638-
? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2639-
: GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2640-
: SMFMA16x16WriteVgprVALUWawWaitStates;
2637+
NeedWaitStates =
2638+
isDGEMM(MFMA->getOpcode()) ? DMFMA16x16WriteVgprVALUWriteWaitStates
2639+
:
2640+
2641+
ST.hasGFX940Insts()
2642+
? isXDL(ST, *MFMA) ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2643+
: GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2644+
: SMFMA16x16WriteVgprVALUWawWaitStates;
26412645
break;
26422646
case 16: [[fallthrough]];
26432647
default:
2644-
NeedWaitStates = isDGEMM(MFMA->getOpcode())
2645-
? DMFMA16x16WriteVgprVALUWriteWaitStates
2646-
: ST.hasGFX940Insts()
2647-
? isXDL(ST, *MFMA)
2648-
? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2649-
: GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2650-
: SMFMA32x32WriteVgprVALUWawWaitStates;
2648+
assert(!isDGEMM(MFMA->getOpcode()));
2649+
NeedWaitStates =
2650+
ST.hasGFX940Insts()
2651+
? isXDL(ST, *MFMA)
2652+
? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2653+
: GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2654+
: SMFMA32x32WriteVgprVALUWawWaitStates;
26512655
break;
26522656
}
26532657

llvm/lib/Target/AMDGPU/SISchedule.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,10 @@ multiclass SICommonWriteRes {
165165
def : HWVALUWriteRes<WriteTrans32, 4>;
166166
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
167167

168+
let ReleaseAtCycles = [4] in
168169
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
169-
def : HWVALUWriteRes<Write8PassDGEMM, 16>;
170+
let ReleaseAtCycles = [8] in
171+
def : HWVALUWriteRes<Write8PassDGEMM, 8>;
170172

171173
let ReleaseAtCycles = [2] in
172174
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
2+
3+
# CHECK: Instruction Info:
4+
# CHECK-NEXT: [1]: #uOps
5+
# CHECK-NEXT: [2]: Latency
6+
# CHECK-NEXT: [3]: RThroughput
7+
# CHECK-NEXT: [4]: MayLoad
8+
# CHECK-NEXT: [5]: MayStore
9+
# CHECK-NEXT: [6]: HasSideEffects (U)
10+
11+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
12+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
13+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
14+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
15+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
16+
17+
18+
# CHECK: Resources:
19+
# CHECK-NEXT: [0] - HWBranch
20+
# CHECK-NEXT: [1] - HWExport
21+
# CHECK-NEXT: [2] - HWLGKM
22+
# CHECK-NEXT: [3] - HWSALU
23+
# CHECK-NEXT: [4] - HWVALU
24+
# CHECK-NEXT: [5] - HWVMEM
25+
# CHECK-NEXT: [6] - HWXDL
26+
27+
# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
28+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
29+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
30+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
31+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
32+
v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
33+
v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
34+
35+
36+
v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
37+
v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
38+

llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# CHECK: Iterations: 1
44
# CHECK: Instructions: 78
5-
# CHECK: Total Cycles: 699
5+
# CHECK: Total Cycles: 701
66
# CHECK: Total uOps: 78
77

88
v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
@@ -128,11 +128,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
128128
# CHECK-NEXT:[6]: HasSideEffects (U)
129129

130130
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
131-
132-
# CHECK: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
133-
# CHECK-NEXT: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
134-
# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
135-
# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
131+
# CHECK: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
132+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
133+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
134+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
136135

137136
# CHECK: Resources:
138137
# CHECK: [0] - HWBranch
@@ -148,10 +147,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
148147
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
149148
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
150149
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
151-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
152-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
153-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
154-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
150+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
151+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
152+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
153+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
155154
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
156155
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
157156
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]

0 commit comments

Comments (0)