Skip to content

Commit a5fd68c

Browse files
committed
AMDGPU: Correct cycle counts for f64 mfma on gfx940/gfx90a
The manual states these are 4 and 8 pass instructions. I'm also not sure whether reporting these as using the VALU resource rather than the XDL resource is correct. The Latency and ReleaseAtCycles values were also mismatched; I'm not sure whether that was intentional.
1 parent f407f2d commit a5fd68c

File tree

6 files changed

+61
-30
lines changed

6 files changed

+61
-30
lines changed

llvm/lib/Target/AMDGPU/SISchedule.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,10 @@ multiclass SICommonWriteRes {
165165
def : HWVALUWriteRes<WriteTrans32, 4>;
166166
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
167167

168+
let ReleaseAtCycles = [4] in
168169
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
169-
def : HWVALUWriteRes<Write8PassDGEMM, 16>;
170+
let ReleaseAtCycles = [8] in
171+
def : HWVALUWriteRes<Write8PassDGEMM, 8>;
170172

171173
let ReleaseAtCycles = [2] in
172174
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
277277
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
278278
; GCN-NEXT: v_mov_b32_e32 v0, 0
279279
; GCN-NEXT: s_nop 7
280-
; GCN-NEXT: s_nop 7
281-
; GCN-NEXT: s_nop 0
280+
; GCN-NEXT: s_nop 1
282281
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
283282
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
284283
; GCN-NEXT: s_endpgm
@@ -302,8 +301,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
302301
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
303302
; GCN-NEXT: v_mov_b32_e32 v0, 0
304303
; GCN-NEXT: s_nop 7
305-
; GCN-NEXT: s_nop 7
306-
; GCN-NEXT: s_nop 0
304+
; GCN-NEXT: s_nop 1
307305
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
308306
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[4:5] offset:16
309307
; GCN-NEXT: s_endpgm
@@ -338,8 +336,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
338336
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
339337
; GCN-NEXT: v_mov_b32_e32 v0, 0
340338
; GCN-NEXT: s_nop 7
341-
; GCN-NEXT: s_nop 7
342-
; GCN-NEXT: s_nop 0
339+
; GCN-NEXT: s_nop 1
343340
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
344341
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
345342
; GCN-NEXT: s_endpgm
@@ -374,8 +371,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
374371
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
375372
; GCN-NEXT: v_mov_b32_e32 v0, 0
376373
; GCN-NEXT: s_nop 7
377-
; GCN-NEXT: s_nop 7
378-
; GCN-NEXT: s_nop 0
374+
; GCN-NEXT: s_nop 1
379375
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
380376
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
381377
; GCN-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,7 @@ body: |
481481
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
482482
# GCN: V_MFMA
483483
# GCN-NEXT: S_NOP 7
484-
# GCN-NEXT: S_NOP 7
485-
# GCN-NEXT: S_NOP 1
484+
# GCN-NEXT: S_NOP 2
486485
# GCN-NEXT: FLAT_STORE_DWORD
487486
name: dmfma16x16_write_vgpr_flat_read
488487
body: |
@@ -1219,8 +1218,7 @@ body: |
12191218
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
12201219
# GCN: V_MFMA
12211220
# GCN-NEXT: S_NOP 7
1222-
# GCN-NEXT: S_NOP 7
1223-
# GCN-NEXT: S_NOP 1
1221+
# GCN-NEXT: S_NOP 2
12241222
# GCN-NEXT: FLAT_STORE_DWORD
12251223
name: dmfma16x16_write_agpr_flat_read
12261224
body: |

llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -749,7 +749,6 @@ body: |
749749
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
750750
# GCN: V_MFMA
751751
# GCN-NEXT: S_NOP 7
752-
# GCN-NEXT: S_NOP 7
753752
# GCN-NEXT: S_NOP 1
754753
# GCN-NEXT: FLAT_STORE_DWORD
755754
name: dmfma16x16_write_vgpr_flat_read
@@ -804,7 +803,7 @@ body: |
804803
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
805804
# GCN: V_MFMA
806805
# GCN-NEXT: S_NOP 7
807-
# GCN-NEXT: S_NOP 2
806+
# GCN-NEXT: S_NOP 1
808807
# GCN-NEXT: V_MOV_B32
809808
name: dmfma16x16_write_vgpr_valu_read
810809
body: |
@@ -868,7 +867,7 @@ body: |
868867
# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
869868
# GCN: V_MFMA
870869
# GCN-NEXT: S_NOP 7
871-
# GCN-NEXT: S_NOP 2
870+
# GCN-NEXT: S_NOP 1
872871
# GCN-NEXT: V_DOT
873872
name: dmfma16x16_write_vgpr_dot_read
874873
body: |
@@ -988,7 +987,7 @@ body: |
988987
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
989988
# GCN: V_MFMA
990989
# GCN-NEXT: S_NOP 7
991-
# GCN-NEXT: S_NOP 2
990+
# GCN-NEXT: S_NOP 1
992991
# GCN-NEXT: V_MOV_B32
993992
name: dmfma16x16_write_vgpr_valu_write
994993
body: |
@@ -1484,7 +1483,6 @@ body: |
14841483
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
14851484
# GCN: V_MFMA
14861485
# GCN-NEXT: S_NOP 7
1487-
# GCN-NEXT: S_NOP 7
14881486
# GCN-NEXT: S_NOP 1
14891487
# GCN-NEXT: FLAT_STORE_DWORD
14901488
name: dmfma16x16_write_agpr_flat_read
@@ -1506,7 +1504,7 @@ body: |
15061504
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
15071505
# GCN: V_MFMA
15081506
# GCN-NEXT: S_NOP 7
1509-
# GCN-NEXT: S_NOP 2
1507+
# GCN-NEXT: S_NOP 1
15101508
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
15111509
name: dmfma16x16_write_agpr_valu_read
15121510
body: |
@@ -1527,7 +1525,7 @@ body: |
15271525
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
15281526
# GCN: V_MFMA
15291527
# GCN-NEXT: S_NOP 7
1530-
# GCN-NEXT: S_NOP 2
1528+
# GCN-NEXT: S_NOP 1
15311529
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
15321530
name: dmfma16x16_write_agpr_valu_write
15331531
body: |
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
2+
3+
# CHECK: Instruction Info:
4+
# CHECK-NEXT: [1]: #uOps
5+
# CHECK-NEXT: [2]: Latency
6+
# CHECK-NEXT: [3]: RThroughput
7+
# CHECK-NEXT: [4]: MayLoad
8+
# CHECK-NEXT: [5]: MayStore
9+
# CHECK-NEXT: [6]: HasSideEffects (U)
10+
11+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
12+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
13+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
14+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
15+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
16+
17+
18+
# CHECK: Resources:
19+
# CHECK-NEXT: [0] - HWBranch
20+
# CHECK-NEXT: [1] - HWExport
21+
# CHECK-NEXT: [2] - HWLGKM
22+
# CHECK-NEXT: [3] - HWSALU
23+
# CHECK-NEXT: [4] - HWVALU
24+
# CHECK-NEXT: [5] - HWVMEM
25+
# CHECK-NEXT: [6] - HWXDL
26+
27+
# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
28+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
29+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
30+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
31+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
32+
v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
33+
v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
34+
35+
36+
v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
37+
v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
38+

llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
# CHECK: Iterations: 1
44
# CHECK: Instructions: 78
5-
# CHECK: Total Cycles: 699
5+
# CHECK: Total Cycles: 701
66
# CHECK: Total uOps: 78
77

88
v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
@@ -128,11 +128,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
128128
# CHECK-NEXT:[6]: HasSideEffects (U)
129129

130130
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
131-
132-
# CHECK: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
133-
# CHECK-NEXT: 1 8 1.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
134-
# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
135-
# CHECK-NEXT: 1 20 1.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
131+
# CHECK: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
132+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
133+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
134+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
136135

137136
# CHECK: Resources:
138137
# CHECK: [0] - HWBranch
@@ -148,10 +147,10 @@ v_smfmac_f32_32x32x32_fp8_fp8 v[0:15], v[2:3], v[4:7], v1 cbsz:3 abid:1
148147
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
149148
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
150149
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
151-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
152-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
153-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
154-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
150+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
151+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
152+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
153+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
155154
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
156155
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
157156
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]

0 commit comments

Comments
 (0)