Skip to content

Commit f68b404

Browse files
committed
AMDGPU: Correct cycle counts for f64 mfma on gfx940/gfx90a
The manual states these are 4-pass and 8-pass instructions. I'm also not sure whether reporting these as using the VALU resource rather than the XDL resource is correct. The Latency and ReleaseAtCycles values were also mismatched; I'm not sure whether that was intentional.
1 parent 3a146d5 commit f68b404

File tree

6 files changed

+81
-34
lines changed

6 files changed

+81
-34
lines changed

llvm/lib/Target/AMDGPU/SISchedule.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,10 @@ multiclass SICommonWriteRes {
165165
def : HWVALUWriteRes<WriteTrans32, 4>;
166166
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
167167

168+
let ReleaseAtCycles = [4] in
168169
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
169-
def : HWVALUWriteRes<Write8PassDGEMM, 16>;
170+
let ReleaseAtCycles = [8] in
171+
def : HWVALUWriteRes<Write8PassDGEMM, 8>;
170172

171173
let ReleaseAtCycles = [2] in
172174
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64(ptr addrspace(1) %arg, doubl
277277
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
278278
; GCN-NEXT: v_mov_b32_e32 v0, 0
279279
; GCN-NEXT: s_nop 7
280-
; GCN-NEXT: s_nop 7
281-
; GCN-NEXT: s_nop 0
280+
; GCN-NEXT: s_nop 1
282281
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
283282
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
284283
; GCN-NEXT: s_endpgm
@@ -302,8 +301,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_imm(ptr addrspace(1) %
302301
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7] cbsz:1 abid:2 blgp:3
303302
; GCN-NEXT: v_mov_b32_e32 v0, 0
304303
; GCN-NEXT: s_nop 7
305-
; GCN-NEXT: s_nop 7
306-
; GCN-NEXT: s_nop 0
304+
; GCN-NEXT: s_nop 1
307305
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[4:5]
308306
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[4:5] offset:16
309307
; GCN-NEXT: s_endpgm
@@ -338,8 +336,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_imm(ptr addrspace(1) %arg, d
338336
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
339337
; GCN-NEXT: v_mov_b32_e32 v0, 0
340338
; GCN-NEXT: s_nop 7
341-
; GCN-NEXT: s_nop 7
342-
; GCN-NEXT: s_nop 0
339+
; GCN-NEXT: s_nop 1
343340
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[8:9]
344341
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[8:9] offset:16
345342
; GCN-NEXT: s_endpgm
@@ -374,8 +371,7 @@ define amdgpu_kernel void @test_mfma_f64_16x16x4f64_splat_lit(ptr addrspace(1) %
374371
; GCN-NEXT: v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
375372
; GCN-NEXT: v_mov_b32_e32 v0, 0
376373
; GCN-NEXT: s_nop 7
377-
; GCN-NEXT: s_nop 7
378-
; GCN-NEXT: s_nop 0
374+
; GCN-NEXT: s_nop 1
379375
; GCN-NEXT: global_store_dwordx4 v0, a[0:3], s[12:13]
380376
; GCN-NEXT: global_store_dwordx4 v0, a[4:7], s[12:13] offset:16
381377
; GCN-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/mai-hazards-gfx90a.mir

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -481,8 +481,7 @@ body: |
481481
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
482482
# GCN: V_MFMA
483483
# GCN-NEXT: S_NOP 7
484-
# GCN-NEXT: S_NOP 7
485-
# GCN-NEXT: S_NOP 1
484+
# GCN-NEXT: S_NOP 2
486485
# GCN-NEXT: FLAT_STORE_DWORD
487486
name: dmfma16x16_write_vgpr_flat_read
488487
body: |
@@ -1219,8 +1218,7 @@ body: |
12191218
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
12201219
# GCN: V_MFMA
12211220
# GCN-NEXT: S_NOP 7
1222-
# GCN-NEXT: S_NOP 7
1223-
# GCN-NEXT: S_NOP 1
1221+
# GCN-NEXT: S_NOP 2
12241222
# GCN-NEXT: FLAT_STORE_DWORD
12251223
name: dmfma16x16_write_agpr_flat_read
12261224
body: |

llvm/test/CodeGen/AMDGPU/mai-hazards-gfx940.mir

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -749,7 +749,6 @@ body: |
749749
# GCN-LABEL: name: dmfma16x16_write_vgpr_flat_read
750750
# GCN: V_MFMA
751751
# GCN-NEXT: S_NOP 7
752-
# GCN-NEXT: S_NOP 7
753752
# GCN-NEXT: S_NOP 1
754753
# GCN-NEXT: FLAT_STORE_DWORD
755754
name: dmfma16x16_write_vgpr_flat_read
@@ -804,7 +803,7 @@ body: |
804803
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_read
805804
# GCN: V_MFMA
806805
# GCN-NEXT: S_NOP 7
807-
# GCN-NEXT: S_NOP 2
806+
# GCN-NEXT: S_NOP 1
808807
# GCN-NEXT: V_MOV_B32
809808
name: dmfma16x16_write_vgpr_valu_read
810809
body: |
@@ -868,7 +867,7 @@ body: |
868867
# GCN-LABEL: name: dmfma16x16_write_vgpr_dot_read
869868
# GCN: V_MFMA
870869
# GCN-NEXT: S_NOP 7
871-
# GCN-NEXT: S_NOP 2
870+
# GCN-NEXT: S_NOP 1
872871
# GCN-NEXT: V_DOT
873872
name: dmfma16x16_write_vgpr_dot_read
874873
body: |
@@ -988,7 +987,7 @@ body: |
988987
# GCN-LABEL: name: dmfma16x16_write_vgpr_valu_write
989988
# GCN: V_MFMA
990989
# GCN-NEXT: S_NOP 7
991-
# GCN-NEXT: S_NOP 2
990+
# GCN-NEXT: S_NOP 1
992991
# GCN-NEXT: V_MOV_B32
993992
name: dmfma16x16_write_vgpr_valu_write
994993
body: |
@@ -1484,7 +1483,6 @@ body: |
14841483
# GCN-LABEL: name: dmfma16x16_write_agpr_flat_read
14851484
# GCN: V_MFMA
14861485
# GCN-NEXT: S_NOP 7
1487-
# GCN-NEXT: S_NOP 7
14881486
# GCN-NEXT: S_NOP 1
14891487
# GCN-NEXT: FLAT_STORE_DWORD
14901488
name: dmfma16x16_write_agpr_flat_read
@@ -1506,7 +1504,7 @@ body: |
15061504
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_read
15071505
# GCN: V_MFMA
15081506
# GCN-NEXT: S_NOP 7
1509-
# GCN-NEXT: S_NOP 2
1507+
# GCN-NEXT: S_NOP 1
15101508
# GCN-NEXT: V_ACCVGPR_READ_B32_e64
15111509
name: dmfma16x16_write_agpr_valu_read
15121510
body: |
@@ -1527,7 +1525,7 @@ body: |
15271525
# GCN-LABEL: name: dmfma16x16_write_agpr_valu_write
15281526
# GCN: V_MFMA
15291527
# GCN-NEXT: S_NOP 7
1530-
# GCN-NEXT: S_NOP 2
1528+
# GCN-NEXT: S_NOP 1
15311529
# GCN-NEXT: V_ACCVGPR_WRITE_B32_e64
15321530
name: dmfma16x16_write_agpr_valu_write
15331531
body: |
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# RUN: llvm-mca -mtriple=amdgcn -mcpu=gfx90a --timeline --iterations=1 --timeline-max-cycles=0 < %s | FileCheck %s
2+
3+
# CHECK: Instruction Info:
4+
# CHECK-NEXT: [1]: #uOps
5+
# CHECK-NEXT: [2]: Latency
6+
# CHECK-NEXT: [3]: RThroughput
7+
# CHECK-NEXT: [4]: MayLoad
8+
# CHECK-NEXT: [5]: MayStore
9+
# CHECK-NEXT: [6]: HasSideEffects (U)
10+
11+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
12+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
13+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
14+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
15+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
16+
17+
18+
# CHECK: Resources:
19+
# CHECK-NEXT: [0] - HWBranch
20+
# CHECK-NEXT: [1] - HWExport
21+
# CHECK-NEXT: [2] - HWLGKM
22+
# CHECK-NEXT: [3] - HWSALU
23+
# CHECK-NEXT: [4] - HWVALU
24+
# CHECK-NEXT: [5] - HWVMEM
25+
# CHECK-NEXT: [6] - HWXDL
26+
27+
# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
28+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
29+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
30+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
31+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
32+
v_mfma_f64_4x4x4f64 a[0:1], v[0:1], v[2:3], a[0:1]
33+
v_mfma_f64_4x4x4f64 v[0:1], v[0:1], v[2:3], v[0:1]
34+
35+
36+
v_mfma_f64_16x16x4f64 a[0:7], v[0:1], v[2:3], a[0:7]
37+
v_mfma_f64_16x16x4f64 v[0:7], v[0:1], v[2:3], v[0:7]
38+

llvm/test/tools/llvm-mca/AMDGPU/gfx940-mfma.s

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,18 +2,9 @@
22

33
# CHECK: Iterations: 1
44
# CHECK: Instructions: 58
5-
# CHECK: Total Cycles: 543
5+
# CHECK: Total Cycles: 545
66
# CHECK: Total uOps: 58
77

8-
# CHECK: Resources:
9-
# CHECK: [0] - HWBranch
10-
# CHECK: [1] - HWExport
11-
# CHECK: [2] - HWLGKM
12-
# CHECK: [3] - HWSALU
13-
# CHECK: [4] - HWVALU
14-
# CHECK: [5] - HWVMEM
15-
# CHECK: [6] - HWXDL
16-
178
v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
189
v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
1910

@@ -101,15 +92,39 @@ v_smfmac_i32_16x16x64_i8 a[10:13], v[2:3], a[4:7], v9
10192
v_smfmac_i32_32x32x32_i8 v[10:25], a[2:3], v[4:7], v10 cbsz:3 abid:1
10293
v_smfmac_i32_32x32x32_i8 a[10:25], v[2:3], a[4:7], v11
10394

95+
# CHECK: Instruction Info:
96+
# CHECK-NEXT:[1]: #uOps
97+
# CHECK-NEXT:[2]: Latency
98+
# CHECK-NEXT:[3]: RThroughput
99+
# CHECK-NEXT:[4]: MayLoad
100+
# CHECK-NEXT:[5]: MayStore
101+
# CHECK-NEXT:[6]: HasSideEffects (U)
102+
103+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
104+
105+
# CHECK: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
106+
# CHECK-NEXT: 1 8 4.00 U v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
107+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
108+
# CHECK-NEXT: 1 12 8.00 U v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
109+
110+
# CHECK: Resources:
111+
# CHECK: [0] - HWBranch
112+
# CHECK: [1] - HWExport
113+
# CHECK: [2] - HWLGKM
114+
# CHECK: [3] - HWSALU
115+
# CHECK: [4] - HWVALU
116+
# CHECK: [5] - HWVMEM
117+
# CHECK: [6] - HWXDL
118+
104119
# CHECK: [0] [1] [2] [3] [4] [5] [6] Instructions:
105120
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 a[0:3], v0, v1, a[2:5]
106121
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_16x16x4_f32 v[0:3], v0, v1, v[2:5]
107122
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 a[0:15], v0, v1, a[18:33]
108123
# CHECK-NEXT: - - - - - - 16.00 v_mfma_f32_32x32x2_f32 v[0:15], v0, v1, v[18:33]
109-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
110-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
111-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
112-
# CHECK-NEXT: - - - - 1.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
124+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 a[0:1], v[0:1], a[2:3], a[2:3]
125+
# CHECK-NEXT: - - - - 4.00 - - v_mfma_f64_4x4x4_4b_f64 v[0:1], v[0:1], v[2:3], v[2:3]
126+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 a[0:7], v[0:1], v[2:3], a[0:7]
127+
# CHECK-NEXT: - - - - 8.00 - - v_mfma_f64_16x16x4_f64 v[0:7], v[0:1], v[2:3], v[0:7]
113128
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 v[0:3], v[4:5], v[6:7], v[0:3]
114129
# CHECK-NEXT: - - - - - - 4.00 v_mfma_f32_16x16x16_f16 a[0:3], v[4:5], v[6:7], a[0:3]
115130
# CHECK-NEXT: - - - - - - 8.00 v_mfma_f32_32x32x8_f16 v[0:15], v[4:5], v[6:7], v[0:15]

0 commit comments

Comments
 (0)