Skip to content

Commit 11c27ae

Browse files
VigneshwarJ authored and bcahoon committed
AMDGPU: Handle gfx950 XDL-write-VGPR-VALU-Mem-Exp wait state change (llvm#126727)
(cherry picked from commit a2263eb)
1 parent 3560926 commit 11c27ae

7 files changed

+395
-379
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2593,12 +2593,14 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {
25932593
return NumPasses + 3;
25942594
}
25952595

2596-
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
2597-
// 2 pass -> 5
2598-
// 4 pass -> 7
2599-
// 8 pass -> 11
2600-
// 16 pass -> 19
2601-
return NumPasses + 3;
2596+
static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses,
2597+
bool IsGFX950) {
2598+
// xdl def cycles | gfx940 | gfx950
2599+
// 2 pass | 5 5
2600+
// 4 pass | 7 8
2601+
// 8 pass | 11 12
2602+
// 16 pass | 19 20
2603+
return NumPasses + 3 + (NumPasses != 2 && IsGFX950);
26022604
}
26032605

26042606
static int GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses) {
@@ -2749,7 +2751,8 @@ int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
27492751
} else if (ST.hasGFX940Insts()) {
27502752
NeedWaitStates =
27512753
isXDL(ST, *MFMA)
2752-
? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(NumPasses)
2754+
? GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(
2755+
NumPasses, ST.hasGFX950Insts())
27532756
: GFX940_SMFMA_N_PassWriteVgprVALUMemExpReadWaitStates(
27542757
NumPasses);
27552758
} else {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.bf16.ll

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16(<8 x bfloat> %arg0, <8 x
4949
; GCN-NEXT: v_mov_b32_e32 v9, s13
5050
; GCN-NEXT: v_mov_b32_e32 v10, s14
5151
; GCN-NEXT: v_mov_b32_e32 v11, s15
52-
; GCN-NEXT: s_nop 3
52+
; GCN-NEXT: s_nop 4
5353
; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
5454
; GCN-NEXT: s_waitcnt vmcnt(0)
5555
; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -122,7 +122,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__flags(<8 x bfloat> %arg0
122122
; GCN-NEXT: v_mov_b32_e32 v9, s13
123123
; GCN-NEXT: v_mov_b32_e32 v10, s14
124124
; GCN-NEXT: v_mov_b32_e32 v11, s15
125-
; GCN-NEXT: s_nop 3
125+
; GCN-NEXT: s_nop 4
126126
; GCN-NEXT: global_store_dwordx4 v[12:13], a[28:31], off sc0 sc1
127127
; GCN-NEXT: s_waitcnt vmcnt(0)
128128
; GCN-NEXT: global_store_dwordx4 v[14:15], a[24:27], off sc0 sc1
@@ -179,7 +179,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac(<8 x bfloat> %arg0, <8 x b
179179
; GCN-NEXT: s_nop 1
180180
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15]
181181
; GCN-NEXT: s_nop 7
182-
; GCN-NEXT: s_nop 2
182+
; GCN-NEXT: s_nop 3
183183
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
184184
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
185185
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -224,7 +224,7 @@ define <16 x float> @test_mfma_f32_32x32x16_bf16__mac__flags(<8 x bfloat> %arg0,
224224
; GCN-NEXT: s_nop 1
225225
; GCN-NEXT: v_mfma_f32_32x32x16_bf16 a[0:15], v[0:3], v[4:7], a[0:15] cbsz:1 abid:1 blgp:1
226226
; GCN-NEXT: s_nop 7
227-
; GCN-NEXT: s_nop 2
227+
; GCN-NEXT: s_nop 3
228228
; GCN-NEXT: v_accvgpr_read_b32 v0, a0
229229
; GCN-NEXT: v_accvgpr_read_b32 v1, a1
230230
; GCN-NEXT: v_accvgpr_read_b32 v2, a2
@@ -396,7 +396,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac(<8 x bfloat>
396396
; GCN-NEXT: v_mov_b32_e32 v16, 0
397397
; GCN-NEXT: s_waitcnt lgkmcnt(0)
398398
; GCN-NEXT: s_nop 7
399-
; GCN-NEXT: s_nop 0
399+
; GCN-NEXT: s_nop 1
400400
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
401401
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
402402
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -431,7 +431,7 @@ define amdgpu_kernel void @test_mfma_f32_32x32x16_bf16__vgprcd_mac_flags(<8 x bf
431431
; GCN-NEXT: v_mov_b32_e32 v16, 0
432432
; GCN-NEXT: s_waitcnt lgkmcnt(0)
433433
; GCN-NEXT: s_nop 7
434-
; GCN-NEXT: s_nop 0
434+
; GCN-NEXT: s_nop 1
435435
; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
436436
; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
437437
; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16

0 commit comments

Comments
 (0)