Skip to content

Commit 27a8afa

Browse files
authored
AMDGPU: Handle gfx950 valu write vdst + permlane read hazard (#117287)
1 parent c3fe5ad commit 27a8afa

File tree

4 files changed

+153
-2
lines changed

4 files changed

+153
-2
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 28 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2551,8 +2551,34 @@ int GCNHazardRecognizer::checkPermlaneHazards(MachineInstr *MI) {
25512551
return isVCmpXWritesExec(*TII, *TRI, MI);
25522552
};
25532553

2554-
const int NumWaitStates = 4;
2555-
return NumWaitStates - getWaitStatesSince(IsVCmpXWritesExecFn, NumWaitStates);
2554+
auto IsVALUFn = [](const MachineInstr &MI) {
2555+
return SIInstrInfo::isVALU(MI);
2556+
};
2557+
2558+
const int VCmpXWritesExecWaitStates = 4;
2559+
const int VALUWritesVDstWaitStates = 2;
2560+
int WaitStatesNeeded = 0;
2561+
2562+
for (const MachineOperand &Op : MI->explicit_uses()) {
2563+
if (!Op.isReg() || !TRI->isVGPR(MF.getRegInfo(), Op.getReg()))
2564+
continue;
2565+
Register Reg = Op.getReg();
2566+
2567+
int WaitStatesSinceDef =
2568+
VALUWritesVDstWaitStates -
2569+
getWaitStatesSinceDef(Reg, IsVALUFn,
2570+
/*MaxWaitStates=*/VALUWritesVDstWaitStates);
2571+
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesSinceDef);
2572+
if (WaitStatesNeeded >= VALUWritesVDstWaitStates)
2573+
break;
2574+
}
2575+
2576+
int VCmpXHazardWaits =
2577+
VCmpXWritesExecWaitStates -
2578+
getWaitStatesSince(IsVCmpXWritesExecFn, VCmpXWritesExecWaitStates);
2579+
2580+
WaitStatesNeeded = std::max(WaitStatesNeeded, VCmpXHazardWaits);
2581+
return WaitStatesNeeded;
25562582
}
25572583

25582584
static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) {

llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir

Lines changed: 113 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -142,3 +142,116 @@ body: |
142142
$vgpr4 = V_MOV_B32_e32 0, implicit $exec
143143
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
144144
...
145+
146+
---
147+
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0
148+
# GCN: V_MOV_B32
149+
# GCN-NEXT: S_NOP 1
150+
# GCN-NEXT: V_PERMLANE
151+
name: valu_write_vdst_read_permlane16_swap_0
152+
body: |
153+
bb.0:
154+
liveins: $vgpr1
155+
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
156+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
157+
...
158+
159+
---
160+
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_1
161+
# GCN: V_MOV_B32
162+
# GCN-NEXT: S_NOP 1
163+
# GCN-NEXT: V_PERMLANE
164+
name: valu_write_vdst_read_permlane16_swap_1
165+
body: |
166+
bb.0:
167+
liveins: $vgpr0
168+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
169+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
170+
...
171+
172+
---
173+
# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_0
174+
# GCN: V_MOV_B32
175+
# GCN-NEXT: S_NOP 1
176+
# GCN-NEXT: V_PERMLANE
177+
name: valu_write_vdst_read_permlane32_swap_0
178+
body: |
179+
bb.0:
180+
liveins: $vgpr1
181+
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
182+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
183+
...
184+
185+
---
186+
# GCN-LABEL: name: valu_write_vdst_read_permlane32_swap_1
187+
# GCN: V_MOV_B32
188+
# GCN-NEXT: S_NOP 1
189+
# GCN-NEXT: V_PERMLANE
190+
name: valu_write_vdst_read_permlane32_swap_1
191+
body: |
192+
bb.0:
193+
liveins: $vgpr0
194+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
195+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
196+
...
197+
198+
---
199+
# No hazard, write of other register
200+
# GCN-LABEL: name: valu_write_vdst_read_permlane16_swap_0_otherreg
201+
# GCN: V_MOV_B32
202+
# GCN-NEXT: V_PERMLANE
203+
name: valu_write_vdst_read_permlane16_swap_0_otherreg
204+
body: |
205+
bb.0:
206+
liveins: $vgpr1
207+
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
208+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE16_SWAP_B32_e64 killed $vgpr0, killed $vgpr1, -1, 1, implicit $exec
209+
...
210+
211+
---
212+
# Both permlane hazards at once.
213+
# GCN-LABEL: name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
214+
# GCN: V_MOV_B32
215+
# GCN: V_CMPX_EQ_I32
216+
# GCN-NEXT: S_NOP 3
217+
# GCN-NEXT: V_PERMLANE
218+
name: valu_writes_vdst__vcmpx_write_exec__permlane32_swap
219+
body: |
220+
bb.0:
221+
liveins: $vgpr0, $vgpr2, $vgpr3
222+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
223+
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
224+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
225+
...
226+
227+
---
228+
# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
229+
# GCN: V_CMPX_EQ_I32
230+
# GCN: V_MOV_B32
231+
# GCN-NEXT: S_NOP 2
232+
# GCN-NEXT: V_PERMLANE
233+
name: vcmpx_write_exec__valu_writes_vdst___permlane32_swap
234+
body: |
235+
bb.0:
236+
liveins: $vgpr0, $vgpr2, $vgpr3
237+
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
238+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
239+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
240+
...
241+
242+
---
243+
# GCN-LABEL: name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
244+
# GCN: V_CMPX_EQ_I32
245+
# GCN: V_MOV_B32
246+
# GCN: V_MOV_B32
247+
# GCN-NEXT: S_NOP 1
248+
# GCN-NEXT: V_PERMLANE
249+
name: vcmpx_write_exec__valu_writes_vdstx2___permlane32_swap
250+
body: |
251+
bb.0:
252+
liveins: $vgpr0, $vgpr2, $vgpr3
253+
$exec = V_CMPX_EQ_I32_e64 $vgpr2, $vgpr3, implicit $exec
254+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
255+
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
256+
renamable $vgpr0, renamable $vgpr1 = V_PERMLANE32_SWAP_B32_e32 killed $vgpr0, killed $vgpr1, implicit $exec
257+
...

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane16.swap.ll

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vi(i32 %vdst_old) {
2626
; GCN: ; %bb.0:
2727
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2828
; GCN-NEXT: v_mov_b32_e32 v1, 1
29+
; GCN-NEXT: s_nop 1
2930
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
3031
; GCN-NEXT: s_setpc_b64 s[30:31]
3132
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vl(i32 %vdst_old) {
3738
; GCN: ; %bb.0:
3839
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3940
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
41+
; GCN-NEXT: s_nop 1
4042
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
4143
; GCN-NEXT: s_setpc_b64 s[30:31]
4244
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane16_swap_b32_iv(i32 %src0_old) {
4951
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5052
; GCN-NEXT: v_mov_b32_e32 v1, v0
5153
; GCN-NEXT: v_mov_b32_e32 v0, 1
54+
; GCN-NEXT: s_nop 1
5255
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
5356
; GCN-NEXT: s_setpc_b64 s[30:31]
5457
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane16_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
6164
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6265
; GCN-NEXT: v_mov_b32_e32 v0, s0
6366
; GCN-NEXT: v_mov_b32_e32 v1, s1
67+
; GCN-NEXT: s_nop 1
6468
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
6569
; GCN-NEXT: s_setpc_b64 s[30:31]
6670
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane16_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
7377
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7478
; GCN-NEXT: v_mov_b32_e32 v1, v0
7579
; GCN-NEXT: v_mov_b32_e32 v0, s0
80+
; GCN-NEXT: s_nop 1
7681
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
7782
; GCN-NEXT: s_setpc_b64 s[30:31]
7883
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane16_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
8489
; GCN: ; %bb.0:
8590
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8691
; GCN-NEXT: v_mov_b32_e32 v1, s0
92+
; GCN-NEXT: s_nop 1
8793
; GCN-NEXT: v_permlane16_swap_b32_e32 v0, v1
8894
; GCN-NEXT: s_setpc_b64 s[30:31]
8995
%v = call { i32, i32 } @llvm.amdgcn.permlane16.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane32.swap.ll

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vi(i32 %vdst_old) {
2626
; GCN: ; %bb.0:
2727
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2828
; GCN-NEXT: v_mov_b32_e32 v1, 1
29+
; GCN-NEXT: s_nop 1
2930
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
3031
; GCN-NEXT: s_setpc_b64 s[30:31]
3132
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 1, i1 false, i1 false)
@@ -37,6 +38,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vl(i32 %vdst_old) {
3738
; GCN: ; %bb.0:
3839
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3940
; GCN-NEXT: v_mov_b32_e32 v1, 0xc1d1
41+
; GCN-NEXT: s_nop 1
4042
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
4143
; GCN-NEXT: s_setpc_b64 s[30:31]
4244
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 49617, i1 false, i1 false)
@@ -49,6 +51,7 @@ define { i32, i32 } @v_permlane32_swap_b32_iv(i32 %src0_old) {
4951
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5052
; GCN-NEXT: v_mov_b32_e32 v1, v0
5153
; GCN-NEXT: v_mov_b32_e32 v0, 1
54+
; GCN-NEXT: s_nop 1
5255
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
5356
; GCN-NEXT: s_setpc_b64 s[30:31]
5457
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 1, i32 %src0_old, i1 false, i1 false)
@@ -61,6 +64,7 @@ define { i32, i32 } @v_permlane32_swap_b32_ss(i32 inreg %vdst_old, i32 inreg %sr
6164
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6265
; GCN-NEXT: v_mov_b32_e32 v0, s0
6366
; GCN-NEXT: v_mov_b32_e32 v1, s1
67+
; GCN-NEXT: s_nop 1
6468
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
6569
; GCN-NEXT: s_setpc_b64 s[30:31]
6670
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -73,6 +77,7 @@ define { i32, i32 } @v_permlane32_swap_b32_sv(i32 inreg %vdst_old, i32 %src0_old
7377
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7478
; GCN-NEXT: v_mov_b32_e32 v1, v0
7579
; GCN-NEXT: v_mov_b32_e32 v0, s0
80+
; GCN-NEXT: s_nop 1
7681
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
7782
; GCN-NEXT: s_setpc_b64 s[30:31]
7883
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)
@@ -84,6 +89,7 @@ define { i32, i32 } @v_permlane32_swap_b32_vs(i32 %vdst_old, i32 inreg %src0_old
8489
; GCN: ; %bb.0:
8590
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8691
; GCN-NEXT: v_mov_b32_e32 v1, s0
92+
; GCN-NEXT: s_nop 1
8793
; GCN-NEXT: v_permlane32_swap_b32_e32 v0, v1
8894
; GCN-NEXT: s_setpc_b64 s[30:31]
8995
%v = call { i32, i32 } @llvm.amdgcn.permlane32.swap(i32 %vdst_old, i32 %src0_old, i1 false, i1 false)

0 commit comments

Comments
 (0)