Skip to content

[AMDGPU] Don't send DEALLOC_VGPRs after calls #77439

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ class WaitcntBrackets {
VgprVmemTypes[GprNo] = 0;
}

void setNonKernelFunctionInitialState() {
void setStateOnFunctionEntryOrReturn() {
setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
PendingEvents |= WaitEventMaskForInst[VS_CNT];
}
Expand Down Expand Up @@ -1487,6 +1487,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else {
// May need to wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
Expand Down Expand Up @@ -1879,7 +1880,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {

auto NonKernelInitialState =
std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
NonKernelInitialState->setNonKernelFunctionInitialState();
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);

Modified = true;
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/AMDGPU/call-argument-types.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4462,8 +4462,6 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: buffer_store_b32 v0, off, s[36:39], 0 dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
;
; HSA-LABEL: test_call_external_i32_func_i32_imm:
Expand Down
4 changes: 0 additions & 4 deletions llvm/test/CodeGen/AMDGPU/calling-conventions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,6 @@ define amdgpu_kernel void @call_coldcc() #0 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call float @coldcc(float 1.0)
store float %val, ptr addrspace(1) undef
Expand Down Expand Up @@ -231,8 +229,6 @@ define amdgpu_kernel void @call_fastcc() #0 {
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1]
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
%val = call float @fastcc(float 1.0)
store float %val, ptr addrspace(1) undef
Expand Down
24 changes: 0 additions & 24 deletions llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
Original file line number Diff line number Diff line change
Expand Up @@ -626,8 +626,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB1_4:
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
Expand Down Expand Up @@ -675,8 +673,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB1_4:
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
Expand Down Expand Up @@ -988,8 +984,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB1_2:
; GFX1164-DPP-NEXT: s_nop 0
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
Expand Down Expand Up @@ -1051,8 +1045,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB1_2:
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
Expand Down Expand Up @@ -3042,8 +3034,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB5_4:
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
Expand Down Expand Up @@ -3091,8 +3081,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB5_4:
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
Expand Down Expand Up @@ -3404,8 +3392,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB5_2:
; GFX1164-DPP-NEXT: s_nop 0
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
Expand Down Expand Up @@ -3467,8 +3453,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB5_2:
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
Expand Down Expand Up @@ -3770,8 +3754,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1164-NEXT: .LBB6_4:
; GFX1164-NEXT: s_nop 0
; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-NEXT: s_endpgm
;
; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
Expand Down Expand Up @@ -3819,8 +3801,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX1132-NEXT: .LBB6_4:
; GFX1132-NEXT: s_nop 0
; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-NEXT: s_endpgm
;
; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
Expand Down Expand Up @@ -4132,8 +4112,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1164-DPP-NEXT: .LBB6_2:
; GFX1164-DPP-NEXT: s_nop 0
; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1164-DPP-NEXT: s_endpgm
;
; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
Expand Down Expand Up @@ -4195,8 +4173,6 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
; GFX1132-DPP-NEXT: .LBB6_2:
; GFX1132-DPP-NEXT: s_nop 0
; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX1132-DPP-NEXT: s_endpgm
%divValue = call float @div.float.value()
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
Expand Down
16 changes: 0 additions & 16 deletions llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,6 @@ define amdgpu_kernel void @clmem_read_simplified(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
Expand Down Expand Up @@ -930,8 +928,6 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
Expand Down Expand Up @@ -1294,8 +1290,6 @@ define amdgpu_kernel void @Address32(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add3_u32 v0, v3, v1, v0
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
Expand Down Expand Up @@ -1543,8 +1537,6 @@ define amdgpu_kernel void @Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v4, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v5, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v8, v[0:1], s[34:35]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
Expand Down Expand Up @@ -1753,8 +1745,6 @@ define amdgpu_kernel void @p32Offset64(ptr addrspace(1) %buffer) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v2, v0, v3
; GFX11-NEXT: global_store_b32 v6, v0, s[34:35]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
Expand Down Expand Up @@ -2017,8 +2007,6 @@ define amdgpu_kernel void @DiffBase(ptr addrspace(1) %buffer1,
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v12, v[0:1], s[36:37]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
ptr addrspace(1) %buffer2) {
entry:
Expand Down Expand Up @@ -2349,8 +2337,6 @@ define amdgpu_kernel void @ReverseOrder(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo
; GFX11-NEXT: global_store_b64 v16, v[0:1], s[34:35]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0)
Expand Down Expand Up @@ -2553,8 +2539,6 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v2, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v3, v1, vcc_lo
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[34:35]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
entry:
%call = tail call i64 @_Z13get_global_idj(i32 0) #2
Expand Down
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/AMDGPU/release-vgprs.mir
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
define amdgpu_ps void @global_atomic() { ret void }
define amdgpu_ps void @image_atomic() { ret void }
define amdgpu_ps void @global_store_optnone() noinline optnone { ret void }
define amdgpu_cs void @with_calls() { ret void }
define fastcc void @with_tail_calls() { ret void }
...

---
Expand Down Expand Up @@ -565,3 +567,33 @@ body: |
S_WAITCNT_VSCNT undef $sgpr_null, 0
S_ENDPGM 0
...

---
name: with_calls
frameInfo:
hasCalls: true
body: |
bb.0:
; Make sure we don't send DEALLOC_VGPRS after a call, since there might be
; scratch stores still in progress.
; CHECK-LABEL: name: with_calls
; CHECK-NOT: S_SENDMSG 3
; CHECK: S_ENDPGM 0
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
$sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu
S_ENDPGM 0
...
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worth adding a tail call case too?


---
name: with_tail_calls
frameInfo:
hasCalls: true
body: |
bb.0:
; Make sure we don't send DEALLOC_VGPRS when there's a tail call, since the
; only valid action after DEALLOC_VGPRS is to terminate the wave.
; CHECK-LABEL: name: with_tail_calls
; CHECK-NOT: S_SENDMSG 3
GLOBAL_STORE_DWORD undef renamable $vgpr0_vgpr1, killed renamable $vgpr1, 0, 4, implicit $exec
SI_TCRETURN undef renamable $sgpr4_sgpr5, @with_tail_calls, 0, csr_amdgpu
...