Skip to content

Commit 8a53324

Browse files
authored
[AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode (#130037)
In dynamic VGPR mode, waves must deallocate all VGPRs before exiting. If the shader program does not do this, the hardware inserts `S_ALLOC_VGPR 0` before `S_ENDPGM`, but this may incur some performance cost, so it is better for the compiler to generate that instruction proactively. This patch extends `si-insert-waitcnts` to deallocate the VGPRs via an `S_ALLOC_VGPR 0` before any `S_ENDPGM` when in dynamic VGPR mode.
1 parent a1c8dda commit 8a53324

File tree

2 files changed

+393
-23
lines changed

2 files changed

+393
-23
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1640,17 +1640,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16401640
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
16411641
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
16421642
}
1643-
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1644-
// stores. In this case it can be useful to send a message to explicitly
1645-
// release all VGPRs before the stores have completed, but it is only safe to
1646-
// do this if:
1647-
// * there are no outstanding scratch stores
1648-
// * we are not in Dynamic VGPR mode
1643+
// In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1644+
// Technically the hardware will do this on its own if we don't, but that
1645+
// might cost extra cycles compared to doing it explicitly.
1646+
// When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1647+
// have to wait for outstanding VMEM stores. In this case it can be useful to
1648+
// send a message to explicitly release all VGPRs before the stores have
1649+
// completed, but it is only safe to do this if there are no outstanding
1650+
// scratch stores.
16491651
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
16501652
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1651-
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1652-
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1653-
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1653+
if (!WCG->isOptNone() &&
1654+
(ST->isDynamicVGPREnabled() ||
1655+
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1656+
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1657+
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
16541658
ReleaseVGPRInsts.insert(&MI);
16551659
}
16561660
// Resolve vm waits before gs-done.
@@ -2593,26 +2597,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
25932597
}
25942598
}
25952599

2596-
// Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2597-
// instructions.
2600+
// Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2601+
// This is done in different ways depending on how the VGPRs were allocated
2602+
// (i.e. whether we're in dynamic VGPR mode or not).
25982603
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
25992604
// waveslot limited kernel runs slower with the deallocation.
2600-
if (!ReleaseVGPRInsts.empty() &&
2601-
(MF.getFrameInfo().hasCalls() ||
2602-
ST->getOccupancyWithNumVGPRs(
2603-
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2604-
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2605+
if (ST->isDynamicVGPREnabled()) {
26052606
for (MachineInstr *MI : ReleaseVGPRInsts) {
2606-
if (ST->requiresNopBeforeDeallocVGPRs()) {
2607-
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2608-
TII->get(AMDGPU::S_NOP))
2609-
.addImm(0);
2610-
}
26112607
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2612-
TII->get(AMDGPU::S_SENDMSG))
2613-
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2608+
TII->get(AMDGPU::S_ALLOC_VGPR))
2609+
.addImm(0);
26142610
Modified = true;
26152611
}
2612+
} else {
2613+
if (!ReleaseVGPRInsts.empty() &&
2614+
(MF.getFrameInfo().hasCalls() ||
2615+
ST->getOccupancyWithNumVGPRs(
2616+
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2617+
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2618+
for (MachineInstr *MI : ReleaseVGPRInsts) {
2619+
if (ST->requiresNopBeforeDeallocVGPRs()) {
2620+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2621+
TII->get(AMDGPU::S_NOP))
2622+
.addImm(0);
2623+
}
2624+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2625+
TII->get(AMDGPU::S_SENDMSG))
2626+
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2627+
Modified = true;
2628+
}
2629+
}
26162630
}
26172631
ReleaseVGPRInsts.clear();
26182632
PreheadersToFlush.clear();

0 commit comments

Comments
 (0)