Skip to content

Commit c30cc5d

Browse files
committed
[AMDGPU] Deallocate VGPRs before exiting in dynamic VGPR mode llvm#130037
1 parent d690eb8 commit c30cc5d

File tree

2 files changed

+393
-23
lines changed

2 files changed

+393
-23
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 37 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
16471647
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
16481648
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
16491649
}
1650-
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
1651-
// stores. In this case it can be useful to send a message to explicitly
1652-
// release all VGPRs before the stores have completed, but it is only safe to
1653-
// do this if:
1654-
// * there are no outstanding scratch stores
1655-
// * we are not in Dynamic VGPR mode
1650+
// In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
1651+
// Technically the hardware will do this on its own if we don't, but that
1652+
// might cost extra cycles compared to doing it explicitly.
1653+
// When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
1654+
// have to wait for outstanding VMEM stores. In this case it can be useful to
1655+
// send a message to explicitly release all VGPRs before the stores have
1656+
// completed, but it is only safe to do this if there are no outstanding
1657+
// scratch stores.
16561658
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
16571659
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
1658-
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
1659-
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1660-
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
1660+
if (!WCG->isOptNone() &&
1661+
(ST->isDynamicVGPREnabled() ||
1662+
(ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
1663+
ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
1664+
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
16611665
ReleaseVGPRInsts.insert(&MI);
16621666
}
16631667
// Resolve vm waits before gs-done.
@@ -2611,26 +2615,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
26112615
}
26122616
}
26132617

2614-
// Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
2615-
// instructions.
2618+
// Deallocate the VGPRs before previously identified S_ENDPGM instructions.
2619+
// This is done in different ways depending on how the VGPRs were allocated
2620+
// (i.e. whether we're in dynamic VGPR mode or not).
26162621
// Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
26172622
// waveslot limited kernel runs slower with the deallocation.
2618-
if (!ReleaseVGPRInsts.empty() &&
2619-
(MF.getFrameInfo().hasCalls() ||
2620-
ST->getOccupancyWithNumVGPRs(
2621-
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2622-
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2623+
if (ST->isDynamicVGPREnabled()) {
26232624
for (MachineInstr *MI : ReleaseVGPRInsts) {
2624-
if (ST->requiresNopBeforeDeallocVGPRs()) {
2625-
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2626-
TII->get(AMDGPU::S_NOP))
2627-
.addImm(0);
2628-
}
26292625
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2630-
TII->get(AMDGPU::S_SENDMSG))
2631-
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2626+
TII->get(AMDGPU::S_ALLOC_VGPR))
2627+
.addImm(0);
26322628
Modified = true;
26332629
}
2630+
} else {
2631+
if (!ReleaseVGPRInsts.empty() &&
2632+
(MF.getFrameInfo().hasCalls() ||
2633+
ST->getOccupancyWithNumVGPRs(
2634+
TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
2635+
AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
2636+
for (MachineInstr *MI : ReleaseVGPRInsts) {
2637+
if (ST->requiresNopBeforeDeallocVGPRs()) {
2638+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2639+
TII->get(AMDGPU::S_NOP))
2640+
.addImm(0);
2641+
}
2642+
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
2643+
TII->get(AMDGPU::S_SENDMSG))
2644+
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
2645+
Modified = true;
2646+
}
2647+
}
26342648
}
26352649
ReleaseVGPRInsts.clear();
26362650
PreheadersToFlush.clear();

0 commit comments

Comments
 (0)