@@ -1640,17 +1640,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
     Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
   }
-  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
-  // stores. In this case it can be useful to send a message to explicitly
-  // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if:
-  // * there are no outstanding scratch stores
-  // * we are not in Dynamic VGPR mode
+  // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+  // Technically the hardware will do this on its own if we don't, but that
+  // might cost extra cycles compared to doing it explicitly.
+  // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+  // have to wait for outstanding VMEM stores. In this case it can be useful to
+  // send a message to explicitly release all VGPRs before the stores have
+  // completed, but it is only safe to do this if there are no outstanding
+  // scratch stores.
   else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
            MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
-        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
-        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+    if (!WCG->isOptNone() &&
+        (ST->isDynamicVGPREnabled() ||
+         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
       ReleaseVGPRInsts.insert(&MI);
   }
   // Resolve vm waits before gs-done.
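To make the new condition easier to follow, here is a minimal standalone sketch of the gating predicate, with the LLVM subtarget and scoreboard queries stubbed out as plain booleans. The struct and function names are illustrative only, not part of the patch:

```cpp
// Illustrative only: mirrors the predicate that decides whether an
// S_ENDPGM is recorded in ReleaseVGPRInsts after this change.
#include <cstdio>

struct Query {
  bool OptNone;              // stand-in for WCG->isOptNone()
  bool DynamicVGPR;          // stand-in for ST->isDynamicVGPREnabled()
  bool IsGFX11Plus;          // stand-in for getGeneration() >= GFX11
  bool PendingVMEMStores;    // stand-in for getScoreRange(STORE_CNT) != 0
  bool PendingScratchStores; // stand-in for hasPendingEvent(SCRATCH_WRITE_ACCESS)
};

static bool shouldReleaseVGPRs(const Query &Q) {
  // Never at -O0; always in dynamic VGPR mode; otherwise only on GFX11+
  // with outstanding VMEM stores and no outstanding scratch stores.
  return !Q.OptNone &&
         (Q.DynamicVGPR ||
          (Q.IsGFX11Plus && Q.PendingVMEMStores && !Q.PendingScratchStores));
}

int main() {
  Query Q{false, /*DynamicVGPR=*/true, false, false, false};
  std::printf("%d\n", shouldReleaseVGPRs(Q)); // prints 1: dynamic VGPR mode
}
```

Note that the scratch-store and store-count checks now gate only the non-dynamic path; in dynamic VGPR mode the instruction is marked unconditionally (at -O1 and above).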
@@ -2593,26 +2597,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
     }
   }

-  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
-  // instructions.
+  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
+  // This is done in different ways depending on how the VGPRs were allocated
+  // (i.e. whether we're in dynamic VGPR mode or not).
   // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
   // waveslot limited kernel runs slower with the deallocation.
-  if (!ReleaseVGPRInsts.empty() &&
-      (MF.getFrameInfo().hasCalls() ||
-       ST->getOccupancyWithNumVGPRs(
-           TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
-           AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+  if (ST->isDynamicVGPREnabled()) {
     for (MachineInstr *MI : ReleaseVGPRInsts) {
-      if (ST->requiresNopBeforeDeallocVGPRs()) {
-        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                TII->get(AMDGPU::S_NOP))
-            .addImm(0);
-      }
       BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-              TII->get(AMDGPU::S_SENDMSG))
-          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+              TII->get(AMDGPU::S_ALLOC_VGPR))
+          .addImm(0);
       Modified = true;
     }
+  } else {
+    if (!ReleaseVGPRInsts.empty() &&
+        (MF.getFrameInfo().hasCalls() ||
+         ST->getOccupancyWithNumVGPRs(
+             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+      for (MachineInstr *MI : ReleaseVGPRInsts) {
+        if (ST->requiresNopBeforeDeallocVGPRs()) {
+          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                  TII->get(AMDGPU::S_NOP))
+              .addImm(0);
+        }
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::S_SENDMSG))
+            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+        Modified = true;
+      }
+    }
   }
   ReleaseVGPRInsts.clear();
   PreheadersToFlush.clear();
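For context, a rough sketch of the two epilogue shapes this hunk produces before s_endpgm. The mnemonics follow the opcodes used in the patch (spelled here as in typical GFX11+/GFX12 assembly); the function itself only illustrates the branching and is not LLVM's MachineInstr-building API:

```cpp
// Illustrative only: prints the instruction sequence inserted before each
// marked S_ENDPGM, depending on the deallocation strategy.
#include <cstdio>

void printEpilogue(bool DynamicVGPR, bool NeedsNopBeforeDealloc) {
  if (DynamicVGPR) {
    // Dynamic VGPR mode: release all VGPR blocks explicitly.
    std::puts("  s_alloc_vgpr 0");
  } else {
    // GFX11+ message-based deallocation; some subtargets require a
    // preceding NOP (ST->requiresNopBeforeDeallocVGPRs()).
    if (NeedsNopBeforeDealloc)
      std::puts("  s_nop 0");
    std::puts("  s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)");
  }
  std::puts("  s_endpgm");
}

int main() {
  printEpilogue(/*DynamicVGPR=*/true, /*NeedsNopBeforeDealloc=*/false);
  printEpilogue(/*DynamicVGPR=*/false, /*NeedsNopBeforeDealloc=*/true);
}
```

One design point visible in the hunk: the waveslot-vs-VGPR-limited occupancy heuristic is consulted only on the non-dynamic path; in dynamic VGPR mode the release is emitted for every marked instruction.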