@@ -1647,17 +1647,21 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
       (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
    Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
  }
-  // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
-  // stores. In this case it can be useful to send a message to explicitly
-  // release all VGPRs before the stores have completed, but it is only safe to
-  // do this if:
-  // * there are no outstanding scratch stores
-  // * we are not in Dynamic VGPR mode
+  // In dynamic VGPR mode, we want to release the VGPRs before the wave exits.
+  // Technically the hardware will do this on its own if we don't, but that
+  // might cost extra cycles compared to doing it explicitly.
+  // When not in dynamic VGPR mode, identify S_ENDPGM instructions which may
+  // have to wait for outstanding VMEM stores. In this case it can be useful to
+  // send a message to explicitly release all VGPRs before the stores have
+  // completed, but it is only safe to do this if there are no outstanding
+  // scratch stores.
  else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
           MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
-    if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !WCG->isOptNone() &&
-        ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
-        !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+    if (!WCG->isOptNone() &&
+        (ST->isDynamicVGPREnabled() ||
+         (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+          ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
+          !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))))
      ReleaseVGPRInsts.insert(&MI);
  }
  // Resolve vm waits before gs-done.
@@ -2611,26 +2615,36 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
    }
  }

-  // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
-  // instructions.
+  // Deallocate the VGPRs before previously identified S_ENDPGM instructions.
+  // This is done in different ways depending on how the VGPRs were allocated
+  // (i.e. whether we're in dynamic VGPR mode or not).
  // Skip deallocation if kernel is waveslot limited vs VGPR limited. A short
  // waveslot limited kernel runs slower with the deallocation.
-  if (!ReleaseVGPRInsts.empty() &&
-      (MF.getFrameInfo().hasCalls() ||
-       ST->getOccupancyWithNumVGPRs(
-           TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
-           AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+  if (ST->isDynamicVGPREnabled()) {
    for (MachineInstr *MI : ReleaseVGPRInsts) {
-      if (ST->requiresNopBeforeDeallocVGPRs()) {
-        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-                TII->get(AMDGPU::S_NOP))
-            .addImm(0);
-      }
      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
-              TII->get(AMDGPU::S_SENDMSG))
-          .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+              TII->get(AMDGPU::S_ALLOC_VGPR))
+          .addImm(0);
      Modified = true;
    }
+  } else {
+    if (!ReleaseVGPRInsts.empty() &&
+        (MF.getFrameInfo().hasCalls() ||
+         ST->getOccupancyWithNumVGPRs(
+             TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass)) <
+             AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+      for (MachineInstr *MI : ReleaseVGPRInsts) {
+        if (ST->requiresNopBeforeDeallocVGPRs()) {
+          BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                  TII->get(AMDGPU::S_NOP))
+              .addImm(0);
+        }
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+                TII->get(AMDGPU::S_SENDMSG))
+            .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+        Modified = true;
+      }
+    }
  }
  ReleaseVGPRInsts.clear();
  PreheadersToFlush.clear();
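
For context, a rough sketch of the epilogue each path above produces, with mnemonics inferred from the opcodes built in the diff (S_ALLOC_VGPR, S_NOP, S_SENDMSG); the exact assembler syntax is an assumption, not taken from this patch:

    ; Dynamic VGPR mode: explicitly hand the wave's VGPR blocks back on exit.
      s_alloc_vgpr 0                         ; from TII->get(AMDGPU::S_ALLOC_VGPR), imm 0
      s_endpgm

    ; Non-dynamic mode (GFX11+, pending VMEM stores, no scratch stores):
      s_nop 0                                ; only if ST->requiresNopBeforeDeallocVGPRs()
      s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)   ; ID_DEALLOC_VGPRS_GFX11Plus
      s_endpgm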