Skip to content

[AMDGPU] Move WWM register pre-allocation to during regalloc #70618

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
4 commits merged on Nov 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1277,7 +1277,6 @@ void GCNPassConfig::addFastRegAlloc() {
insertPass(&PHIEliminationID, &SILowerControlFlowID);

insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);

TargetPassConfig::addFastRegAlloc();
}
Expand All @@ -1286,7 +1285,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
// instructions that cause scheduling barriers.
insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);

if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
Expand Down Expand Up @@ -1373,6 +1371,7 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
addPass(&SIPreAllocateWWMRegsID);

addPass(createVGPRAllocPass(false));

Expand All @@ -1396,6 +1395,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
addPass(&SIPreAllocateWWMRegsID);

addPass(createVGPRAllocPass(true));

Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,11 +56,9 @@ class SIPreAllocateWWMRegs : public MachineFunctionPass {

void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
AU.addRequired<VirtRegMap>();
AU.addRequired<LiveRegMatrix>();
AU.addPreserved<SlotIndexes>();
AU.setPreservesCFG();
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}

Expand Down Expand Up @@ -101,7 +99,7 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
LiveInterval &LI = LIS->getInterval(Reg);

for (MCRegister PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
if (!MRI->isPhysRegUsed(PhysReg) &&
if (!MRI->isPhysRegUsed(PhysReg, /*SkipRegMaskTest=*/true) &&
Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
Matrix->assign(LI, PhysReg);
assert(PhysReg != 0);
Expand Down Expand Up @@ -217,6 +215,9 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
RegsAssigned |= processDef(MI.getOperand(0));

if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
continue;

if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) {
; REGALLOC-NEXT: renamable $vgpr3 = IMPLICIT_DEF
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr2, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5)
; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5)
; REGALLOC-NEXT: renamable $vgpr1 = COPY killed $vgpr0
; REGALLOC-NEXT: renamable $vgpr1 = COPY $vgpr0
; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 49
; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr1, killed $sgpr4, implicit $exec
Expand Down
24 changes: 9 additions & 15 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,14 @@
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
; GCN-O0-NEXT: SI Whole Quad Mode
; GCN-O0-NEXT: Virtual Register Map
; GCN-O0-NEXT: Live Register Matrix
; GCN-O0-NEXT: SI Pre-allocate WWM Registers
; GCN-O0-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI lower SGPR spill instructions
; GCN-O0-NEXT: Slot index numbering
; GCN-O0-NEXT: Live Interval Analysis
; GCN-O0-NEXT: Virtual Register Map
; GCN-O0-NEXT: Live Register Matrix
; GCN-O0-NEXT: SI Pre-allocate WWM Registers
; GCN-O0-NEXT: Fast Register Allocator
; GCN-O0-NEXT: SI Lower WWM Copies
; GCN-O0-NEXT: SI Fix VGPR copies
Expand Down Expand Up @@ -334,9 +336,6 @@
; GCN-O1-NEXT: Rename Disconnected Subregister Components
; GCN-O1-NEXT: Machine Instruction Scheduler
; GCN-O1-NEXT: SI Whole Quad Mode
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O1-NEXT: Machine Natural Loop Construction
Expand All @@ -354,6 +353,7 @@
; GCN-O1-NEXT: SI lower SGPR spill instructions
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: SI Lower WWM Copies
; GCN-O1-NEXT: GCN NSA Reassign
Expand Down Expand Up @@ -625,9 +625,6 @@
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA optimizations
; GCN-O1-OPTS-NEXT: Machine Instruction Scheduler
; GCN-O1-OPTS-NEXT: SI Whole Quad Mode
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-OPTS-NEXT: SI optimize exec mask operations pre-RA
; GCN-O1-OPTS-NEXT: AMDGPU Pre-RA Long Branch Reg
; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
Expand All @@ -645,6 +642,7 @@
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
Expand Down Expand Up @@ -927,9 +925,6 @@
; GCN-O2-NEXT: AMDGPU Pre-RA optimizations
; GCN-O2-NEXT: Machine Instruction Scheduler
; GCN-O2-NEXT: SI Whole Quad Mode
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
; GCN-O2-NEXT: SI optimize exec mask operations pre-RA
; GCN-O2-NEXT: SI Form memory clauses
; GCN-O2-NEXT: AMDGPU Pre-RA Long Branch Reg
Expand All @@ -948,6 +943,7 @@
; GCN-O2-NEXT: SI lower SGPR spill instructions
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: SI Lower WWM Copies
; GCN-O2-NEXT: GCN NSA Reassign
Expand Down Expand Up @@ -1242,9 +1238,6 @@
; GCN-O3-NEXT: AMDGPU Pre-RA optimizations
; GCN-O3-NEXT: Machine Instruction Scheduler
; GCN-O3-NEXT: SI Whole Quad Mode
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
; GCN-O3-NEXT: SI optimize exec mask operations pre-RA
; GCN-O3-NEXT: SI Form memory clauses
; GCN-O3-NEXT: AMDGPU Pre-RA Long Branch Reg
Expand All @@ -1263,6 +1256,7 @@
; GCN-O3-NEXT: SI lower SGPR spill instructions
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: SI Lower WWM Copies
; GCN-O3-NEXT: GCN NSA Reassign
Expand Down
9 changes: 9 additions & 0 deletions llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
; DEFAULT-NEXT: SI lower SGPR spill instructions
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: SI Pre-allocate WWM Registers
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: SI Lower WWM Copies
; DEFAULT-NEXT: GCN NSA Reassign
Expand All @@ -28,6 +29,11 @@

; O0: Fast Register Allocator
; O0-NEXT: SI lower SGPR spill instructions
; O0-NEXT: Slot index numbering
; O0-NEXT: Live Interval Analysis
; O0-NEXT: Virtual Register Map
; O0-NEXT: Live Register Matrix
; O0-NEXT: SI Pre-allocate WWM Registers
; O0-NEXT: Fast Register Allocator
; O0-NEXT: SI Lower WWM Copies
; O0-NEXT: SI Fix VGPR copies
Expand All @@ -46,6 +52,7 @@
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
Expand All @@ -63,6 +70,7 @@
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
Expand All @@ -82,6 +90,7 @@
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: SI Lower WWM Copies
; BASIC-BASIC-NEXT: GCN NSA Reassign
Expand Down
Loading