Skip to content

Commit 74381cb

Browse files
committed
[WIP] Implemented a patch to optimize SGPR spills.
Introduced the StackSlotColoring pass after SGPR RegAlloc and Spill to optimize stack slot reuse.
1 parent 1ba8ed0 commit 74381cb

10 files changed

+490
-83
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1421,6 +1421,9 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
14211421
// since FastRegAlloc does the replacements itself.
14221422
addPass(createVirtRegRewriter(false));
14231423

1424+
// Optimizes SGPR spills into VGPR lanes for non-interfering spill ranges.
1425+
addPass(&StackSlotColoringID);
1426+
14241427
// Equivalent of PEI for SGPRs.
14251428
addPass(&SILowerSGPRSpillsID);
14261429
addPass(&SIPreAllocateWWMRegsID);

llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@ class SILowerSGPRSpills : public MachineFunctionPass {
5252
void calculateSaveRestoreBlocks(MachineFunction &MF);
5353
bool spillCalleeSavedRegs(MachineFunction &MF,
5454
SmallVectorImpl<int> &CalleeSavedFIs);
55-
void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
55+
void extendWWMVirtRegLiveness(MachineFunction &MF, SlotIndexes *Indexes,
56+
LiveIntervals *LIS);
5657

5758
bool runOnMachineFunction(MachineFunction &MF) override;
5859

@@ -260,6 +261,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(
260261
}
261262

262263
void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
264+
SlotIndexes *Indexes,
263265
LiveIntervals *LIS) {
264266
// TODO: This is a workaround to avoid the unmodelled liveness computed with
265267
// whole-wave virtual registers when allocated together with the regular VGPR
@@ -278,14 +280,21 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
278280
for (auto Reg : MFI->getSGPRSpillVGPRs()) {
279281
for (MachineBasicBlock *SaveBlock : SaveBlocks) {
280282
MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
283+
MachineInstrSpan MIS(InsertBefore, SaveBlock);
284+
281285
DebugLoc DL = SaveBlock->findDebugLoc(InsertBefore);
282286
auto MIB = BuildMI(*SaveBlock, InsertBefore, DL,
283287
TII->get(AMDGPU::IMPLICIT_DEF), Reg);
284288
MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
285289
// Set SGPR_SPILL asm printer flag
286290
MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
291+
287292
if (LIS) {
288293
LIS->InsertMachineInstrInMaps(*MIB);
294+
} else if (Indexes) {
295+
assert(std::distance(MIS.begin(), InsertBefore) == 1);
296+
MachineInstr &Inst = *std::prev(InsertBefore);
297+
Indexes->insertMachineInstrInMaps(Inst);
289298
}
290299
}
291300
}
@@ -300,8 +309,13 @@ void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
300309
auto MIB = BuildMI(*RestoreBlock, InsertBefore, DL,
301310
TII->get(TargetOpcode::KILL));
302311
MIB.addReg(Reg);
303-
if (LIS)
312+
313+
if (LIS) {
304314
LIS->InsertMachineInstrInMaps(*MIB);
315+
} else if (Indexes) {
316+
MachineInstr &Inst = *std::prev(InsertBefore);
317+
Indexes->insertMachineInstrInMaps(Inst);
318+
}
305319
}
306320
}
307321
}
@@ -392,7 +406,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
392406
}
393407

394408
if (SpilledToVirtVGPRLanes) {
395-
extendWWMVirtRegLiveness(MF, LIS);
409+
extendWWMVirtRegLiveness(MF, Indexes, LIS);
396410
if (LIS) {
397411
// Compute the LiveInterval for the newly created virtual registers.
398412
for (auto Reg : FuncInfo->getSGPRSpillVGPRs())

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1775,8 +1775,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
17751775

17761776
if (SpillToVGPR) {
17771777

1778-
assert(SB.NumSubRegs == VGPRSpills.size() &&
1779-
"Num of VGPR lanes should be equal to num of SGPRs spilled");
1778+
assert(SB.NumSubRegs <= VGPRSpills.size() &&
1779+
"Num of VGPR lanes should be greater or equal to num of SGPRs "
1780+
"spilled, as Stack Slot Coloring pass assigns different SGPR spills "
1781+
"into same stack slots");
17801782

17811783
for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
17821784
Register SubReg =

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,10 +366,12 @@
366366
; GCN-O1-NEXT: Machine Optimization Remark Emitter
367367
; GCN-O1-NEXT: Greedy Register Allocator
368368
; GCN-O1-NEXT: Virtual Register Rewriter
369+
; GCN-O1-NEXT: Stack Slot Coloring
369370
; GCN-O1-NEXT: SI lower SGPR spill instructions
370371
; GCN-O1-NEXT: Virtual Register Map
371372
; GCN-O1-NEXT: Live Register Matrix
372373
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
374+
; GCN-O1-NEXT: Live Stack Slot Analysis
373375
; GCN-O1-NEXT: Greedy Register Allocator
374376
; GCN-O1-NEXT: SI Lower WWM Copies
375377
; GCN-O1-NEXT: GCN NSA Reassign
@@ -671,10 +673,12 @@
671673
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
672674
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
673675
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
676+
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
674677
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
675678
; GCN-O1-OPTS-NEXT: Virtual Register Map
676679
; GCN-O1-OPTS-NEXT: Live Register Matrix
677680
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
681+
; GCN-O1-OPTS-NEXT: Live Stack Slot Analysis
678682
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
679683
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
680684
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
@@ -982,10 +986,12 @@
982986
; GCN-O2-NEXT: Machine Optimization Remark Emitter
983987
; GCN-O2-NEXT: Greedy Register Allocator
984988
; GCN-O2-NEXT: Virtual Register Rewriter
989+
; GCN-O2-NEXT: Stack Slot Coloring
985990
; GCN-O2-NEXT: SI lower SGPR spill instructions
986991
; GCN-O2-NEXT: Virtual Register Map
987992
; GCN-O2-NEXT: Live Register Matrix
988993
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
994+
; GCN-O2-NEXT: Live Stack Slot Analysis
989995
; GCN-O2-NEXT: Greedy Register Allocator
990996
; GCN-O2-NEXT: SI Lower WWM Copies
991997
; GCN-O2-NEXT: GCN NSA Reassign
@@ -1305,10 +1311,12 @@
13051311
; GCN-O3-NEXT: Machine Optimization Remark Emitter
13061312
; GCN-O3-NEXT: Greedy Register Allocator
13071313
; GCN-O3-NEXT: Virtual Register Rewriter
1314+
; GCN-O3-NEXT: Stack Slot Coloring
13081315
; GCN-O3-NEXT: SI lower SGPR spill instructions
13091316
; GCN-O3-NEXT: Virtual Register Map
13101317
; GCN-O3-NEXT: Live Register Matrix
13111318
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
1319+
; GCN-O3-NEXT: Live Stack Slot Analysis
13121320
; GCN-O3-NEXT: Greedy Register Allocator
13131321
; GCN-O3-NEXT: SI Lower WWM Copies
13141322
; GCN-O3-NEXT: GCN NSA Reassign

llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -221,15 +221,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
221221
; GFX906-NEXT: ; def s29
222222
; GFX906-NEXT: ;;#ASMEND
223223
; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
224-
; GFX906-NEXT: v_writelane_b32 v40, s21, 24
225-
; GFX906-NEXT: v_writelane_b32 v40, s22, 25
226-
; GFX906-NEXT: v_writelane_b32 v40, s23, 26
227-
; GFX906-NEXT: v_writelane_b32 v40, s24, 27
228-
; GFX906-NEXT: v_writelane_b32 v40, s25, 28
229-
; GFX906-NEXT: v_writelane_b32 v40, s26, 29
230-
; GFX906-NEXT: v_writelane_b32 v40, s27, 30
231-
; GFX906-NEXT: v_writelane_b32 v40, s28, 31
232-
; GFX906-NEXT: v_writelane_b32 v40, s29, 32
224+
; GFX906-NEXT: v_writelane_b32 v40, s21, 12
225+
; GFX906-NEXT: v_writelane_b32 v40, s22, 13
226+
; GFX906-NEXT: v_writelane_b32 v40, s23, 14
227+
; GFX906-NEXT: v_writelane_b32 v40, s24, 15
228+
; GFX906-NEXT: v_writelane_b32 v40, s25, 16
229+
; GFX906-NEXT: v_writelane_b32 v40, s26, 17
230+
; GFX906-NEXT: v_writelane_b32 v40, s27, 18
231+
; GFX906-NEXT: v_writelane_b32 v40, s28, 19
232+
; GFX906-NEXT: v_writelane_b32 v40, s29, 20
233233
; GFX906-NEXT: v_readlane_b32 s4, v40, 10
234234
; GFX906-NEXT: v_readlane_b32 s6, v40, 0
235235
; GFX906-NEXT: v_readlane_b32 s8, v40, 8
@@ -249,39 +249,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
249249
; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17]
250250
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
251251
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
252-
; GFX906-NEXT: v_readlane_b32 s21, v40, 24
252+
; GFX906-NEXT: v_readlane_b32 s21, v40, 12
253253
; GFX906-NEXT: ;;#ASMSTART
254254
; GFX906-NEXT: ; use s21
255255
; GFX906-NEXT: ;;#ASMEND
256-
; GFX906-NEXT: v_readlane_b32 s22, v40, 25
256+
; GFX906-NEXT: v_readlane_b32 s22, v40, 13
257257
; GFX906-NEXT: ;;#ASMSTART
258258
; GFX906-NEXT: ; use s22
259259
; GFX906-NEXT: ;;#ASMEND
260-
; GFX906-NEXT: v_readlane_b32 s23, v40, 26
260+
; GFX906-NEXT: v_readlane_b32 s23, v40, 14
261261
; GFX906-NEXT: ;;#ASMSTART
262262
; GFX906-NEXT: ; use s23
263263
; GFX906-NEXT: ;;#ASMEND
264-
; GFX906-NEXT: v_readlane_b32 s24, v40, 27
264+
; GFX906-NEXT: v_readlane_b32 s24, v40, 15
265265
; GFX906-NEXT: ;;#ASMSTART
266266
; GFX906-NEXT: ; use s24
267267
; GFX906-NEXT: ;;#ASMEND
268-
; GFX906-NEXT: v_readlane_b32 s25, v40, 28
268+
; GFX906-NEXT: v_readlane_b32 s25, v40, 16
269269
; GFX906-NEXT: ;;#ASMSTART
270270
; GFX906-NEXT: ; use s25
271271
; GFX906-NEXT: ;;#ASMEND
272-
; GFX906-NEXT: v_readlane_b32 s26, v40, 29
272+
; GFX906-NEXT: v_readlane_b32 s26, v40, 17
273273
; GFX906-NEXT: ;;#ASMSTART
274274
; GFX906-NEXT: ; use s26
275275
; GFX906-NEXT: ;;#ASMEND
276-
; GFX906-NEXT: v_readlane_b32 s27, v40, 30
276+
; GFX906-NEXT: v_readlane_b32 s27, v40, 18
277277
; GFX906-NEXT: ;;#ASMSTART
278278
; GFX906-NEXT: ; use s27
279279
; GFX906-NEXT: ;;#ASMEND
280-
; GFX906-NEXT: v_readlane_b32 s28, v40, 31
280+
; GFX906-NEXT: v_readlane_b32 s28, v40, 19
281281
; GFX906-NEXT: ;;#ASMSTART
282282
; GFX906-NEXT: ; use s28
283283
; GFX906-NEXT: ;;#ASMEND
284-
; GFX906-NEXT: v_readlane_b32 s29, v40, 32
284+
; GFX906-NEXT: v_readlane_b32 s29, v40, 20
285285
; GFX906-NEXT: ;;#ASMSTART
286286
; GFX906-NEXT: ; use s29
287287
; GFX906-NEXT: ;;#ASMEND
@@ -602,15 +602,15 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
602602
; GFX908-NEXT: ; def s29
603603
; GFX908-NEXT: ;;#ASMEND
604604
; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
605-
; GFX908-NEXT: v_writelane_b32 v40, s21, 24
606-
; GFX908-NEXT: v_writelane_b32 v40, s22, 25
607-
; GFX908-NEXT: v_writelane_b32 v40, s23, 26
608-
; GFX908-NEXT: v_writelane_b32 v40, s24, 27
609-
; GFX908-NEXT: v_writelane_b32 v40, s25, 28
610-
; GFX908-NEXT: v_writelane_b32 v40, s26, 29
611-
; GFX908-NEXT: v_writelane_b32 v40, s27, 30
612-
; GFX908-NEXT: v_writelane_b32 v40, s28, 31
613-
; GFX908-NEXT: v_writelane_b32 v40, s29, 32
605+
; GFX908-NEXT: v_writelane_b32 v40, s21, 12
606+
; GFX908-NEXT: v_writelane_b32 v40, s22, 13
607+
; GFX908-NEXT: v_writelane_b32 v40, s23, 14
608+
; GFX908-NEXT: v_writelane_b32 v40, s24, 15
609+
; GFX908-NEXT: v_writelane_b32 v40, s25, 16
610+
; GFX908-NEXT: v_writelane_b32 v40, s26, 17
611+
; GFX908-NEXT: v_writelane_b32 v40, s27, 18
612+
; GFX908-NEXT: v_writelane_b32 v40, s28, 19
613+
; GFX908-NEXT: v_writelane_b32 v40, s29, 20
614614
; GFX908-NEXT: v_readlane_b32 s4, v40, 10
615615
; GFX908-NEXT: v_readlane_b32 s6, v40, 0
616616
; GFX908-NEXT: v_readlane_b32 s8, v40, 8
@@ -630,39 +630,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
630630
; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17]
631631
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
632632
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
633-
; GFX908-NEXT: v_readlane_b32 s21, v40, 24
633+
; GFX908-NEXT: v_readlane_b32 s21, v40, 12
634634
; GFX908-NEXT: ;;#ASMSTART
635635
; GFX908-NEXT: ; use s21
636636
; GFX908-NEXT: ;;#ASMEND
637-
; GFX908-NEXT: v_readlane_b32 s22, v40, 25
637+
; GFX908-NEXT: v_readlane_b32 s22, v40, 13
638638
; GFX908-NEXT: ;;#ASMSTART
639639
; GFX908-NEXT: ; use s22
640640
; GFX908-NEXT: ;;#ASMEND
641-
; GFX908-NEXT: v_readlane_b32 s23, v40, 26
641+
; GFX908-NEXT: v_readlane_b32 s23, v40, 14
642642
; GFX908-NEXT: ;;#ASMSTART
643643
; GFX908-NEXT: ; use s23
644644
; GFX908-NEXT: ;;#ASMEND
645-
; GFX908-NEXT: v_readlane_b32 s24, v40, 27
645+
; GFX908-NEXT: v_readlane_b32 s24, v40, 15
646646
; GFX908-NEXT: ;;#ASMSTART
647647
; GFX908-NEXT: ; use s24
648648
; GFX908-NEXT: ;;#ASMEND
649-
; GFX908-NEXT: v_readlane_b32 s25, v40, 28
649+
; GFX908-NEXT: v_readlane_b32 s25, v40, 16
650650
; GFX908-NEXT: ;;#ASMSTART
651651
; GFX908-NEXT: ; use s25
652652
; GFX908-NEXT: ;;#ASMEND
653-
; GFX908-NEXT: v_readlane_b32 s26, v40, 29
653+
; GFX908-NEXT: v_readlane_b32 s26, v40, 17
654654
; GFX908-NEXT: ;;#ASMSTART
655655
; GFX908-NEXT: ; use s26
656656
; GFX908-NEXT: ;;#ASMEND
657-
; GFX908-NEXT: v_readlane_b32 s27, v40, 30
657+
; GFX908-NEXT: v_readlane_b32 s27, v40, 18
658658
; GFX908-NEXT: ;;#ASMSTART
659659
; GFX908-NEXT: ; use s27
660660
; GFX908-NEXT: ;;#ASMEND
661-
; GFX908-NEXT: v_readlane_b32 s28, v40, 31
661+
; GFX908-NEXT: v_readlane_b32 s28, v40, 19
662662
; GFX908-NEXT: ;;#ASMSTART
663663
; GFX908-NEXT: ; use s28
664664
; GFX908-NEXT: ;;#ASMEND
665-
; GFX908-NEXT: v_readlane_b32 s29, v40, 32
665+
; GFX908-NEXT: v_readlane_b32 s29, v40, 20
666666
; GFX908-NEXT: ;;#ASMSTART
667667
; GFX908-NEXT: ; use s29
668668
; GFX908-NEXT: ;;#ASMEND

llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717

1818
; DEFAULT: Greedy Register Allocator
1919
; DEFAULT-NEXT: Virtual Register Rewriter
20+
; DEFAULT-NEXT: Stack Slot Coloring
2021
; DEFAULT-NEXT: SI lower SGPR spill instructions
2122
; DEFAULT-NEXT: Virtual Register Map
2223
; DEFAULT-NEXT: Live Register Matrix
2324
; DEFAULT-NEXT: SI Pre-allocate WWM Registers
25+
; DEFAULT-NEXT: Live Stack Slot Analysis
2426
; DEFAULT-NEXT: Greedy Register Allocator
2527
; DEFAULT-NEXT: SI Lower WWM Copies
2628
; DEFAULT-NEXT: GCN NSA Reassign
@@ -50,10 +52,12 @@
5052
; BASIC-DEFAULT-NEXT: Live Register Matrix
5153
; BASIC-DEFAULT-NEXT: Basic Register Allocator
5254
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
55+
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
5356
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
5457
; BASIC-DEFAULT-NEXT: Virtual Register Map
5558
; BASIC-DEFAULT-NEXT: Live Register Matrix
5659
; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
60+
; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis
5761
; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
5862
; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
5963
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
@@ -69,10 +73,12 @@
6973

7074
; DEFAULT-BASIC: Greedy Register Allocator
7175
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
76+
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
7277
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
7378
; DEFAULT-BASIC-NEXT: Virtual Register Map
7479
; DEFAULT-BASIC-NEXT: Live Register Matrix
7580
; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
81+
; DEFAULT-BASIC-NEXT: Live Stack Slot Analysis
7682
; DEFAULT-BASIC-NEXT: Basic Register Allocator
7783
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
7884
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
@@ -90,10 +96,12 @@
9096
; BASIC-BASIC-NEXT: Live Register Matrix
9197
; BASIC-BASIC-NEXT: Basic Register Allocator
9298
; BASIC-BASIC-NEXT: Virtual Register Rewriter
99+
; BASIC-BASIC-NEXT: Stack Slot Coloring
93100
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
94101
; BASIC-BASIC-NEXT: Virtual Register Map
95102
; BASIC-BASIC-NEXT: Live Register Matrix
96103
; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
104+
; BASIC-BASIC-NEXT: Live Stack Slot Analysis
97105
; BASIC-BASIC-NEXT: Basic Register Allocator
98106
; BASIC-BASIC-NEXT: SI Lower WWM Copies
99107
; BASIC-BASIC-NEXT: GCN NSA Reassign

0 commit comments

Comments
 (0)