Skip to content

Commit ca6845a

Browse files
committed
[AMDGPU] Add mark last scratch load pass
1 parent 4d18462 commit ca6845a

File tree

10 files changed

+700
-0
lines changed

10 files changed

+700
-0
lines changed

llvm/include/llvm/CodeGen/TargetPassConfig.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,9 @@ class TargetPassConfig : public ImmutablePass {
423423
/// to physical registers.
424424
virtual void addPostRewrite() { }
425425

426+
/// Add passes to be run immediately before the Stack Slot Coloring pass.
427+
virtual void addPreStackSlotColoring() {}
428+
426429
/// This method may be implemented by targets that want to run passes after
427430
/// register allocation pass pipeline but before prolog-epilog insertion.
428431
virtual void addPostRegAlloc() { }

llvm/lib/CodeGen/TargetPassConfig.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1502,6 +1502,8 @@ void TargetPassConfig::addOptimizedRegAlloc() {
15021502
addPass(&MachineSchedulerID);
15031503

15041504
if (addRegAssignAndRewriteOptimized()) {
1505+
addPreStackSlotColoring();
1506+
15051507
// Perform stack slot coloring and post-ra machine LICM.
15061508
addPass(&StackSlotColoringID);
15071509

llvm/lib/Target/AMDGPU/AMDGPU.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,9 @@ extern char &SILowerI1CopiesID;
166166
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
167167
extern char &AMDGPUGlobalISelDivergenceLoweringID;
168168

169+
void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &);
170+
extern char &AMDGPUMarkLastScratchLoadID;
171+
169172
void initializeSILowerSGPRSpillsPass(PassRegistry &);
170173
extern char &SILowerSGPRSpillsID;
171174

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
//===-- AMDGPUMarkLastScratchLoad.cpp -------------------------------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// Mark scratch load/spill instructions which are guaranteed to be the last use
10+
// of their scratch slot, so that the slot can be evicted from caches.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "AMDGPU.h"
15+
#include "GCNSubtarget.h"
16+
#include "llvm/CodeGen/LiveStacks.h"
17+
#include "llvm/CodeGen/MachineOperand.h"
18+
19+
using namespace llvm;
20+
21+
#define DEBUG_TYPE "amdgpu-mark-last-scratch-load"
22+
23+
namespace {
24+
25+
class AMDGPUMarkLastScratchLoad : public MachineFunctionPass {
26+
private:
27+
LiveStacks *LS = nullptr;
28+
SlotIndexes *SI = nullptr;
29+
const SIInstrInfo *SII = nullptr;
30+
31+
public:
32+
static char ID;
33+
34+
AMDGPUMarkLastScratchLoad() : MachineFunctionPass(ID) {
35+
initializeAMDGPUMarkLastScratchLoadPass(*PassRegistry::getPassRegistry());
36+
}
37+
38+
bool runOnMachineFunction(MachineFunction &MF) override;
39+
40+
void getAnalysisUsage(AnalysisUsage &AU) const override {
41+
AU.addRequired<SlotIndexes>();
42+
AU.addRequired<LiveStacks>();
43+
AU.setPreservesAll();
44+
MachineFunctionPass::getAnalysisUsage(AU);
45+
}
46+
47+
StringRef getPassName() const override {
48+
return "AMDGPU Mark Last Scratch Load";
49+
}
50+
};
51+
52+
} // end anonymous namespace
53+
54+
/// Walk every live segment of every stack-slot live interval and set the
/// last_use operand (to 1) on the last load from that slot found inside the
/// segment.
///
/// The function is left untouched, and false is returned, when
///   * skipFunction() asks for the function to be skipped,
///   * the subtarget generation is older than GFX12, or
///   * LiveStacks reports no intervals.
///
/// \param MF the machine function to process.
/// \returns true if at least one load had its last_use operand set.
bool AMDGPUMarkLastScratchLoad::runOnMachineFunction(MachineFunction &MF) {
  LLVM_DEBUG({
    dbgs() << "********** Mark Last Scratch Load **********\n"
           << "********** Function: " << MF.getName() << '\n';
  });

  if (skipFunction(MF.getFunction()))
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  if (ST.getGeneration() < AMDGPUSubtarget::GFX12)
    return false;

  LS = &getAnalysis<LiveStacks>();
  SI = &getAnalysis<SlotIndexes>();
  SII = ST.getInstrInfo();

  const unsigned NumSlots = LS->getNumIntervals();
  if (NumSlots == 0) {
    LLVM_DEBUG(dbgs() << "No live slots, skipping\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << NumSlots << " intervals\n");

  bool Changed = false;

  for (auto &[SS, LI] : *LS) {
    LLVM_DEBUG(dbgs() << "Checking interval: " << LI << "\n");

    // The frame index is a property of the interval, not of the individual
    // segment, so compute it once per interval.
    const int FrameIndex = Register::stackSlot2Index(LI.reg());

    for (const LiveRange::Segment &Segment : LI.segments) {
      LLVM_DEBUG(dbgs() << "  Checking segment: " << Segment << "\n");

      // Ignore segments that run to the end of basic block because in this case
      // slot is still live at the end of it.
      if (Segment.end.isBlock())
        continue;

      MachineInstr *MISegmentEnd = SI->getInstructionFromIndex(Segment.end);
      if (!MISegmentEnd) {
        // FIXME: The start and end can refer to deleted instructions. We should
        // be able to handle this more gracefully by finding the closest real
        // instructions.
        continue;
      }

      // Only look up the segment start once the segment end is known to map
      // to a real instruction.
      MachineInstr *MISegmentStart = SI->getInstructionFromIndex(Segment.start);
      MachineBasicBlock *BB = MISegmentEnd->getParent();

      // Start iteration backwards from segment end until the start of basic
      // block or start of segment if it is in the same basic block.
      auto End = BB->instr_rend();
      if (MISegmentStart && MISegmentStart->getParent() == BB)
        End = MISegmentStart->getReverseIterator();

      // The first matching load met while walking backwards is the last load
      // of this slot in program order within the scanned range.
      MachineInstr *LastLoad = nullptr;
      for (auto MI = MISegmentEnd->getReverseIterator(); MI != End; ++MI) {
        int LoadFI = 0;

        if (SII->isLoadFromStackSlot(*MI, LoadFI) && LoadFI == FrameIndex) {
          LastLoad = &*MI;
          break;
        }
      }

      if (!LastLoad)
        continue;

      MachineOperand *LastUse =
          SII->getNamedOperand(*LastLoad, AMDGPU::OpName::last_use);
      assert(LastUse && "This instruction must have a last_use operand");
      LastUse->setImm(1);
      Changed = true;
      LLVM_DEBUG(dbgs() << "  Found last load: " << *LastLoad);
    }
  }

  return Changed;
}
133+
134+
char AMDGPUMarkLastScratchLoad::ID = 0;
135+
136+
char &llvm::AMDGPUMarkLastScratchLoadID = AMDGPUMarkLastScratchLoad::ID;
137+
138+
INITIALIZE_PASS_BEGIN(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
139+
"AMDGPU Mark last scratch load", false, false)
140+
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
141+
INITIALIZE_PASS_DEPENDENCY(LiveStacks)
142+
INITIALIZE_PASS_END(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
143+
"AMDGPU Mark last scratch load", false, false)

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
382382
initializeSILowerI1CopiesPass(*PR);
383383
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
384384
initializeSILowerWWMCopiesPass(*PR);
385+
initializeAMDGPUMarkLastScratchLoadPass(*PR);
385386
initializeSILowerSGPRSpillsPass(*PR);
386387
initializeSIFixSGPRCopiesPass(*PR);
387388
initializeSIFixVGPRCopiesPass(*PR);
@@ -962,6 +963,7 @@ class GCNPassConfig final : public AMDGPUPassConfig {
962963

963964
void addPreRegAlloc() override;
964965
bool addPreRewrite() override;
966+
void addPreStackSlotColoring() override;
965967
void addPostRegAlloc() override;
966968
void addPreSched2() override;
967969
void addPreEmitPass() override;
@@ -1346,6 +1348,10 @@ bool GCNPassConfig::addPreRewrite() {
13461348
return true;
13471349
}
13481350

1351+
void GCNPassConfig::addPreStackSlotColoring() {
1352+
addPass(&AMDGPUMarkLastScratchLoadID);
1353+
}
1354+
13491355
FunctionPass *GCNPassConfig::createSGPRAllocPass(bool Optimized) {
13501356
// Initialize the global default.
13511357
llvm::call_once(InitializeDefaultSGPRRegisterAllocatorFlag,

llvm/lib/Target/AMDGPU/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ add_llvm_target(AMDGPUCodeGen
7979
AMDGPUMCInstLower.cpp
8080
AMDGPUIGroupLP.cpp
8181
AMDGPUInsertSingleUseVDST.cpp
82+
AMDGPUMarkLastScratchLoad.cpp
8283
AMDGPUMIRFormatter.cpp
8384
AMDGPUOpenCLEnqueuedBlockLowering.cpp
8485
AMDGPUPerfHintAnalysis.cpp

llvm/test/CodeGen/AMDGPU/llc-pipeline.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,7 @@
359359
; GCN-O1-NEXT: SI Lower WWM Copies
360360
; GCN-O1-NEXT: GCN NSA Reassign
361361
; GCN-O1-NEXT: Virtual Register Rewriter
362+
; GCN-O1-NEXT: AMDGPU Mark Last Scratch Load
362363
; GCN-O1-NEXT: Stack Slot Coloring
363364
; GCN-O1-NEXT: Machine Copy Propagation Pass
364365
; GCN-O1-NEXT: Machine Loop Invariant Code Motion
@@ -655,6 +656,7 @@
655656
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
656657
; GCN-O1-OPTS-NEXT: GCN NSA Reassign
657658
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
659+
; GCN-O1-OPTS-NEXT: AMDGPU Mark Last Scratch Load
658660
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
659661
; GCN-O1-OPTS-NEXT: Machine Copy Propagation Pass
660662
; GCN-O1-OPTS-NEXT: Machine Loop Invariant Code Motion
@@ -957,6 +959,7 @@
957959
; GCN-O2-NEXT: SI Lower WWM Copies
958960
; GCN-O2-NEXT: GCN NSA Reassign
959961
; GCN-O2-NEXT: Virtual Register Rewriter
962+
; GCN-O2-NEXT: AMDGPU Mark Last Scratch Load
960963
; GCN-O2-NEXT: Stack Slot Coloring
961964
; GCN-O2-NEXT: Machine Copy Propagation Pass
962965
; GCN-O2-NEXT: Machine Loop Invariant Code Motion
@@ -1271,6 +1274,7 @@
12711274
; GCN-O3-NEXT: SI Lower WWM Copies
12721275
; GCN-O3-NEXT: GCN NSA Reassign
12731276
; GCN-O3-NEXT: Virtual Register Rewriter
1277+
; GCN-O3-NEXT: AMDGPU Mark Last Scratch Load
12741278
; GCN-O3-NEXT: Stack Slot Coloring
12751279
; GCN-O3-NEXT: Machine Copy Propagation Pass
12761280
; GCN-O3-NEXT: Machine Loop Invariant Code Motion

llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
; DEFAULT-NEXT: SI Lower WWM Copies
2626
; DEFAULT-NEXT: GCN NSA Reassign
2727
; DEFAULT-NEXT: Virtual Register Rewriter
28+
; DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
2829
; DEFAULT-NEXT: Stack Slot Coloring
2930

3031
; O0: Fast Register Allocator
@@ -61,6 +62,7 @@
6162
; BASIC-DEFAULT-NEXT: SI Lower WWM Copies
6263
; BASIC-DEFAULT-NEXT: GCN NSA Reassign
6364
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
65+
; BASIC-DEFAULT-NEXT: AMDGPU Mark Last Scratch Load
6466
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
6567

6668

@@ -75,6 +77,7 @@
7577
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
7678
; DEFAULT-BASIC-NEXT: GCN NSA Reassign
7779
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
80+
; DEFAULT-BASIC-NEXT: AMDGPU Mark Last Scratch Load
7881
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
7982

8083

@@ -95,6 +98,7 @@
9598
; BASIC-BASIC-NEXT: SI Lower WWM Copies
9699
; BASIC-BASIC-NEXT: GCN NSA Reassign
97100
; BASIC-BASIC-NEXT: Virtual Register Rewriter
101+
; BASIC-BASIC-NEXT: AMDGPU Mark Last Scratch Load
98102
; BASIC-BASIC-NEXT: Stack Slot Coloring
99103

100104

0 commit comments

Comments
 (0)