Skip to content

Commit 7db61b5

Browse files
committed
Set last use (TH_LU) in the cache policy of scratch spill reloads on GFX12+
1 parent 99e37d7 commit 7db61b5

File tree

4 files changed: 42 additions and 34 deletions.

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
148148
LiveUnits.addReg(SpillReg);
149149
bool IsKill = !MBB.isLiveIn(SpillReg);
150150
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
151-
DwordOff, MMO, nullptr, &LiveUnits);
151+
DwordOff, false, MMO, nullptr, &LiveUnits);
152152
if (IsKill)
153153
LiveUnits.removeReg(SpillReg);
154154
}
@@ -170,7 +170,7 @@ static void buildEpilogRestore(const GCNSubtarget &ST,
170170
PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
171171
FrameInfo.getObjectAlign(FI));
172172
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
173-
DwordOff, MMO, nullptr, &LiveUnits);
173+
DwordOff, false, MMO, nullptr, &LiveUnits);
174174
}
175175

176176
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 18 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1320,8 +1320,8 @@ static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
13201320
void SIRegisterInfo::buildSpillLoadStore(
13211321
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
13221322
unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
1323-
MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
1324-
RegScavenger *RS, LiveRegUnits *LiveUnits) const {
1323+
MCRegister ScratchOffsetReg, int64_t InstOffset, bool LastUse,
1324+
MachineMemOperand *MMO, RegScavenger *RS, LiveRegUnits *LiveUnits) const {
13251325
assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
13261326

13271327
MachineFunction *MF = MBB.getParent();
@@ -1657,8 +1657,10 @@ void SIRegisterInfo::buildSpillLoadStore(
16571657
} else {
16581658
MIB.addReg(SOffset, SOffsetRegState);
16591659
}
1660+
1661+
int64_t CPol = AMDGPU::isGFX12Plus(ST) && LastUse ? AMDGPU::CPol::TH_LU : 0;
16601662
MIB.addImm(Offset + RegOffset)
1661-
.addImm(0); // cpol
1663+
.addImm(CPol);
16621664
if (!IsFlat)
16631665
MIB.addImm(0); // swz
16641666
MIB.addMemOperand(NewMMO);
@@ -1734,12 +1736,12 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
17341736
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
17351737
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
17361738
buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1737-
FrameReg, Offset * SB.EltSize, MMO, SB.RS);
1739+
FrameReg, Offset * SB.EltSize, false, MMO, SB.RS);
17381740
} else {
17391741
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
17401742
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
17411743
buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1742-
FrameReg, Offset * SB.EltSize, MMO, SB.RS);
1744+
FrameReg, Offset * SB.EltSize, false, MMO, SB.RS);
17431745
// This only ever adds one VGPR spill
17441746
SB.MFI.addToSpilledVGPRs(1);
17451747
}
@@ -2175,7 +2177,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
21752177
}
21762178
buildSpillLoadStore(
21772179
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2178-
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2180+
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), false,
21792181
*MI->memoperands_begin(), RS);
21802182
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
21812183
if (IsWWMRegSpill)
@@ -2241,14 +2243,20 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
22412243
TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
22422244
RS->isRegUsed(AMDGPU::SCC));
22432245
}
2246+
int16_t LastUseIdx =
2247+
AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::last_use);
2248+
bool LastUse = (LastUseIdx != -1)
2249+
? (MI->getOperand(LastUseIdx).getImm() == 1)
2250+
: false;
2251+
22442252
buildSpillLoadStore(
22452253
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2246-
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2254+
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(), LastUse,
22472255
*MI->memoperands_begin(), RS);
2248-
2249-
if (IsWWMRegSpill)
2256+
2257+
if (IsWWMRegSpill)
22502258
TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2251-
2259+
22522260
MI->eraseFromParent();
22532261
return true;
22542262
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,8 +427,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
427427
MachineBasicBlock::iterator MI, const DebugLoc &DL,
428428
unsigned LoadStoreOp, int Index, Register ValueReg,
429429
bool ValueIsKill, MCRegister ScratchOffsetReg,
430-
int64_t InstrOffset, MachineMemOperand *MMO,
431-
RegScavenger *RS,
430+
int64_t InstrOffset, bool LastUse,
431+
MachineMemOperand *MMO, RegScavenger *RS,
432432
LiveRegUnits *LiveUnits = nullptr) const;
433433

434434
// Return alignment in register file of first register in a register tuple.

llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.ll

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,19 @@ define amdgpu_cs void @max_6_vgprs(ptr addrspace(1) %p) "amdgpu-num-vgpr"="6" {
3030
; CHECK-NEXT: ;;#ASMEND
3131
; CHECK-NEXT: global_store_b32 v[0:1], v5, off th:TH_STORE_NT_RT
3232
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
33-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload
33+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
3434
; CHECK-NEXT: s_waitcnt vmcnt(0)
3535
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
3636
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
37-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 ; 4-byte Folded Reload
37+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
3838
; CHECK-NEXT: s_waitcnt vmcnt(0)
3939
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
4040
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
41-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 ; 4-byte Folded Reload
41+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
4242
; CHECK-NEXT: s_waitcnt vmcnt(0)
4343
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
4444
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
45-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 ; 4-byte Folded Reload
45+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
4646
; CHECK-NEXT: s_waitcnt vmcnt(0)
4747
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
4848
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
@@ -118,31 +118,31 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
118118
; CHECK-NEXT: ;;#ASMEND
119119
; CHECK-NEXT: global_store_b32 v[0:1], v10, off th:TH_STORE_NT_RT
120120
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
121-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 ; 4-byte Folded Reload
121+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
122122
; CHECK-NEXT: s_waitcnt vmcnt(0)
123123
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
124124
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
125-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 ; 4-byte Folded Reload
125+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
126126
; CHECK-NEXT: s_waitcnt vmcnt(0)
127127
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
128128
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
129-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 ; 4-byte Folded Reload
129+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
130130
; CHECK-NEXT: s_waitcnt vmcnt(0)
131131
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
132132
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
133-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 ; 4-byte Folded Reload
133+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
134134
; CHECK-NEXT: s_waitcnt vmcnt(0)
135135
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
136136
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
137-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 ; 4-byte Folded Reload
137+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
138138
; CHECK-NEXT: s_waitcnt vmcnt(0)
139139
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
140140
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
141-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 ; 4-byte Folded Reload
141+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
142142
; CHECK-NEXT: s_waitcnt vmcnt(0)
143143
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
144144
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
145-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 ; 4-byte Folded Reload
145+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
146146
; CHECK-NEXT: s_waitcnt vmcnt(0)
147147
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
148148
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
@@ -176,41 +176,41 @@ define amdgpu_cs void @max_11_vgprs_branch(ptr addrspace(1) %p, i32 %tmp) "amdgp
176176
; CHECK-NEXT: ;;#ASMEND
177177
; CHECK-NEXT: global_store_b32 v[0:1], v10, off th:TH_STORE_NT_RT
178178
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
179-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 ; 4-byte Folded Reload
179+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:20 th:TH_LOAD_LU ; 4-byte Folded Reload
180180
; CHECK-NEXT: s_waitcnt vmcnt(0)
181181
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
182182
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
183-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 ; 4-byte Folded Reload
183+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:24 th:TH_LOAD_LU ; 4-byte Folded Reload
184184
; CHECK-NEXT: s_waitcnt vmcnt(0)
185185
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
186186
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
187-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 ; 4-byte Folded Reload
187+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:28 th:TH_LOAD_LU ; 4-byte Folded Reload
188188
; CHECK-NEXT: s_waitcnt vmcnt(0)
189189
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
190190
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
191-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 ; 4-byte Folded Reload
191+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:32 th:TH_LOAD_LU ; 4-byte Folded Reload
192192
; CHECK-NEXT: s_waitcnt vmcnt(0)
193193
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
194194
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
195-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 ; 4-byte Folded Reload
195+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:36 th:TH_LOAD_LU ; 4-byte Folded Reload
196196
; CHECK-NEXT: s_waitcnt vmcnt(0)
197197
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
198198
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
199-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 ; 4-byte Folded Reload
199+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:12 th:TH_LOAD_LU ; 4-byte Folded Reload
200200
; CHECK-NEXT: s_waitcnt vmcnt(0)
201201
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
202202
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
203-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 ; 4-byte Folded Reload
203+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:16 th:TH_LOAD_LU ; 4-byte Folded Reload
204204
; CHECK-NEXT: s_waitcnt vmcnt(0)
205205
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
206206
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
207207
; CHECK-NEXT: .LBB1_4: ; %.exit
208208
; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s0
209-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 ; 4-byte Folded Reload
209+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:4 th:TH_LOAD_LU ; 4-byte Folded Reload
210210
; CHECK-NEXT: s_waitcnt vmcnt(0)
211211
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
212212
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0
213-
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 ; 4-byte Folded Reload
213+
; CHECK-NEXT: scratch_load_b32 v0, off, off offset:8 th:TH_LOAD_LU ; 4-byte Folded Reload
214214
; CHECK-NEXT: s_waitcnt vmcnt(0)
215215
; CHECK-NEXT: global_store_b32 v[0:1], v0, off th:TH_STORE_NT_RT
216216
; CHECK-NEXT: s_waitcnt_vscnt null, 0x0

0 commit comments

Comments
 (0)