Skip to content

Commit e703088

Browse files
committed
[AMDGPU] Support block load/store for CSR llvm#130013
1 parent 039e907 commit e703088

18 files changed

+1022
-15
lines changed

llvm/include/llvm/CodeGen/MachineFrameInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ class CalleeSavedInfo {
5353
/// Flag indicating whether the register is spilled to stack or another
5454
/// register.
5555
bool SpilledToReg = false;
56+
/// Flag indicating whether this CSI has been handled by the target and can be
57+
/// skipped by the generic code in the prolog/epilog inserter.
58+
bool IsHandledByTarget = false;
5659

5760
public:
5861
explicit CalleeSavedInfo(unsigned R, int FI = 0) : Reg(R), FrameIdx(FI) {}
@@ -61,6 +64,7 @@ class CalleeSavedInfo {
6164
Register getReg() const { return Reg; }
6265
int getFrameIdx() const { return FrameIdx; }
6366
unsigned getDstReg() const { return DstReg; }
67+
void setReg(MCRegister R) { Reg = R; }
6468
void setFrameIdx(int FI) {
6569
FrameIdx = FI;
6670
SpilledToReg = false;
@@ -72,6 +76,9 @@ class CalleeSavedInfo {
7276
bool isRestored() const { return Restored; }
7377
void setRestored(bool R) { Restored = R; }
7478
bool isSpilledToReg() const { return SpilledToReg; }
79+
80+
bool isHandledByTarget() const { return IsHandledByTarget; }
81+
void setHandledByTarget() { IsHandledByTarget = true; }
7582
};
7683

7784
/// The MachineFrameInfo class represents an abstract stack frame until

llvm/lib/CodeGen/PrologEpilogInserter.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
478478
for (auto &CS : CSI) {
479479
// If the target has spilled this register to another register, we don't
480480
// need to allocate a stack slot.
481-
if (CS.isSpilledToReg())
481+
if (CS.isSpilledToReg() || CS.isHandledByTarget())
482482
continue;
483483

484484
unsigned Reg = CS.getReg();
@@ -604,6 +604,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
604604
MachineBasicBlock::iterator I = SaveBlock.begin();
605605
if (!TFI->spillCalleeSavedRegisters(SaveBlock, I, CSI, TRI)) {
606606
for (const CalleeSavedInfo &CS : CSI) {
607+
if (CS.isHandledByTarget())
608+
continue;
607609
// Insert the spill to the stack frame.
608610
unsigned Reg = CS.getReg();
609611

@@ -634,6 +636,9 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
634636

635637
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
636638
for (const CalleeSavedInfo &CI : reverse(CSI)) {
639+
if (CI.isHandledByTarget())
640+
continue;
641+
637642
unsigned Reg = CI.getReg();
638643
if (CI.isSpilledToReg()) {
639644
BuildMI(RestoreBlock, I, DebugLoc(), TII.get(TargetOpcode::COPY), Reg)

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,14 @@ def FeatureXF32Insts : SubtargetFeature<"xf32-insts",
12451245
"v_mfma_f32_16x16x8_xf32 and v_mfma_f32_32x32x4_xf32"
12461246
>;
12471247

1248+
// Enable the use of SCRATCH_STORE/LOAD_BLOCK instructions for saving and
1249+
// restoring the callee-saved registers.
1250+
def FeatureUseBlockVGPROpsForCSR : SubtargetFeature<"block-vgpr-csr",
1251+
"UseBlockVGPROpsForCSR",
1252+
"true",
1253+
"Use block load/store for VGPR callee saved registers"
1254+
>;
1255+
12481256
// Dummy feature used to disable assembler instructions.
12491257
def FeatureDisable : SubtargetFeature<"",
12501258
"FeatureDisable","true",

llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "AMDGPUMachineFunction.h"
1919
#include "MCTargetDesc/AMDGPUInstPrinter.h"
2020
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21+
#include "SIMachineFunctionInfo.h"
2122
#include "llvm/CodeGen/MachineBasicBlock.h"
2223
#include "llvm/CodeGen/MachineInstr.h"
2324
#include "llvm/IR/Constants.h"
@@ -181,6 +182,36 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
181182
return AsmPrinter::lowerConstant(CV);
182183
}
183184

185+
static void emitVGPRBlockComment(const MachineInstr *MI, MCStreamer &OS) {
186+
// The instruction will only transfer a subset of the registers in the block,
187+
// based on the mask that is stored in m0. We could search for the instruction
188+
// that sets m0, but most of the time we'll already have the mask stored in
189+
// the machine function info. Try to use that. This assumes that we only use
190+
// block loads/stores for CSR spills.
191+
const MachineFunction *MF = MI->getParent()->getParent();
192+
const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
193+
const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
194+
const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
195+
196+
Register RegBlock =
197+
TII->getNamedOperand(*MI, MI->mayLoad() ? AMDGPU::OpName::vdst
198+
: AMDGPU::OpName::vdata)
199+
->getReg();
200+
Register FirstRegInBlock = TRI.getSubReg(RegBlock, AMDGPU::sub0);
201+
uint32_t Mask = MFI->getMaskForVGPRBlockOps(RegBlock);
202+
203+
SmallString<512> TransferredRegs;
204+
for (unsigned I = 0; I < 32; ++I) {
205+
if (Mask & (1 << I)) {
206+
(llvm::Twine(" ") + TRI.getName(FirstRegInBlock + I))
207+
.toVector(TransferredRegs);
208+
}
209+
}
210+
211+
if (!TransferredRegs.empty())
212+
OS.emitRawComment(" transferring at most " + TransferredRegs);
213+
}
214+
184215
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
185216
// FIXME: Enable feature predicate checks once all the test pass.
186217
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
@@ -269,6 +300,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
269300
return;
270301
}
271302

303+
if (STI.getInstrInfo()->isBlockLoadStore(MI->getOpcode()))
304+
if (isVerbose())
305+
emitVGPRBlockComment(MI, *OutStreamer);
306+
272307
MCInst TmpInst;
273308
MCInstLowering.lower(MI, TmpInst);
274309
EmitToStreamer(*OutStreamer, TmpInst);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
255255
bool HasMinimum3Maximum3PKF16 = false;
256256

257257
bool RequiresCOV6 = false;
258+
bool UseBlockVGPROpsForCSR = false;
258259

259260
// Dummy feature to use for assembler in tablegen.
260261
bool FeatureDisable = false;
@@ -1270,6 +1271,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12701271

12711272
bool requiresCodeObjectV6() const { return RequiresCOV6; }
12721273

1274+
bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
1275+
12731276
bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
12741277

12751278
bool hasVALUReadSGPRHazard() const { return getGeneration() == GFX12; }

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1847,6 +1847,110 @@ void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
18471847
}
18481848
}
18491849

1850+
/// Group callee-saved VGPRs into register blocks of up to 32 registers and
/// assign one stack slot per block, so the prolog/epilog can use block
/// load/store instructions instead of one spill per register.
///
/// CSI entries that are merged into a block are removed from the list; the
/// surviving entry is rewritten to refer to the whole block register, given a
/// single (possibly shrunken) frame index, and marked as handled by the
/// target. The per-block transfer mask is recorded in the function info.
static void assignSlotsUsingVGPRBlocks(MachineFunction &MF,
                                       const GCNSubtarget &ST,
                                       const TargetRegisterInfo *TRI,
                                       std::vector<CalleeSavedInfo> &CSI,
                                       unsigned &MinCSFrameIndex,
                                       unsigned &MaxCSFrameIndex) {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  // NOTE: named SITRI (not MRI) to avoid confusion with MachineRegisterInfo.
  const SIRegisterInfo *SITRI = ST.getRegisterInfo();

  assert(std::is_sorted(CSI.begin(), CSI.end(),
                        [](const CalleeSavedInfo &A, const CalleeSavedInfo &B) {
                          return A.getReg() < B.getReg();
                        }) &&
         "Callee saved registers not sorted");

  // A register is eligible for a block spill if it is a VGPR that is neither
  // spilled to another register nor reserved for WWM.
  auto CanUseBlockOps = [&](const CalleeSavedInfo &CS) {
    return !CS.isSpilledToReg() &&
           SITRI->isVGPR(MF.getRegInfo(), CS.getReg()) &&
           !FuncInfo->isWWMReservedRegister(CS.getReg());
  };

  auto CSEnd = CSI.end();
  for (auto CSIt = CSI.begin(); CSIt != CSEnd; ++CSIt) {
    Register Reg = CSIt->getReg();
    if (!CanUseBlockOps(*CSIt))
      continue;

    // Find all the regs that will fit in a 32-bit block starting at the
    // current reg and build the mask. It should have 1 for every register
    // that's included, with the current register as the least significant bit.
    uint32_t Mask = 1;
    CSEnd = std::remove_if(
        CSIt + 1, CSEnd, [&](const CalleeSavedInfo &CS) -> bool {
          if (CanUseBlockOps(CS) && CS.getReg() < Reg + 32) {
            // Unsigned literal: the shift amount can be up to 31, and
            // `1 << 31` is signed overflow (UB in C++17).
            Mask |= 1u << (CS.getReg() - Reg);
            return true;
          }
          return false;
        });

    const TargetRegisterClass *BlockRegClass =
        TII->getRegClassForBlockOp(TRI, MF);
    Register RegBlock =
        SITRI->getMatchingSuperReg(Reg, AMDGPU::sub0, BlockRegClass);
    if (!RegBlock) {
      // We couldn't find a super register for the block. This can happen if
      // the register we started with is too high (e.g. v232 if the maximum is
      // v255). We therefore try to get the last register block and figure out
      // the mask from there.
      Register LastBlockStart =
          AMDGPU::VGPR0 + alignDown(Reg - AMDGPU::VGPR0, 32);
      RegBlock =
          SITRI->getMatchingSuperReg(LastBlockStart, AMDGPU::sub0,
                                     BlockRegClass);
      assert(RegBlock && SITRI->isSubRegister(RegBlock, Reg) &&
             "Couldn't find super register");
      int RegDelta = Reg - LastBlockStart;
      assert(RegDelta > 0 && llvm::countl_zero(Mask) >= RegDelta &&
             "Bad shift amount");
      Mask <<= RegDelta;
    }

    FuncInfo->setMaskForVGPRBlockOps(RegBlock, Mask);

    // The stack objects can be a bit smaller than the register block if we
    // know some of the high bits of Mask are 0. This may happen often with
    // calling conventions where the caller and callee-saved VGPRs are
    // interleaved at a small boundary (e.g. 8 or 16).
    int UnusedBits = llvm::countl_zero(Mask);
    unsigned BlockSize = SITRI->getSpillSize(*BlockRegClass) - UnusedBits * 4;
    int FrameIdx =
        MFI.CreateStackObject(BlockSize, SITRI->getSpillAlign(*BlockRegClass),
                              /*isSpillSlot=*/true);
    if ((unsigned)FrameIdx < MinCSFrameIndex)
      MinCSFrameIndex = FrameIdx;
    if ((unsigned)FrameIdx > MaxCSFrameIndex)
      MaxCSFrameIndex = FrameIdx;

    CSIt->setFrameIdx(FrameIdx);
    CSIt->setReg(RegBlock);
    CSIt->setHandledByTarget();
  }
  // Drop the entries that were folded into blocks.
  CSI.erase(CSEnd, CSI.end());
}
1936+
1937+
bool SIFrameLowering::assignCalleeSavedSpillSlots(
1938+
MachineFunction &MF, const TargetRegisterInfo *TRI,
1939+
std::vector<CalleeSavedInfo> &CSI, unsigned &MinCSFrameIndex,
1940+
unsigned &MaxCSFrameIndex) const {
1941+
if (CSI.empty())
1942+
return true; // Early exit if no callee saved registers are modified!
1943+
1944+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1945+
bool UseVGPRBlocks = ST.useVGPRBlockOpsForCSR();
1946+
1947+
if (UseVGPRBlocks)
1948+
assignSlotsUsingVGPRBlocks(MF, ST, TRI, CSI, MinCSFrameIndex,
1949+
MaxCSFrameIndex);
1950+
1951+
return assignCalleeSavedSpillSlots(MF, TRI, CSI);
1952+
}
1953+
18501954
bool SIFrameLowering::assignCalleeSavedSpillSlots(
18511955
MachineFunction &MF, const TargetRegisterInfo *TRI,
18521956
std::vector<CalleeSavedInfo> &CSI) const {
@@ -1915,6 +2019,101 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
19152019
return true;
19162020
}
19172021

2022+
/// Emit block stores for the callee-saved register blocks that were marked as
/// handled by the target in assignCalleeSavedSpillSlots.
///
/// Returns false so the generic prolog/epilog inserter still spills any
/// remaining (non-block) CSRs; the block entries themselves are skipped there
/// because they are marked isHandledByTarget().
bool SIFrameLowering::spillCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  // One SIMachineFunctionInfo local is enough (the original had two aliases,
  // one confusingly named MFI).
  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  for (const CalleeSavedInfo &CS : CSI) {
    Register Reg = CS.getReg();
    if (!CS.isHandledByTarget())
      continue;

    // Build a scratch block store.
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(Reg);
    int FrameIndex = CS.getFrameIdx();
    MachinePointerInfo PtrInfo =
        MachinePointerInfo::getFixedStack(*MF, FrameIndex);
    MachineMemOperand *MMO =
        MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                 FrameInfo.getObjectSize(FrameIndex),
                                 FrameInfo.getObjectAlign(FrameIndex));

    BuildMI(MBB, MI, MI->getDebugLoc(),
            TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_SAVE))
        .addReg(Reg, getKillRegState(false))
        .addFrameIndex(FrameIndex)
        .addReg(FuncInfo->getStackPtrOffsetReg())
        .addImm(0)
        .addImm(Mask)
        .addMemOperand(MMO);

    FuncInfo->setHasSpilledVGPRs();

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it. Guard against duplicates, matching the restore path.
    if (!MBB.isLiveIn(Reg))
      MBB.addLiveIn(Reg);
  }

  return false;
}
2070+
2071+
// Emit block loads for the callee-saved register blocks marked as handled by
// the target, mirroring spillCalleeSavedRegisters. Returns false so the
// generic code still restores any remaining (non-block) CSRs.
bool SIFrameLowering::restoreCalleeSavedRegisters(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
  MachineFunction *MF = MBB.getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  if (!ST.useVGPRBlockOpsForCSR())
    return false;

  SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *SITRI = static_cast<const SIRegisterInfo *>(TRI);

  // Walk the list back-to-front so restores mirror the save order.
  for (const CalleeSavedInfo &CS : reverse(CSI)) {
    if (!CS.isHandledByTarget())
      continue;

    // Build a scratch block load for this register block.
    Register BlockReg = CS.getReg();
    uint32_t Mask = FuncInfo->getMaskForVGPRBlockOps(BlockReg);
    int FI = CS.getFrameIdx();
    MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FI);
    MachineMemOperand *MMO = MF->getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
        FrameInfo.getObjectAlign(FI));

    auto MIB = BuildMI(MBB, MI, MI->getDebugLoc(),
                       TII->get(AMDGPU::SI_BLOCK_SPILL_V1024_RESTORE), BlockReg)
                   .addFrameIndex(FI)
                   .addReg(FuncInfo->getStackPtrOffsetReg())
                   .addImm(0)
                   .addImm(Mask)
                   .addMemOperand(MMO);
    SITRI->addImplicitUsesForBlockCSRLoad(MIB, BlockReg);

    // Add the register to the liveins. This is necessary because if any of the
    // VGPRs in the register block is reserved (e.g. if it's a WWM register),
    // then the whole block will be marked as reserved and `updateLiveness`
    // will skip it.
    if (!MBB.isLiveIn(BlockReg))
      MBB.addLiveIn(BlockReg);
  }

  return false;
}
2116+
19182117
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
19192118
MachineFunction &MF,
19202119
MachineBasicBlock &MBB,

llvm/lib/Target/AMDGPU/SIFrameLowering.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,23 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
4949
const TargetRegisterInfo *TRI,
5050
std::vector<CalleeSavedInfo> &CSI) const override;
5151

52+
bool assignCalleeSavedSpillSlots(MachineFunction &MF,
53+
const TargetRegisterInfo *TRI,
54+
std::vector<CalleeSavedInfo> &CSI,
55+
unsigned &MinCSFrameIndex,
56+
unsigned &MaxCSFrameIndex) const override;
57+
58+
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
59+
MachineBasicBlock::iterator MI,
60+
ArrayRef<CalleeSavedInfo> CSI,
61+
const TargetRegisterInfo *TRI) const override;
62+
63+
bool
64+
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
65+
MachineBasicBlock::iterator MI,
66+
MutableArrayRef<CalleeSavedInfo> CSI,
67+
const TargetRegisterInfo *TRI) const override;
68+
5269
bool allocateScavengingFrameIndexesNearIncomingSP(
5370
const MachineFunction &MF) const override;
5471

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5769,6 +5769,16 @@ const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
57695769
IsAllocatable);
57705770
}
57715771

5772+
// Return the register class used for the data operand of block spill ops.
// It is derived from the vdata operand of SCRATCH_STORE_BLOCK_SADDR; the
// block load uses the same class.
const TargetRegisterClass *
SIInstrInfo::getRegClassForBlockOp(const TargetRegisterInfo *TRI,
                                   const MachineFunction &MF) const {
  const MCInstrDesc &StoreBlockDesc = get(AMDGPU::SCRATCH_STORE_BLOCK_SADDR);
  int VDataIdx = AMDGPU::getNamedOperandIdx(StoreBlockDesc.getOpcode(),
                                            AMDGPU::OpName::vdata);
  return getRegClass(StoreBlockDesc, VDataIdx, TRI, MF);
}
5781+
57725782
const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
57735783
unsigned OpNo) const {
57745784
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();

0 commit comments

Comments
 (0)