Skip to content

Commit b9e094e

Browse files
committed
[AMDGPU] Allocate scratch space for dVGPRs for CWSR llvm#130055
1 parent 29dde91 commit b9e094e

17 files changed

+474
-44
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -6009,8 +6009,13 @@ Frame Pointer
60096009

60106010
If the kernel needs a frame pointer for the reasons defined in
60116011
``SIFrameLowering`` then SGPR33 is used and is always set to ``0`` in the
6012-
kernel prolog. If a frame pointer is not required then all uses of the frame
6013-
pointer are replaced with immediate ``0`` offsets.
6012+
kernel prolog. On GFX12+, when dynamic VGPRs are enabled, the prolog will
6013+
check if the kernel is running on a compute queue, and if so it will reserve
6014+
some scratch space for any dynamic VGPRs that might need to be saved by the
6015+
CWSR trap handler. In this case, instead of ``0``, the frame pointer will be initialized to
6016+
a suitably aligned offset above this reserved area. If a frame pointer is not
6017+
required then all uses of the frame pointer are replaced with immediate ``0``
6018+
offsets.
60146019

60156020
.. _amdgpu-amdhsa-kernel-prolog-flat-scratch:
60166021

@@ -17137,33 +17142,35 @@ within a map that has been added by the same *vendor-name*.
1713717142
.. table:: AMDPAL Code Object Hardware Stage Metadata Map
1713817143
:name: amdgpu-amdpal-code-object-hardware-stage-metadata-map-table
1713917144

17140-
========================== ============== ========= ===============================================================
17141-
String Key Value Type Required? Description
17142-
========================== ============== ========= ===============================================================
17143-
".entry_point" string The ELF symbol pointing to this pipeline's stage entry point.
17144-
".scratch_memory_size" integer Scratch memory size in bytes.
17145-
".lds_size" integer Local Data Share size in bytes.
17146-
".perf_data_buffer_size" integer Performance data buffer size in bytes.
17147-
".vgpr_count" integer Number of VGPRs used.
17148-
".agpr_count" integer Number of AGPRs used.
17149-
".sgpr_count" integer Number of SGPRs used.
17150-
".vgpr_limit" integer If non-zero, indicates the shader was compiled with a
17151-
directive to instruct the compiler to limit the VGPR usage to
17152-
be less than or equal to the specified value (only set if
17153-
different from HW default).
17154-
".sgpr_limit" integer SGPR count upper limit (only set if different from HW
17155-
default).
17156-
".threadgroup_dimensions" sequence of Thread-group X/Y/Z dimensions (Compute only).
17157-
3 integers
17158-
".wavefront_size" integer Wavefront size (only set if different from HW default).
17159-
".uses_uavs" boolean The shader reads or writes UAVs.
17160-
".uses_rovs" boolean The shader reads or writes ROVs.
17161-
".writes_uavs" boolean The shader writes to one or more UAVs.
17162-
".writes_depth" boolean The shader writes out a depth value.
17163-
".uses_append_consume" boolean The shader uses append and/or consume operations, either
17164-
memory or GDS.
17165-
".uses_prim_id" boolean The shader uses PrimID.
17166-
========================== ============== ========= ===============================================================
17145+
=========================== ============== ========= ===============================================================
17146+
String Key Value Type Required? Description
17147+
=========================== ============== ========= ===============================================================
17148+
".entry_point" string The ELF symbol pointing to this pipeline's stage entry point.
17149+
".scratch_memory_size" integer Scratch memory size in bytes.
17150+
".lds_size" integer Local Data Share size in bytes.
17151+
".perf_data_buffer_size" integer Performance data buffer size in bytes.
17152+
".vgpr_count" integer Number of VGPRs used.
17153+
".agpr_count" integer Number of AGPRs used.
17154+
".sgpr_count" integer Number of SGPRs used.
17155+
".dynamic_vgpr_saved_count" integer No Number of dynamic VGPRs that can be stored in scratch by the
17156+
CWSR trap handler. Only used on GFX12+.
17157+
".vgpr_limit" integer If non-zero, indicates the shader was compiled with a
17158+
directive to instruct the compiler to limit the VGPR usage to
17159+
be less than or equal to the specified value (only set if
17160+
different from HW default).
17161+
".sgpr_limit" integer SGPR count upper limit (only set if different from HW
17162+
default).
17163+
".threadgroup_dimensions" sequence of Thread-group X/Y/Z dimensions (Compute only).
17164+
3 integers
17165+
".wavefront_size" integer Wavefront size (only set if different from HW default).
17166+
".uses_uavs" boolean The shader reads or writes UAVs.
17167+
".uses_rovs" boolean The shader reads or writes ROVs.
17168+
".writes_uavs" boolean The shader writes to one or more UAVs.
17169+
".writes_depth" boolean The shader writes out a depth value.
17170+
".uses_append_consume" boolean The shader uses append and/or consume operations, either
17171+
memory or GDS.
17172+
".uses_prim_id" boolean The shader uses PrimID.
17173+
=========================== ============== ========= ===============================================================
1716717174

1716817175
..
1716917176

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1442,8 +1442,15 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
14421442
MD->setEntryPoint(CC, MF.getFunction().getName());
14431443
MD->setNumUsedVgprs(CC, CurrentProgramInfo.NumVGPRsForWavesPerEU, Ctx);
14441444

1445-
// Only set AGPRs for supported devices
1445+
// For targets that support dynamic VGPRs, set the number of saved dynamic
1446+
// VGPRs (if any) in the PAL metadata.
14461447
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
1448+
if (STM.isDynamicVGPREnabled() &&
1449+
MFI->getScratchReservedForDynamicVGPRs() > 0)
1450+
MD->setHwStage(CC, ".dynamic_vgpr_saved_count",
1451+
MFI->getScratchReservedForDynamicVGPRs() / 4);
1452+
1453+
// Only set AGPRs for supported devices
14471454
if (STM.hasMAIInsts()) {
14481455
MD->setNumUsedAgprs(CC, CurrentProgramInfo.NumAccVGPR);
14491456
}

llvm/lib/Target/AMDGPU/SIDefines.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -552,6 +552,7 @@ enum Id { // HwRegCode, (6) [5:0]
552552

553553
// Values for the Offset operand of the hwreg encoding (presumably the bit
// position of a field inside a hardware register - TODO confirm). Both
// enumerators are 8: per its trailing comment OFFSET_ME_ID applies to HW_ID2,
// so it names a field of a different register than OFFSET_MEM_VIOL and the
// shared numeric value is intentional, not a clash.
enum Offset : unsigned { // Offset, (5) [10:6]
554554
OFFSET_MEM_VIOL = 8,
555+
OFFSET_ME_ID = 8, // MicroEngine ID field, in HW_ID2
555556
};
556557

557558
enum ModeRegisterMasks : uint32_t {

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 62 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -844,17 +844,62 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
844844
}
845845
assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);
846846

847-
if (hasFP(MF)) {
847+
unsigned Offset = FrameInfo.getStackSize() * getScratchScaleFactor(ST);
848+
if (!mayReserveScratchForCWSR(MF)) {
849+
if (hasFP(MF)) {
850+
Register FPReg = MFI->getFrameOffsetReg();
851+
assert(FPReg != AMDGPU::FP_REG);
852+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
853+
}
854+
855+
if (requiresStackPointerReference(MF)) {
856+
Register SPReg = MFI->getStackPtrOffsetReg();
857+
assert(SPReg != AMDGPU::SP_REG);
858+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
859+
}
860+
} else {
861+
// We need to check if we're on a compute queue - if we are, then the CWSR
862+
// trap handler may need to store some VGPRs on the stack. The first VGPR
863+
// block is saved separately, so we only need to allocate space for any
864+
// additional VGPR blocks used. For now, we will make sure there's enough
865+
// room for the theoretical maximum number of VGPRs that can be allocated.
866+
// FIXME: Figure out if the shader uses fewer VGPRs in practice.
867+
assert(hasFP(MF));
848868
Register FPReg = MFI->getFrameOffsetReg();
849869
assert(FPReg != AMDGPU::FP_REG);
850-
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
851-
}
852-
853-
if (requiresStackPointerReference(MF)) {
854-
Register SPReg = MFI->getStackPtrOffsetReg();
855-
assert(SPReg != AMDGPU::SP_REG);
856-
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
857-
.addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
870+
unsigned VGPRSize =
871+
llvm::alignTo((ST.getAddressableNumVGPRs() -
872+
AMDGPU::IsaInfo::getVGPRAllocGranule(&ST)) *
873+
4,
874+
FrameInfo.getMaxAlign());
875+
MFI->setScratchReservedForDynamicVGPRs(VGPRSize);
876+
877+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), FPReg)
878+
.addImm(AMDGPU::Hwreg::HwregEncoding::encode(
879+
AMDGPU::Hwreg::ID_HW_ID2, AMDGPU::Hwreg::OFFSET_ME_ID, 2));
880+
// The MicroEngine ID is 0 for the graphics queue, and 1 or 2 for compute
881+
// (3 is unused, so we ignore it). Unfortunately, S_GETREG doesn't set
882+
// SCC, so we need to check for 0 manually.
883+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32)).addImm(0).addReg(FPReg);
884+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), FPReg).addImm(VGPRSize);
885+
if (requiresStackPointerReference(MF)) {
886+
Register SPReg = MFI->getStackPtrOffsetReg();
887+
assert(SPReg != AMDGPU::SP_REG);
888+
889+
// If at least one of the constants can be inlined, then we can use
890+
// s_cselect. Otherwise, use a mov and cmovk.
891+
if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm()) ||
892+
AMDGPU::isInlinableLiteral32(Offset + VGPRSize,
893+
ST.hasInv2PiInlineImm())) {
894+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CSELECT_B32), SPReg)
895+
.addImm(Offset + VGPRSize)
896+
.addImm(Offset);
897+
} else {
898+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg).addImm(Offset);
899+
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CMOVK_I32), SPReg)
900+
.addImm(Offset + VGPRSize);
901+
}
902+
}
858903
}
859904

860905
bool NeedsFlatScratchInit =
@@ -2182,9 +2227,17 @@ bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const {
21822227
return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
21832228
MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
21842229
MF) ||
2230+
mayReserveScratchForCWSR(MF) ||
21852231
MF.getTarget().Options.DisableFramePointerElim(MF);
21862232
}
21872233

2234+
// Returns true if the entry prologue of \p MF may need to reserve scratch
// space for the CWSR trap handler. All three conditions must hold:
//  - the subtarget has dynamic VGPRs enabled,
//  - the function's calling convention is an entry-function convention, and
//  - that calling convention is a compute convention.
// This is a static, per-function answer; whether the wave is actually running
// on a compute queue is checked at run time by the emitted prologue.
bool SIFrameLowering::mayReserveScratchForCWSR(
2235+
const MachineFunction &MF) const {
2236+
return MF.getSubtarget<GCNSubtarget>().isDynamicVGPREnabled() &&
2237+
AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv()) &&
2238+
AMDGPU::isCompute(MF.getFunction().getCallingConv());
2239+
}
2240+
21882241
// This is essentially a reduced version of hasFP for entry functions. Since the
21892242
// stack pointer is known 0 on entry to kernels, we never really need an FP
21902243
// register. We may need to initialize the stack pointer depending on the frame

llvm/lib/Target/AMDGPU/SIFrameLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,10 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
107107

108108
public:
109109
bool requiresStackPointerReference(const MachineFunction &MF) const;
110+
111+
// Returns true if the function may need to reserve space on the stack for the
112+
// CWSR trap handler.
113+
bool mayReserveScratchForCWSR(const MachineFunction &MF) const;
110114
};
111115

112116
} // end namespace llvm

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -714,7 +714,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
714714
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
715715
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
716716
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
717-
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) {
717+
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
718+
ScratchReservedForDynamicVGPRs(MFI.getScratchReservedForDynamicVGPRs()) {
718719
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
719720
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
720721

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
299299

300300
bool HasInitWholeWave = false;
301301

302+
unsigned ScratchReservedForDynamicVGPRs = 0;
303+
302304
SIMachineFunctionInfo() = default;
303305
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
304306
const TargetRegisterInfo &TRI,
@@ -350,6 +352,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
350352
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
351353
StringValue());
352354
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
355+
YamlIO.mapOptional("scratchReservedForDynamicVGPRs",
356+
MFI.ScratchReservedForDynamicVGPRs, 0);
353357
}
354358
};
355359

@@ -474,6 +478,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
474478
unsigned NumSpilledSGPRs = 0;
475479
unsigned NumSpilledVGPRs = 0;
476480

481+
// The size of the scratch space reserved for the CWSR trap handler to spill
482+
// some of the dynamic VGPRs.
483+
unsigned ScratchReservedForDynamicVGPRs = 0;
484+
477485
// Tracks information about user SGPRs that will be setup by hardware which
478486
// will apply to all wavefronts of the grid.
479487
GCNUserSGPRUsageInfo UserSGPRInfo;
@@ -837,6 +845,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
837845
BytesInStackArgArea = Bytes;
838846
}
839847

848+
// Returns the size of the scratch area reserved for the CWSR trap handler to
// save dynamic VGPRs, or 0 if nothing was reserved. This is only used if we
// need to save any dynamic VGPRs in scratch.
// NOTE(review): the unit looks like bytes - the PAL metadata emitter divides
// this value by 4 to obtain a VGPR count - confirm.
849+
unsigned getScratchReservedForDynamicVGPRs() const {
850+
return ScratchReservedForDynamicVGPRs;
851+
}
852+
853+
// Records the size of the scratch area reserved for saving dynamic VGPRs.
// The entry-function prologue emission calls this with the aligned size it
// computed for the reserved region.
void setScratchReservedForDynamicVGPRs(unsigned Size) {
854+
ScratchReservedForDynamicVGPRs = Size;
855+
}
856+
840857
// Add user SGPRs.
841858
Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
842859
Register addDispatchPtr(const SIRegisterInfo &TRI);

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,7 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
512512
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
513513
const SIFrameLowering *TFI = ST.getFrameLowering();
514514
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
515+
515516
// During ISel lowering we always reserve the stack pointer in entry and chain
516517
// functions, but never actually want to reference it when accessing our own
517518
// frame. If we need a frame pointer we use it, but otherwise we can just use

0 commit comments

Comments
 (0)