
Commit 01b87dc

[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic
This intrinsic is meant to be used in functions that have a "tail" that needs to be run with all the lanes enabled. The "tail" may contain complex control flow that makes it unsuitable for the use of the existing WWM intrinsics. Instead, we will pretend that the function starts with all the lanes enabled, then branches into the actual body of the function for the lanes that were meant to run it, and then finally all the lanes will rejoin and run the tail.

As such, the intrinsic will return the EXEC mask for the body of the function, and is meant to be used only as part of a very limited pattern (for now only in amdgpu_cs_chain functions):

```
entry:
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %func_exec, label %func, label %tail

func:
  ; ... stuff that should run with the actual EXEC mask
  br label %tail

tail:
  ; ... stuff that runs with all the lanes enabled;
  ; can contain more than one basic block
```

It's an error to use the result of this intrinsic for anything other than a branch (but unfortunately checking that in the verifier is non-trivial, because SIAnnotateControlFlow will introduce an amdgcn.if between the intrinsic and the branch).

The intrinsic is lowered to a SI_INIT_WHOLE_WAVE pseudo, which for now is expanded in si-wqm (which is where SI_INIT_EXEC is handled too); however, the information that the function was conceptually started in whole wave mode is stored in the machine function info (hasInitWholeWave). This will be useful in prolog/epilog insertion, where we can skip saving the inactive lanes for CSRs (since if the function started with all the lanes active, there are no inactive lanes to preserve).
1 parent 32cef07 · commit 01b87dc

21 files changed: +2033 −4 lines
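To make the intended shape concrete, here is a hedged sketch of a complete `amdgpu_cs_chain` function using the pattern from the commit message (the function name, arguments, and block contents are illustrative, not taken from this commit):

```llvm
; Sketch only: @example and its arguments are hypothetical.
declare i1 @llvm.amdgcn.init.whole.wave()

define amdgpu_cs_chain void @example(<3 x i32> inreg %sgpr, i32 %vgpr) {
entry:
  ; Conceptually all lanes are active here; %func_exec is true only in the
  ; lanes that were actually active when the function was entered.
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %func_exec, label %func, label %tail

func:                                           ; runs with the entry EXEC mask
  %v = add i32 %vgpr, 1
  br label %tail

tail:                                           ; runs with all lanes enabled
  ; The %entry incoming value supplies the inactive lanes; see the phi
  ; handling added to SIISelLowering.cpp below.
  %r = phi i32 [ %vgpr, %entry ], [ %v, %func ]
  ; A real chain function would normally end in a tail call to
  ; llvm.amdgcn.cs.chain (elided here).
  unreachable
}
```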

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 14 additions & 0 deletions
@@ -208,6 +208,20 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
    IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

+// Sets the function into whole-wave-mode and returns whether the lane was
+// active when entering the function. A branch depending on this return will
+// revert the EXEC mask to what it was when entering the function, thus
+// resulting in a no-op. This pattern is used to optimize branches when function
+// tails need to be run in whole-wave-mode. It may also have other consequences
+// (mostly related to WWM CSR handling) that differentiate it from using
+// a plain `amdgcn.init.exec -1`.
+//
+// Can only be used in functions with the `amdgpu_cs_chain` calling convention.
+// Using this intrinsic without immediately branching on its return value is an
+// error.
+def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
+    IntrHasSideEffects, IntrNoMem, IntrNoDuplicate, IntrConvergent]>;
+
 def int_amdgcn_wavefrontsize :
   ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
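To illustrate the constraint in the comment above, a use like the following would be invalid (hedged sketch; not from the patch):

```llvm
; Invalid: the i1 result must feed a conditional branch directly.
%active = call i1 @llvm.amdgcn.init.whole.wave()
%bad = zext i1 %active to i32   ; error: result used as data, not branched on
```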

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     SelectDSBvhStackIntrinsic(N);
     return;
+  case Intrinsic::amdgcn_init_whole_wave:
+    CurDAG->getMachineFunction()
+        .getInfo<SIMachineFunctionInfo>()
+        ->setInitWholeWave();
+    break;
   }

   SelectCode(N);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }

+bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+  MFInfo->setInitWholeWave();
+  return selectImpl(MI, *CoverageInfo);
+}
+
 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectDSAppendConsume(I, true);
   case Intrinsic::amdgcn_ds_consume:
     return selectDSAppendConsume(I, false);
+  case Intrinsic::amdgcn_init_whole_wave:
+    return selectInitWholeWave(I);
   case Intrinsic::amdgcn_s_barrier:
     return selectSBarrier(I);
   case Intrinsic::amdgcn_raw_buffer_load_lds:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+  bool selectInitWholeWave(MachineInstr &MI) const;
   bool selectSBarrier(MachineInstr &MI) const;
   bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 5 additions & 0 deletions
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;

+  bool HasInitWholeWave = false;
+
 public:
   AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);

@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return WaveLimiter;
   }

+  bool hasInitWholeWave() const { return HasInitWholeWave; }
+  void setInitWholeWave() { HasInitWholeWave = true; }
+
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
     return allocateLDSGlobal(DL, GV, DynLDSAlign);
   }

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
@@ -4997,6 +4997,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
     break;
   }
+  case Intrinsic::amdgcn_init_whole_wave:
   case Intrinsic::amdgcn_live_mask: {
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
     break;

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
 def : SourceOfDivergence<int_amdgcn_writelane>;
+def : SourceOfDivergence<int_amdgcn_init_whole_wave>;

 foreach intr = AMDGPUMFMAIntrinsics908 in
 def : SourceOfDivergence<intr>;

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
@@ -1739,6 +1739,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
                            ? DenormalMode::IEEE
                            : DenormalMode::PreserveSign;

+  if (YamlMFI.HasInitWholeWave)
+    MFI->setInitWholeWave();
+
   return false;
 }

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 4 deletions
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(

   // Allocate spill slots for WWM reserved VGPRs.
   // For chain functions, we only need to do this if we have calls to
-  // llvm.amdgcn.cs.chain.
-  bool IsChainWithoutCalls =
-      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
-  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
+  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
+  // chain functions do not return) and the function did not contain a call to
+  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
+  // when entering the function).
+  bool IsChainWithoutRestores =
+      FuncInfo->isChainFunction() &&
+      (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
+  if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 133 additions & 0 deletions
@@ -15677,6 +15677,133 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
   }
 }

+static void removeInitWholeWaveBranch(MachineFunction &MF,
+                                      MachineRegisterInfo &MRI,
+                                      const SIInstrInfo *TII) {
+  // Remove SI_INIT_WHOLE_WAVE and the following SI_IF/END_CF and instead set
+  // EXEC to -1 at SI_END_CF.
+  auto IWWIt = find_if(MF.begin()->instrs(), [](const MachineInstr &MI) {
+    return MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE;
+  });
+  if (IWWIt == MF.begin()->instr_end())
+    return; // We've been here before (GISel runs finalizeLowering twice).
+
+  MachineInstr &If = *MRI.use_begin(IWWIt->getOperand(0).getReg())->getParent();
+  assert(If.getOpcode() == AMDGPU::SI_IF &&
+         "Unexpected user for init.whole.wave result");
+  assert(MRI.hasOneUse(IWWIt->getOperand(0).getReg()) &&
+         "Expected simple control flow");
+
+  MachineInstr &EndCf = *MRI.use_begin(If.getOperand(0).getReg())->getParent();
+  MachineBasicBlock *EndBB = EndCf.getParent();
+
+  // Update all the Phis: since we're removing a predecessor, we need to remove
+  // the corresponding pair of operands. However, we can't just drop the value
+  // coming from the 'if' block - that's going to be the value of the inactive
+  // lanes.
+  // %v = phi (%inactive, %if), (%active1, %shader1), ... (%activeN, %shaderN)
+  // should become
+  // %t = phi (%active1, %shader1), ... (%activeN, %shaderN)
+  // %v = v_set_inactive %t, %inactive
+  // Note that usually EndCf will be the first instruction after the phis and as
+  // such will serve as the end of the range when iterating over phis.
+  // Therefore, we shouldn't introduce any new instructions before it.
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  auto AfterEndCf = std::next(EndCf.getIterator());
+  for (auto &Phi : EndBB->phis()) {
+    Register PhiDest = Phi.getOperand(0).getReg();
+    const TargetRegisterClass *PhiRC = MRI.getRegClass(PhiDest);
+
+    Register NewPhiDest = MRI.createVirtualRegister(PhiRC);
+    Phi.getOperand(0).setReg(NewPhiDest);
+
+    unsigned InactiveOpIdx = 0;
+    for (unsigned I = 1; I < Phi.getNumOperands(); I += 2) {
+      if (Phi.getOperand(I + 1).getMBB() == If.getParent()) {
+        InactiveOpIdx = I;
+        break;
+      }
+    }
+    assert(InactiveOpIdx != 0 && "Broken phi?");
+
+    // At this point, the register class could be larger than 32 or 64, so we
+    // might have to use more than one V_SET_INACTIVE instruction.
+    unsigned Size = TRI.getRegSizeInBits(*PhiRC);
+    switch (Size) {
+    case 32:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B32), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    case 64:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B64), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    default: {
+      // For each 32-bit subregister of the register at InactiveOpIdx, insert
+      // a COPY to a new register, and a V_SET_INACTIVE_B32 using the
+      // corresponding subregisters of PhiDest and NewPhiDest.
+      // FIXME: There has to be a better way to iterate over this...
+      llvm::SmallVector<Register, 16> PhiSubRegs;
+      const unsigned SubRegIndices[] = {
+          AMDGPU::sub0,  AMDGPU::sub1,  AMDGPU::sub2,  AMDGPU::sub3,
+          AMDGPU::sub4,  AMDGPU::sub5,  AMDGPU::sub6,  AMDGPU::sub7,
+          AMDGPU::sub8,  AMDGPU::sub9,  AMDGPU::sub10, AMDGPU::sub11,
+          AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+          AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+          AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+          AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+          AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31};
+      const unsigned NumSubRegs = Size / 32;
+      assert(sizeof(SubRegIndices) / sizeof(SubRegIndices[0]) >= NumSubRegs &&
+             "Not enough subregister indices");
+      for (unsigned I = 0; I != NumSubRegs; ++I) {
+        unsigned SubRegIdx = SubRegIndices[I];
+        Register InactiveSubReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(), TII->get(AMDGPU::COPY),
+                InactiveSubReg)
+            .addReg(Phi.getOperand(InactiveOpIdx).getReg(), 0, SubRegIdx);
+
+        Register AllLanesSubReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+                TII->get(AMDGPU::V_SET_INACTIVE_B32), AllLanesSubReg)
+            .addReg(NewPhiDest, 0, SubRegIdx)
+            .addReg(InactiveSubReg);
+        PhiSubRegs.push_back(AllLanesSubReg);
+      }
+      // Now we need to combine the subregisters into the original register.
+      auto RegSequence = BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+                                 TII->get(AMDGPU::REG_SEQUENCE), PhiDest);
+      for (unsigned I = 0; I < NumSubRegs; ++I) {
+        RegSequence.addReg(PhiSubRegs[I]);
+        RegSequence.addImm(SubRegIndices[I]);
+      }
+      break;
+    }
+    }
+
+    Phi.removeOperand(InactiveOpIdx + 1);
+    Phi.removeOperand(InactiveOpIdx);
+  }
+  If.getParent()->removeSuccessor(EndBB);
+
+  BuildMI(*EndBB, AfterEndCf, IWWIt->getDebugLoc(),
+          TII->get(MF.getSubtarget<GCNSubtarget>().isWave32()
+                       ? AMDGPU::S_MOV_B32
+                       : AMDGPU::S_MOV_B64),
+          TII->getRegisterInfo().getExec())
+      .addImm(-1);
+
+  EndCf.eraseFromParent();
+  If.eraseFromParent();
+  IWWIt->eraseFromParent();
+}
+
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -15687,6 +15814,12 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();

+  if (Info->hasInitWholeWave()) {
+    assert(Info->isChainFunction() &&
+           "init.whole.wave may only be used in chain functions");
+    removeInitWholeWaveBranch(MF, MRI, TII);
+  }
+
   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.
     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
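Putting the pieces of `removeInitWholeWaveBranch` together, its net effect on the tail block looks roughly like this (pseudo-MIR sketch; register and block names are illustrative):

```
; Before: the inactive lanes reach the tail straight from the entry block.
;   %v = PHI %inactive:%entry, %active:%func
;   SI_END_CF %saved_exec
;
; After: the entry edge is removed, SI_INIT_WHOLE_WAVE / SI_IF / SI_END_CF are
; erased, and the inactive-lane value is merged back in explicitly:
;   %t = PHI %active:%func
;   %v = V_SET_INACTIVE_B32 %t, %inactive  ; active lanes: %t, inactive: %inactive
;   $exec = S_MOV_B64 -1                   ; S_MOV_B32 on wave32
```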

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 8 additions & 0 deletions
@@ -583,6 +583,14 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   let Defs = [EXEC];
 }

+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
+  (outs SReg_1:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
+  let Defs = [EXEC];
+  let Uses = [EXEC];
+}
+
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
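Given that description (save the old EXEC, then enable all lanes), the si-wqm expansion of the pseudo is presumably a single save-exec instruction along these lines (an assumption on my part; the expansion itself is not part of this diff):

```
; Hypothetical wave64 expansion (wave32 would use s_or_saveexec_b32):
$dst = S_OR_SAVEEXEC_B64 -1   ; $dst = old EXEC, then EXEC |= -1 (all lanes)
```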

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions
@@ -289,6 +289,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   StringValue SGPRForEXECCopy;
   StringValue LongBranchReservedReg;

+  bool HasInitWholeWave = false;
+
   SIMachineFunctionInfo() = default;
   SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
                         const TargetRegisterInfo &TRI,
@@ -336,6 +338,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                        StringValue());
+    YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
   }
 };
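With the mapping above, a MIR dump of a function that used the intrinsic should carry the flag in its YAML block, presumably like this (sketch; only the key name comes from the patch):

```
machineFunctionInfo:
  hasInitWholeWave: true
```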
