Skip to content

Commit 428ae0f

Browse files
authored
AMDGPU: Do not tail call if an inreg argument requires waterfalling (#111002)
If we have a divergent value passed to an outgoing inreg argument, the call needs to be executed in a waterfall loop and thus cannot be tail called. The waterfall handling of arbitrary calls is broken on the selectiondag path, so some of these cases still hit an error later. I also noticed the argument evaluation code in isEligibleForTailCallOptimization is not correctly accounting for implicit argument assignments. It also seems inreg codegen is generally broken; we are assigning arguments to the reserved private resource descriptor.
1 parent 6a1bdd9 commit 428ae0f

File tree

7 files changed

+510
-123
lines changed

7 files changed

+510
-123
lines changed

llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1142,6 +1142,9 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
11421142
return false;
11431143
}
11441144

1145+
// FIXME: We need to check if any arguments passed in SGPR are uniform. If
1146+
// they are not, this cannot be a tail call. If they are uniform, but may be
1147+
// VGPR, we need to insert readfirstlanes.
11451148
if (!areCalleeOutgoingArgsTailCallable(Info, MF, OutArgs))
11461149
return false;
11471150

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 49 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3565,7 +3565,6 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
35653565
if (IsVarArg)
35663566
return false;
35673567

3568-
// FIXME: We need to know all arguments passed in SGPR are uniform.
35693568
for (const Argument &Arg : CallerF.args()) {
35703569
if (Arg.hasByValAttr())
35713570
return false;
@@ -3593,6 +3592,8 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
35933592
SmallVector<CCValAssign, 16> ArgLocs;
35943593
CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
35953594

3595+
// FIXME: We are not allocating special input registers, so we will be
3596+
// deciding based on incorrect register assignments.
35963597
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
35973598

35983599
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -3602,6 +3603,21 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
36023603
if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
36033604
return false;
36043605

3606+
for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3607+
// FIXME: What about inreg arguments that end up passed in memory?
3608+
if (!CCVA.isRegLoc())
3609+
continue;
3610+
3611+
// If we are passing an argument in an SGPR, and the value is divergent,
3612+
// this call requires a waterfall loop.
3613+
if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3614+
LLVM_DEBUG(
3615+
dbgs() << "Cannot tail call due to divergent outgoing argument in "
3616+
<< printReg(CCVA.getLocReg(), TRI) << '\n');
3617+
return false;
3618+
}
3619+
}
3620+
36053621
const MachineRegisterInfo &MRI = MF.getRegInfo();
36063622
return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
36073623
}
@@ -3734,6 +3750,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
37343750
// arguments to begin at SP+0. Completely unused for non-tail calls.
37353751
int32_t FPDiff = 0;
37363752
MachineFrameInfo &MFI = MF.getFrameInfo();
3753+
auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
37373754

37383755
// Adjust the stack pointer for the new arguments...
37393756
// These operations are automatically eliminated by the prolog/epilog pass
@@ -3756,6 +3773,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
37563773
}
37573774
}
37583775

3776+
const unsigned NumSpecialInputs = RegsToPass.size();
3777+
37593778
MVT PtrVT = MVT::i32;
37603779

37613780
// Walk the register/memloc assignments, inserting copies/loads.
@@ -3857,16 +3876,40 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
38573876
if (!MemOpChains.empty())
38583877
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
38593878

3879+
SDValue ReadFirstLaneID =
3880+
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3881+
3882+
SDValue TokenGlue;
3883+
if (CLI.ConvergenceControlToken) {
3884+
TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3885+
CLI.ConvergenceControlToken);
3886+
}
3887+
38603888
// Build a sequence of copy-to-reg nodes chained together with token chain
38613889
// and flag operands which copy the outgoing args into the appropriate regs.
38623890
SDValue InGlue;
3863-
for (auto &RegToPass : RegsToPass) {
3864-
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3865-
RegToPass.second, InGlue);
3891+
3892+
unsigned ArgIdx = 0;
3893+
for (auto [Reg, Val] : RegsToPass) {
3894+
if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
3895+
TRI->isSGPRPhysReg(Reg)) {
3896+
// Speculatively insert a readfirstlane in case this is a uniform value in
3897+
// a VGPR.
3898+
//
3899+
// FIXME: We need to execute this in a waterfall loop if it is a divergent
3900+
// value, so let that continue to produce invalid code.
3901+
3902+
SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3903+
if (TokenGlue)
3904+
ReadfirstlaneArgs.push_back(TokenGlue);
3905+
Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3906+
ReadfirstlaneArgs);
3907+
}
3908+
3909+
Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
38663910
InGlue = Chain.getValue(1);
38673911
}
38683912

3869-
38703913
// We don't usually want to end the call-sequence here because we would tidy
38713914
// the frame up *after* the call, however in the ABI-changing tail-call case
38723915
// we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3896,12 +3939,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
38963939
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
38973940

38983941
SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3899-
if (CLI.ConvergenceControlToken) {
3900-
SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {},
3901-
MVT::Glue, CLI.ConvergenceControlToken);
3942+
if (TokenGlue)
39023943
ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3903-
}
3904-
39053944
Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
39063945
ReadfirstlaneArgs);
39073946
}
@@ -3928,7 +3967,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
39283967
}
39293968

39303969
// Add a register mask operand representing the call-preserved registers.
3931-
auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
39323970
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
39333971
assert(Mask && "Missing call preserved mask for calling convention");
39343972
Ops.push_back(DAG.getRegisterMask(Mask));

llvm/lib/Target/AMDGPU/SIRegisterInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
210210
}
211211

212212
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const;
213+
bool isSGPRPhysReg(Register Reg) const {
214+
return isSGPRClass(getPhysRegBaseClass(Reg));
215+
}
213216

214217
/// \returns true if this class contains only VGPR registers
215218
static bool isVGPRClass(const TargetRegisterClass *RC) {

0 commit comments

Comments
 (0)