@@ -3565,7 +3565,6 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
3565
3565
if (IsVarArg)
3566
3566
return false;
3567
3567
3568
- // FIXME: We need to know all arguments passed in SGPR are uniform.
3569
3568
for (const Argument &Arg : CallerF.args()) {
3570
3569
if (Arg.hasByValAttr())
3571
3570
return false;
@@ -3593,6 +3592,8 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
3593
3592
SmallVector<CCValAssign, 16> ArgLocs;
3594
3593
CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3595
3594
3595
+ // FIXME: We are not allocating special input registers, so we will be
3596
+ // deciding based on incorrect register assignments.
3596
3597
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3597
3598
3598
3599
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -3602,6 +3603,21 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
3602
3603
if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3603
3604
return false;
3604
3605
3606
+ for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
3607
+ // FIXME: What about inreg arguments that end up passed in memory?
3608
+ if (!CCVA.isRegLoc())
3609
+ continue;
3610
+
3611
+ // If we are passing an argument in an SGPR, and the value is divergent,
3612
+ // this call requires a waterfall loop.
3613
+ if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
3614
+ LLVM_DEBUG(
3615
+ dbgs() << "Cannot tail call due to divergent outgoing argument in "
3616
+ << printReg(CCVA.getLocReg(), TRI) << '\n');
3617
+ return false;
3618
+ }
3619
+ }
3620
+
3605
3621
const MachineRegisterInfo &MRI = MF.getRegInfo();
3606
3622
return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3607
3623
}
@@ -3734,6 +3750,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3734
3750
// arguments to begin at SP+0. Completely unused for non-tail calls.
3735
3751
int32_t FPDiff = 0;
3736
3752
MachineFrameInfo &MFI = MF.getFrameInfo();
3753
+ auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3737
3754
3738
3755
// Adjust the stack pointer for the new arguments...
3739
3756
// These operations are automatically eliminated by the prolog/epilog pass
@@ -3756,6 +3773,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3756
3773
}
3757
3774
}
3758
3775
3776
+ const unsigned NumSpecialInputs = RegsToPass.size();
3777
+
3759
3778
MVT PtrVT = MVT::i32;
3760
3779
3761
3780
// Walk the register/memloc assignments, inserting copies/loads.
@@ -3857,16 +3876,40 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3857
3876
if (!MemOpChains.empty())
3858
3877
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3859
3878
3879
+ SDValue ReadFirstLaneID =
3880
+ DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3881
+
3882
+ SDValue TokenGlue;
3883
+ if (CLI.ConvergenceControlToken) {
3884
+ TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
3885
+ CLI.ConvergenceControlToken);
3886
+ }
3887
+
3860
3888
// Build a sequence of copy-to-reg nodes chained together with token chain
3861
3889
// and flag operands which copy the outgoing args into the appropriate regs.
3862
3890
SDValue InGlue;
3863
- for (auto &RegToPass : RegsToPass) {
3864
- Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3865
- RegToPass.second, InGlue);
3891
+
3892
+ unsigned ArgIdx = 0;
3893
+ for (auto [Reg, Val] : RegsToPass) {
3894
+ if (ArgIdx++ >= NumSpecialInputs && !Val->isDivergent() &&
3895
+ TRI->isSGPRPhysReg(Reg)) {
3896
+ // Speculatively insert a readfirstlane in case this is a uniform value in
3897
+ // a VGPR.
3898
+ //
3899
+ // FIXME: We need to execute this in a waterfall loop if it is a divergent
3900
+ // value, so let that continue to produce invalid code.
3901
+
3902
+ SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
3903
+ if (TokenGlue)
3904
+ ReadfirstlaneArgs.push_back(TokenGlue);
3905
+ Val = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Val.getValueType(),
3906
+ ReadfirstlaneArgs);
3907
+ }
3908
+
3909
+ Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
3866
3910
InGlue = Chain.getValue(1);
3867
3911
}
3868
3912
3869
-
3870
3913
// We don't usually want to end the call-sequence here because we would tidy
3871
3914
// the frame up *after* the call, however in the ABI-changing tail-call case
3872
3915
// we've carefully laid out the parameters so that when sp is reset they'll be
@@ -3896,12 +3939,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3896
3939
DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3897
3940
3898
3941
SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
3899
- if (CLI.ConvergenceControlToken) {
3900
- SDValue TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, {},
3901
- MVT::Glue, CLI.ConvergenceControlToken);
3942
+ if (TokenGlue)
3902
3943
ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
3903
- }
3904
-
3905
3944
Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
3906
3945
ReadfirstlaneArgs);
3907
3946
}
@@ -3928,7 +3967,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
3928
3967
}
3929
3968
3930
3969
// Add a register mask operand representing the call-preserved registers.
3931
- auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3932
3970
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3933
3971
assert(Mask && "Missing call preserved mask for calling convention");
3934
3972
Ops.push_back(DAG.getRegisterMask(Mask));
0 commit comments