
[AMDGPU] ISel for @llvm.amdgcn.cs.chain intrinsic #68186


Merged · 5 commits · Nov 6, 2023
3 changes: 3 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -62,6 +62,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
@@ -2390,6 +2391,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
Info.OrigRet = {Register(), Type::getVoidTy(CI.getContext()), 0};
return CLI->lowerCall(MIRBuilder, Info);
}
case Intrinsic::amdgcn_cs_chain:
return translateCallBase(CI, MIRBuilder);
case Intrinsic::fptrunc_round: {
uint32_t Flags = MachineInstr::copyFlagsFromInstruction(CI);

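For context, a minimal sketch of the IR shape that this patch teaches instruction selection to handle. The function names, argument types, and the intrinsic's overload suffix here are illustrative assumptions; the operand order matches the handling in SelectionDAGBuilder below (callee, EXEC mask, SGPR args, VGPR args, flags):

; Hedged sketch -- names, types and the overload suffix are illustrative.
declare amdgpu_cs_chain void @chain_callee(i32 inreg, i32)
declare void @llvm.amdgcn.cs.chain.p0.i32.i32.i32(ptr, i32, i32, i32, i32 immarg, ...)

define amdgpu_cs_chain void @caller(i32 inreg %sgpr, i32 %vgpr) {
  call void (ptr, i32, i32, i32, i32, ...)
      @llvm.amdgcn.cs.chain.p0.i32.i32.i32(
          ptr @chain_callee,  ; operand 0: function to jump to
          i32 -1,             ; operand 1: EXEC mask for the callee (wave32 here)
          i32 inreg %sgpr,    ; operand 2: SGPR arguments, must be marked inreg
          i32 %vgpr,          ; operand 3: VGPR arguments, must not be inreg
          i32 0)              ; operand 4: flags, only 0 is supported for now
  unreachable                 ; the intrinsic does not return
}

On the GlobalISel side the change above simply routes such calls through translateCallBase, so they reach AMDGPUCallLowering::lowerCall as ordinary calls to the intrinsic and are handled by lowerChainCall further down.
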
49 changes: 49 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -76,6 +76,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
@@ -7424,6 +7425,54 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Val);
return;
}
case Intrinsic::amdgcn_cs_chain: {
assert(I.arg_size() == 5 && "Additional args not supported yet");
assert(cast<ConstantInt>(I.getOperand(4))->isZero() &&
"Non-zero flags not supported yet");

// At this point we don't care if it's amdgpu_cs_chain or
// amdgpu_cs_chain_preserve.
CallingConv::ID CC = CallingConv::AMDGPU_CS_Chain;

Type *RetTy = I.getType();
assert(RetTy->isVoidTy() && "Should not return");

SDValue Callee = getValue(I.getOperand(0));

// We only have 2 actual args: one for the SGPRs and one for the VGPRs.
// We'll also tack the value of the EXEC mask at the end.
TargetLowering::ArgListTy Args;
Args.reserve(3);

for (unsigned Idx : {2, 3, 1}) {
TargetLowering::ArgListEntry Arg;
Arg.Node = getValue(I.getOperand(Idx));
Arg.Ty = I.getOperand(Idx)->getType();
Arg.setAttributes(&I, Idx);
Args.push_back(Arg);
}

assert(Args[0].IsInReg && "SGPR args should be marked inreg");
assert(!Args[1].IsInReg && "VGPR args should not be marked inreg");
Args[2].IsInReg = true; // EXEC should be inreg

TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(getCurSDLoc())
.setChain(getRoot())
.setCallee(CC, RetTy, Callee, std::move(Args))
.setNoReturn(true)
.setTailCall(true)
.setConvergent(I.isConvergent());
CLI.CB = &I;
std::pair<SDValue, SDValue> Result =
lowerInvokable(CLI, /*EHPadBB*/ nullptr);
(void)Result;
assert(!Result.first.getNode() && !Result.second.getNode() &&
"Should've lowered as tail call");

HasTailCall = true;
return;
}
case Intrinsic::ptrmask: {
SDValue Ptr = getValue(I.getOperand(0));
SDValue Mask = getValue(I.getOperand(1));
112 changes: 101 additions & 11 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -957,12 +957,18 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
}

static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall, CallingConv::ID CC) {
assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
"because the address can be divergent");
bool IsTailCall, bool isWave32,
CallingConv::ID CC) {
// For calls to amdgpu_cs_chain functions, the address is known to be uniform.
assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
"Indirect calls can't be tail calls, "
"because the address can be divergent");
if (!IsTailCall)
return AMDGPU::G_SI_CALL;

if (AMDGPU::isChainCC(CC))
return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;

return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
AMDGPU::SI_TCRETURN;
}
@@ -1150,14 +1156,20 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
void AMDGPUCallLowering::handleImplicitCallArguments(
MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
CallingConv::ID CalleeCC,
ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
if (!ST.enableFlatScratch()) {
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
FuncInfo.getScratchRSrcReg());
MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);

auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
: AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;

MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
}

for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
@@ -1189,7 +1201,8 @@ bool AMDGPUCallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);

unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
unsigned Opc =
getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
if (!addCallTargetOperands(MIB, MIRBuilder, Info))
return false;
@@ -1198,8 +1211,27 @@
// be 0.
MIB.addImm(0);

// Tell the call which registers are clobbered.
// If this is a chain call, we need to pass in the EXEC mask.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
if (AMDGPU::isChainCC(Info.CallConv)) {
ArgInfo ExecArg = Info.OrigArgs[1];
assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");

if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
return false;

if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
MIB.addImm(CI->getSExtValue());
} else {
MIB.addReg(ExecArg.Regs[0]);
unsigned Idx = MIB->getNumOperands() - 1;
MIB->getOperand(Idx).setReg(constrainOperandRegClass(
MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
MIB->getDesc(), MIB->getOperand(Idx), Idx));
}
}

// Tell the call which registers are clobbered.
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
MIB.addRegMask(Mask);

@@ -1253,7 +1285,8 @@
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;

if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
!AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1269,7 +1302,8 @@
if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
return false;

handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
ImplicitArgRegs);

// If we have -tailcallopt, we need to adjust the stack. We'll do the call
// sequence start and end here.
@@ -1303,8 +1337,62 @@ bool AMDGPUCallLowering::lowerTailCall(
return true;
}

/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
ArgInfo Callee = Info.OrigArgs[0];
ArgInfo SGPRArgs = Info.OrigArgs[2];
ArgInfo VGPRArgs = Info.OrigArgs[3];
ArgInfo Flags = Info.OrigArgs[4];

assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
"Non-zero flags aren't supported yet.");
assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");

MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();

// The function to jump to is actually the first argument, so we'll change the
// Callee and other info to match that before using our existing helper.
const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
if (const Function *F = dyn_cast<Function>(CalleeV)) {
Info.Callee = MachineOperand::CreateGA(F, 0);
Info.CallConv = F->getCallingConv();
} else {
assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
// behaves the same here.
}

// The function that we're calling cannot be vararg (only the intrinsic is).
Info.IsVarArg = false;

assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
[](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
"SGPR arguments should be marked inreg");
assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
[](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
"VGPR arguments should not be marked inreg");

SmallVector<ArgInfo, 8> OutArgs;
splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);

Info.IsMustTailCall = true;
return lowerTailCall(MIRBuilder, Info, OutArgs);
}

bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
if (Function *F = Info.CB->getCalledFunction())
if (F->isIntrinsic()) {
assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
"Unexpected intrinsic");
return lowerChainCall(MIRBuilder, Info);
}

if (Info.IsVarArg) {
LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
return false;
@@ -1353,7 +1441,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
Info.CallConv);

auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.addDef(TRI->getReturnAddressReg(MF));
@@ -1395,7 +1484,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
ImplicitArgRegs);

// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getStackSize();
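The lowering above also covers calls where the EXEC operand is not a compile-time constant (it is then constrained to a register operand on the SI_CS_CHAIN_TC_W32/W64 pseudo rather than folded to an immediate) and where the callee is not a direct Function reference. A hedged IR sketch of such a call, assuming a wave32 target; the names, types and overload suffixes are illustrative:

; Hedged sketch -- names, types and overload suffixes are illustrative.
declare i32 @llvm.amdgcn.ballot.i32(i1)
declare void @llvm.amdgcn.cs.chain.p0.i32.i32.i32(ptr, i32, i32, i32, i32 immarg, ...)

define amdgpu_cs_chain void @indirect_chain(ptr inreg %callee, i32 inreg %sgpr, i32 %vgpr) {
  ; Non-constant EXEC mask: computed at run time, so lowerTailCall emits it as a
  ; register operand instead of an immediate.
  %active = icmp ne i32 %vgpr, 0
  %exec = call i32 @llvm.amdgcn.ballot.i32(i1 %active)
  ; Indirect callee: still lowered as a tail call, since the address of an
  ; amdgpu_cs_chain callee is uniform (see the relaxed assert in getCallOpcode).
  call void (ptr, i32, i32, i32, i32, ...)
      @llvm.amdgcn.cs.chain.p0.i32.i32.i32(ptr %callee, i32 %exec,
                                           i32 inreg %sgpr, i32 %vgpr, i32 0)
  unreachable
}

In the indirect case, lowerChainCall puts the callee into a register operand and falls back to the AMDGPU_CS_Chain calling convention, which behaves the same as amdgpu_cs_chain_preserve for this purpose.
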
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -75,10 +75,13 @@ class AMDGPUCallLowering final : public CallLowering {
void handleImplicitCallArguments(
MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
CallingConv::ID CalleeCC,
ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;

bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
SmallVectorImpl<ArgInfo> &OutArgs) const;
bool lowerChainCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;

1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5212,6 +5212,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TC_RETURN_GFX)
NODE_NAME_CASE(TC_RETURN_CHAIN)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_GLUE)
NODE_NAME_CASE(WAVE_ADDRESS)
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -389,6 +389,7 @@ enum NodeType : unsigned {
CALL,
TC_RETURN,
TC_RETURN_GFX,
TC_RETURN_CHAIN,
TRAP,

// Masked control flow nodes.
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -94,6 +94,11 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;

def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;

def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]