[AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types #89217

Merged
merged 38 commits from rw_lane_64 into main on Jun 25, 2024
Changes from all commits
Commits (38)
aa4e757
[AMDGPU] add support for i64 readlane
vikramRH Apr 10, 2024
b895dd5
add support for i64 readfirstlane and writelane intrinsics
vikramRH Apr 12, 2024
dfa3219
Fix issues with writelane expansion
vikramRH Apr 15, 2024
fcc0a1a
code refactor and add patterns for f64
vikramRH Apr 18, 2024
4e71a06
clang format
vikramRH Apr 18, 2024
c7ff0e5
fix corner case with regkill and add readlane tests
vikramRH Apr 18, 2024
d6a8ce4
update builtin handling for readlane and readfirstlane
vikramRH Apr 19, 2024
15cbd90
add and update tests, fixes to writelane src0 imm handling
vikramRH Apr 19, 2024
776a4c6
address review comments
Apr 22, 2024
82da530
Implement lowering in legalizer for legal types
vikramRH May 2, 2024
14fcf44
refactor/improve GIsel lowering, added new tests
vikramRH May 6, 2024
d0610c4
Review comments, refactor GISel Impl
vikramRH May 9, 2024
9233833
clang-format
vikramRH May 9, 2024
5feef44
Merge branch 'main' into rw_lane_64
vikramRH May 13, 2024
993a630
Review comments, improve pointer handling with GISel
vikramRH May 13, 2024
556dda2
align comments
vikramRH May 13, 2024
b59873e
Review comments
vikramRH May 15, 2024
edd3179
fix type profile
vikramRH May 16, 2024
a75eb6b
remove spurious comma
vikramRH May 17, 2024
52d7020
review comments, move pointer tests to new files
vikramRH May 18, 2024
66ca57c
remove bitcasts, avoid special handling of pointers in gisel
vikramRH May 23, 2024
c3e512c
Review comments, updated AMDGPUUsage.rst
vikramRH May 27, 2024
72af37c
preserve legal 32-bit pieces, update usage doc
vikramRH May 30, 2024
2e4c5bc
Refactor GIsel lowering
vikramRH May 30, 2024
67e19e5
fix documentation mess
vikramRH May 30, 2024
cba2b1d
review comments
vikramRH May 30, 2024
26223c8
handle comment
vikramRH May 31, 2024
429fb0f
Review comments
vikramRH May 31, 2024
ec7b5c1
test for convergence related crash
vikramRH Jun 3, 2024
3d9cf2e
Update convergence-laneops-xfail.ll
vikramRH Jun 3, 2024
c015040
Merge branch 'main' into rw_lane_64
vikramRH Jun 6, 2024
482f380
update convergence related failure tests
vikramRH Jun 12, 2024
2b4cabb
Merge branch 'main' into rw_lane_64
vikramRH Jun 12, 2024
1a33cbc
revert target-specific SDNodes, handle convergence tokens in SDAG
vikramRH Jun 14, 2024
cfa659d
remove spurious new lines
vikramRH Jun 14, 2024
be90ba6
review comments
vikramRH Jun 17, 2024
104121f
Merge branch 'main' into rw_lane_64
vikramRH Jun 22, 2024
31b8838
update builtin CodeGen
vikramRH Jun 23, 2024
6 changes: 6 additions & 0 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -18450,6 +18450,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
return Builder.CreateCall(F, Args);
}
case AMDGPU::BI__builtin_amdgcn_readlane:
return emitBuiltinWithOneOverloadedType<2>(*this, E,
Intrinsic::amdgcn_readlane);
case AMDGPU::BI__builtin_amdgcn_readfirstlane:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_readfirstlane);
case AMDGPU::BI__builtin_amdgcn_div_fixup:
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
case AMDGPU::BI__builtin_amdgcn_div_fixuph:
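A minimal sketch (not part of the patch) of what the `emitBuiltinWithOneOverloadedType<N>` path above does: it emits the first N builtin arguments and overloads the intrinsic on the type of the first one, so the mangled intrinsic name follows the argument type. The helper name below is hypothetical.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper, for illustration only: request amdgcn_readlane
// overloaded on the value type, as the builtin lowering above now does.
static Value *emitReadLaneFor(IRBuilder<> &B, Module &M, Value *Src,
                              Value *Lane) {
  Function *F = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_readlane,
                                          {Src->getType()});
  // For an i64 Src this resolves to @llvm.amdgcn.readlane.i64.
  return B.CreateCall(F, {Src, Lane});
}
```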
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -308,14 +308,14 @@ void test_ds_bpermute(global int* out, int a, int b)
}

// CHECK-LABEL: @test_readfirstlane
- // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane(i32 %a)
+ // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
void test_readfirstlane(global int* out, int a)
{
*out = __builtin_amdgcn_readfirstlane(a);
}

// CHECK-LABEL: @test_readlane
- // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+ // CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
void test_readlane(global int* out, int a, int b)
{
*out = __builtin_amdgcn_readlane(a, b);
17 changes: 17 additions & 0 deletions llvm/docs/AMDGPUUsage.rst
@@ -1188,6 +1188,23 @@ The AMDGPU backend implements the following LLVM IR intrinsics.

:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specified state.

llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in
the lowest active lane of the input operand. Currently implemented
for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
i64, double, pointers, and multiples of the 32-bit vectors.

llvm.amdgcn.readlane Provides direct access to v_readlane_b32. Returns the value in the
specified lane of the first input operand. The second operand specifies
the lane to read from. Currently implemented for i16, i32, float, half,
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
and multiples of the 32-bit vectors.

llvm.amdgcn.writelane Provides direct access to v_writelane_b32. Writes the value in the first
input operand to the specified lane of the divergent output. The second
operand specifies the lane to write to. Currently implemented for i16, i32,
float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double,
pointers, and multiples of the 32-bit vectors.

llvm.amdgcn.wave.reduce.umin Performs an arithmetic unsigned min reduction on the unsigned values
provided by each lane in the wavefront.
Intrinsic takes a hint for reduction strategy using second operand
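For the wider types, the backend decomposes the operation into 32-bit lane ops. The sketch below (illustrative only, not from the patch; the helper name is hypothetical) spells out with IRBuilder what that expansion means for a 64-bit readfirstlane: two v_readfirstlane_b32 operations plus reassembly.

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper (illustration only): the semantics of a 64-bit
// readfirstlane, written as two 32-bit reads plus reassembly.
static Value *expandReadFirstLaneI64(IRBuilder<> &B, Module &M, Value *V) {
  Type *I32 = B.getInt32Ty(), *I64 = B.getInt64Ty();
  Function *RFL =
      Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_readfirstlane, {I32});
  Value *Lo = B.CreateTrunc(V, I32);                       // low 32 bits
  Value *Hi = B.CreateTrunc(B.CreateLShr(V, 32), I32);     // high 32 bits
  Value *LoR = B.CreateZExt(B.CreateCall(RFL, {Lo}), I64); // v_readfirstlane_b32
  Value *HiR = B.CreateZExt(B.CreateCall(RFL, {Hi}), I64); // v_readfirstlane_b32
  return B.CreateOr(LoR, B.CreateShl(HiR, 32));
}
```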
15 changes: 6 additions & 9 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2043,26 +2043,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

def int_amdgcn_readfirstlane :
- ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The lane argument must be uniform across the currently active threads of the
// current wave. Otherwise, the result is undefined.
def int_amdgcn_readlane :
- ClangBuiltin<"__builtin_amdgcn_readlane">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

// The value to write and lane select arguments must be uniform across the
// currently active threads of the current wave. Otherwise, the result is
// undefined.
def int_amdgcn_writelane :
- ClangBuiltin<"__builtin_amdgcn_writelane">,
- Intrinsic<[llvm_i32_ty], [
-   llvm_i32_ty, // uniform value to write: returned by the selected lane
-   llvm_i32_ty, // uniform lane select
-   llvm_i32_ty // returned by all lanes other than the selected one
+ Intrinsic<[llvm_any_ty], [
+   LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+   llvm_i32_ty, // uniform lane select
+   LLVMMatchType<0> // returned by all lanes other than the selected one
],
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
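With `llvm_any_ty` the intrinsics become overloaded, so each concrete instantiation gets a type-mangled name and its own declaration, while the `llvm_i32_ty` lane-select operand stays fixed; that is why the test above now checks `.i32` suffixes. A self-contained illustration (a sketch, not patch code):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Illustration only: two instantiations of the now-overloaded readlane.
static void showMangling() {
  LLVMContext Ctx;
  Module M("demo", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx), *I64 = Type::getInt64Ty(Ctx);
  Function *R32 =
      Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_readlane, {I32});
  Function *R64 =
      Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_readlane, {I64});
  // R32->getName() == "llvm.amdgcn.readlane.i32"
  // R64->getName() == "llvm.amdgcn.readlane.i64"
  // In both, the lane operand remains i32; only the value type varies.
}
```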
10 changes: 5 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -424,7 +424,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
-     Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+     Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
V = B.CreateBitCast(V, IntNTy);
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -514,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
B.getInt32(0xf), B.getFalse()});
} else {
- Function *ReadLane =
-     Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Function *WriteLane =
-     Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+ Function *ReadLane = Intrinsic::getDeclaration(
+     M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+ Function *WriteLane = Intrinsic::getDeclaration(
+     M, Intrinsic::amdgcn_writelane, B.getInt32Ty());

// On GFX10 all DPP operations are confined to a single row. To get cross-
// row operations we have to use permlane or readlane.
96 changes: 96 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5432,6 +5432,98 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
return true;
}

// TODO: Fix pointer type handling
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstr &MI,
Intrinsic::ID IID) const {

MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();

auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
LLT VT) -> Register {
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
default:
llvm_unreachable("unhandled lane op");
}
};

Register DstReg = MI.getOperand(0).getReg();
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane) {
Src2 = MI.getOperand(4).getReg();
}
}

LLT Ty = MRI.getType(DstReg);
unsigned Size = Ty.getSizeInBits();

if (Size == 32) {
// Already legal
return true;
}

if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
if (Src2.isValid())
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);

Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
B.buildTrunc(DstReg, LaneOpDst);

MI.eraseFromParent();
return true;
}

if (Size % 32 != 0)
return false;

LLT PartialResTy = S32;
if (Ty.isVector()) {
LLT EltTy = Ty.getElementType();
switch (EltTy.getSizeInBits()) {
case 16:
PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
break;
case 32:
PartialResTy = EltTy;
break;
default:
// Handle all other cases via S32 pieces.
break;
}
}

SmallVector<Register, 2> PartialRes;
unsigned NumParts = Size / 32;
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src2Parts;

if (Src2.isValid())
Src2Parts = B.buildUnmerge(PartialResTy, Src2);

for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);
if (Src2.isValid())
Src2 = Src2Parts.getReg(i);
PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
}

B.buildMergeLikeInstr(DstReg, PartialRes);
MI.eraseFromParent();
return true;
}

bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -7373,6 +7465,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Observer.changedInstr(MI);
return true;
}
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_writelane:
case Intrinsic::amdgcn_readfirstlane:
return legalizeLaneOp(Helper, MI, IntrID);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
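The core of `legalizeLaneOp` for types whose size is a multiple of 32 bits: unmerge into pieces, run the 32-bit lane op per piece, then merge the results; for writelane both value operands are unmerged while the lane-select operand is shared. A plain-C++ emulation of that flow (a sketch only, not backend code):

```cpp
#include <cstdint>
#include <vector>

// Emulation only: a value that is a multiple of 32 bits is processed as
// independent 32-bit pieces; the lane index applies to all of them.
// LaneOp32 stands in for the hardware-level 32-bit lane operation.
static std::vector<uint32_t>
laneOpWide(const std::vector<uint32_t> &Pieces, unsigned Lane,
           uint32_t (*LaneOp32)(uint32_t Piece, unsigned Lane)) {
  std::vector<uint32_t> Result;          // buildUnmerge -> per-piece ops
  Result.reserve(Pieces.size());         // NumParts = Size / 32
  for (uint32_t P : Pieces)
    Result.push_back(LaneOp32(P, Lane)); // one intrinsic per piece
  return Result;                         // buildMergeLikeInstr analogue
}
```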
3 changes: 3 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -210,6 +210,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
Intrinsic::ID IID) const;

bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
Intrinsic::ID IID) const;

bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;

bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
155 changes: 155 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6100,6 +6100,157 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
}

static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
unsigned ValSize = VT.getSizeInBits();
unsigned IID = N->getConstantOperandVal(0);
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);

auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
SDValue Src2, MVT ValT) -> SDValue {
SmallVector<SDValue, 8> Operands;
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
switch (IID) {
case Intrinsic::amdgcn_readfirstlane:
Operands.push_back(Src0);
break;
case Intrinsic::amdgcn_readlane:
Operands.push_back(Src0);
Operands.push_back(Src1);
break;
case Intrinsic::amdgcn_writelane:
Operands.push_back(Src0);
Operands.push_back(Src1);
Operands.push_back(Src2);
break;
}

if (SDNode *GL = N->getGluedNode()) {
assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
GL = GL->getOperand(0).getNode();
Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
SDValue(GL, 0)));
}

return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
};

SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane)
Src2 = N->getOperand(3);
}

if (ValSize == 32) {
// Already legal
return SDValue();
}

if (ValSize < 32) {
bool IsFloat = VT.isFloatingPoint();
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);
if (Src2.getNode()) {
Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
SL, MVT::i32);
}
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
}

if (ValSize % 32 != 0)
return SDValue();

auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
EVT VT = N->getValueType(0);
unsigned NE = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();
SmallVector<SDValue, 8> Scalars;
unsigned NumOperands = N->getNumOperands();
SmallVector<SDValue, 4> Operands(NumOperands);
SDNode *GL = N->getGluedNode();

// Only handle convergencectrl_glue.
assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);

for (unsigned i = 0; i != NE; ++i) {
for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
++j) {
SDValue Operand = N->getOperand(j);
EVT OperandVT = Operand.getValueType();
if (OperandVT.isVector()) {
// A vector operand; extract a single element.
EVT OperandEltVT = OperandVT.getVectorElementType();
Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
Operand, DAG.getVectorIdxConstant(i, SL));
} else {
// A scalar operand; just use it as is.
Operands[j] = Operand;
}
}

if (GL)
Operands[NumOperands - 1] =
DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
SDValue(GL->getOperand(0).getNode(), 0));

Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
}

EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
return DAG.getBuildVector(VecVT, SL, Scalars);
};

if (VT.isVector()) {
switch (MVT::SimpleValueType EltTy =
VT.getVectorElementType().getSimpleVT().SimpleTy) {
case MVT::i32:
case MVT::f32: {
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
return unrollLaneOp(LaneOp.getNode());
}
case MVT::i16:
case MVT::f16:
case MVT::bf16: {
MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
SmallVector<SDValue, 4> Pieces;
for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
SDValue Src0SubVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));

SDValue Src2SubVec;
if (Src2)
Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
DAG.getConstant(EltIdx, SL, MVT::i32));

Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
}
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
}
default:
// Handle all other cases by bitcasting to i32 vectors
break;
}
}

MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);

if (Src2)
Src2 = DAG.getBitcast(VecVT, Src2);

SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
return DAG.getBitcast(VT, UnrolledLaneOp);
}

void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -8566,6 +8717,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_addrspacecast_nonnull:
return lowerADDRSPACECAST(Op, DAG);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_writelane:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
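The SelectionDAG path mirrors the same splitting logic and adds a sub-32-bit path: 16-bit values travel through the 32-bit lane op by any-extending their bit pattern and truncating the result, with floating-point types bitcast to integer bits first. A plain-C++ emulation of that sub-32-bit round trip (illustrative sketch only):

```cpp
#include <cstdint>

// Emulation only: a 16-bit payload rides through the 32-bit lane op in
// the low bits; the upper bits are don't-care, matching any_extend.
// LaneOp32 stands in for e.g. v_readfirstlane_b32.
static uint16_t laneOp16(uint16_t Bits, uint32_t (*LaneOp32)(uint32_t)) {
  uint32_t Ext = Bits;               // any-extend i16 -> i32
  uint32_t Res = LaneOp32(Ext);      // 32-bit lane operation
  return static_cast<uint16_t>(Res); // truncate back to 16 bits
}
```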