Commit 2c6e680

vikramRH authored and committed
[AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (llvm#89217)
This patch is intended as the first of a series whose end goal is to adapt the atomic optimizer pass to support i64 and f64 operations (and to remove all unnecessary bitcasts). It legalizes 64-bit readlane, writelane, and readfirstlane ops pre-ISel.

Co-authored-by: vikramRH <[email protected]>
1 parent 92a8644 · commit 2c6e680

27 files changed: +5341 −265 lines
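In IR terms, the change makes the three lane intrinsics type-overloaded, so callers now request a declaration mangled for the type they need (e.g. llvm.amdgcn.readlane.i64). A minimal sketch of what that looks like through the C++ API; the helper name emitReadLane64 is illustrative, not part of the patch:

// Sketch: requesting a type-overloaded lane-intrinsic declaration.
// Before this patch the intrinsics were fixed to i32; after it, the
// overload type is passed explicitly.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
using namespace llvm;

static Value *emitReadLane64(IRBuilder<> &B, Module *M, Value *Src,
                             Value *Lane) {
  Function *ReadLane = Intrinsic::getDeclaration(
      M, Intrinsic::amdgcn_readlane, B.getInt64Ty()); // overload on i64
  return B.CreateCall(ReadLane, {Src, Lane});
}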

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 6 additions & 0 deletions
@@ -18467,6 +18467,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
     return Builder.CreateCall(F, Args);
   }
+  case AMDGPU::BI__builtin_amdgcn_readlane:
+    return emitBuiltinWithOneOverloadedType<2>(*this, E,
+                                               Intrinsic::amdgcn_readlane);
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+    return emitBuiltinWithOneOverloadedType<1>(*this, E,
+                                               Intrinsic::amdgcn_readfirstlane);
   case AMDGPU::BI__builtin_amdgcn_div_fixup:
   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
   case AMDGPU::BI__builtin_amdgcn_div_fixuph:
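For context, emitBuiltinWithOneOverloadedType<N> is a CGBuiltin.cpp helper that emits the builtin's N arguments and overloads the named intrinsic on the first argument's type. Roughly, the readlane case above expands to the following sketch (written as if inlined into CodeGenFunction::EmitAMDGPUBuiltinExpr; not the literal helper body):

// Approximate expansion of emitBuiltinWithOneOverloadedType<2> for
// __builtin_amdgcn_readlane: overload on the first argument's type and
// forward both call operands.
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
llvm::Value *Lane = EmitScalarExpr(E->getArg(1));
llvm::Function *F =
    CGM.getIntrinsic(Intrinsic::amdgcn_readlane, Src0->getType());
return Builder.CreateCall(F, {Src0, Lane});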

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Lines changed: 2 additions & 2 deletions
@@ -308,14 +308,14 @@ void test_ds_bpermute(global int* out, int a, int b)
 }

 // CHECK-LABEL: @test_readfirstlane
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
 void test_readfirstlane(global int* out, int a)
 {
   *out = __builtin_amdgcn_readfirstlane(a);
 }

 // CHECK-LABEL: @test_readlane
-// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
 void test_readlane(global int* out, int a, int b)
 {
   *out = __builtin_amdgcn_readlane(a, b);

llvm/docs/AMDGPUUsage.rst

Lines changed: 17 additions & 0 deletions
@@ -1188,6 +1188,23 @@ The AMDGPU backend implements the following LLVM IR intrinsics.

  :ref:`llvm.set.fpenv<int_set_fpenv>`     Sets the floating point environment to the specified state.

+ llvm.amdgcn.readfirstlane                Provides direct access to v_readfirstlane_b32. Returns the value in
+                                          the lowest active lane of the input operand. Currently implemented
+                                          for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
+                                          i64, double, pointers, and multiples of the 32-bit vectors.
+
+ llvm.amdgcn.readlane                     Provides direct access to v_readlane_b32. Returns the value in the
+                                          specified lane of the first input operand. The second operand specifies
+                                          the lane to read from. Currently implemented for i16, i32, float, half,
+                                          bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
+                                          and multiples of the 32-bit vectors.
+
+ llvm.amdgcn.writelane                    Provides direct access to v_writelane_b32. Writes the value in the
+                                          first input operand to the specified lane of the divergent output.
+                                          The second operand specifies the lane to write to. Currently
+                                          implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
+                                          <2 x bfloat>, i64, double, pointers, and multiples of the 32-bit
+                                          vectors.
+
  llvm.amdgcn.wave.reduce.umin             Performs an arithmetic unsigned min reduction on the unsigned values
                                           provided by each lane in the wavefront.
                                           Intrinsic takes a hint for reduction strategy using second operand
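To make the documented semantics concrete, here is a small host-side model of the three operations (a sketch only: the lane array, exec mask, and helper names are illustrative and not LLVM APIs):

// Model a wave as one value per lane plus an exec mask of active lanes.
#include <cstdint>
#include <vector>

// readlane: every lane receives the value held by lane Lane.
uint32_t readlane(const std::vector<uint32_t> &Lanes, unsigned Lane) {
  return Lanes[Lane];
}

// readfirstlane: every lane receives the value of the lowest active lane.
uint32_t readfirstlane(const std::vector<uint32_t> &Lanes,
                       uint64_t ExecMask) {
  for (unsigned I = 0; I < Lanes.size(); ++I)
    if (ExecMask & (1ull << I))
      return Lanes[I];
  return 0; // undefined in practice if no lane is active
}

// writelane: lane Lane receives Val; every other lane keeps its old value.
void writelane(std::vector<uint32_t> &Lanes, uint32_t Val, unsigned Lane,
               const std::vector<uint32_t> &OldVals) {
  for (unsigned I = 0; I < Lanes.size(); ++I)
    Lanes[I] = (I == Lane) ? Val : OldVals[I];
}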

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 9 deletions
@@ -2042,26 +2042,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;

 def int_amdgcn_readfirstlane :
-  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
-  ClangBuiltin<"__builtin_amdgcn_readlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;

 // The value to write and lane select arguments must be uniform across the
 // currently active threads of the current wave. Otherwise, the result is
 // undefined.
 def int_amdgcn_writelane :
-  ClangBuiltin<"__builtin_amdgcn_writelane">,
-  Intrinsic<[llvm_i32_ty], [
-    llvm_i32_ty,    // uniform value to write: returned by the selected lane
-    llvm_i32_ty,    // uniform lane select
-    llvm_i32_ty     // returned by all lanes other than the selected one
+  Intrinsic<[llvm_any_ty], [
+    LLVMMatchType<0>, // uniform value to write: returned by the selected lane
+    llvm_i32_ty,      // uniform lane select
+    LLVMMatchType<0>  // returned by all lanes other than the selected one
   ],
   [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
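With llvm_any_ty plus LLVMMatchType<0>, the data operands of writelane must share the overload type while the lane select stays i32. A sketch of building a 64-bit float writelane through the C++ API (the function name and the hard-coded lane are illustrative):

// Sketch: building a writelane overloaded on double (f64). The value to
// write and the fall-through value must both be f64 (LLVMMatchType<0>).
static llvm::Value *emitWriteLaneF64(llvm::IRBuilder<> &B, llvm::Module *M,
                                     llvm::Value *Val, llvm::Value *OldVal) {
  llvm::Function *WriteLane = llvm::Intrinsic::getDeclaration(
      M, llvm::Intrinsic::amdgcn_writelane, B.getDoubleTy());
  // Emits: call double @llvm.amdgcn.writelane.f64(double %Val, i32 5,
  //                                               double %OldVal)
  return B.CreateCall(WriteLane, {Val, /*lane*/ B.getInt32(5), OldVal});
}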

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 5 additions & 5 deletions
@@ -424,7 +424,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
   V = B.CreateBitCast(V, IntNTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -514,10 +514,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                 {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                  B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
-    Function *WriteLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+    Function *ReadLane = Intrinsic::getDeclaration(
+        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+    Function *WriteLane = Intrinsic::getDeclaration(
+        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());

     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 96 additions & 0 deletions
@@ -5430,6 +5430,98 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }

+// TODO: Fix pointer type handling
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
+                                 LLT VT) -> Register {
+    auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane:
+      return LaneOp.getReg(0);
+    case Intrinsic::amdgcn_readlane:
+      return LaneOp.addUse(Src1).getReg(0);
+    case Intrinsic::amdgcn_writelane:
+      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+    default:
+      llvm_unreachable("unhandled lane op");
+    }
+  };
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+  Register Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane) {
+      Src2 = MI.getOperand(4).getReg();
+    }
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+    // Already legal
+    return true;
+  }
+
+  if (Size < 32) {
+    Src0 = B.buildAnyExt(S32, Src0).getReg(0);
+    if (Src2.isValid())
+      Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
+
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
+    B.buildTrunc(DstReg, LaneOpDst);
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (Size % 32 != 0)
+    return false;
+
+  LLT PartialResTy = S32;
+  if (Ty.isVector()) {
+    LLT EltTy = Ty.getElementType();
+    switch (EltTy.getSizeInBits()) {
+    case 16:
+      PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
+      break;
+    case 32:
+      PartialResTy = EltTy;
+      break;
+    default:
+      // Handle all other cases via S32 pieces;
+      break;
+    }
+  }
+
+  SmallVector<Register, 2> PartialRes;
+  unsigned NumParts = Size / 32;
+  MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
+  MachineInstrBuilder Src2Parts;
+
+  if (Src2.isValid())
+    Src2Parts = B.buildUnmerge(PartialResTy, Src2);
+
+  for (unsigned i = 0; i < NumParts; ++i) {
+    Src0 = Src0Parts.getReg(i);
+    if (Src2.isValid())
+      Src2 = Src2Parts.getReg(i);
+    PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
+  }
+
+  B.buildMergeLikeInstr(DstReg, PartialRes);
+  MI.eraseFromParent();
+  return true;
+}
+
 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
@@ -7370,6 +7462,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     Observer.changedInstr(MI);
     return true;
   }
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane:
+  case Intrinsic::amdgcn_readfirstlane:
+    return legalizeLaneOp(Helper, MI, IntrID);
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
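As a usage note, the wide path above unmerges the source into 32-bit pieces, emits one lane op per piece, and merges the results. For a single s64 readlane, the emitted MIR is roughly equivalent to this MachineIRBuilder fragment (a distillation of the loop above, assuming B, Src0, Src1, and DstReg from the surrounding context):

// Sketch: what legalizeLaneOp produces for an s64 readlane.
const LLT S32 = LLT::scalar(32);
auto Parts = B.buildUnmerge(S32, Src0);          // s64 -> two s32 halves
Register Lo = B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
                  .addUse(Parts.getReg(0))
                  .addUse(Src1)                  // same lane select
                  .getReg(0);
Register Hi = B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
                  .addUse(Parts.getReg(1))
                  .addUse(Src1)
                  .getReg(0);
B.buildMergeLikeInstr(DstReg, {Lo, Hi});         // two s32 -> s64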

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 3 additions & 0 deletions
@@ -210,6 +210,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
                             Intrinsic::ID IID) const;

+  bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+                      Intrinsic::ID IID) const;
+
   bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;

   bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 155 additions & 0 deletions
@@ -6098,6 +6098,157 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
                      DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
 }

+static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
+                           SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  unsigned ValSize = VT.getSizeInBits();
+  unsigned IID = N->getConstantOperandVal(0);
+  SDLoc SL(N);
+  MVT IntVT = MVT::getIntegerVT(ValSize);
+
+  auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
+                                          SDValue Src2, MVT ValT) -> SDValue {
+    SmallVector<SDValue, 8> Operands;
+    Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane:
+      Operands.push_back(Src0);
+      break;
+    case Intrinsic::amdgcn_readlane:
+      Operands.push_back(Src0);
+      Operands.push_back(Src1);
+      break;
+    case Intrinsic::amdgcn_writelane:
+      Operands.push_back(Src0);
+      Operands.push_back(Src1);
+      Operands.push_back(Src2);
+      break;
+    }
+
+    if (SDNode *GL = N->getGluedNode()) {
+      assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+      GL = GL->getOperand(0).getNode();
+      Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+                                     SDValue(GL, 0)));
+    }
+
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
+  };
+
+  SDValue Src0 = N->getOperand(1);
+  SDValue Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+    Src1 = N->getOperand(2);
+    if (IID == Intrinsic::amdgcn_writelane)
+      Src2 = N->getOperand(3);
+  }
+
+  if (ValSize == 32) {
+    // Already legal
+    return SDValue();
+  }
+
+  if (ValSize < 32) {
+    bool IsFloat = VT.isFloatingPoint();
+    Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
+                                SL, MVT::i32);
+    if (Src2.getNode()) {
+      Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
+                                  SL, MVT::i32);
+    }
+    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
+    SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
+    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
+  }
+
+  if (ValSize % 32 != 0)
+    return SDValue();
+
+  auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
+    EVT VT = N->getValueType(0);
+    unsigned NE = VT.getVectorNumElements();
+    EVT EltVT = VT.getVectorElementType();
+    SmallVector<SDValue, 8> Scalars;
+    unsigned NumOperands = N->getNumOperands();
+    SmallVector<SDValue, 4> Operands(NumOperands);
+    SDNode *GL = N->getGluedNode();
+
+    // only handle convergencectrl_glue
+    assert(!GL || GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
+
+    for (unsigned i = 0; i != NE; ++i) {
+      for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
+           ++j) {
+        SDValue Operand = N->getOperand(j);
+        EVT OperandVT = Operand.getValueType();
+        if (OperandVT.isVector()) {
+          // A vector operand; extract a single element.
+          EVT OperandEltVT = OperandVT.getVectorElementType();
+          Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
+                                    Operand, DAG.getVectorIdxConstant(i, SL));
+        } else {
+          // A scalar operand; just use it as is.
+          Operands[j] = Operand;
+        }
+      }
+
+      if (GL)
+        Operands[NumOperands - 1] =
+            DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
+                        SDValue(GL->getOperand(0).getNode(), 0));
+
+      Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
+    }
+
+    EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
+    return DAG.getBuildVector(VecVT, SL, Scalars);
+  };
+
+  if (VT.isVector()) {
+    switch (MVT::SimpleValueType EltTy =
+                VT.getVectorElementType().getSimpleVT().SimpleTy) {
+    case MVT::i32:
+    case MVT::f32: {
+      SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
+      return unrollLaneOp(LaneOp.getNode());
+    }
+    case MVT::i16:
+    case MVT::f16:
+    case MVT::bf16: {
+      MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
+      SmallVector<SDValue, 4> Pieces;
+      for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
+        SDValue Src0SubVec =
+            DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
+                        DAG.getConstant(EltIdx, SL, MVT::i32));
+
+        SDValue Src2SubVec;
+        if (Src2)
+          Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
+                                   DAG.getConstant(EltIdx, SL, MVT::i32));
+
+        Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
+        EltIdx += 2;
+      }
+      return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
+    }
+    default:
+      // Handle all other cases by bitcasting to i32 vectors
+      break;
+    }
+  }
+
+  MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
+  Src0 = DAG.getBitcast(VecVT, Src0);
+
+  if (Src2)
+    Src2 = DAG.getBitcast(VecVT, Src2);
+
+  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
+  SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
+  return DAG.getBitcast(VT, UnrolledLaneOp);
+}
+
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
@@ -8564,6 +8715,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_addrspacecast_nonnull:
     return lowerADDRSPACECAST(Op, DAG);
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_readfirstlane:
+  case Intrinsic::amdgcn_writelane:
+    return lowerLaneOp(*this, Op.getNode(), DAG);
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
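On the SelectionDAG side, the sub-32-bit path mirrors the GlobalISel one: a 16-bit value is bitcast to integer if it is a float type, any-extended to i32, run through the 32-bit lane op, then truncated (and bitcast) back. For an f16 readfirstlane the sequence is roughly the following fragment, which reuses the createLaneOp helper from the listing above (a sketch of the ValSize < 32 branch, not additional code in the patch):

// f16 -> bitcast -> i16 -> anyext -> i32
// i32 lane op (v_readfirstlane_b32 underneath)
// i32 -> trunc -> i16 -> bitcast -> f16
SDValue AsI16 = DAG.getBitcast(MVT::i16, Src0);
SDValue AsI32 = DAG.getAnyExtOrTrunc(AsI16, SL, MVT::i32);
SDValue Op = createLaneOp(AsI32, SDValue(), SDValue(), MVT::i32);
SDValue Back = DAG.getBitcast(MVT::f16,
                              DAG.getAnyExtOrTrunc(Op, SL, MVT::i16));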
