Skip to content

Commit 4d13d20

Browse files
committed
[AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types (llvm#89217)
This patch is intended to be the first of a series with the end goal of adapting the atomic optimizer pass to support i64 and f64 operations (along with removing all unnecessary bitcasts). This legalizes 64-bit readlane, writelane and readfirstlane ops pre-ISel. Change-Id: I9d302867e39316767b2aabcf876e9ea7a9e484e0
1 parent 33787d6 commit 4d13d20

27 files changed

+5361
-466
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17933,6 +17933,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1793317933
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
1793417934
return Builder.CreateCall(F, Args);
1793517935
}
17936+
case AMDGPU::BI__builtin_amdgcn_readlane:
17937+
return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
17938+
case AMDGPU::BI__builtin_amdgcn_readfirstlane:
17939+
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
1793617940
case AMDGPU::BI__builtin_amdgcn_div_fixup:
1793717941
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
1793817942
case AMDGPU::BI__builtin_amdgcn_div_fixuph:

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b)
306306
}
307307

308308
// CHECK-LABEL: @test_readfirstlane
309-
// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
309+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
310310
void test_readfirstlane(global int* out, int a)
311311
{
312312
*out = __builtin_amdgcn_readfirstlane(a);
313313
}
314314

315315
// CHECK-LABEL: @test_readlane
316-
// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
316+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
317317
void test_readlane(global int* out, int a, int b)
318318
{
319319
*out = __builtin_amdgcn_readlane(a, b);

llvm/docs/AMDGPUUsage.rst

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,6 +1165,36 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
11651165
register do not exactly match the FLT_ROUNDS values,
11661166
so a conversion is performed.
11671167

1168+
:ref:`llvm.set.rounding<int_set_rounding>` Input value expected to be one of the valid results
1169+
from '``llvm.get.rounding``'. Rounding mode is
1170+
undefined if not passed a valid input. This should be
1171+
a wave uniform value. In case of a divergent input
1172+
value, the first active lane's value will be used.
1173+
1174+
:ref:`llvm.get.fpenv<int_get_fpenv>` Returns the current value of the AMDGPU floating point environment.
1175+
This stores information related to the current rounding mode,
1176+
denormalization mode, enabled traps, and floating point exceptions.
1177+
The format is a 64-bit concatenation of the MODE and TRAPSTS registers.
1178+
1179+
:ref:`llvm.set.fpenv<int_set_fpenv>` Sets the floating point environment to the specified state.
1180+
1181+
llvm.amdgcn.readfirstlane Provides direct access to v_readfirstlane_b32. Returns the value in
1182+
the lowest active lane of the input operand. Currently implemented
1183+
for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>,
1184+
i64, double, pointers, multiples of the 32-bit vectors.
1185+
1186+
llvm.amdgcn.readlane Provides direct access to v_readlane_b32. Returns the value in the
1187+
specified lane of the first input operand. The second operand specifies
1188+
the lane to read from. Currently implemented for i16, i32, float, half,
1189+
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
1190+
multiples of the 32-bit vectors.
1191+
1192+
llvm.amdgcn.writelane Provides direct access to v_writelane_b32. Writes value in the first input
1193+
operand to the specified lane of divergent output. The second operand
1194+
specifies the lane to write. Currently implemented for i16, i32, float,
1195+
half, bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers,
1196+
multiples of the 32-bit vectors.
1197+
11681198
llvm.amdgcn.wave.reduce.umin Performs an arithmetic unsigned min reduction on the unsigned values
11691199
provided by each lane in the wavefront.
11701200
Intrinsic takes a hint for reduction strategy using second operand

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2155,26 +2155,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
21552155
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
21562156

21572157
def int_amdgcn_readfirstlane :
2158-
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
2159-
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
2158+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
21602159
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
21612160

21622161
// The lane argument must be uniform across the currently active threads of the
21632162
// current wave. Otherwise, the result is undefined.
21642163
def int_amdgcn_readlane :
2165-
ClangBuiltin<"__builtin_amdgcn_readlane">,
2166-
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
2164+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
21672165
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
21682166

21692167
// The value to write and lane select arguments must be uniform across the
21702168
// currently active threads of the current wave. Otherwise, the result is
21712169
// undefined.
21722170
def int_amdgcn_writelane :
2173-
ClangBuiltin<"__builtin_amdgcn_writelane">,
2174-
Intrinsic<[llvm_i32_ty], [
2175-
llvm_i32_ty, // uniform value to write: returned by the selected lane
2176-
llvm_i32_ty, // uniform lane select
2177-
llvm_i32_ty // returned by all lanes other than the selected one
2171+
Intrinsic<[llvm_any_ty], [
2172+
LLVMMatchType<0>, // uniform value to write: returned by the selected lane
2173+
llvm_i32_ty, // uniform lane select
2174+
LLVMMatchType<0> // returned by all lanes other than the selected one
21782175
],
21792176
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
21802177
>;

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
433433
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
434434
// combine them with a scalar operation.
435435
Function *ReadLane =
436-
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
436+
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
437437
V = B.CreateBitCast(V, IntNTy);
438438
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
439439
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
523523
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
524524
B.getInt32(0xf), B.getFalse()});
525525
} else {
526-
Function *ReadLane =
527-
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
528-
Function *WriteLane =
529-
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
526+
Function *ReadLane = Intrinsic::getDeclaration(
527+
M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
528+
Function *WriteLane = Intrinsic::getDeclaration(
529+
M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
530530

531531
// On GFX10 all DPP operations are confined to a single row. To get cross-
532532
// row operations we have to use permlane or readlane.

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5332,6 +5332,99 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
53325332
return true;
53335333
}
53345334

5335+
// TODO: Fix pointer type handling
5336+
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
5337+
MachineInstr &MI,
5338+
Intrinsic::ID IID) const {
5339+
5340+
MachineIRBuilder &B = Helper.MIRBuilder;
5341+
MachineRegisterInfo &MRI = *B.getMRI();
5342+
LLT S32 = LLT::scalar(32);
5343+
5344+
auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
5345+
LLT VT) -> Register {
5346+
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
5347+
switch (IID) {
5348+
case Intrinsic::amdgcn_readfirstlane:
5349+
return LaneOp.getReg(0);
5350+
case Intrinsic::amdgcn_readlane:
5351+
return LaneOp.addUse(Src1).getReg(0);
5352+
case Intrinsic::amdgcn_writelane:
5353+
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5354+
default:
5355+
llvm_unreachable("unhandled lane op");
5356+
}
5357+
};
5358+
5359+
Register DstReg = MI.getOperand(0).getReg();
5360+
Register Src0 = MI.getOperand(2).getReg();
5361+
Register Src1, Src2;
5362+
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5363+
Src1 = MI.getOperand(3).getReg();
5364+
if (IID == Intrinsic::amdgcn_writelane) {
5365+
Src2 = MI.getOperand(4).getReg();
5366+
}
5367+
}
5368+
5369+
LLT Ty = MRI.getType(DstReg);
5370+
unsigned Size = Ty.getSizeInBits();
5371+
5372+
if (Size == 32) {
5373+
// Already legal
5374+
return true;
5375+
}
5376+
5377+
if (Size < 32) {
5378+
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5379+
if (Src2.isValid())
5380+
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
5381+
5382+
Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
5383+
B.buildTrunc(DstReg, LaneOpDst);
5384+
5385+
MI.eraseFromParent();
5386+
return true;
5387+
}
5388+
5389+
if (Size % 32 != 0)
5390+
return false;
5391+
5392+
LLT PartialResTy = S32;
5393+
if (Ty.isVector()) {
5394+
LLT EltTy = Ty.getElementType();
5395+
switch (EltTy.getSizeInBits()) {
5396+
case 16:
5397+
PartialResTy = Ty.changeElementCount(ElementCount::getFixed(2));
5398+
break;
5399+
case 32:
5400+
PartialResTy = EltTy;
5401+
break;
5402+
default:
5403+
// Handle all other cases via S32 pieces;
5404+
break;
5405+
}
5406+
}
5407+
5408+
SmallVector<Register, 2> PartialRes;
5409+
unsigned NumParts = Size / 32;
5410+
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5411+
MachineInstrBuilder Src2Parts;
5412+
5413+
if (Src2.isValid())
5414+
Src2Parts = B.buildUnmerge(PartialResTy, Src2);
5415+
5416+
for (unsigned i = 0; i < NumParts; ++i) {
5417+
Src0 = Src0Parts.getReg(i);
5418+
if (Src2.isValid())
5419+
Src2 = Src2Parts.getReg(i);
5420+
PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
5421+
}
5422+
5423+
B.buildMergeLikeInstr(DstReg, PartialRes);
5424+
MI.eraseFromParent();
5425+
return true;
5426+
}
5427+
53355428
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
53365429
MachineRegisterInfo &MRI,
53375430
MachineIRBuilder &B) const {
@@ -7232,6 +7325,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
72327325
Observer.changedInstr(MI);
72337326
return true;
72347327
}
7328+
case Intrinsic::amdgcn_readlane:
7329+
case Intrinsic::amdgcn_writelane:
7330+
case Intrinsic::amdgcn_readfirstlane:
7331+
return legalizeLaneOp(Helper, MI, IntrID);
72357332
default: {
72367333
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
72377334
AMDGPU::getImageDimIntrinsicInfo(IntrID))

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
208208
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
209209
Intrinsic::ID IID) const;
210210

211+
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
212+
Intrinsic::ID IID) const;
213+
211214
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
212215

213216
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5882,6 +5882,140 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
58825882
DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
58835883
}
58845884

5885+
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
5886+
SelectionDAG &DAG) {
5887+
EVT VT = N->getValueType(0);
5888+
unsigned ValSize = VT.getSizeInBits();
5889+
unsigned IID = N->getConstantOperandVal(0);
5890+
SDLoc SL(N);
5891+
MVT IntVT = MVT::getIntegerVT(ValSize);
5892+
5893+
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
5894+
SDValue Src2, MVT ValT) -> SDValue {
5895+
SmallVector<SDValue, 8> Operands;
5896+
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
5897+
switch (IID) {
5898+
case Intrinsic::amdgcn_readfirstlane:
5899+
Operands.push_back(Src0);
5900+
break;
5901+
case Intrinsic::amdgcn_readlane:
5902+
Operands.push_back(Src0);
5903+
Operands.push_back(Src1);
5904+
break;
5905+
case Intrinsic::amdgcn_writelane:
5906+
Operands.push_back(Src0);
5907+
Operands.push_back(Src1);
5908+
Operands.push_back(Src2);
5909+
break;
5910+
}
5911+
5912+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
5913+
};
5914+
5915+
SDValue Src0 = N->getOperand(1);
5916+
SDValue Src1, Src2;
5917+
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5918+
Src1 = N->getOperand(2);
5919+
if (IID == Intrinsic::amdgcn_writelane)
5920+
Src2 = N->getOperand(3);
5921+
}
5922+
5923+
if (ValSize == 32) {
5924+
// Already legal
5925+
return SDValue();
5926+
}
5927+
5928+
if (ValSize < 32) {
5929+
bool IsFloat = VT.isFloatingPoint();
5930+
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
5931+
SL, MVT::i32);
5932+
if (Src2.getNode()) {
5933+
Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
5934+
SL, MVT::i32);
5935+
}
5936+
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
5937+
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
5938+
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
5939+
}
5940+
5941+
if (ValSize % 32 != 0)
5942+
return SDValue();
5943+
5944+
auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
5945+
EVT VT = N->getValueType(0);
5946+
unsigned NE = VT.getVectorNumElements();
5947+
EVT EltVT = VT.getVectorElementType();
5948+
SmallVector<SDValue, 8> Scalars;
5949+
unsigned NumOperands = N->getNumOperands();
5950+
SmallVector<SDValue, 4> Operands(NumOperands);
5951+
5952+
for (unsigned i = 0; i != NE; ++i) {
5953+
for (unsigned j = 0, e = NumOperands; j != e; ++j) {
5954+
SDValue Operand = N->getOperand(j);
5955+
EVT OperandVT = Operand.getValueType();
5956+
if (OperandVT.isVector()) {
5957+
// A vector operand; extract a single element.
5958+
EVT OperandEltVT = OperandVT.getVectorElementType();
5959+
Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
5960+
Operand, DAG.getVectorIdxConstant(i, SL));
5961+
} else {
5962+
// A scalar operand; just use it as is.
5963+
Operands[j] = Operand;
5964+
}
5965+
}
5966+
5967+
Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
5968+
}
5969+
5970+
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
5971+
return DAG.getBuildVector(VecVT, SL, Scalars);
5972+
};
5973+
5974+
if (VT.isVector()) {
5975+
switch (MVT::SimpleValueType EltTy =
5976+
VT.getVectorElementType().getSimpleVT().SimpleTy) {
5977+
case MVT::i32:
5978+
case MVT::f32: {
5979+
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
5980+
return unrollLaneOp(LaneOp.getNode());
5981+
}
5982+
case MVT::i16:
5983+
case MVT::f16:
5984+
case MVT::bf16: {
5985+
MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
5986+
SmallVector<SDValue, 4> Pieces;
5987+
for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
5988+
SDValue Src0SubVec =
5989+
DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
5990+
DAG.getConstant(EltIdx, SL, MVT::i32));
5991+
5992+
SDValue Src2SubVec;
5993+
if (Src2)
5994+
Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
5995+
DAG.getConstant(EltIdx, SL, MVT::i32));
5996+
5997+
Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
5998+
EltIdx += 2;
5999+
}
6000+
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
6001+
}
6002+
default:
6003+
// Handle all other cases by bitcasting to i32 vectors
6004+
break;
6005+
}
6006+
}
6007+
6008+
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
6009+
Src0 = DAG.getBitcast(VecVT, Src0);
6010+
6011+
if (Src2)
6012+
Src2 = DAG.getBitcast(VecVT, Src2);
6013+
6014+
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
6015+
SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
6016+
return DAG.getBitcast(VT, UnrolledLaneOp);
6017+
}
6018+
58856019
void SITargetLowering::ReplaceNodeResults(SDNode *N,
58866020
SmallVectorImpl<SDValue> &Results,
58876021
SelectionDAG &DAG) const {
@@ -8327,6 +8461,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
83278461
Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
83288462
IndexKeyi32, Op.getOperand(7)});
83298463
}
8464+
case Intrinsic::amdgcn_readlane:
8465+
case Intrinsic::amdgcn_readfirstlane:
8466+
case Intrinsic::amdgcn_writelane:
8467+
return lowerLaneOp(*this, Op.getNode(), DAG);
83308468
default:
83318469
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
83328470
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))

0 commit comments

Comments
 (0)