Skip to content

Commit b002711

Browse files
committed
[AMDGPU] Extend readlane, writelane and readfirstlane intrinsic lowering for generic types
1 parent e76b257 commit b002711

28 files changed

+5485
-265
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18479,6 +18479,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1847918479
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
1848018480
return Builder.CreateCall(F, Args);
1848118481
}
18482+
case AMDGPU::BI__builtin_amdgcn_readlane:
18483+
return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
18484+
case AMDGPU::BI__builtin_amdgcn_readfirstlane:
18485+
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
1848218486
case AMDGPU::BI__builtin_amdgcn_div_fixup:
1848318487
case AMDGPU::BI__builtin_amdgcn_div_fixupf:
1848418488
case AMDGPU::BI__builtin_amdgcn_div_fixuph:

clang/test/CodeGenOpenCL/builtins-amdgcn.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b)
306306
}
307307

308308
// CHECK-LABEL: @test_readfirstlane
309-
// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
309+
// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
310310
void test_readfirstlane(global int* out, int a)
311311
{
312312
*out = __builtin_amdgcn_readfirstlane(a);
313313
}
314314

315315
// CHECK-LABEL: @test_readlane
316-
// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
316+
// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
317317
void test_readlane(global int* out, int a, int b)
318318
{
319319
*out = __builtin_amdgcn_readlane(a, b);

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2176,26 +2176,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
21762176
def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
21772177

21782178
def int_amdgcn_readfirstlane :
2179-
ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
2180-
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
2179+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
21812180
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
21822181

21832182
// The lane argument must be uniform across the currently active threads of the
21842183
// current wave. Otherwise, the result is undefined.
21852184
def int_amdgcn_readlane :
2186-
ClangBuiltin<"__builtin_amdgcn_readlane">,
2187-
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
2185+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
21882186
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
21892187

21902188
// The value to write and lane select arguments must be uniform across the
21912189
// currently active threads of the current wave. Otherwise, the result is
21922190
// undefined.
21932191
def int_amdgcn_writelane :
2194-
ClangBuiltin<"__builtin_amdgcn_writelane">,
2195-
Intrinsic<[llvm_i32_ty], [
2196-
llvm_i32_ty, // uniform value to write: returned by the selected lane
2197-
llvm_i32_ty, // uniform lane select
2198-
llvm_i32_ty // returned by all lanes other than the selected one
2192+
Intrinsic<[llvm_any_ty], [
2193+
LLVMMatchType<0>, // uniform value to write: returned by the selected lane
2194+
llvm_i32_ty, // uniform lane select
2195+
LLVMMatchType<0> // returned by all lanes other than the selected one
21992196
],
22002197
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
22012198
>;

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
433433
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
434434
// combine them with a scalar operation.
435435
Function *ReadLane =
436-
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
436+
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
437437
V = B.CreateBitCast(V, IntNTy);
438438
Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
439439
Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
523523
{Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
524524
B.getInt32(0xf), B.getFalse()});
525525
} else {
526-
Function *ReadLane =
527-
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
528-
Function *WriteLane =
529-
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
526+
Function *ReadLane = Intrinsic::getDeclaration(
527+
M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
528+
Function *WriteLane = Intrinsic::getDeclaration(
529+
M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
530530

531531
// On GFX10 all DPP operations are confined to a single row. To get cross-
532532
// row operations we have to use permlane or readlane.

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5496,6 +5496,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
54965496
NODE_NAME_CASE(LDS)
54975497
NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
54985498
NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
5499+
NODE_NAME_CASE(READLANE)
5500+
NODE_NAME_CASE(READFIRSTLANE)
5501+
NODE_NAME_CASE(WRITELANE)
54995502
NODE_NAME_CASE(DUMMY_CHAIN)
55005503
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
55015504
NODE_NAME_CASE(LOAD_D16_HI)

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,6 +558,10 @@ enum NodeType : unsigned {
558558
FPTRUNC_ROUND_UPWARD,
559559
FPTRUNC_ROUND_DOWNWARD,
560560

561+
READLANE,
562+
READFIRSTLANE,
563+
WRITELANE,
564+
561565
DUMMY_CHAIN,
562566
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
563567
LOAD_D16_HI,

llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,22 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
342342

343343
def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
344344

345+
// Type profile for READFIRSTLANE: one result and one operand; the result
// (index 0) has the same type as the source operand (index 1).
def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [
346+
SDTCisSameAs<0, 1>
347+
]>;
348+
349+
// Type profile for READLANE: one result and two operands; the result has the
// same type as the source operand (index 1) and the second operand (index 2)
// is an integer.
def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [
350+
SDTCisSameAs<0, 1>, SDTCisInt<2>
351+
]>;
352+
353+
// Type profile for WRITELANE: one result and three operands; the result has
// the same type as the first operand (index 1) and the third operand
// (index 3), and the second operand (index 2) is an integer.
def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
354+
SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
355+
]>;
356+
357+
// SelectionDAG nodes for the AMDGPUISD lane opcodes, each bound to its type
// profile above.
def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
358+
def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
359+
def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
360+
345361
// SI+ export
346362
def AMDGPUExportOp : SDTypeProfile<0, 8, [
347363
SDTCisInt<0>, // i8 tgt
@@ -506,3 +522,16 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
506522
def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
507523
[(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
508524
(AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
525+
526+
// Matches a readlane in either form: the int_amdgcn_readlane intrinsic or the
// AMDGPUreadlane_impl node, with the same two operands.
def AMDGPUreadlane : PatFrags<(ops node:$src0, node:$src1),
527+
[(int_amdgcn_readlane node:$src0, node:$src1),
528+
(AMDGPUreadlane_impl node:$src0, node:$src1)]>;
529+
530+
// Matches a readfirstlane in either form: the int_amdgcn_readfirstlane
// intrinsic or the AMDGPUreadfirstlane_impl node.
def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
531+
[(int_amdgcn_readfirstlane node:$src),
532+
(AMDGPUreadfirstlane_impl node:$src)]>;
533+
534+
// Matches a writelane in either form: the int_amdgcn_writelane intrinsic or
// the AMDGPUwritelane_impl node, with the same three operands.
def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
535+
[(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
536+
(AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
537+

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5387,6 +5387,192 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
53875387
return true;
53885388
}
53895389

5390+
/// Legalize the amdgcn readlane, readfirstlane and writelane intrinsics when
/// the result type is not a plain 32-bit value.
///
/// The strategy depends on the size of the result type:
///  - exactly 32 bits: the instruction is left untouched;
///  - fewer than 32 bits: the value operands are (bitcast and) any-extended to
///    S32, a single S32 lane op is emitted and its result is truncated (and
///    bitcast) back to the original type;
///  - a multiple of 32 bits: the value operands are split into 32-bit pieces,
///    one S32 lane op is emitted per piece and the results are merged back.
///    Pointers and pointer vectors go through G_PTRTOINT / G_INTTOPTR, and
///    vectors of S16 are split into V2S16 pieces that are bitcast to S32.
///
/// \param Helper Legalizer helper providing the MachineIRBuilder.
/// \param MI     The intrinsic instruction; operand 0 is the result, operand 2
///               the value, operand 3 the lane select (readlane/writelane) and
///               operand 4 the second value operand (writelane only).
/// \param IID    One of amdgcn_readlane, amdgcn_readfirstlane or
///               amdgcn_writelane.
/// \returns true if \p MI is legal or was replaced (and erased), false if the
///          result size is larger than 32 bits and not a multiple of 32.
bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
                                         MachineInstr &MI,
                                         Intrinsic::ID IID) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();

  Register DstReg = MI.getOperand(0).getReg();
  Register Src0 = MI.getOperand(2).getReg();

  // Src1 is the lane select (readlane/writelane); Src2 is the extra value
  // operand of writelane. Both stay invalid for ops that do not have them.
  const bool IsWriteLane = IID == Intrinsic::amdgcn_writelane;
  Register Src1, Src2;
  if (IID == Intrinsic::amdgcn_readlane || IsWriteLane) {
    Src1 = MI.getOperand(3).getReg();
    if (IsWriteLane)
      Src2 = MI.getOperand(4).getReg();
  }

  // Emit one S32 lane op of kind IID with the operands that kind takes.
  auto createLaneOp = [&](Register Val, Register Lane,
                          Register Old) -> Register {
    auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Val);
    switch (IID) {
    case Intrinsic::amdgcn_readfirstlane:
      return LaneOp.getReg(0);
    case Intrinsic::amdgcn_readlane:
      return LaneOp.addUse(Lane).getReg(0);
    case Intrinsic::amdgcn_writelane:
      return LaneOp.addUse(Lane).addUse(Old).getReg(0);
    default:
      llvm_unreachable("unhandled lane op");
    }
  };

  LLT Ty = MRI.getType(DstReg);
  unsigned Size = Ty.getSizeInBits();

  if (Size == 32) {
    // Already legal
    return true;
  }

  if (Size < 32) {
    // Widen a sub-dword value to S32, going through a same-sized scalar first
    // when the value is not already a scalar.
    auto ExtendToS32 = [&](Register Src) -> Register {
      Register AsScalar =
          MRI.getType(Src).isScalar()
              ? Src
              : B.buildBitcast(LLT::scalar(Size), Src).getReg(0);
      return B.buildAnyExt(S32, AsScalar).getReg(0);
    };

    Src0 = ExtendToS32(Src0);
    if (Src2.isValid())
      Src2 = ExtendToS32(Src2);

    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
    if (Ty.isScalar())
      B.buildTrunc(DstReg, LaneOpDst);
    else {
      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
      B.buildBitcast(DstReg, Trunc);
    }

    MI.eraseFromParent();
    return true;
  }

  if ((Size % 32) != 0)
    return false;

  const unsigned NumParts = Size / 32;
  const bool IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;

  // Split a value operand of type Ty into NumParts 32-bit pieces. Pointer
  // values are converted to integers of the full pointer width first, so the
  // number of pieces always matches NumParts (previously the writelane path
  // hard-coded a 64-bit integer for its second value operand).
  auto UnmergeToParts = [&](Register Src) -> MachineInstrBuilder {
    if (Ty.isPointer()) {
      auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src);
      return B.buildUnmerge(S32, PtrToInt);
    }
    if (Ty.isPointerVector()) {
      LLT IntVecTy = Ty.changeElementType(
          LLT::scalar(Ty.getElementType().getSizeInBits()));
      auto PtrToInt = B.buildPtrToInt(IntVecTy, Src);
      return B.buildUnmerge(S32, PtrToInt);
    }
    return IsS16Vec ? B.buildUnmerge(V2S16, Src) : B.buildUnmerge(S32, Src);
  };

  // Fetch piece Idx as an S32 register; V2S16 pieces need a bitcast.
  auto GetS32Part = [&](const MachineInstrBuilder &Parts,
                        unsigned Idx) -> Register {
    Register Part = Parts.getReg(Idx);
    return IsS16Vec ? B.buildBitcast(S32, Part).getReg(0) : Part;
  };

  MachineInstrBuilder Src0Parts = UnmergeToParts(Src0);
  MachineInstrBuilder Src2Parts;
  if (IsWriteLane)
    Src2Parts = UnmergeToParts(Src2);

  // One S32 lane op per 32-bit piece; the lane select is shared by all pieces.
  SmallVector<Register, 2> PartialRes;
  for (unsigned i = 0; i < NumParts; ++i) {
    Register Part0 = GetS32Part(Src0Parts, i);
    Register Part2;
    if (IsWriteLane)
      Part2 = GetS32Part(Src2Parts, i);
    PartialRes.push_back(createLaneOp(Part0, Src1, Part2));
  }

  if (Ty.isPointerVector()) {
    const LLT PtrTy = Ty.getElementType();
    const unsigned PtrSize = PtrTy.getSizeInBits();
    SmallVector<Register, 2> PtrElements;
    if (PtrSize == 32) {
      // Handle 32 bit pointers
      for (unsigned i = 0; i < NumParts; i++)
        PtrElements.push_back(B.buildIntToPtr(PtrTy, PartialRes[i]).getReg(0));
    } else {
      // Handle legalization of <? x [pointer type bigger than 32 bits]>:
      // merge the S32 components of each pointer element first, then convert
      // the merged integer back to a pointer.
      const unsigned NumS32Parts = PtrSize / 32;
      for (unsigned i = 0; i < NumParts; i += NumS32Parts) {
        auto MergedPtr = B.buildMergeLikeInstr(
            LLT::scalar(PtrSize),
            ArrayRef<Register>(PartialRes).slice(i, NumS32Parts));
        PtrElements.push_back(B.buildIntToPtr(PtrTy, MergedPtr).getReg(0));
      }
    }

    B.buildMergeLikeInstr(DstReg, PtrElements);
  } else {
    if (IsS16Vec) {
      // Turn each S32 result back into the V2S16 piece it was split from.
      for (unsigned i = 0; i < NumParts; i++)
        PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0);
    }
    B.buildMergeLikeInstr(DstReg, PartialRes);
  }

  MI.eraseFromParent();
  return true;
}
5575+
53905576
bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
53915577
MachineRegisterInfo &MRI,
53925578
MachineIRBuilder &B) const {
@@ -7330,6 +7516,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
73307516
Observer.changedInstr(MI);
73317517
return true;
73327518
}
7519+
case Intrinsic::amdgcn_readlane:
7520+
case Intrinsic::amdgcn_writelane:
7521+
case Intrinsic::amdgcn_readfirstlane:
7522+
return legalizeLaneOp(Helper, MI, IntrID);
73337523
default: {
73347524
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
73357525
AMDGPU::getImageDimIntrinsicInfo(IntrID))

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
208208
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
209209
Intrinsic::ID IID) const;
210210

211+
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
212+
Intrinsic::ID IID) const;
213+
211214
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
212215

213216
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;

0 commit comments

Comments
 (0)