Skip to content

Commit 670289e

Browse files
committed
[AMDGPU] Extend permlane16, permlanex16 and permlane64 intrinsic lowering for generic types (llvm#92725)
These are incremental changes over llvm#89217, with core logic being the same. This patch along with llvm#89217 and llvm#91190 should get us ready to enable 64-bit optimizations in the atomic optimizer. Change-Id: Ief70422a47461606c29134b217f40204ee4a198b
1 parent d56b2ba commit 670289e

File tree

17 files changed

+10763
-1063
lines changed

17 files changed

+10763
-1063
lines changed

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17933,6 +17933,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
1793317933
CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
1793417934
return Builder.CreateCall(F, Args);
1793517935
}
17936+
case AMDGPU::BI__builtin_amdgcn_permlane16:
17937+
case AMDGPU::BI__builtin_amdgcn_permlanex16: {
17938+
Intrinsic::ID IID = BuiltinID == AMDGPU::BI__builtin_amdgcn_permlane16
17939+
? Intrinsic::amdgcn_permlane16
17940+
: Intrinsic::amdgcn_permlanex16;
17941+
llvm::Value *Src0 = EmitScalarExpr(E->getArg(0));
17942+
llvm::Value *Src1 = EmitScalarExpr(E->getArg(1));
17943+
llvm::Value *Src2 = EmitScalarExpr(E->getArg(2));
17944+
llvm::Value *Src3 = EmitScalarExpr(E->getArg(3));
17945+
llvm::Value *Src4 = EmitScalarExpr(E->getArg(4));
17946+
llvm::Value *Src5 = EmitScalarExpr(E->getArg(5));
17947+
17948+
llvm::Function *F = CGM.getIntrinsic(IID, Src1->getType());
17949+
return Builder.CreateCall(F, {Src0, Src1, Src2, Src3, Src4, Src5});
17950+
}
17951+
case AMDGPU::BI__builtin_amdgcn_permlane64:
17952+
return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_permlane64);
1793617953
case AMDGPU::BI__builtin_amdgcn_readlane:
1793717954
return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
1793817955
case AMDGPU::BI__builtin_amdgcn_readfirstlane:

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx10.cl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,13 @@ typedef unsigned int uint;
77
typedef unsigned long ulong;
88

99
// CHECK-LABEL: @test_permlane16(
10-
// CHECK: call i32 @llvm.amdgcn.permlane16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
10+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
1111
void test_permlane16(global uint* out, uint a, uint b, uint c, uint d) {
1212
*out = __builtin_amdgcn_permlane16(a, b, c, d, 0, 0);
1313
}
1414

1515
// CHECK-LABEL: @test_permlanex16(
16-
// CHECK: call i32 @llvm.amdgcn.permlanex16(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
16+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlanex16.i32(i32 %a, i32 %b, i32 %c, i32 %d, i1 false, i1 false)
1717
void test_permlanex16(global uint* out, uint a, uint b, uint c, uint d) {
1818
*out = __builtin_amdgcn_permlanex16(a, b, c, d, 0, 0);
1919
}

clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ void test_ds_bvh_stack_rtn(global uint2* out, uint addr, uint data, uint4 data1)
3636
}
3737

3838
// CHECK-LABEL: @test_permlane64(
39-
// CHECK: call i32 @llvm.amdgcn.permlane64(i32 %a)
39+
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.permlane64.i32(i32 %a)
4040
void test_permlane64(global uint* out, uint a) {
4141
*out = __builtin_amdgcn_permlane64(a);
4242
}

llvm/docs/AMDGPUUsage.rst

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,6 +1215,26 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
12151215
reduction will be performed using default iterative strategy.
12161216
Intrinsic is currently only implemented for i32.
12171217

1218+
llvm.amdgcn.permlane16 Provides direct access to v_permlane16_b32. Performs arbitrary gather-style
1219+
operation within a row (16 contiguous lanes) of the second input operand.
1220+
The third and fourth inputs must be scalar values. These are combined into
1221+
a single 64-bit value representing lane selects used to swizzle within each
1222+
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>,
1223+
<2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.
1224+
1225+
llvm.amdgcn.permlanex16 Provides direct access to v_permlanex16_b32. Performs arbitrary gather-style
1226+
operation across two rows of the second input operand (each row is 16 contiguous
1227+
lanes). The third and fourth inputs must be scalar values. These are combined
1228+
into a single 64-bit value representing lane selects used to swizzle within each
1229+
row. Currently implemented for i16, i32, float, half, bfloat, <2 x i16>, <2 x half>,
1230+
<2 x bfloat>, i64, double, pointers, multiples of the 32-bit vectors.
1231+
1232+
llvm.amdgcn.permlane64 Provides direct access to v_permlane64_b32. Performs a specific permutation across
1233+
lanes of the input operand where the high half and low half of a wave64 are swapped.
1234+
Performs no operation in wave32 mode. Currently implemented for i16, i32, float, half,
1235+
bfloat, <2 x i16>, <2 x half>, <2 x bfloat>, i64, double, pointers, multiples of the
1236+
32-bit vectors.
1237+
12181238
llvm.amdgcn.udot2 Provides direct access to v_dot2_u32_u16 across targets which
12191239
support such instructions. This performs unsigned dot product
12201240
with two v2i16 operands, summed with the third i32 operand. The

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2462,16 +2462,16 @@ def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
24622462
//===----------------------------------------------------------------------===//
24632463

24642464
// llvm.amdgcn.permlane16 <old> <src0> <src1> <src2> <fi> <bound_control>
2465-
def int_amdgcn_permlane16 : ClangBuiltin<"__builtin_amdgcn_permlane16">,
2466-
Intrinsic<[llvm_i32_ty],
2467-
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2465+
def int_amdgcn_permlane16 :
2466+
Intrinsic<[llvm_any_ty],
2467+
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
24682468
[IntrNoMem, IntrConvergent, IntrWillReturn,
24692469
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
24702470

24712471
// llvm.amdgcn.permlanex16 <old> <src0> <src1> <src2> <fi> <bound_control>
2472-
def int_amdgcn_permlanex16 : ClangBuiltin<"__builtin_amdgcn_permlanex16">,
2473-
Intrinsic<[llvm_i32_ty],
2474-
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
2472+
def int_amdgcn_permlanex16 :
2473+
Intrinsic<[llvm_any_ty],
2474+
[LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty, llvm_i1_ty],
24752475
[IntrNoMem, IntrConvergent, IntrWillReturn,
24762476
ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, IntrNoCallback, IntrNoFree]>;
24772477

@@ -2514,8 +2514,7 @@ def int_amdgcn_image_bvh_intersect_ray :
25142514

25152515
// llvm.amdgcn.permlane64 <src0>
25162516
def int_amdgcn_permlane64 :
2517-
ClangBuiltin<"__builtin_amdgcn_permlane64">,
2518-
Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
2517+
Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
25192518
[IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
25202519

25212520
def int_amdgcn_ds_add_gs_reg_rtn :

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -413,7 +413,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
413413
assert(ST->hasPermLaneX16());
414414
V = B.CreateBitCast(V, IntNTy);
415415
Value *Permlanex16Call = B.CreateIntrinsic(
416-
Intrinsic::amdgcn_permlanex16, {},
416+
V->getType(), Intrinsic::amdgcn_permlanex16,
417417
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
418418
V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
419419
B.CreateBitCast(Permlanex16Call, AtomicTy));
@@ -425,7 +425,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
425425
// Reduce across the upper and lower 32 lanes.
426426
V = B.CreateBitCast(V, IntNTy);
427427
Value *Permlane64Call =
428-
B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
428+
B.CreateIntrinsic(V->getType(), Intrinsic::amdgcn_permlane64, V);
429429
return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
430430
B.CreateBitCast(Permlane64Call, AtomicTy));
431431
}
@@ -481,7 +481,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
481481
assert(ST->hasPermLaneX16());
482482
V = B.CreateBitCast(V, IntNTy);
483483
Value *PermX = B.CreateIntrinsic(
484-
Intrinsic::amdgcn_permlanex16, {},
484+
V->getType(), Intrinsic::amdgcn_permlanex16,
485485
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
486486

487487
Value *UpdateDPPCall =

llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5341,16 +5341,32 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
53415341
MachineRegisterInfo &MRI = *B.getMRI();
53425342
LLT S32 = LLT::scalar(32);
53435343

5344-
auto createLaneOp = [&IID, &B](Register Src0, Register Src1, Register Src2,
5345-
LLT VT) -> Register {
5344+
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5345+
IID == Intrinsic::amdgcn_permlanex16;
5346+
5347+
auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
5348+
Register Src2, LLT VT) -> Register {
53465349
auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
53475350
switch (IID) {
53485351
case Intrinsic::amdgcn_readfirstlane:
5352+
case Intrinsic::amdgcn_permlane64:
53495353
return LaneOp.getReg(0);
53505354
case Intrinsic::amdgcn_readlane:
53515355
return LaneOp.addUse(Src1).getReg(0);
53525356
case Intrinsic::amdgcn_writelane:
53535357
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
5358+
case Intrinsic::amdgcn_permlane16:
5359+
case Intrinsic::amdgcn_permlanex16: {
5360+
Register Src3 = MI.getOperand(5).getReg();
5361+
Register Src4 = MI.getOperand(6).getImm();
5362+
Register Src5 = MI.getOperand(7).getImm();
5363+
return LaneOp.addUse(Src1)
5364+
.addUse(Src2)
5365+
.addUse(Src3)
5366+
.addImm(Src4)
5367+
.addImm(Src5)
5368+
.getReg(0);
5369+
}
53545370
default:
53555371
llvm_unreachable("unhandled lane op");
53565372
}
@@ -5359,9 +5375,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
53595375
Register DstReg = MI.getOperand(0).getReg();
53605376
Register Src0 = MI.getOperand(2).getReg();
53615377
Register Src1, Src2;
5362-
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5378+
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5379+
IsPermLane16) {
53635380
Src1 = MI.getOperand(3).getReg();
5364-
if (IID == Intrinsic::amdgcn_writelane) {
5381+
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
53655382
Src2 = MI.getOperand(4).getReg();
53665383
}
53675384
}
@@ -5376,12 +5393,15 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
53765393

53775394
if (Size < 32) {
53785395
Src0 = B.buildAnyExt(S32, Src0).getReg(0);
5379-
if (Src2.isValid())
5396+
5397+
if (IsPermLane16)
5398+
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
5399+
5400+
if (IID == Intrinsic::amdgcn_writelane)
53805401
Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
53815402

53825403
Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
53835404
B.buildTrunc(DstReg, LaneOpDst);
5384-
53855405
MI.eraseFromParent();
53865406
return true;
53875407
}
@@ -5408,15 +5428,23 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
54085428
SmallVector<Register, 2> PartialRes;
54095429
unsigned NumParts = Size / 32;
54105430
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
5411-
MachineInstrBuilder Src2Parts;
5431+
MachineInstrBuilder Src1Parts, Src2Parts;
5432+
5433+
if (IsPermLane16)
5434+
Src1Parts = B.buildUnmerge(PartialResTy, Src1);
54125435

5413-
if (Src2.isValid())
5436+
if (IID == Intrinsic::amdgcn_writelane)
54145437
Src2Parts = B.buildUnmerge(PartialResTy, Src2);
54155438

54165439
for (unsigned i = 0; i < NumParts; ++i) {
54175440
Src0 = Src0Parts.getReg(i);
5418-
if (Src2.isValid())
5441+
5442+
if (IsPermLane16)
5443+
Src1 = Src1Parts.getReg(i);
5444+
5445+
if (IID == Intrinsic::amdgcn_writelane)
54195446
Src2 = Src2Parts.getReg(i);
5447+
54205448
PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
54215449
}
54225450

@@ -7328,6 +7356,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
73287356
case Intrinsic::amdgcn_readlane:
73297357
case Intrinsic::amdgcn_writelane:
73307358
case Intrinsic::amdgcn_readfirstlane:
7359+
case Intrinsic::amdgcn_permlane16:
7360+
case Intrinsic::amdgcn_permlanex16:
7361+
case Intrinsic::amdgcn_permlane64:
73317362
return legalizeLaneOp(Helper, MI, IntrID);
73327363
default: {
73337364
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 49 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5887,36 +5887,47 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
58875887
EVT VT = N->getValueType(0);
58885888
unsigned ValSize = VT.getSizeInBits();
58895889
unsigned IID = N->getConstantOperandVal(0);
5890+
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
5891+
IID == Intrinsic::amdgcn_permlanex16;
58905892
SDLoc SL(N);
58915893
MVT IntVT = MVT::getIntegerVT(ValSize);
58925894

58935895
auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
58945896
SDValue Src2, MVT ValT) -> SDValue {
58955897
SmallVector<SDValue, 8> Operands;
5896-
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
58975898
switch (IID) {
5898-
case Intrinsic::amdgcn_readfirstlane:
5899-
Operands.push_back(Src0);
5900-
break;
5899+
case Intrinsic::amdgcn_permlane16:
5900+
case Intrinsic::amdgcn_permlanex16:
5901+
Operands.push_back(N->getOperand(6));
5902+
Operands.push_back(N->getOperand(5));
5903+
Operands.push_back(N->getOperand(4));
5904+
[[fallthrough]];
5905+
case Intrinsic::amdgcn_writelane:
5906+
Operands.push_back(Src2);
5907+
[[fallthrough]];
59015908
case Intrinsic::amdgcn_readlane:
5902-
Operands.push_back(Src0);
59035909
Operands.push_back(Src1);
5904-
break;
5905-
case Intrinsic::amdgcn_writelane:
5910+
[[fallthrough]];
5911+
case Intrinsic::amdgcn_readfirstlane:
5912+
case Intrinsic::amdgcn_permlane64:
59065913
Operands.push_back(Src0);
5907-
Operands.push_back(Src1);
5908-
Operands.push_back(Src2);
59095914
break;
5915+
default:
5916+
llvm_unreachable("unhandled lane op");
59105917
}
59115918

5919+
Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
5920+
std::reverse(Operands.begin(), Operands.end());
5921+
59125922
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
59135923
};
59145924

59155925
SDValue Src0 = N->getOperand(1);
59165926
SDValue Src1, Src2;
5917-
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
5927+
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
5928+
IsPermLane16) {
59185929
Src1 = N->getOperand(2);
5919-
if (IID == Intrinsic::amdgcn_writelane)
5930+
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
59205931
Src2 = N->getOperand(3);
59215932
}
59225933

@@ -5929,10 +5940,17 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
59295940
bool IsFloat = VT.isFloatingPoint();
59305941
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
59315942
SL, MVT::i32);
5932-
if (Src2.getNode()) {
5943+
5944+
if (IsPermLane16) {
5945+
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
5946+
SL, MVT::i32);
5947+
}
5948+
5949+
if (IID == Intrinsic::amdgcn_writelane) {
59335950
Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
59345951
SL, MVT::i32);
59355952
}
5953+
59365954
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
59375955
SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
59385956
return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
@@ -5984,17 +6002,23 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
59846002
case MVT::bf16: {
59856003
MVT SubVecVT = MVT::getVectorVT(EltTy, 2);
59866004
SmallVector<SDValue, 4> Pieces;
6005+
SDValue Src0SubVec, Src1SubVec, Src2SubVec;
59876006
for (unsigned i = 0, EltIdx = 0; i < ValSize / 32; i++) {
5988-
SDValue Src0SubVec =
5989-
DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
5990-
DAG.getConstant(EltIdx, SL, MVT::i32));
6007+
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
6008+
DAG.getConstant(EltIdx, SL, MVT::i32));
59916009

5992-
SDValue Src2SubVec;
5993-
if (Src2)
6010+
if (IsPermLane16)
6011+
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
6012+
DAG.getConstant(EltIdx, SL, MVT::i32));
6013+
6014+
if (IID == Intrinsic::amdgcn_writelane)
59946015
Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
59956016
DAG.getConstant(EltIdx, SL, MVT::i32));
59966017

5997-
Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
6018+
Pieces.push_back(
6019+
IsPermLane16
6020+
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
6021+
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
59986022
EltIdx += 2;
59996023
}
60006024
return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
@@ -6008,7 +6032,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
60086032
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
60096033
Src0 = DAG.getBitcast(VecVT, Src0);
60106034

6011-
if (Src2)
6035+
if (IsPermLane16)
6036+
Src1 = DAG.getBitcast(VecVT, Src1);
6037+
6038+
if (IID == Intrinsic::amdgcn_writelane)
60126039
Src2 = DAG.getBitcast(VecVT, Src2);
60136040

60146041
SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
@@ -8464,6 +8491,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
84648491
case Intrinsic::amdgcn_readlane:
84658492
case Intrinsic::amdgcn_readfirstlane:
84668493
case Intrinsic::amdgcn_writelane:
8494+
case Intrinsic::amdgcn_permlane16:
8495+
case Intrinsic::amdgcn_permlanex16:
8496+
case Intrinsic::amdgcn_permlane64:
84678497
return lowerLaneOp(*this, Op.getNode(), DAG);
84688498
default:
84698499
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -737,15 +737,19 @@ def V_ACCVGPR_MOV_B32 : VOP1_Pseudo<"v_accvgpr_mov_b32", VOPProfileAccMov, [], 1
737737
let SubtargetPredicate = isGFX11Plus in {
738738
// Restrict src0 to be VGPR
739739
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
740-
getVOP1Pat64<int_amdgcn_permlane64,
741-
VOP_MOVRELS>.ret,
742-
/*VOP1Only=*/ 1>;
740+
[], /*VOP1Only=*/ 1>;
743741
defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
744742
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
745743
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
746744
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
747745
} // End SubtargetPredicate = isGFX11Plus
748746

747+
foreach vt = Reg32Types.types in {
748+
def : GCNPat<(int_amdgcn_permlane64 (vt VRegSrc_32:$src0)),
749+
(vt (V_PERMLANE64_B32 (vt VRegSrc_32:$src0)))
750+
>;
751+
}
752+
749753
//===----------------------------------------------------------------------===//
750754
// Target-specific instruction encodings.
751755
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)