Skip to content

Commit a131fbf

Browse files
authored
Reland "[NVPTX] deprecate nvvm.rotate.* intrinsics, cleanup funnel-shift handling" (#110025)
This change deprecates the following intrinsics, which can be trivially converted to LLVM funnel-shift intrinsics:

- @llvm.nvvm.rotate.b32
- @llvm.nvvm.rotate.right.b64
- @llvm.nvvm.rotate.b64

This fixes a bug in the previous version (#107655), which flipped the order of the operands to the PTX funnel-shift instruction: in LLVM IR the high bits are the first argument and the low bits are the second argument, while in PTX this order is reversed.
1 parent 0c6ee1f commit a131fbf

File tree

9 files changed

+465
-582
lines changed

9 files changed

+465
-582
lines changed

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4479,22 +4479,6 @@ def int_nvvm_sust_p_3d_v4i32_trap
44794479
"llvm.nvvm.sust.p.3d.v4i32.trap">,
44804480
ClangBuiltin<"__nvvm_sust_p_3d_v4i32_trap">;
44814481

4482-
4483-
def int_nvvm_rotate_b32
4484-
: DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
4485-
[IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b32">,
4486-
ClangBuiltin<"__nvvm_rotate_b32">;
4487-
4488-
def int_nvvm_rotate_b64
4489-
: DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
4490-
[IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.b64">,
4491-
ClangBuiltin<"__nvvm_rotate_b64">;
4492-
4493-
def int_nvvm_rotate_right_b64
4494-
: DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty],
4495-
[IntrNoMem, IntrSpeculatable], "llvm.nvvm.rotate.right.b64">,
4496-
ClangBuiltin<"__nvvm_rotate_right_b64">;
4497-
44984482
def int_nvvm_swap_lo_hi_b64
44994483
: DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty],
45004484
[IntrNoMem, IntrSpeculatable], "llvm.nvvm.swap.lo.hi.b64">,

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 106 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
12721272
// nvvm.bitcast.{f2i,i2f,ll2d,d2ll}
12731273
Expand =
12741274
Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll";
1275+
else if (Name.consume_front("rotate."))
1276+
// nvvm.rotate.{b32,b64,right.b64}
1277+
Expand = Name == "b32" || Name == "b64" || Name == "right.b64";
12751278
else
12761279
Expand = false;
12771280

@@ -2258,6 +2261,108 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) {
22582261
}
22592262
}
22602263

2264+
// Expand a call to a removed/deprecated "llvm.nvvm.*" intrinsic (Name has the
// "llvm.nvvm." prefix already stripped) into equivalent generic LLVM IR,
// emitted via Builder at the call site CI. F is the (old) intrinsic
// declaration being upgraded. Returns the replacement value, or nullptr if
// Name matches none of the expansions handled here.
static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
2265+
Function *F, IRBuilder<> &Builder) {
2266+
Value *Rep = nullptr;
2267+
2268+
// nvvm.abs.{i,ll}: select(x >= 0, x, -x).
if (Name == "abs.i" || Name == "abs.ll") {
2269+
Value *Arg = CI->getArgOperand(0);
2270+
Value *Neg = Builder.CreateNeg(Arg, "neg");
2271+
Value *Cmp = Builder.CreateICmpSGE(
2272+
Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
2273+
Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
2274+
} else if (Name.starts_with("atomic.load.add.f32.p") ||
2275+
Name.starts_with("atomic.load.add.f64.p")) {
2276+
// nvvm.atomic.load.add.{f32,f64}.p*: a plain atomicrmw fadd.
Value *Ptr = CI->getArgOperand(0);
2277+
Value *Val = CI->getArgOperand(1);
2278+
Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
2279+
AtomicOrdering::SequentiallyConsistent);
2280+
// NOTE: consume_front mutates Name on success, so the suffix checks below
// (and the starts_with("u") signedness test) see only the remainder.
} else if (Name.consume_front("max.") &&
2281+
(Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
2282+
Name == "ui" || Name == "ull")) {
2283+
Value *Arg0 = CI->getArgOperand(0);
2284+
Value *Arg1 = CI->getArgOperand(1);
2285+
// Unsigned variants ("us"/"ui"/"ull") use an unsigned compare.
Value *Cmp = Name.starts_with("u")
2286+
? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
2287+
: Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
2288+
Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
2289+
} else if (Name.consume_front("min.") &&
2290+
(Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
2291+
Name == "ui" || Name == "ull")) {
2292+
Value *Arg0 = CI->getArgOperand(0);
2293+
Value *Arg1 = CI->getArgOperand(1);
2294+
Value *Cmp = Name.starts_with("u")
2295+
? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
2296+
: Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
2297+
Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
2298+
} else if (Name == "clz.ll") {
2299+
// llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
2300+
Value *Arg = CI->getArgOperand(0);
2301+
Value *Ctlz = Builder.CreateCall(
2302+
Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
2303+
{Arg->getType()}),
2304+
// Second ctlz operand = false: zero input is NOT poison.
{Arg, Builder.getFalse()}, "ctlz");
2305+
Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
2306+
} else if (Name == "popc.ll") {
2307+
// llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
2308+
// i64.
2309+
Value *Arg = CI->getArgOperand(0);
2310+
Value *Popc = Builder.CreateCall(
2311+
Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
2312+
{Arg->getType()}),
2313+
Arg, "ctpop");
2314+
Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
2315+
} else if (Name == "h2f") {
2316+
// nvvm.h2f: half-to-float conversion via llvm.convert.from.fp16.
Rep = Builder.CreateCall(
2317+
Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16,
2318+
{Builder.getFloatTy()}),
2319+
CI->getArgOperand(0), "h2f");
2320+
} else if (Name.consume_front("bitcast.") &&
2321+
(Name == "f2i" || Name == "i2f" || Name == "ll2d" ||
2322+
Name == "d2ll")) {
2323+
Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType());
2324+
// A rotate is a funnel shift with the same value in both data operands:
// rotl(x, n) == fshl(x, x, n) and rotr(x, n) == fshr(x, x, n).
} else if (Name == "rotate.b32") {
2325+
Value *Arg = CI->getOperand(0);
2326+
Value *ShiftAmt = CI->getOperand(1);
2327+
Rep = Builder.CreateIntrinsic(Builder.getInt32Ty(), Intrinsic::fshl,
2328+
{Arg, Arg, ShiftAmt});
2329+
} else if (Name == "rotate.b64") {
2330+
Type *Int64Ty = Builder.getInt64Ty();
2331+
Value *Arg = CI->getOperand(0);
2332+
// The old intrinsic takes an i32 shift amount, but llvm.fshl.i64 requires
// all three operands to be i64, so widen it.
Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
2333+
Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshl,
2334+
{Arg, Arg, ZExtShiftAmt});
2335+
} else if (Name == "rotate.right.b64") {
2336+
Type *Int64Ty = Builder.getInt64Ty();
2337+
Value *Arg = CI->getOperand(0);
2338+
Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty);
2339+
Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr,
2340+
{Arg, Arg, ZExtShiftAmt});
2341+
} else {
2342+
// Fallback: i16/i16x2-typed intrinsics being upgraded to bf16 variants.
// Skip if the declaration already uses bfloat in its return type.
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
2343+
if (IID != Intrinsic::not_intrinsic &&
2344+
!F->getReturnType()->getScalarType()->isBFloatTy()) {
2345+
rename(F);
2346+
Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
2347+
SmallVector<Value *, 2> Args;
2348+
// Bitcast each integer argument whose new parameter type is bf16-based;
// all other arguments pass through unchanged.
for (size_t I = 0; I < NewFn->arg_size(); ++I) {
2349+
Value *Arg = CI->getArgOperand(I);
2350+
Type *OldType = Arg->getType();
2351+
Type *NewType = NewFn->getArg(I)->getType();
2352+
Args.push_back(
2353+
(OldType->isIntegerTy() && NewType->getScalarType()->isBFloatTy())
2354+
? Builder.CreateBitCast(Arg, NewType)
2355+
: Arg);
2356+
}
2357+
Rep = Builder.CreateCall(NewFn, Args);
2358+
// Cast the bf16 result back to the integer type callers expect.
if (F->getReturnType()->isIntegerTy())
2359+
Rep = Builder.CreateBitCast(Rep, F->getReturnType());
2360+
}
2361+
}
2362+
2363+
return Rep;
2364+
}
2365+
22612366
static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
22622367
IRBuilder<> &Builder) {
22632368
LLVMContext &C = F->getContext();
@@ -4208,85 +4313,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
42084313

42094314
if (!IsX86 && Name == "stackprotectorcheck") {
42104315
Rep = nullptr;
4211-
} else if (IsNVVM && (Name == "abs.i" || Name == "abs.ll")) {
4212-
Value *Arg = CI->getArgOperand(0);
4213-
Value *Neg = Builder.CreateNeg(Arg, "neg");
4214-
Value *Cmp = Builder.CreateICmpSGE(
4215-
Arg, llvm::Constant::getNullValue(Arg->getType()), "abs.cond");
4216-
Rep = Builder.CreateSelect(Cmp, Arg, Neg, "abs");
4217-
} else if (IsNVVM && (Name.starts_with("atomic.load.add.f32.p") ||
4218-
Name.starts_with("atomic.load.add.f64.p"))) {
4219-
Value *Ptr = CI->getArgOperand(0);
4220-
Value *Val = CI->getArgOperand(1);
4221-
Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
4222-
AtomicOrdering::SequentiallyConsistent);
4223-
} else if (IsNVVM && Name.consume_front("max.") &&
4224-
(Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
4225-
Name == "ui" || Name == "ull")) {
4226-
Value *Arg0 = CI->getArgOperand(0);
4227-
Value *Arg1 = CI->getArgOperand(1);
4228-
Value *Cmp = Name.starts_with("u")
4229-
? Builder.CreateICmpUGE(Arg0, Arg1, "max.cond")
4230-
: Builder.CreateICmpSGE(Arg0, Arg1, "max.cond");
4231-
Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "max");
4232-
} else if (IsNVVM && Name.consume_front("min.") &&
4233-
(Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
4234-
Name == "ui" || Name == "ull")) {
4235-
Value *Arg0 = CI->getArgOperand(0);
4236-
Value *Arg1 = CI->getArgOperand(1);
4237-
Value *Cmp = Name.starts_with("u")
4238-
? Builder.CreateICmpULE(Arg0, Arg1, "min.cond")
4239-
: Builder.CreateICmpSLE(Arg0, Arg1, "min.cond");
4240-
Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min");
4241-
} else if (IsNVVM && Name == "clz.ll") {
4242-
// llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64.
4243-
Value *Arg = CI->getArgOperand(0);
4244-
Value *Ctlz = Builder.CreateCall(
4245-
Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
4246-
{Arg->getType()}),
4247-
{Arg, Builder.getFalse()}, "ctlz");
4248-
Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc");
4249-
} else if (IsNVVM && Name == "popc.ll") {
4250-
// llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an
4251-
// i64.
4252-
Value *Arg = CI->getArgOperand(0);
4253-
Value *Popc = Builder.CreateCall(
4254-
Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop,
4255-
{Arg->getType()}),
4256-
Arg, "ctpop");
4257-
Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc");
42584316
} else if (IsNVVM) {
4259-
if (Name == "h2f") {
4260-
Rep =
4261-
Builder.CreateCall(Intrinsic::getDeclaration(
4262-
F->getParent(), Intrinsic::convert_from_fp16,
4263-
{Builder.getFloatTy()}),
4264-
CI->getArgOperand(0), "h2f");
4265-
} else if (Name.consume_front("bitcast.") &&
4266-
(Name == "f2i" || Name == "i2f" || Name == "ll2d" ||
4267-
Name == "d2ll")) {
4268-
Rep = Builder.CreateBitCast(CI->getArgOperand(0), CI->getType());
4269-
} else {
4270-
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
4271-
if (IID != Intrinsic::not_intrinsic &&
4272-
!F->getReturnType()->getScalarType()->isBFloatTy()) {
4273-
rename(F);
4274-
NewFn = Intrinsic::getDeclaration(F->getParent(), IID);
4275-
SmallVector<Value *, 2> Args;
4276-
for (size_t I = 0; I < NewFn->arg_size(); ++I) {
4277-
Value *Arg = CI->getArgOperand(I);
4278-
Type *OldType = Arg->getType();
4279-
Type *NewType = NewFn->getArg(I)->getType();
4280-
Args.push_back((OldType->isIntegerTy() &&
4281-
NewType->getScalarType()->isBFloatTy())
4282-
? Builder.CreateBitCast(Arg, NewType)
4283-
: Arg);
4284-
}
4285-
Rep = Builder.CreateCall(NewFn, Args);
4286-
if (F->getReturnType()->isIntegerTy())
4287-
Rep = Builder.CreateBitCast(Rep, F->getReturnType());
4288-
}
4289-
}
4317+
Rep = upgradeNVVMIntrinsicCall(Name, CI, F, Builder);
42904318
} else if (IsX86) {
42914319
Rep = upgradeX86IntrinsicCall(Name, CI, F, Builder);
42924320
} else if (IsARM) {

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -594,20 +594,13 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
594594
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
595595
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
596596

597-
// TODO: we may consider expanding ROTL/ROTR on older GPUs. Currently on GPUs
598-
// that don't have h/w rotation we lower them to multi-instruction assembly.
599-
// See ROT*_sw in NVPTXIntrInfo.td
600-
setOperationAction(ISD::ROTL, MVT::i64, Legal);
601-
setOperationAction(ISD::ROTR, MVT::i64, Legal);
602-
setOperationAction(ISD::ROTL, MVT::i32, Legal);
603-
setOperationAction(ISD::ROTR, MVT::i32, Legal);
604-
605-
setOperationAction(ISD::ROTL, MVT::i16, Expand);
606-
setOperationAction(ISD::ROTL, MVT::v2i16, Expand);
607-
setOperationAction(ISD::ROTR, MVT::i16, Expand);
608-
setOperationAction(ISD::ROTR, MVT::v2i16, Expand);
609-
setOperationAction(ISD::ROTL, MVT::i8, Expand);
610-
setOperationAction(ISD::ROTR, MVT::i8, Expand);
597+
setOperationAction({ISD::ROTL, ISD::ROTR},
598+
{MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
599+
Expand);
600+
601+
if (STI.hasHWROT32())
602+
setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);
603+
611604
setOperationAction(ISD::BSWAP, MVT::i16, Expand);
612605

613606
setOperationAction(ISD::BR_JT, MVT::Other, Custom);
@@ -958,8 +951,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
958951
MAKE_CASE(NVPTXISD::LDUV4)
959952
MAKE_CASE(NVPTXISD::StoreV2)
960953
MAKE_CASE(NVPTXISD::StoreV4)
961-
MAKE_CASE(NVPTXISD::FUN_SHFL_CLAMP)
962-
MAKE_CASE(NVPTXISD::FUN_SHFR_CLAMP)
954+
MAKE_CASE(NVPTXISD::FSHL_CLAMP)
955+
MAKE_CASE(NVPTXISD::FSHR_CLAMP)
963956
MAKE_CASE(NVPTXISD::IMAD)
964957
MAKE_CASE(NVPTXISD::BFE)
965958
MAKE_CASE(NVPTXISD::BFI)
@@ -2490,8 +2483,8 @@ SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
24902483
// dLo = shf.r.clamp aLo, aHi, Amt
24912484

24922485
SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2493-
SDValue Lo = DAG.getNode(NVPTXISD::FUN_SHFR_CLAMP, dl, VT, ShOpLo, ShOpHi,
2494-
ShAmt);
2486+
SDValue Lo =
2487+
DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
24952488

24962489
SDValue Ops[2] = { Lo, Hi };
24972490
return DAG.getMergeValues(Ops, dl);
@@ -2549,8 +2542,8 @@ SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
25492542
// dHi = shf.l.clamp aLo, aHi, Amt
25502543
// dLo = aLo << Amt
25512544

2552-
SDValue Hi = DAG.getNode(NVPTXISD::FUN_SHFL_CLAMP, dl, VT, ShOpLo, ShOpHi,
2553-
ShAmt);
2545+
SDValue Hi =
2546+
DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
25542547
SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
25552548

25562549
SDValue Ops[2] = { Lo, Hi };

llvm/lib/Target/NVPTX/NVPTXISelLowering.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,8 +51,8 @@ enum NodeType : unsigned {
5151
CallSeqEnd,
5252
CallPrototype,
5353
ProxyReg,
54-
FUN_SHFL_CLAMP,
55-
FUN_SHFR_CLAMP,
54+
FSHL_CLAMP,
55+
FSHR_CLAMP,
5656
MUL_WIDE_SIGNED,
5757
MUL_WIDE_UNSIGNED,
5858
IMAD,

0 commit comments

Comments
 (0)