[NVPTX] Auto-Upgrade llvm.nvvm.atomic.load.{inc,dec}.32 #134111
Conversation
CC @gonzalobg
@llvm/pr-subscribers-llvm-selectiondag @llvm/pr-subscribers-clang-codegen

Author: Alex MacLean (AlexMaclean)

Changes

These intrinsics can be upgraded to an atomicrmw instruction.

Full diff: https://github.com/llvm/llvm-project/pull/134111.diff

9 Files Affected:
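Before the file-by-file diff, a minimal before/after sketch of the upgrade (illustrative IR only; the function and value names are placeholders, not taken from the patch):

declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32)

define i32 @before(ptr %p, i32 %n) {
  ; Old form: the target-specific intrinsic removed by this patch.
  %old = call i32 @llvm.nvvm.atomic.load.inc.32(ptr %p, i32 %n)
  ret i32 %old
}

define i32 @after(ptr %p, i32 %n) {
  ; New form emitted by the auto-upgrader: uinc_wrap stores
  ; (old u>= %n) ? 0 : old + 1 and yields the old value.
  ; NVPTX selects the 32-bit case to atom.inc.u32.
  %old = atomicrmw uinc_wrap ptr %p, i32 %n seq_cst
  ret i32 %old
}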
diff --git a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
index aaac19b229905..0f7ab9fd3b099 100644
--- a/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
@@ -481,21 +481,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
AtomicOrdering::SequentiallyConsistent);
}
- case NVPTX::BI__nvvm_atom_inc_gen_ui: {
- Value *Ptr = EmitScalarExpr(E->getArg(0));
- Value *Val = EmitScalarExpr(E->getArg(1));
- Function *FnALI32 =
- CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
- return Builder.CreateCall(FnALI32, {Ptr, Val});
- }
+ case NVPTX::BI__nvvm_atom_inc_gen_ui:
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);
- case NVPTX::BI__nvvm_atom_dec_gen_ui: {
- Value *Ptr = EmitScalarExpr(E->getArg(0));
- Value *Val = EmitScalarExpr(E->getArg(1));
- Function *FnALD32 =
- CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
- return Builder.CreateCall(FnALD32, {Ptr, Val});
- }
+ case NVPTX::BI__nvvm_atom_dec_gen_ui:
+ return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);
case NVPTX::BI__nvvm_ldg_c:
case NVPTX::BI__nvvm_ldg_sc:
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 80e10f33b770d..243d33fc5d72d 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -46,6 +46,8 @@
// * llvm.nvvm.ldg.global.f --> ibid.
// * llvm.nvvm.ldg.global.p --> ibid.
// * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
+// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
+// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
@@ -1555,14 +1557,6 @@ let TargetPrefix = "nvvm" in {
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem]>;
-// Atomics not available as llvm intrinsics.
- def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
- [llvm_anyptr_ty, llvm_i32_ty],
- [IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>;
- def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty],
- [llvm_anyptr_ty, llvm_i32_ty],
- [IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>;
-
class SCOPED_ATOMIC2_impl<LLVMType elty>
: Intrinsic<[elty],
[llvm_anyptr_ty, LLVMMatchType<0>],
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 42a5fbec95174..9c241b6c4df0f 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1825,6 +1825,8 @@ defm atomic_load_min : binary_atomic_op<atomic_load_min>;
defm atomic_load_max : binary_atomic_op<atomic_load_max>;
defm atomic_load_umin : binary_atomic_op<atomic_load_umin>;
defm atomic_load_umax : binary_atomic_op<atomic_load_umax>;
+defm atomic_load_uinc_wrap : binary_atomic_op<atomic_load_uinc_wrap>;
+defm atomic_load_udec_wrap : binary_atomic_op<atomic_load_udec_wrap>;
defm atomic_cmp_swap : ternary_atomic_op<atomic_cmp_swap>;
/// Atomic load which zeroes the excess high bits.
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 963fb1b6ad8c0..0b329d91c3c7c 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1302,6 +1302,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
else if (Name.consume_front("atomic.load.add."))
// nvvm.atomic.load.add.{f32.p,f64.p}
Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p");
+ else if (Name.consume_front("atomic.load.") && Name.consume_back(".32"))
+ // nvvm.atomic.load.{inc,dec}.32
+ Expand = Name == "inc" || Name == "dec";
else if (Name.consume_front("bitcast."))
// nvvm.bitcast.{f2i,i2f,ll2d,d2ll}
Expand =
@@ -2314,6 +2317,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
Value *Val = CI->getArgOperand(1);
Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
AtomicOrdering::SequentiallyConsistent);
+ } else if (Name.consume_front("atomic.load.") && Name.consume_back(".32")) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Val = CI->getArgOperand(1);
+ auto Op = Name == "inc" ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap;
+ Rep = Builder.CreateAtomicRMW(Op, Ptr, Val, MaybeAlign(),
+ AtomicOrdering::SequentiallyConsistent);
} else if (Name.consume_front("max.") &&
(Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
Name == "ui" || Name == "ull")) {
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index b566cdd4b6bfc..904890b01596d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4067,9 +4067,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}
- case Intrinsic::nvvm_atomic_load_inc_32:
- case Intrinsic::nvvm_atomic_load_dec_32:
-
case Intrinsic::nvvm_atomic_add_gen_f_cta:
case Intrinsic::nvvm_atomic_add_gen_f_sys:
case Intrinsic::nvvm_atomic_add_gen_i_cta:
@@ -6145,6 +6142,18 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
default:
llvm_unreachable("unsupported width encountered");
}
+ case AtomicRMWInst::BinOp::UIncWrap:
+ case AtomicRMWInst::BinOp::UDecWrap:
+ switch (ITy->getBitWidth()) {
+ case 32:
+ return AtomicExpansionKind::None;
+ case 8:
+ case 16:
+ case 64:
+ return AtomicExpansionKind::CmpXChg;
+ default:
+ llvm_unreachable("unsupported width encountered");
+ }
}
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 34cb63e44ca71..8528ff702f236 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2070,8 +2070,8 @@ defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umin_i32, "min.u3
defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umin_i64, "min.u64", [hasSM<32>]>;
// atom_inc atom_dec
-defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_inc_32, "inc.u32">;
-defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_dec_32, "dec.u32">;
+defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_uinc_wrap_i32, "inc.u32">;
+defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_udec_wrap_i32, "dec.u32">;
// atom_and
defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS<I32RT, atomic_load_and_i32, "and.b32">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index a89ca3037c7ff..b7bf6f03f8185 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -47,9 +47,6 @@ static bool readsLaneId(const IntrinsicInst *II) {
static bool isNVVMAtomic(const IntrinsicInst *II) {
switch (II->getIntrinsicID()) {
default: return false;
- case Intrinsic::nvvm_atomic_load_inc_32:
- case Intrinsic::nvvm_atomic_load_dec_32:
-
case Intrinsic::nvvm_atomic_add_gen_f_cta:
case Intrinsic::nvvm_atomic_add_gen_f_sys:
case Intrinsic::nvvm_atomic_add_gen_i_cta:
diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
index 588e79a7428a4..74b9640df6977 100644
--- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
+++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -52,6 +52,9 @@ declare i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr, i32)
declare ptr @llvm.nvvm.ldg.global.p.p0(ptr, i32)
declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32)
+declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32)
+declare i32 @llvm.nvvm.atomic.load.dec.32(ptr, i32)
+
; CHECK-LABEL: @simple_upgrade
define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -224,4 +227,15 @@ define void @ldg(ptr %p0, ptr addrspace(1) %p1) {
%v6 = call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %p0, i32 16)
ret void
-}
\ No newline at end of file
+}
+
+; CHECK-LABEL: @atomics
+define i32 @atomics(ptr %p0, i32 %a) {
+; CHECK: %1 = atomicrmw uinc_wrap ptr %p0, i32 %a seq_cst
+; CHECK: %2 = atomicrmw udec_wrap ptr %p0, i32 %a seq_cst
+
+ %r1 = call i32 @llvm.nvvm.atomic.load.inc.32(ptr %p0, i32 %a)
+ %r2 = call i32 @llvm.nvvm.atomic.load.dec.32(ptr %p0, i32 %a)
+ ret i32 %r2
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll
index e1d9aaf7cfb20..bb04aa856d656 100644
--- a/llvm/test/CodeGen/NVPTX/atomics.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics.ll
@@ -313,6 +313,38 @@ define i64 @atom19(ptr %subr, i64 %val) {
ret i64 %ret
}
+define i32 @atom20(ptr %subr, i32 %val) {
+; CHECK-LABEL: atom20(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom20_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom20_param_1];
+; CHECK-NEXT: atom.inc.u32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %ret = atomicrmw uinc_wrap ptr %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
+define i32 @atom21(ptr %subr, i32 %val) {
+; CHECK-LABEL: atom21(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [atom21_param_0];
+; CHECK-NEXT: ld.param.u32 %r1, [atom21_param_1];
+; CHECK-NEXT: atom.dec.u32 %r2, [%rd1], %r1;
+; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT: ret;
+ %ret = atomicrmw udec_wrap ptr %subr, i32 %val seq_cst
+ ret i32 %ret
+}
+
declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
; CHECK-LABEL: atomic_add_f32_generic
@@ -409,7 +441,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
; CHECK-NEXT: not.b32 %r2, %r9;
; CHECK-NEXT: ld.u32 %r16, [%rd1];
; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
-; CHECK-NEXT: $L__BB22_1: // %atomicrmw.start
+; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: shr.u32 %r10, %r16, %r1;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r10;
@@ -424,7 +456,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14;
; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16;
; CHECK-NEXT: mov.b32 %r16, %r5;
-; CHECK-NEXT: @%p1 bra $L__BB22_1;
+; CHECK-NEXT: @%p1 bra $L__BB24_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: shr.u32 %r15, %r5, %r1;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r15;
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 887645d to 48cd911.
LGTM! Comments are just questions.
@@ -2070,8 +2070,8 @@ defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umin_i32, "min.u3
defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umin_i64, "min.u64", [hasSM<32>]>;

// atom_inc atom_dec
We don't have a PTX test to prove that the final lowering to PTX will be the same, but this looks good enough.
I think it makes sense to test the auto-upgrade rules and test the lowering of the current syntax but not to maintain lowering tests using out-of-date syntax.
Value *Val = CI->getArgOperand(1);
auto Op = Name == "inc" ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap;
Rep = Builder.CreateAtomicRMW(Op, Ptr, Val, MaybeAlign(),
                              AtomicOrdering::SequentiallyConsistent);
The intrinsic does not mention anything about ordering, but I suppose seq_cst is the safest?
Yea, I do wonder about this as well. This is consistent with how the rest of the atomicrmw instructions are lowered, but I'm not sure it's correct. We're lowering a seq_cst atomicrmw without a syncscope to the PTX atom instruction, which has implied .relaxed ordering and .gpu scope.
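To illustrate the mismatch being described (the PTX in the comments reflects this PR's test output; the ordering/scope notes restate the discussion above):

define i32 @inc(ptr %p, i32 %n) {
  ; The IR requests seq_cst at the default (system) scope...
  %old = atomicrmw uinc_wrap ptr %p, i32 %n seq_cst
  ; ...but the selected PTX is plain "atom.inc.u32 %r2, [%rd1], %r1;",
  ; which carries implied .relaxed ordering and .gpu scope.
  ret i32 %old
}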
For historical reasons, frontends generate seq_cst and "system scope", and we (LLVM and NVVM) lower that to relaxed + gpu scope. We should definitely prioritize fixing these bugs by lowering these properly (there are also a few other related bugs that we need to fix). However, these fixes will break all the frontends, so we need to give some thought to how to approach this to minimize churn. Therefore, I don't know if this PR is the right place to make this fix (this has been broken for a long time).
But we are lowering cmpxchg with seq_cst correctly by using fence.sc and atom.acq_rel.cas. @gonzalobg - will this also break frontends?
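For reference, a minimal sketch of the cmpxchg case mentioned here (the PTX mnemonics in the comments restate the sequence claimed above; they are not verified compiler output):

define i32 @cas(ptr %p, i32 %cmp, i32 %new) {
  ; Per the comment above, a seq_cst cmpxchg lowers to:
  ;   fence.sc;
  ;   atom.acq_rel.cas.b32 ...;
  %pair = cmpxchg ptr %p, i32 %cmp, i32 %new seq_cst seq_cst
  %old = extractvalue { i32, i1 } %pair, 0
  ret i32 %old
}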
Okay, sounds like there is a larger issue to address around the scope and semantics of atomics in NVPTX. This change maintains consistency with all other atomicrmw instructions, and I think the larger bug can be addressed separately.
Force-pushed from 48cd911 to 46de785.
These intrinsics can be upgraded to an atomicrmw instruction.