
[NVPTX] Auto-Upgrade llvm.nvvm.atomic.load.{inc,dec}.32 #134111

Merged
18 changes: 4 additions & 14 deletions clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
@@ -481,21 +481,11 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
AtomicOrdering::SequentiallyConsistent);
}

-  case NVPTX::BI__nvvm_atom_inc_gen_ui: {
-    Value *Ptr = EmitScalarExpr(E->getArg(0));
-    Value *Val = EmitScalarExpr(E->getArg(1));
-    Function *FnALI32 =
-        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_inc_32, Ptr->getType());
-    return Builder.CreateCall(FnALI32, {Ptr, Val});
-  }
+  case NVPTX::BI__nvvm_atom_inc_gen_ui:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UIncWrap, E);

-  case NVPTX::BI__nvvm_atom_dec_gen_ui: {
-    Value *Ptr = EmitScalarExpr(E->getArg(0));
-    Value *Val = EmitScalarExpr(E->getArg(1));
-    Function *FnALD32 =
-        CGM.getIntrinsic(Intrinsic::nvvm_atomic_load_dec_32, Ptr->getType());
-    return Builder.CreateCall(FnALD32, {Ptr, Val});
-  }
+  case NVPTX::BI__nvvm_atom_dec_gen_ui:
+    return MakeBinaryAtomicValue(*this, llvm::AtomicRMWInst::UDecWrap, E);

case NVPTX::BI__nvvm_ldg_c:
case NVPTX::BI__nvvm_ldg_sc:
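Aside on the replacement: `MakeBinaryAtomicValue` ultimately boils down to a single `CreateAtomicRMW` call on the builder, the same call the AutoUpgrade path below makes. A minimal sketch of the operation now emitted for `__nvvm_atom_inc_gen_ui`, assuming an already-constructed `IRBuilder` (the free-standing helper name is illustrative, not a clang API; the real helper also emits the arguments and casts the result):

```cpp
#include "llvm/IR/IRBuilder.h"

// Sketch: the atomicrmw that clang now emits for __nvvm_atom_inc_gen_ui.
// Textually: atomicrmw uinc_wrap ptr %Ptr, i32 %Val seq_cst
llvm::Value *emitUIncWrap(llvm::IRBuilder<> &Builder, llvm::Value *Ptr,
                          llvm::Value *Val) {
  return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::UIncWrap, Ptr, Val,
                                 llvm::MaybeAlign(),
                                 llvm::AtomicOrdering::SequentiallyConsistent);
}
```

The seq_cst ordering matches what the updated tests below check for, and comes up again in the review discussion.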
4 changes: 2 additions & 2 deletions clang/test/CodeGen/builtins-nvptx.c
@@ -333,10 +333,10 @@ __device__ void nvvm_atom(float *fp, float f, double *dfp, double df,
   // CHECK: atomicrmw fadd ptr {{.*}} seq_cst, align 4
   __nvvm_atom_add_gen_f(fp, f);

-  // CHECK: call i32 @llvm.nvvm.atomic.load.inc.32.p0
+  // CHECK: atomicrmw uinc_wrap ptr {{.*}} seq_cst, align 4
   __nvvm_atom_inc_gen_ui(uip, ui);

-  // CHECK: call i32 @llvm.nvvm.atomic.load.dec.32.p0
+  // CHECK: atomicrmw udec_wrap ptr {{.*}} seq_cst, align 4
   __nvvm_atom_dec_gen_ui(uip, ui);

10 changes: 2 additions & 8 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -124,6 +124,8 @@
 // * llvm.nvvm.ldg.global.f --> ibid.
 // * llvm.nvvm.ldg.global.p --> ibid.
 // * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
+// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
+// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap

 def llvm_global_ptr_ty : LLVMQualPointerType<1>;  // (global)ptr
 def llvm_shared_ptr_ty : LLVMQualPointerType<3>;  // (shared)ptr
@@ -1633,14 +1635,6 @@ let TargetPrefix = "nvvm" in {
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
                             [IntrNoMem]>;

-  // Atomics not available as llvm intrinsics.
-  def int_nvvm_atomic_load_inc_32 : Intrinsic<[llvm_i32_ty],
-      [llvm_anyptr_ty, llvm_i32_ty],
-      [IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>;
-  def int_nvvm_atomic_load_dec_32 : Intrinsic<[llvm_i32_ty],
-      [llvm_anyptr_ty, llvm_i32_ty],
-      [IntrArgMemOnly, IntrNoCallback, NoCapture<ArgIndex<0>>]>;
-
   class SCOPED_ATOMIC2_impl<LLVMType elty>
       : Intrinsic<[elty],
           [llvm_anyptr_ty, LLVMMatchType<0>],
2 changes: 2 additions & 0 deletions llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -1825,6 +1825,8 @@ defm atomic_load_min : binary_atomic_op<atomic_load_min>;
defm atomic_load_max : binary_atomic_op<atomic_load_max>;
defm atomic_load_umin : binary_atomic_op<atomic_load_umin>;
defm atomic_load_umax : binary_atomic_op<atomic_load_umax>;
+defm atomic_load_uinc_wrap : binary_atomic_op<atomic_load_uinc_wrap>;
+defm atomic_load_udec_wrap : binary_atomic_op<atomic_load_udec_wrap>;
defm atomic_cmp_swap : ternary_atomic_op<atomic_cmp_swap>;

/// Atomic load which zeroes the excess high bits.
9 changes: 9 additions & 0 deletions llvm/lib/IR/AutoUpgrade.cpp
@@ -1302,6 +1302,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
else if (Name.consume_front("atomic.load.add."))
// nvvm.atomic.load.add.{f32.p,f64.p}
Expand = Name.starts_with("f32.p") || Name.starts_with("f64.p");
+    else if (Name.consume_front("atomic.load.") && Name.consume_back(".32"))
+      // nvvm.atomic.load.{inc,dec}.32
+      Expand = Name == "inc" || Name == "dec";
else if (Name.consume_front("bitcast."))
// nvvm.bitcast.{f2i,i2f,ll2d,d2ll}
Expand =
@@ -2314,6 +2317,12 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
Value *Val = CI->getArgOperand(1);
Rep = Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Ptr, Val, MaybeAlign(),
AtomicOrdering::SequentiallyConsistent);
+  } else if (Name.consume_front("atomic.load.") && Name.consume_back(".32")) {
+    Value *Ptr = CI->getArgOperand(0);
+    Value *Val = CI->getArgOperand(1);
+    auto Op = Name == "inc" ? AtomicRMWInst::UIncWrap : AtomicRMWInst::UDecWrap;
+    Rep = Builder.CreateAtomicRMW(Op, Ptr, Val, MaybeAlign(),
+                                  AtomicOrdering::SequentiallyConsistent);
Contributor:
The intrinsic does not mention anything about ordering, but I suppose seq_cst is the safest?

Member Author:
Yeah, I do wonder about this as well. This is consistent with how the rest of the atomicrmw instructions are lowered, but I'm not sure it's correct: we're lowering a seq_cst atomicrmw without a syncscope to the PTX atom instruction, which has implied .relaxed ordering and .gpu scope.

Contributor:
For historical reasons, frontends generate seq_cst and "system scope", and we (LLVM and NVVM) lower that to relaxed + gpu scope. We should definitely prioritize fixing these bugs by lowering these properly (there are also a few other related bugs that we need to fix).

However, these fixes will break all the frontends, so we need to give some thought to how to approach this to minimize churn.

Therefore, I don't know whether this PR is the right place to make that fix (this has been broken for a long time).

Contributor:
But we are lowering cmpxchg with seq_cst correctly by using fence.sc and atom.acq_rel.cas. @gonzalobg - will this also break frontends?

Member Author:
Okay, sounds like there is a larger issue to address around the scope and semantics of atomics in NVPTX. This change maintains consistency with all other atomicrmw instructions and I think the larger bug can be addressed separately.

} else if (Name.consume_front("max.") &&
(Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
Name == "ui" || Name == "ull")) {
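The upgrade is a drop-in replacement because the LangRef definitions of `uinc_wrap`/`udec_wrap` compute the same stored value as PTX `atom.inc`/`atom.dec`. A plain C++ restatement of those definitions (helper names are illustrative):

```cpp
#include <cstdint>

// Value stored back by `atomicrmw uinc_wrap`: count up, wrapping to 0
// once the old value reaches the operand (the PTX atom.inc.u32 rule).
uint32_t uincWrap(uint32_t Old, uint32_t Val) {
  return Old >= Val ? 0 : Old + 1;
}

// Value stored back by `atomicrmw udec_wrap`: count down, wrapping to the
// operand at 0 or when already above it (the PTX atom.dec.u32 rule).
uint32_t udecWrap(uint32_t Old, uint32_t Val) {
  return (Old == 0 || Old > Val) ? Val : Old - 1;
}
```

In both cases the `atomicrmw` instruction itself yields the old value; the functions above describe what gets written back.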
15 changes: 12 additions & 3 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -4067,9 +4067,6 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}

-  case Intrinsic::nvvm_atomic_load_inc_32:
-  case Intrinsic::nvvm_atomic_load_dec_32:
-
case Intrinsic::nvvm_atomic_add_gen_f_cta:
case Intrinsic::nvvm_atomic_add_gen_f_sys:
case Intrinsic::nvvm_atomic_add_gen_i_cta:
@@ -6145,6 +6142,18 @@ NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
default:
llvm_unreachable("unsupported width encountered");
}
+  case AtomicRMWInst::BinOp::UIncWrap:
+  case AtomicRMWInst::BinOp::UDecWrap:
+    switch (ITy->getBitWidth()) {
+    case 32:
+      return AtomicExpansionKind::None;
+    case 8:
+    case 16:
+    case 64:
+      return AtomicExpansionKind::CmpXChg;
+    default:
+      llvm_unreachable("unsupported width encountered");
+    }
}

return AtomicExpansionKind::CmpXChg;
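For the widths mapped to `AtomicExpansionKind::CmpXChg` (8, 16, and 64 bits), the generic AtomicExpand pass rewrites the RMW into a compare-exchange retry loop. A sketch of the resulting computation for a 64-bit `uinc_wrap`, written over `std::atomic` rather than the IR the pass actually emits:

```cpp
#include <atomic>
#include <cstdint>

// CAS-loop expansion of a 64-bit uinc_wrap, sketched in plain C++.
uint64_t uincWrap64(std::atomic<uint64_t> &Loc, uint64_t Val) {
  uint64_t Old = Loc.load();
  for (;;) {
    uint64_t New = Old >= Val ? 0 : Old + 1; // same wrap rule as 32-bit
    // On failure, compare_exchange_weak reloads Old, so we simply retry.
    if (Loc.compare_exchange_weak(Old, New))
      return Old; // atomicrmw yields the previous value
  }
}
```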
4 changes: 2 additions & 2 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2070,8 +2070,8 @@ defm INT_PTX_ATOMIC_UMIN_32 : F_ATOMIC_2_AS<I32RT, atomic_load_umin_i32, "min.u3
defm INT_PTX_ATOMIC_UMIN_64 : F_ATOMIC_2_AS<I64RT, atomic_load_umin_i64, "min.u64", [hasSM<32>]>;

// atom_inc atom_dec
Contributor:
We don't have a PTX test to prove that the final lowering to PTX will be the same, but this looks good enough.

Member Author:
I think it makes sense to test the auto-upgrade rules and the lowering of the current syntax, but not to maintain lowering tests that use out-of-date syntax.

-defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_inc_32, "inc.u32">;
-defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, int_nvvm_atomic_load_dec_32, "dec.u32">;
+defm INT_PTX_ATOM_INC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_uinc_wrap_i32, "inc.u32">;
+defm INT_PTX_ATOM_DEC_32 : F_ATOMIC_2_AS<I32RT, atomic_load_udec_wrap_i32, "dec.u32">;

// atom_and
defm INT_PTX_ATOM_AND_32 : F_ATOMIC_2_AS<I32RT, atomic_load_and_i32, "and.b32">;
52 changes: 25 additions & 27 deletions llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -46,33 +46,31 @@ static bool readsLaneId(const IntrinsicInst *II) {
// Whether the given intrinsic is an atomic instruction in PTX.
static bool isNVVMAtomic(const IntrinsicInst *II) {
switch (II->getIntrinsicID()) {
-  default: return false;
-  case Intrinsic::nvvm_atomic_load_inc_32:
-  case Intrinsic::nvvm_atomic_load_dec_32:
-
-  case Intrinsic::nvvm_atomic_add_gen_f_cta:
-  case Intrinsic::nvvm_atomic_add_gen_f_sys:
-  case Intrinsic::nvvm_atomic_add_gen_i_cta:
-  case Intrinsic::nvvm_atomic_add_gen_i_sys:
-  case Intrinsic::nvvm_atomic_and_gen_i_cta:
-  case Intrinsic::nvvm_atomic_and_gen_i_sys:
-  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
-  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
-  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
-  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
-  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
-  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
-  case Intrinsic::nvvm_atomic_max_gen_i_cta:
-  case Intrinsic::nvvm_atomic_max_gen_i_sys:
-  case Intrinsic::nvvm_atomic_min_gen_i_cta:
-  case Intrinsic::nvvm_atomic_min_gen_i_sys:
-  case Intrinsic::nvvm_atomic_or_gen_i_cta:
-  case Intrinsic::nvvm_atomic_or_gen_i_sys:
-  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
-  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
-  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
-  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
-    return true;
+  default:
+    return false;
+  case Intrinsic::nvvm_atomic_add_gen_f_cta:
+  case Intrinsic::nvvm_atomic_add_gen_f_sys:
+  case Intrinsic::nvvm_atomic_add_gen_i_cta:
+  case Intrinsic::nvvm_atomic_add_gen_i_sys:
+  case Intrinsic::nvvm_atomic_and_gen_i_cta:
+  case Intrinsic::nvvm_atomic_and_gen_i_sys:
+  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
+  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
+  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
+  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
+  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
+  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
+  case Intrinsic::nvvm_atomic_max_gen_i_cta:
+  case Intrinsic::nvvm_atomic_max_gen_i_sys:
+  case Intrinsic::nvvm_atomic_min_gen_i_cta:
+  case Intrinsic::nvvm_atomic_min_gen_i_sys:
+  case Intrinsic::nvvm_atomic_or_gen_i_cta:
+  case Intrinsic::nvvm_atomic_or_gen_i_sys:
+  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
+  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
+  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
+  case Intrinsic::nvvm_atomic_xor_gen_i_sys:
+    return true;
}
}

16 changes: 15 additions & 1 deletion llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -52,6 +52,9 @@ declare i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr, i32)
declare ptr @llvm.nvvm.ldg.global.p.p0(ptr, i32)
declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32)

+declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32)
+declare i32 @llvm.nvvm.atomic.load.dec.32(ptr, i32)
+
; CHECK-LABEL: @simple_upgrade
define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -224,4 +227,15 @@ define void @ldg(ptr %p0, ptr addrspace(1) %p1) {
%v6 = call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %p0, i32 16)

ret void
-}
+}
+
+; CHECK-LABEL: @atomics
+define i32 @atomics(ptr %p0, i32 %a) {
+; CHECK: %1 = atomicrmw uinc_wrap ptr %p0, i32 %a seq_cst
+; CHECK: %2 = atomicrmw udec_wrap ptr %p0, i32 %a seq_cst
+
+  %r1 = call i32 @llvm.nvvm.atomic.load.inc.32(ptr %p0, i32 %a)
+  %r2 = call i32 @llvm.nvvm.atomic.load.dec.32(ptr %p0, i32 %a)
+  ret i32 %r2
+}
36 changes: 34 additions & 2 deletions llvm/test/CodeGen/NVPTX/atomics.ll
@@ -313,6 +313,38 @@ define i64 @atom19(ptr %subr, i64 %val) {
ret i64 %ret
}

+define i32 @atom20(ptr %subr, i32 %val) {
+; CHECK-LABEL: atom20(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [atom20_param_0];
+; CHECK-NEXT:    ld.param.u32 %r1, [atom20_param_1];
+; CHECK-NEXT:    atom.inc.u32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %ret = atomicrmw uinc_wrap ptr %subr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i32 @atom21(ptr %subr, i32 %val) {
+; CHECK-LABEL: atom21(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [atom21_param_0];
+; CHECK-NEXT:    ld.param.u32 %r1, [atom21_param_1];
+; CHECK-NEXT:    atom.dec.u32 %r2, [%rd1], %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r2;
+; CHECK-NEXT:    ret;
+  %ret = atomicrmw udec_wrap ptr %subr, i32 %val seq_cst
+  ret i32 %ret
+}
+
declare float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)

; CHECK-LABEL: atomic_add_f32_generic
@@ -409,7 +441,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
; CHECK-NEXT: not.b32 %r2, %r9;
; CHECK-NEXT: ld.u32 %r16, [%rd1];
; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
-; CHECK-NEXT:  $L__BB22_1: // %atomicrmw.start
+; CHECK-NEXT:  $L__BB24_1: // %atomicrmw.start
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: shr.u32 %r10, %r16, %r1;
; CHECK-NEXT: cvt.u16.u32 %rs2, %r10;
@@ -424,7 +456,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
; CHECK-NEXT: atom.cas.b32 %r5, [%rd1], %r16, %r14;
; CHECK-NEXT: setp.ne.s32 %p1, %r5, %r16;
; CHECK-NEXT: mov.b32 %r16, %r5;
-; CHECK-NEXT:    @%p1 bra $L__BB22_1;
+; CHECK-NEXT:    @%p1 bra $L__BB24_1;
; CHECK-NEXT: // %bb.2: // %atomicrmw.end
; CHECK-NEXT: shr.u32 %r15, %r5, %r1;
; CHECK-NEXT: cvt.u16.u32 %rs4, %r15;