Skip to content

Reland "[NVPTX] Unify and extend barrier{.cta} intrinsic support" #141143

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,22 @@ Value *CodeGenFunction::EmitNVPTXBuiltinExpr(unsigned BuiltinID,
case NVPTX::BI__nvvm_fence_sc_cluster:
return Builder.CreateCall(
CGM.getIntrinsic(Intrinsic::nvvm_fence_sc_cluster));
case NVPTX::BI__nvvm_bar_sync:
return Builder.CreateCall(
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
EmitScalarExpr(E->getArg(0)));
case NVPTX::BI__syncthreads:
return Builder.CreateCall(
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all),
Builder.getInt32(0));
case NVPTX::BI__nvvm_barrier_sync:
return Builder.CreateCall(
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all),
EmitScalarExpr(E->getArg(0)));
case NVPTX::BI__nvvm_barrier_sync_cnt:
return Builder.CreateCall(
CGM.getIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count),
{EmitScalarExpr(E->getArg(0)), EmitScalarExpr(E->getArg(1))});
default:
return nullptr;
}
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGen/builtins-nvptx-ptx60.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ __device__ void nvvm_sync(unsigned mask, int i, float f, int a, int b,
// CHECK: call void @llvm.nvvm.bar.warp.sync(i32
// expected-error@+1 {{'__nvvm_bar_warp_sync' needs target feature ptx60}}
__nvvm_bar_warp_sync(mask);
// CHECK: call void @llvm.nvvm.barrier.sync(i32
// CHECK: call void @llvm.nvvm.barrier.cta.sync.all(i32
// expected-error@+1 {{'__nvvm_barrier_sync' needs target feature ptx60}}
__nvvm_barrier_sync(mask);
// CHECK: call void @llvm.nvvm.barrier.sync.cnt(i32
// CHECK: call void @llvm.nvvm.barrier.cta.sync.count(i32
// expected-error@+1 {{'__nvvm_barrier_sync_cnt' needs target feature ptx60}}
__nvvm_barrier_sync_cnt(mask, i);

Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGen/builtins-nvptx.c
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ __device__ int read_pms() {

__device__ void sync() {

// CHECK: call void @llvm.nvvm.bar.sync(i32 0)
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)

__nvvm_bar_sync(0);

Expand Down Expand Up @@ -259,7 +259,7 @@ __device__ void nvvm_math(float f1, float f2, double d1, double d2) {
__nvvm_membar_gl();
// CHECK: call void @llvm.nvvm.membar.sys()
__nvvm_membar_sys();
// CHECK: call void @llvm.nvvm.barrier0()
// CHECK: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
__syncthreads();
}

Expand Down
2 changes: 1 addition & 1 deletion clang/test/Headers/gpuintrin.c
Original file line number Diff line number Diff line change
Expand Up @@ -887,7 +887,7 @@ __gpu_kernel void foo() {
// NVPTX-LABEL: define internal void @__gpu_sync_threads(
// NVPTX-SAME: ) #[[ATTR0]] {
// NVPTX-NEXT: [[ENTRY:.*:]]
// NVPTX-NEXT: call void @llvm.nvvm.barrier0()
// NVPTX-NEXT: call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
// NVPTX-NEXT: ret void
//
//
Expand Down
48 changes: 43 additions & 5 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -199,21 +199,59 @@ map in the following way to CUDA builtins:
Barriers
--------

'``llvm.nvvm.barrier0``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^
'``llvm.nvvm.barrier.cta.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare void @llvm.nvvm.barrier0()
declare void @llvm.nvvm.barrier.cta.sync.count(i32 %id, i32 %n)
declare void @llvm.nvvm.barrier.cta.sync.all(i32 %id)
declare void @llvm.nvvm.barrier.cta.arrive.count(i32 %id, i32 %n)

declare void @llvm.nvvm.barrier.cta.sync.aligned.count(i32 %id, i32 %n)
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 %id)
declare void @llvm.nvvm.barrier.cta.arrive.aligned.count(i32 %id, i32 %n)

Overview:
"""""""""

The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
The '``@llvm.nvvm.barrier.cta.*``' family of intrinsics performs barrier
synchronization and communication among the threads within a CTA.

Semantics:
""""""""""

Operand %id specifies a logical barrier resource and must fall within the range
0 through 15. When present, operand %n specifies the number of threads
participating in the barrier. When specifying a thread count, the value must be
a multiple of the warp size. With the '``@llvm.nvvm.barrier.cta.sync.*``'
variants, the '``.all``' suffix indicates that all threads in the CTA should
participate in the barrier while the '``.count``' suffix indicates that only
the threads specified by the %n operand should participate in the barrier.

All forms of the '``@llvm.nvvm.barrier.cta.*``' intrinsics cause the executing
thread to wait for all non-exited threads from its warp and then mark the
warp's arrival at the barrier. In addition to signaling its arrival at the
barrier, the '``@llvm.nvvm.barrier.cta.sync.*``' intrinsics cause the executing
thread to wait for non-exited threads of all other warps participating in the
barrier to arrive. On the other hand, the '``@llvm.nvvm.barrier.cta.arrive.*``'
intrinsics do not cause the executing thread to wait for threads of other
participating warps.

When a barrier completes, the waiting threads are restarted without delay,
and the barrier is reinitialized so that it can be immediately reused.

The '``@llvm.nvvm.barrier.cta.*``' intrinsics have an optional '``.aligned``'
modifier to indicate textual alignment of the barrier. When specified, it
indicates that all threads in the CTA will execute the same
'``@llvm.nvvm.barrier.cta.*``' instruction. In conditionally executed code, an
aligned '``@llvm.nvvm.barrier.cta.*``' instruction should only be used if it is
known that all threads in the CTA evaluate the condition identically, otherwise
behavior is undefined.

Electing a thread
-----------------
Expand Down
37 changes: 18 additions & 19 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,12 @@
// * llvm.nvvm.swap.lo.hi.b64 --> llvm.fshl(x, x, 32)
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap
// * llvm.nvvm.barrier0 --> llvm.nvvm.barrier.cta.sync.aligned.all(0)
// * llvm.nvvm.barrier.n --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
// * llvm.nvvm.bar.sync --> llvm.nvvm.barrier.cta.sync.aligned.all(x)
// * llvm.nvvm.barrier --> llvm.nvvm.barrier.cta.sync.aligned.count(x, y)
// * llvm.nvvm.barrier.sync --> llvm.nvvm.barrier.cta.sync.all(x)
// * llvm.nvvm.barrier.sync.cnt --> llvm.nvvm.barrier.cta.sync.count(x, y)

def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
Expand Down Expand Up @@ -1278,35 +1284,28 @@ let TargetPrefix = "nvvm" in {
defm int_nvvm_atomic_cas_gen_i : PTXAtomicWithScope3<llvm_anyint_ty>;

// Bar.Sync

// The builtin for "bar.sync 0" is called __syncthreads. Unlike most of the
// intrinsics in this file, this one is a user-facing API.
def int_nvvm_barrier0 : ClangBuiltin<"__syncthreads">,
Intrinsic<[], [], [IntrConvergent, IntrNoCallback]>;
// Synchronize all threads in the CTA at barrier 'n'.
def int_nvvm_barrier_n : ClangBuiltin<"__nvvm_bar_n">,
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
// Synchronize 'm', a multiple of warp size, (arg 2) threads in
// the CTA at barrier 'n' (arg 1).
def int_nvvm_barrier : ClangBuiltin<"__nvvm_bar">,
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
def int_nvvm_barrier0_popc : ClangBuiltin<"__nvvm_bar0_popc">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
def int_nvvm_barrier0_and : ClangBuiltin<"__nvvm_bar0_and">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
def int_nvvm_barrier0_or : ClangBuiltin<"__nvvm_bar0_or">,
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;

def int_nvvm_bar_sync : NVVMBuiltin,
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
def int_nvvm_bar_warp_sync : NVVMBuiltin,
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;

// barrier.sync id[, cnt]
def int_nvvm_barrier_sync : NVVMBuiltin,
Intrinsic<[], [llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
def int_nvvm_barrier_sync_cnt : NVVMBuiltin,
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent, IntrNoCallback]>;
// barrier{.cta}.sync{.aligned} a{, b};
// barrier{.cta}.arrive{.aligned} a, b;
let IntrProperties = [IntrConvergent, IntrNoCallback] in {
foreach align = ["", "_aligned"] in {
def int_nvvm_barrier_cta_sync # align # _all :
Intrinsic<[], [llvm_i32_ty]>;
def int_nvvm_barrier_cta_sync # align # _count :
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
def int_nvvm_barrier_cta_arrive # align # _count :
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty]>;
}
}

// barrier.cluster.[wait, arrive, arrive.relaxed]
def int_nvvm_barrier_cluster_arrive :
Expand Down
35 changes: 29 additions & 6 deletions llvm/lib/IR/AutoUpgrade.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1343,12 +1343,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
// nvvm.abs.{i,ii}
Expand =
Name == "i" || Name == "ll" || Name == "bf16" || Name == "bf16x2";
else if (Name == "fabs.f" || Name == "fabs.ftz.f" || Name == "fabs.d")
else if (Name.consume_front("fabs."))
// nvvm.fabs.{f,ftz.f,d}
Expand = true;
else if (Name == "clz.ll" || Name == "popc.ll" || Name == "h2f" ||
Name == "swap.lo.hi.b64")
Expand = true;
Expand = Name == "f" || Name == "ftz.f" || Name == "d";
else if (Name.consume_front("max.") || Name.consume_front("min."))
// nvvm.{min,max}.{i,ii,ui,ull}
Expand = Name == "s" || Name == "i" || Name == "ll" || Name == "us" ||
Expand Down Expand Up @@ -1380,7 +1377,18 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
Expand = (Name.starts_with("i.") || Name.starts_with("f.") ||
Name.starts_with("p."));
else
Expand = false;
Expand = StringSwitch<bool>(Name)
.Case("barrier0", true)
.Case("barrier.n", true)
.Case("barrier.sync.cnt", true)
.Case("barrier.sync", true)
.Case("barrier", true)
.Case("bar.sync", true)
.Case("clz.ll", true)
.Case("popc.ll", true)
.Case("h2f", true)
.Case("swap.lo.hi.b64", true)
.Default(false);

if (Expand) {
NewFn = nullptr;
Expand Down Expand Up @@ -2478,6 +2486,21 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI,
MDNode *MD = MDNode::get(Builder.getContext(), {});
LD->setMetadata(LLVMContext::MD_invariant_load, MD);
return LD;
} else if (Name == "barrier0" || Name == "barrier.n" || Name == "bar.sync") {
Value *Arg =
Name.ends_with('0') ? Builder.getInt32(0) : CI->getArgOperand(0);
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_aligned_all,
{}, {Arg});
} else if (Name == "barrier") {
Rep = Builder.CreateIntrinsic(
Intrinsic::nvvm_barrier_cta_sync_aligned_count, {},
{CI->getArgOperand(0), CI->getArgOperand(1)});
} else if (Name == "barrier.sync") {
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_all, {},
{CI->getArgOperand(0)});
} else if (Name == "barrier.sync.cnt") {
Rep = Builder.CreateIntrinsic(Intrinsic::nvvm_barrier_cta_sync_count, {},
{CI->getArgOperand(0), CI->getArgOperand(1)});
} else {
Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name);
if (IID != Intrinsic::not_intrinsic &&
Expand Down
71 changes: 37 additions & 34 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -67,15 +67,6 @@ class THREADMASK_INFO<bit sync> {
// Synchronization and shuffle functions
//-----------------------------------
let isConvergent = true in {
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
"bar.sync \t0;",
[(int_nvvm_barrier0)]>;
def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
"bar.sync \t$src1;",
[(int_nvvm_barrier_n i32:$src1)]>;
def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
"bar.sync \t$src1, $src2;",
[(int_nvvm_barrier i32:$src1, i32:$src2)]>;
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
!strconcat("{{ \n\t",
".reg .pred \t%p1; \n\t",
Expand All @@ -102,39 +93,51 @@ def INT_BARRIER0_OR : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
"}}"),
[(set i32:$dst, (int_nvvm_barrier0_or i32:$pred))]>;

def INT_BAR_SYNC : NVPTXInst<(outs), (ins i32imm:$i), "bar.sync \t$i;",
[(int_nvvm_bar_sync imm:$i)]>;

def INT_BAR_WARP_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "bar.warp.sync \t$i;",
[(int_nvvm_bar_warp_sync imm:$i)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
def INT_BAR_WARP_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "bar.warp.sync \t$i;",
[(int_nvvm_bar_warp_sync i32:$i)]>,
Requires<[hasPTX<60>, hasSM<30>]>;

def INT_BARRIER_SYNC_I : NVPTXInst<(outs), (ins i32imm:$i), "barrier.sync \t$i;",
[(int_nvvm_barrier_sync imm:$i)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
def INT_BARRIER_SYNC_R : NVPTXInst<(outs), (ins Int32Regs:$i), "barrier.sync \t$i;",
[(int_nvvm_barrier_sync i32:$i)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
multiclass BARRIER1<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
def _i : BasicNVPTXInst<(outs), (ins i32imm:$i), asmstr,
[(intrinsic imm:$i)]>,
Requires<requires>;

def INT_BARRIER_SYNC_CNT_RR : NVPTXInst<(outs), (ins Int32Regs:$id, Int32Regs:$cnt),
"barrier.sync \t$id, $cnt;",
[(int_nvvm_barrier_sync_cnt i32:$id, i32:$cnt)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
def INT_BARRIER_SYNC_CNT_RI : NVPTXInst<(outs), (ins Int32Regs:$id, i32imm:$cnt),
"barrier.sync \t$id, $cnt;",
[(int_nvvm_barrier_sync_cnt i32:$id, imm:$cnt)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
def INT_BARRIER_SYNC_CNT_IR : NVPTXInst<(outs), (ins i32imm:$id, Int32Regs:$cnt),
"barrier.sync \t$id, $cnt;",
[(int_nvvm_barrier_sync_cnt imm:$id, i32:$cnt)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
def INT_BARRIER_SYNC_CNT_II : NVPTXInst<(outs), (ins i32imm:$id, i32imm:$cnt),
"barrier.sync \t$id, $cnt;",
[(int_nvvm_barrier_sync_cnt imm:$id, imm:$cnt)]>,
Requires<[hasPTX<60>, hasSM<30>]>;
def _r : BasicNVPTXInst<(outs), (ins Int32Regs:$i), asmstr,
[(intrinsic i32:$i)]>,
Requires<requires>;
}

// BARRIER2: instantiates the four register/immediate operand combinations
// (_rr, _ri, _ir, _ii) of a two-operand barrier instruction. Per the defm
// uses below, the first operand is the barrier id and the second is the
// participating-thread count. Optional predicates (e.g. required PTX
// version) gate every variant uniformly.
multiclass BARRIER2<string asmstr, Intrinsic intrinsic, list<Predicate> requires = []> {
// Both operands in registers.
def _rr : BasicNVPTXInst<(outs), (ins Int32Regs:$i, Int32Regs:$j), asmstr,
[(intrinsic i32:$i, i32:$j)]>,
Requires<requires>;

// Register id, immediate count.
def _ri : BasicNVPTXInst<(outs), (ins Int32Regs:$i, i32imm:$j), asmstr,
[(intrinsic i32:$i, imm:$j)]>,
Requires<requires>;

// Immediate id, register count.
def _ir : BasicNVPTXInst<(outs), (ins i32imm:$i, Int32Regs:$j), asmstr,
[(intrinsic imm:$i, i32:$j)]>,
Requires<requires>;

// Both operands immediate.
def _ii : BasicNVPTXInst<(outs), (ins i32imm:$i, i32imm:$j), asmstr,
[(intrinsic imm:$i, imm:$j)]>,
Requires<requires>;
}

// Note the "bar.sync" variants could be renamed to the equivalent corresponding
// "barrier.*.aligned" variants. We use the older syntax for compatibility with
// older versions of the PTX ISA.
defm BARRIER_CTA_SYNC_ALIGNED_ALL : BARRIER1<"bar.sync", int_nvvm_barrier_cta_sync_aligned_all>;
defm BARRIER_CTA_SYNC_ALIGNED : BARRIER2<"bar.sync", int_nvvm_barrier_cta_sync_aligned_count>;
defm BARRIER_CTA_ARRIVE_ALIGNED : BARRIER2<"bar.arrive", int_nvvm_barrier_cta_arrive_aligned_count>;

defm BARRIER_CTA_SYNC_ALL : BARRIER1<"barrier.sync", int_nvvm_barrier_cta_sync_all, [hasPTX<60>]>;
defm BARRIER_CTA_SYNC : BARRIER2<"barrier.sync", int_nvvm_barrier_cta_sync_count, [hasPTX<60>]>;
defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive_count, [hasPTX<60>]>;

class INT_BARRIER_CLUSTER<string variant, Intrinsic Intr,
list<Predicate> Preds = [hasPTX<78>, hasSM<90>]>:
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/IPO/AttributorAttributes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2150,7 +2150,8 @@ struct AANoUnwindCallSite final

bool AANoSync::isAlignedBarrier(const CallBase &CB, bool ExecutedAligned) {
switch (CB.getIntrinsicID()) {
case Intrinsic::nvvm_barrier0:
case Intrinsic::nvvm_barrier_cta_sync_aligned_all:
case Intrinsic::nvvm_barrier_cta_sync_aligned_count:
case Intrinsic::nvvm_barrier0_and:
case Intrinsic::nvvm_barrier0_or:
case Intrinsic::nvvm_barrier0_popc:
Expand Down
19 changes: 3 additions & 16 deletions llvm/test/Analysis/GlobalsModRef/functions_without_nosync.ll
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,15 @@ target triple = "nvptx64-nvidia-cuda"

; CHECK-LABEL: @bar_sync
; CHECK: store
; CHECK: tail call void @llvm.nvvm.bar.sync(i32 0)
; CHECK: tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
; CHECK: load
; Store to shared memory, synchronize the CTA at barrier 0, then read the
; value back. The CHECK lines above verify GlobalsModRef does not forward
; the store to the load across the barrier (the barrier is not nosync).
define dso_local i32 @bar_sync(i32 %0) local_unnamed_addr {
store i32 %0, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
; Aligned all-threads CTA barrier (equivalent of bar.sync 0).
tail call void @llvm.nvvm.barrier.cta.sync.aligned.all(i32 0)
%2 = load i32, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
ret i32 %2
}

declare void @llvm.nvvm.bar.sync(i32) #0

; CHECK-LABEL: @barrier0
; CHECK: store
; CHECK: tail call void @llvm.nvvm.barrier0()
; CHECK: load
define dso_local i32 @barrier0(i32 %0) local_unnamed_addr {
store i32 %0, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
tail call void @llvm.nvvm.barrier0()
%2 = load i32, ptr addrspacecast (ptr addrspace(3) @s to ptr), align 4
ret i32 %2
}

declare void @llvm.nvvm.barrier0() #0
declare void @llvm.nvvm.barrier.cta.sync.aligned.all(i32) #0

attributes #0 = { convergent nounwind }
Loading