Skip to content
This repository was archived by the owner on Mar 28, 2020. It is now read-only.

Commit 5692e8e

Browse files
committed
[NVPTX] Add intrinsics to support named barriers.
Add support for barrier synchronization among a subset of threads in a CTA through one of sixteen explicitly specified barriers. These intrinsics are not directly exposed in CUDA but are critical for forthcoming support of OpenMP on NVPTX GPUs. The intrinsics allow the synchronization of an arbitrary number of threads (a multiple of 32) in a CTA at one of 16 distinct barriers. The two intrinsics added are as follows: (1) `call void @llvm.nvvm.barrier.n(i32 10)` waits for all threads in a CTA to arrive at named barrier #10; (2) `call void @llvm.nvvm.barrier(i32 15, i32 992)` waits for 992 threads in a CTA to arrive at barrier #15. Detailed descriptions of these intrinsics are available in the PTX manual: http://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions Reviewers: hfinkel, jlebar Differential Revision: https://reviews.llvm.org/D17657 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293384 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent bd96ea7 commit 5692e8e

File tree

3 files changed

+53
-0
lines changed

3 files changed

+53
-0
lines changed

include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -733,6 +733,13 @@ let TargetPrefix = "nvvm" in {
733733
// intrinsics in this file, this one is a user-facing API.
734734
def int_nvvm_barrier0 : GCCBuiltin<"__syncthreads">,
735735
Intrinsic<[], [], [IntrConvergent]>;
736+
// Synchronize all threads in the CTA at barrier 'n'.
737+
def int_nvvm_barrier_n : GCCBuiltin<"__nvvm_bar_n">,
738+
Intrinsic<[], [llvm_i32_ty], [IntrConvergent]>;
739+
// Synchronize 'm' threads (arg 2, a multiple of the warp size) in
740+
// the CTA at barrier 'n' (arg 1).
741+
def int_nvvm_barrier : GCCBuiltin<"__nvvm_bar">,
742+
Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], [IntrConvergent]>;
736743
def int_nvvm_barrier0_popc : GCCBuiltin<"__nvvm_bar0_popc">,
737744
Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrConvergent]>;
738745
def int_nvvm_barrier0_and : GCCBuiltin<"__nvvm_bar0_and">,

lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,12 @@ let isConvergent = 1 in {
3636
def INT_BARRIER0 : NVPTXInst<(outs), (ins),
3737
"bar.sync \t0;",
3838
[(int_nvvm_barrier0)]>;
39+
def INT_BARRIERN : NVPTXInst<(outs), (ins Int32Regs:$src1),
40+
"bar.sync \t$src1;",
41+
[(int_nvvm_barrier_n Int32Regs:$src1)]>;
42+
def INT_BARRIER : NVPTXInst<(outs), (ins Int32Regs:$src1, Int32Regs:$src2),
43+
"bar.sync \t$src1, $src2;",
44+
[(int_nvvm_barrier Int32Regs:$src1, Int32Regs:$src2)]>;
3945
def INT_BARRIER0_POPC : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$pred),
4046
!strconcat("{{ \n\t",
4147
".reg .pred \t%p1; \n\t",

test/CodeGen/NVPTX/named-barriers.ll

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
3+
4+
; Use bar.sync to arrive at a pre-computed barrier number and
5+
; wait for all threads in the CTA to also arrive:
6+
define ptx_device void @test_barrier_named_cta() {
7+
; CHECK: mov.u32 %r[[REG0:[0-9]+]], 0;
8+
; CHECK: bar.sync %r[[REG0]];
9+
; CHECK: mov.u32 %r[[REG1:[0-9]+]], 10;
10+
; CHECK: bar.sync %r[[REG1]];
11+
; CHECK: mov.u32 %r[[REG2:[0-9]+]], 15;
12+
; CHECK: bar.sync %r[[REG2]];
13+
; CHECK: ret;
14+
call void @llvm.nvvm.barrier.n(i32 0)
15+
call void @llvm.nvvm.barrier.n(i32 10)
16+
call void @llvm.nvvm.barrier.n(i32 15)
17+
ret void
18+
}
19+
20+
; Use bar.sync to arrive at a pre-computed barrier number and
21+
; wait for a fixed number of cooperating threads to arrive:
22+
define ptx_device void @test_barrier_named() {
23+
; CHECK: mov.u32 %r[[REG0A:[0-9]+]], 32;
24+
; CHECK: mov.u32 %r[[REG0B:[0-9]+]], 0;
25+
; CHECK: bar.sync %r[[REG0B]], %r[[REG0A]];
26+
; CHECK: mov.u32 %r[[REG1A:[0-9]+]], 352;
27+
; CHECK: mov.u32 %r[[REG1B:[0-9]+]], 10;
28+
; CHECK: bar.sync %r[[REG1B]], %r[[REG1A]];
29+
; CHECK: mov.u32 %r[[REG2A:[0-9]+]], 992;
30+
; CHECK: mov.u32 %r[[REG2B:[0-9]+]], 15;
31+
; CHECK: bar.sync %r[[REG2B]], %r[[REG2A]];
32+
; CHECK: ret;
33+
call void @llvm.nvvm.barrier(i32 0, i32 32)
34+
call void @llvm.nvvm.barrier(i32 10, i32 352)
35+
call void @llvm.nvvm.barrier(i32 15, i32 992)
36+
ret void
37+
}
38+
39+
declare void @llvm.nvvm.barrier(i32, i32)
40+
declare void @llvm.nvvm.barrier.n(i32)

0 commit comments

Comments
 (0)