Skip to content

Commit 2d67a18

Browse files
committed
[NVPTX] Add elect.sync Intrinsic
This patch adds an NVVM intrinsic and NVPTX codegen for the elect.sync PTX instruction. Lit tests are added in elect.ll and verified through ptxas. PTX ISA reference: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync Signed-off-by: Durgadoss R <[email protected]>
1 parent b05c554 commit 2d67a18

File tree

4 files changed

+85
-1
lines changed

4 files changed

+85
-1
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,10 +251,34 @@ Overview:
251251
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
252252
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
253253

254+
Electing a thread
255+
-----------------
256+
257+
'``llvm.nvvm.elect.sync``'
258+
^^^^^^^^^^^^^^^^^^^^^^^^^^
259+
260+
Syntax:
261+
"""""""
262+
263+
.. code-block:: llvm
264+
265+
declare {i32, i1} @llvm.nvvm.elect.sync(i32 %membermask)
266+
267+
Overview:
268+
"""""""""
269+
270+
The '``@llvm.nvvm.elect.sync``' intrinsic generates the ``elect.sync``
271+
PTX instruction, which elects one predicated active leader thread from
272+
a set of threads specified by ``membermask``. The behavior is undefined
273+
if the executing thread is not in ``membermask``. The laneid of the
274+
elected thread is captured in the i32 return value. The i1 return
275+
value is set to ``True`` for the leader thread and ``False`` for all
276+
the other threads. For more information, refer to the PTX ISA
277+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync>`_.
278+
254279
Membar/Fences
255280
-------------
256281

257-
258282
'``llvm.nvvm.fence.proxy.tensormap_generic.*``'
259283
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
260284

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4679,6 +4679,14 @@ def int_nvvm_match_all_sync_i64p :
46794679
Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
46804680
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">;
46814681

4682+
//
4683+
// ELECT.SYNC
4684+
//
4685+
// elect.sync dst|pred, membermask
4686+
def int_nvvm_elect_sync :
4687+
DefaultAttrsIntrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty],
4688+
[IntrInaccessibleMemOnly, IntrConvergent]>;
4689+
46824690
//
46834691
// REDUX.SYNC
46844692
//

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,16 @@ defm VOTE_SYNC_ANY : VOTE_SYNC<Int1Regs, "any.pred", int_nvvm_vote_any_sync>;
243243
defm VOTE_SYNC_UNI : VOTE_SYNC<Int1Regs, "uni.pred", int_nvvm_vote_uni_sync>;
244244
defm VOTE_SYNC_BALLOT : VOTE_SYNC<Int32Regs, "ballot.b32", int_nvvm_vote_ballot_sync>;
245245

246+
// elect.sync
247+
def INT_ELECT_SYNC_I : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask),
248+
"elect.sync \t$dest|$pred, $mask;",
249+
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync imm:$mask))]>,
250+
Requires<[hasPTX<80>, hasSM<90>]>;
251+
def INT_ELECT_SYNC_R : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask),
252+
"elect.sync \t$dest|$pred, $mask;",
253+
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync Int32Regs:$mask))]>,
254+
Requires<[hasPTX<80>, hasSM<90>]>;
255+
246256
multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
247257
Operand ImmOp> {
248258
def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value),

llvm/test/CodeGen/NVPTX/elect.ll

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s
3+
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
declare {i32, i1} @llvm.nvvm.elect.sync(i32)
8+
9+
define {i32, i1} @elect_sync(i32 %mask) {
10+
; CHECK-LABEL: elect_sync(
11+
; CHECK: {
12+
; CHECK-NEXT: .reg .pred %p<2>;
13+
; CHECK-NEXT: .reg .b16 %rs<2>;
14+
; CHECK-NEXT: .reg .b32 %r<3>;
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0:
17+
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0];
18+
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
19+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
20+
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
21+
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
22+
; CHECK-NEXT: ret;
23+
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
24+
ret {i32, i1} %val
25+
}
26+
27+
define {i32, i1} @elect_sync_imm() {
28+
; CHECK-LABEL: elect_sync_imm(
29+
; CHECK: {
30+
; CHECK-NEXT: .reg .pred %p<2>;
31+
; CHECK-NEXT: .reg .b16 %rs<2>;
32+
; CHECK-NEXT: .reg .b32 %r<2>;
33+
; CHECK-EMPTY:
34+
; CHECK-NEXT: // %bb.0:
35+
; CHECK-NEXT: elect.sync %r1|%p1, -1;
36+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
37+
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
38+
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
39+
; CHECK-NEXT: ret;
40+
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 u0xffffffff)
41+
ret {i32, i1} %val
42+
}

0 commit comments

Comments
 (0)