Skip to content

Commit d492faa

Browse files
authored
[NVPTX] Add 'activemask' builtin and intrinsic support (llvm#79768)
Summary: This patch adds support for getting the 'activemask' instruction's value without needing to use inline assembly. See the relevant PTX reference for details. https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-activemask
1 parent 3d0a689 commit d492faa

File tree

6 files changed

+72
-6
lines changed

6 files changed

+72
-6
lines changed

clang/include/clang/Basic/BuiltinsNVPTX.def

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
#pragma push_macro("PTX42")
4545
#pragma push_macro("PTX60")
4646
#pragma push_macro("PTX61")
47+
#pragma push_macro("PTX62")
4748
#pragma push_macro("PTX63")
4849
#pragma push_macro("PTX64")
4950
#pragma push_macro("PTX65")
@@ -76,7 +77,8 @@
7677
#define PTX65 "ptx65|" PTX70
7778
#define PTX64 "ptx64|" PTX65
7879
#define PTX63 "ptx63|" PTX64
79-
#define PTX61 "ptx61|" PTX63
80+
#define PTX62 "ptx62|" PTX63
81+
#define PTX61 "ptx61|" PTX62
8082
#define PTX60 "ptx60|" PTX61
8183
#define PTX42 "ptx42|" PTX60
8284

@@ -632,6 +634,9 @@ TARGET_BUILTIN(__nvvm_vote_any_sync, "bUib", "", PTX60)
632634
TARGET_BUILTIN(__nvvm_vote_uni_sync, "bUib", "", PTX60)
633635
TARGET_BUILTIN(__nvvm_vote_ballot_sync, "UiUib", "", PTX60)
634636

637+
// Mask
638+
TARGET_BUILTIN(__nvvm_activemask, "i", "n", PTX62)
639+
635640
// Match
636641
TARGET_BUILTIN(__nvvm_match_any_sync_i32, "UiUiUi", "", AND(SM_70,PTX60))
637642
TARGET_BUILTIN(__nvvm_match_any_sync_i64, "UiUiWi", "", AND(SM_70,PTX60))
@@ -1065,6 +1070,7 @@ TARGET_BUILTIN(__nvvm_getctarank_shared_cluster, "iv*3", "", AND(SM_90,PTX78))
10651070
#pragma pop_macro("PTX42")
10661071
#pragma pop_macro("PTX60")
10671072
#pragma pop_macro("PTX61")
1073+
#pragma pop_macro("PTX62")
10681074
#pragma pop_macro("PTX63")
10691075
#pragma pop_macro("PTX64")
10701076
#pragma pop_macro("PTX65")

clang/test/CodeGen/builtins-nvptx.c

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@
55
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_80 -target-feature +ptx70 \
66
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
77
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=CHECK_PTX70_SM80 -check-prefix=LP64 %s
8-
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 \
8+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
99
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
1010
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP32 %s
11-
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 \
11+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_60 -target-feature +ptx62 \
1212
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
1313
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
14-
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 \
14+
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx64-unknown-unknown -target-cpu sm_61 -target-feature +ptx62 \
1515
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
1616
// RUN: | FileCheck -check-prefix=CHECK -check-prefix=LP64 %s
17-
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 \
17+
// RUN: %clang_cc1 -triple nvptx-unknown-unknown -target-cpu sm_53 -target-feature +ptx62 \
1818
// RUN: -DERROR_CHECK -fcuda-is-device -S -o /dev/null -x cuda -verify %s
1919
// RUN: %clang_cc1 -ffp-contract=off -triple nvptx-unknown-unknown -target-cpu sm_86 -target-feature +ptx72 \
2020
// RUN: -fcuda-is-device -S -emit-llvm -o - -x cuda %s \
@@ -165,6 +165,14 @@ __device__ void sync() {
165165

166166
}
167167

168+
__device__ void activemask() {
169+
170+
// CHECK: call i32 @llvm.nvvm.activemask()
171+
172+
__nvvm_activemask();
173+
174+
}
175+
168176

169177
// NVVM intrinsics
170178

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4599,6 +4599,14 @@ def int_nvvm_vote_ballot_sync :
45994599
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.vote.ballot.sync">,
46004600
ClangBuiltin<"__nvvm_vote_ballot_sync">;
46014601

4602+
//
4603+
// ACTIVEMASK
4604+
//
4605+
def int_nvvm_activemask :
4606+
Intrinsic<[llvm_i32_ty], [],
4607+
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback, IntrHasSideEffects], "llvm.nvvm.activemask">,
4608+
ClangBuiltin<"__nvvm_activemask">;
4609+
46024610
//
46034611
// MATCH.SYNC
46044612
//

llvm/lib/Target/NVPTX/NVPTX.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ foreach sm = [20, 21, 30, 32, 35, 37, 50, 52, 53,
4040

4141
def SM90a: FeatureSM<"90a", 901>;
4242

43-
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 63, 64, 65,
43+
foreach version = [32, 40, 41, 42, 43, 50, 60, 61, 62, 63, 64, 65,
4444
70, 71, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 83] in
4545
def PTX#version: FeaturePTX<version>;
4646

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,12 @@ multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntO
263263
Requires<[hasPTX<60>, hasSM<70>]>;
264264
}
265265

266+
// activemask.b32
267+
def ACTIVEMASK : NVPTXInst<(outs Int32Regs:$dest), (ins),
268+
"activemask.b32 \t$dest;",
269+
[(set Int32Regs:$dest, (int_nvvm_activemask))]>,
270+
Requires<[hasPTX<62>, hasSM<30>]>;
271+
266272
defm MATCH_ANY_SYNC_32 : MATCH_ANY_SYNC<Int32Regs, "b32", int_nvvm_match_any_sync_i32,
267273
i32imm>;
268274
defm MATCH_ANY_SYNC_64 : MATCH_ANY_SYNC<Int64Regs, "b64", int_nvvm_match_any_sync_i64,

llvm/test/CodeGen/NVPTX/activemask.ll

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
; RUN: llc < %s -march=nvptx64 -O2 -mcpu=sm_52 -mattr=+ptx62 | FileCheck %s
2+
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_52 -mattr=+ptx62 | %ptxas-verify %}
3+
4+
declare i32 @llvm.nvvm.activemask()
5+
6+
; CHECK-LABEL: activemask(
7+
;
8+
; CHECK: activemask.b32 %[[REG:.+]];
9+
; CHECK-NEXT: st.param.b32 [func_retval0+0], %[[REG]];
10+
; CHECK-NEXT: ret;
11+
define dso_local i32 @activemask() {
12+
entry:
13+
%mask = call i32 @llvm.nvvm.activemask()
14+
ret i32 %mask
15+
}
16+
17+
; CHECK-LABEL: convergent(
18+
;
19+
; CHECK: activemask.b32 %[[REG:.+]];
20+
; CHECK: activemask.b32 %[[REG]];
21+
; CHECK: .param.b32 [func_retval0+0], %[[REG]];
22+
; CHECK-NEXT: ret;
23+
define dso_local i32 @convergent(i1 %cond) {
24+
entry:
25+
br i1 %cond, label %if.else, label %if.then
26+
27+
if.then:
28+
%0 = call i32 @llvm.nvvm.activemask()
29+
br label %if.end
30+
31+
if.else:
32+
%1 = call i32 @llvm.nvvm.activemask()
33+
br label %if.end
34+
35+
if.end:
36+
%mask = phi i32 [ %0, %if.then ], [ %1, %if.else ]
37+
ret i32 %mask
38+
}

0 commit comments

Comments
 (0)