Skip to content

Commit 9ccf038

Browse files
[NVPTX] Support for fence.acquire and fence.release (#124865)
Adds codegen support for fence.acquire and fence.release, a script and generated tests for all possible legal fences, and cleans up some tablegen rules.
1 parent c798a5c commit 9ccf038

File tree

9 files changed

+522
-159
lines changed

9 files changed

+522
-159
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -648,9 +648,50 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
648648
if (S == NVPTX::Scope::Cluster)
649649
T->failIfClustersUnsupported(".cluster scope fence");
650650

651+
// Fall back to .acq_rel if the separate .acquire and .release fences are not supported.
652+
if (!T->hasSplitAcquireAndReleaseFences() &&
653+
(O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
654+
O = NVPTX::Ordering::AcquireRelease;
655+
651656
switch (O) {
652657
case NVPTX::Ordering::Acquire:
658+
switch (S) {
659+
case NVPTX::Scope::System:
660+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
661+
: NVPTX::INT_MEMBAR_SYS;
662+
case NVPTX::Scope::Block:
663+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
664+
: NVPTX::INT_MEMBAR_CTA;
665+
case NVPTX::Scope::Cluster:
666+
return NVPTX::atomic_thread_fence_acquire_cluster;
667+
case NVPTX::Scope::Device:
668+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
669+
: NVPTX::INT_MEMBAR_GL;
670+
case NVPTX::Scope::Thread:
671+
report_fatal_error(
672+
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
673+
ScopeToString(S)));
674+
}
675+
break;
653676
case NVPTX::Ordering::Release:
677+
switch (S) {
678+
case NVPTX::Scope::System:
679+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
680+
: NVPTX::INT_MEMBAR_SYS;
681+
case NVPTX::Scope::Block:
682+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
683+
: NVPTX::INT_MEMBAR_CTA;
684+
case NVPTX::Scope::Cluster:
685+
return NVPTX::atomic_thread_fence_release_cluster;
686+
case NVPTX::Scope::Device:
687+
return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
688+
: NVPTX::INT_MEMBAR_GL;
689+
case NVPTX::Scope::Thread:
690+
report_fatal_error(
691+
formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
692+
ScopeToString(S)));
693+
}
694+
break;
654695
case NVPTX::Ordering::AcquireRelease: {
655696
switch (S) {
656697
case NVPTX::Scope::System:

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3867,33 +3867,16 @@ def : Pat <
38673867
// PTX Fence instructions
38683868
////////////////////////////////////////////////////////////////////////////////
38693869

3870-
def atomic_thread_fence_seq_cst_sys :
3871-
NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
3872-
Requires<[hasPTX<60>, hasSM<70>]>;
3873-
def atomic_thread_fence_acq_rel_sys :
3874-
NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
3875-
Requires<[hasPTX<60>, hasSM<70>]>;
3876-
3877-
def atomic_thread_fence_seq_cst_gpu :
3878-
NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
3879-
Requires<[hasPTX<60>, hasSM<70>]>;
3880-
def atomic_thread_fence_acq_rel_gpu :
3881-
NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
3882-
Requires<[hasPTX<60>, hasSM<70>]>;
3883-
3884-
def atomic_thread_fence_seq_cst_cluster :
3885-
NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
3886-
Requires<[hasPTX<78>, hasSM<90>]>;
3887-
def atomic_thread_fence_acq_rel_cluster :
3888-
NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
3889-
Requires<[hasPTX<78>, hasSM<90>]>;
3890-
3891-
def atomic_thread_fence_seq_cst_cta :
3892-
NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
3893-
Requires<[hasPTX<60>, hasSM<70>]>;
3894-
def atomic_thread_fence_acq_rel_cta :
3895-
NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
3896-
Requires<[hasPTX<60>, hasSM<70>]>;
3870+
// A PTX fence instruction printed as "fence.<sem>.<scope>;". It has no
// operands and an empty selection pattern list. `ptx` is the PTX-version
// predicate for the given semantics; hasSM<70> is required for every fence.
class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
    NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
    Requires<[ptx, hasSM<70>]>;

// Define the seq_cst, acq_rel, acquire and release fence for each scope.
// The .acquire/.release variants are gated on PTX 8.6 so that this predicate
// agrees with NVPTXSubtarget::hasSplitAcquireAndReleaseFences(), which checks
// PTXVersion >= 86.
// NOTE(review): the per-scope definitions this loop replaces required
// hasPTX<78> and hasSM<90> for the .cluster fences; the shared class no longer
// expresses that. getFenceOp calls failIfClustersUnsupported() for cluster
// scope -- confirm nothing relies on these Requires<> predicates instead.
foreach scope = ["sys", "gpu", "cluster", "cta"] in {
  def atomic_thread_fence_seq_cst_#scope: NVPTXFenceInst<scope, "sc", hasPTX<60>>;
  def atomic_thread_fence_acq_rel_#scope: NVPTXFenceInst<scope, "acq_rel", hasPTX<60>>;
  def atomic_thread_fence_acquire_#scope: NVPTXFenceInst<scope, "acquire", hasPTX<86>>;
  def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<86>>;
}
38973880

38983881
def fpimm_any_zero : FPImmLeaf<fAny, [{
38993882
return Imm.isZero();

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
8888
// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
8989
// release, acq_rel, sc) ?
9090
bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
91+
// Does SM & PTX support the separate .acquire and .release qualifiers for fence? True when SmVersion >= 90 and PTXVersion >= 86.
92+
bool hasSplitAcquireAndReleaseFences() const {
93+
return SmVersion >= 90 && PTXVersion >= 86;
94+
}
9195
// Does SM & PTX support atomic relaxed MMIO operations ?
9296
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
9397
bool hasDotInstructions() const {
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
3+
; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify %}
4+
5+
define void @fence_acquire_cluster() {
6+
; SM90-LABEL: fence_acquire_cluster(
7+
; SM90: {
8+
; SM90-EMPTY:
9+
; SM90-EMPTY:
10+
; SM90-NEXT: // %bb.0:
11+
; SM90-NEXT: fence.acquire.cluster;
12+
; SM90-NEXT: ret;
13+
fence syncscope("cluster") acquire
14+
ret void
15+
}
16+
17+
18+
define void @fence_release_cluster() {
19+
; SM90-LABEL: fence_release_cluster(
20+
; SM90: {
21+
; SM90-EMPTY:
22+
; SM90-EMPTY:
23+
; SM90-NEXT: // %bb.0:
24+
; SM90-NEXT: fence.release.cluster;
25+
; SM90-NEXT: ret;
26+
fence syncscope("cluster") release
27+
ret void
28+
}
29+
30+
31+
define void @fence_acq_rel_cluster() {
32+
; SM90-LABEL: fence_acq_rel_cluster(
33+
; SM90: {
34+
; SM90-EMPTY:
35+
; SM90-EMPTY:
36+
; SM90-NEXT: // %bb.0:
37+
; SM90-NEXT: fence.acq_rel.cluster;
38+
; SM90-NEXT: ret;
39+
fence syncscope("cluster") acq_rel
40+
ret void
41+
}
42+
43+
44+
define void @fence_seq_cst_cluster() {
45+
; SM90-LABEL: fence_seq_cst_cluster(
46+
; SM90: {
47+
; SM90-EMPTY:
48+
; SM90-EMPTY:
49+
; SM90-NEXT: // %bb.0:
50+
; SM90-NEXT: fence.sc.cluster;
51+
; SM90-NEXT: ret;
52+
fence syncscope("cluster") seq_cst
53+
ret void
54+
}
55+

0 commit comments

Comments
 (0)