-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[NVPTX] Support for fence.acquire and fence.release #124865
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-nvptx Author: Akshay Deodhar (akshayrdeodhar) Changes: Adds codegen support for fence.acquire and fence.release, a script and generated tests for all possible legal fences, and cleans up some tablegen rules. Patch is 22.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/124865.diff 10 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ac8ce05724750c..ec654e0f3f200f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -648,9 +648,50 @@ static unsigned int getFenceOp(NVPTX::Ordering O, NVPTX::Scope S,
if (S == NVPTX::Scope::Cluster)
T->failIfClustersUnsupported(".cluster scope fence");
+ // Fall back to .acq_rel if .acquire, .release is not supported.
+ if (!T->hasSplitAcquireAndReleaseFences() &&
+ (O == NVPTX::Ordering::Acquire || O == NVPTX::Ordering::Release))
+ O = NVPTX::Ordering::AcquireRelease;
+
switch (O) {
case NVPTX::Ordering::Acquire:
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_acquire_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_acquire_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(
+ formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+ ScopeToString(S)));
+ }
+ break;
case NVPTX::Ordering::Release:
+ switch (S) {
+ case NVPTX::Scope::System:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_sys
+ : NVPTX::INT_MEMBAR_SYS;
+ case NVPTX::Scope::Block:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_cta
+ : NVPTX::INT_MEMBAR_CTA;
+ case NVPTX::Scope::Cluster:
+ return NVPTX::atomic_thread_fence_release_cluster;
+ case NVPTX::Scope::Device:
+ return T->hasMemoryOrdering() ? NVPTX::atomic_thread_fence_release_gpu
+ : NVPTX::INT_MEMBAR_GL;
+ case NVPTX::Scope::Thread:
+ report_fatal_error(
+ formatv("Unsupported scope \"{}\" for acquire/release/acq_rel fence.",
+ ScopeToString(S)));
+ }
+ break;
case NVPTX::Ordering::AcquireRelease: {
switch (S) {
case NVPTX::Scope::System:
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 633a99d0fc1be3..74423d79e41e05 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -3866,33 +3866,16 @@ def : Pat <
// PTX Fence instructions
////////////////////////////////////////////////////////////////////////////////
-def atomic_thread_fence_seq_cst_sys :
- NVPTXInst<(outs), (ins), "fence.sc.sys;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_sys :
- NVPTXInst<(outs), (ins), "fence.acq_rel.sys;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-
-def atomic_thread_fence_seq_cst_gpu :
- NVPTXInst<(outs), (ins), "fence.sc.gpu;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_gpu :
- NVPTXInst<(outs), (ins), "fence.acq_rel.gpu;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-
-def atomic_thread_fence_seq_cst_cluster :
- NVPTXInst<(outs), (ins), "fence.sc.cluster;", []>,
- Requires<[hasPTX<78>, hasSM<90>]>;
-def atomic_thread_fence_acq_rel_cluster :
- NVPTXInst<(outs), (ins), "fence.acq_rel.cluster;", []>,
- Requires<[hasPTX<78>, hasSM<90>]>;
-
-def atomic_thread_fence_seq_cst_cta :
- NVPTXInst<(outs), (ins), "fence.sc.cta;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
-def atomic_thread_fence_acq_rel_cta :
- NVPTXInst<(outs), (ins), "fence.acq_rel.cta;", []>,
- Requires<[hasPTX<60>, hasSM<70>]>;
+class NVPTXFenceInst<string scope, string sem, Predicate ptx>:
+ NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>,
+ Requires<[ptx, hasSM<70>]>;
+
+foreach scope = ["sys", "gpu", "cluster", "cta"] in {
+ def atomic_thread_fence_seq_cst_#scope: NVPTXFenceInst<scope, "sc", hasPTX<60>>;
+ def atomic_thread_fence_acq_rel_#scope: NVPTXFenceInst<scope, "acq_rel", hasPTX<60>>;
+ def atomic_thread_fence_acquire_#scope: NVPTXFenceInst<scope, "acquire", hasPTX<87>>;
+ def atomic_thread_fence_release_#scope: NVPTXFenceInst<scope, "release", hasPTX<87>>;
+}
def fpimm_any_zero : FPImmLeaf<fAny, [{
return Imm.isZero();
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index 919f487c701416..990ad3c62367fd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -88,6 +88,10 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
// release, acq_rel, sc) ?
bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
+ // Does SM & PTX support .acquire and .release qualifiers for fence?
+ bool hasSplitAcquireAndReleaseFences() const {
+ return SmVersion >= 90 && PTXVersion >= 86;
+ }
// Does SM & PTX support atomic relaxed MMIO operations ?
bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
bool hasDotInstructions() const {
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll b/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
deleted file mode 100644
index dce39bf3e1e3ed..00000000000000
--- a/llvm/test/CodeGen/NVPTX/fence-sm-90.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | FileCheck %s
-; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx78 | %ptxas-verify -arch=sm_90 %}
-
-; CHECK-LABEL: fence_sc_cluster
-define void @fence_sc_cluster() local_unnamed_addr {
- ; CHECK: fence.sc.cluster
- fence syncscope("cluster") seq_cst
- ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_cluster
-define void @fence_acq_rel_cluster() local_unnamed_addr {
- ; CHECK: fence.acq_rel.cluster
- fence syncscope("cluster") acq_rel
- ret void
-}
-
-; CHECK-LABEL: fence_release_cluster
-define void @fence_release_cluster() local_unnamed_addr {
- ; CHECK: fence.acq_rel.cluster
- fence syncscope("cluster") release
- ret void
-}
-
-; CHECK-LABEL: fence_acquire_cluster
-define void @fence_acquire_cluster() local_unnamed_addr {
- ; CHECK: fence.acq_rel.cluster
- fence syncscope("cluster") acquire
- ret void
-}
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm30.ll b/llvm/test/CodeGen/NVPTX/fence-sm30.ll
new file mode 100644
index 00000000000000..16365db21d5b9c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm30.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -mattr=+ptx50 | FileCheck %s --check-prefix=SM30
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_30 -mattr=+ptx50 | %ptxas-verify %}
+
+
+define void @fence_acquire_() {
+; SM30-LABEL: fence_acquire_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") acquire
+ ret void
+}
+
+
+define void @fence_acquire_block() {
+; SM30-LABEL: fence_acquire_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") acquire
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_acquire_device() {
+; SM30-LABEL: fence_acquire_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") acquire
+ ret void
+}
+
+
+define void @fence_release_() {
+; SM30-LABEL: fence_release_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") release
+ ret void
+}
+
+
+define void @fence_release_block() {
+; SM30-LABEL: fence_release_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") release
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_release_device() {
+; SM30-LABEL: fence_release_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") release
+ ret void
+}
+
+
+define void @fence_acq_rel_() {
+; SM30-LABEL: fence_acq_rel_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_block() {
+; SM30-LABEL: fence_acq_rel_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_acq_rel_device() {
+; SM30-LABEL: fence_acq_rel_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+
+define void @fence_seq_cst_() {
+; SM30-LABEL: fence_seq_cst_(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.sys;
+; SM30-NEXT: ret;
+ fence syncscope("") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_block() {
+; SM30-LABEL: fence_seq_cst_block(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.cta;
+; SM30-NEXT: ret;
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+; .cluster scope unsupported on SM = 30 PTX = 50
+
+define void @fence_seq_cst_device() {
+; SM30-LABEL: fence_seq_cst_device(
+; SM30: {
+; SM30-EMPTY:
+; SM30-EMPTY:
+; SM30-NEXT: // %bb.0:
+; SM30-NEXT: membar.gl;
+; SM30-NEXT: ret;
+ fence syncscope("device") seq_cst
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm70.ll b/llvm/test/CodeGen/NVPTX/fence-sm70.ll
new file mode 100644
index 00000000000000..085529571e0443
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm70.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify %}
+
+
+define void @fence_acquire_() {
+; SM70-LABEL: fence_acquire_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") acquire
+ ret void
+}
+
+
+define void @fence_acquire_block() {
+; SM70-LABEL: fence_acquire_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") acquire
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_acquire_device() {
+; SM70-LABEL: fence_acquire_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") acquire
+ ret void
+}
+
+
+define void @fence_release_() {
+; SM70-LABEL: fence_release_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") release
+ ret void
+}
+
+
+define void @fence_release_block() {
+; SM70-LABEL: fence_release_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") release
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_release_device() {
+; SM70-LABEL: fence_release_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") release
+ ret void
+}
+
+
+define void @fence_acq_rel_() {
+; SM70-LABEL: fence_acq_rel_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_block() {
+; SM70-LABEL: fence_acq_rel_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_acq_rel_device() {
+; SM70-LABEL: fence_acq_rel_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.acq_rel.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+
+define void @fence_seq_cst_() {
+; SM70-LABEL: fence_seq_cst_(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.sc.sys;
+; SM70-NEXT: ret;
+ fence syncscope("") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_block() {
+; SM70-LABEL: fence_seq_cst_block(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.sc.cta;
+; SM70-NEXT: ret;
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+; .cluster scope unsupported on SM = 70 PTX = 60
+
+define void @fence_seq_cst_device() {
+; SM70-LABEL: fence_seq_cst_device(
+; SM70: {
+; SM70-EMPTY:
+; SM70-EMPTY:
+; SM70-NEXT: // %bb.0:
+; SM70-NEXT: fence.sc.gpu;
+; SM70-NEXT: ret;
+ fence syncscope("device") seq_cst
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/fence-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-sm90.ll
new file mode 100644
index 00000000000000..6c1959d34df4e5
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-sm90.ll
@@ -0,0 +1,213 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck %s --check-prefix=SM90
+; RUN: %if ptxas %{ llc < %s -march=nvptx -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify %}
+
+
+define void @fence_acquire_() {
+; SM90-LABEL: fence_acquire_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") acquire
+ ret void
+}
+
+
+define void @fence_acquire_block() {
+; SM90-LABEL: fence_acquire_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") acquire
+ ret void
+}
+
+
+define void @fence_acquire_cluster() {
+; SM90-LABEL: fence_acquire_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") acquire
+ ret void
+}
+
+
+define void @fence_acquire_device() {
+; SM90-LABEL: fence_acquire_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acquire.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") acquire
+ ret void
+}
+
+
+define void @fence_release_() {
+; SM90-LABEL: fence_release_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") release
+ ret void
+}
+
+
+define void @fence_release_block() {
+; SM90-LABEL: fence_release_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") release
+ ret void
+}
+
+
+define void @fence_release_cluster() {
+; SM90-LABEL: fence_release_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") release
+ ret void
+}
+
+
+define void @fence_release_device() {
+; SM90-LABEL: fence_release_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.release.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") release
+ ret void
+}
+
+
+define void @fence_acq_rel_() {
+; SM90-LABEL: fence_acq_rel_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_block() {
+; SM90-LABEL: fence_acq_rel_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_cluster() {
+; SM90-LABEL: fence_acq_rel_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") acq_rel
+ ret void
+}
+
+
+define void @fence_acq_rel_device() {
+; SM90-LABEL: fence_acq_rel_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.acq_rel.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") acq_rel
+ ret void
+}
+
+
+define void @fence_seq_cst_() {
+; SM90-LABEL: fence_seq_cst_(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.sys;
+; SM90-NEXT: ret;
+ fence syncscope("") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_block() {
+; SM90-LABEL: fence_seq_cst_block(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.cta;
+; SM90-NEXT: ret;
+ fence syncscope("block") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_cluster() {
+; SM90-LABEL: fence_seq_cst_cluster(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.cluster;
+; SM90-NEXT: ret;
+ fence syncscope("cluster") seq_cst
+ ret void
+}
+
+
+define void @fence_seq_cst_device() {
+; SM90-LABEL: fence_seq_cst_device(
+; SM90: {
+; SM90-EMPTY:
+; SM90-EMPTY:
+; SM90-NEXT: // %bb.0:
+; SM90-NEXT: fence.sc.gpu;
+; SM90-NEXT: ret;
+ fence syncscope("device") seq_cst
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/NVPTX/fence.ll b/llvm/test/CodeGen/NVPTX/fence.ll
deleted file mode 100644
index e094ddf5775a63..00000000000000
--- a/llvm/test/CodeGen/NVPTX/fence.ll
+++ /dev/null
@@ -1,102 +0,0 @@
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=SM60
-; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
-; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | FileCheck %s --check-prefix=SM70
-; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx60 | %ptxas-verify -arch=sm_70 %}
-
-; TODO: implement and test thread scope.
-
-; CHECK-LABEL: fence_sc_sys
-define void @fence_sc_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.sc.sys
- fence seq_cst
- ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_sys
-define void @fence_acq_rel_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
- fence acq_rel
- ret void
-}
-
-; CHECK-LABEL: fence_release_sys
-define void @fence_release_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
- fence release
- ret void
-}
-
-; CHECK-LABEL: fence_acquire_sys
-define void @fence_acquire_sys() local_unnamed_addr {
- ; SM60: membar.sys
- ; SM70: fence.acq_rel.sys
- fence acquire
- ret void
-}
-
-; CHECK-LABEL: fence_sc_gpu
-define void @fence_sc_gpu() local_unnamed_addr {
- ; SM60: membar.gl
- ; SM70: fence.sc.gpu
- fence syncscope("device") seq_cst
- ret void
-}
-
-; CHECK-LABEL: fence_acq_rel_gpu
-define void @fence_acq_rel_gpu() local_unnamed_addr {
- ; SM60: membar.gl
- ; SM70: fence.acq_rel.gpu
- fence syncscope("device") acq_rel
- ret void
-}
-
-; CHECK-LABEL: fence_release_gpu
...
[truncated]
|
✅ With the latest revision this PR passed the Python code formatter. |
CC: @gonzalobg |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM overall, but tests arrangement looks questionable.
LGTM as well. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good to me as well! Thanks!
Adds codegen support for fence.acquire and fence.release, a script and generated tests for all possible legal fences, and cleans up some tablegen rules.
Adds codegen support for fence.acquire and fence.release, a script and generated tests for all possible legal fences, and cleans up some tablegen rules.