Skip to content

Commit f304049

Browse files
authored
[NVPTX] Add tcgen05 wait/fence/commit intrinsics (#126091)
This patch adds intrinsics for tcgen05 wait, fence and commit PTX instructions. lit tests are added and verified with a ptxas-12.8 executable. Docs are updated in the NVPTXUsage.rst file. Signed-off-by: Durgadoss R <[email protected]>
1 parent 2c43479 commit f304049

File tree

5 files changed

+331
-0
lines changed

5 files changed

+331
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,81 @@ flavors of the instruction respectively.
10601060
For more information, refer to the PTX ISA
10611061
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-memory-allocation-and-management-instructions>`_.
10621062

1063+
'``llvm.nvvm.tcgen05.commit``'
1064+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1065+
1066+
Syntax:
1067+
"""""""
1068+
1069+
.. code-block:: llvm
1070+
1071+
declare void @llvm.nvvm.tcgen05.commit.{cg1,cg2}(ptr %mbar)
1072+
declare void @llvm.nvvm.tcgen05.commit.shared.{cg1,cg2}(ptr addrspace(3) %mbar)
1073+
declare void @llvm.nvvm.tcgen05.commit.mc.{cg1,cg2}(ptr %mbar, i16 %mc)
1074+
declare void @llvm.nvvm.tcgen05.commit.mc.shared.{cg1,cg2}(ptr addrspace(3) %mbar, i16 %mc)
1075+
1076+
Overview:
1077+
"""""""""
1078+
1079+
The '``@llvm.nvvm.tcgen05.commit.*``' intrinsics correspond to the
1080+
``tcgen05.commit.{cg1/cg2}.mbarrier::arrive::one.*`` set of PTX instructions.
1081+
The ``tcgen05.commit`` is an asynchronous instruction which makes the mbarrier
1082+
object (``%mbar``) track the completion of all prior asynchronous tcgen05 operations.
1083+
The ``.mc`` variants allow signaling on the mbarrier objects of multiple CTAs
1084+
(specified by ``%mc``) in the cluster. The ``.cg1`` and ``.cg2`` variants generate
1085+
``cta_group::1`` and ``cta_group::2`` flavors of the instruction respectively.
1086+
1087+
For more information, refer to the PTX ISA
1088+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen-async-sync-operations-commit>`_.
1089+
1090+
'``llvm.nvvm.tcgen05.wait``'
1091+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1092+
1093+
Syntax:
1094+
"""""""
1095+
1096+
.. code-block:: llvm
1097+
1098+
declare void @llvm.nvvm.tcgen05.wait.ld()
1099+
declare void @llvm.nvvm.tcgen05.wait.st()
1100+
1101+
Overview:
1102+
"""""""""
1103+
1104+
The '``@llvm.nvvm.tcgen05.wait.ld/st``' intrinsics correspond to
1105+
the ``tcgen05.wait::{ld/st}.sync.aligned`` pair of PTX instructions.
1106+
The ``tcgen05.wait::ld`` causes the executing thread to block until
1107+
all prior ``tcgen05.ld`` operations issued by the executing thread
1108+
have completed. The ``tcgen05.wait::st`` causes the executing thread
1109+
to block until all prior ``tcgen05.st`` operations issued by the
1110+
executing thread have completed.
1111+
1112+
For more information, refer to the PTX ISA
1113+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-wait>`_.
1114+
1115+
'``llvm.nvvm.tcgen05.fence``'
1116+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1117+
1118+
Syntax:
1119+
"""""""
1120+
1121+
.. code-block:: llvm
1122+
1123+
declare void @llvm.nvvm.tcgen05.fence.before.thread.sync()
1124+
declare void @llvm.nvvm.tcgen05.fence.after.thread.sync()
1125+
1126+
Overview:
1127+
"""""""""
1128+
1129+
The '``@llvm.nvvm.tcgen05.fence.*``' intrinsics correspond to
1130+
the ``tcgen05.fence::{before/after}_thread_sync`` pair of PTX instructions.
1131+
These instructions act as code motion fences for asynchronous tcgen05
1132+
operations.
1133+
1134+
For more information, refer to the PTX ISA
1135+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence>`_.
1136+
1137+
10631138
Other Intrinsics
10641139
----------------
10651140

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5083,6 +5083,38 @@ foreach cta_group = ["cg1", "cg2"] in {
50835083

50845084
def int_nvvm_tcgen05_relinq_alloc_permit_ # cta_group : Intrinsic<[], [],
50855085
[IntrConvergent, IntrInaccessibleMemOnly]>;
5086+
5087+
def int_nvvm_tcgen05_commit_ # cta_group : Intrinsic<[],
5088+
[llvm_ptr_ty], // mbar_ptr
5089+
[IntrConvergent, IntrInaccessibleMemOrArgMemOnly,
5090+
NoCapture<ArgIndex<0>>]>;
5091+
5092+
def int_nvvm_tcgen05_commit_shared_ # cta_group : Intrinsic<[],
5093+
[llvm_shared_ptr_ty], // mbar_ptr
5094+
[IntrConvergent, IntrInaccessibleMemOrArgMemOnly,
5095+
NoCapture<ArgIndex<0>>]>;
5096+
5097+
def int_nvvm_tcgen05_commit_mc_ # cta_group : Intrinsic<[],
5098+
[llvm_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask
5099+
[IntrConvergent, IntrInaccessibleMemOrArgMemOnly,
5100+
NoCapture<ArgIndex<0>>]>;
5101+
5102+
def int_nvvm_tcgen05_commit_mc_shared_ # cta_group : Intrinsic<[],
5103+
[llvm_shared_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask
5104+
[IntrConvergent, IntrInaccessibleMemOrArgMemOnly,
5105+
NoCapture<ArgIndex<0>>]>;
50865106
}
50875107

5108+
// Tcgen05 wait_ld/st intrinsics
5109+
def int_nvvm_tcgen05_wait_ld : Intrinsic<[], [],
5110+
[IntrConvergent, IntrInaccessibleMemOnly]>;
5111+
def int_nvvm_tcgen05_wait_st : Intrinsic<[], [],
5112+
[IntrConvergent, IntrInaccessibleMemOnly]>;
5113+
5114+
// Tcgen05 Fence intrinsics
5115+
def int_nvvm_tcgen05_fence_before_thread_sync : Intrinsic<[], [],
5116+
[IntrNoMem, IntrHasSideEffects]>;
5117+
def int_nvvm_tcgen05_fence_after_thread_sync : Intrinsic<[], [],
5118+
[IntrNoMem, IntrHasSideEffects]>;
5119+
50885120
} // let TargetPrefix = "nvvm"

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7622,4 +7622,51 @@ multiclass TCGEN05_RELINQ_PERMIT_INTR<string num, Intrinsic Intr> {
76227622
defm TCGEN05_RELINQ_CG1: TCGEN05_RELINQ_PERMIT_INTR<"1", int_nvvm_tcgen05_relinq_alloc_permit_cg1>;
76237623
defm TCGEN05_RELINQ_CG2: TCGEN05_RELINQ_PERMIT_INTR<"2", int_nvvm_tcgen05_relinq_alloc_permit_cg2>;
76247624

7625+
def tcgen05_wait_ld: NVPTXInst<(outs), (ins), "tcgen05.wait::ld.sync.aligned;",
7626+
[(int_nvvm_tcgen05_wait_ld)]>,
7627+
Requires<[hasTcgen05Instructions]>;
7628+
7629+
def tcgen05_wait_st: NVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligned;",
7630+
[(int_nvvm_tcgen05_wait_st)]>,
7631+
Requires<[hasTcgen05Instructions]>;
7632+
7633+
multiclass TCGEN05_COMMIT_INTR<NVPTXRegClass rc, string AS, string num> {
7634+
defvar prefix = "tcgen05.commit.cta_group::" # num;
7635+
defvar suffix = ".mbarrier::arrive::one.shared::cluster";
7636+
7637+
defvar intr_suffix = !if(!eq(AS, "shared"), "_shared", "") # "_cg" # num;
7638+
defvar Intr = !cast<Intrinsic>("int_nvvm_tcgen05_commit" # intr_suffix);
7639+
defvar IntrMC = !cast<Intrinsic>("int_nvvm_tcgen05_commit_mc" # intr_suffix);
7640+
7641+
def NAME : NVPTXInst<(outs), (ins rc:$mbar),
7642+
!strconcat(prefix, suffix, ".b64 [$mbar];"),
7643+
[(Intr rc:$mbar)]>,
7644+
Requires<[hasTcgen05Instructions]>;
7645+
def NAME # _MC : NVPTXInst<(outs), (ins rc:$mbar, Int16Regs:$mc),
7646+
!strconcat(prefix, suffix, ".multicast::cluster.b64 [$mbar], $mc;"),
7647+
[(IntrMC rc:$mbar, Int16Regs:$mc)]>,
7648+
Requires<[hasTcgen05Instructions]>;
7649+
}
7650+
7651+
defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<Int64Regs, "", "1">;
7652+
defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<Int64Regs, "", "2">;
7653+
defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<Int64Regs, "shared", "1">;
7654+
defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<Int64Regs, "shared", "2">;
7655+
defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR<Int32Regs, "shared", "1">;
7656+
defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR<Int32Regs, "shared", "2">;
7657+
76257658
} // isConvergent
7659+
7660+
let hasSideEffects = 1 in {
7661+
7662+
def tcgen05_fence_before_thread_sync: NVPTXInst<(outs), (ins),
7663+
"tcgen05.fence::before_thread_sync;",
7664+
[(int_nvvm_tcgen05_fence_before_thread_sync)]>,
7665+
Requires<[hasTcgen05Instructions]>;
7666+
7667+
def tcgen05_fence_after_thread_sync: NVPTXInst<(outs), (ins),
7668+
"tcgen05.fence::after_thread_sync;",
7669+
[(int_nvvm_tcgen05_fence_after_thread_sync)]>,
7670+
Requires<[hasTcgen05Instructions]>;
7671+
7672+
} // hasSideEffects
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK_PTX64 %s
3+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK_PTX64_SHARED32 %s
4+
; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
5+
; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr | %ptxas-verify -arch=sm_100a %}
6+
7+
declare void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
8+
declare void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
9+
declare void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr)
10+
declare void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr)
11+
12+
; CHECK-LABEL: test_tcgen05_commit
13+
define void @test_tcgen05_commit(ptr %bar_addr) {
14+
; CHECK_PTX64-LABEL: test_tcgen05_commit(
15+
; CHECK_PTX64: {
16+
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
17+
; CHECK_PTX64-EMPTY:
18+
; CHECK_PTX64-NEXT: // %bb.0:
19+
; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_param_0];
20+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
21+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
22+
; CHECK_PTX64-NEXT: ret;
23+
;
24+
; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit(
25+
; CHECK_PTX64_SHARED32: {
26+
; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
27+
; CHECK_PTX64_SHARED32-EMPTY:
28+
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
29+
; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_param_0];
30+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
31+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
32+
; CHECK_PTX64_SHARED32-NEXT: ret;
33+
call void @llvm.nvvm.tcgen05.commit.cg1(ptr %bar_addr)
34+
35+
call void @llvm.nvvm.tcgen05.commit.cg2(ptr %bar_addr)
36+
37+
ret void
38+
}
39+
40+
; CHECK-LABEL: test_tcgen05_commit_shared
41+
define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) {
42+
; CHECK_PTX64-LABEL: test_tcgen05_commit_shared(
43+
; CHECK_PTX64: {
44+
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
45+
; CHECK_PTX64-EMPTY:
46+
; CHECK_PTX64-NEXT: // %bb.0:
47+
; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_shared_param_0];
48+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
49+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1];
50+
; CHECK_PTX64-NEXT: ret;
51+
;
52+
; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_shared(
53+
; CHECK_PTX64_SHARED32: {
54+
; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
55+
; CHECK_PTX64_SHARED32-EMPTY:
56+
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
57+
; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_commit_shared_param_0];
58+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%r1];
59+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1];
60+
; CHECK_PTX64_SHARED32-NEXT: ret;
61+
call void @llvm.nvvm.tcgen05.commit.shared.cg1(ptr addrspace(3) %bar_addr)
62+
63+
call void @llvm.nvvm.tcgen05.commit.shared.cg2(ptr addrspace(3) %bar_addr)
64+
65+
ret void
66+
}
67+
68+
declare void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %bar_addr, i16 %cta_mask)
69+
declare void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask)
70+
declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask)
71+
declare void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask)
72+
73+
; CHECK-LABEL: test_tcgen05_commit_mc
74+
define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) {
75+
; CHECK_PTX64-LABEL: test_tcgen05_commit_mc(
76+
; CHECK_PTX64: {
77+
; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
78+
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
79+
; CHECK_PTX64-EMPTY:
80+
; CHECK_PTX64-NEXT: // %bb.0:
81+
; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0];
82+
; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1];
83+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
84+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
85+
; CHECK_PTX64-NEXT: ret;
86+
;
87+
; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc(
88+
; CHECK_PTX64_SHARED32: {
89+
; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>;
90+
; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>;
91+
; CHECK_PTX64_SHARED32-EMPTY:
92+
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
93+
; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0];
94+
; CHECK_PTX64_SHARED32-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1];
95+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
96+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
97+
; CHECK_PTX64_SHARED32-NEXT: ret;
98+
call void @llvm.nvvm.tcgen05.commit.mc.cg1(ptr %bar_addr, i16 %cta_mask)
99+
100+
call void @llvm.nvvm.tcgen05.commit.mc.cg2(ptr %bar_addr, i16 %cta_mask)
101+
102+
ret void
103+
}
104+
105+
; CHECK-LABEL: test_tcgen05_commit_mc_shared
106+
define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_mask) {
107+
; CHECK_PTX64-LABEL: test_tcgen05_commit_mc_shared(
108+
; CHECK_PTX64: {
109+
; CHECK_PTX64-NEXT: .reg .b16 %rs<2>;
110+
; CHECK_PTX64-NEXT: .reg .b64 %rd<2>;
111+
; CHECK_PTX64-EMPTY:
112+
; CHECK_PTX64-NEXT: // %bb.0:
113+
; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_shared_param_0];
114+
; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
115+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
116+
; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1;
117+
; CHECK_PTX64-NEXT: ret;
118+
;
119+
; CHECK_PTX64_SHARED32-LABEL: test_tcgen05_commit_mc_shared(
120+
; CHECK_PTX64_SHARED32: {
121+
; CHECK_PTX64_SHARED32-NEXT: .reg .b16 %rs<2>;
122+
; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>;
123+
; CHECK_PTX64_SHARED32-EMPTY:
124+
; CHECK_PTX64_SHARED32-NEXT: // %bb.0:
125+
; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_commit_mc_shared_param_0];
126+
; CHECK_PTX64_SHARED32-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1];
127+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1;
128+
; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1;
129+
; CHECK_PTX64_SHARED32-NEXT: ret;
130+
call void @llvm.nvvm.tcgen05.commit.mc.shared.cg1(ptr addrspace(3) %bar_addr, i16 %cta_mask)
131+
132+
call void @llvm.nvvm.tcgen05.commit.mc.shared.cg2(ptr addrspace(3) %bar_addr, i16 %cta_mask)
133+
134+
ret void
135+
}
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
3+
; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %}
4+
5+
declare void @llvm.nvvm.tcgen05.fence.before.thread.sync()
6+
declare void @llvm.nvvm.tcgen05.fence.after.thread.sync()
7+
declare void @llvm.nvvm.tcgen05.wait.ld()
8+
declare void @llvm.nvvm.tcgen05.wait.st()
9+
10+
; CHECK-LABEL: test_tcgen05_fence
11+
define void @test_tcgen05_fence() {
12+
; CHECK-LABEL: test_tcgen05_fence(
13+
; CHECK: {
14+
; CHECK-EMPTY:
15+
; CHECK-EMPTY:
16+
; CHECK-NEXT: // %bb.0:
17+
; CHECK-NEXT: tcgen05.fence::before_thread_sync;
18+
; CHECK-NEXT: tcgen05.fence::after_thread_sync;
19+
; CHECK-NEXT: ret;
20+
call void @llvm.nvvm.tcgen05.fence.before.thread.sync()
21+
22+
call void @llvm.nvvm.tcgen05.fence.after.thread.sync()
23+
24+
ret void
25+
}
26+
27+
; CHECK-LABEL: test_tcgen05_wait
28+
define void @test_tcgen05_wait() {
29+
; CHECK-LABEL: test_tcgen05_wait(
30+
; CHECK: {
31+
; CHECK-EMPTY:
32+
; CHECK-EMPTY:
33+
; CHECK-NEXT: // %bb.0:
34+
; CHECK-NEXT: tcgen05.wait::ld.sync.aligned;
35+
; CHECK-NEXT: tcgen05.wait::st.sync.aligned;
36+
; CHECK-NEXT: ret;
37+
call void @llvm.nvvm.tcgen05.wait.ld()
38+
39+
call void @llvm.nvvm.tcgen05.wait.st()
40+
41+
ret void
42+
}

0 commit comments

Comments
 (0)