Skip to content

Commit 5fa07db

Browse files
[LLVM][NVPTX] Add NVPTX codegen support for fence.proxy.tensormap
This commit adds LLVM intrinsics and NVPTX codegen support for `fence.proxy.tensormap`, with lit tests in fence-proxy-tensormap.ll. It also adds documentation for the new intrinsics to NVPTXUsage.rst.
1 parent f2a0f97 commit 5fa07db

File tree

4 files changed

+120
-0
lines changed

4 files changed

+120
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,34 @@ Overview:
251251
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
252252
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
253253

254+
Membar/Fences
255+
-------------
256+
257+
258+
'``llvm.nvvm.fence.proxy.tensormap.*``'
259+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
260+
261+
Syntax:
262+
"""""""
263+
264+
.. code-block:: llvm
265+
266+
declare void @llvm.nvvm.fence.proxy.tensormap.release.cta()
267+
declare void @llvm.nvvm.fence.proxy.tensormap.release.cluster()
268+
declare void @llvm.nvvm.fence.proxy.tensormap.release.gpu()
269+
declare void @llvm.nvvm.fence.proxy.tensormap.release.sys()
270+
271+
declare void @llvm.nvvm.fence.proxy.tensormap.acquire.cta(ptr %addr, i32 %size)
272+
declare void @llvm.nvvm.fence.proxy.tensormap.acquire.cluster(ptr %addr, i32 %size)
273+
declare void @llvm.nvvm.fence.proxy.tensormap.acquire.gpu(ptr %addr, i32 %size)
274+
declare void @llvm.nvvm.fence.proxy.tensormap.acquire.sys(ptr %addr, i32 %size)
275+
276+
Overview:
277+
"""""""""
278+
279+
The '``@llvm.nvvm.fence.proxy.tensormap.release.*``' intrinsics emit ``fence.proxy.tensormap::generic.release.*`` and the '``@llvm.nvvm.fence.proxy.tensormap.acquire.*``' intrinsics emit ``fence.proxy.tensormap::generic.acquire.* [addr], size;``. ``nvvm.fence.proxy.tensormap.*`` is a uni-directional fence used to establish ordering between memory accesses that may happen through different proxies. ``nvvm.fence.proxy.tensormap.release`` can form a release sequence that synchronizes with an acquire sequence that contains the ``nvvm.fence.proxy.tensormap.acquire`` proxy fence.
280+
281+
The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size-1]`` on which the ordering guarantees on the memory accesses across the proxies are to be provided. The only supported value for the ``size`` operand is ``128``, and it must be an immediate. For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`_.
254282

255283
Other Intrinsics
256284
----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,6 +1418,20 @@ let TargetPrefix = "nvvm" in {
14181418
def int_nvvm_fence_sc_cluster:
14191419
Intrinsic<[], [], [IntrNoCallback]>;
14201420

1421+
// Proxy fence (uni-directional)
1422+
foreach scope = ["cta", "cluster", "gpu", "sys"] in {
1423+
1424+
def int_nvvm_fence_proxy_tensormap_release_ # scope:
1425+
Intrinsic<[], [], [IntrNoCallback],
1426+
"llvm.nvvm.fence.proxy.tensormap.release." # scope>;
1427+
1428+
def int_nvvm_fence_proxy_tensormap_acquire_ # scope:
1429+
Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
1430+
[IntrNoCallback, ImmArg<ArgIndex<1>>],
1431+
"llvm.nvvm.fence.proxy.tensormap.acquire." # scope>;
1432+
1433+
}
1434+
14211435
// Async Copy
14221436
def int_nvvm_cp_async_mbarrier_arrive :
14231437
ClangBuiltin<"__nvvm_cp_async_mbarrier_arrive">,

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,48 @@ def INT_FENCE_SC_CLUSTER:
335335
MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
336336
Requires<[hasPTX<78>, hasSM<90>]>;
337337

338+
// Proxy fence (uni-directional)
339+
// fence.proxy.tensormap.release variants
340+
341+
class FENCE_PROXY_TENSORMAP_RELEASE<string Scope, Intrinsic Intr> :
342+
NVPTXInst<(outs), (ins),
343+
"fence.proxy.tensormap::generic.release." # Scope # ";", [(Intr)]>,
344+
Requires<[hasPTX<83>, hasSM<90>]>;
345+
346+
def INT_FENCE_PROXY_TENSORMAP_RELEASE_CTA:
347+
FENCE_PROXY_TENSORMAP_RELEASE<"cta",
348+
int_nvvm_fence_proxy_tensormap_release_cta>;
349+
def INT_FENCE_PROXY_TENSORMAP_RELEASE_CLUSTER:
350+
FENCE_PROXY_TENSORMAP_RELEASE<"cluster",
351+
int_nvvm_fence_proxy_tensormap_release_cluster>;
352+
def INT_FENCE_PROXY_TENSORMAP_RELEASE_GPU:
353+
FENCE_PROXY_TENSORMAP_RELEASE<"gpu",
354+
int_nvvm_fence_proxy_tensormap_release_gpu>;
355+
def INT_FENCE_PROXY_TENSORMAP_RELEASE_SYS:
356+
FENCE_PROXY_TENSORMAP_RELEASE<"sys",
357+
int_nvvm_fence_proxy_tensormap_release_sys>;
358+
359+
// fence.proxy.tensormap.acquire variants
360+
361+
class FENCE_PROXY_TENSORMAP_ACQUIRE<string Scope, Intrinsic Intr> :
362+
NVPTXInst<(outs), (ins Int64Regs:$addr, i32imm:$size),
363+
"fence.proxy.tensormap::generic.acquire." # Scope # " [$addr], $size;",
364+
[(Intr Int64Regs:$addr, timm:$size)]>,
365+
Requires<[hasPTX<83>, hasSM<90>]>;
366+
367+
def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_CTA :
368+
FENCE_PROXY_TENSORMAP_ACQUIRE<"cta",
369+
int_nvvm_fence_proxy_tensormap_acquire_cta>;
370+
def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_CLUSTER :
371+
FENCE_PROXY_TENSORMAP_ACQUIRE<"cluster",
372+
int_nvvm_fence_proxy_tensormap_acquire_cluster>;
373+
def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_GPU :
374+
FENCE_PROXY_TENSORMAP_ACQUIRE<"gpu",
375+
int_nvvm_fence_proxy_tensormap_acquire_gpu>;
376+
def INT_FENCE_PROXY_TENSORMAP_ACQUIRE_SYS :
377+
FENCE_PROXY_TENSORMAP_ACQUIRE<"sys",
378+
int_nvvm_fence_proxy_tensormap_acquire_sys>;
379+
338380
//-----------------------------------
339381
// Async Copy Functions
340382
//-----------------------------------
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s
2+
; RUN: %if ptxas-12.5 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}
3+
4+
; CHECK-LABEL: test_fence_proxy_tensormap_release
5+
define void @test_fence_proxy_tensormap_release() {
6+
; CHECK: fence.proxy.tensormap::generic.release.cta;
7+
call void @llvm.nvvm.fence.proxy.tensormap.release.cta();
8+
9+
; CHECK: fence.proxy.tensormap::generic.release.cluster;
10+
call void @llvm.nvvm.fence.proxy.tensormap.release.cluster();
11+
12+
; CHECK: fence.proxy.tensormap::generic.release.gpu;
13+
call void @llvm.nvvm.fence.proxy.tensormap.release.gpu();
14+
15+
; CHECK: fence.proxy.tensormap::generic.release.sys;
16+
call void @llvm.nvvm.fence.proxy.tensormap.release.sys();
17+
18+
ret void
19+
}
20+
21+
; CHECK-LABEL: test_fence_proxy_tensormap_acquire
22+
define void @test_fence_proxy_tensormap_acquire(ptr addrspace(0) %addr) {
23+
; CHECK: fence.proxy.tensormap::generic.acquire.cta [%rd{{[0-9]+}}], 128;
24+
call void @llvm.nvvm.fence.proxy.tensormap.acquire.cta(ptr addrspace(0) %addr, i32 128);
25+
26+
; CHECK: fence.proxy.tensormap::generic.acquire.cluster [%rd{{[0-9]+}}], 128;
27+
call void @llvm.nvvm.fence.proxy.tensormap.acquire.cluster(ptr addrspace(0) %addr, i32 128);
28+
29+
; CHECK: fence.proxy.tensormap::generic.acquire.gpu [%rd{{[0-9]+}}], 128;
30+
call void @llvm.nvvm.fence.proxy.tensormap.acquire.gpu(ptr addrspace(0) %addr, i32 128);
31+
32+
; CHECK: fence.proxy.tensormap::generic.acquire.sys [%rd{{[0-9]+}}], 128;
33+
call void @llvm.nvvm.fence.proxy.tensormap.acquire.sys(ptr addrspace(0) %addr, i32 128);
34+
35+
ret void
36+
}

0 commit comments

Comments
 (0)