Skip to content

Commit bfef7cc

Browse files
[LLVM][NVPTX] Add NVPTX codegen support for fence.proxy.tensormap (llvm#100748)
This commit adds LLVM Intrinsics and NVPTX codegen support for `fence.proxy.tensormap` with lit tests under fence-proxy-tensormap.ll. Also, added Intrinsics documentation in NVPTXUsage.rst --------- Co-authored-by: gonzalobg <[email protected]>
1 parent 999bab7 commit bfef7cc

File tree

5 files changed

+135
-0
lines changed

5 files changed

+135
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,41 @@ Overview:
251251
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
252252
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
253253

254+
Membar/Fences
255+
-------------
256+
257+
258+
'``llvm.nvvm.fence.proxy.tensormap_generic.*``'
259+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
260+
261+
Syntax:
262+
"""""""
263+
264+
.. code-block:: llvm
265+
266+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta()
267+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster()
268+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu()
269+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys()
270+
271+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr %addr, i32 %size)
272+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr %addr, i32 %size)
273+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr %addr, i32 %size)
274+
declare void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr %addr, i32 %size)
275+
276+
Overview:
277+
"""""""""
278+
279+
The ``@llvm.nvvm.fence.proxy.tensormap_generic.*`` is a uni-directional fence used to establish ordering between a prior memory access performed via the generic `proxy <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#proxies>`_ and a subsequent memory access performed via the tensormap proxy. ``nvvm.fence.proxy.tensormap_generic.release`` can form a release sequence that synchronizes with an acquire sequence that contains the ``nvvm.fence.proxy.tensormap_generic.acquire`` proxy fence. The following table describes the mapping between the LLVM intrinsics and the PTX instructions:
280+
281+
====================================================== =========================================================
282+
NVVM Intrinsic PTX Instruction
283+
====================================================== =========================================================
284+
``@llvm.nvvm.fence.proxy.tensormap_generic.release.*`` ``fence.proxy.tensormap::generic.release.*``
285+
``@llvm.nvvm.fence.proxy.tensormap_generic.acquire.*`` ``fence.proxy.tensormap::generic.acquire.* [addr], size``
286+
====================================================== =========================================================
287+
288+
The address operand ``addr`` and the operand ``size`` together specify the memory range ``[addr, addr+size)`` on which the ordering guarantees on the memory accesses across the proxies are to be provided. The only supported value for the ``size`` operand is ``128``, and it must be an immediate. Generic addressing is used unconditionally, and the address specified by the operand ``addr`` must fall within the ``.global`` state space. Otherwise, the behavior is undefined. For more information, see `PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-membar>`_.
254289

255290
Other Intrinsics
256291
----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,6 +1418,20 @@ let TargetPrefix = "nvvm" in {
14181418
def int_nvvm_fence_sc_cluster:
14191419
Intrinsic<[], [], [IntrNoCallback]>;
14201420

1421+
// Proxy fence (uni-directional)
1422+
foreach scope = ["cta", "cluster", "gpu", "sys"] in {
1423+
1424+
def int_nvvm_fence_proxy_tensormap_generic_release_ # scope:
1425+
Intrinsic<[], [], [IntrNoCallback],
1426+
"llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>;
1427+
1428+
def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope:
1429+
Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
1430+
[IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>],
1431+
"llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>;
1432+
1433+
}
1434+
14211435
// Async Copy
14221436
def int_nvvm_cp_async_mbarrier_arrive :
14231437
ClangBuiltin<"__nvvm_cp_async_mbarrier_arrive">,

llvm/lib/IR/Verifier.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6329,6 +6329,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
63296329
"llvm.threadlocal.address operand isThreadLocal() must be true");
63306330
break;
63316331
}
6332+
case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cta:
6333+
case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_cluster:
6334+
case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_gpu:
6335+
case Intrinsic::nvvm_fence_proxy_tensormap_generic_acquire_sys: {
6336+
unsigned size = cast<ConstantInt>(Call.getArgOperand(1))->getZExtValue();
6337+
Check(size == 128, " The only supported value for size operand is 128");
6338+
break;
6339+
}
63326340
};
63336341

63346342
// Verify that there aren't any unmediated control transfers between funclets.

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,48 @@ def INT_FENCE_SC_CLUSTER:
335335
MEMBAR<"fence.sc.cluster;", int_nvvm_fence_sc_cluster>,
336336
Requires<[hasPTX<78>, hasSM<90>]>;
337337

338+
// Proxy fence (uni-directional)
339+
// fence.proxy.tensormap.release variants
340+
341+
class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> :
342+
NVPTXInst<(outs), (ins),
343+
"fence.proxy.tensormap::generic.release." # Scope # ";", [(Intr)]>,
344+
Requires<[hasPTX<83>, hasSM<90>]>;
345+
346+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CTA:
347+
FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cta",
348+
int_nvvm_fence_proxy_tensormap_generic_release_cta>;
349+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_CLUSTER:
350+
FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"cluster",
351+
int_nvvm_fence_proxy_tensormap_generic_release_cluster>;
352+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_GPU:
353+
FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"gpu",
354+
int_nvvm_fence_proxy_tensormap_generic_release_gpu>;
355+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_RELEASE_SYS:
356+
FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<"sys",
357+
int_nvvm_fence_proxy_tensormap_generic_release_sys>;
358+
359+
// fence.proxy.tensormap.acquire variants
360+
361+
class FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<string Scope, Intrinsic Intr> :
362+
NVPTXInst<(outs), (ins Int64Regs:$addr),
363+
"fence.proxy.tensormap::generic.acquire." # Scope # " [$addr], 128;",
364+
[(Intr Int64Regs:$addr, (i32 128))]>,
365+
Requires<[hasPTX<83>, hasSM<90>]>;
366+
367+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CTA :
368+
FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cta",
369+
int_nvvm_fence_proxy_tensormap_generic_acquire_cta>;
370+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_CLUSTER :
371+
FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"cluster",
372+
int_nvvm_fence_proxy_tensormap_generic_acquire_cluster>;
373+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_GPU :
374+
FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"gpu",
375+
int_nvvm_fence_proxy_tensormap_generic_acquire_gpu>;
376+
def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS :
377+
FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE<"sys",
378+
int_nvvm_fence_proxy_tensormap_generic_acquire_sys>;
379+
338380
//-----------------------------------
339381
// Async Copy Functions
340382
//-----------------------------------
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck --check-prefixes=CHECK %s
2+
; RUN: %if ptxas-12.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}
3+
4+
; CHECK-LABEL: test_fence_proxy_tensormap_generic_release
5+
define void @test_fence_proxy_tensormap_generic_release() {
6+
; CHECK: fence.proxy.tensormap::generic.release.cta;
7+
call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cta();
8+
9+
; CHECK: fence.proxy.tensormap::generic.release.cluster;
10+
call void @llvm.nvvm.fence.proxy.tensormap_generic.release.cluster();
11+
12+
; CHECK: fence.proxy.tensormap::generic.release.gpu;
13+
call void @llvm.nvvm.fence.proxy.tensormap_generic.release.gpu();
14+
15+
; CHECK: fence.proxy.tensormap::generic.release.sys;
16+
call void @llvm.nvvm.fence.proxy.tensormap_generic.release.sys();
17+
18+
ret void
19+
}
20+
21+
; CHECK-LABEL: test_fence_proxy_tensormap_generic_acquire
22+
define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) {
23+
; CHECK: fence.proxy.tensormap::generic.acquire.cta [%rd{{[0-9]+}}], 128;
24+
call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 128);
25+
26+
; CHECK: fence.proxy.tensormap::generic.acquire.cluster [%rd{{[0-9]+}}], 128;
27+
call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cluster(ptr addrspace(0) %addr, i32 128);
28+
29+
; CHECK: fence.proxy.tensormap::generic.acquire.gpu [%rd{{[0-9]+}}], 128;
30+
call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.gpu(ptr addrspace(0) %addr, i32 128);
31+
32+
; CHECK: fence.proxy.tensormap::generic.acquire.sys [%rd{{[0-9]+}}], 128;
33+
call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.sys(ptr addrspace(0) %addr, i32 128);
34+
35+
ret void
36+
}

0 commit comments

Comments
 (0)