Skip to content

Commit 241a56d

Browse files
authored
[NVPTX] Add Intrinsics for applypriority.* (#127989)
\[NVPTX\] Add ApplyPriority intrinsics This PR adds applypriority.\* intrinsics with relevant eviction priorities. * The lowering is handled from nvvm to nvptx tablegen directly. * Lit tests are added as part of applypriority.ll * The generated PTX is verified with a 12.3 ptxas executable. * Added docs for these intrinsics in NVPTXUsage.rst. For more information, refer to the PTX ISA `<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-applypriority>`_. --------- Co-authored-by: abmajumder <[email protected]>
1 parent 15e295d commit 241a56d

File tree

4 files changed

+79
-0
lines changed

4 files changed

+79
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -647,6 +647,30 @@ uses and eviction priority which can be accessed by the '``.level::eviction_prio
647647
For more information, refer to the PTX ISA
648648
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_.
649649

650+
'``llvm.nvvm.applypriority.*``'
651+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
652+
653+
Syntax:
654+
"""""""
655+
656+
.. code-block:: llvm
657+
658+
declare void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 %size)
659+
declare void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 %size)
660+
661+
Overview:
662+
"""""""""
663+
664+
The '``@llvm.nvvm.applypriority.*``' applies the cache eviction priority specified by the
665+
.level::eviction_priority qualifier to the address range [a..a+size) in the specified cache
666+
level. If no state space is specified then Generic Addressing is used. If the specified address
667+
does not fall within the address window of .global state space then the behavior is undefined.
668+
The operand size is an integer constant that specifies the amount of data, in bytes, in the specified cache
669+
level on which the priority is to be applied. The only supported value for the size operand is 128.
670+
671+
For more information, refer to the PTX ISA
672+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-applypriority>`_.
673+
650674
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
651675
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
652676

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5072,6 +5072,16 @@ def int_nvvm_prefetch_global_L2_evict_last: Intrinsic<[], [llvm_global_ptr_ty],
50725072
def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty],
50735073
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
50745074

5075+
def int_nvvm_applypriority_global_L2_evict_normal
5076+
: DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty, llvm_i64_ty],
5077+
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
5078+
ImmArg<ArgIndex<1>>]>;
5079+
5080+
def int_nvvm_applypriority_L2_evict_normal
5081+
: DefaultAttrsIntrinsic<[], [llvm_ptr_ty, llvm_i64_ty],
5082+
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
5083+
ImmArg<ArgIndex<1>>]>;
5084+
50755085

50765086
// Intrinsics for Bulk Copy using TMA (non-tensor)
50775087
// From Global to Shared Cluster

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -789,6 +789,17 @@ def PREFETCH_GLOBAL_L2_EVICT_LAST : NVPTXInst<(outs), (ins Int64Regs:$addr),
789789

790790
def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
791791

792+
//Applypriority intrinsics
793+
class APPLYPRIORITY_L2_INTRS<string addr> :
794+
NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size),
795+
StrJoin<".", ["applypriority", addr , "L2::evict_normal"]>.ret # " [$addr], $size;",
796+
[(!cast<Intrinsic>(StrJoin<"_", ["int_nvvm_applypriority", addr , "L2_evict_normal"]>.ret)
797+
i64:$addr, i64:$size)]>,
798+
Requires<[hasPTX<74>, hasSM<80>]>;
799+
800+
def APPLYPRIORITY_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"">;
801+
def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">;
802+
792803
//-----------------------------------
793804
// MBarrier Functions
794805
//-----------------------------------
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| FileCheck --check-prefixes=CHECK-PTX64 %s
3+
; RUN: %if ptxas-11.4 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_80 -mattr=+ptx74| %ptxas-verify -arch=sm_80 %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
declare void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 %size)
8+
declare void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 %size)
9+
10+
define void @applypriority_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
11+
; CHECK-PTX64-LABEL: applypriority_global_L2(
12+
; CHECK-PTX64: {
13+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
14+
; CHECK-PTX64-EMPTY:
15+
; CHECK-PTX64-NEXT: // %bb.0:
16+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [applypriority_global_L2_param_0];
17+
; CHECK-PTX64-NEXT: applypriority.global.L2::evict_normal [%rd1], 128;
18+
; CHECK-PTX64-NEXT: ret;
19+
tail call void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 128)
20+
ret void
21+
}
22+
23+
define void @applypriority_L2(ptr %ptr, i64 %size) {
24+
; CHECK-PTX64-LABEL: applypriority_L2(
25+
; CHECK-PTX64: {
26+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
27+
; CHECK-PTX64-EMPTY:
28+
; CHECK-PTX64-NEXT: // %bb.0:
29+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [applypriority_L2_param_0];
30+
; CHECK-PTX64-NEXT: applypriority.L2::evict_normal [%rd1], 128;
31+
; CHECK-PTX64-NEXT: ret;
32+
tail call void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 128)
33+
ret void
34+
}

0 commit comments

Comments
 (0)