Skip to content

Commit 035d0b8

Browse files
abhilash1910joaosaffran
authored andcommitted
[NVPTX] Add intrinsics for prefetch.* (llvm#125887)
\[NVPTX\] Add Prefetch intrinsics This PR adds prefetch intrinsics with the relevant eviction priorities. * Lit tests are added as part of prefetch.ll * The generated PTX is verified with a 12.3 ptxas executable. * Added docs for these intrinsics in NVPTXUsage.rst. For more information, refer PTX ISA `<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_. --------- Co-authored-by: abmajumder <[email protected]>
1 parent 4eb1b47 commit 035d0b8

File tree

4 files changed

+169
-0
lines changed

4 files changed

+169
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,46 @@ prefetched in terms of bytes and it must be a multiple of 16.
581581
For more information, refer PTX ISA
582582
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch>`_.
583583

584+
'``llvm.nvvm.prefetch.*``'
585+
^^^^^^^^^^^^^^^^^^^^^^^^^^
586+
587+
Syntax:
588+
"""""""
589+
590+
.. code-block:: llvm
591+
592+
declare void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr)
593+
declare void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr)
594+
595+
declare void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr)
596+
declare void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr)
597+
declare void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr)
598+
declare void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr)
599+
600+
declare void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr)
601+
declare void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr)
602+
603+
declare void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr)
604+
605+
Overview:
606+
"""""""""
607+
608+
The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic
609+
correspond to the '``prefetch.*``;' and '``prefetchu.*``' family of PTX instructions.
610+
The '``prefetch.*``' instructions bring the cache line containing the
611+
specified address in global or local memory address space into the
612+
specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line
613+
containing the specified generic address into the specified uniform cache level.
614+
If no address space is specified, it is assumed to be generic address. The intrinsic
615+
uses and eviction priority which can be accessed by the '``.level::eviction_priority``' modifier.
616+
617+
* A prefetch to a shared memory location performs no operation.
618+
* A prefetch into the uniform cache requires a generic address,
619+
and no operation occurs if the address maps to a const, local, or shared memory location.
620+
621+
For more information, refer to the PTX ISA
622+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-prefetch-prefetchu>`_.
623+
584624
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
585625
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
586626

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848

4949
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
5050
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
51+
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
5152
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
5253

5354
//
@@ -4999,6 +5000,26 @@ foreach dim = [1, 2, 3, 4, 5] in {
49995000
}
50005001
}
50015002

5003+
// Intrinsics for Prefetch and Prefetchu
5004+
foreach level = ["L1", "L2"] in {
5005+
foreach addr = ["global", "local", ""] in {
5006+
foreach evict = !if(!eq(addr, "global"), ["evictlast", "evictnormal"], ["evictnormal"]) in {
5007+
defvar suffix = "" # !if(!eq(addr, ""), "", addr # "_") # level # "_" # evict;
5008+
def int_nvvm_prefetch_ # suffix : Intrinsic<[],
5009+
!cond(
5010+
!eq(addr, "global") : [llvm_global_ptr_ty],
5011+
!eq(addr, "local") : [llvm_local_ptr_ty],
5012+
!eq(addr, "") : [llvm_ptr_ty]),
5013+
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>,
5014+
NoCapture<ArgIndex<0>>]>;
5015+
}
5016+
}
5017+
}
5018+
5019+
def int_nvvm_prefetchu_L1_evictnormal : Intrinsic<[], [llvm_ptr_ty],
5020+
[IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>]>;
5021+
5022+
50025023
// Intrinsics for Bulk Copy using TMA (non-tensor)
50035024
// From Global to Shared Cluster
50045025
def int_nvvm_cp_async_bulk_global_to_shared_cluster

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,33 @@ foreach dim = [1, 2, 3, 4, 5] in {
740740
}
741741
}
742742

743+
//Prefetch and Prefetchu
744+
class Join<string sep, list<string> lst> {
745+
string ret = !foldl("", lst, a, b, !if(!eq(a, ""), b, !if(!eq(b,""), a, !strconcat(a, sep, b))));
746+
}
747+
748+
class PREFETCH_INTRS<string InstName> :
749+
NVPTXInst<(outs), (ins Int64Regs:$addr),
750+
InstName # " [$addr];",
751+
[(!cast<Intrinsic>(!strconcat("int_nvvm_",
752+
!subst(".", "_", InstName))) i64:$addr)]>,
753+
Requires<[hasPTX<80>, hasSM<90>]>;
754+
755+
756+
// Only global supports evictlast and evictnormal.
757+
// Other variants (local and default) only support evictnormal
758+
foreach level = ["L1", "L2"] in {
759+
foreach addr = ["global", "local", ""] in {
760+
foreach evict = !if(!eq(addr, "global"), ["evictlast", "evictnormal"], ["evictnormal"]) in {
761+
defvar suffix = Join<"_", [addr, level, evict]>.ret;
762+
defvar inst_name = "prefetch." # !subst("_", ".", suffix);
763+
def PREFETCH_# suffix : PREFETCH_INTRS<inst_name>;
764+
}
765+
}
766+
}
767+
768+
def PREFETCHU_L1_EVICTNORMAL : PREFETCH_INTRS<"prefetchu.L1.evictnormal">;
769+
743770
//-----------------------------------
744771
// MBarrier Functions
745772
//-----------------------------------

llvm/test/CodeGen/NVPTX/prefetch.ll

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| FileCheck --check-prefixes=CHECK-PTX64 %s
3+
; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80| %ptxas-verify -arch=sm_90 %}
4+
5+
target triple = "nvptx64-nvidia-cuda"
6+
7+
declare void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr)
8+
declare void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr)
9+
10+
declare void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr)
11+
declare void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr)
12+
declare void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr)
13+
declare void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr)
14+
15+
declare void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr)
16+
declare void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr)
17+
18+
declare void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr)
19+
20+
define void @prefetch_local(ptr addrspace(5) %local_ptr) {
21+
; CHECK-PTX64-LABEL: prefetch_local(
22+
; CHECK-PTX64: {
23+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
24+
; CHECK-PTX64-EMPTY:
25+
; CHECK-PTX64-NEXT: // %bb.0:
26+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_local_param_0];
27+
; CHECK-PTX64-NEXT: prefetch.local.L1.evictnormal [%rd1];
28+
; CHECK-PTX64-NEXT: prefetch.local.L2.evictnormal [%rd1];
29+
; CHECK-PTX64-NEXT: ret;
30+
tail call void @llvm.nvvm.prefetch.local.L1.evictnormal(ptr addrspace(5) %local_ptr)
31+
tail call void @llvm.nvvm.prefetch.local.L2.evictnormal(ptr addrspace(5) %local_ptr)
32+
ret void
33+
}
34+
35+
define void @prefetch_global(ptr addrspace(1) %global_ptr) {
36+
; CHECK-PTX64-LABEL: prefetch_global(
37+
; CHECK-PTX64: {
38+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
39+
; CHECK-PTX64-EMPTY:
40+
; CHECK-PTX64-NEXT: // %bb.0:
41+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_global_param_0];
42+
; CHECK-PTX64-NEXT: prefetch.global.L1.evictnormal [%rd1];
43+
; CHECK-PTX64-NEXT: prefetch.global.L2.evictnormal [%rd1];
44+
; CHECK-PTX64-NEXT: prefetch.global.L1.evictlast [%rd1];
45+
; CHECK-PTX64-NEXT: prefetch.global.L2.evictlast [%rd1];
46+
; CHECK-PTX64-NEXT: ret;
47+
tail call void @llvm.nvvm.prefetch.global.L1.evictnormal(ptr addrspace(1) %global_ptr)
48+
tail call void @llvm.nvvm.prefetch.global.L2.evictnormal(ptr addrspace(1) %global_ptr)
49+
tail call void @llvm.nvvm.prefetch.global.L1.evictlast(ptr addrspace(1) %global_ptr)
50+
tail call void @llvm.nvvm.prefetch.global.L2.evictlast(ptr addrspace(1) %global_ptr)
51+
ret void
52+
}
53+
54+
55+
define void @prefetch_(ptr %ptr) {
56+
; CHECK-PTX64-LABEL: prefetch_(
57+
; CHECK-PTX64: {
58+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
59+
; CHECK-PTX64-EMPTY:
60+
; CHECK-PTX64-NEXT: // %bb.0:
61+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch__param_0];
62+
; CHECK-PTX64-NEXT: prefetch.L1.evictnormal [%rd1];
63+
; CHECK-PTX64-NEXT: prefetch.L2.evictnormal [%rd1];
64+
; CHECK-PTX64-NEXT: ret;
65+
tail call void @llvm.nvvm.prefetch.L1.evictnormal(ptr %ptr)
66+
tail call void @llvm.nvvm.prefetch.L2.evictnormal(ptr %ptr)
67+
ret void
68+
}
69+
70+
define void @prefetchu_l1(ptr %ptr) {
71+
; CHECK-PTX64-LABEL: prefetchu_l1(
72+
; CHECK-PTX64: {
73+
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
74+
; CHECK-PTX64-EMPTY:
75+
; CHECK-PTX64-NEXT: // %bb.0:
76+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetchu_l1_param_0];
77+
; CHECK-PTX64-NEXT: prefetchu.L1.evictnormal [%rd1];
78+
; CHECK-PTX64-NEXT: ret;
79+
tail call void @llvm.nvvm.prefetchu.L1.evictnormal(ptr %ptr)
80+
ret void
81+
}

0 commit comments

Comments
 (0)