Skip to content

Commit fa7f0e5

Browse files
authored
[NVPTX] Add Bulk Copy Prefetch Intrinsics (#123226)
This patch adds NVVM intrinsics and NVPTX codegen for: - cp.async.bulk.prefetch.L2.* variants - These intrinsics optionally support cache_hints as indicated by the boolean flag argument. - Lit tests are added for all combinations of these intrinsics in cp-async-bulk.ll. - The generated PTX is verified with a 12.3 ptxas executable. - Added docs for these intrinsics in NVPTXUsage.rst file. PTX Spec reference: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch Co-authored-by: abmajumder <[email protected]>
1 parent ad6d808 commit fa7f0e5

File tree

6 files changed

+93
-0
lines changed

6 files changed

+93
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -553,6 +553,34 @@ it must be a multiple of 16.
553553
For more information, refer PTX ISA
554554
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
555555

556+
'``llvm.nvvm.cp.async.bulk.prefetch.L2``'
557+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
558+
559+
Syntax:
560+
"""""""
561+
562+
.. code-block:: llvm
563+
564+
declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 %flag_ch)
565+
566+
Overview:
567+
"""""""""
568+
569+
The '``@llvm.nvvm.cp.async.bulk.prefetch.L2``' intrinsic
570+
corresponds to the ``cp.async.bulk.prefetch.L2.*`` family
571+
of PTX instructions. These instructions initiate an asynchronous
572+
prefetch of bulk data from global memory to the L2 cache.
573+
The 32-bit operand ``%size`` specifies the amount of memory to be
574+
prefetched in terms of bytes and it must be a multiple of 16.
575+
576+
* The last argument to these intrinsics is boolean flag indicating
577+
support for cache_hint. These flag argument must be compile-time
578+
constant. When set, it indicates a valid cache_hint (``i64 %ch``)
579+
and generates the ``.L2::cache_hint`` variant of the PTX instruction.
580+
581+
For more information, refer PTX ISA
582+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch>`_.
583+
556584
'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
557585
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
558586

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5033,4 +5033,15 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
50335033
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
50345034
ImmArg<ArgIndex<4>>]>;
50355035

5036+
// Intrinsics for Bulk Copy Prefetch L2
5037+
def int_nvvm_cp_async_bulk_prefetch_L2
5038+
: DefaultAttrsIntrinsic<[],
5039+
[llvm_global_ptr_ty, // src_gmem_ptr
5040+
llvm_i32_ty, // copy_size
5041+
llvm_i64_ty, // cache_hint
5042+
llvm_i1_ty], // Flag for cache_hint
5043+
[IntrConvergent, IntrArgMemOnly,
5044+
NoCapture<ArgIndex<0>>, ReadOnly<ArgIndex<0>>,
5045+
ImmArg<ArgIndex<3>>]>;
5046+
50365047
} // let TargetPrefix = "nvvm"

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3168,6 +3168,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
31683168
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
31693169
}
31703170

3171+
void NVPTXDAGToDAGISel::SelectCpAsyncBulkPrefetchL2(SDNode *N) {
3172+
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
3173+
// src, size, cache_hint, cache_hint_flag
3174+
// NumOperands = {Chain, IID} + {Actual intrinsic args}
3175+
// = {2} + {4}
3176+
size_t NumOps = N->getNumOperands();
3177+
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
3178+
size_t NumArgs = IsCacheHint ? 3 : 2; // src, size, cache_hint
3179+
3180+
SDLoc DL(N);
3181+
SmallVector<SDValue, 4> Ops(N->ops().slice(2, NumArgs));
3182+
Ops.push_back(N->getOperand(0)); // Chain operand
3183+
3184+
unsigned Opcode = IsCacheHint
3185+
? NVPTX::CP_ASYNC_BULK_PREFETCH_CH
3186+
: NVPTX::CP_ASYNC_BULK_PREFETCH;
3187+
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
3188+
}
3189+
31713190
bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
31723191
unsigned IID = N->getConstantOperandVal(1);
31733192
using TMARedTy = llvm::nvvm::TMAReductionOp;
@@ -3181,6 +3200,9 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
31813200
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
31823201
SelectCpAsyncBulkS2G(N);
31833202
return true;
3203+
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
3204+
SelectCpAsyncBulkPrefetchL2(N);
3205+
return true;
31843206
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
31853207
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
31863208
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9393
void SelectI128toV2I64(SDNode *N);
9494
void SelectCpAsyncBulkG2S(SDNode *N);
9595
void SelectCpAsyncBulkS2G(SDNode *N);
96+
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
9697
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
9798
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
9899
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,18 @@ multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
547547
defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
548548
defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;
549549

550+
//------------------------------
551+
// Bulk Copy Prefetch Functions
552+
//------------------------------
553+
def CP_ASYNC_BULK_PREFETCH : NVPTXInst<(outs),
554+
(ins Int64Regs:$src, Int32Regs:$size),
555+
"cp.async.bulk.prefetch.L2.global [$src], $size;", []>,
556+
Requires<[hasPTX<80>, hasSM<90>]>;
557+
558+
def CP_ASYNC_BULK_PREFETCH_CH : NVPTXInst<(outs),
559+
(ins Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
560+
"cp.async.bulk.prefetch.L2.global.L2::cache_hint [$src], $size, $ch;", []>,
561+
Requires<[hasPTX<80>, hasSM<90>]>;
550562
//-------------------------------------
551563
// TMA Async Bulk Tensor Copy Functions
552564
//-------------------------------------

llvm/test/CodeGen/NVPTX/cp-async-bulk.ll

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ target triple = "nvptx64-nvidia-cuda"
99
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
1010
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1)
1111
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
12+
declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1)
1213

1314
define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i16 %mc, i64 %ch) {
1415
; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
@@ -116,3 +117,21 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
116117
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
117118
ret void
118119
}
120+
121+
define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
122+
; CHECK-PTX64-LABEL: cp_async_bulk_prefetch(
123+
; CHECK-PTX64: {
124+
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
125+
; CHECK-PTX64-NEXT: .reg .b64 %rd<3>;
126+
; CHECK-PTX64-EMPTY:
127+
; CHECK-PTX64-NEXT: // %bb.0:
128+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0];
129+
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1];
130+
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2];
131+
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
132+
; CHECK-PTX64-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1;
133+
; CHECK-PTX64-NEXT: ret;
134+
tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 %ch, i1 1)
135+
tail call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %src, i32 %size, i64 0, i1 0)
136+
ret void
137+
}

0 commit comments

Comments
 (0)