Skip to content

Commit c507a08

Browse files
authored
[NVPTX] Add TMA Bulk Copy Intrinsics (#138679)
This patch adds a new variant of TMA Bulk Copy intrinsics introduced in sm100+. This variant has an additional byte_mask to select the bytes for the copy operation. * Selection is all done through table-gen now. So, this patch removes the corresponding SelectCpAsyncBulkS2G() function. * lit tests are verified with a cuda-12.8 ptxas executable. PTX Spec link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-bulk-copy Signed-off-by: Durgadoss R <[email protected]>
1 parent d5da557 commit c507a08

File tree

7 files changed

+88
-48
lines changed

7 files changed

+88
-48
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -672,6 +672,7 @@ Syntax:
672672
.. code-block:: llvm
673673
674674
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch)
675+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(..., i32 %size, i64 %ch, i1 %flag_ch, i16 %mask)
675676
676677
Overview:
677678
"""""""""
@@ -680,10 +681,13 @@ The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic
680681
corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX
681682
instructions. These instructions initiate an asynchronous copy from
682683
shared::cta to global memory. The 32-bit operand ``%size`` specifies
683-
the amount of memory to be copied and it must be a multiple of 16.
684+
the amount of memory to be copied (in bytes) and it must be a multiple
685+
of 16. For the ``.bytemask`` variant, the 16-bit wide mask operand
686+
specifies whether the i-th byte of each 16-byte wide chunk of source
687+
data is copied to the destination.
684688

685-
* The last argument to these intrinsics is a boolean flag
686-
indicating support for cache_hint. This flag argument must
689+
* The ``i1 %flag_ch`` argument to these intrinsics is a boolean
690+
flag indicating support for cache_hint. This flag argument must
687691
be a compile-time constant. When set, it indicates a valid
688692
cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
689693
variant of the PTX instruction.

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2112,6 +2112,19 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
21122112
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
21132113
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;
21142114

2115+
// From Shared CTA to Global memory with bytemask
2116+
def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
2117+
: DefaultAttrsIntrinsic<[],
2118+
[llvm_global_ptr_ty, // dst_gmem_ptr
2119+
llvm_shared_ptr_ty, // src_smem_ptr
2120+
llvm_i32_ty, // copy_size
2121+
llvm_i64_ty, // cache_hint
2122+
llvm_i1_ty, // Flag for cache_hint
2123+
llvm_i16_ty], // byte_mask
2124+
[IntrConvergent, IntrArgMemOnly,
2125+
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
2126+
ImmArg<ArgIndex<4>>]>;
2127+
21152128
// Intrinsics for Bulk Copy Prefetch L2
21162129
def int_nvvm_cp_async_bulk_prefetch_L2
21172130
: DefaultAttrsIntrinsicFlags<[],

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -2685,31 +2685,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
26852685
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
26862686
}
26872687

2688-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
2689-
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2690-
// dst, src, size, cache_hint, cache_hint_flag
2691-
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2692-
// = {2} + {5}
2693-
size_t NumOps = N->getNumOperands();
2694-
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2695-
size_t NumArgs = IsCacheHint ? 4 : 3; // src, dst, size, cache_hint
2696-
2697-
SDLoc DL(N);
2698-
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
2699-
Ops.push_back(N->getOperand(0)); // Chain operand
2700-
2701-
bool IsShared32 =
2702-
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2703-
unsigned Opcode;
2704-
if (IsCacheHint)
2705-
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
2706-
: NVPTX::CP_ASYNC_BULK_S2G_CH;
2707-
else
2708-
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
2709-
: NVPTX::CP_ASYNC_BULK_S2G;
2710-
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2711-
}
2712-
27132688
void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
27142689
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
27152690
// {dst, mbar, src, size, multicast, cache_hint,
@@ -2892,9 +2867,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
28922867
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
28932868
SelectCpAsyncBulkG2S(N);
28942869
return true;
2895-
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
2896-
SelectCpAsyncBulkS2G(N);
2897-
return true;
28982870
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
28992871
SelectCpAsyncBulkPrefetchL2(N);
29002872
return true;

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9393
void SelectV2I64toI128(SDNode *N);
9494
void SelectI128toV2I64(SDNode *N);
9595
void SelectCpAsyncBulkG2S(SDNode *N);
96-
void SelectCpAsyncBulkS2G(SDNode *N);
9796
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
9897
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
9998
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,11 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
511511
// TMA Async Bulk Copy Functions
512512
//------------------------------
513513

514-
class CpAsyncBulkStr<bit mc, bit ch> {
514+
class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
515515
// Shared to Global memory
516516
string S2G = "cp.async.bulk.global.shared::cta.bulk_group"
517-
# !if(ch, ".L2::cache_hint", "");
517+
# !if(ch, ".L2::cache_hint", "")
518+
# !if(mask, ".cp_mask", "");
518519

519520
// Global to Shared cluster memory
520521
string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes"
@@ -525,18 +526,23 @@ class CpAsyncBulkStr<bit mc, bit ch> {
525526
string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
526527
}
527528

528-
multiclass CP_ASYNC_BULK_S2G<NVPTXRegClass rc> {
529-
def NAME: NVPTXInst<(outs),
530-
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size),
531-
!strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>,
532-
Requires<[hasPTX<80>, hasSM<90>]>;
533-
def NAME # _CH: NVPTXInst<(outs),
534-
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch),
535-
!strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>,
536-
Requires<[hasPTX<80>, hasSM<90>]>;
529+
multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
530+
def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
531+
!if(has_ch,
532+
CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;",
533+
CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"),
534+
[(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
535+
Requires<[hasPTX<80>, hasSM<90>]>;
536+
537+
def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
538+
!if(has_ch,
539+
CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;",
540+
CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"),
541+
[(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>,
542+
Requires<[hasPTX<86>, hasSM<100>]>;
537543
}
538-
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G<Int64Regs>;
539-
defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G<Int32Regs>;
544+
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>;
545+
defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>;
540546

541547
multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
542548
def NAME: NVPTXInst<(outs),
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
3+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
4+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
5+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %}
6+
7+
target triple = "nvptx64-nvidia-cuda"
8+
9+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1), ptr addrspace(3), i32, i64, i1, i16)
10+
11+
define void @cp_async_bulk_s2g_bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i16 %mask) {
12+
; CHECK-PTX64-LABEL: cp_async_bulk_s2g_bytemask(
13+
; CHECK-PTX64: {
14+
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
15+
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
16+
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
17+
; CHECK-PTX64-EMPTY:
18+
; CHECK-PTX64-NEXT: // %bb.0:
19+
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
20+
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_1];
21+
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_2];
22+
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_bytemask_param_3];
23+
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4];
24+
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%rd2], %r1, %rd3, %rs1;
25+
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%rd2], %r1, %rs1;
26+
; CHECK-PTX64-NEXT: ret;
27+
;
28+
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g_bytemask(
29+
; CHECK-PTX-SHARED32: {
30+
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
31+
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>;
32+
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
33+
; CHECK-PTX-SHARED32-EMPTY:
34+
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
35+
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
36+
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_1];
37+
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_bytemask_param_2];
38+
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_3];
39+
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4];
40+
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%r1], %r2, %rd2, %rs1;
41+
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%r1], %r2, %rs1;
42+
; CHECK-PTX-SHARED32-NEXT: ret;
43+
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1, i16 %mask)
44+
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0, i16 %mask)
45+
ret void
46+
}

llvm/test/CodeGen/NVPTX/cp-async-bulk.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
6666
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0];
6767
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1];
6868
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2];
69-
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
7069
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3];
70+
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
7171
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3;
7272
; CHECK-PTX64-NEXT: ret;
7373
;
@@ -80,11 +80,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
8080
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0];
8181
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1];
8282
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2];
83-
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
8483
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3];
84+
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
8585
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2;
8686
; CHECK-PTX-SHARED32-NEXT: ret;
87-
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0)
87+
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0)
8888
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1)
8989
ret void
9090
}

0 commit comments

Comments
 (0)