Skip to content

Commit 5c97fa7

Browse files
committed
[NVPTX] Add TMA Bulk Copy Intrinsics
This patch adds a new variant of TMA Bulk Copy intrinsics introduced in sm100+. This variant has an additional byte_mask to select the bytes for the copy operation. Signed-off-by: Durgadoss R <[email protected]>
1 parent ffc5f79 commit 5c97fa7

File tree

6 files changed

+121
-30
lines changed

6 files changed

+121
-30
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,7 @@ Syntax:
616616
.. code-block:: llvm
617617
618618
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch)
619+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(..., i32 %size, i16 %mask, i64 %ch, i1 %flag_ch)
619620
620621
Overview:
621622
"""""""""
@@ -624,7 +625,10 @@ The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic
624625
corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX
625626
instructions. These instructions initiate an asynchronous copy from
626627
shared::cta to global memory. The 32-bit operand ``%size`` specifies
627-
the amount of memory to be copied and it must be a multiple of 16.
628+
the amount of memory to be copied (in bytes) and it must be a multiple
629+
of 16. For the ``.bytemask`` variant, the 16-bit wide mask operand
630+
specifies whether the i-th byte of each 16-byte wide chunk of source
631+
data is copied to the destination.
628632

629633
* The last argument to these intrinsics is a boolean flag
630634
indicating support for cache_hint. This flag argument must

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5323,6 +5323,19 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
53235323
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
53245324
ImmArg<ArgIndex<4>>]>;
53255325

5326+
// From Shared CTA to Global memory with bytemask
5327+
def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
5328+
: DefaultAttrsIntrinsic<[],
5329+
[llvm_global_ptr_ty, // dst_gmem_ptr
5330+
llvm_shared_ptr_ty, // src_smem_ptr
5331+
llvm_i32_ty, // copy_size
5332+
llvm_i16_ty, // byte_mask
5333+
llvm_i64_ty, // cache_hint
5334+
llvm_i1_ty], // Flag for cache_hint
5335+
[IntrConvergent, IntrArgMemOnly,
5336+
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
5337+
ImmArg<ArgIndex<5>>]>;
5338+
53265339
// Intrinsics for Bulk Copy Prefetch L2
53275340
def int_nvvm_cp_async_bulk_prefetch_L2
53285341
: DefaultAttrsIntrinsic<[],

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2720,28 +2720,44 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
27202720
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
27212721
}
27222722

2723-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
2723+
void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2GCommon(SDNode *N, bool HasMask) {
27242724
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2725-
// dst, src, size, cache_hint, cache_hint_flag
2725+
// dst, src, size, mask, cache_hint, cache_hint_flag
27262726
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2727-
// = {2} + {5}
2727+
// = {2} + {6}
27282728
size_t NumOps = N->getNumOperands();
27292729
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2730-
size_t NumArgs = IsCacheHint ? 4 : 3; // src, dst, size, cache_hint
2730+
size_t CacheHintIdx = NumOps - 2;
27312731

27322732
SDLoc DL(N);
2733-
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
2734-
Ops.push_back(N->getOperand(0)); // Chain operand
2733+
SDValue DstBase, DstOffset, SrcBase, SrcOffset;
2734+
SelectADDR(N->getOperand(2), DstBase, DstOffset); // dst
2735+
SelectADDR(N->getOperand(3), SrcBase, SrcOffset); // src
27352736

2736-
bool IsShared32 =
2737-
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2738-
unsigned Opcode;
2737+
// BaseArgs: {dst, src, size}
2738+
SmallVector<SDValue, 8> Ops{DstBase, DstOffset, SrcBase, SrcOffset,
2739+
N->getOperand(4)};
2740+
2741+
// Push Mask operand, if available
2742+
if (HasMask)
2743+
Ops.push_back(N->getOperand(CacheHintIdx - 1));
2744+
2745+
// Push CacheHint operand, if available
27392746
if (IsCacheHint)
2740-
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
2741-
: NVPTX::CP_ASYNC_BULK_S2G_CH;
2742-
else
2743-
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
2744-
: NVPTX::CP_ASYNC_BULK_S2G;
2747+
Ops.push_back(N->getOperand(CacheHintIdx));
2748+
2749+
// Finally, the chain operand
2750+
Ops.push_back(N->getOperand(0));
2751+
2752+
unsigned Opcode = [&]() {
2753+
if (HasMask && IsCacheHint)
2754+
return NVPTX::CP_ASYNC_BULK_S2G_BM_CH;
2755+
if (HasMask)
2756+
return NVPTX::CP_ASYNC_BULK_S2G_BM;
2757+
if (IsCacheHint)
2758+
return NVPTX::CP_ASYNC_BULK_S2G_CH;
2759+
return NVPTX::CP_ASYNC_BULK_S2G;
2760+
}();
27452761
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
27462762
}
27472763

@@ -2928,7 +2944,10 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
29282944
SelectCpAsyncBulkG2S(N);
29292945
return true;
29302946
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
2931-
SelectCpAsyncBulkS2G(N);
2947+
SelectCpAsyncBulkS2GCommon(N);
2948+
return true;
2949+
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global_bytemask:
2950+
SelectCpAsyncBulkS2GCommon(N, /*HasMask=*/true);
29322951
return true;
29332952
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
29342953
SelectCpAsyncBulkPrefetchL2(N);

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9393
void SelectV2I64toI128(SDNode *N);
9494
void SelectI128toV2I64(SDNode *N);
9595
void SelectCpAsyncBulkG2S(SDNode *N);
96-
void SelectCpAsyncBulkS2G(SDNode *N);
96+
void SelectCpAsyncBulkS2GCommon(SDNode *N, bool HasMask = false);
9797
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
9898
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
9999
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -511,10 +511,11 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
511511
// TMA Async Bulk Copy Functions
512512
//------------------------------
513513

514-
class CpAsyncBulkStr<bit mc, bit ch> {
514+
class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
515515
// Shared to Global memory
516516
string S2G = "cp.async.bulk.global.shared::cta.bulk_group"
517-
# !if(ch, ".L2::cache_hint", "");
517+
# !if(ch, ".L2::cache_hint", "")
518+
# !if(mask, ".cp_mask", "");
518519

519520
// Global to Shared cluster memory
520521
string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes"
@@ -525,18 +526,26 @@ class CpAsyncBulkStr<bit mc, bit ch> {
525526
string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
526527
}
527528

528-
multiclass CP_ASYNC_BULK_S2G<NVPTXRegClass rc> {
529-
def NAME: NVPTXInst<(outs),
530-
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size),
531-
!strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>,
529+
def CP_ASYNC_BULK_S2G : NVPTXInst<(outs),
530+
(ins ADDR:$dst, ADDR:$src, Int32Regs:$size),
531+
CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;", []>,
532532
Requires<[hasPTX<80>, hasSM<90>]>;
533-
def NAME # _CH: NVPTXInst<(outs),
534-
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch),
535-
!strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>,
536-
Requires<[hasPTX<80>, hasSM<90>]>;
537-
}
538-
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G<Int64Regs>;
539-
defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G<Int32Regs>;
533+
534+
def CP_ASYNC_BULK_S2G_CH : NVPTXInst<(outs),
535+
(ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
536+
CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;", []>,
537+
Requires<[hasPTX<80>, hasSM<90>]>;
538+
539+
// Variants with bytemask
540+
def CP_ASYNC_BULK_S2G_BM : NVPTXInst<(outs),
541+
(ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int16Regs:$mask),
542+
CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;", []>,
543+
Requires<[hasPTX<86>, hasSM<100>]>;
544+
545+
def CP_ASYNC_BULK_S2G_BM_CH : NVPTXInst<(outs),
546+
(ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch),
547+
CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;", []>,
548+
Requires<[hasPTX<86>, hasSM<100>]>;
540549

541550
multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
542551
def NAME: NVPTXInst<(outs),
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
3+
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
4+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
5+
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %}
6+
7+
target triple = "nvptx64-nvidia-cuda"
8+
9+
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1), ptr addrspace(3), i32, i16, i64, i1)
10+
11+
define void @cp_async_bulk_s2g_bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 %ch) {
12+
; CHECK-PTX64-LABEL: cp_async_bulk_s2g_bytemask(
13+
; CHECK-PTX64: {
14+
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
15+
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
16+
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
17+
; CHECK-PTX64-EMPTY:
18+
; CHECK-PTX64-NEXT: // %bb.0:
19+
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
20+
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_bytemask_param_1];
21+
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_bytemask_param_2];
22+
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_s2g_bytemask_param_3];
23+
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_s2g_bytemask_param_4];
24+
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%rd2], %r1, %rd3, %rs1;
25+
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%rd2], %r1, %rs1;
26+
; CHECK-PTX64-NEXT: ret;
27+
;
28+
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g_bytemask(
29+
; CHECK-PTX-SHARED32: {
30+
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
31+
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>;
32+
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
33+
; CHECK-PTX-SHARED32-EMPTY:
34+
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
35+
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
36+
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_bytemask_param_1];
37+
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_s2g_bytemask_param_2];
38+
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_s2g_bytemask_param_3];
39+
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_bytemask_param_4];
40+
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%r1], %r2, %rd2, %rs1;
41+
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%r1], %r2, %rs1;
42+
; CHECK-PTX-SHARED32-NEXT: ret;
43+
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 %ch, i1 1)
44+
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i16 %mask, i64 0, i1 0)
45+
ret void
46+
}

0 commit comments

Comments
 (0)