Skip to content

[NVPTX] Add TMA Bulk Copy Intrinsics #138679

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -672,6 +672,7 @@ Syntax:
.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(..., i32 %size, i64 %ch, i1 %flag_ch, i16 %mask)

Overview:
"""""""""
Expand All @@ -680,10 +681,13 @@ The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic
corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX
instructions. These instructions initiate an asynchronous copy from
shared::cta to global memory. The 32-bit operand ``%size`` specifies
the amount of memory to be copied and it must be a multiple of 16.
the amount of memory to be copied (in bytes) and it must be a multiple
of 16. For the ``.bytemask`` variant, the 16-bit wide mask operand
specifies whether the i-th byte of each 16-byte wide chunk of source
data is copied to the destination.

* The last argument to these intrinsics is a boolean flag
indicating support for cache_hint. This flag argument must
* The ``i1 %flag_ch`` argument to these intrinsics is a boolean
flag indicating support for cache_hint. This flag argument must
be a compile-time constant. When set, it indicates a valid
cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
variant of the PTX instruction.
Expand Down
13 changes: 13 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -2112,6 +2112,19 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>]>;

// From Shared CTA to Global memory with bytemask
def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask
: DefaultAttrsIntrinsic<[],
[llvm_global_ptr_ty, // dst_gmem_ptr
llvm_shared_ptr_ty, // src_smem_ptr
llvm_i32_ty, // copy_size
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cache_hint
llvm_i16_ty], // byte_mask
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
ImmArg<ArgIndex<4>>]>;

// Intrinsics for Bulk Copy Prefetch L2
def int_nvvm_cp_async_bulk_prefetch_L2
: DefaultAttrsIntrinsicFlags<[],
Expand Down
28 changes: 0 additions & 28 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2685,31 +2685,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
// dst, src, size, cache_hint, cache_hint_flag
// NumOperands = {Chain, IID} + {Actual intrinsic args}
// = {2} + {5}
size_t NumOps = N->getNumOperands();
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
size_t NumArgs = IsCacheHint ? 4 : 3; // src, dst, size, cache_hint

SDLoc DL(N);
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
Ops.push_back(N->getOperand(0)); // Chain operand

bool IsShared32 =
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
unsigned Opcode;
if (IsCacheHint)
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
: NVPTX::CP_ASYNC_BULK_S2G_CH;
else
Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
: NVPTX::CP_ASYNC_BULK_S2G;
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
// {dst, mbar, src, size, multicast, cache_hint,
Expand Down Expand Up @@ -2892,9 +2867,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
SelectCpAsyncBulkG2S(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
SelectCpAsyncBulkS2G(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_prefetch_L2:
SelectCpAsyncBulkPrefetchL2(N);
return true;
Expand Down
1 change: 0 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectV2I64toI128(SDNode *N);
void SelectI128toV2I64(SDNode *N);
void SelectCpAsyncBulkG2S(SDNode *N);
void SelectCpAsyncBulkS2G(SDNode *N);
void SelectCpAsyncBulkPrefetchL2(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
Expand Down
32 changes: 19 additions & 13 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -511,10 +511,11 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
// TMA Async Bulk Copy Functions
//------------------------------

class CpAsyncBulkStr<bit mc, bit ch> {
class CpAsyncBulkStr<bit mc, bit ch, bit mask = 0> {
// Shared to Global memory
string S2G = "cp.async.bulk.global.shared::cta.bulk_group"
# !if(ch, ".L2::cache_hint", "");
# !if(ch, ".L2::cache_hint", "")
# !if(mask, ".cp_mask", "");

// Global to Shared cluster memory
string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes"
Expand All @@ -525,18 +526,23 @@ class CpAsyncBulkStr<bit mc, bit ch> {
string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
}

multiclass CP_ASYNC_BULK_S2G<NVPTXRegClass rc> {
def NAME: NVPTXInst<(outs),
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size),
!strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>,
Requires<[hasPTX<80>, hasSM<90>]>;
def NAME # _CH: NVPTXInst<(outs),
(ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch),
!strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>,
Requires<[hasPTX<80>, hasSM<90>]>;
multiclass CP_ASYNC_BULK_S2G_INTR<bit has_ch> {
def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch),
!if(has_ch,
CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;",
CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"),
[(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>,
Requires<[hasPTX<80>, hasSM<90>]>;

def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask),
!if(has_ch,
CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;",
CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"),
[(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>,
Requires<[hasPTX<86>, hasSM<100>]>;
}
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G<Int64Regs>;
defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G<Int32Regs>;
defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>;
defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>;

multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
def NAME: NVPTXInst<(outs),
Expand Down
46 changes: 46 additions & 0 deletions llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %}
; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %}

target triple = "nvptx64-nvidia-cuda"

declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1), ptr addrspace(3), i32, i64, i1, i16)

define void @cp_async_bulk_s2g_bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i16 %mask) {
; CHECK-PTX64-LABEL: cp_async_bulk_s2g_bytemask(
; CHECK-PTX64: {
; CHECK-PTX64-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX64-NEXT: .reg .b32 %r<2>;
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT: // %bb.0:
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_1];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_2];
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_bytemask_param_3];
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4];
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%rd2], %r1, %rd3, %rs1;
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%rd2], %r1, %rs1;
; CHECK-PTX64-NEXT: ret;
;
; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g_bytemask(
; CHECK-PTX-SHARED32: {
; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>;
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>;
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
; CHECK-PTX-SHARED32-EMPTY:
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_bytemask_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_bytemask_param_2];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_3];
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%r1], %r2, %rd2, %rs1;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%r1], %r2, %rs1;
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1, i16 %mask)
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0, i16 %mask)
ret void
}
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0];
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1];
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2];
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3];
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3;
; CHECK-PTX64-NEXT: ret;
;
Expand All @@ -80,11 +80,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0];
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1];
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3];
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2;
; CHECK-PTX-SHARED32-NEXT: ret;
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0)
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1)
ret void
}
Expand Down