[NVPTX] Add TMA bulk tensor prefetch intrinsics #115527

Merged (1 commit, Nov 10, 2024)
64 changes: 64 additions & 0 deletions llvm/docs/NVPTXUsage.rst
@@ -599,6 +599,70 @@ described in the ``s2g.tile`` mode intrinsics above.
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.

'``llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(..., i32 %d0, i32 %d1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' intrinsics
correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
of PTX instructions. These instructions initiate an asynchronous prefetch
of tensor data from global memory to the L2 cache. In tile mode, the
multi-dimensional layout of the source tensor is preserved at the destination.
The dimension of the tensor data ranges from 1d to 5d with the coordinates
specified by the ``i32 %d0 ... i32 %d4`` arguments.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
  be a compile-time constant. When set, it indicates a valid
  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
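
For example, a minimal sketch of a 2d tile-mode prefetch (illustrative only;
``%tmap``, ``%x``, ``%y`` and ``%hint`` are assumed values, not part of the
intrinsic definition):

.. code-block:: llvm

  ; Prefetch a 2d tile at coordinates (%x, %y). The i1 flag is 1, so %hint
  ; is treated as a valid cache_hint and the .L2::cache_hint variant of the
  ; PTX instruction is generated.
  call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %x, i32 %y, i64 %hint, i1 1)

  ; The same prefetch without a cache hint; the i64 operand is ignored.
  call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %x, i32 %y, i64 0, i1 0)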

'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)

Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[3-5]d``' intrinsics
correspond to the ``cp.async.bulk.prefetch.tensor.[3-5]d.L2.global*`` set
of PTX instructions. These instructions initiate an asynchronous prefetch
of tensor data from global memory to the L2 cache. In im2col mode, some
dimensions of the source tensor are unrolled into a single dimensional
column at the destination. In this mode, the tensor has to be at least
three-dimensional. Along with the tensor coordinates, im2col offsets are
also specified (denoted by ``i16 %im2col0 ... i16 %im2col2``). The number
of im2col offsets is two less than the number of dimensions of the tensor
operation. The last argument to these intrinsics is a boolean flag, with
the same functionality as described in the ``tile`` mode intrinsics above.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
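
For example, a minimal sketch of a 3d im2col-mode prefetch, which takes
3 - 2 = 1 im2col offset (illustrative only; ``%tmap``, the coordinates and
``%off0`` are assumed values):

.. code-block:: llvm

  ; Prefetch a 3d tensor region in im2col mode without a cache hint
  ; (the i1 flag is 0, so the i64 operand is ignored).
  call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %off0, i64 0, i1 0)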

Other Intrinsics
----------------

24 changes: 24 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -613,6 +613,28 @@ class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
ImmArg<ArgIndex<FlagsStartIdx>>];
}

class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
string Name = "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d";

bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
list<LLVMType> ArgsTy = !listconcat(
[llvm_ptr_ty], // tensormap_ptr
TensorDimsTy, // actual tensor dims
Im2ColOffsetsTy, // im2col offsets
[llvm_i64_ty, // cache_hint
llvm_i1_ty] // Flag for cache_hint
);

int TempFlagsStartIdx = !add(dim, 2);
int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
list<IntrinsicProperty> IntrProp = [IntrConvergent,
ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
ImmArg<ArgIndex<FlagsStartIdx>>];
}

let TargetPrefix = "nvvm" in {
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4902,6 +4924,8 @@ foreach dim = [1, 2, 3, 4, 5] in {
def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
foreach prefetch = [CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>] in
def prefetch.Name : DefaultAttrsIntrinsic<[], prefetch.ArgsTy, prefetch.IntrProp>;
}
}

104 changes: 90 additions & 14 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -4175,6 +4175,10 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, ); \
}()

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode) \
(IsCacheHint ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
: NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)

static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
bool IsCacheHint, bool IsIm2Col) {
if (IsIm2Col) {
@@ -4242,6 +4246,55 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
}
}

static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
bool IsIm2Col) {
if (IsIm2Col) {
switch (Dim) {
case 3:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL);
case 4:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL);
case 5:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL);
default:
llvm_unreachable("Invalid Dimension in im2col mode for "
"GetCpAsyncBulkTensorPrefetchOpcode.");
}
} else {
switch (Dim) {
case 1:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE);
case 2:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE);
case 3:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE);
case 4:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE);
case 5:
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE);
default:
llvm_unreachable("Invalid Dimension in tile mode for "
"GetCpAsyncBulkTensorPrefetchOpcode.");
}
}
}

static size_t GetDimsFromIntrinsic(unsigned IID) {
switch (IID) {
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
return 3;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
return 4;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
return 5;
default:
llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
}
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
bool IsIm2Col) {
// We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
@@ -4250,21 +4303,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
// multicast_flag, cache_hint_flag}
// NumOperands = {Chain, IID} + {Actual intrinsic args}
// = {2} + {7 + dims + im2col_offsets}
auto getDimsFromIntrinsic = [](unsigned IID) {
switch (IID) {
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
return 3;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
return 4;
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
return 5;
default:
llvm_unreachable(
"Invalid im2col intrinsic in SelectCpAsyncBulkTensorG2SCommon.");
}
};
size_t NumOps = N->getNumOperands();
size_t NumDims = IsIm2Col ? getDimsFromIntrinsic(N->getConstantOperandVal(1))
size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
: (NumOps - 9);
// Offsets is always 'NumDims - 2' and only for im2col mode
size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
@@ -4316,6 +4356,30 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
bool IsIm2Col) {
// We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
// {src, dims{d0...dN}, im2col_offsets{dims-2}
// cache_hint, cache_hint_flag}
// NumOperands = {Chain, IID} + {Actual intrinsic args}
// = {2} + {3 + dims + im2col_offsets}
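// For example (illustrative): the im2col.5d variant has
// NumOps = 2 + (1 src + 5 dims + 3 offsets + 1 cache_hint + 1 flag) = 13.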
size_t NumOps = N->getNumOperands();
size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
: (NumOps - 5);
// Offsets is always 'NumDims - 2' and only for im2col mode
size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);

SDLoc DL(N);
SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
Ops.push_back(N->getOperand(0)); // Chain operand

unsigned Opcode =
GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
unsigned IID = N->getConstantOperandVal(1);
switch (IID) {
@@ -4345,5 +4409,17 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
SelectCpAsyncBulkTensorPrefetchCommon(N);
return true;
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
return true;
}
}
1 change: 1 addition & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -94,6 +94,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
void SelectI128toV2I64(SDNode *N);
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
46 changes: 46 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -605,6 +605,52 @@ foreach dim = [1, 2, 3, 4, 5] in {
}
}

// TMA Prefetch from Global memory to L2 cache
class PREFETCH_STRINGS<int dim, string mode, bit ch> {
string prefix = "cp.async.bulk.prefetch.tensor";
string dir = "L2.global";
string inst_name = prefix
# "." # dim # "d"
# "." # dir
# "." # mode
# !if(ch, ".L2::cache_hint", "");
string intr_name = "CP_ASYNC_BULK_TENSOR_PREFETCH_"
# dim # "D"
# !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
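// For example (illustrative), PREFETCH_STRINGS<2, "tile", 1> yields:
//   inst_name = "cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint"
//   intr_name = "CP_ASYNC_BULK_TENSOR_PREFETCH_2D_TILE"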
}

multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i));
defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
defvar asm_str_default = " [$tmap, {{" # dims_str # "}}]";

defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
defvar im2col_dag = !if(!eq(mode, "im2col"),
!dag(ins, !listsplat(Int16Regs, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
(ins));
defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
defvar im2col_asm_str = ", {{" # im2col_str # "}}";

defvar asm_str = !if(!eq(mode, "im2col"),
!strconcat(asm_str_default, im2col_asm_str), asm_str_default);

def "": NVPTXInst<(outs),
!con((ins Int64Regs:$tmap), dims_dag, im2col_dag),
!strconcat(PREFETCH_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
Requires<[hasPTX<80>, hasSM<90>]>;
def _CH: NVPTXInst<(outs),
!con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)),
!strconcat(PREFETCH_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
Requires<[hasPTX<80>, hasSM<90>]>;
}

foreach dim = [1, 2, 3, 4, 5] in {
foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
defm PREFETCH_STRINGS<dim, mode, 0>.intr_name :
CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>;
}
}

//-----------------------------------
// MBarrier Functions
//-----------------------------------