Commit a953aa9
[NVPTX] Add TMA bulk tensor copy intrinsics
This patch adds NVVM intrinsics and NVPTX CodeGen for:

* cp.async.bulk.tensor.S2G.1D -> 5D variants, supporting both Tile and
  Im2Col modes. These intrinsics optionally support cache_hints, as
  indicated by the boolean flag argument.
* cp.async.bulk.tensor.G2S.1D -> 5D variants, with support for both Tile
  and Im2Col modes. The Im2Col variants take an extra set of offsets as
  parameters. These intrinsics optionally support multicast and
  cache_hints, as indicated by the boolean arguments at the end of the
  intrinsics.
* The backend looks through these flag arguments and lowers to the
  appropriate PTX instruction.
* Lit tests are added for all combinations of these intrinsics in
  cp-async-bulk-tensor-g2s/s2g.ll.
* The generated PTX is verified with a 12.3 ptxas executable.
* Added docs for these intrinsics in the NVPTXUsage.rst file.

Signed-off-by: Durgadoss R <[email protected]>
1 parent 93cda6d commit a953aa9

7 files changed: +1237, -0 lines changed
llvm/docs/NVPTXUsage.rst

Lines changed: 137 additions & 0 deletions
@@ -462,6 +462,143 @@ to left-shift the found bit into the most-significant bit position, otherwise
the result is the shift amount needed to right-shift the found bit into the
least-significant bit position. 0xffffffff is returned if no 1 bit is found.

TMA family of Intrinsics
------------------------

'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
in ``tile`` mode. In tile mode, the multi-dimensional layout of the
source tensor is preserved at the destination. The dimension of the
tensor data ranges from 1d to 5d, with the coordinates specified
by the ``i32 %d0 ... i32 %d4`` arguments.

* The last two arguments to these intrinsics are boolean flags
  indicating support for the cache_hint and/or multicast modifiers.
  These flag arguments must be compile-time constants. The backend
  looks through these flags and lowers the intrinsics appropriately.

* The Nth (last) argument (denoted by ``i1 %flag_ch``), when set, indicates
  a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.

* The [N-1]th argument (denoted by ``i1 %flag_mc``), when set, indicates
  the presence of a multicast mask (``i16 %mc``) and generates the PTX
  instruction with the ``.multicast::cluster`` modifier.
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
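
The following is a minimal sketch of a call site for the 2d tile variant;
the wrapper function and the origin of ``%tmap`` are illustrative
assumptions, not part of this patch:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3), ptr addrspace(3), ptr, i32, i32, i16, i64, i1, i1)

  ; Hypothetical wrapper: copy one 2D tile starting at coordinates (%x, %y)
  ; from global memory into shared::cluster memory. %dst and %bar point to
  ; shared memory; %tmap is assumed to be a valid tensor-map descriptor.
  define void @tile_2d_plain(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %x, i32 %y) {
    ; Both i1 flags are 0, so %mc and %ch are ignored (passed as 0) and the
    ; plain (no multicast, no cache-hint) PTX variant is generated.
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %x, i32 %y, i16 0, i64 0, i1 0, i1 0)
    ret void
  }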
'``llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
are unrolled into a single dimensional column at the destination. In this
mode, the tensor has to be at least three-dimensional. Along with the tensor
coordinates, im2col offsets are also specified (denoted by
``i16 %im2col0 ... i16 %im2col2``). The number of im2col offsets is two less
than the number of dimensions of the tensor operation. The last two arguments
to these intrinsics are boolean flags, with the same functionality as described
in the ``tile`` mode intrinsics above.
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
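
A sketch of the 3d im2col form, which carries one im2col offset
(dim - 2 = 1); the wrapper and the constant offset are illustrative
assumptions:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3), ptr addrspace(3), ptr, i32, i32, i32, i16, i16, i64, i1, i1)

  ; Hypothetical wrapper: %flag_mc is set to 1, so %mc is honored and the
  ; .multicast::cluster PTX variant is generated; the cache-hint flag is 0,
  ; so the i64 cache-hint operand is ignored (passed as 0).
  define void @im2col_3d_multicast(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc) {
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i16 %mc, i64 0, i1 1, i1 0)
    ret void
  }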
'``llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(..., i32 %d0, i32 %d1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
shared::cta to global memory (indicated by the ``s2g`` prefix)
in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d,
with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
  be a compile-time constant. When set, it indicates a valid
  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
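
A sketch of the 1d s2g variant with the cache-hint flag set; the wrapper
is an illustrative assumption:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3), ptr, i32, i64, i1)

  ; Hypothetical wrapper: %flag_ch = 1, so %ch is treated as a valid
  ; cache policy and the .L2::cache_hint PTX variant is generated.
  define void @s2g_1d_cache_hint(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) {
    call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
    ret void
  }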
'``llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
shared::cta to global memory (indicated by the ``s2g`` prefix)
in ``im2col`` mode. In this mode, the tensor has to be at least
three-dimensional. Unlike the ``g2s`` variants, there are no
im2col_offsets for these intrinsics. The last argument to these
intrinsics is a boolean flag, with the same functionality as
described in the ``s2g.tile`` mode intrinsics above.
For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
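
A sketch of the 3d s2g im2col form; note there are no im2col offsets, so
the argument list matches the tile-mode shape. The wrapper is an
illustrative assumption:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3), ptr, i32, i32, i32, i64, i1)

  ; Hypothetical wrapper: %flag_ch = 0, so the cache-hint operand is
  ; ignored (passed as 0) and the plain PTX variant is generated.
  define void @s2g_im2col_3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2) {
    call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i64 0, i1 0)
    ret void
  }
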
Other Intrinsics
----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 58 additions & 0 deletions
@@ -567,6 +567,52 @@ class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
                  [OpType, llvm_i32_ty, llvm_i32_ty]);
}

class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
  string Name = "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim # "d";

  bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
  int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
  list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
  list<LLVMType> ArgsTy = !listconcat(
    [llvm_shared_ptr_ty,  // dst_smem_ptr
     llvm_shared_ptr_ty,  // mbarrier_smem_ptr
     llvm_ptr_ty],        // tensormap_ptr
    TensorDimsTy,         // actual tensor dims
    Im2ColOffsetsTy,      // im2col offsets
    [llvm_i16_ty,         // cta_mask
     llvm_i64_ty,         // cache_hint
     llvm_i1_ty,          // Flag for cta_mask
     llvm_i1_ty]          // Flag for cache_hint
  );

  int TempFlagsStartIdx = !add(dim, 5);
  int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
  list<IntrinsicProperty> IntrProp = [IntrConvergent,
    WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
    NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>,
    ImmArg<ArgIndex<FlagsStartIdx>>,
    ImmArg<ArgIndex<!add(FlagsStartIdx, 1)>>];
}

class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
  string Name = "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # "d";

  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
  list<LLVMType> ArgsTy = !listconcat(
    [llvm_shared_ptr_ty,  // src_smem_ptr
     llvm_ptr_ty],        // tensormap_ptr
    TensorDimsTy,         // actual tensor dims
    [llvm_i64_ty,         // cache_hint
     llvm_i1_ty]          // Flag for cache_hint
  );
  int FlagsStartIdx = !add(dim, 3);
  list<IntrinsicProperty> IntrProp = [IntrConvergent,
    ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
    NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<FlagsStartIdx>>];
}
let TargetPrefix = "nvvm" in {
  def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4847,4 +4893,16 @@ def int_nvvm_setmaxnreg_dec_sync_aligned_u32
def int_nvvm_exit : ClangBuiltin<"__nvvm_exit">,
    Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoReturn]>;

// Intrinsics for Tensor Copy using TMA
// G2S -> From Global to Shared memory variants
// S2G -> From Shared to Global memory variants
foreach dim = [1, 2, 3, 4, 5] in {
  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
    foreach g2s = [CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, mode>] in
      def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
    foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
      def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
  }
}
} // let TargetPrefix = "nvvm"
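
As a quick sanity check of the flag-index arithmetic above (an
illustrative sketch, not part of the patch): for ``dim = 3`` in im2col
mode, the loop yields one im2col offset (dim - 2) and the two i1 flags
land at argument indices 9 and 10 (FlagsStartIdx = 3 + 5 + 1 = 9), so the
generated intrinsic should surface in IR with this signature:

.. code-block:: llvm

  ; Expected signature for the dim=3, mode="im2col" G2S instance:
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(
      ptr addrspace(3), ptr addrspace(3), ptr,  ; dst, mbarrier, tensormap
      i32, i32, i32,                            ; tensor dims d0..d2
      i16,                                      ; im2col offset (dim - 2 = 1)
      i16, i64,                                 ; cta_mask, cache_hint
      i1, i1)                                   ; flag_mc (idx 9), flag_ch (idx 10)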
