Commit f7813ce
[NVPTX] Add TMA bulk tensor copy intrinsics
This patch adds NVVM intrinsics and NVPTX CodeGen for:

* cp.async.bulk.tensor.S2G.1D -> 5D variants, supporting both Tile and
  Im2Col modes. These intrinsics optionally support cache_hints, as
  indicated by the boolean flag argument.
* cp.async.bulk.tensor.G2S.1D -> 5D variants, with support for both Tile
  and Im2Col modes. The Im2Col variants have an extra set of offsets as
  parameters. These intrinsics optionally support multicast and
  cache_hints, as indicated by the boolean arguments at the end of the
  intrinsics.
* The backend looks through these flag arguments and lowers to the
  appropriate PTX instruction.
* Lit tests are added for all combinations of these intrinsics in
  cp-async-bulk-tensor-g2s/s2g.ll.
* The generated PTX is verified with a 12.3 ptxas executable.
* Added docs for these intrinsics in the NVPTXUsage.rst file.

Signed-off-by: Durgadoss R <[email protected]>
1 parent 7ceb19e commit f7813ce

File tree

7 files changed
+1229 -0 lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 137 additions & 0 deletions
@@ -432,6 +432,143 @@ to left-shift the found bit into the most-significant bit position, otherwise
the result is the shift amount needed to right-shift the found bit into the
least-significant bit position. 0xffffffff is returned if no 1 bit is found.

TMA family of Intrinsics
------------------------

'``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
in ``tile`` mode. In tile mode, the multi-dimensional layout of the
source tensor is preserved at the destination. The dimension of the
tensor data ranges from 1d to 5d, with the coordinates specified
by the ``i32 %d0 ... i32 %d4`` arguments.
* The last two arguments to these intrinsics are boolean flags
  indicating support for the cache_hint and/or multicast modifiers.
  These flag arguments must be compile-time constants. The backend
  looks through these flags and lowers the intrinsics appropriately.

* The Nth argument (denoted by ``i1 %flag_ch``), when set, indicates
  a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.

* The [N-1]th argument (denoted by ``i1 %flag_mc``), when set, indicates
  the presence of a multicast mask (``i16 %mc``) and generates the PTX
  instruction with the ``.multicast::cluster`` modifier.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
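For illustration, a minimal call of the 2d variant might look as follows;
the function name and coordinate values are placeholders, and since both
flags are 0, the ``%mc`` and ``%ch`` operands are unused and passed as 0:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3), ptr addrspace(3), ptr, i32, i32, i16, i64, i1, i1)

  define void @copy_tile_2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap) {
    ; Plain tile-mode copy: no multicast, no cache_hint.
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 0, i32 8, i16 0, i64 0, i1 0, i1 0)
    ret void
  }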
'``llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.[3-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
global memory to shared::cluster memory (indicated by the ``g2s`` prefix)
in ``im2col`` mode. In im2col mode, some dimensions of the source tensor
are unrolled into a single dimensional column at the destination. In this
mode, the tensor has to be at least three-dimensional. Along with the tensor
coordinates, im2col offsets are also specified (denoted by
``i16 %im2col0 ... i16 %im2col2``). The number of im2col offsets is two less
than the number of dimensions of the tensor operation. The last two arguments
to these intrinsics are boolean flags, with the same functionality as described
in the ``tile`` mode intrinsics above.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
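For illustration, a minimal call of the 3d variant might look as follows;
the function name, coordinates, and offset value are placeholders. A 3d
tensor takes 3 - 2 = 1 im2col offset, and with both flags 0 the ``%mc``
and ``%ch`` operands are unused:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3), ptr addrspace(3), ptr, i32, i32, i32, i16, i16, i64, i1, i1)

  define void @copy_im2col_3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap) {
    ; im2col-mode copy with a single i16 offset and no optional modifiers.
    call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tmap, i32 0, i32 0, i32 16, i16 4, i16 0, i64 0, i1 0, i1 0)
    ret void
  }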
'``llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(..., i32 %d0, i32 %d1, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.tile.[1-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[1-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
shared::cta to global memory (indicated by the ``s2g`` prefix)
in ``tile`` mode. The dimension of the tensor data ranges from 1d to 5d,
with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments.

* The last argument to these intrinsics is a boolean flag
  indicating support for cache_hint. This flag argument must
  be a compile-time constant. When set, it indicates a valid
  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
  variant of the PTX instruction.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
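For illustration, a minimal call of the 1d variant with a cache hint might
look as follows; the function name is a placeholder, and since ``%flag_ch``
is 1, ``%ch`` must hold a valid cache policy:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3), ptr, i32, i64, i1)

  define void @copy_s2g_1d(ptr addrspace(3) %src, ptr %tmap, i64 %ch) {
    ; flag_ch = 1 selects the .L2::cache_hint variant of the PTX instruction.
    call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 0, i64 %ch, i1 1)
    ret void
  }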
'``llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i64 %ch, i1 %flag_ch)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
Overview:
"""""""""

The '``@llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.[3-5]d``' intrinsics
correspond to the ``cp.async.bulk.tensor.[3-5]d.*`` set of PTX instructions.
These instructions initiate an asynchronous copy of tensor data from
shared::cta to global memory (indicated by the ``s2g`` prefix)
in ``im2col`` mode. In this mode, the tensor has to be at least
three-dimensional. Unlike the ``g2s`` variants, there are no
im2col_offsets for these intrinsics. The last argument to these
intrinsics is a boolean flag, with the same functionality as
described in the ``s2g.tile`` mode intrinsics above.

For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
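For illustration, a minimal call of the 3d variant might look as follows;
the function name and coordinates are placeholders. Note that, unlike the
``g2s`` im2col intrinsics, no offset arguments are passed:

.. code-block:: llvm

  declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3), ptr, i32, i32, i32, i64, i1)

  define void @copy_s2g_im2col_3d(ptr addrspace(3) %src, ptr %tmap) {
    ; flag_ch = 0, so the i64 cache_hint operand is unused and passed as 0.
    call void @llvm.nvvm.cp.async.bulk.tensor.s2g.im2col.3d(ptr addrspace(3) %src, ptr %tmap, i32 0, i32 0, i32 0, i64 0, i1 0)
    ret void
  }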
Other Intrinsics
----------------

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 58 additions & 0 deletions
@@ -567,6 +567,52 @@ class SHFL_INFO<bit sync, string mode, string type, bit return_pred> {
                     [OpType, llvm_i32_ty, llvm_i32_ty]);
}

class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
  string Name = "int_nvvm_cp_async_bulk_tensor_g2s_" # mode # "_" # dim # "d";

  bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
  int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
  list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
  list<LLVMType> ArgsTy = !listconcat(
    [llvm_shared_ptr_ty,  // dst_smem_ptr
     llvm_shared_ptr_ty,  // mbarrier_smem_ptr
     llvm_ptr_ty],        // tensormap_ptr
    TensorDimsTy,         // actual tensor dims
    Im2ColOffsetsTy,      // im2col offsets
    [llvm_i16_ty,         // cta_mask
     llvm_i64_ty,         // cache_hint
     llvm_i1_ty,          // Flag for cta_mask
     llvm_i1_ty]          // Flag for cache_hint
  );

  int TempFlagsStartIdx = !add(dim, 5);
  int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
  list<IntrinsicProperty> IntrProp = [IntrConvergent,
    WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
    NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>, NoCapture<ArgIndex<2>>,
    ImmArg<ArgIndex<FlagsStartIdx>>,
    ImmArg<ArgIndex<!add(FlagsStartIdx, 1)>>];
}
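// Worked example of the index arithmetic above: for g2s.im2col.5d the
// argument list is {dst, mbar, tmap, d0-d4, im2col0-im2col2, cta_mask,
// cache_hint, flag_mc, flag_ch}, i.e. 15 arguments in total, so
// FlagsStartIdx = 5 + 5 + 3 = 13 points at flag_mc and index 14 at flag_ch.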

class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
  string Name = "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # "d";

  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
  list<LLVMType> ArgsTy = !listconcat(
    [llvm_shared_ptr_ty,  // src_smem_ptr
     llvm_ptr_ty],        // tensormap_ptr
    TensorDimsTy,         // actual tensor dims
    [llvm_i64_ty,         // cache_hint
     llvm_i1_ty]          // Flag for cache_hint
  );

  int FlagsStartIdx = !add(dim, 3);
  list<IntrinsicProperty> IntrProp = [IntrConvergent,
    ReadOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
    NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
    ImmArg<ArgIndex<FlagsStartIdx>>];
}
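// Worked example: for s2g.tile.1d the argument list is {src, tmap, d0,
// cache_hint, flag_ch}, so FlagsStartIdx = 1 + 3 = 4 points at flag_ch.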

let TargetPrefix = "nvvm" in {
  def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4847,4 +4893,16 @@ def int_nvvm_setmaxnreg_dec_sync_aligned_u32
def int_nvvm_exit : ClangBuiltin<"__nvvm_exit">,
    Intrinsic<[], [], [IntrConvergent, IntrInaccessibleMemOnly, IntrNoReturn]>;

// Intrinsics for Tensor Copy using TMA
// G2S -> From Global to Shared memory variants
// S2G -> From Shared to Global memory variants
foreach dim = [1, 2, 3, 4, 5] in {
  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
    foreach g2s = [CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, mode>] in
      def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
    foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
      def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
  }
}
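// Taken together, these loops instantiate 16 intrinsics: tile mode for
// dims 1-5 plus im2col mode for dims 3-5, for each of the g2s and s2g
// directions.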
} // let TargetPrefix = "nvvm"
