Commit da7d1a7
[LLVM][NVPTX] Add codegen support for tcgen05.{ld, st} instructions
This commit adds support for the tcgen05.{ld, st} instructions, with lit tests under tcgen05-ld.ll and tcgen05-st.ll and intrinsics documentation in NVPTXUsage.rst.

1 parent d89c23b

File tree: 9 files changed, +2267 −0 lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 95 additions & 0 deletions
@@ -1175,6 +1175,101 @@ For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tensorcore-5th-generation-instructions-tcgen05-fence>`_.


'``llvm.nvvm.tcgen05.ld.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare <n x i32> @llvm.nvvm.tcgen05.ld.<shape>.<num>(ptr addrspace(6) %tmem_addr, i1 %pack)

  declare <n x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, i64 %offset, i1 %pack)

Overview:
"""""""""

This group of intrinsics asynchronously loads data from the Tensor Memory at the location specified
by the 32-bit address operand `tmem_addr` into the destination registers, collectively across all
threads of the warp.

All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address
of the collective load operation. Otherwise, the behavior is undefined.

The `shape` qualifier and the `num` qualifier together determine the total dimension of the data ('n')
which is loaded from the Tensor Memory. The `shape` qualifier indicates the base dimension of data.
The `num` qualifier indicates the repeat factor on the base dimension, resulting in the total
dimension of the data that is accessed.

Allowed values for `num` are `x1, x2, x4, x8, x16, x32, x64, x128`.

Allowed values for `shape` in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.

The allowed value for `shape` in the second intrinsic is `16x32bx2`.

The result of the intrinsic is a vector consisting of one or more 32-bit registers derived from
`shape` and `num` as shown below.

=========== ========================= ========== ==========
 num/shape   16x32bx2/16x64b/32x32b    16x128b    16x256b
=========== ========================= ========== ==========
 x1                    1                   2          4
 x2                    2                   4          8
 x4                    4                   8          16
 x8                    8                   16         32
 x16                   16                  32         64
 x32                   32                  64         128
 x64                   64                  128        NA
 x128                  128                 NA         NA
=========== ========================= ========== ==========

The last argument `i1 %pack` is a compile-time constant which, when set, indicates that adjacent
columns are packed into a single 32-bit element during the load.

For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
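
For illustration, a minimal sketch of how such a call could look in IR (the value names and the
origin of `%tmem_addr` are hypothetical, and the offset value in the `16x32bx2` form is arbitrary):

.. code-block:: llvm

  ; Shape 16x64b with num x4 returns n = 4 elements (see the table above);
  ; the final i1 0 disables packing.
  %data = call <4 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x4(ptr addrspace(6) %tmem_addr, i1 0)

  ; The 16x32bx2 form carries an extra immediate offset operand.
  %halves = call <2 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x2(ptr addrspace(6) %tmem_addr, i64 2, i1 0)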

'``llvm.nvvm.tcgen05.st.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.tcgen05.st.<shape>.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i1 %unpack)

  declare void @llvm.nvvm.tcgen05.st.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i64 %offset, i1 %unpack)

Overview:
"""""""""

This group of intrinsics asynchronously stores data from the source vector into the Tensor Memory at
the location specified by the 32-bit address operand `tmem_addr`, collectively across all threads of
the warp.

All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address
of the collective store operation. Otherwise, the behavior is undefined.

The `shape` qualifier and the `num` qualifier together determine the total dimension of the data ('n')
which is stored to the Tensor Memory. The `shape` qualifier indicates the base dimension of data.
The `num` qualifier indicates the repeat factor on the base dimension, resulting in the total
dimension of the data that is accessed.

Allowed values for `num` are `x1, x2, x4, x8, x16, x32, x64, x128`.

Allowed values for `shape` in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.

The allowed value for `shape` in the second intrinsic is `16x32bx2`.

The `args` argument is a vector consisting of one or more 32-bit registers derived from `shape` and
`num`, as listed in the table in the `tcgen05.ld` section.

Each shape supports an `unpack` mode, which allows a 32-bit element in a register to be unpacked into
two 16-bit elements and stored in adjacent columns. The last argument `i1 %unpack` is a compile-time
constant which, when set to 1, enables this mode; setting it to 0 disables it.

For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
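
A corresponding sketch for a store (again with hypothetical value names): storing two 32-bit
elements with the `32x32b` shape and the `x2` num, with unpacking disabled:

.. code-block:: llvm

  ; Shape 32x32b with num x2 takes a <2 x i32> source vector;
  ; the final i1 0 disables unpacking.
  call void @llvm.nvvm.tcgen05.st.32x32b.x2(ptr addrspace(6) %tmem_addr, <2 x i32> %vals, i1 0)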

Other Intrinsics
----------------

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 1 addition & 0 deletions
@@ -564,6 +564,7 @@ def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
def llvm_v128i32_ty : LLVMType<v128i32>; // 128 x i32
def llvm_v256i32_ty : LLVMType<v256i32>; // 256 x i32

def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 65 additions & 0 deletions
@@ -656,6 +656,35 @@ class CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, string mode, string op> {
                                 ImmArg<ArgIndex<FlagsStartIdx>>];
}

class NVVM_TCGEN05_LDST_NAME<string Op, string Shape, int Num> {
  string intr = "llvm.nvvm.tcgen05." # Op
                # "." # Shape
                # "." # "x" # !shl(1, Num);

  string record = !subst(".", "_",
                  !subst("llvm.", "int_", intr));
}

class NVVM_TCGEN05_LDST_ACCESS_SIZE<string Shape, int Num> {
  int shift = !cond(!eq(Shape, "16x128b"): 1,
                    !eq(Shape, "16x256b"): 2,
                    true : 0);

  int veclen = !shl(1, !add(Num, shift));

  int valid = !le(veclen, 128);
  LLVMType type = !cond(!eq(veclen, 1): llvm_i32_ty,
                        !eq(veclen, 2): llvm_v2i32_ty,
                        !eq(veclen, 4): llvm_v4i32_ty,
                        !eq(veclen, 8): llvm_v8i32_ty,
                        !eq(veclen, 16): llvm_v16i32_ty,
                        !eq(veclen, 32): llvm_v32i32_ty,
                        !eq(veclen, 64): llvm_v64i32_ty,
                        !eq(veclen, 128): llvm_v128i32_ty,
                        true : llvm_void_ty);
}
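
As a sanity check on the size computation above (my reading of the `!shl`/`!cond` arithmetic, not
text from the patch): shape `16x256b` has `shift` 2, so `num` x4 (`Num` = 2) yields
`veclen = 1 << (2 + 2) = 16` and selects `llvm_v16i32_ty`, matching the value 16 in the 16x256b/x4
cell of the documentation table:

.. code-block:: llvm

  ; Assumed resulting declaration for shape 16x256b, num x4:
  declare <16 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x4(ptr addrspace(6), i1)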

let TargetPrefix = "nvvm" in {
  def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -5138,4 +5167,40 @@ def int_nvvm_tcgen05_fence_before_thread_sync : Intrinsic<[], [],
def int_nvvm_tcgen05_fence_after_thread_sync : Intrinsic<[], [],
  [IntrNoMem, IntrHasSideEffects]>;

// Tcgen05 ld
class NVVM_TCGEN05_LD<string Shape, int Num> :
  Intrinsic<[NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
            !listconcat([llvm_tmem_ptr_ty],
                        !if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
                        [llvm_i1_ty]),
            !listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
                        !if(!eq(Shape, "16x32bx2"),
                            [ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
                            [ImmArg<ArgIndex<1>>])),
            NVVM_TCGEN05_LDST_NAME<"ld", Shape, Num>.intr>;

// Tcgen05 st
class NVVM_TCGEN05_ST<string Shape, int Num> :
  Intrinsic<[],
            !listconcat([llvm_tmem_ptr_ty],
                        !if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
                        [NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
                        [llvm_i1_ty]),
            !listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
                        !if(!eq(Shape, "16x32bx2"),
                            [ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>],
                            [ImmArg<ArgIndex<2>>])),
            NVVM_TCGEN05_LDST_NAME<"st", Shape, Num>.intr>;

foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
  foreach num = !range(0, 8) in {
    if NVVM_TCGEN05_LDST_ACCESS_SIZE<shape, num>.valid then {
      def NVVM_TCGEN05_LDST_NAME<"ld", shape, num>.record:
        NVVM_TCGEN05_LD<shape, num>;
      def NVVM_TCGEN05_LDST_NAME<"st", shape, num>.record:
        NVVM_TCGEN05_ST<shape, num>;
    }
  }
}

} // let TargetPrefix = "nvvm"
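
To illustrate what the `foreach` loop expands to (a sketch of the expected records, not an
exhaustive list): the `valid` guard skips combinations whose vector length would exceed 128, which
is why the documentation table above shows NA for, e.g., 16x256b with x64.

.. code-block:: llvm

  ; Record int_nvvm_tcgen05_ld_16x128b_x1 should correspond to this declaration
  ; (16x128b has shift = 1, so num x1 gives veclen = 2):
  declare <2 x i32> @llvm.nvvm.tcgen05.ld.16x128b.x1(ptr addrspace(6), i1)
  ; 16x256b with x64 would require veclen = 256 > 128, so no such record is emitted.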
