
Commit fd55ab4

[LLVM][NVPTX] Add codegen support for tcgen05.{ld, st} instructions
This commit adds codegen support for the tcgen05.{ld, st} instructions, with lit tests under tcgen05-ld.ll and tcgen05-st.ll and intrinsics documentation in NVPTXUsage.rst.
1 parent 0fe0968 commit fd55ab4

File tree

9 files changed: +2264 -0 lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 95 additions & 0 deletions
@@ -1271,6 +1271,101 @@ For more information on the decompression schemes, refer to the PTX ISA
For more information on the tcgen05.cp instruction, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-cp>`_.

'``llvm.nvvm.tcgen05.ld.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare <n x i32> @llvm.nvvm.tcgen05.ld.<shape>.<num>(ptr addrspace(6) %tmem_addr, i1 %pack)

  declare <n x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, i64 %offset, i1 %pack)

Overview:
"""""""""

This group of intrinsics asynchronously loads data from the Tensor Memory at the location specified
by the 32-bit address operand `tmem_addr` into the destination registers, collectively across all threads
of the warps.

All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address
of the collective load operation. Otherwise, the behavior is undefined.

The `shape` and `num` qualifiers together determine the total dimension of the data ('n') which
is loaded from the Tensor Memory. The `shape` qualifier indicates the base dimension of the data, and the
`num` qualifier indicates the repeat factor on the base dimension, resulting in the total dimension of
the data that is accessed.

Allowed values for `num` are `x1, x2, x4, x8, x16, x32, x64, x128`.

Allowed values for `shape` in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.

The allowed value for `shape` in the second intrinsic is `16x32bx2`.

The result of the intrinsic is a vector consisting of one or more 32-bit registers derived from `shape` and
`num` as shown below.

=========== ========================= ========== ==========
 num/shape   16x32bx2/16x64b/32x32b    16x128b    16x256b
=========== ========================= ========== ==========
 x1          1                         2          4
 x2          2                         4          8
 x4          4                         8          16
 x8          8                         16         32
 x16         16                        32         64
 x32         32                        64         128
 x64         64                        128        NA
 x128        128                       NA         NA
=========== ========================= ========== ==========

The last argument `i1 %pack` is a compile-time constant which, when set, indicates that adjacent columns
are packed into a single 32-bit element during the load.

For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
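As an illustration, a call to one of these intrinsics might look like the
following minimal sketch (the `16x64b` shape, the `x2` repeat factor, and
the function name are chosen arbitrarily for the example):

.. code-block:: llvm

  define <2 x i32> @ld_16x64b_x2_example(ptr addrspace(6) %tmem_addr) {
    ; n = 2 per the table above (16x64b shape, x2 repeat factor);
    ; %pack is 0, so adjacent columns are not packed.
    %val = call <2 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x2(ptr addrspace(6) %tmem_addr, i1 0)
    ret <2 x i32> %val
  }

  declare <2 x i32> @llvm.nvvm.tcgen05.ld.16x64b.x2(ptr addrspace(6), i1)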
'``llvm.nvvm.tcgen05.st.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.tcgen05.st.<shape>.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i1 %unpack)

  declare void @llvm.nvvm.tcgen05.st.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i64 %offset, i1 %unpack)

Overview:
"""""""""

This group of intrinsics asynchronously stores data from the source vector into the Tensor Memory at the location
specified by the 32-bit address operand `tmem_addr`, collectively across all threads of the warps.

All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address of the
collective store operation. Otherwise, the behavior is undefined.

The `shape` and `num` qualifiers together determine the total dimension of the data ('n') which
is stored to the Tensor Memory. The `shape` qualifier indicates the base dimension of the data, and the
`num` qualifier indicates the repeat factor on the base dimension, resulting in the total dimension of
the data that is accessed.

Allowed values for `num` are `x1, x2, x4, x8, x16, x32, x64, x128`.

Allowed values for `shape` in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.

The allowed value for `shape` in the second intrinsic is `16x32bx2`.

The `args` argument is a vector consisting of one or more 32-bit registers derived from `shape` and
`num` as listed in the table in the `tcgen05.ld` section.

Each shape supports an `unpack` mode, which allows a 32-bit element in the register to be unpacked into
two 16-bit elements that are stored in adjacent columns. The last argument `i1 %unpack` is a compile-time
constant which, when set to 1, enables `unpack` mode, and when set to 0, disables it.

For more information, refer to the
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
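A corresponding store sketch (again, the `32x32b` shape, the `x4` repeat
factor, and the function name are only illustrative):

.. code-block:: llvm

  define void @st_32x32b_x4_example(ptr addrspace(6) %tmem_addr, <4 x i32> %args) {
    ; n = 4 per the table in the tcgen05.ld section (32x32b shape, x4
    ; repeat factor); %unpack is 0, so no unpacking is performed.
    call void @llvm.nvvm.tcgen05.st.32x32b.x4(ptr addrspace(6) %tmem_addr, <4 x i32> %args, i1 0)
    ret void
  }

  declare void @llvm.nvvm.tcgen05.st.32x32b.x4(ptr addrspace(6), <4 x i32>, i1)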
Other Intrinsics
----------------

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 1 addition & 0 deletions
@@ -564,6 +564,7 @@ def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
def llvm_v128i32_ty : LLVMType<v128i32>; // 128 x i32
def llvm_v256i32_ty : LLVMType<v256i32>; // 256 x i32

def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 65 additions & 0 deletions
@@ -664,6 +664,35 @@ class CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, string mode, string op> {
                                 ImmArg<ArgIndex<FlagsStartIdx>>];
}

class NVVM_TCGEN05_LDST_NAME<string Op, string Shape, int Num> {
  string intr = "llvm.nvvm.tcgen05." # Op
                # "." # Shape
                # "." # "x" # !shl(1, Num);

  string record = !subst(".", "_",
                  !subst("llvm.", "int_", intr));
}

class NVVM_TCGEN05_LDST_ACCESS_SIZE<string Shape, int Num> {
  int shift = !cond(!eq(Shape, "16x128b"): 1,
                    !eq(Shape, "16x256b"): 2,
                    true : 0);

  int veclen = !shl(1, !add(Num, shift));

  int valid = !le(veclen, 128);
  LLVMType type = !cond(!eq(veclen, 1): llvm_i32_ty,
                        !eq(veclen, 2): llvm_v2i32_ty,
                        !eq(veclen, 4): llvm_v4i32_ty,
                        !eq(veclen, 8): llvm_v8i32_ty,
                        !eq(veclen, 16): llvm_v16i32_ty,
                        !eq(veclen, 32): llvm_v32i32_ty,
                        !eq(veclen, 64): llvm_v64i32_ty,
                        !eq(veclen, 128): llvm_v128i32_ty,
                        true : llvm_void_ty);
}

let TargetPrefix = "nvvm" in {
  def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
      DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -5186,4 +5215,40 @@ foreach cta_group = ["cg1", "cg2"] in {
  }
}

// Tcgen05 ld intrinsics
class NVVM_TCGEN05_LD<string Shape, int Num> :
  Intrinsic<[NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
            !listconcat([llvm_tmem_ptr_ty],
                        !if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
                        [llvm_i1_ty]),
            !listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
                        !if(!eq(Shape, "16x32bx2"),
                            [ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
                            [ImmArg<ArgIndex<1>>])),
            NVVM_TCGEN05_LDST_NAME<"ld", Shape, Num>.intr>;

// Tcgen05 st intrinsics
class NVVM_TCGEN05_ST<string Shape, int Num> :
  Intrinsic<[],
            !listconcat([llvm_tmem_ptr_ty],
                        !if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
                        [NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
                        [llvm_i1_ty]),
            !listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
                        !if(!eq(Shape, "16x32bx2"),
                            [ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>],
                            [ImmArg<ArgIndex<2>>])),
            NVVM_TCGEN05_LDST_NAME<"st", Shape, Num>.intr>;

foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
  foreach num = !range(0, 8) in {
    if NVVM_TCGEN05_LDST_ACCESS_SIZE<shape, num>.valid then {
      def NVVM_TCGEN05_LDST_NAME<"ld", shape, num>.record:
        NVVM_TCGEN05_LD<shape, num>;
      def NVVM_TCGEN05_LDST_NAME<"st", shape, num>.record:
        NVVM_TCGEN05_ST<shape, num>;
    }
  }
}

} // let TargetPrefix = "nvvm"
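To make the generator concrete: for `Num = 1`, the naming class yields the
`x2` suffix (`!shl(1, 1)`), and for the `16x256b` shape the access-size
class computes `veclen = !shl(1, 1 + 2) = 8`; combinations whose `veclen`
exceeds 128 fail the `valid` guard and are not generated, matching the NA
entries in the documentation table. As a sketch (not part of the diff),
the generated records correspond to IR declarations like:

  ; Shape = "16x256b", Num = 1 (suffix "x2", veclen = 8):
  declare <8 x i32> @llvm.nvvm.tcgen05.ld.16x256b.x2(ptr addrspace(6), i1)
  declare void @llvm.nvvm.tcgen05.st.16x256b.x2(ptr addrspace(6), <8 x i32>, i1)

  ; The 16x32bx2 shapes additionally take an i64 offset operand:
  declare <2 x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.x2(ptr addrspace(6), i64, i1)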
