Skip to content

Commit f5ee401

Browse files
[LLVM][NVPTX] Add codegen support for tcgen05.{ld, st} instructions (#126740)
This commit adds support for tcgen05.{ld, st} instructions with lit tests under tcgen05-ld.ll and tcgen05-st.ll and intrinsics documentation under NVPTXUsage.rst
1 parent db48d49 commit f5ee401

File tree

9 files changed

+2264
-0
lines changed

9 files changed

+2264
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1288,6 +1288,101 @@ For more information on the decompression schemes, refer to the PTX ISA
12881288
For more information on the tcgen05.cp instruction, refer to the PTX ISA
12891289
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-cp>`_.
12901290

1291+
'``llvm.nvvm.tcgen05.ld.*``'
1292+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1293+
1294+
Syntax:
1295+
"""""""
1296+
1297+
.. code-block:: llvm
1298+
1299+
declare <n x i32> @llvm.nvvm.tcgen05.ld.<shape>.<num>(ptr addrspace(6) %tmem_addr, i1 %pack)
1300+
1301+
declare <n x i32> @llvm.nvvm.tcgen05.ld.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, i64 %offset, i1 %pack)
1302+
1303+
Overview:
1304+
"""""""""
1305+
1306+
This group of intrinsics asynchronously loads data from the Tensor Memory at the location specified
1307+
by the 32-bit address operand `tmem_addr` into the destination registers, collectively across all threads
1308+
of the warps.
1309+
1310+
All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address
1311+
of the collective load operation. Otherwise, the behavior is undefined.
1312+
1313+
The `shape` qualifier and the `num` qualifier together determine the total dimension of the data ('n') which
1314+
is loaded from the Tensor Memory. The `shape` qualifier indicates the base dimension of data. The `num` qualifier
1315+
indicates the repeat factor on the base dimension resulting in the total dimension of the data that is accessed.
1316+
1317+
Allowed values for the 'num' are `x1, x2, x4, x8, x16, x32, x64, x128`.
1318+
1319+
Allowed values for the 'shape' in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.
1320+
1321+
Allowed value for the 'shape' in the second intrinsic is `16x32bx2`.
1322+
1323+
The result of the intrinsic is a vector consisting of one or more 32-bit registers derived from `shape` and
1324+
`num` as shown below.
1325+
1326+
=========== ========================= ========== ==========
1327+
num/shape 16x32bx2/16x64b/32x32b 16x128b 16x256b
1328+
=========== ========================= ========== ==========
1329+
x1 1 2 4
1330+
x2 2 4 8
1331+
x4 4 8 16
1332+
x8 8 16 32
1333+
x16 16 32 64
1334+
x32 32 64 128
1335+
x64 64 128 NA
1336+
x128 128 NA NA
1337+
=========== ========================= ========== ==========
1338+
1339+
The last argument `i1 %pack` is a compile-time constant which, when set, indicates that the adjacent columns are packed into a single 32-bit element during the load.
1340+
1341+
For more information, refer to the
1342+
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-ld>`__.
1343+
1344+
1345+
'``llvm.nvvm.tcgen05.st.*``'
1346+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
1347+
1348+
Syntax:
1349+
"""""""
1350+
1351+
.. code-block:: llvm
1352+
1353+
declare void @llvm.nvvm.tcgen05.st.<shape>.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i1 %unpack)
1354+
1355+
declare void @llvm.nvvm.tcgen05.st.16x32bx2.<num>(ptr addrspace(6) %tmem_addr, <n x i32> %args, i64 %offset, i1 %unpack)
1356+
1357+
Overview:
1358+
"""""""""
1359+
1360+
This group of intrinsics asynchronously stores data from the source vector into the Tensor Memory at the location
1361+
specified by the 32-bit address operand `tmem_addr` collectively across all threads of the warps.
1362+
1363+
All the threads in the warp must specify the same value of `tmem_addr`, which must be the base address of the
1364+
collective store operation. Otherwise, the behavior is undefined.
1365+
1366+
The `shape` qualifier and the `num` qualifier together determine the total dimension of the data ('n') which
1367+
is stored to the Tensor Memory. The `shape` qualifier indicates the base dimension of data. The `num` qualifier
1368+
indicates the repeat factor on the base dimension resulting in the total dimension of the data that is accessed.
1369+
1370+
Allowed values for the 'num' are `x1, x2, x4, x8, x16, x32, x64, x128`.
1371+
1372+
Allowed values for the 'shape' in the first intrinsic are `16x64b, 16x128b, 16x256b, 32x32b`.
1373+
1374+
Allowed value for the 'shape' in the second intrinsic is `16x32bx2`.
1375+
1376+
The `args` argument is a vector consisting of one or more 32-bit registers derived from `shape` and
1377+
`num` as listed in the table listed in the `tcgen05.ld` section.
1378+
1379+
Each shape supports an `unpack` mode that allows a 32-bit element in the register to be unpacked into two 16-bit elements and stored in adjacent columns. `unpack` mode can be enabled by setting the `%unpack` operand to 1 and can be disabled by setting it to 0.
1380+
1381+
The last argument `i1 %unpack` is a compile-time constant which, when set, indicates that a 32-bit element in the register is unpacked into two 16-bit elements and stored in adjacent columns.
1382+
1383+
For more information, refer to the
1384+
`PTX ISA <https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-instructions-tcgen05-st>`__.
1385+
12911386
Other Intrinsics
12921387
----------------
12931388

llvm/include/llvm/IR/Intrinsics.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -564,6 +564,7 @@ def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
564564
def llvm_v16i32_ty : LLVMType<v16i32>; // 16 x i32
565565
def llvm_v32i32_ty : LLVMType<v32i32>; // 32 x i32
566566
def llvm_v64i32_ty : LLVMType<v64i32>; // 64 x i32
567+
def llvm_v128i32_ty : LLVMType<v128i32>; //128 x i32
567568
def llvm_v256i32_ty : LLVMType<v256i32>; //256 x i32
568569

569570
def llvm_v1i64_ty : LLVMType<v1i64>; // 1 x i64

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -664,6 +664,35 @@ class CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, string mode, string op> {
664664
ImmArg<ArgIndex<FlagsStartIdx>>];
665665
}
666666

667+
class NVVM_TCGEN05_LDST_NAME<string Op, string Shape, int Num> {
668+
string intr = "llvm.nvvm.tcgen05." # Op
669+
# "." # Shape
670+
# "." # "x" # !shl(1, Num);
671+
672+
string record = !subst(".", "_",
673+
!subst("llvm.", "int_", intr));
674+
}
675+
676+
677+
class NVVM_TCGEN05_LDST_ACCESS_SIZE<string Shape, int Num> {
678+
int shift = !cond(!eq(Shape, "16x128b"): 1,
679+
!eq(Shape, "16x256b"): 2,
680+
true : 0);
681+
682+
int veclen = !shl(1, !add(Num, shift));
683+
684+
int valid = !le(veclen, 128);
685+
LLVMType type = !cond(!eq(veclen, 1): llvm_i32_ty,
686+
!eq(veclen, 2): llvm_v2i32_ty,
687+
!eq(veclen, 4): llvm_v4i32_ty,
688+
!eq(veclen, 8): llvm_v8i32_ty,
689+
!eq(veclen, 16): llvm_v16i32_ty,
690+
!eq(veclen, 32): llvm_v32i32_ty,
691+
!eq(veclen, 64): llvm_v64i32_ty,
692+
!eq(veclen, 128): llvm_v128i32_ty,
693+
true : llvm_void_ty);
694+
}
695+
667696
let TargetPrefix = "nvvm" in {
668697
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
669698
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -5186,4 +5215,40 @@ foreach cta_group = ["cg1", "cg2"] in {
51865215
}
51875216
}
51885217

5218+
// Tcgen05 ld intrinsics
5219+
class NVVM_TCGEN05_LD<string Shape, int Num> :
5220+
Intrinsic<[NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
5221+
!listconcat([llvm_tmem_ptr_ty],
5222+
!if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
5223+
[llvm_i1_ty]),
5224+
!listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
5225+
!if(!eq(Shape, "16x32bx2"),
5226+
[ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>],
5227+
[ImmArg<ArgIndex<1>>])),
5228+
NVVM_TCGEN05_LDST_NAME<"ld", Shape, Num>.intr>;
5229+
5230+
// Tcgen05 st intrinsics
5231+
class NVVM_TCGEN05_ST<string Shape, int Num> :
5232+
Intrinsic<[],
5233+
!listconcat([llvm_tmem_ptr_ty],
5234+
!if(!eq(Shape, "16x32bx2"), [llvm_i64_ty], []),
5235+
[NVVM_TCGEN05_LDST_ACCESS_SIZE<Shape, Num>.type],
5236+
[llvm_i1_ty]),
5237+
!listconcat([IntrConvergent, IntrArgMemOnly, NoCapture<ArgIndex<0>>],
5238+
!if(!eq(Shape, "16x32bx2"),
5239+
[ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<3>>],
5240+
[ImmArg<ArgIndex<2>>])),
5241+
NVVM_TCGEN05_LDST_NAME<"st", Shape, Num>.intr>;
5242+
5243+
foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in {
5244+
foreach num = !range(0, 8) in {
5245+
if NVVM_TCGEN05_LDST_ACCESS_SIZE<shape, num>.valid then {
5246+
def NVVM_TCGEN05_LDST_NAME<"ld", shape, num>.record:
5247+
NVVM_TCGEN05_LD<shape, num>;
5248+
def NVVM_TCGEN05_LDST_NAME<"st", shape, num>.record:
5249+
NVVM_TCGEN05_ST<shape, num>;
5250+
}
5251+
}
5252+
}
5253+
51895254
} // let TargetPrefix = "nvvm"

0 commit comments

Comments
 (0)