Skip to content

Commit edf5cae

Browse files
authored
[mlir][gpu] Support Cluster of Thread Blocks in gpu.launch_func (#72871)
NVIDIA Hopper architecture introduced the Cooperative Group Array (CGA). It is a new level of parallelism, allowing clustering of Cooperative Thread Arrays (CTA) to synchronize and communicate through shared memory while running concurrently. This PR enables support for CGA within the `gpu.launch_func` in the GPU dialect. It extends `gpu.launch_func` to accommodate this functionality. The GPU dialect remains architecture-agnostic, so we've added CGA functionality as optional parameters. We want to leverage mechanisms that we have in the GPU dialects such as outlining and kernel launching, making it a practical and convenient choice. An example of this implementation can be seen below: ``` gpu.launch_func @kernel_module::@kernel clusters in (%1, %0, %0) // <-- Optional blocks in (%0, %0, %0) threads in (%0, %0, %0) ``` The PR also introduces index and dimensions Ops specific to clusters, binding them to NVVM Ops: ``` %cidX = gpu.cluster_id x %cidY = gpu.cluster_id y %cidZ = gpu.cluster_id z %cdimX = gpu.cluster_dim x %cdimY = gpu.cluster_dim y %cdimZ = gpu.cluster_dim z ``` We will introduce cluster support in `gpu.launch` Op in an upcoming PR. See [the documentation](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#cluster-of-cooperative-thread-arrays) provided by NVIDIA for details.
1 parent d1652ff commit edf5cae

File tree

12 files changed

+356
-28
lines changed

12 files changed

+356
-28
lines changed

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,32 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
5353
let assemblyFormat = "$dimension attr-dict";
5454
}
5555

56+
def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
57+
let description = [{
58+
Returns the number of thread blocks in the cluster along
59+
the x, y, or z `dimension`.
60+
61+
Example:
62+
63+
```mlir
64+
%cDimX = gpu.cluster_dim x
65+
```
66+
}];
67+
}
68+
69+
def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
70+
let description = [{
71+
Returns the cluster id, i.e. the index of the current cluster within the
72+
grid along the x, y, or z `dimension`.
73+
74+
Example:
75+
76+
```mlir
77+
%cIdY = gpu.cluster_id y
78+
```
79+
}];
80+
}
81+
5682
def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
5783
let description = [{
5884
Returns the number of threads in the thread block (aka the block size) along
@@ -467,8 +493,15 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
467493
"blockSizeY", "blockSizeZ"]>]>,
468494
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
469495
SymbolRefAttr:$kernel,
470-
LaunchIndx:$gridSizeX, LaunchIndx:$gridSizeY, LaunchIndx:$gridSizeZ,
471-
LaunchIndx:$blockSizeX, LaunchIndx:$blockSizeY, LaunchIndx:$blockSizeZ,
496+
LaunchIndx:$gridSizeX,
497+
LaunchIndx:$gridSizeY,
498+
LaunchIndx:$gridSizeZ,
499+
LaunchIndx:$blockSizeX,
500+
LaunchIndx:$blockSizeY,
501+
LaunchIndx:$blockSizeZ,
502+
Optional<LaunchIndx>:$clusterSizeX,
503+
Optional<LaunchIndx>:$clusterSizeY,
504+
Optional<LaunchIndx>:$clusterSizeZ,
472505
Optional<I32>:$dynamicSharedMemorySize,
473506
Variadic<AnyType>:$kernelOperands,
474507
Optional<AnyType>:$asyncObject)>,
@@ -506,6 +539,12 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
506539
The remaining operands if present are passed as arguments to the kernel
507540
function.
508541

542+
The `gpu.launch_func` also supports kernel launching with clusters if
543+
supported by the target architecture. The cluster size can be set by
544+
`clusterSizeX`, `clusterSizeY`, and `clusterSizeZ` arguments. When these
545+
arguments are present, the Op launches a kernel that clusters the given
546+
thread blocks. This feature is exclusive to certain architectures.
547+
509548
Example:
510549

511550
```mlir
@@ -535,6 +574,15 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
535574
%gDimY = gpu.grid_dim y
536575
%gDimZ = gpu.grid_dim z
537576

577+
// (Optional) Cluster size only for supported architectures
578+
%cIdX = gpu.cluster_id x
579+
%cIdY = gpu.cluster_id y
580+
%cIdZ = gpu.cluster_id z
581+
582+
%cDimX = gpu.cluster_dim x
583+
%cDimY = gpu.cluster_dim y
584+
%cDimZ = gpu.cluster_dim z
585+
538586
"some_op"(%bx, %tx) : (index, index) -> ()
539587
%42 = load %arg1[%bx] : memref<?xf32, 1>
540588
}
@@ -545,6 +593,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
545593
async // (Optional) Don't block host, return token.
546594
[%t0] // (Optional) Execute only after %t0 has completed.
547595
@kernels::@kernel_1 // Kernel function.
596+
clusters in (%cst, %cst, %cst) // (Optional) Cluster size only for supported architectures.
548597
blocks in (%cst, %cst, %cst) // Grid size.
549598
threads in (%cst, %cst, %cst) // Block size.
550599
dynamic_shared_memory_size %s // (Optional) Amount of dynamic shared
@@ -562,11 +611,13 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
562611
"KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
563612
"ValueRange":$kernelOperands,
564613
CArg<"Type", "nullptr">:$asyncTokenType,
565-
CArg<"ValueRange", "{}">:$asyncDependencies)>,
614+
CArg<"ValueRange", "{}">:$asyncDependencies,
615+
CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>,
566616
OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
567617
"KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
568618
"ValueRange":$kernelOperands,
569-
CArg<"Value", "nullptr">:$asyncObject)>
619+
CArg<"Value", "nullptr">:$asyncObject,
620+
CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>
570621
];
571622

572623
let extraClassDeclaration = [{
@@ -576,12 +627,23 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
576627
/// The name of the kernel.
577628
StringAttr getKernelName();
578629

630+
/// Returns true if cluster size is specified.
631+
bool hasClusterSize() {
632+
if (getClusterSizeX() && getClusterSizeY() && getClusterSizeZ())
633+
return true;
634+
return false;
635+
}
636+
579637
/// The number of operands passed to the kernel function.
580638
unsigned getNumKernelOperands();
581639

582640
/// The i-th operand passed to the kernel function.
583641
Value getKernelOperand(unsigned i);
584642

643+
/// Get the SSA values passed as operands to specify the cluster size.
644+
/// When the cluster sizes are not specified, it asserts.
645+
KernelDim3 getClusterSizeOperandValues();
646+
585647
/// Get the SSA values passed as operands to specify the grid size.
586648
KernelDim3 getGridSizeOperandValues();
587649

@@ -597,10 +659,11 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
597659
let assemblyFormat = [{
598660
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
599661
(`<` $asyncObject^ `:` type($asyncObject) `>`)?
600-
$kernel
662+
$kernel
663+
( `clusters` `in` ` ` `(` $clusterSizeX^ `,` $clusterSizeY `,` $clusterSizeZ `)` )?
601664
`blocks` `in` ` ` `(` $gridSizeX `,` $gridSizeY `,` $gridSizeZ `)`
602665
`threads` `in` ` ` `(` $blockSizeX `,` $blockSizeY `,` $blockSizeZ `)`
603-
custom<LaunchDimType>(type($gridSizeX))
666+
custom<LaunchDimType>(type($gridSizeX), ref($clusterSizeX), type($clusterSizeX), type($clusterSizeY), type($clusterSizeZ))
604667
(`dynamic_shared_memory_size` $dynamicSharedMemorySize^)?
605668
custom<LaunchFuncOperands>($kernelOperands, type($kernelOperands)) attr-dict
606669
}];

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1128,13 +1128,19 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
11281128
loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(),
11291129
rewriter, /*useBarePtrCallConv=*/kernelBarePtrCallConv);
11301130

1131+
std::optional<gpu::KernelDim3> clusterSize = std::nullopt;
1132+
if (launchOp.hasClusterSize()) {
1133+
clusterSize =
1134+
gpu::KernelDim3{adaptor.getClusterSizeX(), adaptor.getClusterSizeY(),
1135+
adaptor.getClusterSizeZ()};
1136+
}
11311137
rewriter.create<gpu::LaunchFuncOp>(
11321138
launchOp.getLoc(), launchOp.getKernelAttr(),
11331139
gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(),
11341140
adaptor.getGridSizeZ()},
11351141
gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
11361142
adaptor.getBlockSizeZ()},
1137-
adaptor.getDynamicSharedMemorySize(), arguments, stream);
1143+
adaptor.getDynamicSharedMemorySize(), arguments, stream, clusterSize);
11381144
if (launchOp.getAsyncToken())
11391145
rewriter.replaceOp(launchOp, {stream});
11401146
else

mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -313,17 +313,20 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
313313
RewritePatternSet &patterns) {
314314
populateWithGenerated(patterns);
315315
patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
316-
patterns
317-
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
318-
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
319-
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
320-
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
321-
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
322-
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
323-
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
324-
NVVM::GridDimYOp, NVVM::GridDimZOp>,
325-
GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(
326-
converter);
316+
patterns.add<
317+
GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
318+
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
319+
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
320+
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
321+
GPUIndexIntrinsicOpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
322+
NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
323+
GPUIndexIntrinsicOpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
324+
NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
325+
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
326+
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
327+
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
328+
NVVM::GridDimYOp, NVVM::GridDimZOp>,
329+
GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
327330

328331
patterns.add<GPUDynamicSharedMemoryOpLowering>(
329332
converter, NVVM::kSharedMemoryAlignmentBit);

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "llvm/Support/CommandLine.h"
3333
#include "llvm/Support/ErrorHandling.h"
3434
#include "llvm/Support/StringSaver.h"
35+
#include <cassert>
3536

3637
using namespace mlir;
3738
using namespace mlir::gpu;
@@ -985,14 +986,17 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
985986
GPUFuncOp kernelFunc, KernelDim3 gridSize,
986987
KernelDim3 getBlockSize, Value dynamicSharedMemorySize,
987988
ValueRange kernelOperands, Type asyncTokenType,
988-
ValueRange asyncDependencies) {
989+
ValueRange asyncDependencies,
990+
std::optional<KernelDim3> clusterSize) {
989991
result.addOperands(asyncDependencies);
990992
if (asyncTokenType)
991993
result.types.push_back(builder.getType<AsyncTokenType>());
992994

993995
// Add grid and block sizes as op operands, followed by the data operands.
994996
result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x,
995997
getBlockSize.y, getBlockSize.z});
998+
if (clusterSize.has_value())
999+
result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z});
9961000
if (dynamicSharedMemorySize)
9971001
result.addOperands(dynamicSharedMemorySize);
9981002
result.addOperands(kernelOperands);
@@ -1008,6 +1012,11 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
10081012
for (auto &sz : prop.operandSegmentSizes)
10091013
sz = 1;
10101014
prop.operandSegmentSizes[0] = asyncDependencies.size();
1015+
if (!clusterSize.has_value()) {
1016+
prop.operandSegmentSizes[segmentSizesLen - 4] = 0;
1017+
prop.operandSegmentSizes[segmentSizesLen - 5] = 0;
1018+
prop.operandSegmentSizes[segmentSizesLen - 6] = 0;
1019+
}
10111020
prop.operandSegmentSizes[segmentSizesLen - 3] =
10121021
dynamicSharedMemorySize ? 1 : 0;
10131022
prop.operandSegmentSizes[segmentSizesLen - 2] =
@@ -1018,10 +1027,13 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
10181027
void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
10191028
SymbolRefAttr kernel, KernelDim3 gridSize,
10201029
KernelDim3 getBlockSize, Value dynamicSharedMemorySize,
1021-
ValueRange kernelOperands, Value asyncObject) {
1030+
ValueRange kernelOperands, Value asyncObject,
1031+
std::optional<KernelDim3> clusterSize) {
10221032
// Add grid and block sizes as op operands, followed by the data operands.
10231033
result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x,
10241034
getBlockSize.y, getBlockSize.z});
1035+
if (clusterSize.has_value())
1036+
result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z});
10251037
if (dynamicSharedMemorySize)
10261038
result.addOperands(dynamicSharedMemorySize);
10271039
result.addOperands(kernelOperands);
@@ -1034,6 +1046,11 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
10341046
for (auto &sz : prop.operandSegmentSizes)
10351047
sz = 1;
10361048
prop.operandSegmentSizes[0] = 0;
1049+
if (!clusterSize.has_value()) {
1050+
prop.operandSegmentSizes[segmentSizesLen - 4] = 0;
1051+
prop.operandSegmentSizes[segmentSizesLen - 5] = 0;
1052+
prop.operandSegmentSizes[segmentSizesLen - 6] = 0;
1053+
}
10371054
prop.operandSegmentSizes[segmentSizesLen - 3] =
10381055
dynamicSharedMemorySize ? 1 : 0;
10391056
prop.operandSegmentSizes[segmentSizesLen - 2] =
@@ -1067,6 +1084,13 @@ KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
10671084
return KernelDim3{operands[3], operands[4], operands[5]};
10681085
}
10691086

1087+
KernelDim3 LaunchFuncOp::getClusterSizeOperandValues() {
1088+
assert(hasClusterSize() &&
1089+
"cluster size is not set, check hasClusterSize() first");
1090+
auto operands = getOperands().drop_front(getAsyncDependencies().size());
1091+
return KernelDim3{operands[6], operands[7], operands[8]};
1092+
}
1093+
10701094
LogicalResult LaunchFuncOp::verify() {
10711095
auto module = (*this)->getParentOfType<ModuleOp>();
10721096
if (!module)
@@ -1078,21 +1102,35 @@ LogicalResult LaunchFuncOp::verify() {
10781102
GPUDialect::getContainerModuleAttrName() +
10791103
"' attribute");
10801104

1105+
if (hasClusterSize()) {
1106+
if (getClusterSizeY().getType() != getClusterSizeX().getType() ||
1107+
getClusterSizeZ().getType() != getClusterSizeX().getType())
1108+
return emitOpError()
1109+
<< "expects types of the cluster dimensions must be the same";
1110+
}
1111+
10811112
return success();
10821113
}
10831114

1084-
static ParseResult parseLaunchDimType(OpAsmParser &parser, Type &dimTy) {
1115+
static ParseResult
1116+
parseLaunchDimType(OpAsmParser &parser, Type &dimTy,
1117+
std::optional<OpAsmParser::UnresolvedOperand> clusterValue,
1118+
Type &clusterXTy, Type &clusterYTy, Type &clusterZTy) {
10851119
if (succeeded(parser.parseOptionalColon())) {
10861120
if (parser.parseType(dimTy))
10871121
return failure();
10881122
} else {
10891123
dimTy = IndexType::get(parser.getContext());
10901124
}
1125+
if (clusterValue.has_value()) {
1126+
clusterXTy = clusterYTy = clusterZTy = dimTy;
1127+
}
10911128
return success();
10921129
}
10931130

1094-
static void printLaunchDimType(OpAsmPrinter &printer, Operation *op,
1095-
Type dimTy) {
1131+
static void printLaunchDimType(OpAsmPrinter &printer, Operation *op, Type dimTy,
1132+
Value clusterValue, Type clusterXTy,
1133+
Type clusterYTy, Type clusterZTy) {
10961134
if (!dimTy.isIndex())
10971135
printer << ": " << dimTy;
10981136
}

mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ using namespace mlir::gpu;
1919

2020
// Maximum grid and block dimensions of all known GPUs are less than 2^32.
2121
static constexpr uint64_t kMaxDim = std::numeric_limits<uint32_t>::max();
22+
// Maximum cluster size
23+
static constexpr uint64_t kMaxClusterDim = 8;
2224
// Maximum subgroups are no larger than 128.
2325
static constexpr uint64_t kMaxSubgroupSize = 128;
2426

@@ -82,6 +84,17 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
8284
return std::nullopt;
8385
}
8486

87+
void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
88+
SetIntRangeFn setResultRange) {
89+
setResultRange(getResult(), getIndexRange(1, kMaxClusterDim));
90+
}
91+
92+
void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
93+
SetIntRangeFn setResultRange) {
94+
uint64_t max = kMaxClusterDim;
95+
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
96+
}
97+
8598
void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
8699
SetIntRangeFn setResultRange) {
87100
std::optional<uint64_t> knownVal =

0 commit comments

Comments
 (0)