[mlir][gpu] Support Cluster of Thread Blocks in gpu.launch_func #72871

Merged: 4 commits, Nov 27, 2023
75 changes: 69 additions & 6 deletions mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -53,6 +53,32 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
let assemblyFormat = "$dimension attr-dict";
}

def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
let description = [{
Returns the number of thread blocks in the cluster along
the x, y, or z `dimension`.

Example:

```mlir
%cDimX = gpu.cluster_dim x
```
}];
}

def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
let description = [{
Returns the cluster id, i.e. the index of the current cluster within the
grid along the x, y, or z `dimension`.

Example:

```mlir
%cIdY = gpu.cluster_id y
```
}];
}
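
For illustration, a minimal sketch (not part of this patch) combining the two new ops to flatten a 2-D cluster index into a linear id, assuming the enclosing kernel was launched with a cluster size:

```mlir
%cid_x  = gpu.cluster_id x
%cid_y  = gpu.cluster_id y
%cdim_x = gpu.cluster_dim x
// Linear cluster id in the x-y plane: y * dim_x + x.
%tmp    = arith.muli %cid_y, %cdim_x : index
%linear = arith.addi %tmp, %cid_x : index
```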

def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
let description = [{
Returns the number of threads in the thread block (aka the block size) along
@@ -441,8 +467,15 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
"blockSizeY", "blockSizeZ"]>]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
SymbolRefAttr:$kernel,
LaunchIndx:$gridSizeX, LaunchIndx:$gridSizeY, LaunchIndx:$gridSizeZ,
LaunchIndx:$blockSizeX, LaunchIndx:$blockSizeY, LaunchIndx:$blockSizeZ,
LaunchIndx:$gridSizeX,
LaunchIndx:$gridSizeY,
LaunchIndx:$gridSizeZ,
LaunchIndx:$blockSizeX,
LaunchIndx:$blockSizeY,
LaunchIndx:$blockSizeZ,
Optional<LaunchIndx>:$clusterSizeX,
Optional<LaunchIndx>:$clusterSizeY,
Optional<LaunchIndx>:$clusterSizeZ,
Optional<I32>:$dynamicSharedMemorySize,
Variadic<AnyType>:$kernelOperands,
Optional<AnyType>:$asyncObject)>,
@@ -480,6 +513,12 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
The remaining operands, if present, are passed as arguments to the kernel
function.

The `gpu.launch_func` also supports launching kernels with thread block
clusters, when the target architecture supports them. The cluster size can
be set via the `clusterSizeX`, `clusterSizeY`, and `clusterSizeZ` arguments.
When these arguments are present, the op launches a kernel that groups the
given thread blocks into clusters, as the example below shows. Note that
this feature is only available on certain architectures.

Example:

```mlir
@@ -509,6 +548,15 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
%gDimY = gpu.grid_dim y
%gDimZ = gpu.grid_dim z

// (Optional) Cluster ids and dimensions, available only on
// architectures that support thread block clusters.
%cIdX = gpu.cluster_id x
%cIdY = gpu.cluster_id y
%cIdZ = gpu.cluster_id z

%cDimX = gpu.cluster_dim x
%cDimY = gpu.cluster_dim y
%cDimZ = gpu.cluster_dim z

"some_op"(%bx, %tx) : (index, index) -> ()
%42 = load %arg1[%bx] : memref<?xf32, 1>
}
@@ -519,6 +567,7 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
async // (Optional) Don't block host, return token.
[%t0] // (Optional) Execute only after %t0 has completed.
@kernels::@kernel_1 // Kernel function.
clusters in (%cst, %cst, %cst) // (Optional) Cluster size; only on architectures that support it.
blocks in (%cst, %cst, %cst) // Grid size.
threads in (%cst, %cst, %cst) // Block size.
dynamic_shared_memory_size %s // (Optional) Amount of dynamic shared
@@ -536,11 +585,13 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
"KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
"ValueRange":$kernelOperands,
CArg<"Type", "nullptr">:$asyncTokenType,
CArg<"ValueRange", "{}">:$asyncDependencies)>,
CArg<"ValueRange", "{}">:$asyncDependencies,
CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>,
OpBuilder<(ins "SymbolRefAttr":$kernel, "KernelDim3":$gridSize,
"KernelDim3":$blockSize, "Value":$dynamicSharedMemorySize,
"ValueRange":$kernelOperands,
CArg<"Value", "nullptr">:$asyncObject)>
CArg<"Value", "nullptr">:$asyncObject,
CArg<"std::optional<KernelDim3>", "std::nullopt">:$clusterSize)>
];

let extraClassDeclaration = [{
@@ -550,12 +601,23 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
/// The name of the kernel.
StringAttr getKernelName();

/// Returns true if cluster size is specified.
bool hasClusterSize() {
return getClusterSizeX() && getClusterSizeY() && getClusterSizeZ();
}

/// The number of operands passed to the kernel function.
unsigned getNumKernelOperands();

/// The i-th operand passed to the kernel function.
Value getKernelOperand(unsigned i);

/// Get the SSA values passed as operands to specify the cluster size.
/// Asserts when no cluster size is specified.
KernelDim3 getClusterSizeOperandValues();

/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();

@@ -571,10 +633,11 @@ def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
(`<` $asyncObject^ `:` type($asyncObject) `>`)?
$kernel
( `clusters` `in` ` ` `(` $clusterSizeX^ `,` $clusterSizeY `,` $clusterSizeZ `)` )?
`blocks` `in` ` ` `(` $gridSizeX `,` $gridSizeY `,` $gridSizeZ `)`
`threads` `in` ` ` `(` $blockSizeX `,` $blockSizeY `,` $blockSizeZ `)`
custom<LaunchDimType>(type($gridSizeX))
custom<LaunchDimType>(type($gridSizeX), ref($clusterSizeX), type($clusterSizeX), type($clusterSizeY), type($clusterSizeZ))
(`dynamic_shared_memory_size` $dynamicSharedMemorySize^)?
custom<LaunchFuncOperands>($kernelOperands, type($kernelOperands)) attr-dict
}];
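
For reference, a hedged sketch of the textual form this assembly format accepts (kernel name and constants are made up; the trailing `: i64` exercises the extended `custom<LaunchDimType>` directive, which propagates the launch-dimension type to the optional cluster operands):

```mlir
%c1 = arith.constant 1 : i64
gpu.launch_func @kernels::@kernel_1
    clusters in (%c1, %c1, %c1)
    blocks in (%c1, %c1, %c1)
    threads in (%c1, %c1, %c1) : i64
```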
8 changes: 7 additions & 1 deletion mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -1128,13 +1128,19 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(),
rewriter, /*useBarePtrCallConv=*/kernelBarePtrCallConv);

std::optional<gpu::KernelDim3> clusterSize = std::nullopt;
if (launchOp.hasClusterSize()) {
clusterSize =
gpu::KernelDim3{adaptor.getClusterSizeX(), adaptor.getClusterSizeY(),
adaptor.getClusterSizeZ()};
}
rewriter.create<gpu::LaunchFuncOp>(
launchOp.getLoc(), launchOp.getKernelAttr(),
gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(),
adaptor.getGridSizeZ()},
gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
adaptor.getBlockSizeZ()},
adaptor.getDynamicSharedMemorySize(), arguments, stream);
adaptor.getDynamicSharedMemorySize(), arguments, stream, clusterSize);
if (launchOp.getAsyncToken())
rewriter.replaceOp(launchOp, {stream});
else
25 changes: 14 additions & 11 deletions mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -313,17 +313,20 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns) {
populateWithGenerated(patterns);
patterns.add<GPUPrintfOpToVPrintfLowering>(converter);
patterns
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
NVVM::GridDimYOp, NVVM::GridDimZOp>,
GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(
converter);
patterns.add<
GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, NVVM::ThreadIdXOp,
NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, NVVM::BlockDimXOp,
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
NVVM::GridDimYOp, NVVM::GridDimZOp>,
GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter);

// Explicitly drop memory space when lowering private memory
// attributions since NVVM models it as `alloca`s in the default
48 changes: 43 additions & 5 deletions mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -31,6 +31,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>

using namespace mlir;
using namespace mlir::gpu;
@@ -983,14 +984,17 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
GPUFuncOp kernelFunc, KernelDim3 gridSize,
KernelDim3 getBlockSize, Value dynamicSharedMemorySize,
ValueRange kernelOperands, Type asyncTokenType,
ValueRange asyncDependencies) {
ValueRange asyncDependencies,
std::optional<KernelDim3> clusterSize) {
result.addOperands(asyncDependencies);
if (asyncTokenType)
result.types.push_back(builder.getType<AsyncTokenType>());

// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x,
getBlockSize.y, getBlockSize.z});
if (clusterSize.has_value())
result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z});
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);
result.addOperands(kernelOperands);
@@ -1006,6 +1010,11 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
for (auto &sz : prop.operandSegmentSizes)
sz = 1;
prop.operandSegmentSizes[0] = asyncDependencies.size();
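// The three cluster-size segments sit immediately before the trailing
// segments for dynamicSharedMemorySize, kernelOperands, and asyncObject;
// clear them when no cluster size was provided.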
if (!clusterSize.has_value()) {
prop.operandSegmentSizes[segmentSizesLen - 4] = 0;
prop.operandSegmentSizes[segmentSizesLen - 5] = 0;
prop.operandSegmentSizes[segmentSizesLen - 6] = 0;
}
prop.operandSegmentSizes[segmentSizesLen - 3] =
dynamicSharedMemorySize ? 1 : 0;
prop.operandSegmentSizes[segmentSizesLen - 2] =
@@ -1016,10 +1025,13 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
SymbolRefAttr kernel, KernelDim3 gridSize,
KernelDim3 getBlockSize, Value dynamicSharedMemorySize,
ValueRange kernelOperands, Value asyncObject) {
ValueRange kernelOperands, Value asyncObject,
std::optional<KernelDim3> clusterSize) {
// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands({gridSize.x, gridSize.y, gridSize.z, getBlockSize.x,
getBlockSize.y, getBlockSize.z});
if (clusterSize.has_value())
result.addOperands({clusterSize->x, clusterSize->y, clusterSize->z});
if (dynamicSharedMemorySize)
result.addOperands(dynamicSharedMemorySize);
result.addOperands(kernelOperands);
@@ -1032,6 +1044,11 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
for (auto &sz : prop.operandSegmentSizes)
sz = 1;
prop.operandSegmentSizes[0] = 0;
if (!clusterSize.has_value()) {
prop.operandSegmentSizes[segmentSizesLen - 4] = 0;
prop.operandSegmentSizes[segmentSizesLen - 5] = 0;
prop.operandSegmentSizes[segmentSizesLen - 6] = 0;
}
prop.operandSegmentSizes[segmentSizesLen - 3] =
dynamicSharedMemorySize ? 1 : 0;
prop.operandSegmentSizes[segmentSizesLen - 2] =
@@ -1065,6 +1082,13 @@ KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
return KernelDim3{operands[3], operands[4], operands[5]};
}

KernelDim3 LaunchFuncOp::getClusterSizeOperandValues() {
assert(hasClusterSize() &&
"cluster size is not set, check hasClusterSize() first");
auto operands = getOperands().drop_front(getAsyncDependencies().size());
return KernelDim3{operands[6], operands[7], operands[8]};
}

LogicalResult LaunchFuncOp::verify() {
auto module = (*this)->getParentOfType<ModuleOp>();
if (!module)
@@ -1076,21 +1100,35 @@ LogicalResult LaunchFuncOp::verify() {
GPUDialect::getContainerModuleAttrName() +
"' attribute");

if (hasClusterSize()) {
if (getClusterSizeY().getType() != getClusterSizeX().getType() ||
getClusterSizeZ().getType() != getClusterSizeX().getType())
return emitOpError()
<< "expects types of the cluster dimensions to be the same";
}

return success();
}

static ParseResult parseLaunchDimType(OpAsmParser &parser, Type &dimTy) {
static ParseResult
parseLaunchDimType(OpAsmParser &parser, Type &dimTy,
std::optional<OpAsmParser::UnresolvedOperand> clusterValue,
Type &clusterXTy, Type &clusterYTy, Type &clusterZTy) {
if (succeeded(parser.parseOptionalColon())) {
if (parser.parseType(dimTy))
return failure();
} else {
dimTy = IndexType::get(parser.getContext());
}
if (clusterValue.has_value()) {
clusterXTy = clusterYTy = clusterZTy = dimTy;
}
return success();
}

static void printLaunchDimType(OpAsmPrinter &printer, Operation *op,
Type dimTy) {
static void printLaunchDimType(OpAsmPrinter &printer, Operation *op, Type dimTy,
Value clusterValue, Type clusterXTy,
Type clusterYTy, Type clusterZTy) {
if (!dimTy.isIndex())
printer << ": " << dimTy;
}
13 changes: 13 additions & 0 deletions mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -19,6 +19,8 @@ using namespace mlir::gpu;

// Maximum grid and block dimensions of all known GPUs are less than 2^32.
static constexpr uint64_t kMaxDim = std::numeric_limits<uint32_t>::max();
// Maximum cluster sizes of all known GPUs are no larger than 8 thread
// blocks per dimension.
static constexpr uint64_t kMaxClusterDim = 8;
// Maximum subgroups are no larger than 128.
static constexpr uint64_t kMaxSubgroupSize = 128;

@@ -82,6 +84,17 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
return std::nullopt;
}

void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
setResultRange(getResult(), getIndexRange(1, kMaxClusterDim));
}

void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
uint64_t max = kMaxClusterDim;
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
}
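
These bounds let MLIR's integer range analysis discharge cluster-bound checks. A hedged sketch of the intended effect (`-int-range-optimizations` is the upstream arith pass; the IR is illustrative):

```mlir
%cid = gpu.cluster_id x
%c8  = arith.constant 8 : index
// cluster_id x is inferred to lie in [0, 7], so this comparison can fold
// to a constant true under -int-range-optimizations.
%ok  = arith.cmpi ult, %cid, %c8 : index
```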

void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
std::optional<uint64_t> knownVal =