Skip to content

[MLIR][GPU] Add gpu.cluster_dim_blocks and gpu.cluster_block_id Ops #95245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 26 additions & 1 deletion mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :

def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
let description = [{
Returns the number of thread blocks in the cluster along
Returns the number of cluster identifiers per grid along
the x, y, or z `dimension`.

Example:
Expand All @@ -81,6 +81,19 @@ def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
}];
}

// Index op yielding the number of thread blocks in a cluster along one
// dimension (x, y, or z). The GPU-to-NVVM conversion maps it onto
// NVVM::ClusterDimBlocks{X,Y,Z}Op.
def GPU_ClusterDimBlocksOp : GPU_IndexOp<"cluster_dim_blocks"> {
let description = [{
Returns the number of thread blocks in the cluster along
the x, y, or z `dimension`.

Example:

```mlir
%cDimBlocksX = gpu.cluster_dim_blocks x
```
}];
}

def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
let description = [{
Returns the cluster id, i.e. the index of the current cluster within the
Expand All @@ -94,6 +107,18 @@ def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
}];
}

// Index op yielding the id of the current thread block within its cluster
// along one dimension (x, y, or z). The GPU-to-NVVM conversion maps it onto
// NVVM::BlockInClusterId{X,Y,Z}Op.
def GPU_ClusterBlockIdOp : GPU_IndexOp<"cluster_block_id"> {
let description = [{
Returns the block id within the cluster along the x, y, or z `dimension`.

Example:

```mlir
%cBlockIdY = gpu.cluster_block_id y
```
}];
}

def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
let description = [{
Returns the number of threads in the thread block (aka the block size) along
Expand Down
6 changes: 3 additions & 3 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,9 @@ def NVVM_ClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.z">;
// Special-register reads of `cluster.ctaid.{x,y,z}`; these are the lowering
// targets of gpu.cluster_block_id (block id within the cluster).
def NVVM_BlockInClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.x">;
def NVVM_BlockInClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.y">;
def NVVM_BlockInClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.z">;
def NVVM_GridInClusterDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
def NVVM_GridInClusterDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
def NVVM_GridInClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
// Special-register reads of `cluster.nctaid.{x,y,z}`; these are the lowering
// targets of gpu.cluster_dim_blocks (number of thread blocks in the cluster).
def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;

//===----------------------------------------------------------------------===//
// CTA index and across Cluster dimensions
Expand Down
6 changes: 6 additions & 0 deletions mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,14 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
GPUIndexIntrinsicOpLowering<
gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
GPUIndexIntrinsicOpLowering<
gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
Expand Down
12 changes: 12 additions & 0 deletions mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {

/// Reports the result range of `gpu.cluster_dim` as [1, 2^64 - 1]: the value
/// is at least one, and the upper bound is simply the largest unsigned 64-bit
/// value rather than a tighter limit.
void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                     SetIntRangeFn setResultRange) {
  const APInt allOnes = APInt::getMaxValue(64);
  setResultRange(getResult(), getIndexRange(1, allOnes.getZExtValue()));
}

/// Bounds the result of `gpu.cluster_dim_blocks` to [1, kMaxClusterDim].
void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                           SetIntRangeFn setResultRange) {
  const auto blocksPerClusterRange = getIndexRange(1, kMaxClusterDim);
  setResultRange(getResult(), blocksPerClusterRange);
}

Expand All @@ -95,6 +101,12 @@ void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
setResultRange(getResult(), getIndexRange(0, max - 1ULL));
}

/// Bounds the result of `gpu.cluster_block_id` to [0, kMaxClusterDim - 1]:
/// ids are zero-based, so the largest id is one below the cluster-dimension
/// limit.
void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                         SetIntRangeFn setResultRange) {
  const uint64_t blockCountLimit = kMaxClusterDim;
  const uint64_t lastBlockId = blockCountLimit - 1ULL;
  setResultRange(getResult(), getIndexRange(0, lastBlockId));
}

void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
SetIntRangeFn setResultRange) {
std::optional<uint64_t> knownVal =
Expand Down
6 changes: 3 additions & 3 deletions mlir/test/Integration/GPU/CUDA/sm90/cga_cluster.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ module attributes {gpu.container_module} {
%cidX = gpu.cluster_id x
%cidY = gpu.cluster_id y
%cidZ = gpu.cluster_id z
%cdimX = gpu.cluster_dim x
%cdimY = gpu.cluster_dim y
%cdimZ = gpu.cluster_dim z
%cdimX = gpu.cluster_dim_blocks x
%cdimY = gpu.cluster_dim_blocks y
%cdimZ = gpu.cluster_dim_blocks z
%bidX = gpu.block_id x
%bidY = gpu.block_id y
%bidZ = gpu.block_id z
Expand Down
Loading