llvm · durga4github · Jun 14, 2024 · Jun 12, 2024 · Jun 13, 2024
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -70,7 +70,7 @@ class GPU_IndexOp<string mnemonic, list<Trait> traits = []> :
 
 def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
   let description = [{
-    Returns the number of thread blocks in the cluster along
+    Returns the number of cluster identifiers per grid along
     the x, y, or z `dimension`.
 
     Example:
@@ -81,6 +81,19 @@ def GPU_ClusterDimOp : GPU_IndexOp<"cluster_dim"> {
   }];
 }
 
+// `gpu.cluster_dim_blocks`: the GPU-to-NVVM conversion lowers this op to
+// NVVM::ClusterDimBlocks{X,Y,Z}Op, and its inferResultRanges passes the bounds
+// 1 and kMaxClusterDim.
+def GPU_ClusterDimBlocksOp : GPU_IndexOp<"cluster_dim_blocks"> {
+  let description = [{
+    Returns the number of thread blocks in the cluster along
+    the x, y, or z `dimension`.
+
+    Example:
+
+    ```mlir
+    %cDimBlocksX = gpu.cluster_dim_blocks x
+    ```
+  }];
+}
+
 def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
   let description = [{
     Returns the cluster id, i.e. the index of the current cluster within the
@@ -94,6 +107,18 @@ def GPU_ClusterIdOp : GPU_IndexOp<"cluster_id"> {
   }];
 }
 
+// `gpu.cluster_block_id`: the GPU-to-NVVM conversion lowers this op to
+// NVVM::BlockInClusterId{X,Y,Z}Op, and its inferResultRanges passes the bounds
+// 0 and kMaxClusterDim - 1.
+def GPU_ClusterBlockIdOp : GPU_IndexOp<"cluster_block_id"> {
+  let description = [{
+    Returns the block id within the cluster along the x, y, or z `dimension`.
+
+    Example:
+
+    ```mlir
+    %cBlockIdY = gpu.cluster_block_id y
+    ```
+  }];
+}
+
 def GPU_BlockDimOp : GPU_IndexOp<"block_dim"> {
   let description = [{
     Returns the number of threads in the thread block (aka the block size) along

@@ -160,9 +160,9 @@ def NVVM_ClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.z">;
+// Id of the thread block (CTA) within its cluster, one op per dimension.
 def NVVM_BlockInClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.x">;
 def NVVM_BlockInClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.y">;
 def NVVM_BlockInClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.z">;
-def NVVM_GridInClusterDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
-def NVVM_GridInClusterDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
-def NVVM_GridInClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
+// Number of thread blocks (CTAs) in a cluster, one op per dimension.
+def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">;
+def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">;
+def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">;
 
 //===----------------------------------------------------------------------===//
 // CTA index and across Cluster dimensions

diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -342,8 +342,14 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
                                   NVVM::BlockDimYOp, NVVM::BlockDimZOp>,
       GPUIndexIntrinsicOpLowering<gpu::ClusterIdOp, NVVM::ClusterIdXOp,
                                   NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>,
+      GPUIndexIntrinsicOpLowering<
+          gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp,
+          NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>,
       GPUIndexIntrinsicOpLowering<gpu::ClusterDimOp, NVVM::ClusterDimXOp,
                                   NVVM::ClusterDimYOp, NVVM::ClusterDimZOp>,
+      GPUIndexIntrinsicOpLowering<
+          gpu::ClusterDimBlocksOp, NVVM::ClusterDimBlocksXOp,
+          NVVM::ClusterDimBlocksYOp, NVVM::ClusterDimBlocksZOp>,
       GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, NVVM::BlockIdXOp,
                                   NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
       GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,

diff --git a/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp b/mlir/lib/Dialect/GPU/IR/InferIntRangeInterfaceImpls.cpp
@@ -86,6 +86,12 @@ static std::optional<uint64_t> getKnownLaunchDim(Op op, LaunchDims type) {
 
 void ClusterDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                      SetIntRangeFn setResultRange) {
+  // No tighter upper bound is applied here than the largest unsigned 64-bit
+  // value; the lower bound passed is 1.
+  const uint64_t unboundedMax = APInt::getMaxValue(64).getZExtValue();
+  setResultRange(getResult(), getIndexRange(1, unboundedMax));
+}
+
+void ClusterDimBlocksOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
+                                           SetIntRangeFn setResultRange) {
+  // The bounds passed for the result are 1 and kMaxClusterDim.
   setResultRange(getResult(), getIndexRange(1, kMaxClusterDim));
 }
 
@@ -95,6 +101,12 @@ void ClusterIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
   setResultRange(getResult(), getIndexRange(0, max - 1ULL));
 }
 
+void ClusterBlockIdOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
+                                         SetIntRangeFn setResultRange) {
+  // The bounds passed for the result are 0 and kMaxClusterDim - 1.
+  const uint64_t lastBlockId = kMaxClusterDim - 1ULL;
+  setResultRange(getResult(), getIndexRange(0, lastBlockId));
+}
+
 void BlockDimOp::inferResultRanges(ArrayRef<ConstantIntRanges>,
                                    SetIntRangeFn setResultRange) {
   std::optional<uint64_t> knownVal =

@@ -22,9 +22,9 @@ module attributes {gpu.container_module} {
       %cidX = gpu.cluster_id  x
       %cidY = gpu.cluster_id  y
       %cidZ = gpu.cluster_id  z
-      %cdimX = gpu.cluster_dim  x
-      %cdimY = gpu.cluster_dim  y
-      %cdimZ = gpu.cluster_dim  z
+      %cdimX = gpu.cluster_dim_blocks  x
+      %cdimY = gpu.cluster_dim_blocks  y
+      %cdimZ = gpu.cluster_dim_blocks  z
       %bidX = gpu.block_id  x
       %bidY = gpu.block_id  y
       %bidZ = gpu.block_id  z