updates

lialan · lialan · commit 225f75a6a093 · 2025-04-23T18:33:43.000-04:00
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -52,7 +52,7 @@ namespace mlir {
 
 using namespace mlir;
 
-/// Query function for static subgroup size lookup for given chipset.
+/// Returns the static subgroup size for the given chipset.
 // TODO: move this function to a common place.
 static int64_t querySubgroupSize(const amdgpu::Chipset &chipset) {
   // The subgroup size is the same as the wavefront size for all chipsets.
@@ -242,15 +242,34 @@ struct GPUSubgroupIdOpToROCDL final
   LogicalResult
   matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    // Calculation of the thread's subgroup identifier.
+    //
+    // The process involves mapping the thread's 3D identifier within its
+    // workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index.
+    // This linearization assumes a layout where the x index (w_id.x)
+    // varies most rapidly (i.e., x is the innermost dimension).
+    //
+    // The formula for the linearized thread index is:
+    // L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
+    //
+    // Subsequently, the range of linearized indices [0, N_threads-1] is
+    // divided into consecutive, non-overlapping segments, each representing
+    // a subgroup of size 'subgroup_size'.
+    //
+    // Example Partitioning (N = subgroup_size):
+    // | Subgroup 0      | Subgroup 1      | Subgroup 2      | ... |
+    // | Indices 0..N-1  | Indices N..2N-1 | Indices 2N..3N-1| ... |
+    //
+    // The subgroup identifier is obtained via integer division of the
+    // linearized thread index by the predefined 'subgroup_size'.
+    //
+    // subgroup_id = floor( L / subgroup_size )
+    //             = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) /
+    //             subgroup_size
     auto int32Type = IntegerType::get(rewriter.getContext(), 32);
-    auto loc = op.getLoc();
+    Location loc = op.getLoc();
     LLVM::IntegerOverflowFlags flags =
         LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
-    // linearized thread ids are divided into consecutive subgroups.
-    // Where thread id is calculated as:
-    // thread_id = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
-    // And the subgroup id of the thread is calculated as:
-    // subgroup_id = thread_id / subgroup_size
     Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
     Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
     Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -1,6 +1,7 @@
 // RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
 // RUN: mlir-opt %s -convert-gpu-to-rocdl='allowed-dialects=func,arith,math' -split-input-file | FileCheck %s
 // RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx1010' -split-input-file | FileCheck --check-prefix=CHECKGFX10 %s
 
 // CHECK-LABEL: @test_module
 // CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
@@ -756,6 +757,8 @@ gpu.module @test_module {
     // CHECK: %[[int7:.*]] = llvm.mul %[[dimx]], %[[int6]] overflow<nsw, nuw> : i32
     // CHECK: %[[int8:.*]] = llvm.add %[[widx]], %[[int7]] overflow<nsw, nuw> : i32
     // CHECK: %[[ssize:.*]] = llvm.mlir.constant(64 : i32) : i32
+    // GFX10 and above have a default subgroup size of 32.
+    // CHECKGFX10: = llvm.mlir.constant(32 : i32) : i32
     // CHECK: = llvm.sdiv %[[int8]], %[[ssize]] : i32
     // CHECK: = llvm.sext %10 : i32 to i64
     %subgroupId = gpu.subgroup_id : index