Skip to content

Commit 225f75a

Browse files
committed
updates
1 parent d70291a commit 225f75a

File tree

2 files changed

+29
-7
lines changed

2 files changed

+29
-7
lines changed

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ namespace mlir {
5252

5353
using namespace mlir;
5454

55-
/// Query function for static subgroup size lookup for given chipset.
55+
/// Returns the statically known subgroup size for the given chipset.
5656
// TODO: move this function to a common place.
5757
static int64_t querySubgroupSize(const amdgpu::Chipset &chipset) {
5858
// The subgroup size is the same as the wavefront size for all chipsets.
@@ -242,15 +242,34 @@ struct GPUSubgroupIdOpToROCDL final
242242
LogicalResult
243243
matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
244244
ConversionPatternRewriter &rewriter) const override {
245+
// Calculation of the thread's subgroup identifier.
246+
//
247+
// The process involves mapping the thread's 3D identifier within its
248+
// workgroup/block (w_id.x, w_id.y, w_id.z) to a 1D linear index.
249+
// This linearization assumes a layout where the x-dimension (w_dim.x)
250+
// varies most rapidly (i.e., it is the innermost dimension).
251+
//
252+
// The formula for the linearized thread index is:
253+
// L = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
254+
//
255+
// Subsequently, the range of linearized indices [0, N_threads-1] is
256+
// divided into consecutive, non-overlapping segments, each representing
257+
// a subgroup of size 'subgroup_size'.
258+
//
259+
// Example Partitioning (N = subgroup_size):
260+
// | Subgroup 0 | Subgroup 1 | Subgroup 2 | ... |
261+
// | Indices 0..N-1 | Indices N..2N-1 | Indices 2N..3N-1| ... |
262+
//
263+
// The subgroup identifier is obtained via integer division of the
264+
// linearized thread index by the predefined 'subgroup_size'.
265+
//
266+
// subgroup_id = floor( L / subgroup_size )
267+
// = (w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) /
268+
// subgroup_size
245269
auto int32Type = IntegerType::get(rewriter.getContext(), 32);
246-
auto loc = op.getLoc();
270+
Location loc = op.getLoc();
247271
LLVM::IntegerOverflowFlags flags =
248272
LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
249-
// linearized thread ids are divided into consecutive subgroups.
250-
// Where thread id is calculated as:
251-
// thread_id = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
252-
// And the subgroup id of the thread is calculated as:
253-
// subgroup_id = thread_id / subgroup_size
254273
Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
255274
Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
256275
Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
22
// RUN: mlir-opt %s -convert-gpu-to-rocdl='allowed-dialects=func,arith,math' -split-input-file | FileCheck %s
33
// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
4+
// RUN: mlir-opt %s -convert-gpu-to-rocdl='chipset=gfx1010' -split-input-file | FileCheck --check-prefix=CHECKGFX10 %s
45

56
// CHECK-LABEL: @test_module
67
// CHECK-SAME: llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
@@ -756,6 +757,8 @@ gpu.module @test_module {
756757
// CHECK: %[[int7:.*]] = llvm.mul %[[dimx]], %[[int6]] overflow<nsw, nuw> : i32
757758
// CHECK: %[[int8:.*]] = llvm.add %[[widx]], %[[int7]] overflow<nsw, nuw> : i32
758759
// CHECK: %[[ssize:.*]] = llvm.mlir.constant(64 : i32) : i32
760+
// GFX10 and above have a default subgroup size of 32.
761+
// CHECKGFX10: = llvm.mlir.constant(32 : i32) : i32
759762
// CHECK: = llvm.sdiv %[[int8]], %[[ssize]] : i32
760763
// CHECK: = llvm.sext %10 : i32 to i64
761764
%subgroupId = gpu.subgroup_id : index

0 commit comments

Comments
 (0)