[MLIR][ROCDL] Add conversion for gpu.subgroup_id to ROCDL

lialan · lialan · commit d75c2106ae48 · 2025-04-19T01:16:54.000-04:00
Creates `rocdl.wave_id` op with llvm conversion to:
`__builtin_amdgcn_s_get_waveid_in_workgroup`
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -204,6 +204,14 @@ def ROCDL_ReadlaneOp : ROCDL_IntrOp<"readlane", [], [0], [AllTypesMatch<["res",
    }];
 }
 
+// the intrinsic function name is too long so we use a shorter name for rocdl.
+def ROCDL_WaveIdOp :  LLVM_IntrOpBase<ROCDL_Dialect, "wave_id",
+                        "amdgcn_s_get_waveid_in_workgroup", [], [], [Pure], 1>,
+  Arguments<(ins)> {
+  let results = (outs LLVM_Type:$res);
+  let assemblyFormat = "attr-dict `:` type($res)";
+}
+
 //===----------------------------------------------------------------------===//
 // Thread index and Block index
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -80,6 +80,24 @@ static constexpr StringLiteral amdgcnDataLayout =
     "64-S32-A5-G1-ni:7:8:9";
 
 namespace {
+
+// Truncate or extend the result depending on the index bitwidth specified
+// by the LLVMTypeConverter options.
+static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
+                                  Location loc, Value value,
+                                  const LLVMTypeConverter *converter) {
+  auto intWidth = cast<IntegerType>(value.getType()).getWidth();
+  auto indexBitwidth = converter->getIndexTypeBitwidth();
+  if (indexBitwidth > intWidth) {
+    return rewriter.create<LLVM::SExtOp>(
+        loc, IntegerType::get(rewriter.getContext(), indexBitwidth), value);
+  } else if (indexBitwidth < intWidth) {
+    return rewriter.create<LLVM::TruncOp>(
+        loc, IntegerType::get(rewriter.getContext(), indexBitwidth), value);
+  }
+  return value;
+}
+
 struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
   using ConvertOpToLLVMPattern<gpu::LaneIdOp>::ConvertOpToLLVMPattern;
 
@@ -98,16 +116,7 @@ struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern<gpu::LaneIdOp> {
         rewriter.create<ROCDL::MbcntLoOp>(loc, intTy, ValueRange{minus1, zero});
     Value laneId = rewriter.create<ROCDL::MbcntHiOp>(
         loc, intTy, ValueRange{minus1, mbcntLo});
-    // Truncate or extend the result depending on the index bitwidth specified
-    // by the LLVMTypeConverter options.
-    const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth();
-    if (indexBitwidth > 32) {
-      laneId = rewriter.create<LLVM::SExtOp>(
-          loc, IntegerType::get(context, indexBitwidth), laneId);
-    } else if (indexBitwidth < 32) {
-      laneId = rewriter.create<LLVM::TruncOp>(
-          loc, IntegerType::get(context, indexBitwidth), laneId);
-    }
+    laneId = truncOrExtToLLVMType(rewriter, loc, laneId, getTypeConverter());
     rewriter.replaceOp(op, {laneId});
     return success();
   }
@@ -190,6 +199,21 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
   }
 };
 
+struct GPUSubgroupIdOpToROCDL : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
+  using ConvertOpToLLVMPattern<gpu::SubgroupIdOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto int32Type = IntegerType::get(rewriter.getContext(), 32);
+    Value waveIdOp = rewriter.create<ROCDL::WaveIdOp>(op.getLoc(), int32Type);
+    waveIdOp = truncOrExtToLLVMType(rewriter, op.getLoc(), waveIdOp,
+                                    getTypeConverter());
+    rewriter.replaceOp(op, {waveIdOp});
+    return success();
+  }
+};
+
 /// Import the GPU Ops to ROCDL Patterns.
 #include "GPUToROCDL.cpp.inc"
 
@@ -405,7 +429,9 @@ void mlir::populateGpuToROCDLConversionPatterns(
   // TODO: Add alignment for workgroup memory
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
-  patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
+  patterns
+      .add<GPUShuffleOpLowering, GPULaneIdOpToROCDL, GPUSubgroupIdOpToROCDL>(
+          converter);
 
   populateMathToROCDLConversionPatterns(converter, patterns);
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -11,7 +11,7 @@ gpu.module @test_module {
   func.func @gpu_index_ops()
       -> (index, index, index, index, index, index,
           index, index, index, index, index, index,
-          index) {
+          index, index) {
     // CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64
 
     // CHECK: rocdl.workitem.id.x : i32
@@ -59,12 +59,16 @@ gpu.module @test_module {
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
     %laneId = gpu.lane_id
 
+    // CHECK: = rocdl.wave_id : i32
+    // CHECK: = llvm.sext %{{.*}} : i32 to i64
+    %waveId = gpu.subgroup_id : index
+
     func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
                %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
-               %laneId
+               %laneId, %waveId
         : index, index, index, index, index, index,
           index, index, index, index, index, index,
-          index
+          index, index
   }
 }
 
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -88,6 +88,12 @@ llvm.func @rocdl.lane_id() -> i32 {
   llvm.return %3 : i32
 }
 
+llvm.func @rocdl.wave_id() -> i32 {
+  // CHECK: call i32 @llvm.amdgcn.s.get.waveid.in.workgroup()
+  %0 = rocdl.wave_id : i32
+  llvm.return %0 : i32
+}
+
 llvm.func @rocdl.swizzle(%src : i32) -> i32 {
   // CHECK-LABEL: rocdl.swizzle
   // CHECK: call i32 @llvm.amdgcn.ds.swizzle