@@ -52,6 +52,26 @@ namespace mlir {
 
 using namespace mlir;
 
+/// Queries the static subgroup size for the given chipset.
+// TODO: Move this function to a common place.
+static int64_t querySubgroupSize(const amdgpu::Chipset &chipset) {
+  // The subgroup size is the same as the wavefront size for all chipsets.
+  // The wavefront size is 64 for GCN and 32 for RDNA.
+
+  // There are two ways we can know the subgroup size:
+  // 1. The caller passes the subgroup size down as part of the configuration.
+  // 2. Lower the subgroup size to the LLVM intrinsic
+  //    `Intrinsic::amdgcn_wavefrontsize`, which is then folded into a
+  //    constant according to the subtarget info.
+
+  // TODO: Prefer method 1 if the caller has provided a subgroup size,
+  // otherwise use method 2. For now, statically query the subgroup size
+  // according to the chipset.
+  if (chipset.majorVersion >= 10)
+    return 32;
+  return 64;
+}
+
 /// Returns true if the given `gpu.func` can be safely called using the bare
 /// pointer calling convention.
 static bool canBeCalledWithBarePointers(gpu::GPUFuncOp func) {
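For reference, a minimal sketch of what this query yields for representative chipsets. `Chipset::parse` and `majorVersion` are the existing AMDGPU dialect utilities; the specific gfx targets are illustrative assumptions, not part of this patch:

#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"
#include <cassert>

// Sketch only: gfx908 (CDNA1, majorVersion 9) is a GCN-derived wave64 target,
// while gfx1100 (RDNA3, majorVersion 11) defaults to wave32.
static void sketchSubgroupSizes() {
  auto cdna = mlir::amdgpu::Chipset::parse("gfx908");
  auto rdna = mlir::amdgpu::Chipset::parse("gfx1100");
  assert(mlir::succeeded(cdna) && querySubgroupSize(*cdna) == 64);
  assert(mlir::succeeded(rdna) && querySubgroupSize(*rdna) == 32);
}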
@@ -90,7 +110,7 @@ static Value truncOrExtToLLVMType(ConversionPatternRewriter &rewriter,
   int64_t indexBitwidth = converter.getIndexTypeBitwidth();
   auto indexBitwidthType =
       IntegerType::get(rewriter.getContext(), converter.getIndexTypeBitwidth());
-  // TODO: use <=> in C++20
+  // TODO: use <=> in C++20.
   if (indexBitwidth > intWidth) {
     return rewriter.create<LLVM::SExtOp>(loc, indexBitwidthType, value);
   }
@@ -203,13 +223,21 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
 
 struct GPUSubgroupIdOpToROCDL final
     : ConvertOpToLLVMPattern<gpu::SubgroupIdOp> {
-  using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern;
 
   GPUSubgroupIdOpToROCDL(const LLVMTypeConverter &converter,
-                         const mlir::amdgpu::Chipset &chipset)
-      : ConvertOpToLLVMPattern(converter), chipset(chipset) {}
+                         const mlir::amdgpu::Chipset &chipset,
+                         std::optional<int64_t> subgroupSize = std::nullopt)
+      : ConvertOpToLLVMPattern(converter), chipset(chipset),
+        subgroupSize(subgroupSize) {}
 
   const mlir::amdgpu::Chipset chipset;
+  const std::optional<int64_t> subgroupSize;
+
+  int64_t getSubgroupSize() const {
+    if (subgroupSize)
+      return *subgroupSize;
+    return querySubgroupSize(chipset);
+  }
 
   LogicalResult
   matchAndRewrite(gpu::SubgroupIdOp op, gpu::SubgroupIdOp::Adaptor adaptor,
@@ -218,7 +246,11 @@ struct GPUSubgroupIdOpToROCDL final
     auto loc = op.getLoc();
     LLVM::IntegerOverflowFlags flags =
        LLVM::IntegerOverflowFlags::nsw | LLVM::IntegerOverflowFlags::nuw;
-    // w_id.x + w_dim.x * (w_id.y + w_dim.y * w_id.z)) / subgroup_size
+    // Linearized thread IDs are divided into consecutive subgroups. The
+    // thread ID is calculated as:
+    //   thread_id = w_id.x + w_dim.x * (w_id.y + (w_dim.y * w_id.z))
+    // and the subgroup ID of the thread as:
+    //   subgroup_id = thread_id / subgroup_size
     Value workitemIdX = rewriter.create<ROCDL::ThreadIdXOp>(loc, int32Type);
     Value workitemIdY = rewriter.create<ROCDL::ThreadIdYOp>(loc, int32Type);
     Value workitemIdZ = rewriter.create<ROCDL::ThreadIdZOp>(loc, int32Type);
@@ -233,8 +265,9 @@ struct GPUSubgroupIdOpToROCDL final
     Value workitemIdXPlusDimYxIdZPlusIdYTimesDimX =
         rewriter.create<LLVM::AddOp>(loc, int32Type, workitemIdX,
                                      dimYxIdZPlusIdYTimesDimX, flags);
-    Value subgroupSize = rewriter.create<LLVM::ConstantOp>(
-        loc, IntegerType::get(rewriter.getContext(), 32), 64);
+
+    Value subgroupSize =
+        rewriter.create<LLVM::ConstantOp>(loc, int32Type, getSubgroupSize());
     Value waveIdOp = rewriter.create<LLVM::SDivOp>(
         loc, workitemIdXPlusDimYxIdZPlusIdYTimesDimX, subgroupSize);
 
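To sanity-check the linearization in the comment above, a worked example with made-up values:

#include <cassert>
#include <cstdint>

// Hedged worked example of the emitted arithmetic; all values are made up:
// a wave64 target, workgroup dims (64, 2, 1), thread coordinates (5, 1, 0).
static void sketchSubgroupIdArithmetic() {
  int64_t wDimX = 64, wDimY = 2;
  int64_t wIdX = 5, wIdY = 1, wIdZ = 0;
  int64_t threadId = wIdX + wDimX * (wIdY + wDimY * wIdZ); // 5 + 64 * 1 = 69
  assert(threadId / 64 == 1); // threads 64..127 form the second subgroup
}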
@@ -361,8 +394,10 @@ struct LowerGpuOpsToROCDLOpsPass final
 
     populateAMDGPUToROCDLConversionPatterns(converter, llvmPatterns,
                                             *maybeChipset);
-    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime,
-                                         *maybeChipset);
+    populateGpuToROCDLConversionPatterns(
+        converter, llvmPatterns, runtime, *maybeChipset,
+        subgroupSize == 0 ? std::nullopt
+                          : std::optional<int64_t>(subgroupSize));
     configureGpuToROCDLConversionLegality(target);
     if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
       signalPassFailure();
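The ternary above encodes a sentinel convention: a `subgroupSize` pass option of 0 means "unset" and maps to `std::nullopt`, so the pattern falls back to the static chipset query. A minimal sketch of that mapping, assuming the same convention:

#include <cstdint>
#include <optional>

// Hedged sketch of the sentinel mapping used in the pass above.
std::optional<int64_t> toOptionalSubgroupSize(int64_t subgroupSize) {
  return subgroupSize == 0 ? std::nullopt
                           : std::optional<int64_t>(subgroupSize);
}
// toOptionalSubgroupSize(0)  -> std::nullopt: query statically by chipset.
// toOptionalSubgroupSize(32) -> 32: the caller's override takes precedence.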
@@ -410,7 +445,8 @@ void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
 
 void mlir::populateGpuToROCDLConversionPatterns(
     const LLVMTypeConverter &converter, RewritePatternSet &patterns,
-    mlir::gpu::amd::Runtime runtime, mlir::amdgpu::Chipset chipset) {
+    mlir::gpu::amd::Runtime runtime, mlir::amdgpu::Chipset chipset,
+    std::optional<int64_t> subgroupSize) {
   using gpu::index_lowering::IndexKind;
   using gpu::index_lowering::IntrType;
   using mlir::gpu::amd::Runtime;
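For callers outside the pass, a hedged sketch of wiring the extended entry point directly. The header paths, the free-function wrapper, and the use of `Runtime::HIP` with an explicit size of 32 are illustrative assumptions:

#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Dialect/AMDGPU/Utils/Chipset.h"

// Hedged sketch: registering the patterns with an explicit subgroup size.
// Passing std::nullopt instead would keep the static chipset-based query.
static void addROCDLPatterns(const mlir::LLVMTypeConverter &converter,
                             mlir::RewritePatternSet &patterns,
                             mlir::amdgpu::Chipset chipset) {
  mlir::populateGpuToROCDLConversionPatterns(
      converter, patterns, mlir::gpu::amd::Runtime::HIP, chipset,
      /*subgroupSize=*/32);
}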
@@ -449,7 +485,7 @@ void mlir::populateGpuToROCDLConversionPatterns(
   patterns.add<GPUDynamicSharedMemoryOpLowering>(converter);
 
   patterns.add<GPUShuffleOpLowering, GPULaneIdOpToROCDL>(converter);
-  patterns.add<GPUSubgroupIdOpToROCDL>(converter, chipset);
+  patterns.add<GPUSubgroupIdOpToROCDL>(converter, chipset, subgroupSize);
   populateMathToROCDLConversionPatterns(converter, patterns);
 }
 