Skip to content

Commit 4610b01

Browse files
committed
Addressing review feedback
1 parent 7f6d6ef commit 4610b01

File tree

2 files changed

+33
-32
lines changed

2 files changed

+33
-32
lines changed

mlir/lib/Conversion/VectorToAMDGPU/VectorToAMDGPU.cpp

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
#include "mlir/IR/TypeUtilities.h"
1616
#include "mlir/Pass/Pass.h"
1717
#include "mlir/Support/LogicalResult.h"
18-
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
18+
#include "mlir/Transforms/WalkPatternRewriteDriver.h"
1919

2020
namespace mlir {
2121
#define GEN_PASS_DEF_CONVERTVECTORTOAMDGPUPASS
@@ -36,17 +36,16 @@ using namespace mlir;
3636
/// - The permutation map doesn't perform permutation (broadcasting is allowed).
3737
/// Note: those conditions mostly come from TransferReadToVectorLoadLowering
3838
/// pass.
39-
static LogicalResult
40-
transferPreconditions(PatternRewriter &rewriter,
41-
VectorTransferOpInterface xferOp,
42-
SmallVector<unsigned> &broadcastedDims,
43-
VectorType &unbroadcastedVectorType) {
39+
static LogicalResult transferPreconditions(
40+
PatternRewriter &rewriter, VectorTransferOpInterface xferOp,
41+
bool &requiresBroadcasting, VectorType &unbroadcastedVectorType) {
4442
if (!xferOp.getMask())
4543
return rewriter.notifyMatchFailure(xferOp, "Only support masked transfer");
4644

4745
// Permutations are handled by VectorToSCF or
4846
// populateVectorTransferPermutationMapLoweringPatterns.
4947
// We let the 0-d corner case pass-through as it is supported.
48+
SmallVector<unsigned> broadcastedDims;
5049
if (!xferOp.getPermutationMap().isMinorIdentityWithBroadcasting(
5150
&broadcastedDims))
5251
return rewriter.notifyMatchFailure(xferOp, "not minor identity + bcast");
@@ -56,9 +55,8 @@ transferPreconditions(PatternRewriter &rewriter,
5655
return rewriter.notifyMatchFailure(xferOp, "not a memref source");
5756

5857
Attribute addrSpace = memRefType.getMemorySpace();
59-
if (!addrSpace ||
60-
llvm::dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace).getValue() !=
61-
amdgpu::AddressSpace::FatRawBuffer)
58+
if (!addrSpace || dyn_cast<amdgpu::AddressSpaceAttr>(addrSpace).getValue() !=
59+
amdgpu::AddressSpace::FatRawBuffer)
6260
return rewriter.notifyMatchFailure(xferOp, "not in buffer address space");
6361

6462
// Non-unit strides are handled by VectorToSCF.
@@ -73,6 +71,7 @@ transferPreconditions(PatternRewriter &rewriter,
7371
unbroadcastedVectorShape[i] = 1;
7472
unbroadcastedVectorType = xferOp.getVectorType().cloneWith(
7573
unbroadcastedVectorShape, xferOp.getVectorType().getElementType());
74+
requiresBroadcasting = !broadcastedDims.empty();
7675

7776
// `vector.load` supports vector types as memref's elements only when the
7877
// resulting vector type is the same as the element type.
@@ -98,31 +97,31 @@ transferPreconditions(PatternRewriter &rewriter,
9897
return success();
9998
}
10099

101-
struct TransferReadLowering : public OpRewritePattern<vector::TransferReadOp> {
102-
using OpRewritePattern<vector::TransferReadOp>::OpRewritePattern;
100+
struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
101+
using OpRewritePattern::OpRewritePattern;
103102

104103
LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
105104
PatternRewriter &rewriter) const override {
106105

107-
SmallVector<unsigned> broadcastedDims;
106+
bool requiresBroadcasting = false;
108107
VectorType unbroadcastedVectorType;
109-
if (failed(transferPreconditions(rewriter, readOp, broadcastedDims,
108+
if (failed(transferPreconditions(rewriter, readOp, requiresBroadcasting,
110109
unbroadcastedVectorType))) {
111110
return failure();
112111
}
113112

114-
Value fill = rewriter.create<vector::SplatOp>(
115-
readOp.getLoc(), unbroadcastedVectorType, readOp.getPadding());
113+
Location loc = readOp.getLoc();
114+
Value fill = rewriter.create<vector::SplatOp>(loc, unbroadcastedVectorType,
115+
readOp.getPadding());
116116
Value load = rewriter.create<vector::LoadOp>(
117-
readOp.getLoc(), unbroadcastedVectorType, readOp.getSource(),
118-
readOp.getIndices());
119-
Value res = rewriter.create<arith::SelectOp>(
120-
readOp.getLoc(), unbroadcastedVectorType, readOp.getMask(), load, fill);
117+
loc, unbroadcastedVectorType, readOp.getSource(), readOp.getIndices());
118+
Value res = rewriter.create<arith::SelectOp>(loc, unbroadcastedVectorType,
119+
readOp.getMask(), load, fill);
121120

122121
// Insert a broadcasting op if required.
123-
if (!broadcastedDims.empty()) {
124-
res = rewriter.create<vector::BroadcastOp>(readOp.getLoc(),
125-
readOp.getVectorType(), res);
122+
if (requiresBroadcasting) {
123+
res = rewriter.create<vector::BroadcastOp>(loc, readOp.getVectorType(),
124+
res);
126125
}
127126

128127
rewriter.replaceOp(readOp, res);
@@ -136,12 +135,11 @@ void mlir::populateVectorToAMDGPUConversionPatterns(
136135
patterns.add<TransferReadLowering>(patterns.getContext());
137136
}
138137

139-
struct ConvertVectorToAMDGPUPass
138+
struct ConvertVectorToAMDGPUPass final
140139
: public impl::ConvertVectorToAMDGPUPassBase<ConvertVectorToAMDGPUPass> {
141140
void runOnOperation() override {
142141
RewritePatternSet patterns(&getContext());
143142
populateVectorToAMDGPUConversionPatterns(patterns);
144-
if (failed(applyPatternsGreedily(getOperation(), std::move(patterns))))
145-
return signalPassFailure();
143+
walkAndApplyPatterns(getOperation(), std::move(patterns));
146144
}
147145
};

mlir/test/Conversion/VectorToAMDGPU/vector-transfer-read-to-vector-load.mlir

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: mlir-opt %s -convert-vector-to-amdgpu --split-input-file | FileCheck %s
1+
// RUN: mlir-opt %s --convert-vector-to-amdgpu --split-input-file | FileCheck %s
22

33
// CHECK-LABEL: func @transfer_to_maskedload_fatrawbuffer(
44
// CHECK-SAME: %[[ARG0:.*]]: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>
@@ -9,9 +9,10 @@ func.func @transfer_to_maskedload_fatrawbuffer(%mem : memref<8x8xf32, #amdgpu.ad
99
%res = vector.transfer_read %mem[%idx, %idx], %cf0, %mask {in_bounds = [true]} : memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
1010
return %res : vector<4xf32>
1111
}
12-
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
12+
// CHECK: %[[CST:.*]] = arith.constant 0.0
13+
// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
1314
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
14-
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
15+
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
1516
// CHECK: return %[[SELECT]] : vector<4xf32>
1617

1718
// -----
@@ -43,9 +44,10 @@ func.func @transfer_broadcasting(%mem : memref<8x8xf32, #amdgpu.address_space<fa
4344
: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<4xf32>
4445
return %res : vector<4xf32>
4546
}
46-
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
47+
// CHECK: %[[CST:.*]] = arith.constant 0.0
48+
// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
4749
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
48-
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
50+
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
4951
// CHECK: %[[BROADCAST:.*]] = vector.broadcast %[[SELECT]] : vector<1xf32> to vector<4xf32>
5052
// CHECK: return %[[BROADCAST]] : vector<4xf32>
5153

@@ -62,7 +64,8 @@ func.func @transfer_scalar(%mem : memref<8x8xf32, #amdgpu.address_space<fat_raw_
6264
: memref<8x8xf32, #amdgpu.address_space<fat_raw_buffer>>, vector<1xf32>
6365
return %res : vector<1xf32>
6466
}
65-
// CHECK: %[[CST:.*]] = arith.constant dense<0.000000e+00>
67+
// CHECK: %[[CST:.*]] = arith.constant 0.0
68+
// CHECK: %[[SPLAT:.*]] = vector.splat %[[CST]]
6669
// CHECK: %[[LOAD:.*]] = vector.load %arg0[%arg1, %arg1]
67-
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[CST]]
70+
// CHECK: %[[SELECT:.*]] = arith.select %arg2, %[[LOAD]], %[[SPLAT]]
6871
// CHECK: return %[[SELECT]] : vector<1xf32>

0 commit comments

Comments (0)