#include "mlir/Pass/Pass.h"
#include "mlir/Support/LogicalResult.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "mlir/Transforms/WalkPatternRewriteDriver.h"
+#include "llvm/Support/MathExtras.h"

namespace mlir::amdgpu {
#define GEN_PASS_DEF_AMDGPUTRANSFERREADTOLOADPASS
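Note on the include swap above: WalkPatternRewriteDriver.h is dropped, while llvm/Support/MathExtras.h appears to be pulled in for the llvm::divideCeil call added in a later hunk. A minimal standalone sketch (my own, linking only against LLVMSupport) of the values that call produces for common element widths:

#include "llvm/Support/MathExtras.h"
#include <cassert>

int main() {
  // Elements that fit in one 32-bit word for common element bit widths.
  assert(llvm::divideCeil(32, 8) == 4);   // i8:  4 elements per word
  assert(llvm::divideCeil(32, 16) == 2);  // f16: 2 elements per word
  assert(llvm::divideCeil(32, 32) == 1);  // f32: 1 element per word
  // Same results as the old expression it replaces:
  //   elementBitWidth < 32 ? 32 / elementBitWidth : 1
  return 0;
}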
@@ -76,6 +76,9 @@ static LogicalResult transferPreconditions(
  if (!memRefType.isLastDimUnitStride())
    return rewriter.notifyMatchFailure(xferOp, "!= 1 stride needs VectorToSCF");

+  if (memRefType.getElementTypeBitWidth() < 8)
+    return rewriter.notifyMatchFailure(xferOp, "unsupported sub-byte type");
+
  // If there is broadcasting involved then we first load the unbroadcasted
  // vector, and then broadcast it with `vector.broadcast`.
  ArrayRef<int64_t> vectorShape = xferOp.getVectorType().getShape();
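A note on the new sub-byte precondition (my reading; the patch itself does not state the rationale): the masked-load path converts an element delta to bytes with integer division by 8, which collapses to zero for element types narrower than a byte, so the pattern now bails out on those. A small arithmetic sketch:

#include <cassert>

int main() {
  // deltaBytes in the pattern is computed as delta * (elementBitWidth / 8).
  unsigned subByteWidth = 4;          // e.g. an i4 element type
  assert(subByteWidth / 8 == 0);      // the byte factor collapses to zero
  unsigned byteWidth = 16;            // f16 and wider stay meaningful
  assert(byteWidth / 8 == 2);
  return 0;
}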
@@ -127,14 +130,17 @@ static Value createVectorLoadForMaskedLoad(OpBuilder &builder, Location loc,
  return res;
}

+static constexpr char kTransferReadNeedsMask[] =
+    "amdgpu.buffer_transfer_read_needs_mask";
+
namespace {

struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
  using OpRewritePattern::OpRewritePattern;

  LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
                                PatternRewriter &rewriter) const override {
-    if (readOp->hasAttr("amdgpu.buffer_transfer_read_needs_mask"))
+    if (readOp->hasAttr(kTransferReadNeedsMask))
      return failure();

    bool requiresBroadcasting = false;
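The attribute string is now shared through kTransferReadNeedsMask between this early-exit check and the setAttr call in the fallback branch of a later hunk. Below is a condensed sketch of that marker-attribute idiom, my own reduction under the assumption of a standard MLIR build; the real pattern wraps the tagged clone in an scf.if rather than replacing the op directly:

#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/PatternMatch.h"

namespace {
// Mirrors kTransferReadNeedsMask from the patch.
constexpr char kNeedsMask[] = "amdgpu.buffer_transfer_read_needs_mask";

struct MarkerSketch final
    : mlir::OpRewritePattern<mlir::vector::TransferReadOp> {
  using OpRewritePattern::OpRewritePattern;

  mlir::LogicalResult
  matchAndRewrite(mlir::vector::TransferReadOp readOp,
                  mlir::PatternRewriter &rewriter) const override {
    // Skip reads this pattern already produced, so the greedy driver does
    // not keep rewriting its own output.
    if (readOp->hasAttr(kNeedsMask))
      return mlir::failure();

    // Clone the read for the guarded path and tag the clone.
    mlir::Operation *clone = rewriter.clone(*readOp.getOperation());
    clone->setAttr(kNeedsMask, rewriter.getUnitAttr());
    rewriter.replaceOp(readOp, clone->getResults());
    return mlir::success();
  }
};
} // namespace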
@@ -154,71 +160,96 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {

    auto stridedMetadata =
        rewriter.create<memref::ExtractStridedMetadataOp>(loc, src);
+    SmallVector<OpFoldResult> strides =
+        stridedMetadata.getConstifiedMixedStrides();
+    SmallVector<OpFoldResult> sizes =
+        stridedMetadata.getConstifiedMixedSizes();
+    OpFoldResult offset =
+        stridedMetadata.getConstifiedMixedOffset();
    OpFoldResult linearizedIndices;
    std::tie(std::ignore, linearizedIndices) =
-        memref::getLinearizedMemRefOffsetAndSize(
-            rewriter, loc, elementBitWidth, elementBitWidth,
-            stridedMetadata.getConstifiedMixedOffset(),
-            stridedMetadata.getConstifiedMixedSizes(),
-            stridedMetadata.getConstifiedMixedStrides(), indices);
-    Value linearIndex =
-        getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices);
+        memref::getLinearizedMemRefOffsetAndSize(rewriter, loc, elementBitWidth,
+                                                 elementBitWidth, offset, sizes,
+                                                 strides, indices);

    // TODO(jerryyin): Fix the getLinearizedMemRefOffsetAndSize() function
    // Note below doesn't give the correct result for the linearized size.
    // Value totalSize = getValueOrCreateConstantIndexOp(
    //     rewriter, loc, linearizedInfo.linearizedSize);
-    // It compute the mutiplied sizes of all dimensions instead of taking
+    // It computes the multiplied sizes of all dimensions instead of taking
    // the maximum of each dimension size * stride.
    SmallVector<AffineExpr> productExpressions;
    SmallVector<Value> productResults;
    unsigned sourceRank = cast<ShapedType>(src.getType()).getRank();

    SmallVector<AffineExpr> symbols(2 * sourceRank);
-    SmallVector<Value> offsetValues(2 * sourceRank);
+    SmallVector<Value> offsetValues;
    bindSymbolsList(rewriter.getContext(), MutableArrayRef{symbols});
+
+    size_t symbolIndex = 0;
    for (size_t i = 0; i < sourceRank; ++i) {
-      unsigned offsetIdx = 2 * i;
-      productExpressions.push_back(symbols[offsetIdx] * symbols[offsetIdx + 1]);
-      offsetValues[offsetIdx] = stridedMetadata.getStrides()[i];
-      offsetValues[offsetIdx + 1] = stridedMetadata.getSizes()[i];
+      AffineExpr strideExpr, sizeExpr;
+      OpFoldResult stride = strides[i];
+      OpFoldResult size = sizes[i];
+      if (auto constantStride =
+              getConstantIntValue(stride)) {
+        strideExpr = rewriter.getAffineConstantExpr(*constantStride);
+      } else {
+        strideExpr = symbols[symbolIndex++];
+        offsetValues.push_back(getValueOrCreateConstantIndexOp(
+            rewriter, loc, stride));
+      }
+
+      if (auto constantSize =
+              getConstantIntValue(size)) {
+        sizeExpr = rewriter.getAffineConstantExpr(*constantSize);
+      } else {
+        sizeExpr = symbols[symbolIndex++];
+        offsetValues.push_back(getValueOrCreateConstantIndexOp(
+            rewriter, loc, size));
+      }
+
+      productExpressions.push_back(strideExpr * sizeExpr);
    }

    AffineMap maxMap = AffineMap::get(
-        /*dimCount=*/0, /*symbolCount=*/symbols.size(), productExpressions,
+        /*dimCount=*/0, /*symbolCount=*/symbolIndex, productExpressions,
        rewriter.getContext());
    Value totalSize =
        rewriter.create<affine::AffineMaxOp>(loc, maxMap, offsetValues);

    // delta = bufferSize - linearizedOffset
    Value vectorSizeOffset =
        rewriter.create<arith::ConstantIndexOp>(loc, vectorSize);
+    Value linearIndex =
+        getValueOrCreateConstantIndexOp(rewriter, loc, linearizedIndices);
    Value delta = rewriter.create<arith::SubIOp>(loc, totalSize, linearIndex);

    // 1) check if delta < vectorSize
    Value isOutofBounds = rewriter.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::ule, delta, vectorSizeOffset);
+        loc, arith::CmpIPredicate::ult, delta, vectorSizeOffset);

    // 2) check if (detla_bytes % (32 / elementBitwidth) != 0)
    Value deltaBytes = rewriter.create<arith::MulIOp>(
        loc, delta,
        rewriter.create<arith::ConstantIndexOp>(loc, elementBitWidth / 8));
    Value elementsPerWord = rewriter.create<arith::ConstantIndexOp>(
-        loc, elementBitWidth < 32 ? 32 / elementBitWidth : 1);
+        loc, llvm::divideCeil(32, elementBitWidth));
    Value isNotWordAligned = rewriter.create<arith::CmpIOp>(
        loc, arith::CmpIPredicate::ne,
        rewriter.create<arith::RemUIOp>(loc, deltaBytes, elementsPerWord),
        rewriter.create<arith::ConstantIndexOp>(loc, 0));

    // We take the fallback of transfer_read default lowering only it is both
-    // out-of-bounds and not word aligned.
+    // out-of-bounds and not word aligned. The fallback ensures correct results
+    // when loading at the boundary of the buffer since buffer load returns
+    // inconsistent zeros for the whole word when boundary is crossed.
    Value ifCondition =
        rewriter.create<arith::AndIOp>(loc, isOutofBounds, isNotWordAligned);

    auto thenBuilder = [&](OpBuilder &builder, Location loc) {
      Operation *read = builder.clone(*readOp.getOperation());
-      read->setAttr("amdgpu.buffer_transfer_read_needs_mask",
-                    builder.getUnitAttr());
+      read->setAttr(kTransferReadNeedsMask, builder.getUnitAttr());
      Value readResult = read->getResult(0);
      builder.create<scf::YieldOp>(loc, readResult);
    };
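On the symbolIndex bookkeeping above: strides and sizes that fold to compile-time constants now become affine constant expressions, so only the dynamic ones consume a symbol and an affine.max operand. A standalone sketch (mine, assuming the core MLIR IR libraries) of the map this yields for one dimension with a constant stride and a dynamic size:

#include "mlir/IR/AffineExpr.h"
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"

int main() {
  mlir::MLIRContext ctx;

  // One dimension with a constant stride (128) and a dynamic size.
  unsigned symbolIndex = 0;
  mlir::AffineExpr strideExpr = mlir::getAffineConstantExpr(128, &ctx);
  mlir::AffineExpr sizeExpr = mlir::getAffineSymbolExpr(symbolIndex++, &ctx);

  // The per-dimension contribution to the affine.max map is stride * size;
  // folding the constant leaves one symbol and thus one map operand.
  mlir::AffineMap maxMap = mlir::AffineMap::get(
      /*dimCount=*/0, /*symbolCount=*/symbolIndex, strideExpr * sizeExpr);
  maxMap.dump(); // prints something like ()[s0] -> (s0 * 128)
  return 0;
}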
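On the predicate change from ule to ult in the delta check: as I read it, when exactly vectorSize elements remain before the buffer end the load still fits, so that case no longer counts as out of bounds (the fallback additionally requires the misalignment condition). In plain arithmetic, with hypothetical numbers:

#include <cassert>

int main() {
  long vectorSize = 4;   // elements the transfer_read loads
  long delta = 4;        // elements left until the end of the buffer
  bool outOfBoundsOld = (delta <= vectorSize);  // ule: exact fit flagged out of bounds
  bool outOfBoundsNew = (delta < vectorSize);   // ult: exact fit stays in bounds
  assert(outOfBoundsOld && !outOfBoundsNew);
  return 0;
}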
@@ -243,7 +274,6 @@ struct TransferReadLowering final : OpRewritePattern<vector::TransferReadOp> {
void mlir::amdgpu::populateAmdgpuTransferReadToLoadPatterns(
    RewritePatternSet &patterns) {
  patterns.add<TransferReadLowering>(patterns.getContext());
-  vector::populateVectorTransferLoweringPatterns(patterns);
}

struct AmdgpuTransferReadToLoadPass final
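Finally, populateAmdgpuTransferReadToLoadPatterns() no longer adds the generic vector transfer lowering patterns. A caller that relied on that side effect would now, under that assumption, register both sets itself; a hedged usage sketch (the header paths are my guess at where these functions are declared):

#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
#include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
#include "mlir/IR/PatternMatch.h"

void collectPatterns(mlir::RewritePatternSet &patterns) {
  mlir::amdgpu::populateAmdgpuTransferReadToLoadPatterns(patterns);
  // No longer added by the call above; register explicitly if still wanted.
  mlir::vector::populateVectorTransferLoweringPatterns(patterns);
}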