fixup! [mlir][ArmSME] Remove ConvertIllegalShapeCastOpsToTransposes

banach-space · banach-space · commit d04d335a2ef4 · 2025-06-18T09:17:40.000+01:00
Add LowerColumnTransferReadToLoops. Note, this is to address Ben's comment here: https://github.com/llvm/llvm-project/pull/139706/files#r2088605443
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -867,6 +867,116 @@ struct LowerIllegalTransposeStoreViaZA
   }
 };
 
+/// Lower `vector.transfer_read` of a scalable column to `scf::for`
+///
+/// Lowers a "read" of a scalable column from a MemRef, for which there is no
+/// hardware operation that we could use, to a loop over the rows that loads
+/// one element at a time.
+///
+///  BEFORE:
+///  ```
+///  %res = vector.transfer_read %mem[%a, %b] (...)
+///    : memref<?x?xf32>, vector<[4]x1xf32>
+///  ```
+///
+///  AFTER:
+///  ```
+///    %cst = arith.constant (...) : vector<[4]xf32>
+///    %vscale = vector.vscale
+///    %c4_vscale = arith.muli %vscale, %c4 : index
+///    %scf = scf.for %iv = %c0 to %c4_vscale step %c1 iter_args(%acc = %cst)
+///      -> (vector<[4]xf32>) {
+///        %row = arith.addi %iv, %a : index
+///        %load = memref.load %mem[%row, %b] : memref<?x?xf32>
+///        %vec = vector.insert %load, %acc [%iv] : f32 into vector<[4]xf32>
+///        scf.yield %vec : vector<[4]xf32>
+///    }
+///    %res = vector.shape_cast %scf : vector<[4]xf32> to vector<[4]x1xf32>
+///  ```
+///
+///  TODO: This transformation isn't specific to SME - move it to the SVE
+///  dialect.
+///  TODO: Honour the padding value and the in_bounds attribute (generate
+///  vector.maskedload if required).
+struct LowerColumnTransferReadToLoops
+    : public OpRewritePattern<vector::TransferReadOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TransferReadOp readOp,
+                                PatternRewriter &rewriter) const override {
+    // NOTE: This is a fairly low-level transformation, so we shouldn't be
+    // adding support for Tensors without good rationale.
+    if (readOp.hasPureTensorSemantics())
+      return rewriter.notifyMatchFailure(
+          readOp, "Tensor semantics are unsupported (either bufferize or "
+                  "extend this pattern)");
+
+    auto resType = readOp.getVectorType();
+
+    if (resType.getRank() != 2)
+      return rewriter.notifyMatchFailure(readOp,
+                                         "Only 2D vectors are supported!");
+
+    if (resType.getShape()[1] != 1)
+      return rewriter.notifyMatchFailure(
+          readOp, "The trailing output dim is != 1 (not supported ATM)");
+
+    if (!resType.getScalableDims()[0] || resType.getScalableDims()[1])
+      return rewriter.notifyMatchFailure(
+          readOp, "Expected the leading dim to be scalable and the trailing "
+                  "dim to be fixed.");
+
+    // The loop below assumes an unmasked, identity-mapped read of a 2D MemRef.
+    if (readOp.getIndices().size() != 2 || readOp.getMask() ||
+        !readOp.getPermutationMap().isMinorIdentity())
+      return rewriter.notifyMatchFailure(
+          readOp, "Expected an unmasked, non-permuted read of a 2D MemRef.");
+
+    // Create new result type - similar to the original vector with the
+    // trailing unit dim collapsed.
+    int64_t numRows = resType.getShape()[0];
+    VectorType newResType = VectorType::get(numRows, resType.getElementType(),
+                                            /*scalableDims=*/{true});
+
+    // Create a loop over all rows and load one element at a time. The zero
+    // init value is built via getZeroAttr so that any element type works.
+    auto loc = readOp.getLoc();
+    auto lowerBound = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+    auto createVscaleMultiple =
+        vector::makeVscaleConstantBuilder(rewriter, loc);
+    auto upperBound = createVscaleMultiple(numRows);
+    auto step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+    Value init = rewriter.create<arith::ConstantOp>(
+        loc, newResType, rewriter.getZeroAttr(newResType));
+
+    scf::ForOp loadLoop;
+    {
+      OpBuilder::InsertionGuard g(rewriter);
+      loadLoop = rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, step,
+                                             ValueRange{init});
+      rewriter.setInsertionPointToStart(loadLoop.getBody());
+      auto tileSliceIndex = loadLoop.getInductionVar();
+
+      // Load the element at row `iv + row offset` and insert it at `iv`.
+      auto idx0 = rewriter.create<arith::AddIOp>(loc, tileSliceIndex,
+                                                 readOp.getIndices()[0]);
+      auto idx1 = readOp.getIndices()[1];
+      Value scalar = rewriter.create<memref::LoadOp>(
+          loc, readOp.getBase(), SmallVector<Value>({idx0, idx1}));
+      Operation *updateInit = rewriter.create<vector::InsertOp>(
+          loc, scalar, loadLoop.getRegionIterArg(0), tileSliceIndex);
+      rewriter.create<scf::YieldOp>(loc, updateInit->getResult(0));
+    }
+
+    // The read operation has been "legalized", but since the original result
+    // type was a 2D vector, we need to cast before returning the result. This
+    // ShapeCast should cancel-out with some other ShapeCast (i.e. a no-op).
+    rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(readOp, resType,
+                                                     loadLoop.getResult(0));
+    return success();
+  }
+};
+
 struct VectorLegalizationPass
     : public arm_sme::impl::VectorLegalizationBase<VectorLegalizationPass> {
   void runOnOperation() override {
@@ -888,9 +998,10 @@ struct VectorLegalizationPass
 
     // Apply preprocessing patterns.
     RewritePatternSet rewritePatterns(context);
-    rewritePatterns.add<FoldExtractFromVectorOfSMELikeCreateMasks,
-                        LiftIllegalVectorTransposeToMemory,
-                        LowerIllegalTransposeStoreViaZA>(context);
+    rewritePatterns
+        .add<FoldExtractFromVectorOfSMELikeCreateMasks,
+             LowerColumnTransferReadToLoops, LiftIllegalVectorTransposeToMemory,
+             LowerIllegalTransposeStoreViaZA>(context);
     if (failed(
             applyPatternsGreedily(getOperation(), std::move(rewritePatterns))))
       return signalPassFailure();
diff --git a/mlir/test/Dialect/ArmSME/vector-legalization.mlir b/mlir/test/Dialect/ArmSME/vector-legalization.mlir
@@ -611,3 +611,59 @@ func.func @vector_mask_without_maskable_op(%mask: vector<16x2xi1>, %vec: vector<
   %0 = vector.mask %mask { vector.yield %vec : vector<16x16xf32> } : vector<16x2xi1> -> vector<16x16xf32>
   return %0 : vector<16x16xf32>
 }
+
+// -----
+
+//=============================================================================
+// 1D examples - to be moved to the SVE dialect
+//=============================================================================
+
+/// TODO: Handle in_bounds and the padding value (the expected output below
+/// never uses %pad).
+// CHECK-LABEL:   func.func @xfer_read_scalable_column(
+// CHECK-SAME:      %[[IDX_0:[a-zA-Z0-9]+]]: index,
+// CHECK-SAME:      %[[IDX_1:[a-zA-Z0-9]+]]: index,
+// CHECK-SAME:      %[[PAD:.*]]: f32,
+// CHECK-SAME:      %[[SRC:.*]]: memref<?x?xf32>) -> vector<[4]x1xf32> {
+func.func @xfer_read_scalable_column(%a: index, %b: index, %pad: f32, %src: memref<?x?xf32>) -> (vector<[4]x1xf32>) {
+  // CHECK:           %[[INIT:.*]] = arith.constant dense<0.000000e+00> : vector<[4]xf32>
+  // CHECK:           %[[STEP:.*]] = arith.constant 1 : index
+  // CHECK:           %[[C4:.*]] = arith.constant 4 : index
+  // CHECK:           %[[LB:.*]] = arith.constant 0 : index
+  // CHECK:           %[[VSCALE:.*]] = vector.vscale
+  // CHECK:           %[[C4_VSCALE:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+
+  // <scf.for>
+  // CHECK:           %[[SCF:.*]] = scf.for %[[IND_VAR:.*]] = %[[LB]] to %[[C4_VSCALE]] step %[[STEP]] iter_args(%[[SCF_RES:.*]] = %[[INIT]]) -> (vector<[4]xf32>) {
+  // CHECK:             %[[IDX_0_UPDATED:.*]] = arith.addi %[[IND_VAR]], %[[IDX_0]] : index
+  // CHECK:             %[[VAL_10:.*]] = memref.load %[[SRC]][%[[IDX_0_UPDATED]], %[[IDX_1]]] : memref<?x?xf32>
+  // CHECK:             %[[RES_UPDATED:.*]] = vector.insert %[[VAL_10]], %[[SCF_RES]] [%[[IND_VAR]]] : f32 into vector<[4]xf32>
+  // CHECK:             scf.yield %[[RES_UPDATED]] : vector<[4]xf32>
+  // CHECK:           }
+
+  // <shape-cast>
+  // CHECK:           %[[SC:.*]] = vector.shape_cast %[[SCF]] : vector<[4]xf32> to vector<[4]x1xf32>
+  // CHECK:           return %[[SC]]
+  %read = vector.transfer_read %src[%a, %b], %pad : memref<?x?xf32>, vector<[4]x1xf32>
+  return %read : vector<[4]x1xf32>
+}
+
+// -----
+/// The trailing dim is 2 rather than 1, so the read must be left untouched.
+// CHECK-LABEL:   func.func @negative_xfer_read_scalable_column_x2
+func.func @negative_xfer_read_scalable_column_x2(%a: index, %b: index, %pad: f32, %src: memref<?x?xf32>) -> (vector<[4]x2xf32>) {
+  // CHECK-NOT: scf.for
+  // CHECK-NOT: memref.load
+  %read = vector.transfer_read %src[%a, %b], %pad : memref<?x?xf32>, vector<[4]x2xf32>
+  return %read : vector<[4]x2xf32>
+}
+
+// -----
+/// The trailing (not the leading) dim is scalable, so the read is left as is.
+// CHECK-LABEL:   func.func @negative_xfer_read_scalable_column_scalable_trailing_dim
+func.func @negative_xfer_read_scalable_column_scalable_trailing_dim(%a: index, %b: index, %pad: f32, %src: memref<?x?xf32>) -> (vector<4x[1]xf32>) {
+  // CHECK-NOT: scf.for
+  // CHECK-NOT: memref.load
+  %read = vector.transfer_read %src[%a, %b], %pad : memref<?x?xf32>, vector<4x[1]xf32>
+  return %read : vector<4x[1]xf32>
+}