Fixups

MacDue · MacDue · commit 0fc3fbd479ad · 2024-01-26T10:57:21.000Z
diff --git a/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td
@@ -131,6 +131,10 @@ def VectorLegalization
     than a single SME tile (e.g. `vector<[8]x[8]xf32>`) into multiple SME
     tile-sized operations, as well as rewrites needed to get operations into
     forms compatible with SME lowerings.
+
+    Note: Decomposition is currently limited to vector types that are an exact
+    multiple of SME tiles. That is, scalable in two dimensions, with both the
+    rows and columns divisible by the SVE vector length for the element type.
   }];
   let constructor = "mlir::arm_sme::createVectorLegalizationPass()";
   let dependentDialects = [
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp b/mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp
@@ -244,8 +244,9 @@ struct AssignTileIDsPattern
 
     // Set all operations dependent on `tileOp` to use the same tile ID.
     // This is a naive tile allocation scheme, but works for common cases. For
-    // example, as this only allocates tile IDs to existing ops, it can't solve
-    // cases like this (%tileA and %tileB come from different root operations):
+    // example, as this only allocates tile IDs to existing ops, it can't
+    // solve cases like this (%tileA and %tileB come from different root
+    // operations):
     //
     // %tile = scf.if %some_cond -> vector<[4]x[4]xi32> {
     //   scf.yield %tileA {tile_id = 0} : vector<[4]x[4]xi32>
@@ -254,9 +255,9 @@ struct AssignTileIDsPattern
     // }
     //
     // This case would require allocating a new tile for the result of the
-    // scf.if, and moving the contents of %tileA or %tileB to result tile (based
-    // on the %some_cond).
-    // Find all the ops that (transitively) depend on this tile.
+    // scf.if, and moving the contents of %tileA or %tileB to the result tile
+    // (based on the %some_cond).
+    // Find all the ops that (transitively) depend on this tile.
     SetVector<Operation *> dependantOps;
     findDependantOps(tileOp->getResult(0), dependantOps);
     auto tileIDAttr = rewriter.getI32IntegerAttr(*tileId);
diff --git a/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp b/mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp
@@ -5,6 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+//
+// This pass legalizes vector operations so they can be lowered to ArmSME.
+// Currently, this only implements the decomposition of vector operations that
+// use vector sizes larger than an SME tile, into multiple SME-sized operations.
+//
+// Note: In the context of this pass 'tile' always refers to an SME tile.
+//
+//===----------------------------------------------------------------------===//
 
 #include "mlir/Dialect/ArmSME/IR/ArmSME.h"
 #include "mlir/Dialect/ArmSME/Transforms/Passes.h"
@@ -35,35 +43,49 @@ static constexpr StringLiteral MATCH_FAILURE_UNSUPPORTED_MASK_OP(
 static constexpr StringLiteral
     MATCH_FAILURE_NON_PERMUTATION_MAP("op affine map is not a permutation");
 
-struct SMETile {
+/// An SMESubTile represents a single SME-sized sub-tile from decomposing a
+/// larger vector type. The (`row`, `col`) are the position of the sub-tile in
+/// the original vector type. For example, an [8]x[8] vector would have four
+/// [4]x[4] sub-tiles:
+///
+///           8 x vscale
+/// ┌─────────────┬─────────────┐
+/// │(0,0)        │(0,4)        │
+/// │             │             │
+/// ├─────────────┼─────────────┤ 8 x vscale
+/// │(4,0)        │(4,4)        │
+/// │             │             │
+/// └─────────────┴─────────────┘
+struct SMESubTile {
   // Note: The units of (row, col) are vscale (as SME tiles are scalable).
   int row{0};
   int col{0};
+  // The SME tile type.
   VectorType type;
 };
 
-/// Adds a constant scalable offset to `indices` (which are of equal length).
-/// For example, in the 2D case this would return:
+/// Adds a constant elementwise scalable offset to `indices` (which are of equal
+/// length). For example, in the 2D case this would return:
 // { indices[0] + offset[0] * vscale, indices[1] + offset[1] *  vscale }
 SmallVector<Value, 2> addConstantScalableOffset(OpBuilder &builder,
                                                 Location loc,
                                                 ValueRange indices,
-                                                ArrayRef<int> scalableOffset) {
+                                                ArrayRef<int> scalableOffsets) {
   auto vscale = builder.create<vector::VectorScaleOp>(loc);
   return llvm::map_to_vector(
-      llvm::zip_equal(indices, scalableOffset), [&](auto pair) -> Value {
+      llvm::zip_equal(indices, scalableOffsets), [&](auto pair) -> Value {
         auto [index, base] = pair;
         auto offset = builder.create<arith::MulIOp>(
             loc, builder.create<arith::ConstantIndexOp>(loc, base), vscale);
         return builder.create<arith::AddIOp>(loc, index, offset);
       });
 }
 
-/// Remaps `indices` (e.g. from a load/store) for a larger vector type to
-/// indices for one of the SME tiles it will decompose into.
+/// Adjusts `indices` (e.g. from a load/store) for a larger vector type to
+/// indices for one of the SME sub-tiles it will decompose into.
 ///
 /// For example, if you were to decompose an 8x8 load into four 4x4 tiles, the
-/// indices for each tile would need to be remapped as follows:
+/// indices for each tile would need to be adjusted as follows:
 ///
 /// initial indices = [a,b], inital size = 8x8, target size = 4x4
 /// ┌─────────────┬─────────────┐
@@ -73,11 +95,11 @@ SmallVector<Value, 2> addConstantScalableOffset(OpBuilder &builder,
 /// │[a+4,b]      │[a+4,b+4]    │
 /// │             │             │
 /// └─────────────┴─────────────┘
-SmallVector<Value, 2> remapIndicesForSMETile(OpBuilder &builder, Location loc,
-                                             ValueRange indices,
-                                             SMETile tileTile) {
+SmallVector<Value, 2> getSMESubTileIndices(OpBuilder &builder, Location loc,
+                                           ValueRange indices,
+                                           SMESubTile smeTile) {
   return addConstantScalableOffset(builder, loc, indices,
-                                   {tileTile.row, tileTile.col});
+                                   {smeTile.row, smeTile.col});
 }
 
 /// Returns true if `mask` is generated by an operation that can be decomposed
@@ -86,21 +108,21 @@ bool isSupportedMaskOp(Value mask) {
   return !mask || mask.getDefiningOp<vector::CreateMaskOp>();
 }
 
-/// Extracts a mask for an SME tile from the mask of a larger vector type.
+/// Extracts a mask for an SME sub-tile from the mask of a larger vector type.
 Value extractSMEMask(OpBuilder &builder, Location loc, Value mask,
-                     SMETile tileTile) {
+                     SMESubTile smeTile) {
   assert(isSupportedMaskOp(mask));
   if (!mask)
     return Value{};
   auto createMask = mask.getDefiningOp<vector::CreateMaskOp>();
   // The operands of `vector.create_mask` (from a 2D perspective) are the
   // coordinates where the mask ends. So we subtract where this tile starts,
-  // from the mask operands to get the parameters for this tile tile.
-  auto tileMaskDims = addConstantScalableOffset(
-      builder, loc, createMask.getOperands(), {-tileTile.row, -tileTile.col});
-  auto createTileMask = builder.create<vector::CreateMaskOp>(
-      loc, tileTile.type.clone(builder.getI1Type()), tileMaskDims);
-  return createTileMask.getResult();
+  // from the mask operands to get the parameters for this sub-tile.
+  auto smeTileMaskDims = addConstantScalableOffset(
+      builder, loc, createMask.getOperands(), {-smeTile.row, -smeTile.col});
+  auto smeTileCreateMask = builder.create<vector::CreateMaskOp>(
+      loc, smeTile.type.clone(builder.getI1Type()), smeTileMaskDims);
+  return smeTileCreateMask.getResult();
 }
 
 /// Constructs an iterator that returns each SME tile (with coordinates)
@@ -110,7 +132,8 @@ Value extractSMEMask(OpBuilder &builder, Location loc, Value mask,
 auto decomposeToSMETiles(OpBuilder &builder, VectorType type,
                          VectorType smeTileType,
                          bool transposeIndices = false) {
-  assert(isMultipleOfSMETileVectorType(type));
+  assert(isMultipleOfSMETileVectorType(type) &&
+         "`type` not multiple of SME tiles");
   return llvm::map_range(
       StaticTileOffsetRange(type.getShape(), {smeTileType.getDimSize(0),
                                               smeTileType.getDimSize(1)}),
@@ -119,14 +142,15 @@ auto decomposeToSMETiles(OpBuilder &builder, VectorType type,
         int col = int(indices[1]);
         if (transposeIndices)
           std::swap(row, col);
-        return SMETile{row, col, smeTileType};
+        return SMESubTile{row, col, smeTileType};
       });
 }
 
 /// Returns the number of SME tiles that fit into the (2D-scalable) vector type
 /// `type`.
 int getNumberOfSMETilesForVectorType(VectorType type) {
-  assert(isMultipleOfSMETileVectorType(type));
+  assert(isMultipleOfSMETileVectorType(type) &&
+         "`type` not multiple of SME tiles");
   int64_t vectorRows = type.getDimSize(0);
   int64_t vectorCols = type.getDimSize(1);
   auto elementType = type.getElementType();
@@ -162,25 +186,25 @@ struct LegalizeVectorOuterProductOpsByDecomposition
                                          MATCH_FAILURE_UNSUPPORTED_MASK_OP);
 
     ValueRange accSMETiles = adaptor.getAcc();
-    auto tileType = getSMETileTypeForElement(vectorType.getElementType());
-    VectorType sliceType = VectorType::Builder(tileType).dropDim(0);
+    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
+    VectorType sliceType = VectorType::Builder(smeTileType).dropDim(0);
 
     SmallVector<Value> resultSMETiles;
-    for (auto [index, tileTile] :
-         llvm::enumerate(decomposeToSMETiles(rewriter, vectorType, tileType))) {
+    for (auto [index, smeTile] : llvm::enumerate(
+             decomposeToSMETiles(rewriter, vectorType, smeTileType))) {
 
-      auto tileMask = extractSMEMask(rewriter, loc, mask, tileTile);
+      auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
       auto lhs = rewriter.create<vector::ScalableExtractOp>(
-          loc, sliceType, outerProductOp.getLhs(), tileTile.row);
+          loc, sliceType, outerProductOp.getLhs(), smeTile.row);
       auto rhs = rewriter.create<vector::ScalableExtractOp>(
-          loc, sliceType, outerProductOp.getRhs(), tileTile.col);
-      auto tileOuterProduct = rewriter.create<vector::OuterProductOp>(
-          loc, tileType, lhs, rhs,
+          loc, sliceType, outerProductOp.getRhs(), smeTile.col);
+      auto smeOuterProduct = rewriter.create<vector::OuterProductOp>(
+          loc, smeTileType, lhs, rhs,
           !accSMETiles.empty() ? accSMETiles[index] : Value{},
           outerProductOp.getKind());
 
       auto maskedOuterProduct =
-          vector::maskOperation(rewriter, tileOuterProduct, tileMask);
+          vector::maskOperation(rewriter, smeOuterProduct, smeMask);
       resultSMETiles.push_back(maskedOuterProduct->getResult(0));
     }
 
@@ -241,18 +265,18 @@ struct LegalizeTransferReadOpsByDecomposition
     bool transposed = !permutationMap.isIdentity();
 
     auto loc = readOp.getLoc();
-    auto tileType = getSMETileTypeForElement(vectorType.getElementType());
+    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
 
     SmallVector<Value> resultSMETiles;
-    for (SMETile tileTile :
-         decomposeToSMETiles(rewriter, vectorType, tileType, transposed)) {
-      auto tileMask = extractSMEMask(rewriter, loc, mask, tileTile);
-      auto transferRead = rewriter.create<vector::TransferReadOp>(
-          loc, tileType, readOp.getSource(),
-          remapIndicesForSMETile(rewriter, loc, readOp.getIndices(), tileTile),
-          readOp.getPermutationMapAttr(), readOp.getPadding(), tileMask,
+    for (SMESubTile smeTile :
+         decomposeToSMETiles(rewriter, vectorType, smeTileType, transposed)) {
+      auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
+      auto smeRead = rewriter.create<vector::TransferReadOp>(
+          loc, smeTileType, readOp.getSource(),
+          getSMESubTileIndices(rewriter, loc, readOp.getIndices(), smeTile),
+          readOp.getPermutationMapAttr(), readOp.getPadding(), smeMask,
           readOp.getInBoundsAttr());
-      resultSMETiles.push_back(transferRead);
+      resultSMETiles.push_back(smeRead);
     }
 
     rewriter.replaceOp(readOp, resultSMETiles, adaptor.getResultMapping());
@@ -289,19 +313,19 @@ struct LegalizeTransferWriteOpsByDecomposition
     bool transposed = !permutationMap.isIdentity();
 
     auto loc = writeOp.getLoc();
-    auto tileType = getSMETileTypeForElement(vectorType.getElementType());
+    auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
     auto inputSMETiles = adaptor.getVector();
 
     Value destTensorOrMemref = writeOp.getSource();
-    for (auto [index, tileTile] : llvm::enumerate(
-             decomposeToSMETiles(rewriter, vectorType, tileType, transposed))) {
-      auto tileMask = extractSMEMask(rewriter, loc, mask, tileTile);
-      auto tileWrite = rewriter.create<vector::TransferWriteOp>(
+    for (auto [index, smeTile] : llvm::enumerate(decomposeToSMETiles(
+             rewriter, vectorType, smeTileType, transposed))) {
+      auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
+      auto smeWrite = rewriter.create<vector::TransferWriteOp>(
           loc, inputSMETiles[index], destTensorOrMemref,
-          remapIndicesForSMETile(rewriter, loc, writeOp.getIndices(), tileTile),
-          writeOp.getPermutationMapAttr(), tileMask, writeOp.getInBoundsAttr());
+          getSMESubTileIndices(rewriter, loc, writeOp.getIndices(), smeTile),
+          writeOp.getPermutationMapAttr(), smeMask, writeOp.getInBoundsAttr());
       if (writeOp.hasPureTensorSemantics())
-        destTensorOrMemref = tileWrite.getResult();
+        destTensorOrMemref = smeWrite.getResult();
     }
 
     if (writeOp.hasPureTensorSemantics())
@@ -326,9 +350,10 @@ struct VectorLegalizationPass
            SmallVectorImpl<Type> &types) -> std::optional<LogicalResult> {
           if (!isMultipleOfSMETileVectorType(vectorType))
             return std::nullopt;
-          auto tileTileCount = getNumberOfSMETilesForVectorType(vectorType);
-          auto tileType = getSMETileTypeForElement(vectorType.getElementType());
-          types = SmallVector<Type>(tileTileCount, tileType);
+          auto smeTileTileCount = getNumberOfSMETilesForVectorType(vectorType);
+          auto smeTileType =
+              getSMETileTypeForElement(vectorType.getElementType());
+          types = SmallVector<Type>(smeTileTileCount, smeTileType);
           return success();
         });