Skip to content

Commit 0fc3fbd

Browse files
committed
Fixups
1 parent aad4e7e commit 0fc3fbd

File tree

3 files changed

+88
-58
lines changed

3 files changed

+88
-58
lines changed

mlir/include/mlir/Dialect/ArmSME/Transforms/Passes.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,10 @@ def VectorLegalization
131131
than a single SME tile (e.g. `vector<[8]x[8]xf32>`) into multiple SME
132132
tile-sized operations, as well as rewrites needed to get operations into
133133
forms compatible with SME lowerings.
134+
135+
Note: Decomposition is currently limited to vector types that are an exact
136+
multiple of SME tiles. That is, scalable in two dimensions, with both the
137+
rows and columns divisible by the SVE vector length for the element type.
134138
}];
135139
let constructor = "mlir::arm_sme::createVectorLegalizationPass()";
136140
let dependentDialects = [

mlir/lib/Dialect/ArmSME/Transforms/TileAllocation.cpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,9 @@ struct AssignTileIDsPattern
244244

245245
// Set all operations dependent on `tileOp` to use the same tile ID.
246246
// This is a naive tile allocation scheme, but works for common cases. For
247-
// example, as this only allocates tile IDs to existing ops, it can't solve
248-
// cases like this (%tileA and %tileB come from different root operations):
247+
// example, as this only allocates tile IDs to existing ops, it can't
248+
// solve cases like this (%tileA and %tileB come from different root
249+
// operations):
249250
//
250251
// %tile = scf.if %some_cond -> vector<[4]x[4]xi32> {
251252
// scf.yield %tileA {tile_id = 0} : vector<[4]x[4]xi32>
@@ -254,9 +255,9 @@ struct AssignTileIDsPattern
254255
// }
255256
//
256257
// This case would require allocating a new tile for the result of the
257-
// scf.if, and moving the contents of %tileA or %tileB to result tile (based
258-
// on the %some_cond).
259-
// Find all the ops that (transitively) depend on this tile.
258+
// scf.if, and moving the contents of %tileA or %tileB to result tile
259+
// (based on the %some_cond). Find all the ops that (transitively) depend
260+
// on this tile.
260261
SetVector<Operation *> dependantOps;
261262
findDependantOps(tileOp->getResult(0), dependantOps);
262263
auto tileIDAttr = rewriter.getI32IntegerAttr(*tileId);

mlir/lib/Dialect/ArmSME/Transforms/VectorLegalization.cpp

Lines changed: 78 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,14 @@
55
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
66
//
77
//===----------------------------------------------------------------------===//
8+
//
9+
// This pass legalizes vector operations so they can be lowered to ArmSME.
10+
// Currently, this only implements the decomposition of vector operations that
11+
// use vector sizes larger than an SME tile, into multiple SME-sized operations.
12+
//
13+
// Note: In the context of this pass 'tile' always refers to an SME tile.
14+
//
15+
//===----------------------------------------------------------------------===//
816

917
#include "mlir/Dialect/ArmSME/IR/ArmSME.h"
1018
#include "mlir/Dialect/ArmSME/Transforms/Passes.h"
@@ -35,35 +43,49 @@ static constexpr StringLiteral MATCH_FAILURE_UNSUPPORTED_MASK_OP(
3543
static constexpr StringLiteral
3644
MATCH_FAILURE_NON_PERMUTATION_MAP("op affine map is not a permutation");
3745

38-
struct SMETile {
46+
/// An SMESubTile represents a single SME-sized sub-tile from decomposing a
47+
/// larger vector type. The (`row`, `col`) are the position of the tile in the
48+
/// original vector type. For example, an [8]x[8] tile would have four
49+
/// [4]x[4] sub-tiles.
50+
///
51+
/// 8 x vscale
52+
/// ┌─────────────┬─────────────┐
53+
/// │(0,0) │(0,4) │
54+
/// │ │ │
55+
/// ├─────────────┼─────────────┤ 8 x vscale
56+
/// │(4,0) │(4,4) │
57+
/// │ │ │
58+
/// └─────────────┴─────────────┘
59+
struct SMESubTile {
3960
// Note: The units of (row, col) are vscale (as SME tiles are scalable).
4061
int row{0};
4162
int col{0};
63+
// The SME tile type.
4264
VectorType type;
4365
};
4466

45-
/// Adds a constant scalable offset to `indices` (which are of equal length).
46-
/// For example, in the 2D case this would return:
67+
/// Adds a constant elementwise scalable offset to `indices` (which are of equal
68+
/// length). For example, in the 2D case this would return:
4769
/// { indices[0] + offset[0] * vscale, indices[1] + offset[1] * vscale }
4870
SmallVector<Value, 2> addConstantScalableOffset(OpBuilder &builder,
4971
Location loc,
5072
ValueRange indices,
51-
ArrayRef<int> scalableOffset) {
73+
ArrayRef<int> scalableOffsets) {
5274
auto vscale = builder.create<vector::VectorScaleOp>(loc);
5375
return llvm::map_to_vector(
54-
llvm::zip_equal(indices, scalableOffset), [&](auto pair) -> Value {
76+
llvm::zip_equal(indices, scalableOffsets), [&](auto pair) -> Value {
5577
auto [index, base] = pair;
5678
auto offset = builder.create<arith::MulIOp>(
5779
loc, builder.create<arith::ConstantIndexOp>(loc, base), vscale);
5880
return builder.create<arith::AddIOp>(loc, index, offset);
5981
});
6082
}
6183

62-
/// Remaps `indices` (e.g. from a load/store) for a larger vector type to
63-
/// indices for one of the SME tiles it will decompose into.
84+
/// Adjusts `indices` (e.g. from a load/store) for a larger vector type to
85+
/// indices for one of the SME sub-tiles it will decompose into.
6486
///
6587
/// For example, if you were to decompose an 8x8 load into four 4x4 tiles, the
66-
/// indices for each tile would need to be remapped as follows:
88+
/// indices for each tile would need to be adjusted as follows:
6789
///
6890
/// initial indices = [a,b], initial size = 8x8, target size = 4x4
6991
/// ┌─────────────┬─────────────┐
@@ -73,11 +95,11 @@ SmallVector<Value, 2> addConstantScalableOffset(OpBuilder &builder,
7395
/// │[a+4,b] │[a+4,b+4] │
7496
/// │ │ │
7597
/// └─────────────┴─────────────┘
76-
SmallVector<Value, 2> remapIndicesForSMETile(OpBuilder &builder, Location loc,
77-
ValueRange indices,
78-
SMETile tileTile) {
98+
SmallVector<Value, 2> getSMESubTileIndices(OpBuilder &builder, Location loc,
99+
ValueRange indices,
100+
SMESubTile smeTile) {
79101
return addConstantScalableOffset(builder, loc, indices,
80-
{tileTile.row, tileTile.col});
102+
{smeTile.row, smeTile.col});
81103
}
82104

83105
/// Returns true if `mask` is generated by an operation that can be decomposed
@@ -86,21 +108,21 @@ bool isSupportedMaskOp(Value mask) {
86108
return !mask || mask.getDefiningOp<vector::CreateMaskOp>();
87109
}
88110

89-
/// Extracts a mask for an SME tile from the mask of a larger vector type.
111+
/// Extracts a mask for an SME sub-tile from the mask of a larger vector type.
90112
Value extractSMEMask(OpBuilder &builder, Location loc, Value mask,
91-
SMETile tileTile) {
113+
SMESubTile smeTile) {
92114
assert(isSupportedMaskOp(mask));
93115
if (!mask)
94116
return Value{};
95117
auto createMask = mask.getDefiningOp<vector::CreateMaskOp>();
96118
// The operands of `vector.create_mask` (from a 2D perspective) are the
97119
// coordinates where the mask ends. So we subtract where this tile starts,
98-
// from the mask operands to get the parameters for this tile tile.
99-
auto tileMaskDims = addConstantScalableOffset(
100-
builder, loc, createMask.getOperands(), {-tileTile.row, -tileTile.col});
101-
auto createTileMask = builder.create<vector::CreateMaskOp>(
102-
loc, tileTile.type.clone(builder.getI1Type()), tileMaskDims);
103-
return createTileMask.getResult();
120+
// from the mask operands to get the parameters for this sub-tile.
121+
auto smeTileMaskDims = addConstantScalableOffset(
122+
builder, loc, createMask.getOperands(), {-smeTile.row, -smeTile.col});
123+
auto smeTileCreateMask = builder.create<vector::CreateMaskOp>(
124+
loc, smeTile.type.clone(builder.getI1Type()), smeTileMaskDims);
125+
return smeTileCreateMask.getResult();
104126
}
105127

106128
/// Constructs an iterator that returns each SME tile (with coordinates)
@@ -110,7 +132,8 @@ Value extractSMEMask(OpBuilder &builder, Location loc, Value mask,
110132
auto decomposeToSMETiles(OpBuilder &builder, VectorType type,
111133
VectorType smeTileType,
112134
bool transposeIndices = false) {
113-
assert(isMultipleOfSMETileVectorType(type));
135+
assert(isMultipleOfSMETileVectorType(type) &&
136+
"`type` not multiple of SME tiles");
114137
return llvm::map_range(
115138
StaticTileOffsetRange(type.getShape(), {smeTileType.getDimSize(0),
116139
smeTileType.getDimSize(1)}),
@@ -119,14 +142,15 @@ auto decomposeToSMETiles(OpBuilder &builder, VectorType type,
119142
int col = int(indices[1]);
120143
if (transposeIndices)
121144
std::swap(row, col);
122-
return SMETile{row, col, smeTileType};
145+
return SMESubTile{row, col, smeTileType};
123146
});
124147
}
125148

126149
/// Returns the number of SME tiles that fit into the (2D-scalable) vector type
127150
/// `type`.
128151
int getNumberOfSMETilesForVectorType(VectorType type) {
129-
assert(isMultipleOfSMETileVectorType(type));
152+
assert(isMultipleOfSMETileVectorType(type) &&
153+
"`type` not multiple of SME tiles");
130154
int64_t vectorRows = type.getDimSize(0);
131155
int64_t vectorCols = type.getDimSize(1);
132156
auto elementType = type.getElementType();
@@ -162,25 +186,25 @@ struct LegalizeVectorOuterProductOpsByDecomposition
162186
MATCH_FAILURE_UNSUPPORTED_MASK_OP);
163187

164188
ValueRange accSMETiles = adaptor.getAcc();
165-
auto tileType = getSMETileTypeForElement(vectorType.getElementType());
166-
VectorType sliceType = VectorType::Builder(tileType).dropDim(0);
189+
auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
190+
VectorType sliceType = VectorType::Builder(smeTileType).dropDim(0);
167191

168192
SmallVector<Value> resultSMETiles;
169-
for (auto [index, tileTile] :
170-
llvm::enumerate(decomposeToSMETiles(rewriter, vectorType, tileType))) {
193+
for (auto [index, smeTile] : llvm::enumerate(
194+
decomposeToSMETiles(rewriter, vectorType, smeTileType))) {
171195

172-
auto tileMask = extractSMEMask(rewriter, loc, mask, tileTile);
196+
auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
173197
auto lhs = rewriter.create<vector::ScalableExtractOp>(
174-
loc, sliceType, outerProductOp.getLhs(), tileTile.row);
198+
loc, sliceType, outerProductOp.getLhs(), smeTile.row);
175199
auto rhs = rewriter.create<vector::ScalableExtractOp>(
176-
loc, sliceType, outerProductOp.getRhs(), tileTile.col);
177-
auto tileOuterProduct = rewriter.create<vector::OuterProductOp>(
178-
loc, tileType, lhs, rhs,
200+
loc, sliceType, outerProductOp.getRhs(), smeTile.col);
201+
auto smeOuterProduct = rewriter.create<vector::OuterProductOp>(
202+
loc, smeTileType, lhs, rhs,
179203
!accSMETiles.empty() ? accSMETiles[index] : Value{},
180204
outerProductOp.getKind());
181205

182206
auto maskedOuterProduct =
183-
vector::maskOperation(rewriter, tileOuterProduct, tileMask);
207+
vector::maskOperation(rewriter, smeOuterProduct, smeMask);
184208
resultSMETiles.push_back(maskedOuterProduct->getResult(0));
185209
}
186210

@@ -241,18 +265,18 @@ struct LegalizeTransferReadOpsByDecomposition
241265
bool transposed = !permutationMap.isIdentity();
242266

243267
auto loc = readOp.getLoc();
244-
auto tileType = getSMETileTypeForElement(vectorType.getElementType());
268+
auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
245269

246270
SmallVector<Value> resultSMETiles;
247-
for (SMETile tileTile :
248-
decomposeToSMETiles(rewriter, vectorType, tileType, transposed)) {
249-
auto tileMask = extractSMEMask(rewriter, loc, mask, tileTile);
250-
auto transferRead = rewriter.create<vector::TransferReadOp>(
251-
loc, tileType, readOp.getSource(),
252-
remapIndicesForSMETile(rewriter, loc, readOp.getIndices(), tileTile),
253-
readOp.getPermutationMapAttr(), readOp.getPadding(), tileMask,
271+
for (SMESubTile smeTile :
272+
decomposeToSMETiles(rewriter, vectorType, smeTileType, transposed)) {
273+
auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
274+
auto smeRead = rewriter.create<vector::TransferReadOp>(
275+
loc, smeTileType, readOp.getSource(),
276+
getSMESubTileIndices(rewriter, loc, readOp.getIndices(), smeTile),
277+
readOp.getPermutationMapAttr(), readOp.getPadding(), smeMask,
254278
readOp.getInBoundsAttr());
255-
resultSMETiles.push_back(transferRead);
279+
resultSMETiles.push_back(smeRead);
256280
}
257281

258282
rewriter.replaceOp(readOp, resultSMETiles, adaptor.getResultMapping());
@@ -289,19 +313,19 @@ struct LegalizeTransferWriteOpsByDecomposition
289313
bool transposed = !permutationMap.isIdentity();
290314

291315
auto loc = writeOp.getLoc();
292-
auto tileType = getSMETileTypeForElement(vectorType.getElementType());
316+
auto smeTileType = getSMETileTypeForElement(vectorType.getElementType());
293317
auto inputSMETiles = adaptor.getVector();
294318

295319
Value destTensorOrMemref = writeOp.getSource();
296-
for (auto [index, tileTile] : llvm::enumerate(
297-
decomposeToSMETiles(rewriter, vectorType, tileType, transposed))) {
298-
auto tileMask = extractSMEMask(rewriter, loc, mask, tileTile);
299-
auto tileWrite = rewriter.create<vector::TransferWriteOp>(
320+
for (auto [index, smeTile] : llvm::enumerate(decomposeToSMETiles(
321+
rewriter, vectorType, smeTileType, transposed))) {
322+
auto smeMask = extractSMEMask(rewriter, loc, mask, smeTile);
323+
auto smeWrite = rewriter.create<vector::TransferWriteOp>(
300324
loc, inputSMETiles[index], destTensorOrMemref,
301-
remapIndicesForSMETile(rewriter, loc, writeOp.getIndices(), tileTile),
302-
writeOp.getPermutationMapAttr(), tileMask, writeOp.getInBoundsAttr());
325+
getSMESubTileIndices(rewriter, loc, writeOp.getIndices(), smeTile),
326+
writeOp.getPermutationMapAttr(), smeMask, writeOp.getInBoundsAttr());
303327
if (writeOp.hasPureTensorSemantics())
304-
destTensorOrMemref = tileWrite.getResult();
328+
destTensorOrMemref = smeWrite.getResult();
305329
}
306330

307331
if (writeOp.hasPureTensorSemantics())
@@ -326,9 +350,10 @@ struct VectorLegalizationPass
326350
SmallVectorImpl<Type> &types) -> std::optional<LogicalResult> {
327351
if (!isMultipleOfSMETileVectorType(vectorType))
328352
return std::nullopt;
329-
auto tileTileCount = getNumberOfSMETilesForVectorType(vectorType);
330-
auto tileType = getSMETileTypeForElement(vectorType.getElementType());
331-
types = SmallVector<Type>(tileTileCount, tileType);
353+
auto smeTileTileCount = getNumberOfSMETilesForVectorType(vectorType);
354+
auto smeTileType =
355+
getSMETileTypeForElement(vectorType.getElementType());
356+
types = SmallVector<Type>(smeTileTileCount, smeTileType);
332357
return success();
333358
});
334359

0 commit comments

Comments
 (0)