Skip to content

Commit ed350bb

Browse files
authored
[mlir][ArmSME] Add support for lowering masked tile_store ops (#71180)
This patch extends ArmSMEToSCF to support lowering of masked tile_store ops. Only masks created by 'vector.create_mask' are currently supported.

Example:

  %mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1>
  arm_sme.tile_store %tile, %dest[%c0, %c0], %mask : memref<?x?xi32>, vector<[4]x[4]xi32>

Produces:

  %num_rows = arith.constant 3 : index
  %num_cols = vector.create_mask %c2 : vector<[4]xi1>
  scf.for %slice_idx = %c0 to %num_rows step %c1
    arm_sme.store_tile_slice %tile, %slice_idx, %num_cols, %dest[%slice_idx, %c0] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
1 parent b178cec commit ed350bb

File tree

3 files changed

+188
-24
lines changed

3 files changed

+188
-24
lines changed

mlir/lib/Conversion/ArmSMEToSCF/ArmSMEToSCF.cpp

Lines changed: 43 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -173,38 +173,59 @@ struct TileStoreOpConversion : public OpRewritePattern<arm_sme::TileStoreOp> {
173173
auto tileType = tileStoreOp.getVectorType();
174174
auto tileElementType = tileType.getElementType();
175175

176-
// Create a loop that stores each ZA tile slice from memory.
176+
auto predicateType =
177+
VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true);
178+
179+
Value maskCols;
180+
Value upperBound;
181+
auto maskOp = tileStoreOp.getMask();
182+
if (maskOp) {
183+
auto createMaskOp = maskOp.getDefiningOp<vector::CreateMaskOp>();
184+
if (!createMaskOp)
185+
return rewriter.notifyMatchFailure(
186+
tileStoreOp, "unsupported mask op, only 'vector.create_mask' is "
187+
"currently supported");
188+
189+
auto numRows = createMaskOp.getOperands()[0];
190+
auto numCols = createMaskOp.getOperands()[1];
191+
192+
upperBound = numRows;
193+
maskCols =
194+
rewriter.create<vector::CreateMaskOp>(loc, predicateType, numCols);
195+
} else {
196+
// Store all tile slices if no mask.
197+
auto minTileSlices = rewriter.create<arith::ConstantIndexOp>(
198+
loc, arm_sme::getSMETileSliceMinNumElts(tileElementType));
199+
auto vscale =
200+
rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType());
201+
// This describes both the number of ZA tile slices and the number of
202+
// elements in a vector of SVL bits for a given element type (SVL_B,
203+
// SVL_H,
204+
// ..., SVL_Q).
205+
auto numTileSlices =
206+
rewriter.create<arith::MulIOp>(loc, minTileSlices, vscale);
207+
208+
upperBound = numTileSlices;
209+
// Create an 'all true' predicate for the tile slice.
210+
maskCols = rewriter.create<arith::ConstantOp>(
211+
loc, DenseElementsAttr::get(predicateType, true));
212+
}
213+
214+
// Create a loop that stores each (active) ZA tile slice to memory.
177215
auto step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
178-
auto minTileSlices = rewriter.create<arith::ConstantIndexOp>(
179-
loc, arm_sme::getSMETileSliceMinNumElts(tileElementType));
180-
auto vscale =
181-
rewriter.create<vector::VectorScaleOp>(loc, rewriter.getIndexType());
182216
auto lowerBound = rewriter.create<arith::ConstantIndexOp>(loc, 0);
183-
// This describes both the number of ZA tile slices and the number of
184-
// elements in a vector of SVL bits for a given element type (SVL_B, SVL_H,
185-
// ..., SVL_Q).
186-
auto numTileSlices =
187-
rewriter.create<arith::MulIOp>(loc, minTileSlices, vscale);
188-
auto forOp =
189-
rewriter.create<scf::ForOp>(loc, lowerBound, numTileSlices, step);
217+
auto forOp = rewriter.create<scf::ForOp>(loc, lowerBound, upperBound, step);
190218

191219
rewriter.setInsertionPointToStart(forOp.getBody());
192220

193-
// Create an 'all true' predicate for the tile slice.
194-
auto predicateType =
195-
VectorType::get(tileType.getDimSize(1), rewriter.getI1Type(), true);
196-
auto allTruePredicate = rewriter.create<arith::ConstantOp>(
197-
loc, DenseElementsAttr::get(predicateType, true));
198-
199221
SmallVector<Value> memrefIndices;
200222
auto tileSliceIndex = forOp.getInductionVar();
201223
getMemrefIndices(tileStoreOp.getIndices(),
202224
tileStoreOp.getMemRefType().getRank(), tileSliceIndex,
203-
numTileSlices, memrefIndices, loc, rewriter);
225+
upperBound, memrefIndices, loc, rewriter);
204226
rewriter.replaceOpWithNewOp<arm_sme::StoreTileSliceOp>(
205-
tileStoreOp, tileStoreOp.getValueToStore(), tileSliceIndex,
206-
allTruePredicate, tileStoreOp.getBase(), memrefIndices,
207-
tileStoreOp.getLayout());
227+
tileStoreOp, tileStoreOp.getValueToStore(), tileSliceIndex, maskCols,
228+
tileStoreOp.getBase(), memrefIndices, tileStoreOp.getLayout());
208229

209230
return success();
210231
}

mlir/test/Conversion/ArmSMEToSCF/arm-sme-to-scf.mlir

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@ func.func @arm_sme_tile_load_ver(%src : memref<?x?xi32>) {
4646
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
4747
// CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index
4848
// CHECK-DAG: %[[VSCALE:.*]] = vector.vscale
49-
// CHECK: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[C4]], %[[VSCALE]] : index
49+
// CHECK-DAG: %[[PTRUE_S:.*]] = arith.constant dense<true> : vector<[4]xi1>
50+
// CHECK-DAG: %[[NUM_TILE_SLICES:.*]] = arith.muli %[[C4]], %[[VSCALE]] : index
5051
// CHECK: scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_TILE_SLICES]] step %[[C1]] {
51-
// CHECK: %[[PTRUE_S:.*]] = arith.constant dense<true> : vector<[4]xi1>
5252
// CHECK: %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index
5353
// CHECK: arm_sme.store_tile_slice %[[TILE]], %[[TILE_SLICE_INDEX]], %[[PTRUE_S]], %[[DEST]]{{\[}}%[[OFFSET]], %[[C0]]] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
5454
func.func @arm_sme_tile_store_hor(%tile : vector<[4]x[4]xi32>, %dest : memref<?x?xi32>) {
@@ -67,6 +67,27 @@ func.func @arm_sme_tile_store_ver(%tile : vector<[4]x[4]xi32>, %dest : memref<?x
6767
return
6868
}
6969

70+
// -----
71+
72+
// CHECK-LABEL: func.func @arm_sme_tile_store_hor_with_mask(
73+
// CHECK-SAME: %[[TILE:.*]]: vector<[4]x[4]xi32>,
74+
// CHECK-SAME: %[[DEST:.*]]: memref<?x?xi32>) {
75+
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
76+
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
77+
// CHECK-DAG: %[[NUM_ROWS:.*]] = arith.constant 3 : index
78+
// CHECK-DAG: %[[NUM_COLS:.*]] = vector.create_mask %c2 : vector<[4]xi1>
79+
// CHECK-NEXT: scf.for %[[TILE_SLICE_INDEX:.*]] = %[[C0]] to %[[NUM_ROWS]] step %[[C1]] {
80+
// CHECK-NEXT: %[[OFFSET:.*]] = arith.addi %[[C0]], %[[TILE_SLICE_INDEX]] : index
81+
// CHECK-NEXT: arm_sme.store_tile_slice %[[TILE]], %[[TILE_SLICE_INDEX]], %[[NUM_COLS]], %[[DEST]]{{\[}}%[[OFFSET]], %[[C0]]] : memref<?x?xi32>, vector<[4]xi1>, vector<[4]x[4]xi32>
82+
func.func @arm_sme_tile_store_hor_with_mask(%tile : vector<[4]x[4]xi32>, %dest : memref<?x?xi32>) {
83+
%c0 = arith.constant 0 : index
84+
%c2 = arith.constant 2 : index
85+
%c3 = arith.constant 3 : index
86+
%mask = vector.create_mask %c3, %c2 : vector<[4]x[4]xi1>
87+
arm_sme.tile_store %tile, %dest[%c0, %c0], %mask : memref<?x?xi32>, vector<[4]x[4]xi32>
88+
return
89+
}
90+
7091
//===----------------------------------------------------------------------===//
7192
// vector.print
7293
//===----------------------------------------------------------------------===//
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
// DEFINE: %{entry_point} = entry
2+
// DEFINE: %{compile} = mlir-opt %s \
3+
// DEFINE: -enable-arm-streaming="mode=locally enable-za" \
4+
// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf \
5+
// DEFINE: -convert-vector-to-llvm="enable-arm-sme" -cse -canonicalize \
6+
// DEFINE: -allocate-arm-sme-tiles -test-lower-to-llvm
7+
// DEFINE: %{run} = %mcr_aarch64_cmd \
8+
// DEFINE: -march=aarch64 -mattr=+sve,+sme \
9+
// DEFINE: -e %{entry_point} -entry-point-result=void \
10+
// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils
11+
12+
// RUN: %{compile} | %{run} | FileCheck %s
13+
14+
// Vector store.
15+
func.func @transfer_write_2d(%A : memref<?x?xf32>, %base1: index, %base2: index) {
16+
%c0 = arith.constant 0.0 : f32
17+
%zero = vector.splat %c0 : vector<[4]x[4]xf32>
18+
vector.transfer_write %zero, %A[%base1, %base2] {in_bounds=[true, true]} :
19+
vector<[4]x[4]xf32>, memref<?x?xf32>
20+
return
21+
}
22+
23+
// Masked vector store.
24+
func.func @transfer_write_2d_mask(%A : memref<?x?xf32>, %base1: index, %base2: index) {
25+
%c0 = arith.constant 0.0 : f32
26+
%c2 = arith.constant 2 : index
27+
%c3 = arith.constant 3 : index
28+
%mask = vector.create_mask %c2, %c3 : vector<[4]x[4]xi1>
29+
%zero = vector.splat %c0 : vector<[4]x[4]xf32>
30+
vector.transfer_write %zero, %A[%base1, %base2], %mask {in_bounds=[true, true]} :
31+
vector<[4]x[4]xf32>, memref<?x?xf32>
32+
return
33+
}
34+
35+
// Vector load + print.
36+
func.func @load_and_print(%A : memref<?x?xf32>, %base1: index, %base2: index) {
37+
%0 = vector.load %A[%base1, %base2] : memref<?x?xf32>, vector<[4]x[4]xf32>
38+
39+
vector.print str "TILE BEGIN:"
40+
vector.print %0: vector<[4]x[4]xf32>
41+
42+
return
43+
}
44+
45+
// Allocate heap memory of size 'd0' x 'd1' and initialize.
46+
//
47+
// Example:
48+
//
49+
// initialize_memory(%c4, %c5)
50+
//
51+
// 0, 1, 2, 3, 4
52+
// 10, 11, 12, 13, 14
53+
// 20, 21, 22, 23, 24
54+
// 30, 31, 32, 33, 34
55+
//
56+
// Returns dynamic memref. It's the caller's responsibility to free the returned
57+
// memref.
58+
func.func @initialize_memory(%d0 : index, %d1 : index) -> memref<?x?xf32> {
59+
%c0 = arith.constant 0 : index
60+
%c1 = arith.constant 1 : index
61+
%c1_f32 = arith.constant 1.0 : f32
62+
%c10_f32 = arith.constant 10.0 : f32
63+
64+
%A = memref.alloc(%d0, %d1) : memref<?x?xf32>
65+
66+
%init = arith.constant 0.0 : f32
67+
scf.for %i = %c0 to %d0 step %c1 iter_args(%val = %init) -> f32 {
68+
scf.for %j = %c0 to %d1 step %c1 iter_args(%inner_val = %val) -> f32 {
69+
memref.store %inner_val, %A[%i, %j] : memref<?x?xf32>
70+
%inner_val_next = arith.addf %inner_val, %c1_f32 : f32
71+
scf.yield %inner_val_next : f32
72+
}
73+
%val_next = arith.addf %val, %c10_f32 : f32
74+
scf.yield %val_next : f32
75+
}
76+
77+
return %A : memref<?x?xf32>
78+
}
79+
80+
func.func @entry() {
81+
%c0 = arith.constant 0 : index
82+
%c2 = arith.constant 2 : index
83+
%c4 = arith.constant 4 : index
84+
85+
// 1. Initialize memory
86+
//
87+
// Allocate enough memory to load a 32-bit tile plus a tiny bit more to test
88+
// non-zero offsets while remaining inbounds.
89+
%vscale = vector.vscale
90+
%svl_s = arith.muli %c4, %vscale : index
91+
%svl_s_plus_two = arith.addi %svl_s, %c2 : index
92+
%A = call @initialize_memory(%svl_s_plus_two, %svl_s_plus_two) : (index, index) -> memref<?x?xf32>
93+
94+
// CHECK-LABEL: TILE BEGIN:
95+
// CHECK-NEXT: ( 0, 1, 2, 3
96+
// CHECK-NEXT: ( 10, 11, 12, 13
97+
// CHECK-NEXT: ( 20, 21, 22, 23
98+
// CHECK-NEXT: ( 30, 31, 32, 33
99+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
100+
101+
// 2. Write 2-D vector of zeroes to 1. at offset [2, 2].
102+
// CHECK-LABEL: TILE BEGIN:
103+
// CHECK-NEXT: ( 0, 1, 2, 3
104+
// CHECK-NEXT: ( 10, 11, 12, 13
105+
// CHECK-NEXT: ( 20, 21, 0, 0
106+
// CHECK-NEXT: ( 30, 31, 0, 0
107+
call @transfer_write_2d(%A, %c2, %c2) : (memref<?x?xf32>, index, index) -> ()
108+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
109+
110+
// 3. Write 2-D vector of zeroes to 2. but with mask (nrows=2, ncols=3).
111+
// CHECK-LABEL: TILE BEGIN:
112+
// CHECK-NEXT: ( 0, 0, 0, 3
113+
// CHECK-NEXT: ( 0, 0, 0, 13
114+
// CHECK-NEXT: ( 20, 21, 0, 0
115+
// CHECK-NEXT: ( 30, 31, 0, 0
116+
call @transfer_write_2d_mask(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
117+
call @load_and_print(%A, %c0, %c0) : (memref<?x?xf32>, index, index) -> ()
118+
119+
memref.dealloc %A : memref<?x?xf32>
120+
121+
return
122+
}

0 commit comments

Comments
 (0)