Skip to content

Commit 5be2464

Browse files
committed
[MLIR] Fix VectorEmulateNarrowType constant op mask bug
This commit adds support for handling mask constants generated by the `arith.constant` op in the `VectorEmulateNarrowType` pattern. Previously, this pattern would not match due to the lack of mask constant handling in `getCompressedMaskOp`. The changes include: 1. Updating `getCompressedMaskOp` to recognize and handle `arith.constant` ops as mask value sources. 2. Handling cases where the mask is not aligned with the emulated load width. The compressed mask is adjusted to account for the offset. Limitations: - The arith.constant op can only have 1-dimensional constant values. Resolves: #115742 Signed-off-by: Alan Li <[email protected]>
1 parent 1e5bfac commit 5be2464

File tree

3 files changed

+107
-2
lines changed

3 files changed

+107
-2
lines changed

mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,15 +70,18 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
7070
Operation *maskOp = mask.getDefiningOp();
7171
SmallVector<vector::ExtractOp, 2> extractOps;
7272
// Finding the mask creation operation.
73-
while (maskOp && !isa<vector::CreateMaskOp, vector::ConstantMaskOp>(maskOp)) {
73+
while (maskOp &&
74+
!isa<arith::ConstantOp, vector::CreateMaskOp, vector::ConstantMaskOp>(
75+
maskOp)) {
7476
if (auto extractOp = dyn_cast<vector::ExtractOp>(maskOp)) {
7577
maskOp = extractOp.getVector().getDefiningOp();
7678
extractOps.push_back(extractOp);
7779
}
7880
}
7981
auto createMaskOp = dyn_cast_or_null<vector::CreateMaskOp>(maskOp);
8082
auto constantMaskOp = dyn_cast_or_null<vector::ConstantMaskOp>(maskOp);
81-
if (!createMaskOp && !constantMaskOp)
83+
auto constantOp = dyn_cast_or_null<arith::ConstantOp>(maskOp);
84+
if (!createMaskOp && !constantMaskOp && !constantOp)
8285
return failure();
8386

8487
// Computing the "compressed" mask. All the emulation logic (i.e. computing
@@ -129,6 +132,45 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
129132
auto denseAttr = DenseElementsAttr::get(newMaskType, newMaskValues);
130133
newMask = rewriter.create<arith::ConstantOp>(loc, newMaskType, denseAttr);
131134
}
135+
} else if (constantOp) {
136+
assert(shape.size() == 1 && "expected 1-D mask");
137+
// Rearrange the original mask values to cover the whole potential loading
138+
// region. For example, in the case of using byte-size for emulation, given
139+
// the following mask:
140+
//
141+
// %mask = vector.constant_mask [0, 1, 0, 1, 0, 0] : vector<6xi2>
142+
//
143+
// with front offset of 1, the mask will be zero-padded in the front and
144+
// back so that its length is a multiple of `scale` (and the total coverage
145+
// size is a multiple of bytes):
146+
// %new_mask = vector.constant_mask [0, 0, 1, 0, 1, 0, 0, 0] :
147+
// vector<8xi2>
148+
//
149+
// The %new_mask is now aligned with the effective loading area and can now
150+
// be compressed.
151+
SmallVector<bool> maskValues(intraDataOffset, false);
152+
if (auto denseAttr =
153+
mlir::dyn_cast<DenseIntElementsAttr>(constantOp.getValue())) {
154+
for (auto value : denseAttr.getValues<bool>()) {
155+
maskValues.push_back(value);
156+
}
157+
while (maskValues.size() < numElements * scale) {
158+
maskValues.push_back(false);
159+
}
160+
} else {
161+
return failure();
162+
}
163+
// Compressing by combining every `scale` elements:
164+
SmallVector<bool> compressedMaskValues;
165+
for (size_t i = 0; i < maskValues.size(); i += scale) {
166+
bool combinedValue = false;
167+
for (int j = 0; j < scale; ++j) {
168+
combinedValue |= maskValues[i + j];
169+
}
170+
compressedMaskValues.push_back(combinedValue);
171+
}
172+
newMask = rewriter.create<arith::ConstantOp>(
173+
loc, DenseElementsAttr::get(newMaskType, compressedMaskValues));
132174
}
133175

134176
while (!extractOps.empty()) {

mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,3 +249,41 @@ func.func @vector_maskedload_i2_dynamic_indexing_mixed(%passthru: vector<3xi2>,
249249
// CHECK: %[[IN8:.+]] = vector.insert %[[EX8]], %[[IN7]] [1] : i2 into vector<3xi2>
250250
// CHECK: %[[EX9:.+]] = vector.extract %[[SELECT]][%[[INCIDX2]]] : i2 from vector<8xi2>
251251
// CHECK: %[[IN9:.+]] = vector.insert %[[EX9]], %[[IN8]] [2] : i2 into vector<3xi2>
252+
253+
// -----
254+
255+
func.func @vector_maskedload_i4_constant_mask_unaligned(%passthru: vector<5xi2>) -> vector<5xi2> {
256+
%0 = memref.alloc() : memref<3x5xi2>
257+
%mask = arith.constant dense<[false, true, true, true, false]> : vector<5xi1>
258+
%c0 = arith.constant 0 : index
259+
%c1 = arith.constant 1 : index
260+
%1 = vector.maskedload %0[%c1, %c0], %mask, %passthru :
261+
memref<3x5xi2>, vector<5xi1>, vector<5xi2> into vector<5xi2>
262+
return %1 : vector<5xi2>
263+
}
264+
265+
// CHECK: func @vector_maskedload_i4_constant_mask_unaligned(
266+
// CHECK-SAME: %[[PTH:.+]]: vector<5xi2>) -> vector<5xi2>
267+
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<4xi8>
268+
// CHECK: %[[MASK:.+]] = arith.constant dense<[false, true, true, true, false]> : vector<5xi1>
269+
270+
// CHECK: %[[CST0:.+]] = arith.constant dense<true> : vector<2xi1>
271+
// CHECK: %[[CST1:.+]] = arith.constant dense<0> : vector<8xi2>
272+
// CHECK: %[[INSERT:.+]] = vector.insert_strided_slice %[[PTH]], %[[CST1]]
273+
// CHECK-SAME: {offsets = [1], strides = [1]} : vector<5xi2> into vector<8xi2>
274+
275+
// Emulated masked load from alloc:
276+
// CHECK: %[[BCAST:.+]] = vector.bitcast %[[INSERT]] : vector<8xi2> to vector<2xi8>
277+
// CHECK: %[[C1:.+]] = arith.constant 1 : index
278+
// CHECK: %[[MASKLOAD:.+]] = vector.maskedload %[[ALLOC]][%[[C1]]], %[[CST0]], %[[BCAST]]
279+
// CHECK: %[[BCAST2:.+]] = vector.bitcast %[[MASKLOAD]] : vector<2xi8> to vector<8xi2>
280+
281+
// Select from emulated loaded vector and passthru vector:
282+
// TODO: fold this part if possible.
283+
// CHECK: %[[CST2:.+]] = arith.constant dense<false> : vector<8xi1>
284+
// CHECK: %[[INSERT2:.+]] = vector.insert_strided_slice %[[MASK]], %[[CST2]]
285+
// CHECK-SAME: {offsets = [1], strides = [1]} : vector<5xi1> into vector<8xi1>
286+
// CHECK: %[[SELECT:.+]] = arith.select %[[INSERT2]], %[[BCAST2]], %[[INSERT]] : vector<8xi1>, vector<8xi2>
287+
// CHECK: %[[EXTRACT:.+]] = vector.extract_strided_slice %[[SELECT]]
288+
// CHECK-SAME: {offsets = [1], sizes = [5], strides = [1]} : vector<8xi2> to vector<5xi2>
289+
// CHECK: return %[[EXTRACT]] : vector<5xi2>

mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,3 +624,28 @@ func.func @vector_maskedstore_i4_constant_mask(
624624
// CHECK32: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[VAL_TO_STORE]], %[[BITCAST]] : vector<8xi1>, vector<8xi4>
625625
// CHECK32: %[[NEW_VAL:.+]] = vector.bitcast %[[SELECT]] : vector<8xi4> to vector<1xi32>
626626
// CHECK32: vector.maskedstore %[[ALLOC]]{{\[}}%[[LIDX]]], %[[NEW_MASK]], %[[NEW_VAL]] : memref<3xi32>, vector<1xi1>, vector<1xi32>
627+
628+
// -----
629+
630+
func.func @vector_maskedload_i4_arith_constant(%passthru: vector<8xi4>) -> vector<8xi4> {
631+
%0 = memref.alloc() : memref<3x8xi4>
632+
%cst = arith.constant dense<0> : vector<8xi4>
633+
%mask = arith.constant dense<[false, true, true, true, true, false, false, false]> : vector<8xi1>
634+
%c0 = arith.constant 0 : index
635+
%1 = vector.maskedload %0[%c0, %c0], %mask, %passthru :
636+
memref<3x8xi4>, vector<8xi1>, vector<8xi4> into vector<8xi4>
637+
return %1 : vector<8xi4>
638+
}
639+
640+
// CHECK: func @vector_maskedload_i4_arith_constant(
641+
// CHECK-SAME: %[[PASSTHRU:[a-zA-Z0-9]+]]: vector<8xi4>) -> vector<8xi4> {
642+
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<12xi8>
643+
// CHECK: %[[MASK:.+]] = arith.constant dense<[false, true, true, true, true, false, false, false]> : vector<8xi1>
644+
// CHECK: %[[MASK2:.+]] = arith.constant dense<[true, true, true, false]> : vector<4xi1>
645+
// CHECK: %[[BITCAST:.+]] = vector.bitcast %[[PASSTHRU]] : vector<8xi4> to vector<4xi8>
646+
// CHECK: %[[C0:.+]] = arith.constant 0 : index
647+
// CHECK: %[[LOAD:.+]] = vector.maskedload %[[ALLOC]][%[[C0]]], %[[MASK2]], %[[BITCAST]]
648+
// CHECK-SAME: memref<12xi8>, vector<4xi1>, vector<4xi8> into vector<4xi8>
649+
// CHECK: %[[BITCAST2:.+]] = vector.bitcast %[[LOAD]] : vector<4xi8> to vector<8xi4>
650+
// CHECK: %[[SELECT:.+]] = arith.select %[[MASK]], %[[BITCAST2]], %[[PASSTHRU]] : vector<8xi1>, vector<8xi4>
651+
// CHECK: return %[[SELECT]] : vector<8xi4>

0 commit comments

Comments
 (0)