Skip to content

Commit b129fc0

Browse files
committed
[MLIR] extend getCompressedMaskOp support in VectorEmulateNarrowType
Previously, when `numFrontPadElems` was not zero, `getCompressedMaskOp` produced a wrong result if the mask generator op was `vector.create_mask`. This patch fixes that issue for the case where `numFrontPadElems` is not zero. Signed-off-by: Alan Li <[email protected]>
1 parent ec066d3 commit b129fc0

File tree

3 files changed

+61
-8
lines changed

3 files changed

+61
-8
lines changed

mlir/lib/Dialect/Vector/Transforms/VectorEmulateNarrowType.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,14 @@ static FailureOr<Operation *> getCompressedMaskOp(OpBuilder &rewriter,
104104
if (createMaskOp) {
105105
OperandRange maskOperands = createMaskOp.getOperands();
106106
size_t numMaskOperands = maskOperands.size();
107+
// The `vector.create_mask` op creates a mask arrangement without any zeros
108+
// at the front. Also, because `numFrontPadElems` is strictly smaller than
109+
// `numSrcElemsPerDest`, the compressed mask generated by shifting the
110+
// original mask by `numFrontPadElems` will not have any zeros at the front
111+
// either.
107112
AffineExpr s0;
108113
bindSymbols(rewriter.getContext(), s0);
109-
s0 = s0 + numSrcElemsPerDest - 1;
110-
s0 = s0.floorDiv(numSrcElemsPerDest);
114+
s0 = (s0 + numFrontPadElems).ceilDiv(numSrcElemsPerDest);
111115
OpFoldResult origIndex =
112116
getAsOpFoldResult(maskOperands[numMaskOperands - 1]);
113117
OpFoldResult maskIndex =

mlir/test/Dialect/Vector/vector-emulate-narrow-type-unaligned.mlir

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,55 @@ func.func @vector_cst_maskedload_i2(%passthru: vector<5xi2>) -> vector<3x5xi2> {
7474

7575
// -----
7676

77+
// This tests the correctness of generating compressed mask with `vector.create_mask` and a dynamic input.
78+
// Specifically, the program performs a masked load of a vector<5xi2> from `memref<3x5xi2>[1, 0]`, with a dynamic mask operand `m` whose value is unknown at compile time.
79+
// After the emulation transformation, it performs a masked load of 2 bytes from the linearized index `memref<4xi8>[1]`, with a new compressed mask
80+
// given by `ceildiv(m + 1, 4)`.
81+
func.func @check_unaligned_create_mask_dynamic_i2(%m : index, %passthru: vector<5xi2>) -> vector<5xi2> {
82+
%0 = memref.alloc() : memref<3x5xi2>
83+
%c0 = arith.constant 0 : index
84+
%c1 = arith.constant 1 : index
85+
%mask = vector.create_mask %m : vector<5xi1>
86+
%1 = vector.maskedload %0[%c1, %c0], %mask, %passthru :
87+
memref<3x5xi2>, vector<5xi1>, vector<5xi2> into vector<5xi2>
88+
return %1 : vector<5xi2>
89+
}
90+
91+
// CHECK-DAG: #[[MAP:.+]] = affine_map<()[s0] -> ((s0 + 1) ceildiv 4)>
92+
// CHECK: func @check_unaligned_create_mask_dynamic_i2(
93+
// CHECK-SAME: %[[MASK:.+]]: index, %[[PASSTHRU:.+]]: vector<5xi2>)
94+
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<4xi8>
95+
// CHECK: %[[COMP_MASK:.+]] = affine.apply #map()[%[[MASK]]]
96+
// CHECK: vector.create_mask %[[COMP_MASK]] : vector<2xi1>
97+
// CHECK: %[[C1:.+]] = arith.constant 1 : index
98+
// CHECK: vector.maskedload %[[ALLOC]][%[[C1]]]
99+
100+
// -----
101+
102+
// This tests the correctness of generated compressed mask with `vector.create_mask`, and a static input.
103+
// Quite the same as the previous test, but the mask generator is a static value.
104+
// In this case, the desired slice `vector<7xi2>` spans over 3 bytes.
105+
func.func @check_unaligned_create_mask_static_i2(%passthru: vector<7xi2>) -> vector<7xi2> {
106+
%0 = memref.alloc() : memref<3x7xi2>
107+
%c0 = arith.constant 0 : index
108+
%c1 = arith.constant 1 : index
109+
%c3 = arith.constant 3 : index
110+
%mask = vector.create_mask %c3 : vector<7xi1>
111+
%1 = vector.maskedload %0[%c1, %c0], %mask, %passthru :
112+
memref<3x7xi2>, vector<7xi1>, vector<7xi2> into vector<7xi2>
113+
return %1 : vector<7xi2>
114+
}
115+
116+
// CHECK: func @check_unaligned_create_mask_static_i2(
117+
// CHECK-SAME: %[[PASSTHRU:[a-zA-Z0-9]+]]: vector<7xi2>)
118+
// CHECK: %[[ALLOC:.+]] = memref.alloc() : memref<6xi8>
119+
// CHECK: %[[C2:.+]] = arith.constant 2 : index
120+
// CHECK: %[[COMP_MASK:.+]] = vector.create_mask %[[C2]] : vector<3xi1>
121+
// CHECK: %[[C1:.+]] = arith.constant 1 : index
122+
// CHECK: %4 = vector.maskedload %[[ALLOC]][%[[C1]]], %[[COMP_MASK]]
123+
124+
// -----
125+
77126
func.func @vector_load_i2_dynamic_indexing(%idx1: index, %idx2: index) -> vector<3xi2> {
78127
%0 = memref.alloc() : memref<3x3xi2>
79128
%cst = arith.constant dense<0> : vector<3x3xi2>

mlir/test/Dialect/Vector/vector-emulate-narrow-type.mlir

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ func.func @vector_maskedload_i8(%arg1: index, %arg2: index, %arg3: index, %passt
141141
// CHECK-NEXT: return
142142

143143
// CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 4)>
144-
// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 3) floordiv 4)>
144+
// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
145145
// CHECK32: func @vector_maskedload_i8(
146146
// CHECK32-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
147147
// CHECK32-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<4xi8>)
@@ -169,7 +169,7 @@ func.func @vector_maskedload_i4(%arg1: index, %arg2: index, %arg3: index, %passt
169169
return %2 : vector<3x8xi4>
170170
}
171171
// CHECK-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
172-
// CHECK-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 1) floordiv 2)>
172+
// CHECK-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
173173
// CHECK: func @vector_maskedload_i4(
174174
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
175175
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<8xi4>)
@@ -185,7 +185,7 @@ func.func @vector_maskedload_i4(%arg1: index, %arg2: index, %arg3: index, %passt
185185
// CHECK: %[[SELECT:.+]] = arith.select %[[ORIG_MASK]], %[[BITCAST]], %[[ARG3]] : vector<8xi1>, vector<8xi4>
186186

187187
// CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
188-
// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 7) floordiv 8)>
188+
// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)>
189189
// CHECK32: func @vector_maskedload_i4(
190190
// CHECK32-SAME: %[[ARG0:[a-zA-Z0-9]+]]: index, %[[ARG1:[a-zA-Z0-9]+]]: index,
191191
// CHECK32-SAME: %[[ARG2:[a-zA-Z0-9]+]]: index, %[[ARG3:[a-zA-Z0-9]+]]: vector<8xi4>)
@@ -473,7 +473,7 @@ func.func @vector_maskedstore_i8(%arg0: index, %arg1: index, %arg2: index, %valu
473473
// CHECK-NEXT: return
474474

475475
// CHECK32-DAG: #[[LOAD_IDX_MAP:.+]] = affine_map<()[s0, s1] -> (s0 * 2 + s1 floordiv 4)>
476-
// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> ((s0 + 3) floordiv 4)>
476+
// CHECK32-DAG: #[[MASK_IDX_MAP:.+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
477477
// CHECK32: func @vector_maskedstore_i8(
478478
// CHECK32-SAME: %[[ARG0:[a-zA-Z0-9]+]]
479479
// CHECK32-SAME: %[[ARG1:[a-zA-Z0-9]+]]
@@ -506,7 +506,7 @@ func.func @vector_maskedstore_i4(
506506
return
507507
}
508508
// CHECK: #[[$ATTR_10:.+]] = affine_map<()[s0, s1] -> (s0 * 4 + s1 floordiv 2)>
509-
// CHECK: #[[$ATTR_11:.+]] = affine_map<()[s0] -> ((s0 + 1) floordiv 2)>
509+
// CHECK: #[[$ATTR_11:.+]] = affine_map<()[s0] -> (s0 ceildiv 2)>
510510

511511
// CHECK-LABEL: func.func @vector_maskedstore_i4(
512512
// CHECK-SAME: %[[IDX_1:[a-zA-Z0-9]+]]: index,
@@ -526,7 +526,7 @@ func.func @vector_maskedstore_i4(
526526
// CHECK: vector.maskedstore %[[ALLOC]]{{\[}}%[[LIDX]]], %[[NEW_MASK]], %[[NEW_VAL]] : memref<12xi8>, vector<4xi1>, vector<4xi8>
527527

528528
// CHECK32: #[[$ATTR_17:.+]] = affine_map<()[s0, s1] -> (s0 + s1 floordiv 8)>
529-
// CHECK32: #[[$ATTR_18:.+]] = affine_map<()[s0] -> ((s0 + 7) floordiv 8)>
529+
// CHECK32: #[[$ATTR_18:.+]] = affine_map<()[s0] -> (s0 ceildiv 8)>
530530

531531
// CHECK32-LABEL: func.func @vector_maskedstore_i4(
532532
// CHECK32-SAME: %[[IDX_1:[a-zA-Z0-9]+]]: index,

0 commit comments

Comments
 (0)