Skip to content

Commit d80b04a

Browse files
[mlir][Affine][Vector] Support vectorizing reduction loops
This patch adds support for vectorizing loops with 'iter_args' implementing known reductions along the vector dimension. Compared to the non-vector-dimension case, two additional things are done during vectorization of such loops:
- The resulting vector returned from the loop is reduced to a scalar using `vector.reduction`.
- In some cases a mask is applied to the vector yielded at the end of the loop to prevent garbage values from being written to the accumulator.
Vectorization of reduction loops is disabled by default. To enable it, a map from loops to arrays of reduction descriptors should be explicitly passed to `vectorizeAffineLoops`, or `vectorize-reductions=true` should be passed to the SuperVectorize pass.
Current limitations:
- Loops with a non-unit step size are not supported.
- n-D vectorization with n > 1 is not supported.
Reviewed By: nicolasvasilache
Differential Revision: https://reviews.llvm.org/D100694
1 parent 20d0aca commit d80b04a

File tree

13 files changed

+984
-67
lines changed

13 files changed

+984
-67
lines changed

mlir/include/mlir/Analysis/AffineAnalysis.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,11 @@ bool isLoopParallel(
4747
AffineForOp forOp,
4848
SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);
4949

50+
/// Returns true if `forOp` doesn't have memory dependences preventing
51+
/// parallelization. This function doesn't check iter_args and should be used
52+
/// only as a building block for full parallel-checking functions.
53+
bool isLoopMemoryParallel(AffineForOp forOp);
54+
5055
/// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
5156
/// Operations that are reachable via a search starting from `operands` and
5257
/// ending at those operands that are not the result of an AffineApplyOp.

mlir/include/mlir/Dialect/Affine/Passes.td

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,11 @@ def AffineVectorize : FunctionPass<"affine-super-vectorize"> {
112112
"Specify a 1-D, 2-D or 3-D pattern of fastest varying memory "
113113
"dimensions to match. See defaultPatterns in Vectorize.cpp for "
114114
"a description and examples. This is used for testing purposes",
115-
"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">
115+
"llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated">,
116+
Option<"vectorizeReductions", "vectorize-reductions", "bool",
117+
/*default=*/"false",
118+
"Vectorize known reductions expressed via iter_args. "
119+
"Switched off by default.">
116120
];
117121
}
118122

mlir/include/mlir/Dialect/Affine/Utils.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#ifndef MLIR_DIALECT_AFFINE_UTILS_H
1414
#define MLIR_DIALECT_AFFINE_UTILS_H
1515

16+
#include "mlir/Analysis/AffineAnalysis.h"
1617
#include "mlir/IR/AffineExpr.h"
1718
#include "mlir/Support/LLVM.h"
1819
#include "llvm/ADT/DenseMap.h"
@@ -27,6 +28,8 @@ struct LogicalResult;
2728
struct LoopReduction;
2829
class Operation;
2930

31+
using ReductionLoopMap = DenseMap<Operation *, SmallVector<LoopReduction, 2>>;
32+
3033
/// Replaces parallel affine.for op with 1-d affine.parallel op.
3134
/// mlir::isLoopParallel detects the parallel affine.for ops.
3235
/// Parallelizes the specified reductions. Parallelization will fail in presence
@@ -81,16 +84,23 @@ struct VectorizationStrategy {
8184
// The candidate will be vectorized using the vectorization factor in
8285
// 'vectorSizes' for that dimension.
8386
DenseMap<Operation *, unsigned> loopToVectorDim;
87+
// Maps loops that implement vectorizable reductions to the corresponding
88+
// reduction descriptors.
89+
ReductionLoopMap reductionLoops;
8490
};
8591

8692
/// Vectorizes affine loops in 'loops' using the n-D vectorization factors in
8793
/// 'vectorSizes'. By default, each vectorization factor is applied
8894
/// inner-to-outer to the loops of each loop nest. 'fastestVaryingPattern' can
8995
/// be optionally used to provide a different loop vectorization order.
96+
/// If `reductionLoops` is not empty, the given reduction loops may be
97+
/// vectorized along the reduction dimension.
98+
/// TODO: Vectorizing reductions is supported only for 1-D vectorization.
9099
void vectorizeAffineLoops(
91100
Operation *parentOp,
92101
llvm::DenseSet<Operation *, DenseMapInfo<Operation *>> &loops,
93-
ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern);
102+
ArrayRef<int64_t> vectorSizes, ArrayRef<int64_t> fastestVaryingPattern,
103+
const ReductionLoopMap &reductionLoops = ReductionLoopMap());
94104

95105
/// External utility to vectorize affine loops from a single loop nest using an
96106
/// n-D vectorization strategy (see doc in VectorizationStrategy definition).

mlir/include/mlir/Dialect/StandardOps/IR/Ops.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,20 @@ bool applyCmpPredicate(CmpFPredicate predicate, const APFloat &lhs,
124124
/// Ignore integer bitwidth and type mismatch that come from the fact there is
125125
/// no IndexAttr and that IndexType has no bitwidth.
126126
bool isEqualConstantIntOrValue(OpFoldResult ofr1, OpFoldResult ofr2);
127+
128+
/// Returns the identity value attribute associated with an AtomicRMWKind op.
129+
Attribute getIdentityValueAttr(AtomicRMWKind kind, Type resultType,
130+
OpBuilder &builder, Location loc);
131+
132+
/// Returns the identity value associated with an AtomicRMWKind op.
133+
Value getIdentityValue(AtomicRMWKind op, Type resultType, OpBuilder &builder,
134+
Location loc);
135+
136+
/// Returns the value obtained by applying the reduction operation kind
137+
/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
138+
Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
139+
Value lhs, Value rhs);
140+
127141
} // end namespace mlir
128142

129143
#endif // MLIR_DIALECT_IR_STANDARDOPS_IR_OPS_H

mlir/include/mlir/Dialect/Vector/VectorOps.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#ifndef MLIR_DIALECT_VECTOR_VECTOROPS_H
1414
#define MLIR_DIALECT_VECTOR_VECTOROPS_H
1515

16+
#include "mlir/Dialect/StandardOps/IR/Ops.h"
1617
#include "mlir/IR/AffineMap.h"
1718
#include "mlir/IR/Attributes.h"
1819
#include "mlir/IR/BuiltinTypes.h"
@@ -192,6 +193,11 @@ IntegerType getVectorSubscriptType(Builder &builder);
192193
/// the integer type required for subscripts in the vector dialect.
193194
ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef<int64_t> values);
194195

196+
/// Returns the value obtained by reducing the vector into a scalar using the
197+
/// operation kind associated with a binary AtomicRMWKind op.
198+
Value getVectorReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
199+
Value vector);
200+
195201
namespace impl {
196202
/// Build the default minor identity map suitable for a vector transfer. This
197203
/// also handles the case memref<... x vector<...>> -> vector<...> in which the

mlir/lib/Analysis/AffineAnalysis.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@ static Value getSupportedReduction(AffineForOp forOp, unsigned pos,
7070
return nullptr;
7171
if (!forOp.getRegionIterArgs()[pos].hasOneUse())
7272
return nullptr;
73+
if (!yielded.hasOneUse())
74+
return nullptr;
7375

7476
Optional<AtomicRMWKind> maybeKind =
7577
TypeSwitch<Operation *, Optional<AtomicRMWKind>>(definition)
@@ -123,6 +125,14 @@ bool mlir::isLoopParallel(AffineForOp forOp,
123125
return false;
124126
}
125127

128+
// Check memory dependences.
129+
return isLoopMemoryParallel(forOp);
130+
}
131+
132+
/// Returns true if `forOp` doesn't have memory dependences preventing
133+
/// parallelization. This function doesn't check iter_args and should be used
134+
/// only as a building block for full parallel-checking functions.
135+
bool mlir::isLoopMemoryParallel(AffineForOp forOp) {
126136
// Collect all load and store ops in loop nest rooted at 'forOp'.
127137
SmallVector<Operation *, 8> loadAndStoreOps;
128138
auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {

mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -367,49 +367,6 @@ class AffineForLowering : public OpRewritePattern<AffineForOp> {
367367
}
368368
};
369369

370-
/// Returns the identity value associated with an AtomicRMWKind op.
371-
static Value getIdentityValue(AtomicRMWKind op, Type resultType,
372-
OpBuilder &builder, Location loc) {
373-
switch (op) {
374-
case AtomicRMWKind::addf:
375-
return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 0));
376-
case AtomicRMWKind::addi:
377-
return builder.create<ConstantOp>(loc,
378-
builder.getIntegerAttr(resultType, 0));
379-
case AtomicRMWKind::mulf:
380-
return builder.create<ConstantOp>(loc, builder.getFloatAttr(resultType, 1));
381-
case AtomicRMWKind::muli:
382-
return builder.create<ConstantOp>(loc,
383-
builder.getIntegerAttr(resultType, 1));
384-
// TODO: Add remaining reduction operations.
385-
default:
386-
(void)emitOptionalError(loc, "Reduction operation type not supported");
387-
break;
388-
}
389-
return nullptr;
390-
}
391-
392-
/// Return the value obtained by applying the reduction operation kind
393-
/// associated with a binary AtomicRMWKind op to `lhs` and `rhs`.
394-
static Value getReductionOp(AtomicRMWKind op, OpBuilder &builder, Location loc,
395-
Value lhs, Value rhs) {
396-
switch (op) {
397-
case AtomicRMWKind::addf:
398-
return builder.create<AddFOp>(loc, lhs, rhs);
399-
case AtomicRMWKind::addi:
400-
return builder.create<AddIOp>(loc, lhs, rhs);
401-
case AtomicRMWKind::mulf:
402-
return builder.create<MulFOp>(loc, lhs, rhs);
403-
case AtomicRMWKind::muli:
404-
return builder.create<MulIOp>(loc, lhs, rhs);
405-
// TODO: Add remaining reduction operations.
406-
default:
407-
(void)emitOptionalError(loc, "Reduction operation type not supported");
408-
break;
409-
}
410-
return nullptr;
411-
}
412-
413370
/// Convert an `affine.parallel` (loop nest) operation into a `scf.parallel`
414371
/// operation.
415372
class AffineParallelLowering : public OpRewritePattern<AffineParallelOp> {

0 commit comments

Comments
 (0)