[MLIR][Linalg] Scalable Vectorization of Reduction on the Trailing Dimension #97788
@@ -586,6 +586,12 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
      llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
}

+static bool hasLinalgReduction(LinalgOp &op) {
+  return isa<linalg::ReduceOp>(op) ||
+         (isa<linalg::GenericOp>(op) &&
+          llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
+}
+
/// Build a vector.transfer_write of `value` into `outputOperand` at indices set
/// to all `0`; where `outputOperand` is an output operand of the LinalgOp
/// currently being vectorized. If `dest` has null rank, build an memref.store.
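For reference, the two shapes of op this predicate matches look like this (a minimal sketch mirroring the tests added further down; %in and %acc are illustrative placeholders):

  // Matched: a named linalg.reduce.
  %red = linalg.reduce ins(%in : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
    (%a: f32, %b: f32) {
      %sum = arith.addf %a, %b : f32
      linalg.yield %sum : f32
    }

  // Matched: a linalg.generic with at least one "reduction" iterator.
  %gen = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
                                           affine_map<(d0) -> ()>],
                          iterator_types = ["reduction"] }
    ins(%in : tensor<?xf32>) outs(%acc : tensor<f32>) {
    ^bb(%a: f32, %b: f32):
      %sum = arith.addf %a, %b : f32
      linalg.yield %sum : f32
    } -> tensor<f32>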
@@ -1787,6 +1793,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
  if (isa<ConvolutionOpInterface>(op.getOperation()))
    return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);

+  if (hasLinalgReduction(op))
+    return reductionPreconditions(op);
+
  // TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
  // linalg.copy ops and ops that implement ContractionOpInterface for now.
  if (!isElementwise(op) &&
@@ -1976,6 +1985,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
  //   1. exactly 1 dim is scalable and that's the _last_ parallel dim
  //   2. exactly 2 dims are scalable and those are the _last two adjacent_
  //      parallel dims
+  //   3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
  // The 2nd restriction above means that only Matmul-like Ops are supported
  // when 2 dims are scalable, e.g. :
  //   * iterators = [parallel, parallel, reduction]
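At the transform level, the three permitted cases look roughly like this (a sketch: case 3 mirrors the tests added in this patch; the sizes in cases 1 and 2 are illustrative):

  // Case 1: only the last parallel dim is scalable.
  transform.structured.vectorize %op vector_sizes [4, [4]] : !transform.any_op
  // Case 2: the last two adjacent parallel dims are scalable (Matmul-like).
  transform.structured.vectorize %matmul vector_sizes [[4], [4], 1] : !transform.any_op
  // Case 3 (new in this patch): the trailing reduction dim is scalable.
  transform.structured.vectorize %reduce vector_sizes [1, [4]] : !transform.any_op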
@@ -1992,11 +2002,15 @@ vectorizeScalableVectorPrecondition(Operation *op,
    scalableFlags.pop_back();
  }

-  // TODO: Support scalable vectorisation for reduction dims
-  if (iterators.back() == utils::IteratorType::reduction)
-    return failure();
+  if (iterators.back() == utils::IteratorType::reduction) {
+    if (iterators.size() != inputVectorSizes.size()) {
+      LDBG("Non-trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+  }

-  // If this is not the _last_ parallel dim, 1. above is not met
+  // If this is not the _last_ parallel dim, 1. or 3. above is not met
  if (seenParalell)
    return failure();
There are two cases here. Should we turn this into a switch statement to combine this somehow?

  switch (iterators.back()) {
  case utils::IteratorType::reduction: {
    // Check 3. above is met.
    if (iterators.size() != inputVectorSizes.size()) {
      LDBG("Non-trailing reduction dim requested for scalable "
           "vectorization\n");
      return failure();
    }
    break;
  }
  case utils::IteratorType::parallel: {
    // Check 1. and 2. above are met.
    if (seenParalell) {
      LDBG("Inner parallel dim requested for scalable "
           "vectorization\n");
      return failure();
    }
    break;
  }
  }

WDYT? I'm open to suggestion :)
@@ -2017,7 +2031,8 @@ vectorizeScalableVectorPrecondition(Operation *op,
  // presence of scalable vectors
  return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
                 isa<linalg::MatmulTransposeAOp>(op) ||
-                isa<linalg::DepthwiseConv1DNwcWcOp>(op));
+                isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+                hasLinalgReduction(linalgOp));
}

LogicalResult mlir::linalg::vectorizeOpPrecondition(
@@ -189,3 +189,85 @@ module attributes {transform.with_named_sequence} {
    transform.yield
  }
}
+
+// -----
+
+func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
+  %arg1: tensor<f32>) -> tensor<f32> {

[nit] Indentation - please align.

+  %0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
+    (%in: f32, %init: f32) {
+      %0 = arith.addf %in, %init : f32
+      linalg.yield %0 : f32
+    }
+  return %0 : tensor<f32>
+}
+
+// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?xf32>
+// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_4:.*]] = vector.create_mask %[[VAL_1]] : vector<[4]xi1>
+// CHECK: %[[VAL_5:.*]] = vector.mask %[[VAL_4]] { vector.transfer_read %[[ARG_0]][%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
+// CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[ARG_1]][], %[[VAL_6]] : tensor<f32>, vector<f32>
+// CHECK: %[[VAL_8:.*]] = vector.extractelement %[[VAL_7]][] : vector<f32>
+// CHECK: %[[VAL_9:.*]] = vector.mask %[[VAL_4]] { vector.multi_reduction <add>, %[[VAL_5]], %[[VAL_8]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
+// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : f32 to vector<f32>
+// CHECK: %[[VAL_11:.*]] = vector.transfer_write %[[VAL_10]], %[[ARG_1]][] : vector<f32>, tensor<f32>

[nit] Kind request - descriptive LIT variable names.

+// CHECK: return %[[VAL_11]] : tensor<f32>
+// CHECK: }

These two lines can be skipped.

+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
+func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
+                                                   %arg1: tensor<?xf32>) -> tensor<?xf32> {
+  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                                         affine_map<(d0, d1) -> (d0)>],
+                        iterator_types = ["parallel", "reduction"] }
+    ins(%arg0 : tensor<?x?xf32>)
+    outs(%arg1 : tensor<?xf32>) {
+    ^bb(%in: f32, %out: f32) :
+      %0 = arith.addf %in, %out : f32
+      linalg.yield %0 : f32
+    } -> tensor<?xf32>
+  return %0 : tensor<?xf32>
+}
+
+// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
+// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
+// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?x?xf32>
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_3:.*]] = tensor.dim %[[ARG_0]], %[[VAL_2]] : tensor<?x?xf32>
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_6:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_3]] : vector<1x[4]xi1>
+// CHECK: %[[VAL_7:.*]] = vector.mask %[[VAL_6]] { vector.transfer_read %[[ARG_0]][%[[VAL_4]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
+// CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_1]] : vector<1xi1>
+// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %[[ARG_1]][%[[VAL_4]]], %[[VAL_8]] {in_bounds = [true]} : tensor<?xf32>, vector<1xf32> } : vector<1xi1> -> vector<1xf32>
+// CHECK: %[[VAL_11:.*]] = vector.mask %[[VAL_6]] { vector.multi_reduction <add>, %[[VAL_7]], %[[VAL_10]] [1] : vector<1x[4]xf32> to vector<1xf32> } : vector<1x[4]xi1> -> vector<1xf32>
+// CHECK: %[[VAL_12:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_13:.*]] = vector.mask %[[VAL_9]] { vector.transfer_write %[[VAL_11]], %[[ARG_1]][%[[VAL_12]]] {in_bounds = [true]} : vector<1xf32>, tensor<?xf32> } : vector<1xi1> -> tensor<?xf32>
+// CHECK: return %[[VAL_13]] : tensor<?xf32>
+// CHECK: }
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op

[nit] Use
Suggested change

+    transform.yield
+  }
+}
@@ -129,35 +129,35 @@ module attributes {transform.with_named_sequence} {

// -----

-func.func @linalg_reduce_scalable(%input: tensor<?xf32>,
-                                  %acc: tensor<f32>) -> tensor<f32> {
+func.func @linalg_reduce_scalable_leading_dim(%input: tensor<?x?xf32>,
+                                              %acc: tensor<?xf32>) -> tensor<?xf32> {

  // expected-error @+1 {{Attempted to vectorize, but failed}}
-  %0 = linalg.reduce ins(%input : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
+  %0 = linalg.reduce ins(%input : tensor<?x?xf32>) outs(%acc : tensor<?xf32>) dimensions = [0]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }
-  return %0 : tensor<f32>
+  return %0 : tensor<?xf32>
}

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
    transform.yield
  }
}

// -----

-func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
-                                                 %acc: tensor<?xf32>) -> tensor<?xf32> {
+func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor<?x?xf32>,

[nit] I think this would read a bit better:
Suggested change

+                                                         %acc: tensor<?xf32>) -> tensor<?xf32> {

  // expected-error @+1 {{Attempted to vectorize, but failed}}
  %0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                                        affine_map<(d0, d1) -> (d0)>],
-                       iterator_types = ["parallel", "reduction"] }
+                                        affine_map<(d0, d1) -> (d1)>],
+                       iterator_types = ["reduction", "parallel"] }
    ins(%input : tensor<?x?xf32>)
    outs(%acc : tensor<?xf32>) {
    ^bb(%in: f32, %out: f32) :
@@ -170,7 +170,7 @@ func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
    transform.yield
  }
}
@@ -0,0 +1,134 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE:   -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE:   -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_1d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve" \
// DEFINE:   -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=REDUCE

// REDEFINE: %{entry_point} = generic_reduce_1d_f32
// RUN: %{run} | FileCheck %s --check-prefix=GENERIC

func.func @reduce_1d_f32() {

Would you mind adding a test for linalg.generic?

  // 1-D Tensor
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
  %C_alloc = bufferization.alloc_tensor() : tensor<f32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

  // Reduce
  %C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
    (%in: f32, %init: f32) {
      %0 = arith.addf %in, %init : f32
      linalg.yield %0 : f32
    }
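
  // The input holds 1000 copies of 3.1416, so the reduction result checked
  // below is 1000 * 3.1416 = 3141.6.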

  // Print and verify the output
  // REDUCE-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // REDUCE-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
  // REDUCE-NEXT: [3141.6]

  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // REDUCE-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

func.func @generic_reduce_1d_f32() {
  // 1-D Tensor
  %N = arith.constant 1000 : index
  %c0_f32 = arith.constant 0.0 : f32

  // Allocate the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
  %C_alloc = bufferization.alloc_tensor() : tensor<f32>

  // Initialise the tensors
  %pi = arith.constant 3.1416 : f32
  %A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
  %C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

  // Reduce
  %C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
                                             affine_map<(d0) -> ()>],
                            iterator_types = ["reduction"] }
    ins(%A_in : tensor<?xf32>)
    outs(%C_in : tensor<f32>) {
    ^bb(%in: f32, %out: f32) :
      %0 = arith.addf %in, %out : f32
      linalg.yield %0 : f32
    } -> tensor<f32>

  // Print and verify the output
  // GENERIC-LABEL: SVE: START OF TEST OUTPUT
  vector.print str "SVE: START OF TEST OUTPUT\n"

  // GENERIC-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
  // GENERIC-NEXT: [3141.6]

  %xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

  // GENERIC-NEXT: SVE: END OF TEST OUTPUT
  vector.print str "SVE: END OF TEST OUTPUT\n"

  return
}

module attributes {transform.with_named_sequence} {
  // A sequence that will tile and vectorise a Reduce Op
  transform.named_sequence @tile_and_vectorize_reduce(%func
    : !transform.op<"func.func"> {transform.readonly}) {

    // Step 0: Get a handle to the reduce Op
    %reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
      : (!transform.op<"func.func">) -> !transform.any_op

    // Step 1: Tile
    %tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)

    // Step 2: Vectorize
    transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op

    // Step 3: Lower vector.multi_reduction
    transform.apply_patterns to %func {
      transform.apply_patterns.vector.lower_masked_transfers
      transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
    } : !transform.op<"func.func">
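
    // For the 1-D masked reduction produced in Step 2, the "innerreduction"
    // strategy is expected to rewrite vector.multi_reduction into a plain
    // vector.reduction, roughly (a sketch, not verbatim compiler output):
    //
    //   %sum = vector.mask %mask {
    //     vector.reduction <add>, %vec, %acc : vector<[4]xf32> into f32
    //   } : vector<[4]xi1> -> f32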

    transform.yield
  }

  // A sequence that goes over all functions in this module and applies
  // "tile_and_vectorize_reduce"
  transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
    %funcs = transform.structured.match ops{["func.func"]} in %module
      : (!transform.any_op) -> !transform.op<"func.func">

    transform.foreach %funcs : !transform.op<"func.func"> {
    ^bb2(%func : !transform.op<"func.func">):
      transform.include @tile_and_vectorize_reduce failures(propagate)
        (%func) : (!transform.op<"func.func">) -> ()
    }
    transform.yield
  }
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)

Please document. Also, why hasLinalgReduction rather than isaLinalgReduction?