[MLIR][Linalg] Scalable Vectorization of Reduction on the Trailing Dimension #97788

Merged · 8 commits · Jul 24, 2024
Changes from 3 commits
25 changes: 20 additions & 5 deletions mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp
@@ -586,6 +586,12 @@ static SmallVector<bool> getDimsToReduce(LinalgOp linalgOp) {
llvm::map_range(linalgOp.getIteratorTypesArray(), isReductionIterator));
}

static bool hasLinalgReduction(LinalgOp &op) {
Reviewer comment (Contributor):
Please document. Also, why hasLinalgReduction rather than isaLinalgReduction?
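A minimal sketch of the requested documentation, reusing the isa-style name floated above (the final name and wording are up to the author):

/// Returns true if `op` is a linalg.reduce or a linalg.generic that has at
/// least one reduction iterator.
static bool isaLinalgReduction(LinalgOp &op) {
  return isa<linalg::ReduceOp>(op) ||
         (isa<linalg::GenericOp>(op) &&
          llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
}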

return isa<linalg::ReduceOp>(op) ||
(isa<linalg::GenericOp>(op) &&
llvm::any_of(op.getIteratorTypesArray(), isReductionIterator));
}

/// Build a vector.transfer_write of `value` into `outputOperand` at indices set
/// to all `0`; where `outputOperand` is an output operand of the LinalgOp
/// currently being vectorized. If `dest` has null rank, build an memref.store.
@@ -1787,6 +1793,9 @@ vectorizeDynamicLinalgOpPrecondition(linalg::LinalgOp op,
if (isa<ConvolutionOpInterface>(op.getOperation()))
return vectorizeDynamicConvOpPrecondition(op, flatten1DDepthwiseConv);

if (hasLinalgReduction(op))
return reductionPreconditions(op);

// TODO: Masking only supports dynamic element-wise ops, linalg.generic ops,
// linalg.copy ops and ops that implement ContractionOpInterface for now.
if (!isElementwise(op) &&
@@ -1976,6 +1985,7 @@ vectorizeScalableVectorPrecondition(Operation *op,
// 1. exactly 1 dim is scalable and that's the _last_ parallel dim
// 2. exactly 2 dims are scalable and those are the _last two adjacent_
// parallel dims
// 3. exactly 1 reduction dim is scalable and that's the last (innermost) dim
// The 2nd restriction above means that only Matmul-like Ops are supported
// when 2 dims are scalable, e.g. :
// * iterators = [parallel, parallel, reduction]
@@ -1992,11 +2002,15 @@ vectorizeScalableVectorPrecondition(Operation *op,
scalableFlags.pop_back();
}

-  // TODO: Support scalable vectorisation for reduction dims
-  if (iterators.back() == utils::IteratorType::reduction)
-    return failure();
+  if (iterators.back() == utils::IteratorType::reduction) {
+    if (iterators.size() != inputVectorSizes.size()) {
+      LDBG("Non-trailing reduction dim requested for scalable "
+           "vectorization\n");
+      return failure();
+    }
+  }

-  // If this is not the _last_ parallel dim, 1. above is not met
+  // If this is not the _last_ parallel dim, 1. or 3. above is not met
if (seenParalell)
return failure();
Reviewer comment (Contributor):
There are two cases here. Should we turn this into a switch statement to combine this somehow?

switch (iterators.back()) {
  case utils::IteratorType::reduction: {
    // Check 3. above is met.
    if (iterators.size() != inputVectorSizes.size()) {
      LDBG("Non-trailing reduction dim requested for scalable "
           "vectorization\n");
      return failure();
    }
    break;
  }
  case utils::IteratorType::parallel: {
    // Check 1. and 2. above are met.
    if (seenParalell) {
      LDBG("Inner parallel dim requested for scalable "
           "vectorization\n");
      return failure();
    }
    break;
  }
}

WDYT? I'm open to suggestion :)


@@ -2017,7 +2031,8 @@ vectorizeScalableVectorPrecondition(Operation *op,
// presence of scalable vectors
return success(isElementwise(linalgOp) || isa<linalg::MatmulOp>(op) ||
isa<linalg::MatmulTransposeAOp>(op) ||
-                 isa<linalg::DepthwiseConv1DNwcWcOp>(op));
+                 isa<linalg::DepthwiseConv1DNwcWcOp>(op) ||
+                 hasLinalgReduction(linalgOp));
}

LogicalResult mlir::linalg::vectorizeOpPrecondition(
82 changes: 82 additions & 0 deletions mlir/test/Dialect/Linalg/vectorization-scalable.mlir
@@ -189,3 +189,85 @@ module attributes {transform.with_named_sequence} {
transform.yield
}
}

// -----

func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
%arg1: tensor<f32>) -> tensor<f32> {
Reviewer comment (Contributor):
[nit] Indentation - please align %arg1 with %arg0. Same comment below.
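For illustration, the aligned form would be:

func.func @vectorize_dynamic_reduction_scalable_1d(%arg0: tensor<?xf32>,
                                                   %arg1: tensor<f32>) -> tensor<f32> {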


%0 = linalg.reduce ins(%arg0 : tensor<?xf32>) outs(%arg1 : tensor<f32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}
return %0 : tensor<f32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_1d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?xf32>, %[[ARG_1:.*]]: tensor<f32>) -> tensor<f32> {
// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?xf32>
// CHECK: %[[VAL_2:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_4:.*]] = vector.create_mask %[[VAL_1]] : vector<[4]xi1>
// CHECK: %[[VAL_5:.*]] = vector.mask %[[VAL_4]] { vector.transfer_read %[[ARG_0]][%[[VAL_2]]], %[[VAL_3]] {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32> } : vector<[4]xi1> -> vector<[4]xf32>
// CHECK: %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_7:.*]] = vector.transfer_read %[[ARG_1]][], %[[VAL_6]] : tensor<f32>, vector<f32>
// CHECK: %[[VAL_8:.*]] = vector.extractelement %[[VAL_7]][] : vector<f32>
// CHECK: %[[VAL_9:.*]] = vector.mask %[[VAL_4]] { vector.multi_reduction <add>, %[[VAL_5]], %[[VAL_8]] [0] : vector<[4]xf32> to f32 } : vector<[4]xi1> -> f32
// CHECK: %[[VAL_10:.*]] = vector.broadcast %[[VAL_9]] : f32 to vector<f32>
// CHECK: %[[VAL_11:.*]] = vector.transfer_write %[[VAL_10]], %[[ARG_1]][] : vector<f32>, tensor<f32>
Reviewer comment (Contributor):
[nit] Kind request - descriptive LIT variable names.
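For illustration, the captures could be renamed along these lines (the names below are only a suggestion, not taken from the final patch):

// CHECK: %[[MASK:.*]] = vector.create_mask %{{.*}} : vector<[4]xi1>
// CHECK: %{{.*}} = vector.mask %[[MASK]] { vector.transfer_read {{.*}} } : vector<[4]xi1> -> vector<[4]xf32>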

// CHECK: return %[[VAL_11]] : tensor<f32>
// CHECK: }
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
transform.yield
}
}

// -----

// Note: scalable version of `vectorize_dynamic_reduction` in test/Dialect/Linalg/vectorization.mlir.
func.func @vectorize_dynamic_reduction_scalable_2d(%arg0: tensor<?x?xf32>,
%arg1: tensor<?xf32>) -> tensor<?xf32> {
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
affine_map<(d0, d1) -> (d0)>],
iterator_types = ["parallel", "reduction"] }
ins(%arg0 : tensor<?x?xf32>)
outs(%arg1 : tensor<?xf32>) {
^bb(%in: f32, %out: f32) :
%0 = arith.addf %in, %out : f32
linalg.yield %0 : f32
} -> tensor<?xf32>
return %0 : tensor<?xf32>
}

// CHECK-LABEL: func.func @vectorize_dynamic_reduction_scalable_2d(
// CHECK-SAME: %[[ARG_0:.*]]: tensor<?x?xf32>, %[[ARG_1:.*]]: tensor<?xf32>) -> tensor<?xf32> {
// CHECK: %[[VAL_0:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_1:.*]] = tensor.dim %[[ARG_0]], %[[VAL_0]] : tensor<?x?xf32>
// CHECK: %[[VAL_2:.*]] = arith.constant 1 : index
// CHECK: %[[VAL_3:.*]] = tensor.dim %[[ARG_0]], %[[VAL_2]] : tensor<?x?xf32>
// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_6:.*]] = vector.create_mask %[[VAL_1]], %[[VAL_3]] : vector<1x[4]xi1>
// CHECK: %[[VAL_7:.*]] = vector.mask %[[VAL_6]] { vector.transfer_read %[[ARG_0]][%[[VAL_4]], %[[VAL_4]]], %[[VAL_5]] {in_bounds = [true, true]} : tensor<?x?xf32>, vector<1x[4]xf32> } : vector<1x[4]xi1> -> vector<1x[4]xf32>
// CHECK: %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
// CHECK: %[[VAL_9:.*]] = vector.create_mask %[[VAL_1]] : vector<1xi1>
// CHECK: %[[VAL_10:.*]] = vector.mask %[[VAL_9]] { vector.transfer_read %[[ARG_1]][%[[VAL_4]]], %[[VAL_8]] {in_bounds = [true]} : tensor<?xf32>, vector<1xf32> } : vector<1xi1> -> vector<1xf32>
// CHECK: %[[VAL_11:.*]] = vector.mask %[[VAL_6]] { vector.multi_reduction <add>, %[[VAL_7]], %[[VAL_10]] [1] : vector<1x[4]xf32> to vector<1xf32> } : vector<1x[4]xi1> -> vector<1xf32>
// CHECK: %[[VAL_12:.*]] = arith.constant 0 : index
// CHECK: %[[VAL_13:.*]] = vector.mask %[[VAL_9]] { vector.transfer_write %[[VAL_11]], %[[ARG_1]][%[[VAL_12]]] {in_bounds = [true]} : vector<1xf32>, tensor<?xf32> } : vector<1xi1> -> tensor<?xf32>
// CHECK: return %[[VAL_13]] : tensor<?xf32>
// CHECK: }

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
Reviewer comment (Contributor):
[nit] Use [4, [8]] instead for consistency:

transform.structured.vectorize %0 vector_sizes [4, [8]] : !transform.any_op

transform.yield
}
}
20 changes: 10 additions & 10 deletions mlir/test/Dialect/Linalg/vectorization-unsupported.mlir
@@ -129,35 +129,35 @@ module attributes {transform.with_named_sequence} {

// -----

-func.func @linalg_reduce_scalable(%input: tensor<?xf32>,
-                                  %acc: tensor<f32>) -> tensor<f32> {
+func.func @linalg_reduce_scalable_leading_dim(%input: tensor<?x?xf32>,
+                                              %acc: tensor<?xf32>) -> tensor<?xf32> {

// expected-error @+1 {{Attempted to vectorize, but failed}}
-  %0 = linalg.reduce ins(%input : tensor<?xf32>) outs(%acc : tensor<f32>) dimensions = [0]
+  %0 = linalg.reduce ins(%input : tensor<?x?xf32>) outs(%acc : tensor<?xf32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}
-  return %0 : tensor<f32>
+  return %0 : tensor<?xf32>
}

module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.reduce"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [[4]] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
transform.yield
}
}

// -----

-func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
-                                                 %acc: tensor<?xf32>) -> tensor<?xf32> {
+func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor<?x?xf32>,
Reviewer comment (Contributor):
[nit] I think this would read a bit better:

Suggested change:
-func.func @linalg_generic_scalable_reduction_leading_dim(%input: tensor<?x?xf32>,
+func.func @linalg_generic_reduction_scalable_leading_dim(%input: tensor<?x?xf32>,

+                                                         %acc: tensor<?xf32>) -> tensor<?xf32> {

// expected-error @+1 {{Attempted to vectorize, but failed}}
%0 = linalg.generic { indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
-                                        affine_map<(d0, d1) -> (d0)>],
-                      iterator_types = ["parallel", "reduction"] }
+                                        affine_map<(d0, d1) -> (d1)>],
+                      iterator_types = ["reduction", "parallel"] }
ins(%input : tensor<?x?xf32>)
outs(%acc : tensor<?xf32>) {
^bb(%in: f32, %out: f32) :
@@ -170,7 +170,7 @@ func.func @linalg_generic_scalable_reduction_dim(%input: tensor<?x?xf32>,
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
%0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
-    transform.structured.vectorize %0 vector_sizes [1, [4]] : !transform.any_op
+    transform.structured.vectorize %0 vector_sizes [[4], 1] : !transform.any_op
transform.yield
}
}
134 changes: 134 additions & 0 deletions mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/reduce_1d.mlir
@@ -0,0 +1,134 @@
// DEFINE: %{compile} = mlir-opt %s \
// DEFINE: -transform-interpreter -test-transform-dialect-erase-schedule \
// DEFINE: -one-shot-bufferize="bufferize-function-boundaries" -buffer-deallocation-pipeline -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
// DEFINE: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm -o %t
// DEFINE: %{entry_point} = reduce_1d_f32
// DEFINE: %{run} = %mcr_aarch64_cmd %t -e %{entry_point} -entry-point-result=void --march=aarch64 --mattr="+sve"\
// DEFINE: -shared-libs=%mlir_native_utils_lib_dir/libmlir_runner_utils%shlibext,%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext

// RUN: %{compile}

// RUN: %{run} | FileCheck %s --check-prefix=REDUCE

// REDEFINE: %{entry_point} = generic_reduce_1d_f32
// RUN: %{run} | FileCheck %s --check-prefix=GENERIC

func.func @reduce_1d_f32() {
Reviewer comment (Contributor):
Would you mind adding a test for i16 as well? Or any integer value. Just to make sure that we check both FP and integers. Thanks!
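A rough sketch of what such an integer variant might look like (i32 is used purely for illustration; the element type, expected values, and FileCheck prefix would still need to be settled, and the printMemrefI32 runner-utils helper is assumed to be available):

func.func @reduce_1d_i32() {
  // 1-D Tensor
  %N = arith.constant 1000 : index
  %c0_i32 = arith.constant 0 : i32

  // Allocate and initialise the input and output tensors
  %A_alloc = bufferization.alloc_tensor(%N) : tensor<?xi32>
  %C_alloc = bufferization.alloc_tensor() : tensor<i32>
  %c3_i32 = arith.constant 3 : i32
  %A_in = linalg.fill ins(%c3_i32 : i32) outs(%A_alloc : tensor<?xi32>) -> tensor<?xi32>
  %C_in = tensor.insert %c0_i32 into %C_alloc[] : tensor<i32>

  // Integer reduction: arith.addi instead of arith.addf
  %C_out = linalg.reduce ins(%A_in : tensor<?xi32>) outs(%C_in : tensor<i32>) dimensions = [0]
    (%in: i32, %init: i32) {
      %0 = arith.addi %in, %init : i32
      linalg.yield %0 : i32
    }

  // Expected result: 1000 * 3 = 3000
  %xi = tensor.cast %C_out : tensor<i32> to tensor<*xi32>
  call @printMemrefI32(%xi) : (tensor<*xi32>) -> ()
  return
}

// Declaration analogous to the printMemrefF32 one at the end of this file
func.func private @printMemrefI32(%ptr : tensor<*xi32>)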

// 1-D Tensor
%N = arith.constant 1000 : index
%c0_f32 = arith.constant 0.0 : f32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
%C_alloc = bufferization.alloc_tensor() : tensor<f32>

// Initialise the tensors
%pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
%C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

// Reduce
%C_out = linalg.reduce ins(%A_in : tensor<?xf32>) outs(%C_in: tensor<f32>) dimensions = [0]
(%in: f32, %init: f32) {
%0 = arith.addf %in, %init : f32
linalg.yield %0 : f32
}

// Print and verify the output
// REDUCE-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// REDUCE-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
// REDUCE-NEXT: [3141.6]

%xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

// REDUCE-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}

func.func @generic_reduce_1d_f32() {
// 1-D Tensor
%N = arith.constant 1000 : index
%c0_f32 = arith.constant 0.0 : f32

// Allocate the input and output tensors
%A_alloc = bufferization.alloc_tensor(%N) : tensor<?xf32>
%C_alloc = bufferization.alloc_tensor() : tensor<f32>

// Initialise the tensors
%pi = arith.constant 3.1416 : f32
%A_in = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor<?xf32>) -> tensor<?xf32>
%C_in = tensor.insert %c0_f32 into %C_alloc[] : tensor<f32>

// Reduce
%C_out = linalg.generic { indexing_maps = [affine_map<(d0) -> (d0)>,
affine_map<(d0) -> ()>],
iterator_types = ["reduction"] }
ins(%A_in : tensor<?xf32>)
outs(%C_in : tensor<f32>) {
^bb(%in: f32, %out: f32) :
%0 = arith.addf %in, %out : f32
linalg.yield %0 : f32
} -> tensor<f32>

// Print and verify the output
// GENERIC-LABEL: SVE: START OF TEST OUTPUT
vector.print str "SVE: START OF TEST OUTPUT\n"

// GENERIC-NEXT: Unranked Memref {{.*}} rank = 0 offset = 0 sizes = [] strides = [] data =
// GENERIC-NEXT: [3141.6]

%xf = tensor.cast %C_out : tensor<f32> to tensor<*xf32>
call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()

// GENERIC-NEXT: SVE: END OF TEST OUTPUT
vector.print str "SVE: END OF TEST OUTPUT\n"

return
}

module attributes {transform.with_named_sequence} {
// A sequence that will tile and vectorise a Reduce Op
transform.named_sequence @tile_and_vectorize_reduce(%func
: !transform.op<"func.func"> {transform.readonly}) {

// Step 0: Get a handle to the reduce Op
%reduce = transform.structured.match ops{["linalg.reduce", "linalg.generic"]} in %func
: (!transform.op<"func.func">) -> !transform.any_op

// Step 1: Tile
%tiled_reduce, %loops:1 = transform.structured.tile_using_for %reduce tile_sizes [[4]]
: (!transform.any_op) -> (!transform.any_op, !transform.any_op)

// Step 2: Vectorize
transform.structured.vectorize %tiled_reduce vector_sizes [[4]] : !transform.any_op

// Step 3: Lower vector.multi_reduction
transform.apply_patterns to %func {
transform.apply_patterns.vector.lower_masked_transfers
transform.apply_patterns.vector.lower_multi_reduction lowering_strategy = "innerreduction"
} : !transform.op<"func.func">

transform.yield
}

// A sequence that goes over all functions in this module and applies
// "tile_and_vectorize_reduce"
transform.named_sequence @__transform_main(%module: !transform.any_op {transform.readonly}) {
%funcs = transform.structured.match ops{["func.func"]} in %module
: (!transform.any_op) -> !transform.op<"func.func">

transform.foreach %funcs : !transform.op<"func.func"> {
^bb2(%func : !transform.op<"func.func">):
transform.include @tile_and_vectorize_reduce failures(propagate)
(%func) : (!transform.op<"func.func">) -> ()
}
transform.yield
}
}

func.func private @printMemrefF32(%ptr : tensor<*xf32>)