
[mlir][linalg][NFC] Remove linalg subset hoisting #70636

Merged
103 changes: 0 additions & 103 deletions mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -45,109 +45,6 @@ namespace linalg {
/// when used on distributed loops with memref semantics!
void hoistRedundantVectorTransfers(func::FuncOp func);

/// Greedily hoist redundant subset extract/insert operations on tensors outside
/// of `forOp`. The logic follows:
/// 1. Look for a write walking back from the `forOp` yield.
/// 2. Check the uses of the matching block argument and look for a matching
/// read (i.e. extract_slice of transfer_read) with matching indices.
/// 3. In the case of a transfer_write, we can bypass other non-conflicting
/// operations and find more hoisting opportunities.
/// 4. Hoist the read/write pair and update the tensor SSA links.
///
/// Return the unmodified `forOp` if no hoisting occurred.
/// Return a new scf::ForOp if hoisting on tensors occurred.
///
/// After this transformation the returned scf::ForOp may have unused arguments
/// that can be removed by application of canonicalization patterns.
///
/// Example:
/// ========
/// IR Resembling:
///
/// ```
/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0)->(tensor<10xf32>) {
/// %1 = scf.for %j = %l to %u step %s iter_args(%a6 = %a0)->(tensor<10xf32>) {
/// %e = tensor.extract_slice %a6[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
/// %r = vector.transfer_read %e[%c0], %cst: tensor<?xf32>, vector<4xf32>
/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
/// %w = vector.transfer_write %u, %e[%c0] : vector<4xf32>, tensor<?xf32>
/// %st = tensor.insert_slice %w into %a6[%i][%sz][1]
/// : tensor<?xf32> into tensor<10xf32>
/// scf.yield %st: tensor<10xf32>
/// }
/// scf.yield %1: tensor<10xf32>
/// }
/// ```
///
/// Progressively hoists to:
///
/// ```
/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>){
/// %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
/// %1:2 = scf.for %j = %l to %u step %s iter_args(%a6 = a0, %a7 = %e)
/// -> (tensor<10xf32>, tensor<?xf32>) {
/// %r = vector.transfer_read %a7[%c0], %cst: tensor<?xf32>, vector<4xf32>
/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
/// %w = vector.transfer_write %u, %a7[%c0] : vector<4xf32>, tensor<?xf32>
/// scf.yield %a6, %w: tensor<10xf32>, tensor<?xf32>
/// }
/// %st = tensor.insert_slice %1#1 into %1#0[%i][%sz][1]
/// : tensor<?xf32> into tensor<10xf32>
/// scf.yield %1: tensor<10xf32>
/// }
/// ```
///
/// and
///
/// ```
/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>){
/// %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
/// %r = vector.transfer_read %a7[%c0], %cst: tensor<?xf32>, vector<4xf32>
/// %1:3 = scf.for %j = %l to %u step %s iter_args(%a6 = a0, %a7 = %e, %a7 = r)
/// -> (tensor<10xf32>, tensor<?xf32>, vector<4xf32>) {
/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
/// scf.yield %a6, %a7, %u: tensor<10xf32>, tensor<?xf32>, vector<4xf32>
/// }
/// %w = vector.transfer_write %1#2, %1#1[%c0] : vector<4xf32>, tensor<?xf32>
/// %st = tensor.insert_slice %w into %1#0[%i][%sz][1]
/// : tensor<?xf32> into tensor<10xf32>
/// scf.yield %1: tensor<10xf32>
/// }
/// ```
///
/// It can then canonicalize to:
///
/// ```
/// %0 = scf.for %i = %l to %u step %s iter_args(%a0 = %t0) -> (tensor<10xf32>){
/// %e = tensor.extract_slice %a0[%i][%sz][1]: tensor<10xf32> to tensor<?xf32>
/// %r = vector.transfer_read %a7[%c0], %cst: tensor<?xf32>, vector<4xf32>
/// %1 = scf.for %j = %l to %u step %s iter_args(%a7 = r)
/// -> (tensor<10xf32>, tensor<?xf32>, vector<4xf32>) {
/// %u = "some_use"(%r) : (vector<4xf32>) -> vector<4xf32>
/// scf.yield %u: vector<4xf32>
/// }
/// %w = vector.transfer_write %1, %e[%c0] : vector<4xf32>, tensor<?xf32>
/// %st = tensor.insert_slice %w into %a0[%i][%sz][1]
/// : tensor<?xf32> into tensor<10xf32>
/// scf.yield %1: tensor<10xf32>
/// }
/// ```
///
// TODO: This should be further generalized along a few different axes:
// - Other loops than scf.ForOp that operate on tensors (both sequential and
// parallel loops).
// - Other subset extract/insert pairs than tensor.extract/insert_slice and
// vector.transfer_read/write.
// - More general areSubsetDisjoint analysis/interface to work across all
// subset op types and allow bypassing non-WAW-conflicting operations in
// more cases.
scf::ForOp hoistRedundantSubsetExtractInsert(RewriterBase &rewriter,
scf::ForOp forOp);

/// Call into `hoistRedundantSubsetExtractInsert` without a RewriterBase.
// TODO: obsolete and should be retired
void hoistRedundantVectorTransfersOnTensor(func::FuncOp func);

} // namespace linalg
} // namespace mlir

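For context, here is a minimal sketch of the calling contract documented in the deleted header above. The helper name is illustrative; the contract (same loop returned when nothing was hoisted, otherwise a replacement loop whose unused iter_args are later dropped by canonicalization) is taken from the removed comment.

```cpp
// Illustrative sketch only; `hoistSubsetsFromLoop` is a hypothetical helper
// driving the entry point that this PR deletes.
#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;

static scf::ForOp hoistSubsetsFromLoop(RewriterBase &rewriter, scf::ForOp forOp) {
  // The same op comes back if nothing was hoisted; otherwise a new scf.for
  // (with additional iter_args) has replaced `forOp` and must be used from
  // here on.
  scf::ForOp newForOp = linalg::hoistRedundantSubsetExtractInsert(rewriter, forOp);
  // Unused iter_args left on `newForOp` can be removed afterwards by the
  // scf.for canonicalization patterns, as the removed comment notes.
  return newForOp;
}
```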
1 change: 0 additions & 1 deletion mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt
@@ -27,7 +27,6 @@ add_mlir_dialect_library(MLIRLinalgTransforms
Specialize.cpp
Split.cpp
SplitReduction.cpp
SubsetHoisting.cpp
SubsetInsertionOpInterfaceImpl.cpp
SwapExtractSliceWithFillPatterns.cpp
Tiling.cpp
5 changes: 3 additions & 2 deletions mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp
@@ -25,6 +25,7 @@
#include "mlir/IR/Dominance.h"
#include "mlir/IR/Matchers.h"
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"
#include "mlir/Transforms/RegionUtils.h"
#include "llvm/Support/Debug.h"

@@ -292,8 +293,8 @@ void HoistPaddingAnalysis::enableHoistPadding(RewriterBase &rewriter) {
// enclosing loop, try to apply hoisting on this outermost loop.
// TODO: we may want finer-grained hoisting of only that particular `sliceOp`.
if (!outermostEnclosingForOp.isDefinedOutsideOfLoop(sliceOp.getSource())) {
outermostEnclosingForOp =
hoistRedundantSubsetExtractInsert(rewriter, outermostEnclosingForOp);
outermostEnclosingForOp = cast<scf::ForOp>(
hoistLoopInvariantSubsets(rewriter, outermostEnclosingForOp));
}
}

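For callers migrating off the deleted linalg entry point, the hunk above shows the replacement: the generic `hoistLoopInvariantSubsets` utility from `mlir/Transforms/LoopInvariantCodeMotionUtils.h`, which operates on `LoopLikeOpInterface` and whose result is cast back to `scf::ForOp`. Below is a hedged sketch of the same illustrative helper rewritten against the new utility; its exact signature is assumed from how this diff uses it.

```cpp
// Sketch of the post-removal call pattern, mirroring HoistPadding.cpp above.
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Interfaces/LoopLikeInterface.h"
#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"

using namespace mlir;

static scf::ForOp hoistSubsetsFromLoop(RewriterBase &rewriter, scf::ForOp forOp) {
  // The generic utility hoists loop-invariant subset ops out of any
  // LoopLikeOpInterface; for scf.for the (possibly new) loop is cast back.
  LoopLikeOpInterface hoisted = hoistLoopInvariantSubsets(rewriter, forOp);
  return cast<scf::ForOp>(hoisted);
}
```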
9 changes: 0 additions & 9 deletions mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -43,15 +43,6 @@ using llvm::dbgs;
using namespace mlir;
using namespace mlir::linalg;

void mlir::linalg::hoistRedundantVectorTransfersOnTensor(func::FuncOp func) {
IRRewriter rewriter(func->getContext());
// TODO: walking in some reverse / inside-out order would be more efficient
// and would capture more cases.
func.walk([&](scf::ForOp forOp) {
hoistRedundantSubsetExtractInsert(rewriter, forOp);
});
}

static bool noAliasingUseInLoop(vector::TransferReadOp transferRead,
LoopLikeOpInterface loop) {
Value source = transferRead.getSource();
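The function-level wrapper removed above (`hoistRedundantVectorTransfersOnTensor`) has no direct replacement in this patch. A caller that depended on it could approximate the old behavior with its own walk over `scf.for` ops, now invoking the generic utility on each loop; a hedged sketch follows (the helper name is hypothetical, and the caveat from the removed TODO about walk order still applies).

```cpp
// Illustrative replacement for the deleted function-level wrapper; walks
// scf.for ops in the function and hoists loop-invariant subset ops out of
// each one. As the removed TODO noted, a reverse / inside-out walk order
// might capture more cases.
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"

using namespace mlir;

static void hoistSubsetsOnTensors(func::FuncOp func) {
  IRRewriter rewriter(func->getContext());
  func.walk([&](scf::ForOp forOp) {
    (void)hoistLoopInvariantSubsets(rewriter, forOp);
  });
}
```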