[mlir][linalg] unfold projected permutation. #114704
Merged
Commits (7)
cdf865c [mlir][linalg] unfold projected permutation. (javedabsar1)
ce58238 [mlir][linalg] fix clang format issue (javedabsar1)
b9094dc [mlir][linalg] Address review comments (javedabsar1)
3b238c6 Revise more based on review comments. (javedabsar1)
e3373b8 Revise based on 3rd round review comments (javedabsar1)
296f805 Revise based on 3rd round review comments (javedabsar1)
6f61f9a Revise based on 5th review comments (javedabsar1)
mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp (249 additions, 0 deletions)
//===- DecomposeGenericByUnfoldingPermutation.cpp -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include <map>
#include <optional>
#include <utility>

using namespace mlir;
using namespace mlir::linalg;

namespace {

/// This pattern decomposes the input operand(s) of a linalg.generic that has
/// a `transpose`, `broadcast`, or a mixture of the two, into an explicit
/// transpose and broadcast. Having them folded into the linalg.generic is a
/// good optimization, but sometimes we may want to unwrap, i.e., `unfold`,
/// them as explicit transpose and broadcast. This rewrite pattern helps do it
/// for each input operand. This is useful, for instance, when trying to
/// recognize named ops.
///
/// The transpose, broadcast, or mixture of both, is expressed in the affine
/// map of the operand. Technically, such a map is a `projected permutation`.
///
/// Example
///
/// ```mlir
///
/// #projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>
/// #identity   = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
/// ...
/// %res = linalg.generic
///    { indexing_maps = [#projection, #identity, #identity],
///      iterator_types = ["parallel", "parallel", "parallel",
///                        "parallel", "parallel"]}
///    ins(%x, %y : tensor<7x8x9xf32>, tensor<5x9x7x8x10xf32>)
///    outs(%z : tensor<5x9x7x8x10xf32>) {
///     ^bb0(%in: f32, %in_1: f32, %out: f32):
///          %div = arith.divf %in, %in_1 : f32
///          linalg.yield %div : f32
///   } -> tensor<5x9x7x8x10xf32>
/// ```
///
/// In the above IR, the map of operand `%x` is a projected permutation. It
/// can be unfolded as:
///
/// ```mlir
/// ...
/// %x_trans = linalg.transpose
///              ins(%x : tensor<7x8x9xf32>)
///              outs(%e1 : tensor<9x7x8xf32>) permutation = [2, 0, 1]
/// ...
/// %x_trans_bc = linalg.broadcast
///                 ins(%x_trans : tensor<9x7x8xf32>)
///                 outs(%e2 : tensor<5x9x7x8x10xf32>) dimensions = [0, 4]
/// %2 = linalg.div
///        ins(%x_trans_bc, %y :
///            tensor<5x9x7x8x10xf32>, tensor<5x9x7x8x10xf32>)
///        outs(%arg2 : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32>
/// ```
///
/// Note that the linalg.generic has been 'specialized' to linalg.div.
///
/// When unfolding, it is more efficient to transpose first and then
/// broadcast. However, if the transpose is done first, the permutation map
/// needs to be expressed in terms of the reduced (pre-broadcast) dimensions,
/// as the broadcast has not happened yet. Also, the broadcast dimensions of
/// a linalg.generic come from the other operands (those not broadcast along
/// that particular dimension). We work this out by computing the
/// convex-polyhedron shape of the linalg.generic iteration space from the
/// shapes of all the operands, both inputs and outputs.
///
struct DecomposeProjectedPermutation : public OpRewritePattern<GenericOp> {
  using OpRewritePattern<GenericOp>::OpRewritePattern;

  LogicalResult matchAndRewrite(GenericOp genericOp,
                                PatternRewriter &rewriter) const override;
};

/// For the given `map`, determine which dimensions are transposed and which
/// dimensions are broadcast.
/// Returns:
///   transpose-permutation, broadcast-dimensions (each empty if not needed)
///
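/// Worked example (added for illustration; it matches the tests below): for
/// `affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>` the results cover dims
/// {1, 2, 3}, so dims 0 and 4 are broadcast: broadcast = [0, 4]. Re-based to
/// the 3-d operand, (d2, d3, d1) becomes (d1, d2, d0); inverting that yields
/// the transpose permutation [2, 0, 1].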
std::pair<SmallVector<int64_t>, SmallVector<int64_t>>
computeTransposeBroadcast(AffineMap &map) {
  assert(map.isProjectedPermutation(false) && "not a projection");

  // As the map is a projection it likely operates on a smaller set of
  // dimensions as far as the transpose is concerned (rest are broadcast).
  int64_t minorSize = map.getNumResults();

  SmallVector<int64_t> minorResult;

  for (int64_t i = 0; i < minorSize; ++i) {
    auto expr = cast<AffineDimExpr>(map.getResults()[i]);
    minorResult.push_back(expr.getPosition());
  }

  // If dims are not monotonically increasing then a transpose is present.
  SmallVector<int64_t> sortedResMap(minorResult);
  std::sort(sortedResMap.begin(), sortedResMap.end());
  bool hasTranspose = !std::equal(minorResult.begin(), minorResult.end(),
                                  sortedResMap.begin(), sortedResMap.end());

  // Walk the sorted map result to determine which dimensions are broadcast.
  SmallVector<int64_t> broadcast;
  for (int64_t i = 0, j = 0; i < map.getNumInputs(); ++i) {
    if (j < minorSize && sortedResMap[j] == i) {
      j++;
      continue;
    }
    broadcast.push_back(i);
  }

  SmallVector<int64_t> permutation;
  if (hasTranspose) {
    // Consider an operand `x : tensor<7x8x9>` of a genericOp that has
    // affine map `affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>`.
    // `x`'s access is both transposed and broadcast. But when specifying
    // the `linalg.transpose(x : tensor<7x8x9>)` the dimensions need to be
    // specified as `affine_map<(d0, d1, d2) -> (d1, d2, d0)>` instead of
    // referring to d3, d4. Therefore, re-base the transpose dimensions so
    // that they start from d0.
    permutation.resize(minorSize);
    std::map<int64_t, int64_t> minorMap;
    for (int64_t i = 0; i < minorSize; ++i)
      minorMap.insert({sortedResMap[i], i});

    // Re-map the dimensions.
    SmallVector<int64_t> remappedResult(minorSize);
    for (int64_t i = 0; i < minorSize; ++i)
      remappedResult[i] = minorMap[minorResult[i]];

    // Calculate the permutation for the transpose.
    for (unsigned i = 0; i < minorSize; ++i) {
      permutation[remappedResult[i]] = i;
    }
  }
  return {permutation, broadcast};
}

LogicalResult DecomposeProjectedPermutation::matchAndRewrite(
    GenericOp op, PatternRewriter &rewriter) const {
  if (!op.hasPureTensorSemantics() || op.isSingleInputOutput() ||
      op.isSingleYieldOp() || !op.isAllParallelLoops())
    return failure();

  // If the map of an operand is not a `projected permutation` then
  // it cannot be decomposed into a mere transpose and broadcast.
  // The requirement that all maps be `projected permutation` may be
  // over-restrictive, but since we need to determine the shape of the
  // iteration space as well, reject if any map violates that assumption.
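  // For example, `affine_map<(d0, d1) -> (d0 + d1)>` is not a projected
  // permutation, so such an operand blocks the rewrite (illustrative
  // example, not from the original comments).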
  for (auto &opOperand : op->getOpOperands()) {
    auto map = op.getMatchingIndexingMap(&opOperand);
    if (!map.isProjectedPermutation(false))
      return failure();
  }

  // Decomposing a linalg.generic involves creating `tensor.empty`,
  // which can have dynamic shapes, but then we would have to work
  // out which operand can supply that runtime value (tensor.dim).
  // Leaving it as a future TODO.
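  // For instance, an operand of type `tensor<?x8x16xf32>` makes the check
  // below bail out (illustrative example).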
  if (llvm::any_of(op->getOpOperands(), [](OpOperand &oper) {
        auto opType = cast<RankedTensorType>(oper.get().getType());
        return ShapedType::isDynamicShape(opType.getShape());
      }))
    return failure();

  auto outputShape = op.getStaticLoopRanges();

  auto loc = op.getLoc();
  bool isChanged = false;
  SmallVector<Value> newInitValues = op.getDpsInputs();
  SmallVector<AffineMap> newMap = op.getIndexingMapsArray();

  // Walk over each input operand and unfold it if it is transposed,
  // broadcast, or a mix of the two, as given by the operand's affine map.
  for (int64_t i = 0; i < op.getNumDpsInputs(); ++i) {
    auto &map = newMap[i];
    auto inputRTType = cast<RankedTensorType>(newInitValues[i].getType());
    auto elType = inputRTType.getElementType();

    // Nothing to do if the map is already an identity.
    if (map.isIdentity())
      continue;

    auto [permutation, broadcastedDims] = computeTransposeBroadcast(map);

    // Does it need a transpose?
    if (!permutation.empty()) {
      // linalg.transpose permutes the dimensions of the input using the
      // rule: dim(result, i) = dim(input, permutation[i])
      SmallVector<int64_t> transposedShape(map.getNumResults());
      for (int64_t i = 0; i < map.getNumResults(); ++i)
        transposedShape[i] = inputRTType.getShape()[permutation[i]];

      Value emptyTensor =
          rewriter.create<tensor::EmptyOp>(loc, transposedShape, elType);

      auto transposeOp = rewriter.create<TransposeOp>(loc, newInitValues[i],
                                                      emptyTensor, permutation);
      newInitValues[i] = transposeOp->getResult(0);
      isChanged = true;
    }

    // Does it require a broadcast?
    if (!broadcastedDims.empty()) {
      assert(broadcastedDims.size() && "should have non-empty broadcast dims");
      Value emptyTensor = rewriter.create<tensor::EmptyOp>(
          loc, outputShape, inputRTType.getElementType());

      auto broadcastOp = rewriter.create<linalg::BroadcastOp>(
          loc, newInitValues[i], emptyTensor, broadcastedDims);

      newInitValues[i] = broadcastOp->getResult(0);
      isChanged = true;
    }
    newMap[i] = rewriter.getMultiDimIdentityMap(map.getNumDims());
  }

  if (isChanged) {
    SmallVector<Value> operands = op->getOperands();
    ValueRange operandsRef(operands);

    auto newOp = rewriter.create<linalg::GenericOp>(
        /*location=*/op.getLoc(),
        /*resultTensorTypes=*/op->getResultTypes(),
        /*inputs=*/newInitValues,
        /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()),
        /*indexingMaps=*/newMap,
        /*iteratorTypes=*/op.getIteratorTypesArray());

    newOp.getRegion().takeBody(op->getRegion(0));
    rewriter.replaceOp(op, newOp->getResults());
  }
  return success();
}

} // namespace

void mlir::linalg::populateDecomposeProjectedPermutationPatterns(
    RewritePatternSet &patterns) {
  patterns.insert<DecomposeProjectedPermutation>(patterns.getContext());
}
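For orientation, here is a minimal sketch of driving this pattern from C++. The wrapper function below is hypothetical; only `populateDecomposeProjectedPermutationPatterns` comes from this patch, and in-tree the pattern is exercised through `--linalg-specialize-generic-ops`, as the test file below shows.

```c++
// Minimal sketch (hypothetical wrapper): collect the new pattern and apply
// it greedily to a module or function.
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

static void decomposeProjectedPermutations(mlir::Operation *root) {
  mlir::RewritePatternSet patterns(root->getContext());
  mlir::linalg::populateDecomposeProjectedPermutationPatterns(patterns);
  // Rewrites matching linalg.generic ops until a fixpoint is reached.
  if (failed(mlir::applyPatternsAndFoldGreedily(root, std::move(patterns))))
    root->emitWarning("greedy pattern application did not converge");
}
```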
mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir (71 additions, 0 deletions)
// RUN: mlir-opt %s -split-input-file --linalg-specialize-generic-ops | FileCheck %s

#projection = affine_map<(d0, d1, d2, d3, d4) -> (d2, d3, d1)>
#identity = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>

func.func @transpose_and_broadcast(%x : tensor<7x8x9xf32>, %y: tensor<5x9x7x8x10xf32>, %z : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> {
  %res = linalg.generic
     { indexing_maps = [#projection, #identity, #identity], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]}
     ins(%x, %y : tensor<7x8x9xf32>, tensor<5x9x7x8x10xf32>) outs(%z : tensor<5x9x7x8x10xf32>) {
     ^bb0(%in: f32, %in_1: f32, %out: f32):
       %div = arith.divf %in, %in_1 : f32
       linalg.yield %div : f32
  } -> tensor<5x9x7x8x10xf32>
  return %res : tensor<5x9x7x8x10xf32>
}

// CHECK-LABEL: transpose_and_broadcast
// CHECK-SAME: %[[X:.+]]: tensor<7x8x9xf32>, %[[Y:.+]]: tensor<5x9x7x8x10xf32>, %[[Z:.+]]: tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32> {
// CHECK: %[[E0:.+]] = tensor.empty() : tensor<9x7x8xf32>
// CHECK: %[[X_trans:.+]] = linalg.transpose ins(%[[X]] : tensor<7x8x9xf32>) outs(%[[E0]] : tensor<9x7x8xf32>) permutation = [2, 0, 1]
// CHECK: %[[E1:.+]] = tensor.empty() : tensor<5x9x7x8x10xf32>
// CHECK: %[[X_trans_bc:.+]] = linalg.broadcast ins(%[[X_trans]] : tensor<9x7x8xf32>) outs(%[[E1]] : tensor<5x9x7x8x10xf32>) dimensions = [0, 4]
// CHECK: {{.*}} = linalg.div ins(%[[X_trans_bc]], %[[Y]] : tensor<5x9x7x8x10xf32>, tensor<5x9x7x8x10xf32>) outs(%[[Z]] : tensor<5x9x7x8x10xf32>) -> tensor<5x9x7x8x10xf32>
// CHECK-NOT: linalg.generic

// -----

#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#transposed = affine_map<(d0, d1, d2) -> (d2, d0, d1)>

func.func @transpose_only(%x : tensor<32x2x16xf32>, %y: tensor<2x16x32xf32>, %z : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
  %res = linalg.generic
     { indexing_maps = [#transposed, #identity, #identity], iterator_types = ["parallel", "parallel", "parallel"]}
     ins(%x, %y : tensor<32x2x16xf32>, tensor<2x16x32xf32>)
     outs(%z : tensor<2x16x32xf32>) {
     ^bb0(%in: f32, %in_1: f32, %out: f32):
       %div = arith.divf %in, %in_1 : f32
       linalg.yield %div : f32
  } -> tensor<2x16x32xf32>
  return %res : tensor<2x16x32xf32>
}

// CHECK-LABEL: transpose_only
// CHECK-SAME: %[[X:.+]]: tensor<32x2x16xf32>, %[[Y:.+]]: tensor<2x16x32xf32>, %[[Z:.+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
// CHECK: %[[E0:.+]] = tensor.empty() : tensor<2x16x32xf32>
// CHECK: %[[X_trans:.+]] = linalg.transpose ins(%[[X]] : tensor<32x2x16xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) permutation = [1, 2, 0]
// CHECK: {{.*}} = linalg.div ins(%[[X_trans]], %[[Y]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%[[Z]] : tensor<2x16x32xf32>) -> tensor<2x16x32xf32>
// CHECK-NOT: linalg.generic

// -----

#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#broadcast = affine_map<(d0, d1, d2) -> (d0, d2)>
func.func @broadcast_only(%x : tensor<2x16x32xf32>, %y: tensor<2x32xf32>, %z : tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
  %res = linalg.generic
     { indexing_maps = [#identity, #broadcast, #identity], iterator_types = ["parallel", "parallel", "parallel"]}
     ins(%x, %y : tensor<2x16x32xf32>, tensor<2x32xf32>)
     outs(%z : tensor<2x16x32xf32>) {
     ^bb0(%in: f32, %in_1: f32, %out: f32):
       %div = arith.divf %in, %in_1 : f32
       linalg.yield %div : f32
  } -> tensor<2x16x32xf32>
  return %res : tensor<2x16x32xf32>
}

// CHECK-LABEL: broadcast_only
// CHECK-SAME: %[[X:.+]]: tensor<2x16x32xf32>, %[[Y:.+]]: tensor<2x32xf32>, %[[Z:.+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
// CHECK: %[[E0:.+]] = tensor.empty() : tensor<2x16x32xf32>
// CHECK: %[[X_bc:.+]] = linalg.broadcast ins(%[[Y]] : tensor<2x32xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) dimensions = [1]
// CHECK: {{.*}} = linalg.div ins(%[[X]], %[[X_bc]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%arg2 : tensor<2x16x32xf32>) -> tensor<2x16x32xf32>
// CHECK-NOT: linalg.generic
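For contrast, here is a sketch (not part of the committed test file) of a case the pattern deliberately leaves alone: an operand with a dynamic shape makes the rewrite bail out, since the decomposition would need `tensor.empty` with runtime sizes.

```mlir
// Sketch only, per the dynamic-shape check in the pattern: this generic is
// NOT unfolded because %y and %z have a dynamic dimension.
#bcast = affine_map<(d0, d1) -> (d1)>
#id = affine_map<(d0, d1) -> (d0, d1)>
func.func @dynamic_not_unfolded(%x : tensor<8xf32>, %y : tensor<?x8xf32>,
                                %z : tensor<?x8xf32>) -> tensor<?x8xf32> {
  %res = linalg.generic
     { indexing_maps = [#bcast, #id, #id], iterator_types = ["parallel", "parallel"]}
     ins(%x, %y : tensor<8xf32>, tensor<?x8xf32>) outs(%z : tensor<?x8xf32>) {
     ^bb0(%in: f32, %in_1: f32, %out: f32):
       %add = arith.addf %in, %in_1 : f32
       linalg.yield %add : f32
  } -> tensor<?x8xf32>
  return %res : tensor<?x8xf32>
}
```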