Skip to content

Commit bc29fc9

Browse files
[MLIR] Create GPU utils library & move distribution utils (#119264)
Continue the move of `warp_execute_on_lane_0` op to the gpu dialect (#116994). This patch creates a utils library in GPU and moves generic helper functions there.
1 parent d6cc140 commit bc29fc9

File tree

13 files changed

+302
-223
lines changed

13 files changed

+302
-223
lines changed

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
99
#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
1010

11-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
11+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
1212
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
1313
#include "mlir/IR/Builders.h"
1414
#include "mlir/IR/Types.h"

mlir/include/mlir/Dialect/GPU/Transforms/Passes.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
1414
#define MLIR_DIALECT_GPU_TRANSFORMS_PASSES_H_
1515

16-
#include "Utils.h"
1716
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
17+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
1818
#include "mlir/IR/PatternMatch.h"
1919
#include "mlir/Pass/Pass.h"
2020
#include <optional>
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
//===- DistributionUtils.h - Distribution Utilities -------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
10+
#define MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_
11+
12+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
13+
#include "mlir/Dialect/Arith/IR/Arith.h"
14+
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
15+
#include "mlir/IR/PatternMatch.h"
16+
#include "mlir/IR/Value.h"
17+
18+
namespace mlir::gpu {
19+
/// Abstract base for rewrite patterns rooted at `WarpExecuteOnLane0Op`.
/// Derived patterns must implement `matchAndRewrite`; the protected helpers
/// below cover the common tasks of locating a yielded value of interest,
/// rebuilding the warp op with a different set of results, and splitting a
/// lane id into per-dimension ids.
struct WarpDistributionPattern : OpRewritePattern<WarpExecuteOnLane0Op> {
20+
using OpRewritePattern::OpRewritePattern;
21+
// Alias for this class; presumably so that derived patterns can inherit the
// constructors with `using Base::Base;` -- confirm against the users.
using Base = WarpDistributionPattern;
22+
23+
/// Kept pure virtual: this class only provides helpers and is not a usable
/// pattern on its own.
virtual LogicalResult
24+
matchAndRewrite(WarpExecuteOnLane0Op op,
25+
PatternRewriter &rewriter) const override = 0;
26+
27+
protected:
28+
/// Return the first operand of the `gpu.yield` terminating `warpOp` whose
29+
/// defining op satisfies the filter lambda `fn` and whose matching `warpOp`
/// result is not dead (has at least one use). Yielded values without a
/// defining op are skipped. Returns nullptr if no operand qualifies.
30+
OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
31+
llvm::function_ref<bool(Operation *)> fn) const;
32+
33+
/// Helper to create a new WarpExecuteOnLane0Op with a different signature.
/// The new op is created right before `warpOp`, takes over its body region,
/// has result types `newReturnTypes`, and yields `newYieldedValues`. `warpOp`
/// itself is neither erased nor replaced by this helper.
34+
WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
35+
RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
36+
ValueRange newYieldedValues, TypeRange newReturnTypes) const;
37+
38+
/// Helper to create a new WarpExecuteOnLane0Op region with extra outputs.
39+
/// For each entry of `newYieldedValues`, in order, the position of that value
/// in the new op's yield is appended to `indices`; a value that is already
/// yielded reuses its existing position instead of adding a new output.
/// `warpOp` is replaced by the leading results of the new op.
40+
WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
41+
RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
42+
ValueRange newYieldedValues, TypeRange newReturnTypes,
43+
SmallVector<size_t> &indices) const;
44+
45+
/// Delinearize the given `laneId` into multiple dimensions, where each
46+
/// dimension's size is determined by `originalShape` and `distributedShape`
47+
/// together. This function expects the total number of threads needed for
48+
/// distribution to be equal to `warpSize`. Returns true and updates
49+
/// `delinearizedIds` if so; returns false otherwise. When both shapes are
/// equal, `delinearizedIds` is cleared and true is returned.
50+
bool delinearizeLaneId(OpBuilder &builder, Location loc,
51+
ArrayRef<int64_t> originalShape,
52+
ArrayRef<int64_t> distributedShape, int64_t warpSize,
53+
Value laneId,
54+
SmallVectorImpl<Value> &delinearizedIds) const;
55+
};
56+
57+
} // namespace mlir::gpu
58+
59+
#endif // MLIR_DIALECT_GPU_TRANSFORMS_DISTRIBUTIONUTILS_H_

mlir/lib/Dialect/GPU/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ add_mlir_dialect_library(MLIRGPUTransforms
4040
Transforms/ShuffleRewriter.cpp
4141
Transforms/SPIRVAttachTarget.cpp
4242
Transforms/SubgroupReduceLowering.cpp
43-
Transforms/Utils.cpp
4443

4544
OBJECT
4645

@@ -59,6 +58,7 @@ add_mlir_dialect_library(MLIRGPUTransforms
5958
MLIRDataLayoutInterfaces
6059
MLIRExecutionEngineUtils
6160
MLIRGPUDialect
61+
MLIRGPUUtils
6262
MLIRIR
6363
MLIRIndexDialect
6464
MLIRLLVMDialect
@@ -76,3 +76,4 @@ add_mlir_dialect_library(MLIRGPUTransforms
7676

7777
add_subdirectory(TransformOps)
7878
add_subdirectory(Pipelines)
79+
add_subdirectory(Utils)

mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include "mlir/Dialect/Async/IR/Async.h"
1717
#include "mlir/Dialect/Func/IR/FuncOps.h"
1818
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
19-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
19+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
2020
#include "mlir/IR/Builders.h"
2121
#include "mlir/IR/IRMapping.h"
2222
#include "mlir/IR/PatternMatch.h"

mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#include "mlir/Dialect/DLTI/DLTI.h"
1919
#include "mlir/Dialect/Func/IR/FuncOps.h"
2020
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
21-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
21+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
2222
#include "mlir/Dialect/MemRef/IR/MemRef.h"
2323
#include "mlir/IR/Builders.h"
2424
#include "mlir/IR/BuiltinAttributes.h"

mlir/lib/Dialect/GPU/Transforms/SubgroupReduceLowering.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#include "mlir/Dialect/Arith/IR/Arith.h"
1414
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
1515
#include "mlir/Dialect/GPU/Transforms/Passes.h"
16-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
16+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
1717
#include "mlir/Dialect/Vector/IR/VectorOps.h"
1818
#include "mlir/IR/BuiltinTypes.h"
1919
#include "mlir/IR/Location.h"
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Utility library for the GPU dialect: the generic helpers moved out of
# GPU/Transforms (Utils.cpp) plus the warp distribution helpers
# (DistributionUtils.cpp).
add_mlir_dialect_library(MLIRGPUUtils
2+
Utils.cpp
3+
DistributionUtils.cpp
4+
5+
ADDITIONAL_HEADER_DIRS
6+
${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU/Utils
7+
8+
# DistributionUtils.cpp creates arith and affine ops, hence those dialects.
LINK_LIBS PUBLIC
9+
MLIRArithDialect
10+
MLIRAffineDialect
11+
MLIRGPUDialect
12+
MLIRSupport
13+
MLIRIR
14+
)
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
//===- DistributionUtils.cpp - Distribution tools for GPUOps --------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This file implements distribution utility methods.
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
14+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
15+
#include "mlir/Dialect/Arith/IR/Arith.h"
16+
#include "mlir/IR/Value.h"
17+
18+
#include <numeric>
19+
20+
using namespace mlir;
21+
using namespace mlir::gpu;
22+
23+
/// Builds a replacement for `warpOp` whose result types are `newReturnTypes`.
/// The replacement is created immediately before `warpOp`, takes over
/// `warpOp`'s body region, and has its terminating `gpu.yield` rewritten to
/// yield `newYieldedValues`. `warpOp` itself is neither erased nor replaced
/// here; that is left to the caller.
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndReplaceReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes) const {
  // The guard restores the caller's insertion point when we return.
  OpBuilder::InsertionGuard guard(rewriter);
  rewriter.setInsertionPoint(warpOp);
  WarpExecuteOnLane0Op replacement = rewriter.create<WarpExecuteOnLane0Op>(
      warpOp.getLoc(), newReturnTypes, warpOp.getLaneid(), warpOp.getWarpSize(),
      warpOp.getArgs(), warpOp.getBody()->getArgumentTypes());

  // Splice the original body in front of the block the builder created for
  // the replacement, then drop that now-superfluous block.
  Region &oldRegion = warpOp.getBodyRegion();
  Region &newRegion = replacement.getBodyRegion();
  Block *placeholder = &newRegion.front();
  rewriter.inlineRegionBefore(oldRegion, newRegion, newRegion.begin());
  rewriter.eraseBlock(placeholder);
  assert(replacement.getWarpRegion().hasOneBlock() &&
         "expected WarpOp with single block");

  // Point the moved terminator at the requested set of yielded values.
  auto terminator = cast<gpu::YieldOp>(newRegion.front().getTerminator());
  rewriter.modifyOpInPlace(terminator, [&]() {
    terminator.getValuesMutable().assign(newYieldedValues);
  });
  return replacement;
}
49+
50+
/// Rebuilds `warpOp` so that it additionally yields `newYieldedValues` (with
/// result types `newReturnTypes`). For every new value, in order, the position
/// of that value in the rebuilt yield is appended to `indices`; a value that
/// is already yielded reuses its existing position rather than creating a new
/// output. `warpOp` is replaced by the leading results of the new op, which is
/// returned.
WarpExecuteOnLane0Op
WarpDistributionPattern::moveRegionToNewWarpOpAndAppendReturns(
    RewriterBase &rewriter, WarpExecuteOnLane0Op warpOp,
    ValueRange newYieldedValues, TypeRange newReturnTypes,
    SmallVector<size_t> &indices) const {
  auto oldYield =
      cast<gpu::YieldOp>(warpOp.getBodyRegion().front().getTerminator());
  // Seed the yielded set and the result type list from the existing op.
  llvm::SmallSetVector<Value, 32> yielded(oldYield.getOperands().begin(),
                                          oldYield.getOperands().end());
  SmallVector<Type> resultTypes(warpOp.getResultTypes().begin(),
                                warpOp.getResultTypes().end());

  for (auto [newValue, newType] :
       llvm::zip_equal(newYieldedValues, newReturnTypes)) {
    if (!yielded.insert(newValue)) {
      // The value already exits the region: report where it already lives
      // instead of creating a new output.
      for (auto [position, existing] :
           llvm::enumerate(yielded.getArrayRef())) {
        if (existing == newValue) {
          indices.push_back(position);
          break;
        }
      }
      continue;
    }
    // Freshly inserted: it now occupies the last slot of the yielded set.
    resultTypes.push_back(newType);
    indices.push_back(yielded.size() - 1);
  }
  yielded.insert(newYieldedValues.begin(), newYieldedValues.end());

  WarpExecuteOnLane0Op rebuilt = moveRegionToNewWarpOpAndReplaceReturns(
      rewriter, warpOp, yielded.getArrayRef(), resultTypes);
  // Only the leading results correspond to the results of the old op.
  rewriter.replaceOp(warpOp,
                     rebuilt.getResults().take_front(warpOp.getNumResults()));
  return rebuilt;
}
83+
84+
/// Scans the operands of the `gpu.yield` terminating `warpOp` and returns the
/// first one whose defining op is accepted by `fn` and whose corresponding
/// `warpOp` result still has uses. Returns nullptr when no operand qualifies.
OpOperand *WarpDistributionPattern::getWarpResult(
    WarpExecuteOnLane0Op warpOp,
    llvm::function_ref<bool(Operation *)> fn) const {
  auto terminator =
      cast<gpu::YieldOp>(warpOp.getBodyRegion().front().getTerminator());
  for (OpOperand &candidate : terminator->getOpOperands()) {
    // Values without a defining op are never handed to the filter.
    Operation *producer = candidate.get().getDefiningOp();
    if (!producer || !fn(producer))
      continue;
    // Skip values whose matching warp op result is dead.
    if (warpOp.getResult(candidate.getOperandNumber()).use_empty())
      continue;
    return &candidate;
  }
  return nullptr;
}
99+
100+
/// Splits `laneId` into one id per dimension. The extent of dimension `i` is
/// `originalShape[i] / distributedShape[i]`.
///
/// Returns false, leaving `delinearizedIds` untouched, if some distributed
/// dimension is zero or does not evenly divide the original one, or if the
/// product of the per-dimension extents differs from `warpSize`. Otherwise
/// returns true with `delinearizedIds` filled in; it is left empty when both
/// shapes are identical.
bool WarpDistributionPattern::delinearizeLaneId(
    OpBuilder &builder, Location loc, ArrayRef<int64_t> originalShape,
    ArrayRef<int64_t> distributedShape, int64_t warpSize, Value laneId,
    SmallVectorImpl<Value> &delinearizedIds) const {
  // If the original shape and the distributed shape are the same, we don't
  // distribute at all--every thread is handling the whole. In that case we
  // should not rely on lane IDs later, so just return an empty lane ID vector.
  if (originalShape == distributedShape) {
    delinearizedIds.clear();
    return true;
  }

  SmallVector<int64_t> sizes;
  // Accumulate the thread count in int64_t: seeding the product with a plain
  // `int` would truncate it to `int` at every step.
  int64_t requiredThreads = 1;
  for (auto [large, small] : llvm::zip_equal(originalShape, distributedShape)) {
    // A zero distributed dimension can never divide the original one; reject
    // it up front instead of evaluating `large % 0`.
    if (small == 0 || large % small != 0)
      return false;
    sizes.push_back(large / small);
    requiredThreads *= sizes.back();
  }
  if (requiredThreads != warpSize)
    return false;

  AffineExpr s0;
  bindSymbols(builder.getContext(), s0);

  int64_t usedThreads = 1;

  // Dimensions that are not reached before the threads run out keep id zero.
  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
  delinearizedIds.assign(sizes.size(), zero);

  // Peel off dimensions starting from the last one.
  for (int i = sizes.size() - 1; i >= 0; --i) {
    usedThreads *= sizes[i];
    if (usedThreads == warpSize) {
      // We've used up all available threads. No need to perform the modulo
      // anymore, and we can stop the calculation for further dimensions.
      delinearizedIds[i] = laneId;
      break;
    }
    delinearizedIds[i] =
        affine::makeComposedAffineApply(builder, loc, s0 % sizes[i], {laneId});
    laneId = affine::makeComposedAffineApply(
        builder, loc, s0.floorDiv(usedThreads), {laneId});
  }
  return true;
}

mlir/lib/Dialect/GPU/Transforms/Utils.cpp renamed to mlir/lib/Dialect/GPU/Utils/Utils.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13-
#include "mlir/Dialect/GPU/Transforms/Utils.h"
13+
#include "mlir/Dialect/GPU/Utils/GPUUtils.h"
1414
#include "llvm/Support/ErrorHandling.h"
1515

1616
namespace mlir::gpu {

mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ add_mlir_dialect_library(MLIRVectorTransforms
3838
MLIRArithDialect
3939
MLIRDialectUtils
4040
MLIRGPUDialect
41+
MLIRGPUUtils
4142
MLIRIR
4243
MLIRLinalgDialect
4344
MLIRMemRefDialect

0 commit comments

Comments
 (0)