Commit 0b21371

[mlir] Support pre-existing tokens in 'gpu-async-region'
Allow gpu ops implementing the async interface to already be async when running the GpuAsyncRegionPass.

That pass threads a 'current token' through a block with ops implementing the gpu async interface. After this change, existing async ops (returning a !gpu.async.token) set the current token. Existing synchronous `gpu.wait` ops reset the current token.

Reviewed By: herhut

Differential Revision: https://reviews.llvm.org/D103396
1 parent: a115c52
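As an illustration of the new behavior, reconstructed from the test case added below (SSA value names follow the FileCheck captures in that test and are illustrative, not the pass's actual output names): a block that already contains async tokens and synchronous `gpu.wait` ops, such as

    %t0 = gpu.wait async
    %t1 = gpu.wait async [%t0]
    %0 = gpu.alloc [%t0] () : memref<7xf32>
    %t2 = gpu.dealloc async %0 : memref<7xf32>
    gpu.wait
    gpu.wait

is now rewritten to approximately

    %t0 = gpu.wait async
    %t1 = gpu.wait async [%t0, %t0]
    %m, %t2 = gpu.alloc async [%t1, %t0] () : memref<7xf32>
    %t3 = gpu.dealloc async [%t2] %m : memref<7xf32>
    gpu.wait [%t3]
    gpu.wait

The pre-existing `%t0` becomes the current token, each async op's result token replaces it in turn, and the first synchronous `gpu.wait` picks up the current token and resets it, so the trailing `gpu.wait` ends up with no dependencies.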

2 files changed: 42 additions & 12 deletions
mlir/lib/Dialect/GPU/Transforms/AsyncRegionRewriter.cpp

Lines changed: 22 additions & 12 deletions
@@ -47,6 +47,15 @@ static bool hasSideEffects(Operation *op) {
 struct GpuAsyncRegionPass::ThreadTokenCallback {
   ThreadTokenCallback(MLIRContext &context) : builder(&context) {}

+  WalkResult operator()(Block *block) {
+    for (Operation &op : make_early_inc_range(*block)) {
+      if (failed(visit(&op)))
+        return WalkResult::interrupt();
+    }
+    return WalkResult::advance();
+  }
+
+private:
   // If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
   // create a current token (unless it already exists), and 'thread' that token
   // through the `op` so that it executes asynchronously.
@@ -55,11 +64,15 @@ struct GpuAsyncRegionPass::ThreadTokenCallback {
   // host-synchronize execution. A `!gpu.async.token` will therefore only be
   // used inside of its block and GPU execution will always synchronize with
   // the host at block boundaries.
-  WalkResult operator()(Operation *op) {
+  LogicalResult visit(Operation *op) {
     if (isa<gpu::LaunchOp>(op))
       return op->emitOpError("replace with gpu.launch_func first");
-    if (isa<gpu::WaitOp>(op))
-      return op->emitOpError("unexpected pre-existing gpu.wait");
+    if (auto waitOp = llvm::dyn_cast<gpu::WaitOp>(op)) {
+      if (currentToken)
+        waitOp.addAsyncDependency(currentToken);
+      currentToken = waitOp.asyncToken();
+      return success();
+    }
     builder.setInsertionPoint(op);
     if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
       return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
@@ -71,14 +84,9 @@ struct GpuAsyncRegionPass::ThreadTokenCallback {
     return success();
   }

-private:
   // Replaces asyncOp with a clone that returns a token.
   LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
     auto *op = asyncOp.getOperation();
-    if (asyncOp.getAsyncToken())
-      // TODO: Support ops that are already async.
-      return op->emitOpError("is already async");
-
     auto tokenType = builder.getType<gpu::AsyncTokenType>();

     // If there is no current token, insert a `gpu.wait async` without
@@ -87,6 +95,11 @@ struct GpuAsyncRegionPass::ThreadTokenCallback {
       currentToken = createWaitOp(op->getLoc(), tokenType, {});
     asyncOp.addAsyncDependency(currentToken);

+    // Return early if op returns a token already.
+    currentToken = asyncOp.getAsyncToken();
+    if (currentToken)
+      return success();
+
     // Clone the op to return a token in addition to the other results.
     SmallVector<Type, 1> resultTypes;
     resultTypes.reserve(1 + op->getNumResults());
@@ -315,10 +328,7 @@ struct GpuAsyncRegionPass::SingleTokenUseCallback {
 // inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
 // execution semantics and that no GPU ops are asynchronous yet.
 void GpuAsyncRegionPass::runOnFunction() {
-  if (getFunction()
-          .getRegion()
-          .walk(ThreadTokenCallback(getContext()))
-          .wasInterrupted())
+  if (getFunction()->walk(ThreadTokenCallback(getContext())).wasInterrupted())
     return signalPassFailure();

   // Collect gpu.wait ops that we can move out of async.execute regions.

mlir/test/Dialect/GPU/async-region.mlir

Lines changed: 20 additions & 0 deletions
@@ -169,4 +169,24 @@ module attributes {gpu.container_module} {
     }
     return
   }
+
+  // CHECK-LABEL:func @existing_tokens()
+  func @existing_tokens() {
+    // CHECK: %[[t0:.*]] = gpu.wait async
+    // CHECK-NOT: [{{.*}}]
+    %t0 = gpu.wait async
+    // CHECK: %[[t1:.*]] = gpu.wait async [%[[t0]], %[[t0]]]
+    %t1 = gpu.wait async [%t0]
+    // CHECK: %[[m:.*]], %[[t2:.*]] = gpu.alloc async [%[[t1]], %[[t0]]] ()
+    %0 = gpu.alloc [%t0] () : memref<7xf32>
+    // CHECK: %[[t3:.*]] = gpu.dealloc async [%[[t2]]] %[[m]]
+    %t2 = gpu.dealloc async %0 : memref<7xf32>
+    // CHECK: gpu.wait [%[[t3]]]
+    gpu.wait
+    // CHECK: gpu.wait
+    // CHECK-NOT: async
+    // CHECK-NOT: [{{.*}}]
+    gpu.wait
+    return
+  }
 }
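For reference, the updated test can be exercised locally with something like the following (the pass flag matches the commit title; the RUN line at the top of the test file is authoritative for the exact invocation):

    mlir-opt -gpu-async-region mlir/test/Dialect/GPU/async-region.mlir | FileCheck mlir/test/Dialect/GPU/async-region.mlir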
