[flang][cuda] Update cuf.kernel_launch stream and conversion #136179
Conversation
@llvm/pr-subscribers-flang-fir-hlfir

Author: Valentin Clement (バレンタイン クレメン) (clementval)

Changes

Update `cuf.kernel_launch` to take the stream as a reference. Update the conversion to insert the `cuf.stream_cast` op so the stream can be set as a dependency.

Full diff: https://github.com/llvm/llvm-project/pull/136179.diff

6 Files Affected:
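In short, the stream argument to `cuf.kernel_launch` is now passed as a `!fir.ref<i64>` instead of a loaded `i64`, and the conversion materializes a `cuf.stream_cast` token that `gpu.launch_func` consumes as an async dependency. A minimal before/after sketch drawn from the updated tests below (the `%stream`, `%s`, and `%token` names are illustrative, not from the patch):

    // Before: the stream was loaded and passed by value.
    %s = fir.load %stream#0 : !fir.ref<i64>
    cuf.kernel_launch @_QMdevptrPtest<<<%c1, %c1, %c1, %c1, %c1, %c1, %c0, %s : i64>>>()

    // After: the stream reference is passed directly ...
    cuf.kernel_launch @_QMdevptrPtest<<<%c1, %c1, %c1, %c1, %c1, %c1, %c0, %stream#0 : !fir.ref<i64>>>>()
    // ... and the conversion casts it to a token used as an async dependency.
    %token = cuf.stream_cast %stream#0 : <i64>
    gpu.launch_func [%token] @cuda_device_mod::@_QMdevptrPtest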
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index f55f3e8a4466d..ccf9969e73a8e 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -200,7 +200,7 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
let arguments = (ins SymbolRefAttr:$callee, I32:$grid_x, I32:$grid_y,
I32:$grid_z, I32:$block_x, I32:$block_y, I32:$block_z,
- Optional<I32>:$bytes, Optional<AnyIntegerType>:$stream,
+ Optional<I32>:$bytes, Optional<fir_ReferenceType>:$stream,
Variadic<AnyType>:$args, OptionalAttr<DictArrayAttr>:$arg_attrs,
OptionalAttr<DictArrayAttr>:$res_attrs);
@@ -237,6 +237,8 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
*this, getNbNoArgOperand(), getArgs().size() - 1);
}
}];
+
+ let hasVerifier = 1;
}
def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index 31f2650917781..f28778ce6c1c9 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -589,7 +589,7 @@ Fortran::lower::genCallOpAndResult(
mlir::Value stream; // stream is optional.
if (caller.getCallDescription().chevrons().size() > 3)
- stream = fir::getBase(converter.genExprValue(
+ stream = fir::getBase(converter.genExprAddr(
caller.getCallDescription().chevrons()[3], stmtCtx));
builder.create<cuf::KernelLaunchOp>(
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index ce197d48d4860..0b0f8811e137a 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -139,6 +139,24 @@ llvm::LogicalResult cuf::DeallocateOp::verify() {
return mlir::success();
}
+//===----------------------------------------------------------------------===//
+// KernelLaunchop
+//===----------------------------------------------------------------------===//
+
+template <typename OpTy>
+static llvm::LogicalResult checkStreamType(OpTy op) {
+ if (!op.getStream())
+ return mlir::success();
+ auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType());
+ if (!refTy.getEleTy().isInteger(64))
+ return op.emitOpError("stream is expected to be a i64 reference");
+ return mlir::success();
+}
+
+llvm::LogicalResult cuf::KernelLaunchOp::verify() {
+ return checkStreamType(*this);
+}
+
//===----------------------------------------------------------------------===//
// KernelOp
//===----------------------------------------------------------------------===//
@@ -324,10 +342,7 @@ void cuf::SharedMemoryOp::build(
//===----------------------------------------------------------------------===//
llvm::LogicalResult cuf::StreamCastOp::verify() {
- auto refTy = mlir::dyn_cast<fir::ReferenceType>(getStream().getType());
- if (!refTy.getEleTy().isInteger(64))
- return emitOpError("stream is expected to be a i64 reference");
- return mlir::success();
+ return checkStreamType(*this);
}
// Tablegen operators
diff --git a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
index caa59c6c17d0f..22ffc61de4020 100644
--- a/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -879,8 +879,15 @@ struct CUFLaunchOpConversion
gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
}
- if (op.getStream())
- gpuLaunchOp.getAsyncObjectMutable().assign(op.getStream());
+ if (op.getStream()) {
+ mlir::OpBuilder::InsertionGuard guard(rewriter);
+ rewriter.setInsertionPoint(gpuLaunchOp);
+ mlir::Value stream =
+ rewriter.create<cuf::StreamCastOp>(loc, op.getStream());
+ llvm::errs() << stream << "\n";
+ gpuLaunchOp.getAsyncDependenciesMutable().append(stream);
+ llvm::errs() << gpuLaunchOp << "\n";
+ }
if (procAttr)
gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr);
rewriter.replaceOp(op, gpuLaunchOp);
@@ -933,6 +940,7 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
/*forceUnifiedTBAATree=*/false, *dl);
target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
mlir::gpu::GPUDialect>();
+ target.addLegalOp<cuf::StreamCastOp>();
cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab,
patterns);
if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
diff --git a/flang/test/Fir/CUDA/cuda-launch.fir b/flang/test/Fir/CUDA/cuda-launch.fir
index 621772efff415..319991546d3fe 100644
--- a/flang/test/Fir/CUDA/cuda-launch.fir
+++ b/flang/test/Fir/CUDA/cuda-launch.fir
@@ -146,8 +146,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
%1:2 = hlfir.declare %0 {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
%c1_i32 = arith.constant 1 : i32
%c0_i32 = arith.constant 0 : i32
- %2 = fir.load %1#0 : !fir.ref<i64>
- cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c0_i32, %2 : i64>>>()
+ cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c0_i32, %1#0 : !fir.ref<i64>>>>()
return
}
}
@@ -155,5 +154,5 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK-LABEL: func.func @_QQmain()
// CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
// CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-// CHECK: %[[STREAM_LOADED:.*]] = fir.load %[[DECL_STREAM]]#0 : !fir.ref<i64>
-// CHECK: gpu.launch_func <%[[STREAM_LOADED]] : i64> @cuda_device_mod::@_QMdevptrPtest
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : <i64>
+// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
diff --git a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
index d66d2811f7a8b..71e594e4742ec 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
@@ -45,8 +45,8 @@ contains
call dev_kernel0<<<10, 20, 2>>>()
! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>()
- call dev_kernel0<<<10, 20, 2, 0>>>()
-! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
+ call dev_kernel0<<<10, 20, 2, 0_8>>>()
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %{{.*}} : !fir.ref<i64>>>>()
call dev_kernel1<<<1, 32>>>(a)
! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}}) : (!fir.ref<f32>)
@@ -55,7 +55,7 @@ contains
! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}})
call dev_kernel1<<<*,32,0,stream>>>(a)
-! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}, %c0{{.*}}, %{{.*}} : i64>>>(%{{.*}}) : (!fir.ref<f32>)
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}, %c0{{.*}}, %{{.*}} : !fir.ref<i64>>>>(%{{.*}}) : (!fir.ref<f32>)
end
@@ -139,6 +139,24 @@ llvm::LogicalResult cuf::DeallocateOp::verify() {
   return mlir::success();
 }

+//===----------------------------------------------------------------------===//
+// KernelLaunchop
Nit: KernelLaunchOp to be consistent with other ops.
+      rewriter.setInsertionPoint(gpuLaunchOp);
+      mlir::Value stream =
+          rewriter.create<cuf::StreamCastOp>(loc, op.getStream());
+      llvm::errs() << stream << "\n";
Is this intended output?
Thanks for catching this.
+          rewriter.create<cuf::StreamCastOp>(loc, op.getStream());
+      llvm::errs() << stream << "\n";
+      gpuLaunchOp.getAsyncDependenciesMutable().append(stream);
+      llvm::errs() << gpuLaunchOp << "\n";
Same here.
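Presumably the two `llvm::errs()` debug prints get dropped in a follow-up, leaving the conversion roughly as below (a sketch based on the hunk above, not the committed code):

    if (op.getStream()) {
      mlir::OpBuilder::InsertionGuard guard(rewriter);
      rewriter.setInsertionPoint(gpuLaunchOp);
      // Cast the !fir.ref<i64> stream to a token usable as a GPU async dependency.
      mlir::Value stream =
          rewriter.create<cuf::StreamCastOp>(loc, op.getStream());
      gpuLaunchOp.getAsyncDependenciesMutable().append(stream);
    }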
…6179) Update `cuf.kernel_launch` to take the stream as a reference. Update the conversion to insert the `cuf.stream_cast` op so the stream can be set as a dependency.