
[flang][cuda] Carry over the stream information to kernel launch #136217

Merged 2 commits on Apr 18, 2025
17 changes: 8 additions & 9 deletions flang-rt/lib/cuda/kernel.cpp
@@ -17,7 +17,7 @@ extern "C" {

void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
intptr_t stream, int32_t smem, void **params, void **extra) {
int64_t *stream, int32_t smem, void **params, void **extra) {
dim3 gridDim;
gridDim.x = gridX;
gridDim.y = gridY;
@@ -77,13 +77,13 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
}
cudaStream_t defaultStream = 0;
CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem,
stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
stream != nullptr ? (cudaStream_t)(*stream) : defaultStream));
}

void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
intptr_t stream, int32_t smem, void **params, void **extra) {
int64_t *stream, int32_t smem, void **params, void **extra) {
cudaLaunchConfig_t config;
config.gridDim.x = gridX;
config.gridDim.y = gridY;
@@ -141,8 +141,8 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
terminator.Crash("Too many invalid grid dimensions");
}
config.dynamicSmemBytes = smem;
if (stream != kNoAsyncId) {
config.stream = (cudaStream_t)stream;
if (stream != nullptr) {
config.stream = (cudaStream_t)(*stream);
} else {
config.stream = 0;
}
@@ -158,7 +158,7 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,

void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
void **extra) {
dim3 gridDim;
gridDim.x = gridX;
Expand Down Expand Up @@ -218,9 +218,8 @@ void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
terminator.Crash("Too many invalid grid dimensions");
}
cudaStream_t defaultStream = 0;
CUDA_REPORT_IF_ERROR(
cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, params, smem,
stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim,
params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream));
}

} // extern "C"
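
For context, a minimal sketch of the new calling convention from the host side (not part of this patch): the stream argument is now a pointer to a 64-bit stream handle instead of an `intptr_t` value guarded by the `kNoAsyncId` sentinel, and a null pointer selects the default stream. The declaration mirrors the `flang/include/flang/Runtime/CUDA/kernel.h` change below; the caller function, launch geometry, and parameter array are hypothetical placeholders.

```cpp
// Sketch only: a hypothetical host-side caller of the updated runtime entry
// point. The kernel pointer, grid/block sizes, and parameter packing are
// placeholders; the declaration follows flang/include/flang/Runtime/CUDA/kernel.h.
#include <cstdint>

extern "C" void _FortranACUFLaunchKernel(const void *kernel, intptr_t gridX,
    intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
    intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
    void **extra);

void launchExample(const void *kernel, void **params, int64_t streamHandle) {
  // Non-null stream pointer: the runtime dereferences it and launches on
  // (cudaStream_t)*stream.
  _FortranACUFLaunchKernel(kernel, /*gridX=*/1, /*gridY=*/1, /*gridZ=*/1,
                           /*blockX=*/128, /*blockY=*/1, /*blockZ=*/1,
                           &streamHandle, /*smem=*/0, params,
                           /*extra=*/nullptr);
  // Null stream pointer: the runtime falls back to the default stream (0).
  _FortranACUFLaunchKernel(kernel, 1, 1, 1, 128, 1, 1, /*stream=*/nullptr,
                           /*smem=*/0, params, /*extra=*/nullptr);
}
```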
2 changes: 1 addition & 1 deletion flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -383,7 +383,7 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
Later in the lowering this will become a no op.
}];

let arguments = (ins fir_ReferenceType:$stream);
let arguments = (ins AnyTypeOf<[fir_ReferenceType, LLVM_AnyPointer]>:$stream);

let results = (outs GPU_AsyncToken:$token);

@@ -19,9 +19,9 @@ class LLVMTypeConverter;

namespace cuf {

void populateCUFGPUToLLVMConversionPatterns(
const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
mlir::PatternBenefit benefit = 1);
void populateCUFGPUToLLVMConversionPatterns(fir::LLVMTypeConverter &converter,
mlir::RewritePatternSet &patterns,
mlir::PatternBenefit benefit = 1);

} // namespace cuf

6 changes: 3 additions & 3 deletions flang/include/flang/Runtime/CUDA/kernel.h
@@ -21,17 +21,17 @@ extern "C" {

void RTDEF(CUFLaunchKernel)(const void *kernelName, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
void **extra);

void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
intptr_t stream, int32_t smem, void **params, void **extra);
int64_t *stream, int32_t smem, void **params, void **extra);

void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
void **extra);

} // extern "C"
6 changes: 3 additions & 3 deletions flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -147,9 +147,9 @@ template <typename OpTy>
static llvm::LogicalResult checkStreamType(OpTy op) {
if (!op.getStream())
return mlir::success();
auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType());
if (!refTy.getEleTy().isInteger(64))
return op.emitOpError("stream is expected to be a i64 reference");
if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
if (!refTy.getEleTy().isInteger(64))
return op.emitOpError("stream is expected to be an i64 reference");
return mlir::success();
}

55 changes: 41 additions & 14 deletions flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp
@@ -121,7 +121,7 @@ struct GPULaunchKernelConversion
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy, ptrTy},
/*isVarArg=*/false);
auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
@@ -133,10 +133,15 @@ struct GPULaunchKernelConversion
launchKernelFuncOp.setVisibility(
mlir::SymbolTable::Visibility::Private);
}
mlir::Value stream = adaptor.getAsyncObject();
if (!stream)
stream = rewriter.create<mlir::LLVM::ConstantOp>(
loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));

mlir::Value stream = nullPtr;
if (!adaptor.getAsyncDependencies().empty()) {
if (adaptor.getAsyncDependencies().size() != 1)
return rewriter.notifyMatchFailure(
op, "Can only convert with exactly one stream dependency.");
stream = adaptor.getAsyncDependencies().front();
}

rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchClusterKernel,
mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
@@ -157,8 +162,8 @@ struct GPULaunchKernelConversion
auto funcTy = mlir::LLVM::LLVMFunctionType::get(
voidTy,
{ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
i32Ty, ptrTy, ptrTy},
llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy,
ptrTy},
/*isVarArg=*/false);
auto cufLaunchKernel =
mlir::SymbolRefAttr::get(mod.getContext(), fctName);
@@ -171,10 +176,13 @@ struct GPULaunchKernelConversion
mlir::SymbolTable::Visibility::Private);
}

mlir::Value stream = adaptor.getAsyncObject();
if (!stream)
stream = rewriter.create<mlir::LLVM::ConstantOp>(
loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
mlir::Value stream = nullPtr;
if (!adaptor.getAsyncDependencies().empty()) {
if (adaptor.getAsyncDependencies().size() != 1)
return rewriter.notifyMatchFailure(
op, "Can only convert with exactly one stream dependency.");
stream = adaptor.getAsyncDependencies().front();
}

rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
op, funcTy, cufLaunchKernel,
@@ -251,6 +259,22 @@ struct CUFSharedMemoryOpConversion
}
};

struct CUFStreamCastConversion
: public mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp> {
explicit CUFStreamCastConversion(const fir::LLVMTypeConverter &typeConverter,
mlir::PatternBenefit benefit)
: mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp>(typeConverter,
benefit) {}
using OpAdaptor = typename cuf::StreamCastOp::Adaptor;

mlir::LogicalResult
matchAndRewrite(cuf::StreamCastOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const override {
rewriter.replaceOp(op, adaptor.getStream());
return mlir::success();
}
};

class CUFGPUToLLVMConversion
: public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
public:
@@ -283,8 +307,11 @@ class CUFGPUToLLVMConversion
} // namespace

void cuf::populateCUFGPUToLLVMConversionPatterns(
const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
mlir::PatternBenefit benefit) {
patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion>(
converter, benefit);
converter.addConversion([&converter](mlir::gpu::AsyncTokenType) -> Type {
return mlir::LLVM::LLVMPointerType::get(&converter.getContext());
});
patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion,
CUFStreamCastConversion>(converter, benefit);
}
65 changes: 64 additions & 1 deletion flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir
@@ -113,7 +113,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
llvm.return
}
@@ -166,3 +166,66 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve

// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel

// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
llvm.return
}
llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
%0 = llvm.mlir.constant(1 : index) : i64
%stream = llvm.alloca %0 x i64 : (i64) -> !llvm.ptr
%1 = llvm.mlir.constant(2 : index) : i64
%2 = llvm.mlir.constant(0 : i32) : i32
%3 = llvm.mlir.constant(10 : index) : i64
%token = cuf.stream_cast %stream : !llvm.ptr
gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
llvm.return
}
gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
}

// CHECK-LABEL: llvm.func @_QQmain()
// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()

// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 ([email protected]:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
llvm.func @_QMmod1Phost_sub() {
%0 = llvm.mlir.constant(1 : i32) : i32
%one = llvm.mlir.constant(1 : i64) : i64
%1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
%stream = llvm.alloca %one x i64 : (i64) -> !llvm.ptr
%2 = llvm.mlir.constant(40 : i64) : i64
%3 = llvm.mlir.constant(16 : i32) : i32
%4 = llvm.mlir.constant(25 : i32) : i32
%5 = llvm.mlir.constant(21 : i32) : i32
%6 = llvm.mlir.constant(17 : i32) : i32
%7 = llvm.mlir.constant(1 : index) : i64
%8 = llvm.mlir.constant(27 : i32) : i32
%9 = llvm.mlir.constant(6 : i32) : i32
%10 = llvm.mlir.constant(1 : i32) : i32
%11 = llvm.mlir.constant(0 : i32) : i32
%12 = llvm.mlir.constant(10 : index) : i64
%13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
%14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
%token = cuf.stream_cast %stream : !llvm.ptr
gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
llvm.return
}
llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
%0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
llvm.return %0 : !llvm.array<2 x i8>
}
llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
}

// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()
2 changes: 1 addition & 1 deletion flang/test/Fir/CUDA/cuda-launch.fir
@@ -154,5 +154,5 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
// CHECK-LABEL: func.func @_QQmain()
// CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
// CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : <i64>
// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref<i64>
// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
2 changes: 1 addition & 1 deletion flang/test/Fir/CUDA/cuda-stream.mlir
@@ -17,5 +17,5 @@ module attributes {gpu.container_module} {

// CHECK-LABEL: func.func @_QMmod1Phost_sub()
// CHECK: %[[STREAM:.*]] = fir.alloca i64
// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : <i64>
// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : !fir.ref<i64>
// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMmod1Psub1