Commit d79bb93

[flang][cuda] Carry over the stream information to kernel launch (#136217)
In CUDA Fortran the stream is encoded in an INTEGER(cuda_stream_kind) variable. This information is carried over to the GPU dialect through the `cuf.stream_cast` operation and the async token on the GPU ops. When `gpu.launch_func` is converted to a runtime call, `cuf.stream_cast` becomes a no-op and the reference to the stream variable is passed to the runtime. The runtime is adapted to take an integer reference instead of a value for the stream.
1 parent: ba273be
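
To make the new contract concrete, here is a minimal sketch (my own illustration, not code from this patch): each launch entry point now receives the stream as an int64_t pointer, a null pointer selects the default CUDA stream, and a non-null pointer is dereferenced to obtain the cudaStream_t value stored in the Fortran stream variable. The helper name resolveStream is hypothetical.

#include <cstdint>
#include <cuda_runtime.h>

// Sketch of the stream-selection logic now used by the CUF launch entry
// points (compare with flang-rt/lib/cuda/kernel.cpp below); resolveStream
// is a hypothetical helper, not part of the runtime.
static cudaStream_t resolveStream(std::int64_t *stream) {
  cudaStream_t defaultStream = 0; // nullptr: no stream token attached, use default
  return stream != nullptr ? (cudaStream_t)(*stream) : defaultStream;
}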

9 files changed (+125, -36 lines)

flang-rt/lib/cuda/kernel.cpp

Lines changed: 8 additions & 9 deletions
@@ -17,7 +17,7 @@ extern "C" {
 
 void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    intptr_t stream, int32_t smem, void **params, void **extra) {
+    int64_t *stream, int32_t smem, void **params, void **extra) {
   dim3 gridDim;
   gridDim.x = gridX;
   gridDim.y = gridY;
@@ -77,13 +77,13 @@ void RTDEF(CUFLaunchKernel)(const void *kernel, intptr_t gridX, intptr_t gridY,
   }
   cudaStream_t defaultStream = 0;
   CUDA_REPORT_IF_ERROR(cudaLaunchKernel(kernel, gridDim, blockDim, params, smem,
-      stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
+      stream != nullptr ? (cudaStream_t)(*stream) : defaultStream));
 }
 
 void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
     intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    intptr_t stream, int32_t smem, void **params, void **extra) {
+    int64_t *stream, int32_t smem, void **params, void **extra) {
   cudaLaunchConfig_t config;
   config.gridDim.x = gridX;
   config.gridDim.y = gridY;
@@ -141,8 +141,8 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
     terminator.Crash("Too many invalid grid dimensions");
   }
   config.dynamicSmemBytes = smem;
-  if (stream != kNoAsyncId) {
-    config.stream = (cudaStream_t)stream;
+  if (stream != nullptr) {
+    config.stream = (cudaStream_t)(*stream);
   } else {
     config.stream = 0;
   }
@@ -158,7 +158,7 @@ void RTDEF(CUFLaunchClusterKernel)(const void *kernel, intptr_t clusterX,
 
 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
     void **extra) {
   dim3 gridDim;
   gridDim.x = gridX;
@@ -218,9 +218,8 @@ void RTDEF(CUFLaunchCooperativeKernel)(const void *kernel, intptr_t gridX,
     terminator.Crash("Too many invalid grid dimensions");
   }
   cudaStream_t defaultStream = 0;
-  CUDA_REPORT_IF_ERROR(
-      cudaLaunchCooperativeKernel(kernel, gridDim, blockDim, params, smem,
-          stream != kNoAsyncId ? (cudaStream_t)stream : defaultStream));
+  CUDA_REPORT_IF_ERROR(cudaLaunchCooperativeKernel(kernel, gridDim, blockDim,
+      params, smem, stream != nullptr ? (cudaStream_t)*stream : defaultStream));
 }
 
 } // extern "C"

flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td

Lines changed: 1 addition & 1 deletion
@@ -383,7 +383,7 @@ def cuf_StreamCastOp : cuf_Op<"stream_cast", [NoMemoryEffect]> {
     Later in the lowering this will become a no op.
   }];
 
-  let arguments = (ins fir_ReferenceType:$stream);
+  let arguments = (ins AnyTypeOf<[fir_ReferenceType, LLVM_AnyPointer]>:$stream);
 
   let results = (outs GPU_AsyncToken:$token);
 
flang/include/flang/Optimizer/Transforms/CUFGPUToLLVMConversion.h

Lines changed: 3 additions & 3 deletions
@@ -19,9 +19,9 @@ class LLVMTypeConverter;
 
 namespace cuf {
 
-void populateCUFGPUToLLVMConversionPatterns(
-    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
-    mlir::PatternBenefit benefit = 1);
+void populateCUFGPUToLLVMConversionPatterns(fir::LLVMTypeConverter &converter,
+                                            mlir::RewritePatternSet &patterns,
+                                            mlir::PatternBenefit benefit = 1);
 
 } // namespace cuf
 

flang/include/flang/Runtime/CUDA/kernel.h

Lines changed: 3 additions & 3 deletions
@@ -21,17 +21,17 @@ extern "C" {
 
 void RTDEF(CUFLaunchKernel)(const void *kernelName, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
     void **extra);
 
 void RTDEF(CUFLaunchClusterKernel)(const void *kernelName, intptr_t clusterX,
     intptr_t clusterY, intptr_t clusterZ, intptr_t gridX, intptr_t gridY,
     intptr_t gridZ, intptr_t blockX, intptr_t blockY, intptr_t blockZ,
-    intptr_t stream, int32_t smem, void **params, void **extra);
+    int64_t *stream, int32_t smem, void **params, void **extra);
 
 void RTDEF(CUFLaunchCooperativeKernel)(const void *kernelName, intptr_t gridX,
     intptr_t gridY, intptr_t gridZ, intptr_t blockX, intptr_t blockY,
-    intptr_t blockZ, intptr_t stream, int32_t smem, void **params,
+    intptr_t blockZ, int64_t *stream, int32_t smem, void **params,
     void **extra);
 
 } // extern "C"
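
For illustration, a hypothetical host-side call to the first entry point, assuming the usual RTDEF/RTNAME expansion to the _FortranACUFLaunchKernel symbol that the MLIR tests below check for; the launchOnStream wrapper and the grid/block values are made up, not code emitted by the compiler.

#include <cstdint>

// Declaration mirroring flang/include/flang/Runtime/CUDA/kernel.h after this
// change: the stream parameter is now int64_t* instead of intptr_t.
extern "C" void _FortranACUFLaunchKernel(const void *kernel,
    std::intptr_t gridX, std::intptr_t gridY, std::intptr_t gridZ,
    std::intptr_t blockX, std::intptr_t blockY, std::intptr_t blockZ,
    std::int64_t *stream, std::int32_t smem, void **params, void **extra);

// Hypothetical wrapper: forward the address of the INTEGER(cuda_stream_kind)
// variable so the runtime can dereference it; pass nullptr for the default
// stream.
void launchOnStream(const void *kernel, std::int64_t &streamVar, void **params) {
  _FortranACUFLaunchKernel(kernel, /*gridX=*/1, /*gridY=*/1, /*gridZ=*/1,
                           /*blockX=*/64, /*blockY=*/1, /*blockZ=*/1,
                           &streamVar, /*smem=*/0, params, /*extra=*/nullptr);
}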

flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp

Lines changed: 3 additions & 3 deletions
@@ -147,9 +147,9 @@ template <typename OpTy>
 static llvm::LogicalResult checkStreamType(OpTy op) {
   if (!op.getStream())
     return mlir::success();
-  auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType());
-  if (!refTy.getEleTy().isInteger(64))
-    return op.emitOpError("stream is expected to be a i64 reference");
+  if (auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType()))
+    if (!refTy.getEleTy().isInteger(64))
+      return op.emitOpError("stream is expected to be an i64 reference");
   return mlir::success();
 }
 
flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 41 additions & 14 deletions
@@ -121,7 +121,7 @@ struct GPULaunchKernelConversion
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
            llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, i32Ty, ptrTy, ptrTy},
+           llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy, ptrTy},
           /*isVarArg=*/false);
       auto cufLaunchClusterKernel = mlir::SymbolRefAttr::get(
           mod.getContext(), RTNAME_STRING(CUFLaunchClusterKernel));
@@ -133,10 +133,15 @@ struct GPULaunchKernelConversion
         launchKernelFuncOp.setVisibility(
             mlir::SymbolTable::Visibility::Private);
       }
-      mlir::Value stream = adaptor.getAsyncObject();
-      if (!stream)
-        stream = rewriter.create<mlir::LLVM::ConstantOp>(
-            loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
+
+      mlir::Value stream = nullPtr;
+      if (!adaptor.getAsyncDependencies().empty()) {
+        if (adaptor.getAsyncDependencies().size() != 1)
+          return rewriter.notifyMatchFailure(
+              op, "Can only convert with exactly one stream dependency.");
+        stream = adaptor.getAsyncDependencies().front();
+      }
+
       rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
           op, funcTy, cufLaunchClusterKernel,
           mlir::ValueRange{kernelPtr, adaptor.getClusterSizeX(),
@@ -157,8 +162,8 @@ struct GPULaunchKernelConversion
       auto funcTy = mlir::LLVM::LLVMFunctionType::get(
           voidTy,
           {ptrTy, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmIntPtrType,
-           i32Ty, ptrTy, ptrTy},
+           llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, ptrTy, i32Ty, ptrTy,
+           ptrTy},
           /*isVarArg=*/false);
       auto cufLaunchKernel =
           mlir::SymbolRefAttr::get(mod.getContext(), fctName);
@@ -171,10 +176,13 @@ struct GPULaunchKernelConversion
             mlir::SymbolTable::Visibility::Private);
       }
 
-      mlir::Value stream = adaptor.getAsyncObject();
-      if (!stream)
-        stream = rewriter.create<mlir::LLVM::ConstantOp>(
-            loc, llvmIntPtrType, rewriter.getIntegerAttr(llvmIntPtrType, -1));
+      mlir::Value stream = nullPtr;
+      if (!adaptor.getAsyncDependencies().empty()) {
+        if (adaptor.getAsyncDependencies().size() != 1)
+          return rewriter.notifyMatchFailure(
+              op, "Can only convert with exactly one stream dependency.");
+        stream = adaptor.getAsyncDependencies().front();
+      }
 
       rewriter.replaceOpWithNewOp<mlir::LLVM::CallOp>(
           op, funcTy, cufLaunchKernel,
@@ -251,6 +259,22 @@ struct CUFSharedMemoryOpConversion
   }
 };
 
+struct CUFStreamCastConversion
+    : public mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp> {
+  explicit CUFStreamCastConversion(const fir::LLVMTypeConverter &typeConverter,
+                                   mlir::PatternBenefit benefit)
+      : mlir::ConvertOpToLLVMPattern<cuf::StreamCastOp>(typeConverter,
+                                                        benefit) {}
+  using OpAdaptor = typename cuf::StreamCastOp::Adaptor;
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::StreamCastOp op, OpAdaptor adaptor,
+                  mlir::ConversionPatternRewriter &rewriter) const override {
+    rewriter.replaceOp(op, adaptor.getStream());
+    return mlir::success();
+  }
+};
+
 class CUFGPUToLLVMConversion
     : public fir::impl::CUFGPUToLLVMConversionBase<CUFGPUToLLVMConversion> {
 public:
@@ -283,8 +307,11 @@ class CUFGPUToLLVMConversion
 } // namespace
 
 void cuf::populateCUFGPUToLLVMConversionPatterns(
-    const fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
+    fir::LLVMTypeConverter &converter, mlir::RewritePatternSet &patterns,
     mlir::PatternBenefit benefit) {
-  patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion>(
-      converter, benefit);
+  converter.addConversion([&converter](mlir::gpu::AsyncTokenType) -> Type {
+    return mlir::LLVM::LLVMPointerType::get(&converter.getContext());
+  });
+  patterns.add<CUFSharedMemoryOpConversion, GPULaunchKernelConversion,
+               CUFStreamCastConversion>(converter, benefit);
 }

flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir

Lines changed: 64 additions & 1 deletion
@@ -113,7 +113,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
 // -----
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
-  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
+  llvm.func @_FortranACUFLaunchClusterKernel(!llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) attributes {sym_visibility = "private"}
   llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
     llvm.return
   }
@@ -166,3 +166,66 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
 
 // CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
 // CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git 4116c1370ff76adf1e58eb3c39d0a14721794c70)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Psub1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>} {
+    llvm.return
+  }
+  llvm.func @_QQmain() attributes {fir.bindc_name = "test"} {
+    %0 = llvm.mlir.constant(1 : index) : i64
+    %stream = llvm.alloca %0 x i64 : (i64) -> !llvm.ptr
+    %1 = llvm.mlir.constant(2 : index) : i64
+    %2 = llvm.mlir.constant(0 : i32) : i32
+    %3 = llvm.mlir.constant(10 : index) : i64
+    %token = cuf.stream_cast %stream : !llvm.ptr
+    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+    llvm.return
+  }
+  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QQmain()
+// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
+// CHECK: %[[KERNEL_PTR:.*]] = llvm.mlir.addressof @_QMmod1Psub1
+// CHECK: llvm.call @_FortranACUFLaunchKernel(%[[KERNEL_PTR]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (git@github.com:clementval/llvm-project.git ddcfd4d2dc17bf66cee8c3ef6284118684a2b0e6)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  llvm.func @_QMmod1Phost_sub() {
+    %0 = llvm.mlir.constant(1 : i32) : i32
+    %one = llvm.mlir.constant(1 : i64) : i64
+    %1 = llvm.alloca %0 x !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8, array<1 x array<3 x i64>>)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
+    %stream = llvm.alloca %one x i64 : (i64) -> !llvm.ptr
+    %2 = llvm.mlir.constant(40 : i64) : i64
+    %3 = llvm.mlir.constant(16 : i32) : i32
+    %4 = llvm.mlir.constant(25 : i32) : i32
+    %5 = llvm.mlir.constant(21 : i32) : i32
+    %6 = llvm.mlir.constant(17 : i32) : i32
+    %7 = llvm.mlir.constant(1 : index) : i64
+    %8 = llvm.mlir.constant(27 : i32) : i32
+    %9 = llvm.mlir.constant(6 : i32) : i32
+    %10 = llvm.mlir.constant(1 : i32) : i32
+    %11 = llvm.mlir.constant(0 : i32) : i32
+    %12 = llvm.mlir.constant(10 : index) : i64
+    %13 = llvm.mlir.addressof @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5 : !llvm.ptr
+    %14 = llvm.call @_FortranACUFMemAlloc(%2, %11, %13, %6) : (i64, i32, !llvm.ptr, i32) -> !llvm.ptr
+    %token = cuf.stream_cast %stream : !llvm.ptr
+    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
+    llvm.return
+  }
+  llvm.func @_QMmod1Psub1(!llvm.ptr) -> ()
+  llvm.mlir.global linkonce constant @_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5() {addr_space = 0 : i32} : !llvm.array<2 x i8> {
+    %0 = llvm.mlir.constant("a\00") : !llvm.array<2 x i8>
+    llvm.return %0 : !llvm.array<2 x i8>
+  }
+  llvm.func @_FortranACUFMemAlloc(i64, i32, !llvm.ptr, i32) -> !llvm.ptr attributes {fir.runtime, sym_visibility = "private"}
+  llvm.func @_FortranACUFMemFree(!llvm.ptr, i32, !llvm.ptr, i32) -> !llvm.struct<()> attributes {fir.runtime, sym_visibility = "private"}
+  gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]
+}
+
+// CHECK-LABEL: llvm.func @_QMmod1Phost_sub()
+// CHECK: %[[STREAM:.*]] = llvm.alloca %{{.*}} x i64 : (i64) -> !llvm.ptr
+// CHECK: llvm.call @_FortranACUFLaunchCooperativeKernel(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[STREAM]], %{{.*}}, %{{.*}}, %{{.*}}) : (!llvm.ptr, i64, i64, i64, i64, i64, i64, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr) -> ()

flang/test/Fir/CUDA/cuda-launch.fir

Lines changed: 1 addition & 1 deletion
@@ -154,5 +154,5 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK-LABEL: func.func @_QQmain()
 // CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
 // CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : <i64>
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : !fir.ref<i64>
 // CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest

flang/test/Fir/CUDA/cuda-stream.mlir

Lines changed: 1 addition & 1 deletion
@@ -17,5 +17,5 @@ module attributes {gpu.container_module} {
 
 // CHECK-LABEL: func.func @_QMmod1Phost_sub()
 // CHECK: %[[STREAM:.*]] = fir.alloca i64
-// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : <i64>
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[STREAM]] : !fir.ref<i64>
 // CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMmod1Psub1
