Skip to content

Commit ca53463

Browse files
authored
[flang][cuda] Propagate stream information to gpu.launch_func op (#135227)
Propagate the stream information carried by `cuf.kernel_launch` to the generated `gpu.launch_func` op.
1 parent ccdbd3b commit ca53463

File tree

2 files changed

+30
-0
lines changed

2 files changed

+30
-0
lines changed

flang/lib/Optimizer/Transforms/CUFOpConversion.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,8 @@ struct CUFLaunchOpConversion
879879
gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
880880
gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
881881
}
882+
if (op.getStream())
883+
gpuLaunchOp.getAsyncObjectMutable().assign(op.getStream());
882884
if (procAttr)
883885
gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr);
884886
rewriter.replaceOp(op, gpuLaunchOp);

flang/test/Fir/CUDA/cuda-launch.fir

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,3 +129,31 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
129129

130130
// CHECK-LABEL: func.func @_QQmain()
131131
// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
132+
133+
// -----
134+
135+
module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
136+
gpu.module @cuda_device_mod {
137+
gpu.func @_QMdevptrPtest() kernel {
138+
gpu.return
139+
}
140+
}
141+
func.func @_QMdevptrPtest() attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>} {
142+
return
143+
}
144+
func.func @_QQmain() {
145+
%0 = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
146+
%1:2 = hlfir.declare %0 {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
147+
%c1_i32 = arith.constant 1 : i32
148+
%c0_i32 = arith.constant 0 : i32
149+
%2 = fir.load %1#0 : !fir.ref<i64>
150+
cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c0_i32, %2 : i64>>>()
151+
return
152+
}
153+
}
154+
155+
// CHECK-LABEL: func.func @_QQmain()
156+
// CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
157+
// CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
158+
// CHECK: %[[STREAM_LOADED:.*]] = fir.load %[[DECL_STREAM]]#0 : !fir.ref<i64>
159+
// CHECK: gpu.launch_func <%[[STREAM_LOADED]] : i64> @cuda_device_mod::@_QMdevptrPtest

0 commit comments

Comments
 (0)