
Commit d08e980

[flang][cuda] Only convert launch from CUDA Fortran kernels (llvm#136221)
Make sure `gpu.launch_func` operations lowered from CUDA Fortran carry a CUDA proc attribute, and update the conversion pattern to convert only launches that have the attribute.
1 parent 5739a22 commit d08e980

File tree

4 files changed: +27 -9 lines changed

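In brief: `CUFLaunchOpConversion` now always tags the `gpu.launch_func` it creates with a `cuf.proc_attr` attribute, and the GPU-to-LLVM conversion fires only on launches that carry it. A minimal sketch of the two cases, with hypothetical kernel names and SSA values (the attribute syntax matches the updated tests below):

    // Tagged by CUF lowering: illegal for the pass, so it gets converted.
    gpu.launch_func @cuda_device_mod::@_QPkernel
        blocks in (%b, %b, %b) threads in (%t, %t, %t) : i64
        dynamic_shared_memory_size %c0_i32
        {cuf.proc_attr = #cuf.cuda_proc<global>}

    // No cuf.proc_attr (e.g. produced by another pipeline): left untouched.
    gpu.launch_func @other_mod::@kernel
        blocks in (%b, %b, %b) threads in (%t, %t, %t) : i64
        dynamic_shared_memory_size %c0_i32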

flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 14 additions & 1 deletion
@@ -82,6 +82,11 @@ struct GPULaunchKernelConversion
   mlir::LogicalResult
   matchAndRewrite(mlir::gpu::LaunchFuncOp op, OpAdaptor adaptor,
                   mlir::ConversionPatternRewriter &rewriter) const override {
+    // Only convert gpu.launch_func for CUDA Fortran.
+    if (!op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+            cuf::getProcAttrName()))
+      return mlir::failure();
+
     mlir::Location loc = op.getLoc();
     auto *ctx = rewriter.getContext();
     mlir::ModuleOp mod = op->getParentOfType<mlir::ModuleOp>();

@@ -293,7 +298,15 @@ class CUFGPUToLLVMConversion
     fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                          /*forceUnifiedTBAATree=*/false, *dl);
     cuf::populateCUFGPUToLLVMConversionPatterns(typeConverter, patterns);
-    target.addIllegalOp<mlir::gpu::LaunchFuncOp>();
+
+    target.addDynamicallyLegalOp<mlir::gpu::LaunchFuncOp>(
+        [&](mlir::gpu::LaunchFuncOp op) {
+          if (op.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+                  cuf::getProcAttrName()))
+            return false;
+          return true;
+        });
+
     target.addIllegalOp<cuf::SharedMemoryOp>();
     target.addLegalDialect<mlir::LLVM::LLVMDialect>();
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
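The design point in the second hunk: under `applyPartialConversion`, a blanket `addIllegalOp<gpu::LaunchFuncOp>()` would make the whole pass fail whenever the module contains a launch the pattern now refuses to rewrite. Marking the op dynamically legal, keyed on the same attribute, keeps the two sides consistent: tagged launches must be converted, untagged ones are simply out of scope. A hedged sketch of IR that now passes through this pass unchanged (names and operands are illustrative, not from the tests):

    // No cuf.proc_attr: the legality callback returns true, the pattern
    // bails out with failure(), and partial conversion leaves the op as-is.
    gpu.launch_func @non_cuf_mod::@kernel
        blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) : i64
        dynamic_shared_memory_size %c0_i32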

flang/lib/Optimizer/Transforms/CUFOpConversion.cpp

Lines changed: 5 additions & 0 deletions
@@ -888,6 +888,11 @@ struct CUFLaunchOpConversion
     }
     if (procAttr)
       gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr);
+    else
+      // Set the default global attribute if the original was not found.
+      gpuLaunchOp->setAttr(cuf::getProcAttrName(),
+                           cuf::ProcAttributeAttr::get(
+                               op.getContext(), cuf::ProcAttribute::Global));
     rewriter.replaceOp(op, gpuLaunchOp);
     return mlir::success();
   }
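Net effect of this fallback: even a `cuf.kernel_launch` whose callee carried no proc attribute produces a `gpu.launch_func` tagged `global`, so the legality check in the previous file still classifies it as CUDA Fortran. A sketch of the rewrite, abbreviated from the cuda-launch.fir test below (the post-conversion SSA names are illustrative):

    // Input, as in flang/test/Fir/CUDA/cuda-launch.fir:
    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()

    // After CUFLaunchOpConversion: the default global attribute is attached.
    gpu.launch_func @cuda_device_mod::@_QPsub_device1
        blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)
        dynamic_shared_memory_size %c0_i32
        {cuf.proc_attr = #cuf.cuda_proc<global>}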

flang/test/Fir/CUDA/cuda-gpu-launch-func.mlir

Lines changed: 3 additions & 3 deletions
@@ -54,7 +54,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i1, dense<8> : ve
     llvm.br ^bb1(%44 : i64)
   ^bb3:  // pred: ^bb1
     %45 = llvm.call @_FortranACUFDataTransferPtrPtr(%14, %25, %2, %11, %13, %5) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
-    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr)
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 blocks in (%7, %7, %7) threads in (%12, %7, %7) : i64 dynamic_shared_memory_size %11 args(%14 : !llvm.ptr) {cuf.proc_attr = #cuf.cuda_proc<global>}
     %46 = llvm.call @_FortranACUFDataTransferPtrPtr(%25, %14, %2, %10, %13, %4) : (!llvm.ptr, !llvm.ptr, i64, i32, !llvm.ptr, i32) -> !llvm.struct<()>
     %47 = llvm.call @_FortranAioBeginExternalListOutput(%9, %13, %8) {fastmathFlags = #llvm.fastmath<contract>} : (i32, !llvm.ptr, i32) -> !llvm.ptr
     %48 = llvm.mlir.constant(9 : i32) : i32

@@ -122,7 +122,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d
     %1 = llvm.mlir.constant(2 : index) : i64
     %2 = llvm.mlir.constant(0 : i32) : i32
     %3 = llvm.mlir.constant(10 : index) : i64
-    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+    gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%1, %1, %0) blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 {cuf.proc_attr = #cuf.cuda_proc<global>}
     llvm.return
   }
   gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]

@@ -180,7 +180,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr<272>, d
     %2 = llvm.mlir.constant(0 : i32) : i32
     %3 = llvm.mlir.constant(10 : index) : i64
     %token = cuf.stream_cast %stream : !llvm.ptr
-    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2
+    gpu.launch_func [%token] @cuda_device_mod::@_QMmod1Psub1 blocks in (%3, %3, %0) threads in (%3, %3, %0) : i64 dynamic_shared_memory_size %2 {cuf.proc_attr = #cuf.cuda_proc<global>}
     llvm.return
   }
   gpu.binary @cuda_device_mod [#gpu.object<#nvvm.target, "">]

flang/test/Fir/CUDA/cuda-launch.fir

Lines changed: 5 additions & 5 deletions
@@ -26,13 +26,13 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
   %c1024_i32 = arith.constant 1024 : i32
   %c6_i32 = arith.constant 6 : i32
   %c1_i32 = arith.constant 1 : i32
-  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}}
+  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} {cuf.proc_attr = #cuf.cuda_proc<global>}
   cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()

-  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c1024{{.*}}
+  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c1024{{.*}} {cuf.proc_attr = #cuf.cuda_proc<global>}
   cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1024_i32>>>()

-  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
+  // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>) {cuf.proc_attr = #cuf.cuda_proc<global>}
   cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
   return
 }

@@ -64,7 +64,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 }

 // CHECK-LABEL: func.func @_QMmod1Phost_sub()
-// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}}) {cuf.proc_attr = #cuf.cuda_proc<global>}

 // -----

@@ -107,7 +107,7 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
 // CHECK: %[[CONV_ADDR:.*]] = fir.convert %[[ADDROF]] : (!fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) -> !fir.llvm_ptr<i8>
 // CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
 // CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
-// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {cuf.proc_attr = #cuf.cuda_proc<global>}

 // -----