Skip to content

Commit 6811a3b

Browse files
authored
[flang][cuda] Allocate extra descriptor in managed memory when it is coming from device (#140818)
1 parent 7b51339 commit 6811a3b

File tree

2 files changed

+23
-3
lines changed

2 files changed

+23
-3
lines changed

flang/lib/Optimizer/CodeGen/CodeGen.cpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1830,7 +1830,9 @@ static bool isDeviceAllocation(mlir::Value val, mlir::Value adaptorVal) {
18301830
(callOp.getCallee().value().getRootReference().getValue().starts_with(
18311831
RTNAME_STRING(CUFMemAlloc)) ||
18321832
callOp.getCallee().value().getRootReference().getValue().starts_with(
1833-
RTNAME_STRING(CUFAllocDescriptor))))
1833+
RTNAME_STRING(CUFAllocDescriptor)) ||
1834+
callOp.getCallee().value().getRootReference().getValue() ==
1835+
"__tgt_acc_get_deviceptr"))
18341836
return true;
18351837
return false;
18361838
}
@@ -3253,8 +3255,9 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
32533255
if (auto callOp = mlir::dyn_cast_or_null<mlir::LLVM::CallOp>(
32543256
inputBoxStorage.getDefiningOp())) {
32553257
if (callOp.getCallee() &&
3256-
(*callOp.getCallee())
3257-
.starts_with(RTNAME_STRING(CUFAllocDescriptor))) {
3258+
((*callOp.getCallee())
3259+
.starts_with(RTNAME_STRING(CUFAllocDescriptor)) ||
3260+
(*callOp.getCallee()).starts_with("__tgt_acc_get_deviceptr"))) {
32583261
// CUDA Fortran local descriptors are allocated in managed memory. So
32593262
// new storage must be allocated the same way.
32603263
auto mod = load->getParentOfType<mlir::ModuleOp>();

flang/test/Fir/CUDA/cuda-code-gen.mlir

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,20 @@ func.func @_QMm1Psub1(%arg0: !fir.box<!fir.array<?xi32>> {cuf.data_attr = #cuf.c
204204
fir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
205205

206206
// CHECK: llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
207+
208+
// -----
209+
210+
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
211+
func.func @_QQmain() attributes {fir.bindc_name = "cufkernel_global"} {
212+
%c0 = arith.constant 0 : index
213+
%3 = fir.call @__tgt_acc_get_deviceptr() : () -> !fir.ref<!fir.box<none>>
214+
%4 = fir.convert %3 : (!fir.ref<!fir.box<none>>) -> !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
215+
%5 = fir.load %4 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
216+
return
217+
}
218+
219+
// CHECK-LABEL: llvm.func @_QQmain()
220+
// CHECK: llvm.call @_FortranACUFAllocDescriptor
221+
222+
func.func private @__tgt_acc_get_deviceptr() -> !fir.ref<!fir.box<none>>
223+
}

0 commit comments

Comments
 (0)