Skip to content

Commit 953aa10

Browse files
authored
[flang][cuda] Lower device to host and device to device transfer (#87387)
Add more support for CUDA data transfer in assignment. This patch adds device-to-device and device-to-host support. If device symbols are present on the rhs, implicit data transfers are initiated: a temporary is created and the data is transferred to the host. The expression is then evaluated on the host and the assignment is done.
1 parent 5f9ed2f commit 953aa10

File tree

3 files changed

+182
-11
lines changed

3 files changed

+182
-11
lines changed

flang/include/flang/Evaluate/tools.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "flang/Evaluate/type.h"
2020
#include "flang/Parser/message.h"
2121
#include "flang/Semantics/attr.h"
22+
#include "flang/Semantics/scope.h"
2223
#include "flang/Semantics/symbol.h"
2324
#include <array>
2425
#include <optional>
@@ -1240,6 +1241,35 @@ inline bool HasCUDAAttrs(const Expr<SomeType> &expr) {
12401241
return false;
12411242
}
12421243

1244+
/// Check if the expression is a mix of host and device variables that require
1245+
/// implicit data transfer.
1246+
inline bool HasCUDAImplicitTransfer(const Expr<SomeType> &expr) {
1247+
unsigned hostSymbols{0};
1248+
unsigned deviceSymbols{0};
1249+
for (const Symbol &sym : CollectSymbols(expr)) {
1250+
if (const auto *details =
1251+
sym.GetUltimate().detailsIf<semantics::ObjectEntityDetails>()) {
1252+
if (details->cudaDataAttr()) {
1253+
++deviceSymbols;
1254+
} else {
1255+
if (sym.owner().IsDerivedType()) {
1256+
if (const auto *details =
1257+
sym.owner()
1258+
.GetSymbol()
1259+
->GetUltimate()
1260+
.detailsIf<semantics::ObjectEntityDetails>()) {
1261+
if (details->cudaDataAttr()) {
1262+
++deviceSymbols;
1263+
}
1264+
}
1265+
}
1266+
++hostSymbols;
1267+
}
1268+
}
1269+
}
1270+
return hostSymbols > 0 && deviceSymbols > 0;
1271+
}
1272+
12431273
} // namespace Fortran::evaluate
12441274

12451275
namespace Fortran::semantics {

flang/lib/Lower/Bridge.cpp

Lines changed: 88 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3710,16 +3710,18 @@ class FirConverter : public Fortran::lower::AbstractConverter {
37103710
return false;
37113711
}
37123712

3713-
static void genCUDADataTransfer(fir::FirOpBuilder &builder,
3714-
mlir::Location loc, bool lhsIsDevice,
3715-
hlfir::Entity &lhs, bool rhsIsDevice,
3716-
hlfir::Entity &rhs) {
3713+
void genCUDADataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
3714+
const Fortran::evaluate::Assignment &assign,
3715+
hlfir::Entity &lhs, hlfir::Entity &rhs) {
3716+
bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
3717+
bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
37173718
if (rhs.isBoxAddressOrValue() || lhs.isBoxAddressOrValue())
37183719
TODO(loc, "CUDA data transfler with descriptors");
3720+
3721+
// device = host
37193722
if (lhsIsDevice && !rhsIsDevice) {
37203723
auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
37213724
builder.getContext(), fir::CUDADataTransferKind::HostDevice);
3722-
// device = host
37233725
if (!rhs.isVariable()) {
37243726
auto associate = hlfir::genAssociateExpr(
37253727
loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
@@ -3732,7 +3734,73 @@ class FirConverter : public Fortran::lower::AbstractConverter {
37323734
}
37333735
return;
37343736
}
3735-
TODO(loc, "Assignement with CUDA Fortran variables");
3737+
3738+
// host = device
3739+
if (!lhsIsDevice && rhsIsDevice) {
3740+
auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
3741+
builder.getContext(), fir::CUDADataTransferKind::DeviceHost);
3742+
if (!rhs.isVariable()) {
3743+
// evaluateRhs loads scalar. Look for the memory reference to be used in
3744+
// the transfer.
3745+
if (mlir::isa_and_nonnull<fir::LoadOp>(rhs.getDefiningOp())) {
3746+
auto loadOp = mlir::dyn_cast<fir::LoadOp>(rhs.getDefiningOp());
3747+
builder.create<fir::CUDADataTransferOp>(loc, loadOp.getMemref(), lhs,
3748+
transferKindAttr);
3749+
return;
3750+
}
3751+
} else {
3752+
builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs,
3753+
transferKindAttr);
3754+
}
3755+
return;
3756+
}
3757+
3758+
if (lhsIsDevice && rhsIsDevice) {
3759+
assert(rhs.isVariable() && "CUDA Fortran assignment rhs is not legal");
3760+
auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
3761+
builder.getContext(), fir::CUDADataTransferKind::DeviceDevice);
3762+
builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs, transferKindAttr);
3763+
return;
3764+
}
3765+
llvm_unreachable("Unhandled CUDA data transfer");
3766+
}
3767+
3768+
/// Copy the device-resident object referenced on the right-hand side of
/// \p assign into a host temporary and rebind its symbol to that temporary.
///
/// For each symbol collected from `assign.rhs` whose ultimate symbol is an
/// object entity with a CUDA data attribute, a temporary is created from the
/// symbol's address as mold, the symbol is force-mapped to the temporary in
/// the local symbol map, and a device-to-host `fir::CUDADataTransferOp` from
/// the original address to the temporary is emitted.
///
/// A new scope is pushed on `localSymbols` on entry and is NOT popped here;
/// it has to be popped by the caller once the assignment has been emitted.
///
/// \param builder builder used to create the temporaries and transfer ops.
/// \param loc     location attached to the generated operations.
/// \param assign  assignment whose right-hand side is scanned.
/// \returns the temporaries whose cleanup flag is a known non-zero constant,
///          so that they can be released after the assignment.
llvm::SmallVector<mlir::Value>
genCUDAImplicitDataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
                            const Fortran::evaluate::Assignment &assign) {
  llvm::SmallVector<mlir::Value> temps;
  localSymbols.pushScope();
  auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
      builder.getContext(), fir::CUDADataTransferKind::DeviceHost);
  unsigned nbDeviceResidentObject = 0;
  for (const Fortran::semantics::Symbol &sym :
       Fortran::evaluate::CollectSymbols(assign.rhs)) {
    const auto *details =
        sym.GetUltimate()
            .detailsIf<Fortran::semantics::ObjectEntityDetails>();
    // Only object entities carrying a CUDA data attribute are staged.
    if (!details || !details->cudaDataAttr())
      continue;
    if (sym.owner().IsDerivedType() && IsAllocatable(sym.GetUltimate()))
      TODO(loc, "Device resident allocatable derived-type component");
    // TODO: This should probably be checked in semantics and give a proper
    // error.
    // The counter is incremented at the end of the iteration, so it must
    // still be zero here for the "only one" restriction to hold.
    assert(nbDeviceResidentObject < 1 &&
           "Only one reference to the device resident object is supported");
    auto addr = getSymbolAddress(sym);
    hlfir::Entity entity{addr};
    auto [temp, cleanup] = hlfir::createTempFromMold(loc, builder, entity);
    // Remember only the temporaries that are known to need a cleanup.
    auto needCleanup = fir::getIntIfConstant(cleanup);
    if (needCleanup && *needCleanup)
      temps.push_back(temp);
    // From now on, references to the symbol resolve to the host temporary.
    addSymbol(sym, temp, /*forced=*/true);
    builder.create<fir::CUDADataTransferOp>(loc, addr, temp,
                                            transferKindAttr);
    ++nbDeviceResidentObject;
  }
  return temps;
}
37373805

37383806
void genDataAssignment(
@@ -3741,8 +3809,13 @@ class FirConverter : public Fortran::lower::AbstractConverter {
37413809
mlir::Location loc = getCurrentLocation();
37423810
fir::FirOpBuilder &builder = getFirOpBuilder();
37433811

3744-
bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
3745-
bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
3812+
bool isCUDATransfer = Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
3813+
Fortran::evaluate::HasCUDAAttrs(assign.rhs);
3814+
bool hasCUDAImplicitTransfer =
3815+
Fortran::evaluate::HasCUDAImplicitTransfer(assign.rhs);
3816+
llvm::SmallVector<mlir::Value> implicitTemps;
3817+
if (hasCUDAImplicitTransfer)
3818+
implicitTemps = genCUDAImplicitDataTransfer(builder, loc, assign);
37463819

37473820
// Gather some information about the assignment that will impact how it is
37483821
// lowered.
@@ -3800,12 +3873,16 @@ class FirConverter : public Fortran::lower::AbstractConverter {
38003873
Fortran::lower::StatementContext localStmtCtx;
38013874
hlfir::Entity rhs = evaluateRhs(localStmtCtx);
38023875
hlfir::Entity lhs = evaluateLhs(localStmtCtx);
3803-
if (lhsIsDevice || rhsIsDevice) {
3804-
genCUDADataTransfer(builder, loc, lhsIsDevice, lhs, rhsIsDevice, rhs);
3805-
} else {
3876+
if (isCUDATransfer && !hasCUDAImplicitTransfer)
3877+
genCUDADataTransfer(builder, loc, assign, lhs, rhs);
3878+
else
38063879
builder.create<hlfir::AssignOp>(loc, rhs, lhs,
38073880
isWholeAllocatableAssignment,
38083881
keepLhsLengthInAllocatableAssignment);
3882+
if (hasCUDAImplicitTransfer) {
3883+
localSymbols.popScope();
3884+
for (mlir::Value temp : implicitTemps)
3885+
builder.create<fir::FreeMemOp>(loc, temp);
38093886
}
38103887
return;
38113888
}

flang/test/Lower/CUDA/cuda-data-transfer.cuf

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22

33
! Test CUDA Fortran data transfer using assignment statements.
44

5+
! Module providing the derived type t1 (one integer component) used by sub3.
module mod1
6+
type :: t1
7+
integer :: i
8+
end type
9+
end
10+
511
subroutine sub1()
612
integer, device :: m
713
integer, device :: adev(10)
@@ -55,3 +61,61 @@ end
5561
! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
5662
! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
5763
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
64+
65+
! Assignments that read device variables: host = device, device = device,
! and a host expression that mixes a host array with a device array.
subroutine sub2()
66+
integer, device :: m
67+
integer, device :: adev(10), bdev(10)
68+
integer :: i, ahost(10), bhost(10)
69+
70+
! Host array assigned from a device array (whole array).
ahost = adev
71+
72+
! Host scalar assigned from a device scalar.
i = m
73+
74+
! Host array section assigned from a device array section.
ahost(1:5) = adev(1:5)
75+
76+
! Device array assigned from a device array.
bdev = adev
77+
78+
! Implicit data transfer of adev before evaluation.
79+
bhost = ahost + adev
80+
81+
end
82+
83+
! CHECK-LABEL: func.func @_QPsub2()
84+
! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
85+
! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub2Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
86+
! CHECK: %[[BDEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Ebdev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
87+
! CHECK: %[[BHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub2Ebhost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
88+
! CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
89+
! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
90+
! CHECK: fir.cuda_data_transfer %[[ADEV]]#0 to %[[AHOST]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
91+
! CHECK: fir.cuda_data_transfer %[[M]]#0 to %[[I]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<i32>, !fir.ref<i32>
92+
93+
! CHECK: %[[DES_ADEV:.*]] = hlfir.designate %[[ADEV]]#0 (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
94+
! CHECK: %[[DES_AHOST:.*]] = hlfir.designate %[[AHOST]]#0 (%{{.*}}:%{{.*}}:%{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
95+
! CHECK: fir.cuda_data_transfer %[[DES_ADEV]] to %[[DES_AHOST]] {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
96+
97+
! CHECK: fir.cuda_data_transfer %[[ADEV]]#0 to %[[BDEV]]#0 {transfer_kind = #fir.cuda_transfer<device_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
98+
99+
! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32> {bindc_name = ".tmp", uniq_name = ""}
100+
! CHECK: %[[DECL_TEMP:.*]]:2 = hlfir.declare %[[TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
101+
! CHECK: %[[ADEV_TEMP:.*]]:2 = hlfir.declare %[[DECL_TEMP]]#0 {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.heap<!fir.array<10xi32>>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
102+
! CHECK: fir.cuda_data_transfer %[[ADEV]]#1 to %[[DECL_TEMP]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>
103+
! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32>
104+
! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[BHOST]]#0 : !hlfir.expr<10xi32>, !fir.ref<!fir.array<10xi32>>
105+
! CHECK: fir.freemem %[[DECL_TEMP]]#0 : !fir.heap<!fir.array<10xi32>>
106+
107+
! Host expression that reads a component of a device derived-type variable.
subroutine sub3()
108+
use mod1
109+
type(t1), device :: t
110+
integer :: ahost(10), bhost(10)
111+
112+
! The rhs mixes the host array ahost with component i of the device
! variable t.
bhost = ahost + t%i
113+
end
114+
115+
! CHECK-LABEL: func.func @_QPsub3()
116+
! CHECK: %[[TMP:.*]] = fir.alloca !fir.type<_QMmod1Tt1{i:i32}> {bindc_name = ".tmp"}
117+
! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub3Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
118+
! CHECK: %[[BHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub3Ebhost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
119+
! CHECK: %[[T:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub3Et"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
120+
! CHECK: %[[TMP_DECL:.*]]:2 = hlfir.declare %[[TMP]] {uniq_name = ".tmp"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
121+
! CHECK: fir.cuda_data_transfer %[[T]]#1 to %[[TMP_DECL]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>

0 commit comments

Comments
 (0)