Skip to content

Commit 4e6745c

Browse files
authored
[flang][cuda] Lower simple host to device data transfer (#85960)
In CUDA Fortran, data transfers can be done via assignment statements between host and device variables. This patch introduces a `fir.cuda_data_transfer` operation that materializes the data transfer between two memory references. Simple host-to-device transfers that do not involve descriptors are also lowered in this patch. When the rhs is an expression that requires evaluation, a temporary is created: the evaluation is done on the host and then the transfer is initiated. Implicit transfers when device symbols are present on the rhs are not part of this patch. Transfers from device to host are not part of this patch.
1 parent 8b9c3b5 commit 4e6745c

File tree

5 files changed

+134
-7
lines changed

5 files changed

+134
-7
lines changed

flang/include/flang/Optimizer/Dialect/FIRAttr.td

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,20 @@ def fir_CUDAClusterDimsAttr : fir_Attr<"CUDAClusterDims"> {
137137
let assemblyFormat = "`<` struct(params) `>`";
138138
}
139139

140+
// Enumeration naming the kind of a CUDA Fortran data transfer.
// `HostDevice` is the kind used when a host value is assigned to a device
// variable; the other two cases presumably follow the same <source><destination>
// naming (TODO confirm once they are lowered).
def fir_CUDADataTransferKind : I32EnumAttr<
141+
"CUDADataTransferKind", "CUDA Fortran data transfer kind",
142+
[
143+
I32EnumAttrCase<"DeviceHost", 0, "device_host">, // printed as `device_host`
144+
I32EnumAttrCase<"HostDevice", 1, "host_device">, // printed as `host_device`
145+
I32EnumAttrCase<"DeviceDevice", 2, "device_device">, // printed as `device_device`
146+
]> {
147+
let genSpecializedAttr = 0; // the attribute class comes from a separate EnumAttr record
148+
let cppNamespace = "::fir"; // C++ code refers to the enum as fir::CUDADataTransferKind
149+
}
150+
151+
// FIR dialect attribute wrapping fir_CUDADataTransferKind. With the mnemonic
// and format below it is spelled `#fir.cuda_transfer<host_device>` in the IR.
def fir_CUDADataTransferKindAttr :
152+
EnumAttr<FIROpsDialect, fir_CUDADataTransferKind, "cuda_transfer"> {
153+
let assemblyFormat = [{ ```<` $value `>` }];
154+
}
155+
140156
#endif // FIR_DIALECT_FIR_ATTRS

flang/include/flang/Optimizer/Dialect/FIROps.td

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3165,4 +3165,29 @@ def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments,
31653165
let hasVerifier = 1;
31663166
}
31673167

3168+
def fir_CUDADataTransferOp : fir_Op<"cuda_data_transfer", []> {
3169+
let summary = "Represent a data transfer between host and device memory";
3170+
3171+
let description = [{
3172+
CUDA Fortran allows data transfer to be done via intrinsic assignment
3173+
between a host and a device variable. This operation is used to materialize
3174+
the data transfer from the rhs (`$src`) to the lhs (`$dst`) memory reference.
3175+
The kind of transfer is specified in the attribute.
3176+
3177+
```
3178+
adev = a ! transfer host to device
3179+
a = adev ! transfer device to host
3180+
bdev = adev ! transfer device to device
3181+
```
3182+
}];
3183+
3184+
let arguments = (ins Arg<AnyReferenceLike, "", [MemRead]>:$src, // source is only read
3185+
Arg<AnyReferenceLike, "", [MemWrite]>:$dst, // destination is only written
3186+
fir_CUDADataTransferKindAttr:$transfer_kind);
3187+
3188+
let assemblyFormat = [{
3189+
$src `to` $dst attr-dict `:` type(operands)
3190+
}];
3191+
}
3192+
31683193
#endif

flang/lib/Lower/Bridge.cpp

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3706,15 +3706,39 @@ class FirConverter : public Fortran::lower::AbstractConverter {
37063706
return false;
37073707
}
37083708

3709+
/// Lower an assignment involving CUDA Fortran device data to a
/// fir.cuda_data_transfer operation from \p rhs (source) to \p lhs
/// (destination).
///
/// Only the host-to-device case (\p lhsIsDevice set, \p rhsIsDevice clear) is
/// handled, and only when neither side is a descriptor (box); every other
/// combination is reported through TODO. When \p rhs is not a variable it is
/// first associated to a temporary named ".cuf_host_tmp", which is used as the
/// transfer source and released right after the transfer is emitted.
static void genCUDADataTransfer(fir::FirOpBuilder &builder,
3710+
mlir::Location loc, bool lhsIsDevice,
3711+
hlfir::Entity &lhs, bool rhsIsDevice,
3712+
hlfir::Entity &rhs) {
3713+
if (rhs.isBoxAddressOrValue() || lhs.isBoxAddressOrValue())
3714+
TODO(loc, "CUDA data transfer with descriptors");
3715+
if (lhsIsDevice && !rhsIsDevice) {
3716+
auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
3717+
builder.getContext(), fir::CUDADataTransferKind::HostDevice);
3718+
// device = host
3719+
if (!rhs.isVariable()) { // rhs is a value/expression: give it an address first
3720+
auto associate = hlfir::genAssociateExpr(
3721+
loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
3722+
builder.create<fir::CUDADataTransferOp>(loc, associate.getBase(), lhs,
3723+
transferKindAttr);
3724+
builder.create<hlfir::EndAssociateOp>(loc, associate);
3725+
} else {
3726+
builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs,
3727+
transferKindAttr);
3728+
}
3729+
return;
3730+
}
3731+
TODO(loc, "Assignment with CUDA Fortran variables");
3732+
}
3733+
37093734
void genDataAssignment(
37103735
const Fortran::evaluate::Assignment &assign,
37113736
const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
37123737
mlir::Location loc = getCurrentLocation();
37133738
fir::FirOpBuilder &builder = getFirOpBuilder();
37143739

3715-
if (Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
3716-
Fortran::evaluate::HasCUDAAttrs(assign.rhs))
3717-
TODO(loc, "Assignement with CUDA Fortran variables");
3740+
bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
3741+
bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
37183742

37193743
// Gather some information about the assignment that will impact how it is
37203744
// lowered.
@@ -3772,9 +3796,13 @@ class FirConverter : public Fortran::lower::AbstractConverter {
37723796
Fortran::lower::StatementContext localStmtCtx;
37733797
hlfir::Entity rhs = evaluateRhs(localStmtCtx);
37743798
hlfir::Entity lhs = evaluateLhs(localStmtCtx);
3775-
builder.create<hlfir::AssignOp>(loc, rhs, lhs,
3776-
isWholeAllocatableAssignment,
3777-
keepLhsLengthInAllocatableAssignment);
3799+
if (lhsIsDevice || rhsIsDevice) {
3800+
genCUDADataTransfer(builder, loc, lhsIsDevice, lhs, rhsIsDevice, rhs);
3801+
} else {
3802+
builder.create<hlfir::AssignOp>(loc, rhs, lhs,
3803+
isWholeAllocatableAssignment,
3804+
keepLhsLengthInAllocatableAssignment);
3805+
}
37783806
return;
37793807
}
37803808
// Assignments inside Forall, Where, or assignments to a vector subscripted

flang/lib/Optimizer/Dialect/FIRAttr.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,5 +299,6 @@ void FIROpsDialect::registerAttributes() {
299299
addAttributes<ClosedIntervalAttr, ExactTypeAttr, FortranVariableFlagsAttr,
300300
LowerBoundAttr, PointIntervalAttr, RealAttr, SubclassAttr,
301301
UpperBoundAttr, CUDADataAttributeAttr, CUDAProcAttributeAttr,
302-
CUDALaunchBoundsAttr, CUDAClusterDimsAttr>();
302+
CUDALaunchBoundsAttr, CUDAClusterDimsAttr,
303+
CUDADataTransferKindAttr>();
303304
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
2+
3+
! Test CUDA Fortran data transfer using assignment statements.
4+
5+
subroutine sub1()
6+
integer, device :: m  ! scalar with the device attribute
7+
integer, device :: adev(10)  ! array with the device attribute
8+
integer :: i, ahost(10), bhost(10)  ! host variables
9+
10+
m = 1 + i  ! scalar expression rhs: associated to a .cuf_host_tmp temporary, then transferred
11+
12+
m = 1  ! constant rhs: also goes through a .cuf_host_tmp temporary
13+
14+
adev = ahost  ! whole host array to whole device array: transferred directly, no temporary
15+
16+
adev = ahost + 1  ! array expression rhs: elemental result associated to a temporary, then transferred
17+
18+
adev(1:5) = ahost(1:5)  ! array sections on both sides: designated, then transferred directly
19+
20+
adev = ahost + bhost  ! expression over two host arrays: elemental result via a temporary
21+
22+
end
23+
24+
! CHECK-LABEL: func.func @_QPsub1()
25+
26+
! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
27+
! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
28+
! CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
29+
! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
30+
31+
! CHECK: %[[C1:.*]] = arith.constant 1 : i32
32+
! CHECK: %[[LOADED_I:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
33+
! CHECK: %[[ADD:.*]] = arith.addi %[[C1]], %[[LOADED_I]] : i32
34+
! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ADD]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
35+
! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
36+
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
37+
38+
! CHECK: %[[C1:.*]] = arith.constant 1 : i32
39+
! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[C1]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
40+
! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
41+
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
42+
43+
! CHECK: fir.cuda_data_transfer %[[AHOST]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
44+
45+
! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
46+
! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
47+
! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
48+
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
49+
50+
! CHECK: %[[DES_AHOST:.*]] = hlfir.designate %[[AHOST]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
51+
! CHECK: %[[DES_ADEV:.*]] = hlfir.designate %[[ADEV]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
52+
! CHECK: fir.cuda_data_transfer %[[DES_AHOST]] to %[[DES_ADEV]] {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
53+
54+
! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32>
55+
! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
56+
! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
57+
! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1

0 commit comments

Comments
 (0)