-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[flang][cuda] Lower simple host to device data transfer #85960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: In CUDA Fortran, data transfer can be done via assignment statements between host and device variables. This patch introduces a `fir.cuda_data_transfer` operation that materializes the data transfer between two memory references. Simple transfers from host to device that do not involve descriptors are also lowered in this patch. When the rhs is an expression that requires evaluation, a temporary is created. The evaluation is done on the host and then the transfer is initiated. Implicit transfer when device symbols are present on the rhs is not part of this patch. Transfer from device to host is not part of this patch. Full diff: https://github.com/llvm/llvm-project/pull/85960.diff 5 Files Affected:
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index 2ac4af9e66aa80..f8b3fb861cc62f 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -137,4 +137,20 @@ def fir_CUDAClusterDimsAttr : fir_Attr<"CUDAClusterDims"> {
let assemblyFormat = "`<` struct(params) `>`";
}
+// Enumerates the direction of a CUDA Fortran data transfer. It is wrapped by
+// fir_CUDADataTransferKindAttr and carried by fir.cuda_data_transfer as its
+// `transfer_kind` attribute.
+def fir_CUDADataTransferKind : I32EnumAttr<
+ "CUDADataTransferKind", "CUDA Fortran data transfer kind",
+ [
+ // Presumably device-to-host (`a = adev`); not produced by the lowering in
+ // this patch.
+ I32EnumAttrCase<"DeviceHost", 0, "device_host">,
+ // Host-to-device (`adev = a`): used when a device variable is assigned
+ // from host data.
+ I32EnumAttrCase<"HostDevice", 1, "host_device">,
+ // Presumably device-to-device (`bdev = adev`); not produced by the lowering
+ // in this patch.
+ I32EnumAttrCase<"DeviceDevice", 2, "device_device">,
+ ]> {
+ // NOTE(review): the attribute class is expected to come from the EnumAttr
+ // wrapper (fir_CUDADataTransferKindAttr) rather than from this enum
+ // definition - confirm against the MLIR EnumAttr documentation.
+ let genSpecializedAttr = 0;
+ // The generated C++ enum lives in the ::fir namespace.
+ let cppNamespace = "::fir";
+}
+
+// FIR dialect attribute wrapping fir_CUDADataTransferKind under the
+// `cuda_transfer` mnemonic. In the IR it appears as, e.g.,
+// `#fir.cuda_transfer<host_device>`.
+def fir_CUDADataTransferKindAttr :
+ EnumAttr<FIROpsDialect, fir_CUDADataTransferKind, "cuda_transfer"> {
+ // The enum keyword is enclosed in angle brackets directly after the mnemonic.
+ let assemblyFormat = [{ ```<` $value `>` }];
+}
+
#endif // FIR_DIALECT_FIR_ATTRS
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 6e520d111701f0..3a1af1258aff28 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -3165,4 +3165,29 @@ def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments,
let hasVerifier = 1;
}
+def fir_CUDADataTransferOp : fir_Op<"cuda_data_transfer", []> {
+  let summary = "Represent a data transfer between host and device memory";
+
+  let description = [{
+    CUDA Fortran allows data transfer to be done via intrinsic assignment
+    between a host and a device variable. This operation is used to materialize
+    the data transfer between the lhs and rhs memory references: the memory
+    referenced by `src` (the rhs) is read and the memory referenced by `dst`
+    (the lhs) is written.
+    The kind of transfer is specified in the attribute.
+
+    ```
+      adev = a ! transfer host to device
+      a = adev ! transfer device to host
+      bdev = adev ! transfer device to device
+    ```
+  }];
+
+  // The source is only read and the destination is only written. Declaring the
+  // effects the other way round would let effect-based analyses treat the
+  // store that fills `src` as dead and the write to `dst` as a mere read.
+  let arguments = (ins Arg<AnyReferenceLike, "", [MemRead]>:$src,
+                   Arg<AnyReferenceLike, "", [MemWrite]>:$dst,
+                   fir_CUDADataTransferKindAttr:$transfer_kind);
+
+  let assemblyFormat = [{
+    $src `to` $dst attr-dict `:` type(operands)
+  }];
+}
+
#endif
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index c3cb9ba6a47e3d..1f923ba953843e 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3706,15 +3706,40 @@ class FirConverter : public Fortran::lower::AbstractConverter {
return false;
}
+  /// Lower an intrinsic assignment involving CUDA Fortran device data to a
+  /// fir.cuda_data_transfer operation.
+  ///
+  /// Only the host-to-device case (device \p lhs, host \p rhs) without
+  /// descriptors is lowered here; descriptor-based entities and every other
+  /// host/device combination are reported through TODO.
+  ///
+  /// \param builder      builder used to create the operations.
+  /// \param loc          location attached to the created operations.
+  /// \param lhsIsDevice  true when the assignment lhs has CUDA attributes.
+  /// \param lhs          evaluated left-hand side (transfer destination).
+  /// \param rhsIsDevice  true when the assignment rhs has CUDA attributes.
+  /// \param rhs          evaluated right-hand side (transfer source).
+  static void genCUDADataTransfer(fir::FirOpBuilder &builder,
+                                  mlir::Location loc, bool lhsIsDevice,
+                                  hlfir::Entity &lhs, bool rhsIsDevice,
+                                  hlfir::Entity &rhs) {
+    if (rhs.isBoxAddressOrValue() || lhs.isBoxAddressOrValue())
+      TODO(loc, "CUDA data transfer with descriptors");
+    if (lhsIsDevice && !rhsIsDevice) {
+      auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
+          builder.getContext(), fir::CUDADataTransferKind::HostDevice);
+      // device = host
+      if (!rhs.isVariable()) {
+        // The rhs is not a variable, so there is no memory reference to
+        // transfer from: assign it to a temporary first and use that
+        // temporary as the source of the transfer.
+        auto [temp, cleanup] = hlfir::createTempFromMold(loc, builder, rhs);
+        builder.create<hlfir::AssignOp>(loc, rhs, temp, false, false);
+        builder.create<fir::CUDADataTransferOp>(loc, temp, lhs,
+                                                transferKindAttr);
+        // Only temporaries with a heap type are freed explicitly.
+        if (mlir::isa<fir::HeapType>(temp.getType()))
+          builder.create<fir::FreeMemOp>(loc, temp);
+      } else {
+        // The rhs is a variable and can be used directly as the source.
+        builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs,
+                                                transferKindAttr);
+      }
+      return;
+    }
+    // Message kept verbatim: it predates this helper and may be matched by
+    // existing tests.
+    TODO(loc, "Assignement with CUDA Fortran variables");
+  }
+
void genDataAssignment(
const Fortran::evaluate::Assignment &assign,
const Fortran::evaluate::ProcedureRef *userDefinedAssignment) {
mlir::Location loc = getCurrentLocation();
fir::FirOpBuilder &builder = getFirOpBuilder();
- if (Fortran::evaluate::HasCUDAAttrs(assign.lhs) ||
- Fortran::evaluate::HasCUDAAttrs(assign.rhs))
- TODO(loc, "Assignement with CUDA Fortran variables");
+ bool lhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.lhs);
+ bool rhsIsDevice = Fortran::evaluate::HasCUDAAttrs(assign.rhs);
// Gather some information about the assignment that will impact how it is
// lowered.
@@ -3772,9 +3797,13 @@ class FirConverter : public Fortran::lower::AbstractConverter {
Fortran::lower::StatementContext localStmtCtx;
hlfir::Entity rhs = evaluateRhs(localStmtCtx);
hlfir::Entity lhs = evaluateLhs(localStmtCtx);
- builder.create<hlfir::AssignOp>(loc, rhs, lhs,
- isWholeAllocatableAssignment,
- keepLhsLengthInAllocatableAssignment);
+ if (lhsIsDevice || rhsIsDevice) {
+ genCUDADataTransfer(builder, loc, lhsIsDevice, lhs, rhsIsDevice, rhs);
+ } else {
+ builder.create<hlfir::AssignOp>(loc, rhs, lhs,
+ isWholeAllocatableAssignment,
+ keepLhsLengthInAllocatableAssignment);
+ }
return;
}
// Assignments inside Forall, Where, or assignments to a vector subscripted
diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
index 0cf8dfb9f784c3..e43710f5627ee0 100644
--- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
@@ -299,5 +299,6 @@ void FIROpsDialect::registerAttributes() {
addAttributes<ClosedIntervalAttr, ExactTypeAttr, FortranVariableFlagsAttr,
LowerBoundAttr, PointIntervalAttr, RealAttr, SubclassAttr,
UpperBoundAttr, CUDADataAttributeAttr, CUDAProcAttributeAttr,
- CUDALaunchBoundsAttr, CUDAClusterDimsAttr>();
+ CUDALaunchBoundsAttr, CUDAClusterDimsAttr,
+ CUDADataTransferKindAttr>();
}
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
new file mode 100644
index 00000000000000..8ae42cbddce862
--- /dev/null
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -0,0 +1,64 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Test CUDA Fortran data transfer using assignment statements.
+
+! Host-to-device assignments. The lhs is always a device variable and the rhs
+! only involves host data.
+subroutine sub1()
+ integer, device :: m
+ integer, device :: adev(10)
+ integer :: i, ahost(10), bhost(10)
+
+ ! Scalar expression: evaluated on the host into a temporary that is then
+ ! transferred.
+ m = 1 + i
+
+ ! Scalar constant: also goes through a temporary.
+ m = 1
+
+ ! Whole host array variable: transferred directly, no temporary.
+ adev = ahost
+
+ ! Array expression: evaluated into a heap temporary that is transferred and
+ ! then freed.
+ adev = ahost + 1
+
+ ! Array sections on both sides: transferred directly between the designated
+ ! sections.
+ adev(1:5) = ahost(1:5)
+
+ ! Array expression with two host operands: same heap temporary scheme.
+ adev = ahost + bhost
+
+end
+
+! CHECK-LABEL: func.func @_QPsub1()
+! CHECK: %[[TMP1_ALLOC:.*]] = fir.alloca i32 {bindc_name = ".tmp"}
+! CHECK: %[[TMP0_ALLOC:.*]] = fir.alloca i32 {bindc_name = ".tmp"}
+
+! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+! `m = 1 + i`: the sum is stored in the TMP0_ALLOC temporary captured above,
+! which is then used as the source of the host-to-device transfer.
+! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+! CHECK: %[[LOADED_I:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
+! CHECK: %[[ADD:.*]] = arith.addi %[[C1]], %[[LOADED_I]] : i32
+! CHECK: %[[TMP0:.*]]:2 = hlfir.declare %[[TMP0_ALLOC]] {uniq_name = ".tmp"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: hlfir.assign %[[ADD]] to %[[TMP0]]#0 : i32, !fir.ref<i32>
+! CHECK: fir.cuda_data_transfer %[[TMP0]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
+
+! CHECK: %[[C1:.*]] = arith.constant 1 : i32
+! CHECK: %[[TMP1:.*]]:2 = hlfir.declare %[[TMP1_ALLOC]] {uniq_name = ".tmp"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: hlfir.assign %[[C1]] to %[[TMP1]]#0 : i32, !fir.ref<i32>
+! CHECK: fir.cuda_data_transfer %[[TMP1]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
+
+! CHECK: fir.cuda_data_transfer %[[AHOST]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+
+! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
+
+! CHECK: %[[ALLOCMEM:.*]] = fir.allocmem !fir.array<10xi32> {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOCMEM]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
+! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[TEMP]]#0 : !hlfir.expr<10xi32>, !fir.heap<!fir.array<10xi32>>
+! CHECK: fir.cuda_data_transfer %[[TEMP]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.heap<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: fir.freemem %[[TEMP]]#0 : !fir.heap<!fir.array<10xi32>>
+
+! CHECK: %[[DES_AHOST:.*]] = hlfir.designate %[[AHOST]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
+! CHECK: %[[DES_ADEV:.*]] = hlfir.designate %[[ADEV]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}}) shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
+! CHECK: fir.cuda_data_transfer %[[DES_AHOST]] to %[[DES_ADEV]] {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
+
+! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
+! CHECK: %[[ALLOCMEM:.*]] = fir.allocmem !fir.array<10xi32> {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOCMEM]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
+! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[TEMP]]#0 : !hlfir.expr<10xi32>, !fir.heap<!fir.array<10xi32>>
+! CHECK: fir.cuda_data_transfer %[[TEMP]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.heap<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: fir.freemem %[[TEMP]]#0 : !fir.heap<!fir.array<10xi32>>
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The operation syntax looks good to me. It is however not clear to me how much of the Fortran intrinsic assignment semantics is placed into this new operation, or if it is just a "dumb" data transfer.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks for the update!
In CUDA Fortran data transfer can be done via assignment statements between host and device variables.
This patch introduces a `fir.cuda_data_transfer` operation that materializes the data transfer between two memory references. Simple transfers from host to device that do not involve descriptors are also lowered in this patch. When the rhs is an expression that requires evaluation, a temporary is created. The evaluation is done on the host and then the transfer is initiated.
Implicit transfer when device symbols are present on the rhs is not part of this patch. Transfer from device to host is not part of this patch.