-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[flang][cuda] Use fir.cuda_deallocate for automatic deallocation #89662
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-openacc @llvm/pr-subscribers-flang-fir-hlfir Author: Valentin Clement (バレンタイン クレメン) (clementval) Changes: Automatic deallocation of allocatables that are CUDA device variables must use the fir.cuda_deallocate operation. This patch updates the automatic deallocation code generation to use this operation when the variable is a CUDA variable. This patch also has the side effect of correctly calling attachDeclarePostDeallocAction for OpenACC declare variables on automatic deallocation. Full diff: https://github.com/llvm/llvm-project/pull/89662.diff 6 Files Affected:
diff --git a/flang/include/flang/Lower/Allocatable.h b/flang/include/flang/Lower/Allocatable.h
index d3c16de377c1d7a..e8738f0407e77ff 100644
--- a/flang/include/flang/Lower/Allocatable.h
+++ b/flang/include/flang/Lower/Allocatable.h
@@ -55,12 +55,14 @@ void genDeallocateStmt(AbstractConverter &converter,
void genDeallocateBox(AbstractConverter &converter,
const fir::MutableBoxValue &box, mlir::Location loc,
+ const Fortran::semantics::Symbol *sym = nullptr,
mlir::Value declaredTypeDesc = {});
/// Deallocate an allocatable if it is allocated at the end of its lifetime.
void genDeallocateIfAllocated(AbstractConverter &converter,
const fir::MutableBoxValue &box,
- mlir::Location loc);
+ mlir::Location loc,
+ const Fortran::semantics::Symbol *sym = nullptr);
/// Create a MutableBoxValue for an allocatable or pointer entity.
/// If the variables is a local variable that is not a dummy, it will be
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 38f61528d7e28ad..8e84ea2fc5d5223 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -859,18 +859,20 @@ genDeallocate(fir::FirOpBuilder &builder,
void Fortran::lower::genDeallocateBox(
Fortran::lower::AbstractConverter &converter,
const fir::MutableBoxValue &box, mlir::Location loc,
- mlir::Value declaredTypeDesc) {
+ const Fortran::semantics::Symbol *sym, mlir::Value declaredTypeDesc) {
const Fortran::lower::SomeExpr *statExpr = nullptr;
const Fortran::lower::SomeExpr *errMsgExpr = nullptr;
ErrorManager errorManager;
errorManager.init(converter, loc, statExpr, errMsgExpr);
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
- genDeallocate(builder, converter, loc, box, errorManager, declaredTypeDesc);
+ genDeallocate(builder, converter, loc, box, errorManager, declaredTypeDesc,
+ sym);
}
void Fortran::lower::genDeallocateIfAllocated(
Fortran::lower::AbstractConverter &converter,
- const fir::MutableBoxValue &box, mlir::Location loc) {
+ const fir::MutableBoxValue &box, mlir::Location loc,
+ const Fortran::semantics::Symbol *sym) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
mlir::Value isAllocated =
fir::factory::genIsAllocatedOrAssociatedTest(builder, loc, box);
@@ -880,9 +882,9 @@ void Fortran::lower::genDeallocateIfAllocated(
eleType.isa<fir::RecordType>() && box.isPolymorphic()) {
mlir::Value declaredTypeDesc = builder.create<fir::TypeDescOp>(
loc, mlir::TypeAttr::get(eleType));
- genDeallocateBox(converter, box, loc, declaredTypeDesc);
+ genDeallocateBox(converter, box, loc, sym, declaredTypeDesc);
} else {
- genDeallocateBox(converter, box, loc);
+ genDeallocateBox(converter, box, loc, sym);
}
})
.end();
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 2d2d9eba905bdd5..c40435c0977c743 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -916,13 +916,14 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
break;
case VariableCleanUp::Deallocate:
auto *converterPtr = &converter;
- converter.getFctCtx().attachCleanup([converterPtr, loc, exv]() {
+ auto *sym = &var.getSymbol();
+ converter.getFctCtx().attachCleanup([converterPtr, loc, exv, sym]() {
const fir::MutableBoxValue *mutableBox =
exv.getBoxOf<fir::MutableBoxValue>();
assert(mutableBox &&
"trying to deallocate entity not lowered as allocatable");
Fortran::lower::genDeallocateIfAllocated(*converterPtr, *mutableBox,
- loc);
+ loc, sym);
});
}
}
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index d933c07aba0e0c0..596c00c9ddb1556 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -4187,21 +4187,24 @@ void Fortran::lower::attachDeclarePostDeallocAction(
std::stringstream fctName;
fctName << converter.mangleName(sym) << declarePostDeallocSuffix.str();
- mlir::Operation &op = builder.getInsertionBlock()->back();
- if (op.hasAttr(mlir::acc::getDeclareActionAttrName())) {
- auto attr = op.getAttrOfType<mlir::acc::DeclareActionAttr>(
+ mlir::Operation *op = &builder.getInsertionBlock()->back();
+ if (mlir::isa<fir::ResultOp>(*op))
+ op = op->getPrevNode();
+ assert(op && "expect operation to attach the post deallocation action");
+ if (op->hasAttr(mlir::acc::getDeclareActionAttrName())) {
+ auto attr = op->getAttrOfType<mlir::acc::DeclareActionAttr>(
mlir::acc::getDeclareActionAttrName());
- op.setAttr(mlir::acc::getDeclareActionAttrName(),
- mlir::acc::DeclareActionAttr::get(
- builder.getContext(), attr.getPreAlloc(),
- attr.getPostAlloc(), attr.getPreDealloc(),
- /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
+ op->setAttr(mlir::acc::getDeclareActionAttrName(),
+ mlir::acc::DeclareActionAttr::get(
+ builder.getContext(), attr.getPreAlloc(),
+ attr.getPostAlloc(), attr.getPreDealloc(),
+ /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
} else {
- op.setAttr(mlir::acc::getDeclareActionAttrName(),
- mlir::acc::DeclareActionAttr::get(
- builder.getContext(),
- /*preAlloc=*/{}, /*postAlloc=*/{}, /*preDealloc=*/{},
- /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
+ op->setAttr(mlir::acc::getDeclareActionAttrName(),
+ mlir::acc::DeclareActionAttr::get(
+ builder.getContext(),
+ /*preAlloc=*/{}, /*postAlloc=*/{}, /*preDealloc=*/{},
+ /*postDealloc=*/builder.getSymbolRefAttr(fctName.str())));
}
}
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index 251ff16a56c797c..eff5f13669e904c 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -17,6 +17,15 @@ end subroutine
! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: %[[BOX_LOAD:.*]] = fir.load %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
+! CHECK: %[[ADDR:.*]] = fir.box_addr %[[BOX_LOAD]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
+! CHECK: %[[ADDR_I64:.*]] = fir.convert %[[ADDR]] : (!fir.heap<!fir.array<?xf32>>) -> i64
+! CHECK: %[[C0:.*]] = arith.constant 0 : i64
+! CHECK: %[[NE_C0:.*]] = arith.cmpi ne, %[[ADDR_I64]], %[[C0]] : i64
+! CHECK: fir.if %[[NE_C0]] {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
+
subroutine sub2()
real, allocatable, managed :: a(:)
integer :: istat
@@ -37,6 +46,10 @@ end subroutine
! CHECK: %[[STAT:.*]] = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>} -> i32
+! CHECK: }
+
subroutine sub3()
integer, allocatable, pinned :: a(:,:)
logical :: plog
@@ -50,6 +63,9 @@ end subroutine
! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
! CHECK-2: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {cuda_attr = #fir.cuda<pinned>} -> i32
+! CHECK: }
subroutine sub4()
real, allocatable, device :: a(:)
@@ -65,6 +81,9 @@ end subroutine
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
subroutine sub5()
real, allocatable, device :: a(:)
@@ -80,6 +99,11 @@ end subroutine
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableSetBounds
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: fir.if
+! CHECK: fir.freemem
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
subroutine sub6()
real, allocatable, device :: a(:)
@@ -95,6 +119,9 @@ end subroutine
! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
! CHECK: fir.call @_FortranAAllocatableApplyMold
! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
subroutine sub7()
real, allocatable, device :: a(:)
@@ -120,3 +147,6 @@ end subroutine
! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
! CHECK: %[[STAT:.*]] = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%15 : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
+! CHECK: fir.if %{{.*}} {
+! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: }
diff --git a/flang/test/Lower/OpenACC/acc-declare.f90 b/flang/test/Lower/OpenACC/acc-declare.f90
index 401b654adeb61b6..5d3f9e3fe97e4a1 100644
--- a/flang/test/Lower/OpenACC/acc-declare.f90
+++ b/flang/test/Lower/OpenACC/acc-declare.f90
@@ -245,6 +245,11 @@ subroutine acc_declare_allocate()
! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xi32>>
! CHECK: fir.store %{{.*}} to %{{.*}} {acc.declare_action = #acc.declare_action<postDealloc = @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_dealloc>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: fir.if
+! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xi32>>
+! CHECK: fir.store %{{.*}} to %{{.*}}#1 {acc.declare_action = #acc.declare_action<postDealloc = @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_dealloc>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: }
+
end subroutine
! CHECK-LABEL: func.func private @_QMacc_declareFacc_declare_allocateEa_acc_declare_update_desc_post_alloc(
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you, Valentin! Feel free to ignore my inline comment for this commit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks good
…m#89450) Automatic deallocation of allocatables that are CUDA device variables must use the fir.cuda_deallocate operation. This patch updates the automatic deallocation code generation to use this operation when the variable is a CUDA variable.
cb3aede
to
9b6be0f
Compare
Automatic deallocation of allocatables that are CUDA device variables must use the fir.cuda_deallocate operation. This patch updates the automatic deallocation code generation to use this operation when the variable is a CUDA variable.
This patch also has the side effect of correctly calling
attachDeclarePostDeallocAction
for OpenACC declare variables on automatic deallocation as well. Update the code in attachDeclarePostDeallocAction
so that the action is not attached to fir.result but to the correct last op.